def testWriteMethodsOnReadOnlyFile(self):
    """write()/writelines() on a file opened for reading must raise IOError."""
    with BZ2File(self.filename, "w") as f:
        f.write(b"abc")
    with BZ2File(self.filename, "r") as f:
        self.assertRaises(IOError, f.write, b"a")
        self.assertRaises(IOError, f.writelines, [b"a"])
def testSeekable(self):
    """seekable() mirrors the underlying stream and raises once closed."""
    # Readable file backed by a seekable BytesIO.
    f = BZ2File(BytesIO(self.DATA))
    try:
        self.assertTrue(f.seekable())
        f.read()
        self.assertTrue(f.seekable())
    finally:
        f.close()
    self.assertRaises(ValueError, f.seekable)

    # Write-mode files are never seekable.
    f = BZ2File(BytesIO(), "w")
    try:
        self.assertFalse(f.seekable())
    finally:
        f.close()
    self.assertRaises(ValueError, f.seekable)

    # A non-seekable source makes the BZ2File non-seekable too.
    src = BytesIO(self.DATA)
    src.seekable = lambda: False
    f = BZ2File(src)
    try:
        self.assertFalse(f.seekable())
    finally:
        f.close()
    self.assertRaises(ValueError, f.seekable)
def testAppend(self):
    """Append mode adds a second compressed stream after the existing data."""
    with BZ2File(self.filename, "w") as f:
        self.assertRaises(TypeError, f.write)
        f.write(self.TEXT)
    with BZ2File(self.filename, "a") as f:
        self.assertRaises(TypeError, f.write)
        f.write(self.TEXT)
    with open(self.filename, 'rb') as raw:
        self.assertEqual(self.decompress(raw.read()), self.TEXT * 2)
def testMultiStreamOrdering(self):
    # Test the ordering of streams when reading a multi-stream archive.
    first = b"foo" * 1000
    second = b"bar" * 1000
    with BZ2File(self.filename, "w") as f:
        f.write(first)
    with BZ2File(self.filename, "a") as f:
        f.write(second)
    # Data must come back in the order the streams were written.
    with BZ2File(self.filename) as f:
        self.assertEqual(f.read(), first + second)
def testReadlinesNoNewline(self):
    # Issue #1191043: readlines() fails on a file containing no newline.
    data = b'BZh91AY&SY\xd9b\x89]\x00\x00\x00\x03\x80\x04\x00\x02\x00\x0c\x00 \x00!\x9ah3M\x13<]\xc9\x14\xe1BCe\x8a%t'
    with open(self.filename, "wb") as raw:
        raw.write(data)
    with BZ2File(self.filename) as f:
        self.assertEqual(f.readlines(), [b'Test'])
    with BZ2File(self.filename) as f:
        self.assertEqual(list(f.readlines()), [b'Test'])
def test_read_truncated(self):
    """Reading a stream whose trailer was cut off raises EOFError."""
    # Drop the eos_magic field (6 bytes) and CRC (4 bytes).
    truncated = self.DATA[:-10]
    with BZ2File(BytesIO(truncated)) as fobj:
        self.assertRaises(EOFError, fobj.read)
    with BZ2File(BytesIO(truncated)) as fobj:
        self.assertEqual(fobj.read(len(self.TEXT)), self.TEXT)
        self.assertRaises(EOFError, fobj.read, 1)
    # Incomplete 4-byte file header, and block header of at least 146 bits.
    for prefix_len in range(22):
        with BZ2File(BytesIO(truncated[:prefix_len])) as fobj:
            self.assertRaises(EOFError, fobj.read, 1)
def testOpenBytesFilename(self):
    """BZ2File accepts a bytes filename as well as str."""
    str_filename = self.filename
    try:
        bytes_filename = str_filename.encode("ascii")
    except UnicodeEncodeError:
        self.skipTest("Temporary file name needs to be ASCII")
    with BZ2File(bytes_filename, "wb") as f:
        f.write(self.DATA)
    with BZ2File(bytes_filename, "rb") as f:
        self.assertEqual(f.read(), self.DATA)
    # Sanity check that we are actually operating on the right file.
    with BZ2File(str_filename, "rb") as f:
        self.assertEqual(f.read(), self.DATA)
def testSeekPostEndTwiceMultiStream(self):
    """Seeking past EOF twice leaves tell() at end-of-data and read() empty."""
    self.createTempFile(streams=5)
    with BZ2File(self.filename) as f:
        f.seek(150000)
        f.seek(150000)
        self.assertEqual(f.tell(), len(self.TEXT) * 5)
        self.assertEqual(f.read(), b"")
def prechew_gp(format, data):
    """Import data for training"""
    # Open the compressed wiktionary dump.
    dump = BZ2File(data)

    # Map grapheme representation -> set of phonetic representations.
    training_data = {}
    for record in tqdm(wdp.Parser(dump), total=820000):
        # Read only German entries.
        if 'language' not in record or record['language'] != 'Deutsch':
            continue
        # Skip multi-word expressions (whitespace in title or IPA).
        if ('syllables' in record and 'ipa' in record
                and not any(c in record['title'] for c in string.whitespace)
                and not any(c in record['ipa'] for c in string.whitespace)):
            graph_rep = "".join(record['syllables']).lower()
            phon_rep = clean_wiki_re.sub("", record.get('ipa'))
            training_data.setdefault(graph_rep, set()).add(phon_rep)

    # Emit one tab-separated training pair per line.
    for graph_rep, phon_reps in training_data.items():
        for phon_rep in phon_reps:
            click.echo("%s\t%s" % (graph_rep, phon_rep))
def articlecollector(path_articles_xml, outpath_articles, articleids):
    """Stream-copy the pages listed in *articleids* from the XML dump at
    *path_articles_xml* into a new bz2-compressed mediawiki file at
    *outpath_articles*.

    Returns the number of pages extracted. Re-raises FileNotFoundError
    (after reporting the missing file) if either path is unusable.
    """
    print("\nCollecting articles for \'%s\' from %s\n..." % (
        wantedCategory, path_articles_xml))
    title_path = etree.ETXPath("child::" + Ttitle)
    id_path = etree.ETXPath("child::" + Tid)
    text_path = etree.ETXPath("child::" + Trev + "/" + Ttext)
    extracted_count = 0
    start = time.time()
    try:
        # 'outfile' renamed from 'file' so the builtin isn't shadowed.
        with BZ2File(outpath_articles, "w", compresslevel=9) as outfile, \
                etree.xmlfile(outfile, encoding="utf-8") as newfile, \
                newfile.element("mediawiki", xmlns=Header):
            context = etree.iterparse(path_articles_xml, events=("end",),
                                      tag={Tnamespaces, Tpage})
            for action, elem in context:
                if elem.tag == Tpage and id_path(elem)[0].text in articleids:
                    create_page(elem, title_path, id_path, text_path,
                                articleids, newfile)
                    extracted_count += 1
                elif elem.tag == Tnamespaces:
                    create_namespace(elem, newfile)
                # Free the processed element and its preceding siblings to
                # keep memory flat while iterating a huge dump.
                elem.clear()
                while elem.getprevious() is not None:
                    del elem.getparent()[0]
    except FileNotFoundError as e:
        print(e.filename, "not found")
        # Bare raise preserves the original traceback ('raise e' restarted it).
        raise
    end = time.time()
    printTime(start, end)
    return extracted_count
def testMixedIterationAndReads(self):
    """Iteration can be freely interleaved with read()/readline()/readlines()."""
    self.createTempFile()
    linelen = len(self.TEXT_LINES[0])
    halflen = linelen // 2
    with BZ2File(self.filename) as f:
        f.read(halflen)
        self.assertEqual(next(f), self.TEXT_LINES[0][halflen:])
        self.assertEqual(f.read(), self.TEXT[linelen:])
    with BZ2File(self.filename) as f:
        f.readline()
        self.assertEqual(next(f), self.TEXT_LINES[1])
        self.assertEqual(f.readline(), self.TEXT_LINES[2])
    with BZ2File(self.filename) as f:
        f.readlines()
        self.assertRaises(StopIteration, next, f)
        self.assertEqual(f.readlines(), [])
def _compression_wrapper(file_obj, filename, mode):
    """Wrap *file_obj* in a [de]compression layer chosen by *filename*'s
    extension (.bz2 / .gz / .xz).

    *file_obj* must be a filehandle object, or a class which behaves
    like one. If the extension isn't recognized, the original
    *file_obj* is returned unchanged.
    """
    extension = os.path.splitext(filename)[1]
    if _need_to_buffer(file_obj, mode, extension):
        # Fall back to an in-memory copy when streaming isn't supported.
        warnings.warn('streaming gzip support unavailable, see %s' % _ISSUE_189_URL)
        file_obj = io.BytesIO(file_obj.read())
    if extension == '.bz2':
        return BZ2File(file_obj, mode)
    if extension == '.gz':
        return gzip.GzipFile(fileobj=file_obj, mode=mode)
    if extension == '.xz':
        return lzma.LZMAFile(filename=file_obj, mode=mode, format=lzma.FORMAT_XZ)
    return file_obj
def get_url_html(tree, doc_id):
    """Return the JSON record for *doc_id* from the sampled Sogou dump.

    *tree* is the index consulted by id2path(); the record is the
    path_info[2]-th line of the bz2 part file identified by path_info[0]
    and path_info[1]. Returns the string 'wrong id' when the id cannot
    be resolved, the line is missing, or the line is not valid JSON.
    """
    path_info = id2path(doc_id, tree)
    if path_info == 'wrong id':
        return 'wrong id'
    path = '/home/luocheng/zhengyukun/index_build/sogouTSample/sample_result/sogout_data.' + path_info[0] + \
        '.comp/sogout_data.' + path_info[0] + '.comp.part-m-' + path_info[1] + '.sample.bz2'
    # 'with' guarantees the handle is closed even on the early return;
    # the original leaked the file because f.close() was unreachable
    # once a matching line returned.
    with BZ2File(path, 'r') as f:
        for lineno, line in enumerate(f, start=1):
            if lineno == path_info[2]:
                try:
                    return json.loads(line)
                except ValueError:
                    # json.loads failures (JSONDecodeError) only;
                    # the original bare 'except:' hid real bugs.
                    return 'wrong id'
    return 'wrong id'
def testWriteBytesIO(self):
    """Closing the BZ2File flushes to the BytesIO without closing it."""
    with BytesIO() as bio:
        with BZ2File(bio, "w") as f:
            self.assertRaises(TypeError, f.write)
            f.write(self.TEXT)
        self.assertEqual(self.decompress(bio.getvalue()), self.TEXT)
        self.assertFalse(bio.closed)
def wrap_fp(fp):
    """Layer a decompressor (chosen by the enclosing scope's *suffix*) and,
    for text modes, a UTF-8 codec wrapper around *fp*; the original
    *filename* is recorded on the result as .realname."""
    if suffix == ".gz":
        fp = GzipFile(fileobj=fp, mode=mode)
    elif suffix == ".bz2":
        try:
            fp = BZ2File(fp, mode=mode)
        except TypeError:
            if sys.version_info < (3, 0, 0):
                raise NotImplementedError(
                    "built-in BZ2File is partially broken in python 2, install bz2file from pypi or use a compression setting other than 'bz2'"
                )
            raise
    elif suffix == ".xz":
        fp = LZMAFile(fp, mode=mode)
    if (suffix or sys.version_info < (3, )) and "b" not in mode:
        # If mode is not binary (and we expect to be able to write()
        # str values, not bytes), we need an additional encoding
        # wrapper; UTF-8 needs no further configuration.
        reading = "r" in mode
        writing = "w" in mode
        if reading and writing:
            fp = StreamReaderWriter(fp, codecs.getreader("utf-8"),
                                    codecs.getwriter("utf-8"))
        elif writing:
            fp = codecs.getwriter("utf-8")(fp)
        elif suffix:
            fp = codecs.getreader("utf-8")(fp)
    fp.realname = filename
    return fp
def testPeek(self):
    """peek() returns some data without advancing the read position."""
    self.createTempFile()
    with BZ2File(self.filename) as f:
        peeked = f.peek()
        self.assertNotEqual(len(peeked), 0)
        self.assertTrue(self.TEXT.startswith(peeked))
        # A full read still starts from the beginning.
        self.assertEqual(f.read(), self.TEXT)
def testPeekBytesIO(self):
    """peek() also works when the source is an in-memory BytesIO."""
    with BytesIO(self.DATA) as bio:
        with BZ2File(bio) as f:
            peeked = f.peek()
            self.assertNotEqual(len(peeked), 0)
            self.assertTrue(self.TEXT.startswith(peeked))
            self.assertEqual(f.read(), self.TEXT)
def almost_smart_open(fname, mode='r'):
    """Open *fname*, transparently decompressing .bz2 and .gz files
    based on the filename extension."""
    extension = path.splitext(fname)[1]
    if extension == '.gz':
        return gzip.open(fname, mode)
    if extension == '.bz2':
        return BZ2File(fname, mode)
    return open(fname, mode)
def smart_open(fname, mode='r'):
    """Open *fname*, wrapping .bz2/.gz handles in contextlib.closing so the
    decompressing file object can be used in a 'with' statement."""
    extension = path.splitext(fname)[1]
    if extension == '.gz':
        return closing(gzip.open(fname, mode))
    if extension == '.bz2':
        return closing(BZ2File(fname, mode))
    return open(fname, mode)
def testOpenDel(self):
    """Repeatedly opening and deleting a BZ2File must not leak resources."""
    if platform.python_implementation() != "CPython":
        self.skipTest("Test depends on CPython refcounting semantics")
    self.createTempFile()
    for _ in range(10000):
        handle = BZ2File(self.filename)
        del handle
def testWritable(self):
    """writable() is False in read mode, True in write mode, raises when closed."""
    f = BZ2File(BytesIO(self.DATA))
    try:
        self.assertFalse(f.writable())
        f.read()
        self.assertFalse(f.writable())
    finally:
        f.close()
    self.assertRaises(ValueError, f.writable)

    f = BZ2File(BytesIO(), "w")
    try:
        self.assertTrue(f.writable())
    finally:
        f.close()
    self.assertRaises(ValueError, f.writable)
def testClosedIteratorDeadlock(self):
    # Issue #3309: Iteration on a closed BZ2File should release the lock.
    self.createTempFile()
    f = BZ2File(self.filename)
    f.close()
    self.assertRaises(ValueError, next, f)
    # This call will deadlock if the above call failed to release the lock.
    self.assertRaises(ValueError, f.readlines)
def testSeekBackwardsAcrossStreams(self):
    """A relative backwards seek may cross a stream boundary."""
    self.createTempFile(streams=2)
    with BZ2File(self.filename) as f:
        # Read 100 bytes into the second stream.
        remaining = len(self.TEXT) + 100
        while remaining > 0:
            remaining -= len(f.read(remaining))
        # Seek back 150 bytes, i.e. 50 bytes into the first stream.
        f.seek(-150, 1)
        self.assertEqual(f.read(), self.TEXT[100 - 150:] + self.TEXT)
def testWriteLines(self):
    """writelines() writes all lines; calling it on a closed file raises."""
    with BZ2File(self.filename, "w") as f:
        self.assertRaises(TypeError, f.writelines)
        f.writelines(self.TEXT_LINES)
    # Issue #1535500: Calling writelines() on a closed BZ2File
    # should raise an exception.
    self.assertRaises(ValueError, f.writelines, ["a"])
    with open(self.filename, 'rb') as raw:
        self.assertEqual(self.decompress(raw.read()), self.TEXT)
def testFileno(self):
    """fileno() delegates to the underlying file and raises once closed."""
    self.createTempFile()
    with open(self.filename, 'rb') as rawf:
        f = BZ2File(rawf)
        try:
            self.assertEqual(f.fileno(), rawf.fileno())
        finally:
            f.close()
        self.assertRaises(ValueError, f.fileno)
def best_guess_open(file_name):
    """Use bz2file to iterate over a compressed file, regular open otherwise."""
    if file_name.endswith('.gz'):
        return gzip.open(file_name)
    if file_name.endswith('.bz2'):
        return BZ2File(file_name)
    return open(file_name)
def testReadChunk10MultiStream(self):
    """read(10) in a loop over a 5-stream archive reassembles all streams."""
    self.createTempFile(streams=5)
    with BZ2File(self.filename) as bz2f:
        text = b''
        while True:
            # Renamed from 'str': never shadow the builtin type name.
            chunk = bz2f.read(10)
            if not chunk:
                break
            text += chunk
        self.assertEqual(text, self.TEXT * 5)
def testWriteChunks10(self):
    """Writing in 10-byte chunks produces the same archive as one write."""
    with BZ2File(self.filename, "w") as bz2f:
        n = 0
        while True:
            # Renamed from 'str': never shadow the builtin type name.
            chunk = self.TEXT[n * 10:(n + 1) * 10]
            if not chunk:
                break
            bz2f.write(chunk)
            n += 1
    with open(self.filename, 'rb') as f:
        self.assertEqual(self.decompress(f.read()), self.TEXT)
def test_can_read_multistream_bz2(self):
    """A five-stream bz2 archive decompresses to five copies of TEXT."""
    if PY2:
        # Python 2's built-in BZ2File can't read multi-stream files;
        # use the bz2file backport (a Python 3 backport) instead.
        from bz2file import BZ2File
    else:
        from bz2 import BZ2File
    archive = self.create_temp_bz2(streams=5)
    with BZ2File(archive) as bz2f:
        self.assertEqual(bz2f.read(), self.TEXT * 5)
    self.cleanup_temp_bz2(archive)
def testContextProtocol(self):
    # BZ2File supports the context-manager protocol; entering a closed
    # file must raise ValueError, and exceptions inside the 'with' body
    # must propagate.
    f = None
    with BZ2File(self.filename, "wb") as f:
        f.write(b"xxx")
    f = BZ2File(self.filename, "rb")
    f.close()
    try:
        # __enter__ on an already-closed file should fail.
        with f:
            pass
    except ValueError:
        pass
    else:
        self.fail("__enter__ on a closed file didn't raise an exception")
    try:
        # An exception raised inside the body must not be swallowed.
        with BZ2File(self.filename, "wb") as f:
            1 / 0
    except ZeroDivisionError:
        pass
    else:
        self.fail("1/0 didn't raise an exception")