예제 #1
0
    def testWriteMethodsOnReadOnlyFile(self):
        with BZ2File(self.filename, "w") as bz2f:
            bz2f.write(b"abc")

        with BZ2File(self.filename, "r") as bz2f:
            self.assertRaises(IOError, bz2f.write, b"a")
            self.assertRaises(IOError, bz2f.writelines, [b"a"])
예제 #2
0
    def testSeekable(self):
        """Check seekable() for readers and writers, and after close()."""
        # Reader over an in-memory stream: seekable before and after read().
        reader = BZ2File(BytesIO(self.DATA))
        with reader:
            self.assertTrue(reader.seekable())
            reader.read()
            self.assertTrue(reader.seekable())
        # Once closed, querying seekable() must raise.
        self.assertRaises(ValueError, reader.seekable)

        # A file opened for writing reports that it is not seekable.
        writer = BZ2File(BytesIO(), "w")
        with writer:
            self.assertFalse(writer.seekable())
        self.assertRaises(ValueError, writer.seekable)

        # Reader whose underlying stream claims to be non-seekable.
        unseekable_src = BytesIO(self.DATA)
        unseekable_src.seekable = lambda: False
        wrapped = BZ2File(unseekable_src)
        with wrapped:
            self.assertFalse(wrapped.seekable())
        self.assertRaises(ValueError, wrapped.seekable)
예제 #3
0
 def testAppend(self):
     """Write TEXT, append TEXT again, and expect TEXT twice on decompress."""
     for mode in ("w", "a"):
         with BZ2File(self.filename, mode) as bz2f:
             # write() called without an argument must be rejected.
             self.assertRaises(TypeError, bz2f.write)
             bz2f.write(self.TEXT)
     with open(self.filename, 'rb') as raw:
         compressed = raw.read()
     self.assertEqual(self.decompress(compressed), self.TEXT * 2)
예제 #4
0
 def testMultiStreamOrdering(self):
     # Test the ordering of streams when reading a multi-stream archive.
     data1 = b"foo" * 1000
     data2 = b"bar" * 1000
     with BZ2File(self.filename, "w") as bz2f:
         bz2f.write(data1)
     with BZ2File(self.filename, "a") as bz2f:
         bz2f.write(data2)
     with BZ2File(self.filename) as bz2f:
         self.assertEqual(bz2f.read(), data1 + data2)
예제 #5
0
 def testReadlinesNoNewline(self):
     # Issue #1191043: readlines() fails on a file containing no newline.
     data = b'BZh91AY&SY\xd9b\x89]\x00\x00\x00\x03\x80\x04\x00\x02\x00\x0c\x00 \x00!\x9ah3M\x13<]\xc9\x14\xe1BCe\x8a%t'
     with open(self.filename, "wb") as f:
         f.write(data)
     with BZ2File(self.filename) as bz2f:
         lines = bz2f.readlines()
     self.assertEqual(lines, [b'Test'])
     with BZ2File(self.filename) as bz2f:
         xlines = list(bz2f.readlines())
     self.assertEqual(xlines, [b'Test'])
예제 #6
0
 def test_read_truncated(self):
     """Reading a truncated stream raises EOFError once data runs out."""
     # Drop the eos_magic field (6 bytes) and CRC (4 bytes).
     cut = self.DATA[:-10]
     with BZ2File(BytesIO(cut)) as f:
         with self.assertRaises(EOFError):
             f.read()
     with BZ2File(BytesIO(cut)) as f:
         # The whole text can still be read; only the following read fails.
         self.assertEqual(f.read(len(self.TEXT)), self.TEXT)
         with self.assertRaises(EOFError):
             f.read(1)
     # Incomplete 4-byte file header, and block header of at least 146 bits.
     for length in range(22):
         with BZ2File(BytesIO(cut[:length])) as f:
             with self.assertRaises(EOFError):
                 f.read(1)
예제 #7
0
 def testOpenBytesFilename(self):
     """A bytes filename is accepted and names the same file as the str one."""
     try:
         encoded_name = self.filename.encode("ascii")
     except UnicodeEncodeError:
         self.skipTest("Temporary file name needs to be ASCII")
     with BZ2File(encoded_name, "wb") as f:
         f.write(self.DATA)
     # Read back through the bytes name, then through the str name as a
     # sanity check that we are actually operating on the right file.
     for name in (encoded_name, self.filename):
         with BZ2File(name, "rb") as f:
             self.assertEqual(f.read(), self.DATA)
예제 #8
0
 def testSeekPostEndTwiceMultiStream(self):
     """Seek twice to offset 150000 in a five-stream file, then check EOF."""
     self.createTempFile(streams=5)
     # NOTE(review): 150000 is presumably beyond len(TEXT) * 5; the tell()
     # assertion below depends on that.
     offset = 150000
     with BZ2File(self.filename) as bz2f:
         for _ in range(2):
             bz2f.seek(offset)
         self.assertEqual(bz2f.tell(), len(self.TEXT) * 5)
         self.assertEqual(bz2f.read(), b"")
예제 #9
0
def prechew_gp(format, data):
    """Print tab-separated grapheme/phoneme training pairs from a bz2 dump.

    Records are read from the bz2-compressed ``data`` through ``wdp.Parser``.
    Only records whose ``language`` is ``'Deutsch'`` and that carry both
    ``syllables`` and ``ipa`` are used; records whose title or IPA string
    contains whitespace (multi-word expressions) are skipped. Each distinct
    (grapheme, phoneme) pair is written with ``click.echo``.

    Args:
        format: Not used by this function; kept so the signature is unchanged.
        data: Passed straight to ``BZ2File`` (the dump to read).
    """
    training_data = {}

    # Open the dump in a context manager so the handle is released even if
    # parsing fails part-way (the previous code never closed it).
    with BZ2File(data) as bz:
        # NOTE(review): total=820000 is a hard-coded progress-bar total,
        # presumably the approximate number of records — confirm.
        for record in tqdm(wdp.Parser(bz), total=820000):
            # read only German entries
            if record.get('language') != 'Deutsch':
                continue
            if 'syllables' not in record or 'ipa' not in record:
                continue
            # skipping multi word expressions!
            if any(c in record['title'] for c in string.whitespace):
                continue
            if any(c in record['ipa'] for c in string.whitespace):
                continue
            graph_rep = "".join(record['syllables']).lower()
            phon_rep = clean_wiki_re.sub("", record['ipa'])
            # A set per grapheme form de-duplicates repeated pronunciations.
            training_data.setdefault(graph_rep, set()).add(phon_rep)

    # Emit one "grapheme<TAB>phoneme" line per collected pair.
    for graph_rep, phon_reps in training_data.items():
        for phon_rep in phon_reps:
            click.echo("%s\t%s" % (graph_rep, phon_rep))
예제 #10
0
def articlecollector(path_articles_xml, outpath_articles, articleids):
    """Copy selected pages from an XML dump into a bz2-compressed XML file.

    The input at ``path_articles_xml`` is parsed incrementally. Every page
    element whose id text is contained in ``articleids`` is handed to
    ``create_page``; namespace elements are handed to ``create_namespace``.
    Output goes to ``outpath_articles`` inside a ``mediawiki`` root element.

    Returns:
        The number of pages passed to ``create_page``.

    Raises:
        FileNotFoundError: re-raised after printing the missing file name.
    """
    print("\nCollecting articles for \'%s\' from %s\n..." % (
        wantedCategory, path_articles_xml))
    get_title = etree.ETXPath("child::" + Ttitle)
    get_id = etree.ETXPath("child::" + Tid)
    get_text = etree.ETXPath("child::" + Trev + "/" + Ttext)
    n_extracted = 0
    started = time.time()
    try:
        with BZ2File(outpath_articles, "w", compresslevel=9) as archive, \
                etree.xmlfile(archive, encoding="utf-8") as writer, \
                writer.element("mediawiki", xmlns=Header):
            events = etree.iterparse(path_articles_xml,
                                     events=("end",),
                                     tag={Tnamespaces, Tpage})
            for _, node in events:
                # The id lookup only runs for page elements (short-circuit).
                wanted_page = (node.tag == Tpage
                               and get_id(node)[0].text in articleids)
                if wanted_page:
                    create_page(node, get_title, get_id, get_text,
                                articleids, writer)
                    n_extracted += 1
                elif node.tag == Tnamespaces:
                    create_namespace(node, writer)
                # Clear the handled element and delete the siblings that
                # precede it, so processed nodes do not pile up in the tree.
                node.clear()
                while node.getprevious() is not None:
                    del node.getparent()[0]
    except FileNotFoundError as err:
        print(err.filename, "not found")
        raise err
    finished = time.time()
    printTime(started, finished)
    return n_extracted
예제 #11
0
 def testMixedIterationAndReads(self):
     """Mix next(), read(), readline() and readlines() on the same file."""
     self.createTempFile()
     first_line = self.TEXT_LINES[0]
     linelen = len(first_line)
     halflen = linelen // 2
     # read() half a line, then iteration yields the rest of that line.
     with BZ2File(self.filename) as bz2f:
         bz2f.read(halflen)
         self.assertEqual(next(bz2f), first_line[halflen:])
         self.assertEqual(bz2f.read(), self.TEXT[linelen:])
     # readline() and next() advance through consecutive lines.
     with BZ2File(self.filename) as bz2f:
         bz2f.readline()
         self.assertEqual(next(bz2f), self.TEXT_LINES[1])
         self.assertEqual(bz2f.readline(), self.TEXT_LINES[2])
     # After readlines() has consumed everything, nothing is left.
     with BZ2File(self.filename) as bz2f:
         bz2f.readlines()
         with self.assertRaises(StopIteration):
             next(bz2f)
         self.assertEqual(bz2f.readlines(), [])
예제 #12
0
def _compression_wrapper(file_obj, filename, mode):
    """
    Wrap file_obj in the [de]compressor selected by filename's extension.

    file_obj must be a filehandle object, or a class which behaves like one.
    ``.bz2``, ``.gz`` and ``.xz`` are recognized; for any other extension the
    original file_obj is returned as-is.

    When ``_need_to_buffer`` says so, a warning is issued and the whole of
    file_obj is first read into an in-memory ``io.BytesIO``.
    """
    ext = os.path.splitext(filename)[1]

    if _need_to_buffer(file_obj, mode, ext):
        warnings.warn('streaming gzip support unavailable, see %s' % _ISSUE_189_URL)
        file_obj = io.BytesIO(file_obj.read())

    if ext == '.bz2':
        return BZ2File(file_obj, mode)
    if ext == '.gz':
        return gzip.GzipFile(fileobj=file_obj, mode=mode)
    if ext == '.xz':
        return lzma.LZMAFile(filename=file_obj, mode=mode, format=lzma.FORMAT_XZ)
    # Unknown extension: hand back the object untouched.
    return file_obj
예제 #13
0
def get_url_html(tree, doc_id):
    """Return the JSON record stored for ``doc_id``, or ``'wrong id'``.

    ``id2path(doc_id, tree)`` resolves the id to ``path_info``. Indices 0 and
    1 select the bz2 sample file and index 2 is the 1-based line number
    inside it. That line is parsed as JSON and returned.

    Returns:
        The decoded JSON value, or the string ``'wrong id'`` when the id
        cannot be resolved, the line does not exist, or it cannot be parsed.
    """
    path_info = id2path(doc_id, tree)
    if path_info == 'wrong id':
        return 'wrong id'

    path = ('/home/luocheng/zhengyukun/index_build/sogouTSample/sample_result/sogout_data.' + path_info[0] +
            '.comp/sogout_data.' + path_info[0] + '.comp.part-m-' + path_info[1] + '.sample.bz2')
    # The context manager closes the file on every exit path; previously the
    # early returns inside the loop skipped f.close() and leaked the handle.
    with BZ2File(path, 'r') as f:
        for cnt, line in enumerate(f, 1):
            if cnt == path_info[2]:
                try:
                    return json.loads(line)
                except (ValueError, TypeError):
                    # Undecodable or malformed line: report it as an
                    # unknown id, without hiding unrelated errors.
                    return 'wrong id'
    return 'wrong id'
예제 #14
0
 def testWriteBytesIO(self):
     """Writing through BZ2File into a BytesIO leaves the BytesIO open."""
     with BytesIO() as sink:
         with BZ2File(sink, "w") as bz2f:
             # write() called without an argument must be rejected.
             with self.assertRaises(TypeError):
                 bz2f.write()
             bz2f.write(self.TEXT)
         compressed = sink.getvalue()
         self.assertEqual(self.decompress(compressed), self.TEXT)
         # Closing the BZ2File must not have closed the underlying buffer.
         self.assertFalse(sink.closed)
예제 #15
0
 def wrap_fp(fp):
     """Wrap fp for the enclosing scope's suffix/mode and tag it with realname.

     Depending on ``suffix`` the stream is wrapped in a gzip, bz2 or xz file
     object. For non-binary modes a UTF-8 codec layer is added on top. The
     result gets a ``realname`` attribute set to ``filename``.
     """
     if suffix == ".gz":
         fp = GzipFile(fileobj=fp, mode=mode)
     elif suffix == ".bz2":
         try:
             fp = BZ2File(fp, mode=mode)
         except TypeError:
             if sys.version_info >= (3, 0, 0):
                 raise
             raise NotImplementedError(
                 "built-in BZ2File is partially broken in python 2, install bz2file from pypi or use a compression setting other than 'bz2'"
             )
     elif suffix == ".xz":
         fp = LZMAFile(fp, mode=mode)

     needs_codec = (suffix or sys.version_info < (3, )) and "b" not in mode
     if needs_codec:
         # If mode is not binary (and we expect to be able to
         # write() str values, not bytes), we need to create
         # an additional encoding wrapper. That encoder can
         # probably use UTF-8 without any need for additional
         # configuration
         reading = "r" in mode
         writing = "w" in mode
         if reading and writing:
             fp = StreamReaderWriter(fp, codecs.getreader("utf-8"),
                                     codecs.getwriter("utf-8"))
         elif writing:
             fp = codecs.getwriter("utf-8")(fp)
         elif suffix:
             fp = codecs.getreader("utf-8")(fp)
     fp.realname = filename
     return fp
예제 #16
0
 def testPeek(self):
     """peek() returns a non-empty prefix of TEXT and read() still sees all."""
     self.createTempFile()
     with BZ2File(self.filename) as bz2f:
         peeked = bz2f.peek()
         self.assertNotEqual(len(peeked), 0)
         self.assertTrue(self.TEXT.startswith(peeked))
         # The complete text must still be readable after peeking.
         remaining = bz2f.read()
         self.assertEqual(remaining, self.TEXT)
예제 #17
0
 def testPeekBytesIO(self):
     """Same as the file-based peek test, but reading from a BytesIO."""
     with BytesIO(self.DATA) as source:
         with BZ2File(source) as bz2f:
             peeked = bz2f.peek()
             self.assertNotEqual(len(peeked), 0)
             self.assertTrue(self.TEXT.startswith(peeked))
             # The complete text must still be readable after peeking.
             remaining = bz2f.read()
             self.assertEqual(remaining, self.TEXT)
예제 #18
0
def almost_smart_open(fname, mode='r'):
    """Open fname with the opener that matches its extension.

    ``.bz2`` files are opened with ``BZ2File``, ``.gz`` files with
    ``gzip.open``, and everything else with the built-in ``open``.
    """
    openers = {'.bz2': BZ2File, '.gz': gzip.open}
    ext = path.splitext(fname)[1]
    opener = openers.get(ext, open)
    return opener(fname, mode)
예제 #19
0
def smart_open(fname, mode='r'):
    """Open fname by extension; compressed handles are wrapped in closing().

    ``.bz2`` and ``.gz`` files are opened with ``BZ2File`` / ``gzip.open``
    and returned inside ``contextlib.closing``. Any other file is returned
    directly from the built-in ``open``.
    """
    ext = path.splitext(fname)[1]
    if ext == '.bz2':
        handle = BZ2File(fname, mode)
    elif ext == '.gz':
        handle = gzip.open(fname, mode)
    else:
        return open(fname, mode)
    return closing(handle)
예제 #20
0
 def testOpenDel(self):
     """Open the file 10000 times, dropping each handle without close()."""
     implementation = platform.python_implementation()
     if implementation != "CPython":
         self.skipTest("Test depends on CPython refcounting semantics")
     self.createTempFile()
     for _ in range(10000):
         handle = BZ2File(self.filename)
         # Drop the only reference instead of closing explicitly.
         del handle
예제 #21
0
    def testWritable(self):
        """Check writable() for readers and writers, and after close()."""
        # A reader is not writable, before or after reading.
        reader = BZ2File(BytesIO(self.DATA))
        with reader:
            self.assertFalse(reader.writable())
            reader.read()
            self.assertFalse(reader.writable())
        # Once closed, querying writable() must raise.
        self.assertRaises(ValueError, reader.writable)

        # A file opened with mode "w" is writable until it is closed.
        writer = BZ2File(BytesIO(), "w")
        with writer:
            self.assertTrue(writer.writable())
        self.assertRaises(ValueError, writer.writable)
예제 #22
0
 def testClosedIteratorDeadlock(self):
     """Iterating a closed file raises ValueError and does not block later calls."""
     # Issue #3309: Iteration on a closed BZ2File should release the lock.
     self.createTempFile()
     closed_file = BZ2File(self.filename)
     closed_file.close()
     with self.assertRaises(ValueError):
         next(closed_file)
     # This call will deadlock if the above call failed to release the lock.
     with self.assertRaises(ValueError):
         closed_file.readlines()
예제 #23
0
 def testSeekBackwardsAcrossStreams(self):
     """Seek 150 bytes back from 100 bytes into the second stream."""
     self.createTempFile(streams=2)
     with BZ2File(self.filename) as bz2f:
         # Consume the first copy of TEXT plus 100 bytes of the second.
         remaining = len(self.TEXT) + 100
         while remaining > 0:
             remaining -= len(bz2f.read(remaining))
         # whence=1: offset is relative to the current position.
         bz2f.seek(-150, 1)
         expected = self.TEXT[100 - 150:] + self.TEXT
         self.assertEqual(bz2f.read(), expected)
예제 #24
0
 def testWriteLines(self):
     """writelines() stores all lines and is rejected once the file is closed."""
     with BZ2File(self.filename, "w") as bz2f:
         # writelines() called without an argument must be rejected.
         with self.assertRaises(TypeError):
             bz2f.writelines()
         bz2f.writelines(self.TEXT_LINES)
     # Issue #1535500: Calling writelines() on a closed BZ2File
     # should raise an exception.
     with self.assertRaises(ValueError):
         bz2f.writelines(["a"])
     with open(self.filename, 'rb') as raw:
         compressed = raw.read()
     self.assertEqual(self.decompress(compressed), self.TEXT)
예제 #25
0
 def testFileno(self):
     """fileno() matches the wrapped file's descriptor and fails once closed."""
     self.createTempFile()
     with open(self.filename, 'rb') as rawf:
         bz2f = BZ2File(rawf)
         # Leaving the inner block closes bz2f, as the explicit close() did.
         with bz2f:
             self.assertEqual(bz2f.fileno(), rawf.fileno())
     self.assertRaises(ValueError, bz2f.fileno)
예제 #26
0
 def best_guess_open(file_name):
     """
     Open file_name with an opener chosen from its suffix.

     Names ending in '.bz2' use BZ2File, names ending in '.gz' use
     gzip.open, and anything else uses the regular open."""
     if file_name.endswith('.bz2'):
         opener = BZ2File
     elif file_name.endswith('.gz'):
         opener = gzip.open
     else:
         opener = open
     return opener(file_name)
예제 #27
0
 def testReadChunk10MultiStream(self):
     """Read a five-stream file in 10-byte requests and compare the result."""
     self.createTempFile(streams=5)
     with BZ2File(self.filename) as bz2f:
         # iter() with a sentinel keeps calling read(10) until it returns b"".
         chunks = list(iter(lambda: bz2f.read(10), b""))
         self.assertEqual(b"".join(chunks), self.TEXT * 5)
예제 #28
0
 def testWriteChunks10(self):
     """Write TEXT in consecutive 10-byte slices and check the round trip."""
     with BZ2File(self.filename, "w") as bz2f:
         for start in range(0, len(self.TEXT), 10):
             bz2f.write(self.TEXT[start:start + 10])
     with open(self.filename, 'rb') as raw:
         compressed = raw.read()
     self.assertEqual(self.decompress(compressed), self.TEXT)
예제 #29
0
    def test_can_read_multistream_bz2(self):
        """A five-stream bz2 file must read back as TEXT repeated five times."""
        if PY2:
            # this is a backport from Python 3
            from bz2file import BZ2File
        else:
            from bz2 import BZ2File

        test_file = self.create_temp_bz2(streams=5)
        try:
            with BZ2File(test_file) as bz2f:
                self.assertEqual(bz2f.read(), self.TEXT * 5)
        finally:
            # Clean up even when the read or the assertion fails; previously
            # a failing test left the temporary file behind.
            self.cleanup_temp_bz2(test_file)
예제 #30
0
 def testContextProtocol(self):
     f = None
     with BZ2File(self.filename, "wb") as f:
         f.write(b"xxx")
     f = BZ2File(self.filename, "rb")
     f.close()
     try:
         with f:
             pass
     except ValueError:
         pass
     else:
         self.fail("__enter__ on a closed file didn't raise an exception")
     try:
         with BZ2File(self.filename, "wb") as f:
             1 / 0
     except ZeroDivisionError:
         pass
     else:
         self.fail("1/0 didn't raise an exception")