Example #1
0
def universal_write_open(path, mode, buffering=-1, encoding=None, errors=None, newline=None,
                         compresslevel=9, format=None, check=-1, preset=None, filters=None,
                         compression=None):
    """Open *path* for writing, optionally wrapped in a compression codec.

    ``compression`` selects the codec: ``None`` (plain file), ``'gz'``/``'gzip'``,
    ``'lzma'``/``'xz'`` or ``'bz2'``.  The remaining keyword arguments are passed
    through to the matching ``open`` implementation; an unknown codec raises
    ``ValueError``.
    """
    # pylint: disable=unexpected-keyword-arg,no-member
    if compression is None:
        # No codec requested: defer to the plain built-in open().
        return builtins.open(path, mode=mode, buffering=buffering, encoding=encoding,
                             errors=errors, newline=newline)
    if compression == 'gz' or compression == 'gzip':
        if six.PY2:
            # Python 2's gzip.open() has no text-mode keyword arguments.
            return gzip.open(path, mode=mode, compresslevel=compresslevel)
        return gzip.open(path, mode=mode, compresslevel=compresslevel,
                         encoding=encoding, errors=errors, newline=newline)
    if compression == 'lzma' or compression == 'xz':
        try:
            import lzma
        except ImportError:
            # Python 2 fallback: the backports.lzma package.
            from backports import lzma
        return lzma.open(path, mode=mode, format=format, check=check, preset=preset,
                         filters=filters, encoding=encoding, errors=errors, newline=newline)
    if compression == 'bz2':
        # bz2file supplies .open() on Python 2 / PyPy where stdlib bz2 lacks it.
        if six.PY2 or '__pypy__' in sys.builtin_module_names:
            import bz2file as bz2  # pylint: disable=import-error
        else:
            import bz2
        return bz2.open(path, mode=mode, compresslevel=compresslevel, encoding=encoding,
                        errors=errors, newline=newline)
    raise ValueError(
        'compression must be None, gz, gzip, lzma, or xz and was {0}'.format(compression))
Example #2
0
	def bz2(self):
		"""Print the UTF-8-decoded contents of ``self.path`` when the
		configured format is bz2.

		Each line is read as bytes from the compressed stream, decoded,
		and echoed to stdout.  Other formats are silently ignored.
		"""
		# BUG FIX: the original condition used assignment (=) instead of
		# equality (==), which is a syntax error in Python.
		if self.format == "bz2":
			with bz2file.open(self.path, 'r') as F:
				for line in F:
					line = line.decode('utf-8')
					print(line)
Example #3
0
def open_files():
    """Open the compressed wiki dump for reading and the parse-results file
    for writing; both paths come from the module-level ``config`` object.

    Returns a ``(wiki_bz_file, output_file)`` tuple.
    """
    compressed_input = bz2file.open(config.wiki_file, mode="rt", encoding='utf8')
    parsed_output = open(config.parsed_names_and_alternats_file,
                         mode="w", encoding='utf8')
    return compressed_input, parsed_output
Example #4
0
def universal_write_open(path, mode, buffering=-1, encoding=None, errors=None,
                         newline=None, compresslevel=9, format=None, check=-1,
                         preset=None, filters=None, compression=None):
    """Open *path* for writing through an optional compression codec.

    Supported ``compression`` values: ``None`` (plain file),
    ``'gz'``/``'gzip'``, ``'lzma'``/``'xz'`` and ``'bz2'``; anything else
    raises ``ValueError``.  All other keyword arguments are forwarded to the
    selected opener.
    """
    # pylint: disable=unexpected-keyword-arg,no-member
    if compression is None:
        return builtins.open(path, mode=mode, buffering=buffering,
                             encoding=encoding, errors=errors, newline=newline)
    if compression in ('gz', 'gzip'):
        if six.PY2:
            # Python 2 gzip.open() accepts no text-mode keywords.
            return gzip.open(path, mode=mode, compresslevel=compresslevel)
        return gzip.open(path, mode=mode, compresslevel=compresslevel,
                         errors=errors, newline=newline, encoding=encoding)
    if compression in ('lzma', 'xz'):
        try:
            import lzma
        except ImportError:
            from backports import lzma  # Python 2 backport
        return lzma.open(path, mode=mode, format=format, check=check,
                         preset=preset, filters=filters, encoding=encoding,
                         errors=errors, newline=newline)
    if compression == 'bz2':
        # bz2file provides .open() on Python 2 / PyPy, where stdlib bz2 lacks it.
        if six.PY2 or '__pypy__' in sys.builtin_module_names:
            import bz2file as bz2  # pylint: disable=import-error
        else:
            import bz2
        return bz2.open(path, mode=mode, compresslevel=compresslevel,
                        encoding=encoding, errors=errors, newline=newline)
    raise ValueError(
        'compression must be None, gz, gzip, lzma, or xz and was {0}'.format(compression))
def get_wiki2():
    """Filter a zh-wiki dump and write ``title|text`` lines to wiki-tw.csv.

    Pages whose title looks like a namespace (``Foo:...``) or whose body is a
    redirect (``#...``) are skipped.  Output is buffered and flushed every
    1000 accepted pages, with a final flush for the remainder.
    """
    # Strips ASCII letters/digits/punctuation (including '|', which is the
    # output field separator).
    reobj1 = re.compile(
        r"[ `~!@#$%^&*\(\)-_=+\[\]\{\}\\\|;:\'\",<.>/?a-zA-Z\d]+")
    reobj2 = re.compile(r"\n+")  # collapse newline runs
    # Empty CJK bracket pairs and runs of repeated CJK punctuation.
    reobj3 = re.compile("(())|(“”)|(「」)|(《》)|(“”)|(‘’)|(【】)|[,。?——!]{2,}")
    reuseful = re.compile('^[a-zA-Z]+:')  # namespace titles, e.g. "File:"
    redirect = re.compile(r"^#")          # redirect pages start with '#'

    def wiki_replace(s):
        # Strip wiki markup, then apply the cleanup regexes above.
        s = filter_wiki(s)
        s = reobj1.sub("", s)  # remove '|' (and other ASCII) for the Aliyun upload
        s = reobj2.sub("#", s)
        s = reobj3.sub("", s)
        return s

    # extract_pages yields (title, text, ...) tuples from the dump stream.
    wiki = extract_pages(bz2file.open('zhwiki-latest-pages-articles.xml.bz2'))
    with codecs.open('wiki-tw.csv', 'w', encoding='utf-8') as f:
        i = 0
        filelist = []
        for d in tqdm(wiki):
            if not reuseful.findall(d[0]) and not redirect.findall(d[1]):
                i += 1
                filelist.append(
                    reobj1.sub("", d[0]) + "|" + wiki_replace(d[1]) + "\n")
                # Flush in batches of 1000 to limit memory.
                if i % 1000 == 0:
                    s = ("".join(filelist))
                    f.write(s)
                    filelist = []
        # Flush any remaining (< 1000) buffered pages before closing.
        if filelist:
            s = ("".join(filelist))
            f.write(s)
Example #6
0
 def get_file(self):
     """Fetch this resource (downloading it if not already cached) and
     return a text-mode stream that transparently decompresses the bz2
     payload."""
     cached_path = download.retrieve(self)
     return bz2.open(cached_path, mode='rt')
Example #7
0
def preprocess_sentence():
    """Extract up to 10 zh-wiki articles, convert them to simplified Chinese,
    and write one cleaned sentence per line to zhwiki_sentence.txt."""
    i = 0
    line = ''
    wiki = extract_pages(
        bz2file.open('./zhwiki-20190301-pages-articles.xml.bz2')
    )  # gensim's extract_pages yields each page from the dump
    with open('./zhwiki_sentence.txt', 'w') as f:
        for text in wiki:
            if not re.findall('^[a-zA-Z]+:',
                              text[0]) and text[0] and not re.findall(
                                  u'^#', text[1]):  # skip help/namespace pages and redirects
                converted = opencc.convert(text[1]).strip()  # traditional -> simplified
                converted = re.sub('\|\w*\]', '', converted)
                for x in converted:
                    # Keep only 3-byte UTF-8 chars (CJK range) that are not
                    # in the stop-punctuation list.
                    if len(x.encode(
                            'utf-8')) == 3 and x not in stop_punctuation(
                                './stop_punctuation.txt'):
                        line += x
                    if x in ['\n', '。', '?', '!', ',', ';', ':'
                             ] and line != '\n':  # split lines on common CJK punctuation
                        f.write(line.strip() + '\n')  # one sentence per line
                        line = ''
                i += 1
            if i == 10:
                print("选取中文维基百科的文章篇数:", i)
                break
def decomp_burrows_wheeler(filename):
    """Decompress *filename* with bz2 and write the result through
    ``bz2file.open`` to ``filename + '.bz'``.

    The whole input is read into memory and passed through
    ``bz2.decompress``; the ``with`` statement closes both files even
    when an error occurs.
    """
    outfilename = filename + '.bz'
    # BUG FIX: the original wrapped this in try/finally calling out.close();
    # if open() raised, `out` was unbound and the finally masked the real
    # error with a NameError.  The with-statement already guarantees both
    # handles are closed.  (An unused start_time local was also dropped.)
    with open(filename, 'rb') as f, bz2file.open(outfilename, 'wb') as out:
        out.write(bz2.decompress(bytes(f.read())))
Example #9
0
def tweet_num_bz2(file_list):
    """Return a list holding the number of lines in each bz2 file of
    *file_list* (same order as the input)."""
    import bz2file
    counts = []
    for path in file_list:
        handle = bz2file.open(path, 'r')
        counts.append(len(handle.readlines()))
        handle.close()
    return counts
Example #10
0
 def read(self):
     """Open ``self.path`` with the stored bz2 options and return the
     entire decompressed contents in one call."""
     options = dict(mode=self.mode,
                    compresslevel=self.compresslevel,
                    encoding=self.encoding,
                    errors=self.errors,
                    newline=self.newline)
     with bz2.open(self.path, **options) as stream:
         return stream.read()
Example #11
0
def _open_bz2(filename, mode):
    """Open a bz2 file, using ``bz2.open`` on Python 3 and ``BZ2File`` on
    Python 2.

    Raises ``ImportError`` when the bz2 module is unavailable and
    ``ValueError`` for append mode on Python 2 (BZ2File cannot append).
    """
    if bz2 is None:
        raise ImportError("Cannot open bz2 files: The bz2 module is not available")
    if _PY3:
        return bz2.open(filename, mode)
    # Python 2 path: BZ2File has no append support.
    if mode[0] == 'a':
        raise ValueError("Mode '{}' not supported with BZ2 compression".format(mode))
    return bz2.BZ2File(filename, mode)
def get_wiki():
    """Convert a zh-wiki dump to simplified Chinese and write it to wiki.csv.

    Pages with namespace titles (``Foo:...``) or redirect bodies (``#...``)
    are skipped.  Accepted pages are buffered and cleaned/written through
    ``wiki_replace`` in batches of 100, with a final flush for the remainder.
    """
    from opencc import OpenCC
    # Cleanup rules follow the annotations at https://kexue.fm/archives/4176
    opencc1 = OpenCC("t2s")
    resub1 = re.compile(':*{\|[\s\S]*?\|}')
    resub2 = re.compile('<gallery>[\s\S]*?</gallery>')
    resub3 = re.compile('(.){{([^{}\n]*?\|[^{}\n]*?)}}')
    resub4 = re.compile('\* *\n|\'{2,}')
    resub5 = re.compile('\n+')
    resub6 = re.compile('\n[:;]|\n +')
    resub7 = re.compile('\n==')

    refind1 = re.compile('^[a-zA-Z]+:')
    refind2 = re.compile('^#')

    p1 = re.compile(r'-\{.*?(zh-hans|zh-cn):([^;]*?)(;.*?)?\}-')
    p2 = re.compile(r'[(\(][,;。?!\s]*[)\)]')
    p3 = re.compile(r'[「『]')
    p4 = re.compile(r'[」』]')

    def wiki_replace(s):
        # Strip wiki markup, normalize whitespace/quotes, then convert t2s.
        s = filter_wiki(s)
        s = resub1.sub('', s)
        s = resub2.sub('', s)
        s = resub3.sub('\\1[[\\2]]', s)
        s = resub4.sub('', s)
        s = resub5.sub('\n', s)
        s = resub6.sub('\n', s)
        s = resub7.sub('\n\n==', s)
        s = p1.sub(r'\2', s)
        s = p2.sub(r'', s)
        s = p3.sub(r'“', s)
        s = p4.sub(r'”', s)
        return opencc1.convert(s).strip()

    wiki = extract_pages(
        bz2file.open(
            r'E:\02program\python\nlp\data\corpus\zhwiki-latest-pages-articles.xml.bz2'
        ))

    with codecs.open('wiki.csv', 'w', encoding='utf-8') as f:
        i = 0
        filelist = []
        for d in tqdm(wiki):
            if not refind1.findall(d[0]) and d[0] and not refind2.findall(
                    d[1]):
                filelist.append(d[0] + "\n" + d[1])
                i += 1
                if i % 100 == 0:
                    f.write(wiki_replace("\n\n".join(filelist)))
                    filelist = []
        # BUG FIX: articles still buffered after the loop (i % 100 != 0)
        # were previously discarded; flush them before the file closes.
        if filelist:
            f.write(wiki_replace("\n\n".join(filelist)))
Example #13
0
 def __iter__(self):
     """Lazily yield lines from the bz2 stream configured on this object."""
     with bz2.open(self.path,
                   mode=self.mode,
                   compresslevel=self.compresslevel,
                   encoding=self.encoding,
                   errors=self.errors,
                   newline=self.newline) as stream:
         for record in stream:
             yield record
def burrows_wheeler(filename, level):
    """Compress *filename* with bz2 at *level* into ``filename + '.bz'``,
    then (always, even on failure) report statistics via ``comprsn_details``."""
    start_time = time.time()
    outfilename = filename + '.bz'
    try:
        with open(filename, 'rb') as src, \
                bz2file.open(outfilename, 'wb', level) as dst:
            dst.write(bz2.compress(bytes(src.read())))
    finally:
        print("closing")
        comprsn_details(filename, outfilename, start_time)
Example #15
0
    def read(self):
        """Return the full decompressed contents of ``self.path``.

        Uses the external ``bz2file`` package on Python 2 and PyPy, where
        the stdlib bz2 module lacks ``open()``; stdlib ``bz2`` elsewhere.
        """
        if six.PY2 or '__pypy__' in sys.builtin_module_names:
            import bz2file as bz2  # pylint: disable=import-error
        else:
            import bz2

        with bz2.open(self.path,
                      mode=self.mode,
                      compresslevel=self.compresslevel,
                      encoding=self.encoding,
                      errors=self.errors,
                      newline=self.newline) as stream:
            return stream.read()
Example #16
0
def main():
    """Parse the (hard-coded) Wikipedia multistream dump with a SAX handler.

    Note: argv is only checked for arity -- the input filename below is
    fixed regardless of the arguments given.
    """
    if len(sys.argv) < 3:
        print("Use: python2.7 wiki.py input.xml output.txt")
        return
    saxparser = make_parser()
    saxparser.setContentHandler(wikihandler())
    dump = bz2file.open("enwiki-latest-pages-articles-multistream.xml.bz2",
                        "r")
    saxparser.parse(dump)
Example #17
0
def load_twarr_from_bz2(bz2_file):
    """Read a bz2 file of line-delimited JSON tweets and return them as a list.

    Lines that fail to decode or parse are reported to stdout and skipped.
    """
    fp = bz2file.open(bz2_file, 'r')
    twarr = list()
    try:
        for line in fp.readlines():
            try:
                twarr.append(json.loads(line.decode('utf8')))
            except Exception:
                # BUG FIX: the original bare `except:` also swallowed
                # KeyboardInterrupt/SystemExit, and its message concatenated
                # str + bytes ('...' + line), which itself raises TypeError.
                print('Error when parsing {}: {!r}'.format(bz2_file, line))
    finally:
        fp.close()
    return twarr
Example #18
0
 def _extract_article_onebyone(self):
     """Yield (title, text, ...) pages from the downloaded wiki dump,
     skipping namespace pages (titles like ``Foo:...``) and redirects
     (text starting with ``#``).  Progress is shown via tqdm."""
     wiki_pages = extract_pages(
         bz2file.open(self.download_wiki_articles_dump()))
     counter = 0
     w = tqdm(wiki_pages, desc=u'get 0 article')
     for d in w:
         # d[0] is the page title, d[1] the page text.
         if not re.findall('^[a-zA-Z]+:', d[0]) and d[0] and not re.findall(
                 u'^#', d[1]):
             yield d
             counter += 1
         # Refresh the progress label every 100 accepted articles.
         if counter % 100 == 0:
             w.set_description(u'processed %s article' % counter)
Example #19
0
    def read(self):
        """Return the decompressed contents of ``self.path`` in one call.

        Picks the external ``bz2file`` package on Python 2 / PyPy (their
        stdlib bz2 has no ``open()``), otherwise the standard bz2 module.
        """
        if six.PY2 or '__pypy__' in sys.builtin_module_names:
            import bz2file as bz2  # pylint: disable=import-error
        else:
            import bz2

        with bz2.open(self.path, mode=self.mode, compresslevel=self.compresslevel,
                      encoding=self.encoding, errors=self.errors,
                      newline=self.newline) as data:
            return data.read()
Example #20
0
def universal_write_open(path, mode, buffering=-1, encoding=None, errors=None,
                         newline=None, compresslevel=9, format=None, check=-1,
                         preset=None, filters=None, compression=None):
    """Open *path* for writing with an optional compression codec.

    ``compression`` may be ``None``, ``'gz'``/``'gzip'``, ``'lzma'``/``'xz'``
    or ``'bz2'``; any other value raises ``ValueError``.  This variant relies
    on module-level ``lzma`` and ``bz2`` imports.
    """
    # pylint: disable=unexpected-keyword-arg
    if compression is None:
        return builtins.open(path, mode=mode, buffering=buffering,
                             encoding=encoding, errors=errors, newline=newline)
    if compression in ('gz', 'gzip'):
        if six.PY2:
            # Python 2 gzip.open() rejects text-mode keywords.
            return gzip.open(path, mode=mode, compresslevel=compresslevel)
        return gzip.open(path, mode=mode, compresslevel=compresslevel,
                         errors=errors, newline=newline, encoding=encoding)
    if compression in ('lzma', 'xz'):
        return lzma.open(path, mode=mode, format=format, check=check,
                         preset=preset, filters=filters, encoding=encoding,
                         errors=errors, newline=newline)
    if compression == 'bz2':
        return bz2.open(path, mode=mode, compresslevel=compresslevel,
                        encoding=encoding, errors=errors, newline=newline)
    raise ValueError(
        'compression must be None, gz, gzip, lzma, or xz and was {0}'.format(compression))
Example #21
0
    def test_to_file_compressed(self):
        """Round-trip the sequence through gzip-, lzma- and bz2-compressed
        files and check the serialized form each time."""
        tmp_path = 'functional/test/data/tmp/output.txt'
        sequence = self.seq(1, 2, 3, 4)
        for codec, opener in (('gzip', gzip.open),
                              ('lzma', lzma.open),
                              ('bz2', bz2.open)):
            sequence.to_file(tmp_path, compression=codec)
            with opener(tmp_path, 'rt') as output:
                self.assertEqual('[1, 2, 3, 4]', output.readlines()[0])
Example #22
0
def iter_entities_from(filename, use_list=False):
    """Stream entities out of a bz2-compressed wikipedia n-triples file.

    Consecutive triples sharing a resource are grouped into one dict of
    ``{predicate: values}`` (plus the ``'resource'`` key) and yielded per
    entity.  Values are collected in lists when *use_list* is true,
    otherwise in sets.
    """
    print('reading', filename)

    with bz2file.open(filename, 'rt') as dump:
        prev_resource = None
        values = None

        for raw in dump:
            # Comment lines in the dump start with '#'.
            if raw.startswith('#'):
                continue

            triple = get_parts(to_unicode_or_bust(raw).strip())
            if not triple:
                continue

            resource = triple['resource']

            # A new resource closes the previous entity.
            if resource != prev_resource:
                if prev_resource != None:
                    yield values
                values = {'resource': resource}
                prev_resource = resource

            obj = triple['object']
            if not obj:
                continue

            key = triple['predicate']
            bucket = values.setdefault(key, [] if use_list else set())
            if use_list:
                bucket.append(obj)
            else:
                bucket.add(obj)

        # Emit the final entity, if any triples were seen at all.
        if values is not None:
            yield values
Example #23
0
def get_file_writer(file_handle, do_gzip, do_bzip):
    """Generate and return a file object with specified compression.

    Wraps *file_handle* in a gzip or bz2 writer depending on the flags;
    with neither flag set the handle is returned unchanged.  Setting both
    flags raises ``ValueError``.
    """
    if do_gzip and do_bzip:
        raise ValueError("Cannot specify both bzip and gzip compression!")

    if do_gzip:
        return gzip.GzipFile(fileobj=file_handle, mode='w')
    if do_bzip:
        return bz2file.open(file_handle, mode='w')
    return file_handle
Example #24
0
def get_file_writer(file_handle, do_gzip, do_bzip):
    """Generate and return a file object with specified compression.

    Wraps *file_handle* in a gzip or bz2 writer depending on the flags;
    with neither flag set the handle is returned unchanged.

    Raises:
        ValueError: if both compression flags are set.
    """
    ofile = None

    if do_gzip and do_bzip:
        # CONSISTENCY FIX: raise ValueError, as the sibling implementation in
        # this file does, instead of a bare Exception.  ValueError is a
        # subclass of Exception, so existing callers still catch it.
        raise ValueError("Cannot specify both bzip and gzip compression!")

    if do_gzip:
        ofile = gzip.GzipFile(fileobj=file_handle, mode='w')
    elif do_bzip:
        ofile = bz2file.open(file_handle, mode='w')
    else:
        ofile = file_handle

    return ofile
Example #25
0
 def wiki_bz_process(self,language):
     """Extract pages from the bz2 corpus at ``self.corpus`` and write the
     cleaned text to wiki.txt next to it.

     Only ``language == 'zh'`` is implemented; 'en' is a no-op stub.
     NOTE(review): the output file handle `f` is never closed here, and
     `i += 1` overwrites the enumerate counter (it is reset by enumerate on
     the next iteration, so the %100 progress check uses the raw index).
     """
     wiki = extract_pages(bz2file.open(self.corpus))
     f = codecs.open(os.path.join(os.path.dirname(self.corpus), 'wiki.txt'),
                     'w', encoding='utf-8')
     w = tqdm(wiki, desc="Currently get 0 files!")
     if language=='zh':
         for i, d in enumerate(w):
             # Skip namespace pages ("Foo:") and redirects ("#...").
             if not re.findall('^[a-zA-Z]+:', d[0]) and not re.findall(u'^#', d[1]):
                 s = self.wiki_replace(d)
                 f.write(s + '\n\n\n')
                 i += 1
                 if i % 100 == 0:
                     w.set_description('Currently got %s files' % i)
     elif language=='en':
         pass
Example #26
0
def bz2_vocab_reader(fpath):
    """Yield ``(word, embedding)`` pairs from a bz2-compressed vector file.

    The first line (header) is read and ignored; iteration stops at the
    first empty line.  Lines that fail to parse are silently skipped.
    The embedding is the last 300 fields converted to float.
    """
    with bz2file.open(fpath, 'r') as f:
        f.readline()  # discard the header line
        while True:
            raw = f.readline().decode('utf-8').strip()
            if raw == "":
                break
            try:
                fields = raw.split()
                token = fields[0].strip()
                vector = [float(v) for v in fields[-300:]]
            except Exception:
                continue
            yield token, vector
Example #27
0
def smart_open(filename):
    '''
    Returns an open file object if `filename` is plain text, else assumes
    it is a gzip or bzip2 compressed file and returns a file-like object
    to handle it.

    Raises:
        ValueError: if the file is compressed in an unsupported format.
    '''
    if isplaintext(filename):
        return open(filename, 'rt')
    file_type = mimetype(filename)
    if file_type.find('gzip') > -1:
        # BUG FIX: gzip.GzipFile does not accept text mode 'rt'; gzip.open
        # does, and returns an equivalent text-mode stream.
        return gzip.open(filename, 'rt')
    if file_type.find('bzip2') > -1:
        return bz2file.open(filename, 'rt')
    # BUG FIX: the original fell through with `pass` and then returned the
    # unbound name `f`, raising NameError; fail explicitly instead.
    raise ValueError('Unsupported file format: {0}'.format(file_type))
Example #28
0
def wiki_process(input_file, save_path):
    """Extract pages from the bz2 wiki dump *input_file*, clean each with the
    external ``wiki_replace``, and append them to *save_path*."""
    # Parse pages out of the dump with gensim's wikicorpus helpers.
    wiki = extract_pages(bz2file.open(input_file))
    # Clean and export each accepted page.
    i = 0
    f = codecs.open(save_path, 'w', encoding='utf-8')
    w = tqdm(wiki, desc=u'已获取0篇文章')
    # NOTE(review): openCC is assigned but not used in this function --
    # presumably wiki_replace handles the conversion; confirm before removing.
    openCC = OpenCC('t2s')
    for d in w:
        # Skip namespace pages ("Foo:" titles) and redirect pages ("#...").
        if not re.findall('^[a-zA-Z]+:', d[0]) and d[0] and not re.findall(
                u'^#', d[1]):
            s = wiki_replace(d)
            f.write(s + '\n\n\n')
            i += 1
            # Refresh the progress label every 100 articles.
            if i % 100 == 0:
                w.set_description(u'已获取%s篇文章' % i)

    f.close()
Example #29
0
def iter_entities_from(filename):
    """
    bz2 wikipedia n-triples import.
    for each entity returns a set of values for each defined attribute in the filename.
    """
    print("reading", filename)

    with bz2file.open(filename, "rt") as dump:
        # Triples for one resource are assumed contiguous in the dump;
        # a change of resource closes the current entity.
        prev_resource = None
        values = None

        for l in dump:
            # Skip comment lines.
            if l.startswith("#"):
                continue

            l = to_unicode_or_bust(l).strip()

            # get_parts splits one n-triple into resource/predicate/object.
            parts = get_parts(l)
            if not parts:
                continue

            resource = parts["resource"]

            if prev_resource != resource:
                # New resource: emit the completed previous entity.
                if prev_resource != None:
                    yield values

                values = {"resource": resource}
                prev_resource = resource

            value = parts["object"]
            if not value:
                continue

            key = parts["predicate"]

            # Collect all objects per predicate into a set.
            if not key in values:
                values[key] = set()

            values[key].add(value)

        # Emit the final entity, if any triples were seen.
        if values is not None:
            yield values
Example #30
0
def load_single_file(file_str):
    """Scan one bz2 Twitter archive for tweets mentioning 'blockchain'.

    Returns:
        (textset, retweet, favorite): a list of cleaned token lists for the
        matching tweets, plus their summed retweet and favorite counts.
    """
    textset = []
    retweet = 0
    favorite = 0
    # Strips @mentions, non-alphanumerics and URLs before tokenizing.
    cleanr = re.compile('(@[A-Za-z0-9]+)|([^0-9A-Za-z\t])|(\w+:\/\/\S+)')
    # BUG FIX: the original never closed the bz2 handle; the with-block does.
    with bz2file.open(file_str, 'r') as infile:
        lines = infile.readlines()
    for data in map(json.loads, lines):
        if 'text' in data:
            try:
                if 'blockchain' in data['text'].lower():
                    textset.append(re.sub(cleanr, ' ', data['text']).split())
                    retweet += data['retweet_count']
                    favorite += data['favorite_count']
            except Exception:
                # BUG FIX: was a bare `except:`, which also swallowed
                # KeyboardInterrupt/SystemExit; keep the best-effort skip
                # (e.g. missing count fields) for ordinary errors only.
                pass
    return textset, retweet, favorite
Example #31
0
    def __init__(self,filePath,headerSymbols=['@','+']):
        """
        Returns a read-by-read fastQ parser analogous to file.readline().

        Transparently opens .gz (gzip) and .bz2 (bz2file) inputs; any other
        extension is opened as plain text.

        Exmpl: parser.next()
        -OR-
        Its an iterator so you can do:
        for rec in parser:
            ... do something with rec ...

        rec is tuple: (seqHeader,seqStr,qualHeader,qualStr)

        NOTE(review): the mutable default ``headerSymbols=['@','+']`` is
        shared across calls -- harmless as long as it is never mutated.
        """
        # Pick the opener from the file extension.
        if filePath.endswith('.gz'):
            self._file = gzip.open(filePath)
        elif filePath.endswith('.bz2'):
            self._file = bz2file.open(filePath, 'rt')
        else:
            self._file = open(filePath, 'r+')
        # Running line counter used for error reporting / record boundaries.
        self._currentLineNumber = 0
        # Expected prefix characters of the sequence and quality headers.
        self._hdSyms = headerSymbols
Example #32
0
def read_fq_list(filename):
    """Yield FASTQ records from *filename* as 4-line lists.

    Transparently opens .gz and .bz2 inputs; every four consecutive lines
    (stripped of surrounding whitespace) form one record.  Note the final
    record is yielded even if the file is empty (an empty list then).
    """
    if filename.endswith(".gz"):
        handle = gzip.open(filename)
    elif filename.endswith(".bz2"):
        handle = bz2file.open(filename)
    else:
        handle = open(filename)
    record = []
    for idx, raw in enumerate(handle):
        stripped = raw.strip()
        if idx % 4:
            record.append(stripped)
        else:
            # A record boundary: emit the previous record, start a new one.
            if record:
                yield record
            record = [stripped]
    handle.close()
    yield record
Example #33
0
def loadBz2fileDataSet(filePath,
                       symbol,
                       low=-1,
                       high=1000):
    """Load a *symbol*-delimited numeric matrix from a bz2-compressed file.

    With ``low < 0`` every line is parsed; with ``low >= 0`` only the
    1-based line range [low, high] is kept -- a memory-saving but slower
    way of slicing the file.

    Returns:
        list of ``map(float, fields)`` objects, one per kept line (each
        field list is materialized up front, so laziness is safe).
    """
    dataMat = []
    # BUG FIX: the file handle was never closed; the with-block guarantees it.
    with bz2file.open(filePath, "r") as fr:
        if low < 0:
            for line in fr.readlines():
                lineArr = line.strip().split(symbol)
                dataMat.append(map(float, lineArr))
        else:
            count = 1
            for line in fr.readlines():
                if count >= low:
                    lineArr = line.strip().split(symbol)
                    dataMat.append(map(float, lineArr))
                if count == high:
                    break
                count += 1
    return dataMat
def get_wiki2():
    """Clean a zh-wiki dump, convert it to simplified Chinese with OpenCC,
    and write ``title|text`` lines to wiki-tw.csv in batches of 1000 pages.

    Namespace pages (``Foo:`` titles) and redirects (text starting ``#``)
    are skipped; pages whose cleanup raises are logged and dropped.
    """
    from opencc import OpenCC
    # Cleanup scheme follows the annotations at https://kexue.fm/archives/4176
    opencc1 = OpenCC("t2s")
    # Strips ASCII letters/digits/punctuation (including '|', the separator).
    reobj1 = re.compile(
        "[ `~!@#$%^&*\(\)-_=+\[\]\{\}\\\|;:\'\",<.>/?a-zA-Z\d]+")
    reobj2 = re.compile(r"\n+")
    reobj3 = re.compile("(())|(“”)|(「」)|(《》)|(“”)|(‘’)|(【】)|[,。?——!]{2,}")
    reuseful = re.compile('^[a-zA-Z]+:')
    redirect = re.compile(r"^#")

    def wiki_replace(s):
        s = filter_wiki(s)
        s = reobj1.sub("", s)  # remove '|' (and other ASCII) for the Aliyun upload
        s = reobj2.sub("#", s)
        s = reobj3.sub("", s)
        return opencc1.convert(s).strip()

    wiki = extract_pages(
        bz2file.open(
            r'E:\02program\python\nlp\data\corpus\zhwiki-latest-pages-articles.xml.bz2'
        ))
    with codecs.open('wiki-tw.csv', 'w', encoding='utf-8') as f:
        i = 0
        filelist = []
        for d in tqdm(wiki):
            if not reuseful.findall(d[0]) and not redirect.findall(d[1]):
                i += 1
                try:
                    filelist.append(
                        reobj1.sub("", d[0]) + "|" + wiki_replace(d[1]) + "\n")
                except Exception:
                    print(d[0], '=' * 10, d[1])
                # Flush only when a new page has just been accepted (the
                # original re-checked this for every skipped page too).
                if i % 1000 == 0:
                    f.write("".join(filelist))
                    filelist = []
        # BUG FIX: the final flush was placed OUTSIDE the with-block, so any
        # trailing batch was written to an already-closed file and raised
        # ValueError -- flush while the file is still open.
        if filelist:
            f.write("".join(filelist))
Example #35
0
    def documents(self):
        """Iterates over pages in the wikinews archive.

    Yields:
      docid, title, url, curid, revid, wiki_doc
    """
        logging.info("Extracting docs from [%s]", self._wikinews_archive)
        # The archive attribute is a glob pattern; it must match something.
        file_list = glob.glob(self._wikinews_archive)
        assert file_list, self._wikinews_archive
        for archive in file_list:
            with bz2file.open(archive, "rt", encoding="utf-8",
                              errors="strict") as xf:
                # One line of json as produced by wikiextractor.
                for line in xf:
                    record = json.loads(line)

                    # Extract page title.
                    title = html.unescape(record["title"])
                    curid = record["id"]  # pageid, e.g. 73052
                    revid = record["revid"]  # revision, e.g. 730271
                    url = record[
                        "url"]  # e.g. https://de.wikinews.org/wiki?curid=73052
                    logging.debug("Got title: %s", title)

                    # Skip pages that don't have text.
                    wiki_doc = record.get("text")
                    if not wiki_doc:
                        self._counter["no text"] += 1
                        self._filtered_no_text.append(url)
                        logging.debug("Skip: no text element")
                        continue

                    # Apply manual fixes.
                    wiki_doc = _apply_wiki_overrides(wiki_doc, url)

                    # Create internal document identifier.
                    docid = f"{self._language}-{curid}"
                    logging.debug("Found (%s): %s", docid, title)
                    self._counter["found"] += 1

                    yield docid, title, url, curid, revid, wiki_doc
Example #36
0
def universal_write_open(path, mode, buffering=-1, encoding=None, errors=None, newline=None,
                         compresslevel=9, format=None, check=-1, preset=None, filters=None,
                         compression=None):
    """Open *path* for writing, wrapped in the codec named by ``compression``.

    ``None`` opens a plain file; ``'gz'``/``'gzip'``, ``'lzma'``/``'xz'`` and
    ``'bz2'`` select the matching module-level compressor.  Any other value
    raises ``ValueError``.
    """
    # pylint: disable=unexpected-keyword-arg
    if compression is None:
        return builtins.open(path, mode=mode, buffering=buffering,
                             encoding=encoding, errors=errors, newline=newline)
    if compression in ('gz', 'gzip'):
        # Python 2's gzip.open() rejects the text-mode keywords.
        if six.PY2:
            return gzip.open(path, mode=mode, compresslevel=compresslevel)
        return gzip.open(path, mode=mode, compresslevel=compresslevel,
                         errors=errors, newline=newline, encoding=encoding)
    if compression in ('lzma', 'xz'):
        return lzma.open(path, mode=mode, format=format, check=check, preset=preset,
                         filters=filters, encoding=encoding, errors=errors,
                         newline=newline)
    if compression == 'bz2':
        return bz2.open(path, mode=mode, compresslevel=compresslevel,
                        encoding=encoding, errors=errors, newline=newline)
    raise ValueError(
        'compression must be None, gz, gzip, lzma, or xz and was {0}'.format(compression))
Example #37
0
# Script: convert a bz2-compressed TSV of labeled pages into an ARFF file
# for Weka, filtering tokens against two stopword lists.
STOPWORDS = set()
URL_STOPWORDS = set()

# Category labels for the @attribute klass declaration below.
with open('labels.json', 'r') as f:
    labels = json.load(f)

with open('stopwords.txt', 'r') as f:
    for line in f:
        STOPWORDS.add(line.rstrip())

with open('url_stopwords.txt', 'r') as f:
    for line in f:
        URL_STOPWORDS.add(line.rstrip())

line_no = 0
with bz2file.open(sys.argv[1], 'rb') as tsvfile:
    # NOTE(review): this increment runs once, outside the per-line loop
    # below -- it looks like it was meant to live inside the loop; confirm.
    line_no += 1
    with open(sys.argv[2], 'w') as arfffile:
        csvwriter = csv.writer(arfffile, quoting=csv.QUOTE_ALL)

        # ARFF header: relation, attributes, then the @data section.
        arfffile.write("@relation 40-cat-training\n")
        """
        arfffile.write("@attribute url string\n")
        arfffile.write("@attribute title string\n")
        """
        arfffile.write("@attribute tokens string\n")
        arfffile.write("@attribute klass {{{0}}}\n".format(",".join(labels)))
        arfffile.write("@data\n")

        for line in tsvfile:
            line = line.strip()
Example #38
0
 def __iter__(self):
     """Stream lines one at a time from the configured bz2 file."""
     with bz2.open(self.path, mode=self.mode, compresslevel=self.compresslevel,
                   encoding=self.encoding, errors=self.errors,
                   newline=self.newline) as stream:
         for item in stream:
             yield item
import sys
# NOTE(review): machine-specific hack -- drops the macOS system Python 2.7
# site-packages and prepends a user Anaconda path so Python-3 packages win.
sys.path.remove(
    '/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python'
)
#sys.path.remove('/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/PyObjC')
sys.path.append('/Users/xujiaxing/anaconda/lib/python3.6/site-packages')
#sys.path.append('/Users/xujiaxing/anaconda/lib/python3.6/site-packages/opencc.py')

from gensim.corpora.wikicorpus import extract_pages, filter_wiki
import bz2file
import re
import opencc
from tqdm import tqdm
import codecs

# Lazy page iterator over the compressed zh-wiki dump (read on demand).
wiki = extract_pages(bz2file.open('zhwiki-latest-pages-articles.xml.bz2'))


def wiki_replace(d):
    """Clean one ``(title, text)`` wiki page: strip tables, galleries and
    markup, normalize newlines, prefix the title in 【】 brackets, and
    convert the result with opencc."""
    text = d[1]
    text = re.sub(':*{\|[\s\S]*?\|}', '', text)
    text = re.sub('<gallery>[\s\S]*?</gallery>', '', text)
    text = re.sub('(.){{([^{}\n]*?\|[^{}\n]*?)}}', '\\1[[\\2]]', text)
    text = filter_wiki(text)
    text = re.sub('\* *\n|\'{2,}', '', text)
    text = re.sub('\n+', '\n', text)
    text = re.sub('\n[:;]|\n +', '\n', text)
    text = re.sub('\n==', '\n\n==', text)
    text = u'【' + d[0] + u'】\n' + text
    return opencc.convert(text).strip()
Example #40
0
 def read(self):
     """Read and return everything from the bz2 stream configured on self."""
     stream = bz2.open(self.path, mode=self.mode, compresslevel=self.compresslevel,
                       encoding=self.encoding, errors=self.errors,
                       newline=self.newline)
     with stream as file_content:
         return file_content.read()
Example #41
0
if __name__ == "__main__":
    # mongo init -- credentials redacted in this copy of the source.
    client = MongoClient("mongodb://*****:*****@ds039860.mongolab.com:39860/changesets")
    db = client.changesets
    changesetcollection = db.changesets
    metacollection = db.meta
    # counters (i: files/changesets seen, j/k: presumably batch counters --
    # their use is outside the visible portion of this script)
    i = 0
    j = 0
    k = 0
    # Walk the archive tree newest-first and parse every .bz2 changeset file.
    for root, dirs, files in os.walk(BASEDIR):
        dirs.sort(reverse=True)
        files.sort(reverse=True)
        for name in files:
            if name.endswith('.bz2'):
                f = bz2file.open(os.path.join(root, name), 'rb')
                i += 1
                docs = []
                try:
                    # Stream-parse the XML rather than loading it whole.
                    for event, elem in ET.iterparse(f):
                        if elem.tag == "changeset":
                            # check if changeset has bbox metadata (is not empty)
                            if all(k in elem.attrib for k in ('min_lat', 'max_lat', 'min_lon', 'max_lon')):
                                min_lat = float(elem.attrib['min_lat'])
                                min_lon = float(elem.attrib['min_lon'])
                                max_lat = float(elem.attrib['max_lat'])
                                max_lon = float(elem.attrib['max_lon'])
                                # shapely box: (minx, miny, maxx, maxy)
                                bbox = box(min_lon, min_lat, max_lon, max_lat)
                                # send output to stdout
                                if not i % BULK_SIZE:
                                    sys.stdout.write('.')
Example #42
0
 def open(self, *args, **kwargs):
     """Thin adapter: forward every argument straight to ``bz2file.open``."""
     handle = bz2file.open(*args, **kwargs)
     return handle
Example #43
0
			mlp.Layer("Rectifier", units=64, dropout=0.3),
			mlp.Layer("Rectifier", units=48, dropout=0.1),
			mlp.Layer("Rectifier", units=32),
			mlp.Layer("Softmax")],
		learning_rate=0.01,
		learning_rule='rmsprop',
		n_iter=10,
		n_stable=10,
		batch_size=50,
		valid_set=(X,y),
		verbose=1)

# Train the network defined above; a Ctrl-C simply stops training early
# and falls through to scoring with the weights learned so far.
try:
	nn.fit(X, y)
except KeyboardInterrupt:
	pass

score = nn.score(X, y)
print('SCORE:', score * 100.0)


# Report every sample the trained network misclassifies.
print('MISMATCHES:')
yp = nn.predict(X)
# Align shapes so the elementwise comparison below is valid.
y = y.reshape(yp.shape)
for a in numpy.where(y != yp)[0]:
	print(ds.filenames[a], 'was', int(yp[a]), 'not', int(y[a]))


# Persist the trained model as a bz2-compressed pickle.
with bz2.open('detector_train.pkl.bz2', 'wb') as f:
	pickle.dump(nn, f)