Example #1
 def __iter__(self):
     """
     Iterate over the pages in a Wikipedia articles database dump (*articles.xml.bz2),
     yielding one (page id, title, page content) 3-tuple at a time.
     """
     try:
         for title, content, page_id in extract_pages(bzip_open(self.wikicorpus.fname, mode='rt'),
                                                      self.wikicorpus.filter_namespaces):
             yield (page_id, title, content)
     except ValueError:  # Python 2 sucks and can't open bzip in text mode
         for title, content, page_id in extract_pages(bzip_open(self.wikicorpus.fname, mode='r'),
                                                      self.wikicorpus.filter_namespaces):
             yield (page_id, title, content)
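A minimal standalone sketch of the same iteration pattern, calling gensim's extract_pages directly; the dump path is a placeholder and the snippet assumes Python 3:

import bz2
from gensim.corpora.wikicorpus import extract_pages

# extract_pages yields (title, text, pageid) tuples for every page in the dump
with bz2.open('enwiki-latest-pages-articles.xml.bz2', mode='rt') as fh:
    for title, content, page_id in extract_pages(fh, filter_namespaces=('0',)):
        print(page_id, title, len(content))
        break  # inspect only the first page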
Example #2
 def __iter__(self):
     """
     Iterate over the pages in a Wikipedia articles database dump (*articles.xml.bz2),
     yielding one (page id, title, page content) 3-tuple at a time.
     """
     if PY2 is False:
         for title, content, page_id in extract_pages(
                 open_sesame(self.wikicorpus.fname, mode='rt'),
                 self.wikicorpus.filter_namespaces):
             yield (page_id, title, content)
     else:  # Python 2 sucks and can't open bzip in text mode
         for title, content, page_id in extract_pages(
                 open_sesame(self.wikicorpus.fname, mode='rb'),
                 self.wikicorpus.filter_namespaces):
             yield (page_id, title, content)
Example #3
def fetch_wiki_texts(in_file,
                     namespaces_to_filter=WIKI_DEFAULT_NAMESPACES_TO_FILTER,
                     min_text_length=200):
    return ((title, clean_text, page_id) for title, text, page_id in
            extract_pages(bz2.BZ2File(in_file), namespaces_to_filter)
            for clean_text in (filter_wiki(text), )
            if len(clean_text.strip()) >= min_text_length)
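A short usage sketch for fetch_wiki_texts, assuming the function above is importable; the dump path is a placeholder, not a value from the original code:

# hypothetical usage; the dump path is an assumption
for title, clean_text, page_id in fetch_wiki_texts('enwiki-latest-pages-articles.xml.bz2'):
    print(page_id, title, clean_text[:80])
    break  # look only at the first article that passes the length filter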
Example #4
    def get_texts(self):
        articles, articles_all = 0, 0
        positions, positions_all = 0, 0
        texts = ((text, self.lemmatize, title, pageid)
                 for title, text, pageid in extract_pages(
                     bz2.BZ2File(self.fname), self.filter_namespaces))
        pool = multiprocessing.Pool(self.processes)
        # process the corpus in smaller chunks of docs, because multiprocessing.Pool
        # is dumb and would load the entire input into RAM at once...
        for group in utils.chunkize(texts,
                                    chunksize=10 * self.processes,
                                    maxsize=1):
            for tokens, title, pageid in pool.imap(process_article,
                                                   group):  # chunksize=10):
                articles_all += 1
                positions_all += len(tokens)
                # article redirects and short stubs are pruned here
                if len(tokens) < ARTICLE_MIN_WORDS or any(
                        title.startswith(ignore + ':')
                        for ignore in IGNORED_NAMESPACES):
                    continue
                articles += 1
                positions += len(tokens)
                if self.metadata:
                    yield (tokens, (pageid, title))
                else:
                    yield tokens
        pool.terminate()

        logger.info(
            "finished iterating over Wikipedia corpus of %i documents with %i positions"
            " (total %i articles, %i positions before pruning articles shorter than %i words)",
            articles, positions, articles_all, positions_all,
            ARTICLE_MIN_WORDS)
        self.length = articles  # cache corpus length
Example #5
def get_wiki2():
    reobj1 = re.compile(
        r"[ `~!@#$%^&*\(\)-_=+\[\]\{\}\\\|;:\'\",<.>/?a-zA-Z\d]+")
    reobj2 = re.compile(r"\n+")
    reobj3 = re.compile("(())|(“”)|(「」)|(《》)|(“”)|(‘’)|(【】)|[,。?——!]{2,}")
    reuseful = re.compile('^[a-zA-Z]+:')
    redirect = re.compile(r"^#")

    def wiki_replace(s):
        s = filter_wiki(s)
        s = reobj1.sub("", s)  # 为上传阿里云剔除竖线(|)符号
        s = reobj2.sub("#", s)
        s = reobj3.sub("", s)
        return s

    wiki = extract_pages(bz2file.open('zhwiki-latest-pages-articles.xml.bz2'))
    with codecs.open('wiki-tw.csv', 'w', encoding='utf-8') as f:
        i = 0
        filelist = []
        for d in tqdm(wiki):
            if not reuseful.findall(d[0]) and not redirect.findall(d[1]):
                i += 1
                filelist.append(
                    reobj1.sub("", d[0]) + "|" + wiki_replace(d[1]) + "\n")
                if i % 1000 == 0:
                    s = ("".join(filelist))
                    f.write(s)
                    filelist = []
        if filelist:
            s = ("".join(filelist))
            f.write(s)
Example #6
def preprocess_sentence():
    i = 0
    line = ''
    wiki = extract_pages(
        bz2file.open('./zhwiki-20190301-pages-articles.xml.bz2')
    )  # use gensim's extract_pages to pull out each page
    with open('./zhwiki_sentence.txt', 'w') as f:
        for text in wiki:
            if not re.findall('^[a-zA-Z]+:',
                              text[0]) and text[0] and not re.findall(
                                  u'^#', text[1]):  # skip help pages and redirect pages
                converted = opencc.convert(text[1]).strip()  # convert Traditional to Simplified Chinese
                converted = re.sub('\|\w*\]', '', converted)
                for x in converted:
                    if len(x.encode(
                            'utf-8')) == 3 and x not in stop_punctuation(
                                './stop_punctuation.txt'):
                        line += x
                    if x in ['\n', '。', '?', '!', ',', ';', ':'
                             ] and line != '\n':  # split lines on these Chinese punctuation marks
                        f.write(line.strip() + '\n')  # write one sentence per line to the corpus file
                        line = ''
                i += 1
            if i == 10:
                print("选取中文维基百科的文章篇数:", i)
                break
Example #7
    def get_texts(self):
        """
        Iterate over the dump, returning text version of each article as a list
        of tokens.

        Only articles of sufficient length are returned (short articles &
        redirects etc are ignored).

        Note that this iterates over the **texts**; if you want vectors,
        just use the standard corpus interface instead of this function::

        >>> for vec in wiki_corpus:
        >>>     print(vec)
        """
        articles, articles_all = 0, 0
        positions, positions_all = 0, 0

        texts = ((text, title, pageid)
                 for title, text, pageid in extract_pages(
                     bz2.BZ2File(self.fname), self.filter_namespaces))

        batch_idx = 0
        pool = multiprocessing.Pool(self.processes)
        # Process the corpus in smaller chunks of docs,
        # because multiprocessing.Pool is dumb and would load the entire input
        # into RAM at once...
        for group in chunkize(texts, chunksize=10 * self.processes, maxsize=1):
            for tokens, title, pageid in pool.imap(process_article, group):
                articles_all += 1
                positions_all += len(tokens)

                # article redirects and short stubs are pruned here
                to_ignored = any(
                    title.startswith(ignore + ':')
                    for ignore in IGNORED_NAMESPACES)
                if len(tokens) < ARTICLE_MIN_WORDS or to_ignored:
                    continue

                articles += 1
                positions += len(tokens)

                if self.metadata:
                    yield title, tokens
                else:
                    yield tokens

            batch_idx += 1
            if self.max_batch and batch_idx == self.max_batch:
                break

        pool.terminate()

        logger.info(
            "Finished iterating over Wikipedia corpus of %i documents with "
            "%i positions (total %i articles, %i positions before pruning "
            "articles shorter than %i words)", articles, positions,
            articles_all, positions_all, ARTICLE_MIN_WORDS)

        self.length = articles
Example #8
def get_wiki():
    from opencc import OpenCC
    # see the notes in this blog post for reference:
    # https://kexue.fm/archives/4176
    opencc1 = OpenCC("t2s")
    resub1 = re.compile(':*{\|[\s\S]*?\|}')
    resub2 = re.compile('<gallery>[\s\S]*?</gallery>')
    resub3 = re.compile('(.){{([^{}\n]*?\|[^{}\n]*?)}}')
    resub4 = re.compile('\* *\n|\'{2,}')
    resub5 = re.compile('\n+')
    resub6 = re.compile('\n[:;]|\n +')
    resub7 = re.compile('\n==')

    refind1 = re.compile('^[a-zA-Z]+:')
    refind2 = re.compile('^#')

    p1 = re.compile(r'-\{.*?(zh-hans|zh-cn):([^;]*?)(;.*?)?\}-')
    p2 = re.compile(r'[(\(][,;。?!\s]*[)\)]')
    p3 = re.compile(r'[「『]')
    p4 = re.compile(r'[」』]')

    def wiki_replace(s):
        s = filter_wiki(s)
        s = resub1.sub('', s)
        s = resub2.sub('', s)
        s = resub3.sub('\\1[[\\2]]', s)
        s = resub4.sub('', s)
        s = resub5.sub('\n', s)
        s = resub6.sub('\n', s)
        s = resub7.sub('\n\n==', s)
        s = p1.sub(r'\2', s)
        s = p2.sub(r'', s)
        s = p3.sub(r'“', s)
        s = p4.sub(r'”', s)
        return opencc1.convert(s).strip()

    wiki = extract_pages(
        bz2file.open(
            r'E:\02program\python\nlp\data\corpus\zhwiki-latest-pages-articles.xml.bz2'
        ))

    # wiki=WikiCorpus('zhwiki-latest-pages-articles.xml.bz2',lemmatize=False,dictionary={})

    with codecs.open('wiki.csv', 'w', encoding='utf-8') as f:
        i = 0
        filelist = []
        for d in tqdm(wiki):
            if not refind1.findall(d[0]) and d[0] and not refind2.findall(
                    d[1]):
                filelist.append(d[0] + "\n" + d[1])
                line = d[1]

                i += 1
                if i % 100 == 0:
                    s = wiki_replace("\n\n".join(filelist))
                    f.write(s)
                    filelist = []
Example #9
def _iterate_over_pages(fname):
    """
    Iterate over the pages in a Wikipedia articles database dump (*articles.xml.bz2),
    yielding one (page id, title, page content) 3-tuple at a time.
    """
    dictionary = Dictionary()
    wiki = WikiCorpus(fname, lemmatize=False, dictionary=dictionary)
    for title, content, page_id in extract_pages(bz2.BZ2File(wiki.fname), wiki.filter_namespaces):
        yield (page_id, title, content)
def load_doc(dump_file, title, filter_namespaces=('0', )):
    """
    Load a wikipedia article from its title.
    """
    bz2_file = bz2.BZ2File(dump_file)
    for page_title, text, pageid in extract_pages(bz2_file, filter_namespaces):
        if page_title == title:
            text = filter_wiki(text)
            tokens = extract_jp_entities(text)
            return tokens, pageid
    return None, None
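A hedged usage sketch for load_doc; the dump path and the article title are placeholders, not values from the original code:

# hypothetical usage; dump path and article title are assumptions
tokens, pageid = load_doc('jawiki-latest-pages-articles.xml.bz2', 'Python')
if tokens is None:
    print('article not found in the dump')
else:
    print(pageid, len(tokens), 'extracted entities')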
Example #11
def extract_text_content(xml_dump):
    # article_count = 0
    with open('wiki.en.txt', 'w') as file:
        for title, content, pageid in tqdm(extract_pages(xml_dump)):
            try:
                file.write(filter_wiki(content).strip() + "\n")
                # article_count += 1
                # if article_count % 10000 == 0:
                #     logging.info(f'{article_count} articles processed')
            except Exception as e:
                logging.warning(str(e))
Example #12
    def __iter__(self):
        with bz2.BZ2File(self._dump_file) as f:
            c = 0
            for (title, wiki_text, wiki_id) in wikicorpus.extract_pages(f):
                if any([title.lower().startswith(ns) for ns in self._ignored_ns]):
                    continue
                c += 1

                yield WikiPage(six.text_type(title), self._language, six.text_type(wiki_text))

                if c % 10000 == 0:
                    logger.info('Processed: %d', c)
Example #13
 def _extract_article_onebyone(self):
     wiki_pages = extract_pages(
         bz2file.open(self.download_wiki_articles_dump()))
     counter = 0
     w = tqdm(wiki_pages, desc=u'get 0 article')
     for d in w:
         if not re.findall('^[a-zA-Z]+:', d[0]) and d[0] and not re.findall(
                 u'^#', d[1]):
             yield d
             counter += 1
         if counter % 100 == 0:
             w.set_description(u'processed %s article' % counter)
Example #14
def _iterate_over_pages(fname):
    """
    Iterate over the pages in a Wikipedia articles database dump (*articles.xml.bz2),
    yielding one (page id, title, page content) 3-tuple at a time.
    """
    dictionary = Dictionary()
    wiki = WikiCorpus(fname,
                      lemmatize=False,
                      dictionary=dictionary,
                      filter_namespaces={'0'})
    for title, content, page_id in extract_pages(bz2.BZ2File(wiki.fname),
                                                 wiki.filter_namespaces):
        yield (page_id, title, content)
Example #15
def main(args):
    os.makedirs(args.outdir, exist_ok=True)

    filter_namespaces = ('0', )

    out_i = 0
    for in_fname in glob.glob(args.inglob):
        for title, text, pageid in extract_pages(
                bz2.BZ2File(in_fname), filter_namespaces=filter_namespaces):
            if out_i % args.skip == 0:
                with open(os.path.join(args.outdir, f'{pageid}.txt'),
                          'w') as f:
                    f.write(filter_wiki(text))
            out_i += 1
def iterate_wiki(input_path):
    lemmatize = utils.has_pattern()
    filter_namespaces = ('0',)
    texts = ((text, lemmatize, title, pageid) for title, text, pageid in
             extract_pages(bz2.BZ2File(input_path), filter_namespaces))
    for article in texts:
        text, lemmatize, title, pageid = article
        text = utils.to_unicode(text, 'utf8', errors='ignore')
        text = utils.decode_htmlentities(text)  # '&amp;nbsp;' --> '\xa0'
        text = remove_markup(text)
        tokens = get_all_words(text)
        if len(tokens) < ARTICLE_MIN_WORDS or any(title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES):
            continue
        yield title, tokens
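A brief usage sketch for iterate_wiki, assuming it is importable; the dump path is a placeholder:

from itertools import islice

# hypothetical usage; the dump path is an assumption
for title, tokens in islice(iterate_wiki('enwiki-latest-pages-articles.xml.bz2'), 3):
    print(title, len(tokens))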
Example #17
def loadSimpleWiki():
    #file_name = "C:/Users/Admin/Anaconda2/envs/py27/corpora/wiki2017/simplewiki-20170820-pages-meta-current.xml.bz2"
    #counter  =0
    #... 179,620 articles found
    file_name = "C:/Users/Admin/Anaconda2/envs/py27/corpora/wiki2017/simplewiki-20170820-pages-meta-current.xml.bz2"

    wiki = WikiCorpus(file_name, lemmatize=False,
                      dictionary={})  #vocab dict not needed
    i = 0
    #for text in wiki.get_texts():
    allwords = defaultdict(lambda: 0)
    allsents = []
    print("starting...")
    dfcounter_stemmed = defaultdict(lambda: 0)
    dfcounter_nostem = defaultdict(lambda: 0)

    for (title, article, pageid) in extract_pages(
            bz2.BZ2File(file_name),
            filter_namespaces=('0', )):  #filter_namespaces=["0"]):
        if len(article) == 0:
            continue

        text = process_article((filter_wiki(article), False, title, pageid),
                               tokenizer_func=tokenizer)
        text_orig = set(
            text[0][0]
        )  #... We are ONLY interested in whether or not the term appeared in this document NOT how many times or where
        text_stemmed = set(text[0][1])

        for term in text_orig:
            dfcounter_nostem[term] += 1
        for term in text_stemmed:
            dfcounter_stemmed[term] += 1

        i += 1
        if i % 1000 == 0:
            print(i)
            #break
    print(i)

    handle = open("simplewiki_docfreqs_stemmed.txt", "w+")
    for key, val in dfcounter_stemmed.items():
        handle.write(str(key) + "\t" + str(val) + "\n")
    handle.close()

    handle = open("simplewiki_docfreqs_nostem.txt", "w+")
    for key, val in dfcounter_nostem.items():
        handle.write(str(key) + "\t" + str(val) + "\n")
    handle.close()
Example #18
 def wiki_bz_process(self,language):
     wiki = extract_pages(bz2file.open(self.corpus))
     f = codecs.open(os.path.join(os.path.dirname(self.corpus), 'wiki.txt'),
                     'w', encoding='utf-8')
     w = tqdm(wiki, desc="Currently get 0 files!")
     if language=='zh':
         for i, d in enumerate(w):
             if not re.findall('^[a-zA-Z]+:', d[0]) and not re.findall(u'^#', d[1]):
                 s = self.wiki_replace(d)
                 f.write(s + '\n\n\n')
                 i += 1
                 if i % 100 == 0:
                     w.set_description('Currently got %s files' % i)
     elif language=='en':
         pass
     f.close()  # close the output file once all pages are written
Example #19
    def __iter__(self):
        with bz2.BZ2File(self._dump_file) as f:
            c = 0
            for (title, wiki_text, wiki_id) in wikicorpus.extract_pages(f):
                if any(
                    [title.lower().startswith(ns) for ns in self._ignored_ns]
                ):
                    continue
                c += 1

                yield WikiPage(
                    unicode(title), self._language, unicode(wiki_text)
                )

                if c % 10000 == 0:
                    logger.info('Processed: %d', c)
def build_dict(N):
    tuple = extract_pages("enwiki-20181220-pages-articles-multistream.xml")
    page_dict = {}
    elect_id = random.randint(1, 500)
    id = 0
    cnt = 1
    for t in tuple:
        if (cnt > N):
            break
        id += 1
        if (id == elect_id):
            title = t[0]
            interlinks = find_interlinks(str(t))
            outlink_num = len(interlinks)
            page_dict[title] = [cnt, outlink_num, list(interlinks.keys())]
            cnt += 1
            elect_id += random.randint(1, 150)
    return page_dict
Example #21
def wiki_process(input_file, save_path):
    # parse the dump with gensim's wikicorpus helpers
    wiki = extract_pages(bz2file.open(input_file))
    # process and export
    i = 0
    f = codecs.open(save_path, 'w', encoding='utf-8')
    w = tqdm(wiki, desc=u'fetched 0 articles')
    openCC = OpenCC('t2s')
    for d in w:
        if not re.findall('^[a-zA-Z]+:', d[0]) and d[0] and not re.findall(
                u'^#', d[1]):
            s = wiki_replace(d)
            f.write(s + '\n\n\n')
            i += 1
            if i % 100 == 0:
                w.set_description(u'fetched %s articles' % i)

    f.close()
Example #22
def main():
    parser = ArgumentParser()
    parser.add_argument('-w', '--wiki-file')

    args = parser.parse_args()

    if args.wiki_file:
        f = bz2.BZ2File(args.wiki_file)
    else:
        f = sys.stdin

    texts = ((text, title, pageid) for title, text, pageid in extract_pages(f))

    for text, title, pageid in texts:
        text = text.replace('\n', ' ')
        sys.stdout.write(text.encode('utf-8'))
        sys.stdout.write('\n')

    f.close()
Example #23
def prepare_data(filename, destname):
    pages = wc.extract_pages(bz2.BZ2File(filename), ('0',))
    corpus = []
    x = []
    y = []
    count = 0
    for p in pages:
        text = wc.filter_wiki(p[1])
        tokens = [token.encode('utf8') for token in utils.tokenize(text, errors='ignore')
                  if len(token) <= 15 and not token.startswith('_')]
        if len(tokens) >= 50:
            length = 0
            old_i = 0
            for i, token in enumerate(tokens):
                length += len(token)
                if length > MAX_CHAR_LENGTH:
                    corpus.append(tokens[old_i: i])
                    length = len(token)
                    old_i = i
                if i == len(tokens) - 1:
                    corpus.append(tokens[i:])
    count = 0
    for sent in corpus:
        count += 1
        if count >= 100000:
            break
        sent_y = []
        sent_x = []
        for token in sent:
            if all([65 <= c <= 90 or 97 <= c <= 122 for c in token]):
                sent_y.extend([False] * (len(token) - 1) + [True])
                sent_x.extend([c - 64 if c <= 90 else c - 70 for c in token])
        sent_y.extend([False] * (MAX_CHAR_LENGTH - len(sent_x)))
        sent_x.extend([0] * (MAX_CHAR_LENGTH - len(sent_x)))
        y.append(sent_y)
        x.append(sent_x)
        if len(sent_x) != MAX_CHAR_LENGTH:
            print(len(sent_x))

    x = np.array(x)
    y = np.array(y)
    pickle.dump(x, open(os.path.abspath(destname + '_x'), 'wb'))
    pickle.dump(y, open(os.path.abspath(destname + '_y'), 'wb'))
Example #24
def main():
    parser = ArgumentParser()
    parser.add_argument('-w', '--wiki-file')

    args = parser.parse_args()

    if args.wiki_file:
        f = bz2.BZ2File(args.wiki_file)
    else:
        f = sys.stdin

    texts = ((text, title, pageid) for title, text, pageid
             in extract_pages(f))

    for text, title, pageid in texts:
        text = text.replace('\n', ' ')
        sys.stdout.write(text.encode('utf-8'))
        sys.stdout.write('\n')

    f.close()
Example #25
def get_wiki2():
    from opencc import OpenCC
    # see the notes in this blog post for reference:
    # https://kexue.fm/archives/4176
    opencc1 = OpenCC("t2s")
    reobj1 = re.compile(
        "[ `~!@#$%^&*\(\)-_=+\[\]\{\}\\\|;:\'\",<.>/?a-zA-Z\d]+")
    reobj2 = re.compile(r"\n+")
    reobj3 = re.compile("(())|(“”)|(「」)|(《》)|(“”)|(‘’)|(【】)|[,。?——!]{2,}")
    reuseful = re.compile('^[a-zA-Z]+:')
    redirect = re.compile(r"^#")

    def wiki_replace(s):
        s = filter_wiki(s)
        s = reobj1.sub("", s)  # 为上传阿里云剔除竖线(|)符号
        s = reobj2.sub("#", s)
        s = reobj3.sub("", s)
        return opencc1.convert(s).strip()

    wiki = extract_pages(
        bz2file.open(
            r'E:\02program\python\nlp\data\corpus\zhwiki-latest-pages-articles.xml.bz2'
        ))
    with codecs.open('wiki-tw.csv', 'w', encoding='utf-8') as f:
        i = 0
        filelist = []
        for d in tqdm(wiki):
            if not reuseful.findall(d[0]) and not redirect.findall(d[1]):
                i += 1
                try:
                    filelist.append(
                        reobj1.sub("", d[0]) + "|" + wiki_replace(d[1]) + "\n")
                except Exception as e:
                    print(d[0], '=' * 10, d[1])
            if i % 1000 == 0:
                s = ("".join(filelist))
                f.write(s)
                filelist = []
        # write any remaining articles before the file is closed
        if filelist:
            s = ("".join(filelist))
            f.write(s)
Example #26
def PreProcessing():
    print('begin time of the program is: ', time.ctime())
    tuPle = WIKI.extract_pages(xml_path)
    global tot_word
    # extract_pages returns a generator of (title, text, pageid) tuples over the dump
    cnt_time = 0
    while cnt_time < tot_number:
        curr_page = next(tuPle)
        redirects = [
            redirect for keyword, redirect in WIKI.find_interlinks(
                curr_page[1]).items()
        ]
        cnt_time += 1
        # extract the title and the redirect title
        curr_title = curr_page[0]
        if curr_title not in WordDict:
            WordDict[curr_title] = tot_word
            NumDict[tot_word] = curr_title
            tot_word += 1
        org_id = WordDict[curr_title]
        # set the id of the word
        # sum_redirect = len(redirects)
        for redirect_title in redirects:

            if redirect_title not in WordDict:
                WordDict[redirect_title] = tot_word
                NumDict[tot_word] = redirect_title
                #link_id = WordDict[redirect_title]
                tot_word += 1
            link_id = WordDict[redirect_title]
            if org_id not in OutLink:
                OutLink[org_id] = []
            OutLink[org_id].append(link_id)
            if link_id not in InLink:
                InLink[link_id] = []
            InLink[link_id].append(org_id)
            #addtwodimdict(RankScore,org_id,link_id,1/sum_redirect)
    print('end time of the pre-processing is: ', time.ctime())
Example #27
def wikipedia_extract(input_file, output_dir):
    # chunk size, measured in characters
    chunk_size = 50000000
    os.makedirs(output_dir, exist_ok=True)
    fin = xutils.open_file(input_file, 'rt')
    extractor = extract_pages(fin, ['0'])

    fout, counter, chunk = None, chunk_size, -1
    for page in extractor:
        if page[1]:
            text = filter_wiki(page[1])
            if counter >= chunk_size:
                if fout: fout.close()
                counter, chunk = 0, chunk + 1
                output_file = '%s/%s_%d.txt.gz' % (
                    output_dir, os.path.basename(input_file), chunk)
                fout = xutils.open_file(output_file, 'wt')
                print(output_file)

            counter += len(text)
            fout.write(text)
            fout.write('\n\n\n\n')
    fin.close()
Example #28
def summarize_wiki():
    ignore_namespaces = 'Wikipedia Category File Portal Template MediaWiki User Help Book Draft'.split(
    )

    t1 = time.time()
    successful = 0
    failed = 0
    timedout = 0

    extracted_pages = extract_pages(bz2.BZ2File(wiki_fname), ('0', ))
    for title, text, pageid in extracted_pages:
        if any(title.startswith(ignore + ':') for ignore in ignore_namespaces):
            continue

        try:
            summarize_timeout(text)
            successful += 1
        except TimeoutError:
            print "Timeout summarizing article", title, "with id", pageid
            timedout += 1
        except RuntimeError:
            failed += 1

        if (successful + failed) % 1000 == 0:
            print "Article", successful + failed, "summarized."

        time.sleep(1)

    t2 = time.time()

    print "Successful summaries:", successful
    print "Failed summaries:", failed
    print "Timeout summaries:", timedout

    print "t1:", t1
    print "t2", t2
    print "dt", t2 - t1
proper_names_dict = {
    'ουσιαστικό': 'nouns',
    'επίθετο': 'adjectives',
    'άρθρο': 'dets',
    'επίρρημα': 'adverbs',
    'κύριο όνομα': 'proper_names',
    'μετοχή': 'participles',
    'ρήμα': 'verbs'
}
expected_parts_dict = {}
for expected_part in expected_parts:
    expected_parts_dict[expected_part] = []

other_parts = {}
for title, text, pageid in extract_pages(wiktionary_file_path):
    if text.startswith('#REDIRECT'):
        continue
    title = title.lower()
    all_regex = regex.findall(text)
    all_regex.extend(regex2.findall(text))
    for a in all_regex:
        if a in expected_parts:
            expected_parts_dict[a].append(title)

for i in expected_parts_dict:
    with open('_{0}.py'.format(proper_names_dict[i]), 'w') as f:
        f.write('from __future__ import unicode_literals\n')
        f.write('{} = set(\"\"\"\n'.format(proper_names_dict[i].upper()))
        words = sorted(expected_parts_dict[i])
        line = ''

def config_argparser():
    argparser = argparse.ArgumentParser(description='Wikipedia Dump Extractor')
    argparser.add_argument('-input_path', type=str, required=True, help='Path to the raw Wikipedia dump')
    argparser.add_argument('-output_path', type=str, required=True, help='Write path for extracted text content')
    return argparser.parse_args()


if __name__ == '__main__':
    arguments = config_argparser()
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    lemmatize = utils.has_pattern()
    filter_namespaces = ('0',)
    texts = ((text, lemmatize, title, pageid) for title, text, pageid in
             extract_pages(bz2.BZ2File(arguments.input_path), filter_namespaces))
    parsed_article_counter = 0
    space = u' '
    output = codecs.open(arguments.output_path, 'w', 'utf-8')
    for article in texts:
        text, lemmatize, title, pageid = article
        text = utils.to_unicode(text, 'utf8', errors='ignore')
        text = utils.decode_htmlentities(text)  # '&amp;nbsp;' --> '\xa0'
        text = remove_markup(text)
        tokens = get_all_words(text)
        if len(tokens) < ARTICLE_MIN_WORDS or any(title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES):
            continue
        output.write("{}\n".format(space.join(tokens) + "\n"))
        parsed_article_counter += 1
    print('Parsed articles: {}'.format(parsed_article_counter))
Example #31
import bz2file
import re
from opencc import OpenCC
from tqdm import tqdm
import codecs
import sys

if len(sys.argv) < 3:
    print("argc less 3")
    sys.exit(1)

inp, outp = sys.argv[1:3]
print("inp file:%s" % inp)
print("outp file:%s" % outp)

wiki = extract_pages(bz2file.open(inp))
c = OpenCC('t2s')


def wiki_replace(d):
    s = d[1]
    s = re.sub(':*{\|[\s\S]*?\|}', '', s)
    s = re.sub('<gallery>[\s\S]*?</gallery>', '', s)
    s = re.sub('(.){{([^{}\n]*?\|[^{}\n]*?)}}', '\\1[[\\2]]', s)
    s = filter_wiki(s)
    s = re.sub('\* *\n|\'{2,}', '', s)
    s = re.sub('\n+', '\n', s)
    s = re.sub('\n[:;]|\n +', '\n', s)
    s = re.sub('\n==', '\n\n==', s)
    s = u'【' + d[0] + u'】\n' + s
    #print("string:%s" % s)
Example #32
import pickle


def fantojan_savepickle(input):
    #this function will save every line into a pickle
    i = 0
    w = tqdm(input, desc=u'fetched 0 articles')  # tqdm provides the progress bar prompt
    for d in w:
        if not re.findall('^[a-zA-Z]+:', d[0]) and d[0] and not re.findall(
                u'^#', d[1]):
            outputfile = codecs.open(
                '/Users/wisdombeat/PycharmProjects/wiki_txt/wikis/line' +
                str(i) + '.pkl', 'wb+')
            s = wiki_replace(d)
            pickle.dump(s, outputfile)
            outputfile.close()
            i += 1
            if i % 100 == 0:
                w.set_description(u'fetched %s articles' % i)
    w.set_description(u'All done, %s articles in total' % i)
    return


if __name__ == "__main__":
    openbz2 = extract_pages(
        bz2file.open(
            '/Users/wisdombeat/Desktop/zhwiki-latest-pages-articles.xml.bz2'))
    #resultname = '/Users/wisdombeat/PycharmProjects/wiki_txt/wiki.txt'
    #fantojan_savetxt(input=openbz2, output=resultname)
    fantojan_savepickle(openbz2)
proper_names_dict={
    'ουσιαστικό':'nouns',
    'επίθετο':'adjectives',
    'άρθρο':'dets',
    'επίρρημα':'adverbs',
    'κύριο όνομα': 'proper_names',
    'μετοχή': 'participles',
    'ρήμα': 'verbs'
}
expected_parts_dict = {}
for expected_part in expected_parts:
    expected_parts_dict[expected_part] = []

other_parts = {}
for title, text, pageid in extract_pages(wiktionary_file_path):
    if text.startswith('#REDIRECT'):
        continue
    title = title.lower()
    all_regex = regex.findall(text)
    all_regex.extend(regex2.findall(text))
    for a in all_regex:
        if a in expected_parts:
            expected_parts_dict[a].append(title)


for i in expected_parts_dict:
    with open('_{0}.py'.format(proper_names_dict[i]), 'w') as f:
        f.write('from __future__ import unicode_literals\n')
        f.write('{} = set(\"\"\"\n'.format(proper_names_dict[i].upper()))
        words = sorted(expected_parts_dict[i])
Example #34
def get_pos_from_wiktionary():
    import re
    from gensim.corpora.wikicorpus import extract_pages

    regex = re.compile(r"==={{(\w+)\|el}}===")
    regex2 = re.compile(r"==={{(\w+ \w+)\|el}}===")

    # get words based on the Wiktionary dump
    # check only for specific parts

    # ==={{κύριο όνομα|el}}===
    expected_parts = [
        "μετοχή",
        "ρήμα",
        "επίθετο",
        "επίρρημα",
        "ουσιαστικό",
        "κύριο όνομα",
        "άρθρο",
    ]

    wiktionary_file_path = (
        "/data/gsoc2018-spacy/spacy/lang/el/res/elwiktionary-latest-pages-articles.xml"
    )

    proper_names_dict = {
        "ουσιαστικό": "nouns",
        "επίθετο": "adjectives",
        "άρθρο": "dets",
        "επίρρημα": "adverbs",
        "κύριο όνομα": "proper_names",
        "μετοχή": "participles",
        "ρήμα": "verbs",
    }
    expected_parts_dict = {}
    for expected_part in expected_parts:
        expected_parts_dict[expected_part] = []

    for title, text, pageid in extract_pages(wiktionary_file_path):
        if text.startswith("#REDIRECT"):
            continue
        title = title.lower()
        all_regex = regex.findall(text)
        all_regex.extend(regex2.findall(text))
        for a in all_regex:
            if a in expected_parts:
                expected_parts_dict[a].append(title)

    for i in expected_parts_dict:
        with open("_{0}.py".format(proper_names_dict[i]), "w") as f:
            f.write("from __future__ import unicode_literals\n")
            f.write('{} = set("""\n'.format(proper_names_dict[i].upper()))
            words = sorted(expected_parts_dict[i])
            line = ""
            to_write = []
            for word in words:
                if len(line + " " + word) > 79:
                    to_write.append(line)
                    line = ""
                else:
                    line = line + " " + word
            f.write("\n".join(to_write))
            f.write('\n""".split())')
Example #35
from db_connect import get_cursor

dbc = get_cursor()

'''
CREATE TABLE `wiki_pages` (
  `id` varchar(255) NOT NULL,
  `title` varchar(255) NOT NULL,
  `content` mediumtext NOT NULL,
  `is_artist` tinyint(1) NOT NULL,
  `size` int(11) NOT NULL
) DEFAULT CHARSET=utf8;

ALTER TABLE `wiki_pages`
  ADD PRIMARY KEY (`id`),
  ADD KEY `is_artist` (`is_artist`),
  ADD KEY `size` (`size`),
  ADD KEY `title` (`title`);
'''

with open('enwiki-latest-pages-articles.xml') as fh:
    gen = corpus.extract_pages(fh)
    i = 0
    for title, text, pgid in gen:
        text = text.lower()
        if 'infobox musical artist' in text and ':' not in title:
            dbc.execute('INSERT INTO wiki_pages (id, title, content, is_artist) VALUES(%s, %s, %s, 1)',
                        [pgid, title, text])
            print i, title
            i += 1
import sys
sys.path.remove(
    '/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python'
)
#sys.path.remove('/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/PyObjC')
sys.path.append('/Users/xujiaxing/anaconda/lib/python3.6/site-packages')
#sys.path.append('/Users/xujiaxing/anaconda/lib/python3.6/site-packages/opencc.py')

from gensim.corpora.wikicorpus import extract_pages, filter_wiki
import bz2file
import re
import opencc
from tqdm import tqdm
import codecs

wiki = extract_pages(bz2file.open('zhwiki-latest-pages-articles.xml.bz2'))


def wiki_replace(d):
    s = d[1]
    s = re.sub(':*{\|[\s\S]*?\|}', '', s)
    s = re.sub('<gallery>[\s\S]*?</gallery>', '', s)
    s = re.sub('(.){{([^{}\n]*?\|[^{}\n]*?)}}', '\\1[[\\2]]', s)
    s = filter_wiki(s)
    s = re.sub('\* *\n|\'{2,}', '', s)
    s = re.sub('\n+', '\n', s)
    s = re.sub('\n[:;]|\n +', '\n', s)
    s = re.sub('\n==', '\n\n==', s)
    s = u'【' + d[0] + u'】\n' + s
    return opencc.convert(s).strip()
# -*- coding: utf-8 -*-
"""
Created on Sat Nov 16 21:09:51 2019

@author: us
"""

from gensim.corpora.wikicorpus import extract_pages,filter_wiki
import bz2file
import re
#import opencc
from opencc import OpenCC
from tqdm import tqdm
import codecs
cc = OpenCC('t2s')  # convert from Traditional Chinese to Simplified Chinese
wiki = extract_pages(bz2file.open('zhwiki-20190720-pages-articles.xml.bz2'))

def wiki_replace(d):
    s = d[1]
    s = re.sub(':*{\|[\s\S]*?\|}', '', s)
    s = re.sub('<gallery>[\s\S]*?</gallery>', '', s)
    s = re.sub('(.){{([^{}\n]*?\|[^{}\n]*?)}}', '\\1[[\\2]]', s)
    s = filter_wiki(s)
    s = re.sub('\* *\n|\'{2,}', '', s)
    s = re.sub('\n+', '\n', s)
    s = re.sub('\n[:;]|\n +', '\n', s)
    s = re.sub('\n==', '\n\n==', s)
    s = u'【' + d[0] + u'】\n' + s
    return cc.convert(s).strip()

i = 0
Example #38
wiki = gensim.corpora.WikiCorpus(input_file, lemmatize=False, dictionary={})

for text in tqdm(list(wiki.get_texts())):
    str_line = bytes.join(b' ', text).decode()
    f.write(str_line + '\n')

# from https://spaces.ac.cn/archives/4176/
from gensim.corpora.wikicorpus import extract_pages, filter_wiki
import bz2file
import re
import opencc
from tqdm import tqdm
import codecs

input_file = "E:\matt\get\wiki\zhwiki-20180301-pages-articles-multistream.xml.bz2"
wiki = extract_pages(bz2file.open(input_file))


def wiki_replace(d):
    s = d[1]
    s = re.sub(':*{\|[\s\S]*?\|}', '', s)
    s = re.sub('<gallery>[\s\S]*?</gallery>', '', s)
    s = re.sub('(.){{([^{}\n]*?\|[^{}\n]*?)}}', '\\1[[\\2]]', s)
    s = filter_wiki(s)
    s = re.sub('\* *\n|\'{2,}', '', s)
    s = re.sub('\n+', '\n', s)
    s = re.sub('\n[:;]|\n +', '\n', s)
    s = re.sub('\n==', '\n\n==', s)
    s = u'【' + d[0] + u'】\n' + s
    return opencc.convert(s).strip()  #