def __iter__(self):
    """
    Iterate over the pages in a Wikipedia articles database dump
    (*articles.xml.bz2), yielding one (page id, title, page content)
    3-tuple at a time.
    """
    try:
        for title, content, page_id in extract_pages(
                bzip_open(self.wikicorpus.fname, mode='rt'),
                self.wikicorpus.filter_namespaces):
            yield (page_id, title, content)
    except ValueError:
        # Python 2 sucks and can't open bzip in text mode
        for title, content, page_id in extract_pages(
                bzip_open(self.wikicorpus.fname, mode='r'),
                self.wikicorpus.filter_namespaces):
            yield (page_id, title, content)

def __iter__(self):
    """
    Iterate over the pages in a Wikipedia articles database dump
    (*articles.xml.bz2), yielding one (page id, title, page content)
    3-tuple at a time.
    """
    if not PY2:
        for title, content, page_id in extract_pages(
                open_sesame(self.wikicorpus.fname, mode='rt'),
                self.wikicorpus.filter_namespaces):
            yield (page_id, title, content)
    else:
        # Python 2 sucks and can't open bzip in text mode
        for title, content, page_id in extract_pages(
                open_sesame(self.wikicorpus.fname, mode='rb'),
                self.wikicorpus.filter_namespaces):
            yield (page_id, title, content)

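# For orientation: a minimal, hedged sketch of how extract_pages is typically driven
# on its own. The dump path and the 5-page limit are illustrative assumptions, not
# values taken from the snippets above; extract_pages yields (title, text, pageid).
import bz2
from gensim.corpora.wikicorpus import extract_pages, filter_wiki

DUMP_PATH = 'enwiki-latest-pages-articles.xml.bz2'  # placeholder file name

with bz2.BZ2File(DUMP_PATH) as dump:
    for n, (title, text, pageid) in enumerate(
            extract_pages(dump, filter_namespaces=('0',))):
        # filter_wiki strips MediaWiki markup, leaving roughly plain text
        print(pageid, title, len(filter_wiki(text)))
        if n >= 4:  # peek at the first few pages only
            break
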
def fetch_wiki_texts(in_file,
                     namespaces_to_filter=WIKI_DEFAULT_NAMESPACES_TO_FILTER,
                     min_text_length=200):
    return ((title, clean_text, page_id)
            for title, text, page_id in extract_pages(bz2.BZ2File(in_file),
                                                      namespaces_to_filter)
            for clean_text in (filter_wiki(text), )
            if len(clean_text.strip()) >= min_text_length)

def get_texts(self):
    articles, articles_all = 0, 0
    positions, positions_all = 0, 0
    texts = ((text, self.lemmatize, title, pageid)
             for title, text, pageid in extract_pages(
                 bz2.BZ2File(self.fname), self.filter_namespaces))
    pool = multiprocessing.Pool(self.processes)
    # process the corpus in smaller chunks of docs, because multiprocessing.Pool
    # is dumb and would load the entire input into RAM at once...
    for group in utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1):
        for tokens, title, pageid in pool.imap(process_article, group):  # chunksize=10
            articles_all += 1
            positions_all += len(tokens)
            # article redirects and short stubs are pruned here
            if len(tokens) < ARTICLE_MIN_WORDS or any(
                    title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES):
                continue
            articles += 1
            positions += len(tokens)
            if self.metadata:
                yield (tokens, (pageid, title))
            else:
                yield tokens
    pool.terminate()
    logger.info(
        "finished iterating over Wikipedia corpus of %i documents with %i positions"
        " (total %i articles, %i positions before pruning articles shorter than %i words)",
        articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS)
    self.length = articles  # cache corpus length

def get_wiki2():
    reobj1 = re.compile(
        r"[ `~!@#$%^&*\(\)-_=+\[\]\{\}\\\|;:\'\",<.>/?a-zA-Z\d]+")
    reobj2 = re.compile(r"\n+")
    reobj3 = re.compile("(())|(“”)|(「」)|(《》)|(“”)|(‘’)|(【】)|[,。?——!]{2,}")
    reuseful = re.compile('^[a-zA-Z]+:')
    redirect = re.compile(r"^#")

    def wiki_replace(s):
        s = filter_wiki(s)
        s = reobj1.sub("", s)  # strip the vertical bar (|) and other ASCII symbols so the file can be uploaded to Aliyun
        s = reobj2.sub("#", s)
        s = reobj3.sub("", s)
        return s

    wiki = extract_pages(bz2file.open('zhwiki-latest-pages-articles.xml.bz2'))
    with codecs.open('wiki-tw.csv', 'w', encoding='utf-8') as f:
        i = 0
        filelist = []
        for d in tqdm(wiki):
            if not reuseful.findall(d[0]) and not redirect.findall(d[1]):
                i += 1
                filelist.append(
                    reobj1.sub("", d[0]) + "|" + wiki_replace(d[1]) + "\n")
                if i % 1000 == 0:
                    s = ("".join(filelist))
                    f.write(s)
                    filelist = []
        if filelist:
            s = ("".join(filelist))
            f.write(s)

def preprocess_sentence():
    i = 0
    line = ''
    wiki = extract_pages(
        bz2file.open('./zhwiki-20190301-pages-articles.xml.bz2')
    )  # use gensim's extract_pages to pull out each page
    with open('./zhwiki_sentence.txt', 'w') as f:
        for text in wiki:
            if not re.findall('^[a-zA-Z]+:', text[0]) and text[0] and not re.findall(
                    u'^#', text[1]):  # skip help pages and redirect pages
                converted = opencc.convert(text[1]).strip()  # Traditional to Simplified Chinese
                converted = re.sub('\|\w*\]', '', converted)
                for x in converted:
                    # keep only 3-byte UTF-8 characters (CJK) that are not stop punctuation
                    if len(x.encode(
                            'utf-8')) == 3 and x not in stop_punctuation(
                                './stop_punctuation.txt'):
                        line += x
                    if x in ['\n', '。', '?', '!', ',', ';', ':'
                             ] and line != '\n':  # break lines on selected Chinese punctuation
                        f.write(line.strip() + '\n')  # write to the corpus file line by line
                        line = ''
                i += 1
                if i == 10:
                    print("Number of Chinese Wikipedia articles selected:", i)
                    break

def get_texts(self):
    """
    Iterate over the dump, returning text version of each article as a list
    of tokens.

    Only articles of sufficient length are returned (short articles & redirects
    etc are ignored).

    Note that this iterates over the **texts**; if you want vectors, just use
    the standard corpus interface instead of this function::

    >>> for vec in wiki_corpus:
    >>>     print(vec)
    """
    articles, articles_all = 0, 0
    positions, positions_all = 0, 0
    texts = ((text, title, pageid)
             for title, text, pageid in extract_pages(
                 bz2.BZ2File(self.fname), self.filter_namespaces))
    batch_idx = 0
    pool = multiprocessing.Pool(self.processes)
    # Process the corpus in smaller chunks of docs,
    # because multiprocessing.Pool is dumb and would load the entire input
    # into RAM at once...
    for group in chunkize(texts, chunksize=10 * self.processes, maxsize=1):
        for tokens, title, pageid in pool.imap(process_article, group):
            articles_all += 1
            positions_all += len(tokens)
            # article redirects and short stubs are pruned here
            to_ignored = any(
                title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES)
            if len(tokens) < ARTICLE_MIN_WORDS or to_ignored:
                continue
            articles += 1
            positions += len(tokens)
            if self.metadata:
                yield title, tokens
            else:
                yield tokens
            batch_idx += 1
            if self.max_batch and batch_idx == self.max_batch:
                break
    pool.terminate()
    logger.info(
        "Finished iterating over Wikipedia corpus of %i documents with "
        "%i positions (total %i articles, %i positions before pruning "
        "articles shorter than %i words)",
        articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS)
    self.length = articles

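# A hedged usage sketch (not part of the class above): streaming tokens from a
# WikiCorpus-style get_texts() into a gensim Dictionary without holding the corpus
# in memory. The dump path and the pruning thresholds are illustrative assumptions.
from gensim.corpora import Dictionary, WikiCorpus

wiki = WikiCorpus('enwiki-latest-pages-articles.xml.bz2', dictionary={})  # skip dict building
dictionary = Dictionary(wiki.get_texts(), prune_at=2000000)
dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=100000)
dictionary.save_as_text('wiki_wordids.txt.bz2')
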
def get_wiki():
    from opencc import OpenCC

    # See the annotated version in this blog post:
    # https://kexue.fm/archives/4176
    opencc1 = OpenCC("t2s")
    resub1 = re.compile(':*{\|[\s\S]*?\|}')
    resub2 = re.compile('<gallery>[\s\S]*?</gallery>')
    resub3 = re.compile('(.){{([^{}\n]*?\|[^{}\n]*?)}}')
    resub4 = re.compile('\* *\n|\'{2,}')
    resub5 = re.compile('\n+')
    resub6 = re.compile('\n[:;]|\n +')
    resub7 = re.compile('\n==')
    refind1 = re.compile('^[a-zA-Z]+:')
    refind2 = re.compile('^#')
    p1 = re.compile(r'-\{.*?(zh-hans|zh-cn):([^;]*?)(;.*?)?\}-')
    p2 = re.compile(r'[(\(][,;。?!\s]*[)\)]')
    p3 = re.compile(r'[「『]')
    p4 = re.compile(r'[」』]')

    def wiki_replace(s):
        s = filter_wiki(s)
        s = resub1.sub('', s)
        s = resub2.sub('', s)
        s = resub3.sub('\\1[[\\2]]', s)
        s = resub4.sub('', s)
        s = resub5.sub('\n', s)
        s = resub6.sub('\n', s)
        s = resub7.sub('\n\n==', s)
        s = p1.sub(r'\2', s)
        s = p2.sub(r'', s)
        s = p3.sub(r'“', s)
        s = p4.sub(r'”', s)
        return opencc1.convert(s).strip()

    wiki = extract_pages(
        bz2file.open(
            r'E:\02program\python\nlp\data\corpus\zhwiki-latest-pages-articles.xml.bz2'
        ))
    # wiki = WikiCorpus('zhwiki-latest-pages-articles.xml.bz2', lemmatize=False, dictionary={})
    with codecs.open('wiki.csv', 'w', encoding='utf-8') as f:
        i = 0
        filelist = []
        for d in tqdm(wiki):
            if not refind1.findall(d[0]) and d[0] and not refind2.findall(d[1]):
                filelist.append(d[0] + "\n" + d[1])
                line = d[1]
                i += 1
                if i % 100 == 0:
                    s = wiki_replace("\n\n".join(filelist))
                    f.write(s)
                    filelist = []

def _iterate_over_pages(fname):
    """
    Iterate over the pages in a Wikipedia articles database dump
    (*articles.xml.bz2), yielding one (page id, title, page content)
    3-tuple at a time.
    """
    dictionary = Dictionary()
    wiki = WikiCorpus(fname, lemmatize=False, dictionary=dictionary)
    for title, content, page_id in extract_pages(bz2.BZ2File(wiki.fname),
                                                 wiki.filter_namespaces):
        yield (page_id, title, content)

def load_doc(dump_file, title, filter_namespaces=('0', )):
    """
    Load a Wikipedia article from its title.
    """
    bz2_file = bz2.BZ2File(dump_file)
    for page_title, text, pageid in extract_pages(bz2_file, filter_namespaces):
        if page_title == title:
            text = filter_wiki(text)
            tokens = extract_jp_entities(text)
            return tokens, pageid
    return None, None

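# Hedged usage sketch for load_doc above; the dump path and the article title are
# placeholder assumptions, not values taken from the original code.
tokens, pageid = load_doc('jawiki-latest-pages-articles.xml.bz2', u'東京')
if tokens is not None:
    print(pageid, len(tokens))
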
def extract_text_content(xml_dump):
    # article_count = 0
    with open('wiki.en.txt', 'w') as file:
        for title, content, pageid in tqdm(extract_pages(xml_dump)):
            try:
                file.write(filter_wiki(content).strip() + "\n")
                # article_count += 1
                # if article_count % 10000 == 0:
                #     logging.info(f'{article_count} articles processed')
            except Exception as e:
                logging.warning(str(e))

def __iter__(self):
    with bz2.BZ2File(self._dump_file) as f:
        c = 0
        for (title, wiki_text, wiki_id) in wikicorpus.extract_pages(f):
            if any([title.lower().startswith(ns) for ns in self._ignored_ns]):
                continue
            c += 1
            yield WikiPage(six.text_type(title), self._language,
                           six.text_type(wiki_text))
            if c % 10000 == 0:
                logger.info('Processed: %d', c)

def _extract_article_onebyone(self):
    wiki_pages = extract_pages(
        bz2file.open(self.download_wiki_articles_dump()))
    counter = 0
    w = tqdm(wiki_pages, desc=u'get 0 article')
    for d in w:
        if not re.findall('^[a-zA-Z]+:', d[0]) and d[0] and not re.findall(
                u'^#', d[1]):
            yield d
            counter += 1
            if counter % 100 == 0:
                w.set_description(u'processed %s article' % counter)

def _iterate_over_pages(fname):
    """
    Iterate over the pages in a Wikipedia articles database dump
    (*articles.xml.bz2), yielding one (page id, title, page content)
    3-tuple at a time.
    """
    dictionary = Dictionary()
    wiki = WikiCorpus(fname, lemmatize=False, dictionary=dictionary,
                      filter_namespaces={'0'})
    for title, content, page_id in extract_pages(bz2.BZ2File(wiki.fname),
                                                 wiki.filter_namespaces):
        yield (page_id, title, content)

def main(args):
    os.makedirs(args.outdir, exist_ok=True)
    filter_namespaces = ('0', )
    out_i = 0
    for in_fname in glob.glob(args.inglob):
        for title, text, pageid in extract_pages(
                bz2.BZ2File(in_fname), filter_namespaces=filter_namespaces):
            if out_i % args.skip == 0:
                with open(os.path.join(args.outdir, f'{pageid}.txt'), 'w') as f:
                    f.write(filter_wiki(text))
            out_i += 1

def iterate_wiki(input_path):
    lemmatize = utils.has_pattern()
    filter_namespaces = ('0',)
    texts = ((text, lemmatize, title, pageid)
             for title, text, pageid in extract_pages(bz2.BZ2File(input_path),
                                                      filter_namespaces))
    for article in texts:
        text, lemmatize, title, pageid = article
        text = utils.to_unicode(text, 'utf8', errors='ignore')
        text = utils.decode_htmlentities(text)  # '&nbsp;' --> '\xa0'
        text = remove_markup(text)
        tokens = get_all_words(text)
        if len(tokens) < ARTICLE_MIN_WORDS or any(
                title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES):
            continue
        yield title, tokens

def loadSimpleWiki():
    # file_name = "C:/Users/Admin/Anaconda2/envs/py27/corpora/wiki2017/simplewiki-20170820-pages-meta-current.xml.bz2"
    # counter = 0
    # ... 179,620 articles found
    file_name = "C:/Users/Admin/Anaconda2/envs/py27/corpora/wiki2017/simplewiki-20170820-pages-meta-current.xml.bz2"
    wiki = WikiCorpus(file_name, lemmatize=False, dictionary={})  # vocab dict not needed
    i = 0
    # for text in wiki.get_texts():
    allwords = defaultdict(lambda: 0)
    allsents = []
    print("starting...")
    dfcounter_stemmed = defaultdict(lambda: 0)
    dfcounter_nostem = defaultdict(lambda: 0)
    for (title, article, pageid) in extract_pages(
            bz2.BZ2File(file_name), filter_namespaces=('0', )):  # filter_namespaces=["0"]
        if len(article) == 0:
            continue
        text = process_article((filter_wiki(article), False, title, pageid),
                               tokenizer_func=tokenizer)
        # We are ONLY interested in whether or not the term appeared in this
        # document, NOT how many times or where.
        text_orig = set(text[0][0])
        text_stemmed = set(text[0][1])
        for term in text_orig:
            dfcounter_nostem[term] += 1
        for term in text_stemmed:
            dfcounter_stemmed[term] += 1
        i += 1
        if i % 1000 == 0:
            print(i)
            # break
    print(i)
    handle = open("simplewiki_docfreqs_stemmed.txt", "w+")
    for key, val in dfcounter_stemmed.items():
        handle.write(str(key) + "\t" + str(val) + "\n")
    handle.close()
    handle = open("simplewiki_docfreqs_nostem.txt", "w+")
    for key, val in dfcounter_nostem.items():
        handle.write(str(key) + "\t" + str(val) + "\n")
    handle.close()

def wiki_bz_process(self, language):
    wiki = extract_pages(bz2file.open(self.corpus))
    f = codecs.open(os.path.join(os.path.dirname(self.corpus), 'wiki.txt'),
                    'w', encoding='utf-8')
    w = tqdm(wiki, desc="Currently get 0 files!")
    if language == 'zh':
        for i, d in enumerate(w):
            if not re.findall('^[a-zA-Z]+:', d[0]) and not re.findall(u'^#', d[1]):
                s = self.wiki_replace(d)
                f.write(s + '\n\n\n')
                i += 1
                if i % 100 == 0:
                    w.set_description('Currently got %s files' % i)
    elif language == 'en':
        pass

def __iter__(self):
    with bz2.BZ2File(self._dump_file) as f:
        c = 0
        for (title, wiki_text, wiki_id) in wikicorpus.extract_pages(f):
            if any(
                [title.lower().startswith(ns) for ns in self._ignored_ns]
            ):
                continue
            c += 1
            yield WikiPage(
                unicode(title), self._language, unicode(wiki_text)
            )
            if c % 10000 == 0:
                logger.info('Processed: %d', c)

def build_dict(N):
    pages = extract_pages("enwiki-20181220-pages-articles-multistream.xml")
    page_dict = {}
    elect_id = random.randint(1, 500)
    id = 0
    cnt = 1
    for t in pages:
        if cnt > N:
            break
        id += 1
        if id == elect_id:
            title = t[0]
            interlinks = find_interlinks(str(t))
            outlink_num = len(interlinks)
            page_dict[title] = [cnt, outlink_num, list(interlinks.keys())]
            cnt += 1
            elect_id += random.randint(1, 150)
    return page_dict

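# Hedged usage note for build_dict above: N bounds how many randomly "elected" pages
# are kept, and each dict value is [order, outlink count, outlink titles]. The call
# below is illustrative only.
sample = build_dict(100)
for title, (order, outlink_num, outlinks) in list(sample.items())[:3]:
    print(order, title, outlink_num)
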
def wiki_process(input_file, save_path):
    # parse the dump with gensim's wikicorpus
    wiki = extract_pages(bz2file.open(input_file))
    # clean up and export
    i = 0
    f = codecs.open(save_path, 'w', encoding='utf-8')
    w = tqdm(wiki, desc=u'Fetched 0 articles')
    openCC = OpenCC('t2s')
    for d in w:
        if not re.findall('^[a-zA-Z]+:', d[0]) and d[0] and not re.findall(
                u'^#', d[1]):
            s = wiki_replace(d)
            f.write(s + '\n\n\n')
            i += 1
            if i % 100 == 0:
                w.set_description(u'Fetched %s articles' % i)
    f.close()

def main():
    parser = ArgumentParser()
    parser.add_argument('-w', '--wiki-file')
    args = parser.parse_args()
    if args.wiki_file:
        f = bz2.BZ2File(args.wiki_file)
    else:
        f = sys.stdin
    texts = ((text, title, pageid)
             for title, text, pageid in extract_pages(f))
    for text, title, pageid in texts:
        text = text.replace('\n', ' ')  # str.replace returns a new string; assign it
        sys.stdout.write(text.encode('utf-8'))
        sys.stdout.write('\n')
    f.close()

def prepare_data(filename, destname):
    pages = wc.extract_pages(bz2.BZ2File(filename), ('0',))
    corpus = []
    x = []
    y = []
    count = 0
    for p in pages:
        text = wc.filter_wiki(p[1])
        tokens = [token.encode('utf8')
                  for token in utils.tokenize(text, errors='ignore')
                  if len(token) <= 15 and not token.startswith('_')]
        if len(tokens) >= 50:
            length = 0
            old_i = 0
            for i, token in enumerate(tokens):
                length += len(token)
                if length > MAX_CHAR_LENGTH:
                    corpus.append(tokens[old_i: i])
                    length = len(token)
                    old_i = i
                if i == len(tokens) - 1:
                    corpus.append(tokens[i:])
    count = 0
    for sent in corpus:
        count += 1
        if count >= 100000:
            break
        sent_y = []
        sent_x = []
        for token in sent:
            if all([65 <= c <= 90 or 97 <= c <= 122 for c in token]):
                sent_y.extend([False] * (len(token) - 1) + [True])
                sent_x.extend([c - 64 if c <= 90 else c - 70 for c in token])
        sent_y.extend([False] * (MAX_CHAR_LENGTH - len(sent_x)))
        sent_x.extend([0] * (MAX_CHAR_LENGTH - len(sent_x)))
        y.append(sent_y)
        x.append(sent_x)
        if len(sent_x) != MAX_CHAR_LENGTH:
            print(len(sent_x))
    x = np.array(x)
    y = np.array(y)
    pickle.dump(x, open(os.path.abspath(destname + '_x'), 'wb'))
    pickle.dump(y, open(os.path.abspath(destname + '_y'), 'wb'))

def get_wiki2():
    from opencc import OpenCC

    # See the annotated version in this blog post:
    # https://kexue.fm/archives/4176
    opencc1 = OpenCC("t2s")
    reobj1 = re.compile(
        "[ `~!@#$%^&*\(\)-_=+\[\]\{\}\\\|;:\'\",<.>/?a-zA-Z\d]+")
    reobj2 = re.compile(r"\n+")
    reobj3 = re.compile("(())|(“”)|(「」)|(《》)|(“”)|(‘’)|(【】)|[,。?——!]{2,}")
    reuseful = re.compile('^[a-zA-Z]+:')
    redirect = re.compile(r"^#")

    def wiki_replace(s):
        s = filter_wiki(s)
        s = reobj1.sub("", s)  # strip the vertical bar (|) and other ASCII symbols so the file can be uploaded to Aliyun
        s = reobj2.sub("#", s)
        s = reobj3.sub("", s)
        return opencc1.convert(s).strip()

    wiki = extract_pages(
        bz2file.open(
            r'E:\02program\python\nlp\data\corpus\zhwiki-latest-pages-articles.xml.bz2'
        ))
    with codecs.open('wiki-tw.csv', 'w', encoding='utf-8') as f:
        i = 0
        filelist = []
        for d in tqdm(wiki):
            if not reuseful.findall(d[0]) and not redirect.findall(d[1]):
                i += 1
                try:
                    filelist.append(
                        reobj1.sub("", d[0]) + "|" + wiki_replace(d[1]) + "\n")
                except Exception as e:
                    print(d[0], '=' * 10, d[1])
                if i % 1000 == 0:
                    s = ("".join(filelist))
                    f.write(s)
                    filelist = []
        if filelist:
            s = ("".join(filelist))
            f.write(s)

def PreProcessing():
    print('begin time of the program is: ', time.ctime())
    tuPle = WIKI.extract_pages(xml_path)
    global tot_word
    # read the xml dump into a generator of pages (evaluated lazily)
    cnt_time = 0
    while cnt_time < tot_number:
        curr_page = next(tuPle)
        redirects = [
            redirect for keyword, redirect in WIKI.find_interlinks(
                curr_page[1]).items()
        ]
        cnt_time += 1
        # extract the title and the redirect title
        curr_title = curr_page[0]
        if curr_title not in WordDict:
            WordDict[curr_title] = tot_word
            NumDict[tot_word] = curr_title
            tot_word += 1
        org_id = WordDict[curr_title]  # set the id of the word
        # sum_redirect = len(redirects)
        for redirect_title in redirects:
            if redirect_title not in WordDict:
                WordDict[redirect_title] = tot_word
                NumDict[tot_word] = redirect_title
                # link_id = WordDict[redirect_title]
                tot_word += 1
            link_id = WordDict[redirect_title]
            if org_id not in OutLink:
                OutLink[org_id] = []
            OutLink[org_id].append(link_id)
            if link_id not in InLink:
                InLink[link_id] = []
            InLink[link_id].append(org_id)
            # addtwodimdict(RankScore, org_id, link_id, 1/sum_redirect)
    print('end time of the pre-processing is: ', time.ctime())

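# Hedged follow-on sketch (an assumption about intent, not code from the snippet above):
# the OutLink map built by PreProcessing() is a plain {node_id: [target_ids]} adjacency
# list, which is enough to drive a basic PageRank power iteration like this one.
def pagerank(out_link, num_nodes, damping=0.85, iterations=20):
    score = [1.0 / num_nodes] * num_nodes
    for _ in range(iterations):
        new_score = [(1.0 - damping) / num_nodes] * num_nodes
        for src, targets in out_link.items():
            if not targets:
                continue  # dangling nodes simply leak their mass in this simple sketch
            share = damping * score[src] / len(targets)
            for dst in targets:
                new_score[dst] += share
        score = new_score
    return score


# Toy example in the same {id: [ids]} shape as OutLink:
# print(pagerank({0: [1, 2], 1: [2], 2: [0]}, num_nodes=3))
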
def wikipedia_extract(input_file, output_dir):
    # character chunk
    chunk_size = 50000000
    os.makedirs(output_dir, exist_ok=True)
    fin = xutils.open_file(input_file, 'rt')
    extractor = extract_pages(fin, ['0'])
    fout, counter, chunk = None, chunk_size, -1
    for page in extractor:
        if page[1]:
            text = filter_wiki(page[1])
            if counter >= chunk_size:
                if fout:
                    fout.close()
                counter, chunk = 0, chunk + 1
                output_file = '%s/%s_%d.txt.gz' % (
                    output_dir, os.path.basename(input_file), chunk)
                fout = xutils.open_file(output_file, 'wt')
                print(output_file)
            counter += len(text)
            fout.write(text)
            fout.write('\n\n\n\n')
    fin.close()

def summarize_wiki():
    ignore_namespaces = 'Wikipedia Category File Portal Template MediaWiki User Help Book Draft'.split()
    t1 = time.time()
    successful = 0
    failed = 0
    timedout = 0
    extracted_pages = extract_pages(bz2.BZ2File(wiki_fname), ('0', ))
    for title, text, pageid in extracted_pages:
        if any(title.startswith(ignore + ':') for ignore in ignore_namespaces):
            continue
        try:
            summarize_timeout(text)
            successful += 1
        except TimeoutError:
            print "Timeout summarizing article", title, "with id", pageid
            timedout += 1
        except RuntimeError:
            failed += 1
        if (successful + failed) % 1000 == 0:
            print "Article", successful + failed, "summarized."
            time.sleep(1)
    t2 = time.time()
    print "Successful summaries:", successful
    print "Failed summaries:", failed
    print "Timeout summaries:", timedout
    print "t1:", t1
    print "t2", t2
    print "dt", t2 - t1

proper_names_dict = {
    'ουσιαστικό': 'nouns',
    'επίθετο': 'adjectives',
    'άρθρο': 'dets',
    'επίρρημα': 'adverbs',
    'κύριο όνομα': 'proper_names',
    'μετοχή': 'participles',
    'ρήμα': 'verbs'
}
expected_parts_dict = {}
for expected_part in expected_parts:
    expected_parts_dict[expected_part] = []

other_parts = {}

for title, text, pageid in extract_pages(wiktionary_file_path):
    if text.startswith('#REDIRECT'):
        continue
    title = title.lower()
    all_regex = regex.findall(text)
    all_regex.extend(regex2.findall(text))
    for a in all_regex:
        if a in expected_parts:
            expected_parts_dict[a].append(title)

for i in expected_parts_dict:
    with open('_{0}.py'.format(proper_names_dict[i]), 'w') as f:
        f.write('from __future__ import unicode_literals\n')
        f.write('{} = set(\"\"\"\n'.format(proper_names_dict[i].upper()))
        words = sorted(expected_parts_dict[i])
        line = ''

def config_argparser():
    argparser = argparse.ArgumentParser(description='Wikipedia Dump Extractor')
    argparser.add_argument('-input_path', type=str, required=True,
                           help='Path to the raw Wikipedia dump')
    argparser.add_argument('-output_path', type=str, required=True,
                           help='Write path for extracted text content')
    return argparser.parse_args()


if __name__ == '__main__':
    arguments = config_argparser()
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    lemmatize = utils.has_pattern()
    filter_namespaces = ('0',)
    texts = ((text, lemmatize, title, pageid)
             for title, text, pageid in extract_pages(
                 bz2.BZ2File(arguments.input_path), filter_namespaces))
    parsed_article_counter = 0
    space = u' '
    output = codecs.open(arguments.output_path, 'w', 'utf-8')
    for article in texts:
        text, lemmatize, title, pageid = article
        text = utils.to_unicode(text, 'utf8', errors='ignore')
        text = utils.decode_htmlentities(text)  # '&nbsp;' --> '\xa0'
        text = remove_markup(text)
        tokens = get_all_words(text)
        if len(tokens) < ARTICLE_MIN_WORDS or any(
                title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES):
            continue
        output.write("{}\n".format(space.join(tokens) + "\n"))
        parsed_article_counter += 1
    print('Parsed articles: {}'.format(parsed_article_counter))

from gensim.corpora.wikicorpus import extract_pages, filter_wiki
import bz2file
import re
from opencc import OpenCC
from tqdm import tqdm
import codecs
import sys

if len(sys.argv) < 3:
    print("argc less 3")
    sys.exit(1)
inp, outp = sys.argv[1:3]
print("inp file:%s" % inp)
print("outp file:%s" % outp)

wiki = extract_pages(bz2file.open(inp))
c = OpenCC('t2s')


def wiki_replace(d):
    s = d[1]
    s = re.sub(':*{\|[\s\S]*?\|}', '', s)
    s = re.sub('<gallery>[\s\S]*?</gallery>', '', s)
    s = re.sub('(.){{([^{}\n]*?\|[^{}\n]*?)}}', '\\1[[\\2]]', s)
    s = filter_wiki(s)
    s = re.sub('\* *\n|\'{2,}', '', s)
    s = re.sub('\n+', '\n', s)
    s = re.sub('\n[:;]|\n +', '\n', s)
    s = re.sub('\n==', '\n\n==', s)
    s = u'【' + d[0] + u'】\n' + s
    # print("string:%s" % s)

import pickle


def fantojan_savepickle(input):
    # this function will save every line into a pickle
    i = 0
    w = tqdm(input, desc=u'Fetched 0 articles')  # tqdm is for the progress-bar prompt
    for d in w:
        if not re.findall('^[a-zA-Z]+:', d[0]) and d[0] and not re.findall(
                u'^#', d[1]):
            outputfile = codecs.open(
                '/Users/wisdombeat/PycharmProjects/wiki_txt/wikis/line' +
                str(i) + '.pkl', 'wb+')
            s = wiki_replace(d)
            pickle.dump(s, outputfile)
            outputfile.close()
            i += 1
            if i % 100 == 0:
                w.set_description(u'Fetched %s articles' % i)
    w.set_description(u'All done, there are %s articles' % i)
    return


if __name__ == "__main__":
    openbz2 = extract_pages(
        bz2file.open(
            '/Users/wisdombeat/Desktop/zhwiki-latest-pages-articles.xml.bz2'))
    # resultname = '/Users/wisdombeat/PycharmProjects/wiki_txt/wiki.txt'
    # fantojan_savetxt(input=openbz2, output=resultname)
    fantojan_savepickle(openbz2)

def get_pos_from_wiktionary():
    import re
    from gensim.corpora.wikicorpus import extract_pages

    regex = re.compile(r"==={{(\w+)\|el}}===")
    regex2 = re.compile(r"==={{(\w+ \w+)\|el}}===")

    # get words based on the Wiktionary dump
    # check only for specific parts
    # ==={{κύριο όνομα|el}}===
    expected_parts = [
        "μετοχή",
        "ρήμα",
        "επίθετο",
        "επίρρημα",
        "ουσιαστικό",
        "κύριο όνομα",
        "άρθρο",
    ]

    wiktionary_file_path = (
        "/data/gsoc2018-spacy/spacy/lang/el/res/elwiktionary-latest-pages-articles.xml"
    )

    proper_names_dict = {
        "ουσιαστικό": "nouns",
        "επίθετο": "adjectives",
        "άρθρο": "dets",
        "επίρρημα": "adverbs",
        "κύριο όνομα": "proper_names",
        "μετοχή": "participles",
        "ρήμα": "verbs",
    }
    expected_parts_dict = {}
    for expected_part in expected_parts:
        expected_parts_dict[expected_part] = []

    for title, text, pageid in extract_pages(wiktionary_file_path):
        if text.startswith("#REDIRECT"):
            continue
        title = title.lower()
        all_regex = regex.findall(text)
        all_regex.extend(regex2.findall(text))
        for a in all_regex:
            if a in expected_parts:
                expected_parts_dict[a].append(title)

    for i in expected_parts_dict:
        with open("_{0}.py".format(proper_names_dict[i]), "w") as f:
            f.write("from __future__ import unicode_literals\n")
            f.write('{} = set("""\n'.format(proper_names_dict[i].upper()))
            words = sorted(expected_parts_dict[i])
            line = ""
            to_write = []
            for word in words:
                if len(line + " " + word) > 79:
                    to_write.append(line)
                    line = ""
                else:
                    line = line + " " + word
            f.write("\n".join(to_write))
            f.write('\n""".split())')

from db_connect import get_cursor

dbc = get_cursor()

'''
CREATE TABLE `wiki_pages` (
  `id` varchar(255) NOT NULL,
  `title` varchar(255) NOT NULL,
  `content` mediumtext NOT NULL,
  `is_artist` tinyint(1) NOT NULL,
  `size` int(11) NOT NULL
) DEFAULT CHARSET=utf8;

ALTER TABLE `wiki_pages`
  ADD PRIMARY KEY (`id`),
  ADD KEY `is_artist` (`is_artist`),
  ADD KEY `size` (`size`),
  ADD KEY `title` (`title`);
'''

with open('enwiki-latest-pages-articles.xml') as fh:
    gen = corpus.extract_pages(fh)
    i = 0
    for title, text, pgid in gen:
        text = text.lower()
        if 'infobox musical artist' in text and ':' not in title:
            dbc.execute('INSERT INTO wiki_pages (id, title, content, is_artist) VALUES(%s, %s, %s, 1)',
                        [pgid, title, text])
            print i, title
            i += 1

import sys

sys.path.remove(
    '/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python'
)
# sys.path.remove('/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/PyObjC')
sys.path.append('/Users/xujiaxing/anaconda/lib/python3.6/site-packages')
# sys.path.append('/Users/xujiaxing/anaconda/lib/python3.6/site-packages/opencc.py')
from gensim.corpora.wikicorpus import extract_pages, filter_wiki
import bz2file
import re
import opencc
from tqdm import tqdm
import codecs

wiki = extract_pages(bz2file.open('zhwiki-latest-pages-articles.xml.bz2'))


def wiki_replace(d):
    s = d[1]
    s = re.sub(':*{\|[\s\S]*?\|}', '', s)
    s = re.sub('<gallery>[\s\S]*?</gallery>', '', s)
    s = re.sub('(.){{([^{}\n]*?\|[^{}\n]*?)}}', '\\1[[\\2]]', s)
    s = filter_wiki(s)
    s = re.sub('\* *\n|\'{2,}', '', s)
    s = re.sub('\n+', '\n', s)
    s = re.sub('\n[:;]|\n +', '\n', s)
    s = re.sub('\n==', '\n\n==', s)
    s = u'【' + d[0] + u'】\n' + s
    return opencc.convert(s).strip()

# -*- coding: utf-8 -*-
"""
Created on Sat Nov 16 21:09:51 2019

@author: us
"""
from gensim.corpora.wikicorpus import extract_pages, filter_wiki
import bz2file
import re
# import opencc
from opencc import OpenCC
from tqdm import tqdm
import codecs

cc = OpenCC('t2s')  # convert from Traditional Chinese to Simplified Chinese
wiki = extract_pages(bz2file.open('zhwiki-20190720-pages-articles.xml.bz2'))


def wiki_replace(d):
    s = d[1]
    s = re.sub(':*{\|[\s\S]*?\|}', '', s)
    s = re.sub('<gallery>[\s\S]*?</gallery>', '', s)
    s = re.sub('(.){{([^{}\n]*?\|[^{}\n]*?)}}', '\\1[[\\2]]', s)
    s = filter_wiki(s)
    s = re.sub('\* *\n|\'{2,}', '', s)
    s = re.sub('\n+', '\n', s)
    s = re.sub('\n[:;]|\n +', '\n', s)
    s = re.sub('\n==', '\n\n==', s)
    s = u'【' + d[0] + u'】\n' + s
    return cc.convert(s).strip()


i = 0

wiki = gensim.corpora.WikiCorpus(input_file, lemmatize=False, dictionary={})
for text in tqdm(list(wiki.get_texts())):
    str_line = bytes.join(b' ', text).decode()
    f.write(str_line + '\n')

# from https://spaces.ac.cn/archives/4176/
from gensim.corpora.wikicorpus import extract_pages, filter_wiki
import bz2file
import re
import opencc
from tqdm import tqdm
import codecs

input_file = "E:\matt\get\wiki\zhwiki-20180301-pages-articles-multistream.xml.bz2"
wiki = extract_pages(bz2file.open(input_file))


def wiki_replace(d):
    s = d[1]
    s = re.sub(':*{\|[\s\S]*?\|}', '', s)
    s = re.sub('<gallery>[\s\S]*?</gallery>', '', s)
    s = re.sub('(.){{([^{}\n]*?\|[^{}\n]*?)}}', '\\1[[\\2]]', s)
    s = filter_wiki(s)
    s = re.sub('\* *\n|\'{2,}', '', s)
    s = re.sub('\n+', '\n', s)
    s = re.sub('\n[:;]|\n +', '\n', s)
    s = re.sub('\n==', '\n\n==', s)
    s = u'【' + d[0] + u'】\n' + s
    return opencc.convert(s).strip()