def gen_entity_mentions_map(wiki_path, name2wikiid, wikiid2id, ent_max):
    hyp_pattern = re.compile(r'<a[^>]*href="([^">]+)"[^>]*>([^>]+)</a>', re.DOTALL | re.UNICODE)
    ignored_names = set()
    wikiid2mentions = {}
    with open(wiki_path, 'r', encoding='utf8') as inf:
        for line in inf:
            if len(wikiid2mentions) % 100 < 10:
                show_progress(percent=len(wikiid2mentions) / ent_max)
            clean_text = wikicorpus.filter_wiki(line)
            hyp_matches = re.finditer(hyp_pattern, line)
            for link in hyp_matches:
                name = unquote(link.groups()[0])  # wikipedia url id
                mention = link.groups()[1]
                if name in name2wikiid:
                    wikiid = name2wikiid[name]
                    if wikiid not in wikiid2mentions:
                        wikiid2mentions[wikiid] = []
                    wikiid2mentions[wikiid].append(mention)
                else:
                    ignored_names.add(name)
    return wikiid2mentions
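# A self-contained sketch of the anchor-extraction regex used above, assuming
# the extracted-wiki format where links appear as HTML anchors; the sample
# line is illustrative.
import re
from urllib.parse import unquote

line = '<a href="Albert%20Einstein">Einstein</a> developed relativity.'
m = re.search(r'<a[^>]*href="([^">]+)"[^>]*>([^>]+)</a>', line)
print(unquote(m.group(1)), '->', m.group(2))  # Albert Einstein -> Einstein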
def preprocess_wikidata(raw):
    # Initialize tokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    # Initialize lemmatizer
    lemma = WordNetLemmatizer()
    # Create English stop words list
    en_stop = get_stop_words('en')
    # Decode wiki markup entities and remove markup
    text = filter_wiki(raw)
    text = re.sub(filter_more, '', text)
    # Clean and tokenize document string
    text = text.lower().split('../img/')[0]
    tokens = tokenizer.tokenize(text)
    # Remove stop words from tokens
    tokens = [i for i in tokens if i not in en_stop]
    # Lemmatize tokens
    tokens = [lemma.lemmatize(i) for i in tokens]
    # Remove non-alphabetic characters
    tokens = [re.sub(r'[^a-z]', '', i) for i in tokens]
    # Drop tokens shorter than three characters
    tokens = [i for i in tokens if len(i) > 2]
    return (tokens, text)
def fetch_wiki_texts(in_file, namespaces_to_filter=WIKI_DEFAULT_NAMESPACES_TO_FILTER,
                     min_text_length=200):
    return ((title, clean_text, page_id)
            for title, text, page_id in extract_pages(bz2.BZ2File(in_file), namespaces_to_filter)
            for clean_text in (filter_wiki(text),)
            if len(clean_text.strip()) >= min_text_length)
def preprocess(raw):
    # Initialize tokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    # Initialize lemmatizer
    lemma = WordNetLemmatizer()
    # Create English stop words list
    en_stop = get_stop_words('en')
    # Decode wiki markup entities and remove markup
    text = filter_wiki(raw)
    text = re.sub(filter_more, '', text)
    # Clean and tokenize document string
    text = text.lower()
    tokens = tokenizer.tokenize(text)
    # Remove stop words from tokens
    tokens = [i for i in tokens if i not in en_stop]
    # Lemmatize tokens
    tokens = [lemma.lemmatize(i) for i in tokens]
    # Remove non-alphabetic characters
    tokens = [re.sub(r'[^a-z]', '', i) for i in tokens]
    # Drop tokens shorter than three characters
    tokens = [i for i in tokens if len(i) > 2]
    return tokens
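# A minimal sketch of the markup-stripping step the two preprocess variants
# above share, assuming gensim is installed; filter_more, the stop list, and
# the lemmatizer are project-level dependencies, so only filter_wiki is
# exercised here on an illustrative fragment.
from gensim.corpora.wikicorpus import filter_wiki

print(filter_wiki("'''Apple''' is a [[fruit]] that grows on trees."))
# -> Apple is a fruit that grows on trees.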
def process_article(args):
    title = args[0]
    text = filter_wiki(args[1])
    text = utils.to_unicode(text, encoding='utf8', errors='strict')
    text = text.replace('\n', ' ')
    pageid = args[2]
    return title, text, pageid
def get_text(self):
    result = ""
    elelist = self.dom.getElementsByTagName('rev')
    if elelist.length != 0:  # `is not 0` relied on int identity; compare by value
        ele = elelist[0]
        s = filter_wiki(ele.childNodes[0].data).encode('sjis', 'ignore')
        result = re.sub(r'[^a-zA-Z ]', '', s.decode('sjis', 'ignore')).lower()
    return result
def _get_plaintext(content):
    return WIKI_CRUFT_RE.sub(
        r'', WIKI_NEWLINE_RE.sub(
            r'\n', WIKI_HEADER_RE.sub(
                r'\1', WIKI_QUOTE_RE.sub(
                    r'', filter_wiki(content))))).strip()
def segment(page_xml, include_interlinks=False):
    """Parse the content inside a page tag.

    Parameters
    ----------
    page_xml : str
        Content from page tag.
    include_interlinks : bool
        Whether or not interlinks should be parsed.

    Returns
    -------
    (str, list of (str, str), (Optionally) dict of (str: str), str)
        Structure contains (title, [(section_heading, section_content), ...],
        (Optionally) {interlinks}, article id).

    """
    elem = cElementTree.fromstring(page_xml)
    filter_namespaces = ('0',)
    namespace = get_namespace(elem.tag)
    ns_mapping = {"ns": namespace}
    text_path = "./{%(ns)s}revision/{%(ns)s}text" % ns_mapping
    title_path = "./{%(ns)s}title" % ns_mapping
    id_path = "./{%(ns)s}id" % ns_mapping
    ns_path = "./{%(ns)s}ns" % ns_mapping
    lead_section_heading = "Introduction"
    top_level_heading_regex = r"\n==[^=].*[^=]==\n"
    top_level_heading_regex_capture = r"\n==([^=].*[^=])==\n"

    title = elem.find(title_path).text
    articleID = elem.find(id_path).text
    text = elem.find(text_path).text
    ns = elem.find(ns_path).text
    if ns not in filter_namespaces:
        text = None

    if text is not None:
        if include_interlinks:
            interlinks = find_interlinks(text)
        section_contents = re.split(top_level_heading_regex, text)
        section_headings = [lead_section_heading] + re.findall(top_level_heading_regex_capture, text)
        section_headings = [heading.strip() for heading in section_headings]
        assert len(section_contents) == len(section_headings)
    else:
        interlinks = []
        section_contents = []
        section_headings = []

    section_contents = [filter_wiki(section_content) for section_content in section_contents]
    sections = list(zip(section_headings, section_contents))

    if include_interlinks:
        return title, sections, interlinks, articleID
    else:
        return title, sections, articleID
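# A sketch of driving segment() on a single <page> element, assuming the
# MediaWiki export namespace shown below; the page content is illustrative.
_PAGE_XML = """<page xmlns="http://www.mediawiki.org/xml/export-0.10/">
  <title>Example</title>
  <ns>0</ns>
  <id>123</id>
  <revision><text>Lead paragraph.

==History==
Body text.
</text></revision>
</page>"""

title, sections, article_id = segment(_PAGE_XML)
# sections -> roughly [('Introduction', 'Lead paragraph.\n'), ('History', 'Body text.\n')]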
def iter_wiki(dump_file):  # making a wiki token stream
    """Yield the token list of each sufficiently long article from the Wikipedia dump."""
    ignore_namespaces = 'Wikipedia Category File Portal Template MediaWiki User Help Book Draft'.split()
    for title, text, pageid in _extract_pages(smart_open(dump_file)):
        text = filter_wiki(text)
        tokens = tokenize(text)
        if len(tokens) < 50 or any(title.startswith(ns + ':') for ns in ignore_namespaces):
            continue  # ignore short articles and various meta-articles
        yield tokens
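# A quick smoke test for the stream above, assuming a local dump at the
# illustrative path below; islice caps how much of the dump is read.
from itertools import islice

for tokens in islice(iter_wiki('enwiki-latest-pages-articles.xml.bz2'), 3):
    print(len(tokens), tokens[:8])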
def iter_wiki(self):
    """Yield each article from the Wikipedia dump, as a `(title, tokens)` 2-tuple."""
    ignore_namespaces = 'Wikipedia Category File Portal Template MediaWiki User Help Book Draft'.split()
    for title, text, pageid in _extract_pages(smart_open(self.dump_file)):
        text = filter_wiki(text)
        tokens = [token for token in simple_preprocess(text) if token not in STOPWORDS]
        if len(tokens) < 50 or any(title.startswith(ns + ':') for ns in ignore_namespaces):
            continue  # ignore short articles and various meta-articles
        yield title, tokens
def process_article(args): """ Parse a wikipedia article, returning its content as a list of tokens (utf8-encoded strings). """ text, title, pageid = args text = filter_wiki(text) tokens = extract_jp_entities(text) if text else [] return tokens, title, pageid
def wiki_docs(dir="data/simple_wiki"):
    """Yield the filtered plain text of every article file under `dir`.

    :param dir: directory (relative to BASE_DIR) containing the article files
    :return: generator of plain-text documents
    """
    for filename in os.listdir(os.path.join(BASE_DIR, dir)):
        with open(os.path.join(BASE_DIR, dir, filename)) as f:
            doc = filter_wiki(f.read())
            yield doc
def process_article(args):  # override original method in wikicorpus.py
    text, lemmatize, title, pageid = args
    text = filter_wiki(text)
    # result = utils.lemmatize(text)
    if lemmatize:
        result = utils.lemmatize(text)
    else:
        result = tokenize(text)
    return result, title, pageid
def iter_wiki(dump_file): ignore_namespaces = "Wikipedia Category File Portal Template MediaWiki User Help Book Draft".split( ) for title, text, pageid in _extract_pages(smart_open(dump_file)): text = filter_wiki(text) tokens = tokenize(text) if len(tokens) < 50 or any( title.startswith(ns + ':') for ns in ignore_namespaces): continue yield title, tokens
def make_corpus(args):
    # Create a new, empty file for the corpus
    wikidump_file = args.dump_file
    lang_code = wikidump_file.split("/")[-1][:2]
    output_filename = args.output_file
    f = open(output_filename, "w+")
    f.close()
    nlp = stanfordnlp.Pipeline(processors="tokenize", lang=lang_code,
                               models_dir="/u/nlp/data/stanfordnlp_resources/")
    total_tokens = 0
    checkpoint = 100000
    for event, elem in etree.iterparse(args.dump_file,
                                       events=('start', 'end', 'start-ns', 'end-ns')):
        if event == 'end':
            if elem.tag == add_ns("page"):
                ns = elem.find(add_ns("ns"))
                if ns is not None and ns.text == "0":
                    revision = elem.find(add_ns("revision"))
                    if revision is None:
                        continue
                    text_elem = revision.find(add_ns("text"))
                    if text_elem is None:
                        continue
                    text = text_elem.text
                    if text is None:
                        continue
                    text = wikicorpus.filter_wiki(text)
                    text = text.lower()
                    try:
                        sentences = nlp(text).sentences
                    except Exception:
                        continue
                    article_len = sum([len(sent.words) for sent in sentences])
                    if article_len > int(args.min_tokens_for_article):
                        for sentence in sentences:
                            words = [word.text for word in sentence.words]
                            # Drop heading words, which usually appear as ==heading==
                            words = [word for word in words if "==" not in word]
                            if len(words) > 5:
                                total_tokens += len(words)
                                line = " ".join(words)
                                with open(output_filename, "a+") as outfile:
                                    outfile.write(line + "\n")
                    if total_tokens >= checkpoint:
                        print(f"At {total_tokens} tokens")
                        checkpoint += 100000
                    if total_tokens >= int(args.max_tokens):
                        print("Reached max tokens! We're at {0}.".format(total_tokens))
                        return
    print(f"Finished corpus with {total_tokens} tokens!")
def wiki_replace(d):
    s = d[1]
    s = re.sub(r':*{\|[\s\S]*?\|}', '', s)
    s = re.sub(r'<gallery>[\s\S]*?</gallery>', '', s)
    s = re.sub(r'(.){{([^{}\n]*?\|[^{}\n]*?)}}', r'\1[[\2]]', s)
    s = filter_wiki(s)
    s = re.sub(r'\* *\n|\'{2,}', '', s)
    s = re.sub(r'\n+', '\n', s)
    s = re.sub(r'\n[:;]|\n +', '\n', s)
    s = re.sub(r'\n==', '\n\n==', s)
    return s
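# A sketch of the cleanup above; wiki_replace expects a (title, markup) tuple,
# and the fragment here is illustrative. Inline {{...|...}} templates are first
# rewritten to [[...]] links so filter_wiki keeps their text instead of
# discarding the whole template.
sample = ('Example', "A {{lang|en|greeting}} from '''Earth'''.\n")
print(wiki_replace(sample))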
def extract_text_content(xml_dump):
    # article_count = 0
    with open('wiki.en.txt', 'w') as file:
        for title, content, pageid in tqdm(extract_pages(xml_dump)):
            try:
                file.write(filter_wiki(content).strip() + "\n")
                # article_count += 1
                # if article_count % 10000 == 0:
                #     logging.info(f'{article_count} articles processed')
            except Exception as e:
                logging.warning(str(e))
def get_texts(self):
    length = 0
    for _, _, text in process_data(self.input):
        length += 1
        yield [tok for tok in self.tokenizer(filter_wiki(text)) if tok not in self.stopwords]
        if self.limit and length >= self.limit:
            break
    self.length = length
def iter_wiki(dump_file, n=-1):
    ignore_namespaces = 'Wikipedia Category File Portal Template MediaWiki User Help Book Draft'.split()
    counter = 0
    for title, text, pageid in _extract_pages(smart_open(dump_file)):
        counter += 1
        if counter == n:
            break
        text = filter_wiki(text)
        tokens = tokenize(text)
        if len(tokens) < 50 or any(title.startswith(ns + ':') for ns in ignore_namespaces):
            continue
        yield title, tokens
def segment(page_xml, include_interlinks=False):
    """Parse the content inside a page tag.

    Parameters
    ----------
    page_xml : str
        Content from page tag.
    include_interlinks : bool
        Whether or not interlinks should be parsed.

    Returns
    -------
    (str, list of (str, str), (Optionally) dict of (str: str))
        Structure contains (title, [(section_heading, section_content), ...],
        (Optionally) {interlinks}).

    """
    elem = cElementTree.fromstring(page_xml)
    filter_namespaces = ('0',)
    namespace = get_namespace(elem.tag)
    ns_mapping = {"ns": namespace}
    text_path = "./{%(ns)s}revision/{%(ns)s}text" % ns_mapping
    title_path = "./{%(ns)s}title" % ns_mapping
    ns_path = "./{%(ns)s}ns" % ns_mapping
    lead_section_heading = "Introduction"
    top_level_heading_regex = r"\n==[^=].*[^=]==\n"
    top_level_heading_regex_capture = r"\n==([^=].*[^=])==\n"

    title = elem.find(title_path).text
    text = elem.find(text_path).text
    ns = elem.find(ns_path).text
    if ns not in filter_namespaces:
        text = None

    if text is not None:
        if include_interlinks:
            interlinks = find_interlinks(text)
        section_contents = re.split(top_level_heading_regex, text)
        section_headings = [lead_section_heading] + re.findall(top_level_heading_regex_capture, text)
        section_headings = [heading.strip() for heading in section_headings]
        assert len(section_contents) == len(section_headings)
    else:
        interlinks = []
        section_contents = []
        section_headings = []

    section_contents = [filter_wiki(section_content) for section_content in section_contents]
    sections = list(zip(section_headings, section_contents))

    if include_interlinks:
        return title, sections, interlinks
    else:
        return title, sections
def wiki_replace(d):
    s = d[1]
    s = re.sub(r':*{\|[\s\S]*?\|}', '', s)
    s = re.sub(r'<gallery>[\s\S]*?</gallery>', '', s)
    s = re.sub(r'(.){{([^{}\n]*?\|[^{}\n]*?)}}', r'\1[[\2]]', s)
    s = filter_wiki(s)
    s = re.sub(r'\* *\n|\'{2,}', '', s)
    s = re.sub(r'\n+', '\n', s)
    s = re.sub(r'\n[:;]|\n +', '\n', s)
    s = re.sub(r'\n==', '\n\n==', s)
    # cc = opencc.OpenCC('mix2s')
    # return cc.convert(s).strip()
    return s
def main(args):
    os.makedirs(args.outdir, exist_ok=True)
    filter_namespaces = ('0',)
    out_i = 0
    for in_fname in glob.glob(args.inglob):
        for title, text, pageid in extract_pages(bz2.BZ2File(in_fname),
                                                 filter_namespaces=filter_namespaces):
            if out_i % args.skip == 0:
                with open(os.path.join(args.outdir, f'{pageid}.txt'), 'w') as f:
                    f.write(filter_wiki(text))
            out_i += 1
def _clean(self, d):
    s = d[1]
    s = re.sub(r':*{\|[\s\S]*?\|}', '', s)
    s = re.sub(r'<gallery>[\s\S]*?</gallery>', '', s)
    s = re.sub(r'(.){{([^{}\n]*?\|[^{}\n]*?)}}', '', s)
    s = filter_wiki(s)
    s = re.sub(r'\* *\n|\'{2,}', '', s)
    s = re.sub(r'\n+', '\n', s)
    s = re.sub(r'\n[:;]|\n +', '\n', s)
    s = re.sub(r'(==+)', '\n', s)
    if self.s2t is True:
        return cc.convert(d[0]).strip(), cc.convert(s).strip()
    else:
        return d[0].strip(), s.strip()
def process_article(args): """Parse a Wikipedia article, returning its content as a list of sentences (each a list of utf8-encoded token strings). """ text, do_lemmatize, title, pageid = args text = filter_wiki(text) process_fn = utils.lemmatize if do_lemmatize else tokenize sentences = [] for sentence in re.split(SENTENCE_BOUNDARY, text): if not sentence: continue sentences.append(process_fn(sentence)) return sentences, title, pageid
def main():
    parser = ArgumentParser()
    parser.add_argument('-e', '--encoding')
    parser.add_argument('-t', '--tokenization', default='none')
    args = parser.parse_args()
    encoding = args.encoding
    tokenization = args.tokenization
    if encoding:
        sys.stdout = codecs.getwriter(encoding)(sys.stdout)
        sys.stdin = codecs.getreader(encoding)(sys.stdin)
    for text in sys.stdin:
        text = filter_wiki(text).strip()
        if tokenization != 'none':
            if tokenization == 'alpha':
                tokens = tokenize(text, tok_type='alpha', lower=False)
            elif tokenization == 'alpha-lower':
                tokens = tokenize(text, tok_type='alpha', lower=True)
            elif tokenization == 'all':
                tokens = tokenize(text, tok_type='all', lower=False, norm_num=True)
            elif tokenization == 'all-lower':
                tokens = tokenize(text, tok_type='all', lower=True, norm_num=True)
            elif tokenization == 'all-num':
                tokens = tokenize(text, tok_type='all', lower=False, norm_num=False)
            elif tokenization == 'all-lower-num':
                tokens = tokenize(text, tok_type='all', lower=True, norm_num=False)
            elif tokenization == 'nopunct':
                tokens = tokenize(text, tok_type='nopunct', lower=False, norm_num=True)
            elif tokenization == 'nopunct-lower':
                tokens = tokenize(text, tok_type='nopunct', lower=True, norm_num=True)
            elif tokenization == 'nopunct-num':
                tokens = tokenize(text, tok_type='nopunct', lower=False, norm_num=False)
            elif tokenization == 'nopunct-lower-num':
                tokens = tokenize(text, tok_type='nopunct', lower=True, norm_num=False)
            else:
                raise NotImplementedError
            text = ' '.join(tokens)
        if PAT_ALPHABETIC.match(text):
            sys.stdout.write(text + '\n')
def prepare_data(filename, destname):
    pages = wc.extract_pages(bz2.BZ2File(filename), ('0',))
    corpus = []
    x = []
    y = []
    count = 0
    for p in pages:
        text = wc.filter_wiki(p[1])
        tokens = [token.encode('utf8') for token in utils.tokenize(text, errors='ignore')
                  if len(token) <= 15 and not token.startswith('_')]
        if len(tokens) >= 50:
            # Split the article into chunks of at most MAX_CHAR_LENGTH characters
            length = 0
            old_i = 0
            for i, token in enumerate(tokens):
                length += len(token)
                if length > MAX_CHAR_LENGTH:
                    corpus.append(tokens[old_i:i])
                    length = len(token)
                    old_i = i
                if i == len(tokens) - 1:
                    corpus.append(tokens[i:])
    count = 0
    for sent in corpus:
        count += 1
        if count >= 100000:
            break
        sent_y = []
        sent_x = []
        for token in sent:
            if all([65 <= c <= 90 or 97 <= c <= 122 for c in token]):
                # Mark the last character of each token as a word boundary
                sent_y.extend([False] * (len(token) - 1) + [True])
                # Map A-Z to 1-26 and a-z to 27-52
                sent_x.extend([c - 64 if c <= 90 else c - 70 for c in token])
        sent_y.extend([False] * (MAX_CHAR_LENGTH - len(sent_x)))
        sent_x.extend([0] * (MAX_CHAR_LENGTH - len(sent_x)))
        y.append(sent_y)
        x.append(sent_x)
        if len(sent_x) != MAX_CHAR_LENGTH:
            print(len(sent_x))
    x = np.array(x)
    y = np.array(y)
    pickle.dump(x, open(os.path.abspath(destname + '_x'), 'wb'))
    pickle.dump(y, open(os.path.abspath(destname + '_y'), 'wb'))
def segment(page_xml): """Parse the content inside a page tag Parameters ---------- page_xml : str Content from page tag. Returns ------- (str, list of (str, str)) Structure contains (title, [(section_heading, section_content)]). """ elem = cElementTree.fromstring(page_xml) filter_namespaces = ('0',) namespace = get_namespace(elem.tag) ns_mapping = {"ns": namespace} text_path = "./{%(ns)s}revision/{%(ns)s}text" % ns_mapping title_path = "./{%(ns)s}title" % ns_mapping ns_path = "./{%(ns)s}ns" % ns_mapping lead_section_heading = "Introduction" top_level_heading_regex = r"\n==[^=].*[^=]==\n" top_level_heading_regex_capture = r"\n==([^=].*[^=])==\n" title = elem.find(title_path).text text = elem.find(text_path).text ns = elem.find(ns_path).text if ns not in filter_namespaces: text = None if text is not None: section_contents = re.split(top_level_heading_regex, text) section_headings = [lead_section_heading] + re.findall(top_level_heading_regex_capture, text) assert len(section_contents) == len(section_headings) else: section_contents = [] section_headings = [] section_contents = [filter_wiki(section_content) for section_content in section_contents] sections = list(zip(section_headings, section_contents)) return title, sections
def extract_first_sentence(text):
    # Extract section 0 (the article lead) and return its first sentence
    summary_content = text.encode('UTF-8').decode('UTF-8')
    top_level_heading_regex = r"\n==[^=].*[^=]==\n"
    summary_content = re.split(top_level_heading_regex, summary_content)[0]
    summary_content = filter_wiki(summary_content)
    summary_content = re.sub(r"'''", "", summary_content)
    summary_content = re.sub(r'\\n', ' ', summary_content)
    summary_content = re.sub(r'\\', '', summary_content)
    summary_content = re.sub(r"(\(.*?\))", "", summary_content)
    summary_content = summary_content.strip()
    sents = sent_tokenize(summary_content)
    if len(sents) < 1:
        return ""
    first = sents[0]
    if sents[0] == "." or sents[0] == "" or sents[0].startswith("See also"):
        if len(sents) > 1:
            first = sents[1]
        else:
            return ""
    return first.strip()
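# A sketch of the helper above, assuming nltk's punkt models are available for
# sent_tokenize; the markup is illustrative. Note that parentheticals such as
# the dates are stripped by the (\(.*?\)) substitution.
lead = extract_first_sentence(
    "'''Ada Lovelace''' (1815-1852) was an English mathematician.\n"
    "She worked on the [[Analytical Engine]].\n"
    "==Life==\nEarly years.")
print(lead)  # roughly: "Ada Lovelace was an English mathematician."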
def main(param_file=None):
    # setup
    p, base_path, output_dir = tools.setup(param_file)
    logger = tools.get_logger('gensim', os.path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))

    # initializations
    articles = {}
    all_missing = []
    redir_on = {}
    collisions = {}
    non_ascii = []
    site = mwclient.Site('en.wikipedia.org', '/w/api.php/')

    # get all txt files in a folder and iterate over them
    filelist = glob.glob(os.path.join(base_path, p['folder_path'], "*.txt"))
    for f in filelist:
        # get the word we are working on
        f_name = os.path.basename(f)
        k_word = os.path.splitext(f_name)[0]
        logger.info("working on file: %s" % f_name)

        # try to convert the word into ascii for the http query
        file_obj = codecs.open(f, "r", "utf-16")
        counter = 0
        words = []
        for w in file_obj.readlines():
            try:
                s = w.strip().decode('ascii')
                words.append(s)
            except Exception:
                counter += 1
                non_ascii.append(w.strip())
        logger.info("\t%d words containing non ascii are omitted" % counter)

        articles[k_word] = {}
        logger.info("\tfound %d words in file" % len(words))
        for word in words:
            data = {}
            page = site.Pages[word]

            # follow the redirect and check for collisions
            if page.redirect:
                res = re.search(r'\[\[(.+)\]\]', page.edit())
                redir_word = urllib.unquote(res.groups()[0])
                if redir_word in redir_on:
                    logger.warning("[%s AND %s] both redirect on --> %s" %
                                   (word, redir_on[redir_word], redir_word))
                    collisions[redir_word] = redir_on[redir_word]
                else:
                    logger.info("[%s] redir from [%s]" % (redir_word, word))
                    redir_on[redir_word] = word
                text = site.Pages[redir_word].edit()
                data['redirected'] = redir_word
            else:
                text = page.edit()

            # check for missing wikipedia articles
            if text == "":
                all_missing.append(word)
                continue

            # preprocess the received article
            data['text'] = wikicorpus.filter_wiki(text)
            in_ascii = ud.normalize('NFKD', data['text']).encode('ascii', 'ignore')
            data['text'] = preprocess_string(in_ascii)
            articles[k_word][word] = data

    logger.info('add human rating to the articles')
    id_word = {}
    sparql_path = os.path.join(base_path, p['sparql_path'])
    with open(os.path.join(sparql_path, 'id_word.txt')) as f:
        for line in f.readlines():
            idx, word = line.strip().split('\t')
            id_word[idx] = word

    # add human rating to the wikipedia data
    not_found = []
    with open(os.path.join(sparql_path, p['human_file'])) as f:
        for line in f.readlines():
            arr = line.split()
            word = id_word[arr[0]]
            term = arr[3]
            try:
                articles[word][term]['rating'] = int(arr[4])
            except KeyError:
                not_found.append(term)
    logger.info("%d words from the ref queries not found" % len(not_found))

    f = open(os.path.join(output_dir, "articles.pickle"), 'wb')
    pickle.dump(articles, f)
    f.close()

    info = {}
    info['missing'] = all_missing
    info['redirs'] = redir_on
    info['collisions'] = collisions
    info['not_found'] = not_found
    info['non_ascii'] = non_ascii
    f = open(os.path.join(output_dir, "info.pickle"), 'wb')
    pickle.dump(info, f)
    f.close()

    logger.info("%d redirecting collisions (see info.pkl)" % len(collisions))
def _get_plaintext(content):
    return CRUFT_RE.sub(
        r'', NEWLINE_RE.sub(
            r'\n', HEADER_RE.sub(
                r'\1', QUOTE_RE.sub(
                    r'', filter_wiki(content))))).strip()
def tokenize(text):
    return [token for token in simple_preprocess(filter_wiki(text)) if token not in UTF8STP]
def _clean_content(self, content):
    return WIKI_CRUFT_RE.sub(
        r'', WIKI_NEWLINE_RE.sub(
            r'\n', WIKI_HEADER_RE.sub(
                r'\1', WIKI_QUOTE_RE.sub(
                    r'', filter_wiki(content))))).strip()