def parse_page(title, text, vocab, inlinks, article_min_words=1, ignore_namespace=True):
    filtered_text = filter_wiki(text, promote_remaining=False, simplify_links=False)
    tok_num = 0
    # Presave the interlinks in this page (for DCA training)
    founded_links = []
    # The end of the previous link
    prev = 0
    for match in RE_P16.finditer(filtered_text):
        # Tokenize the text between the previous link and the current link
        link_start, link_end = match.span()
        if prev < link_start:
            toks = tokenize(filtered_text[prev:link_start])
            tok_num += len(toks)
        prev = link_end
        # Tokenize the current link
        parts = match.groups()[0].split('|')
        entity = parts[0]
        if '[' not in entity and ']' not in entity:
            try:
                interlink_text = parts[1]
            except IndexError:
                interlink_text = entity
            if interlink_text:
                link_toks = tokenize(interlink_text)
            else:
                link_toks = []
            tok_num += len(link_toks)
            entity = entity.replace(' ', '_')
            if entity in vocab:
                founded_links.append(entity)
    # Tokenize the remaining text
    toks = tokenize(filtered_text[prev:])
    tok_num += len(toks)
    # Filter out short articles and ignored namespaces
    if tok_num < article_min_words:
        return False
    if ignore_namespace:
        if any(title.startswith(name + ':') for name in IGNORED_NAMESPACES):
            return False
    # Update inlinks
    for entity in founded_links:
        if entity not in inlinks:
            inlinks[entity] = set()
        inlinks[entity].add(title)
    return True
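# A minimal usage sketch (not from the original project): stream pages out of a
# Wikipedia XML dump with gensim's extract_pages and feed them to parse_page
# above to accumulate the in-link map. The dump path and the `vocab` set of
# known entity titles are placeholders.
import bz2
from gensim.corpora.wikicorpus import extract_pages

def build_inlinks(dump_path, vocab):
    inlinks = {}  # entity title -> set of page titles that link to it
    with bz2.BZ2File(dump_path) as dump:
        for title, text, pageid in extract_pages(dump, filter_namespaces=('0',)):
            parse_page(title, text, vocab, inlinks)
    return inlinks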
def build_glove(word2vec, target_files, output_path):
    word2vec1 = KeyedVectors(vector_size=300)
    print(word2vec1.vectors.shape, (len(word2vec1.vocab), word2vec1.vector_size))
    buf1 = []
    buf2 = []
    contains = set()

    def add_buffer(w, f):
        nonlocal buf1, buf2
        if w not in contains:
            buf1.append(w)
            buf2.append(f)
            contains.add(w)

    def clear_buffer():
        nonlocal buf1, buf2
        buf1 = []
        buf2 = []

    for f in target_files:
        for i, s in enumerate(load_json(f), 1):
            sentence = s['description']
            for w in tokenize(sentence):
                w = w.lower()
                if w in word2vec:
                    add_buffer(w, word2vec[w])
            # Flush the buffer periodically
            if i % 10 == 0 and len(buf1) > 0:
                word2vec1.add(buf1, buf2, replace=False)
                clear_buffer()
    if len(buf1) > 0:
        word2vec1.add(buf1, buf2, replace=False)
    print(word2vec1.vectors.shape, (len(word2vec1.vocab), word2vec1.vector_size))
    KeyedVectors.save_word2vec_format(word2vec1, output_path, binary=True)
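# Hypothetical call to build_glove above, assuming gensim 3.x (where
# KeyedVectors.add and .vocab exist), a source embedding in word2vec text
# format, and JSON files whose records carry a 'description' field. All file
# names are placeholders.
from gensim.models import KeyedVectors

source = KeyedVectors.load_word2vec_format('glove.840B.300d.w2v.txt', binary=False)
build_glove(source, ['train.json', 'valid.json'], 'glove.filtered.bin')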
def Pdf2Vec(titles):
    '''
    Vectorizes PDFs on your local filesystem into Log Entropy TF-IDF vectors
    that can then be queried against your similarity index.

    Returns:
        [document-logent-vec-1, document-logent-vec-2, ..., document-logent-vec-N]
        where N is the number of titles
    '''
    # TODO: Make it so you can give a model as an argument to vectorize a given
    # document into any trained gensim model
    ret_lst = []
    logent = LogEntropyModel.load('../models/logEntropy.model')
    diction = Dictionary.load('../models/wiki_dict.dict')
    for title in titles:
        # slate expects the PDF opened in binary mode
        curr_file = open('../data/articleData/pdfs/' + title + '.pdf', 'rb')
        doc = slate.PDF(curr_file)
        doc = ' '.join(doc)
        doc_tokens = wikicorpus.tokenize(doc)
        bow = diction.doc2bow(doc_tokens)
        bow_logent = logent[bow]
        ret_lst.append(bow_logent)
        curr_file.close()
    return ret_lst
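# Possible follow-up (an assumption, not part of the original project): query
# the log-entropy vectors returned by Pdf2Vec against a prebuilt gensim
# similarity index. The index path and document title are placeholders.
from gensim import similarities

index = similarities.MatrixSimilarity.load('../models/wiki_index.index')
for title, vec in zip(['some_paper'], Pdf2Vec(['some_paper'])):
    sims = index[vec]
    print(title, sorted(enumerate(sims), key=lambda x: -x[1])[:5])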
def _process_page(page):
    assert len(page['section_titles']) == len(page['section_texts'])
    assert len(page.keys()) == 3
    rows = [page['title']]
    for section_title, section_text in zip(page['section_titles'], page['section_texts']):
        rows.append(section_title)
        rows.append(section_text)
    page_tok = tokenize('\n'.join(rows))
    return ' '.join(page_tok) + '\n'
def download_articles(article_names):
    with open(ARTICLES_FILE, 'w') as articles_file:
        with open(INDEX_FILE, 'w') as index_file:
            for article_name in article_names:
                print("Processing {0}".format(article_name))
                try:
                    text = util.get_article_text(article_name)
                    tokenized_article = tokenize(text)
                    tokenized_article = [w for w in tokenized_article if w not in util.STOP_WORDS]
                    for token in tokenized_article:
                        articles_file.write("{0} ".format(token))
                    articles_file.write("\n")
                    index_file.write("{0}\n".format(article_name))
                except util.ArticleNotFoundError:
                    print("Could not find: {0}".format(article_name))
                except Exception:
                    print("Error getting article")
def _article_tfidf(lang, article_title):
    text = _article_text(lang, article_title)
    if text is None:
        return None
    else:
        return dict(_tfidf[_dict.doc2bow(tokenize(filter_wiki(text)))])
def tfidf_similarity_query(title, content):
    tokens = wikicorpus.tokenize(wikicorpus.filter_wiki(content))
    vector = dict(tfidf[dictionary.doc2bow(tokens)])
    return cosine_similarity(seed_article, vector)
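# The cosine_similarity used above operates on sparse {term_id: weight} dicts;
# a minimal sketch of such a helper (an assumption about the surrounding
# module, not code taken from it):
import math

def cosine_similarity(a, b):
    dot = sum(w * b.get(i, 0.0) for i, w in a.items())
    norm_a = math.sqrt(sum(w * w for w in a.values()))
    norm_b = math.sqrt(sum(w * w for w in b.values()))
    return dot / (norm_a * norm_b) if norm_a and norm_b else 0.0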
def word_portion_query(title, content):
    words = [word.decode('utf-8') for word in wikicorpus.tokenize(wikicorpus.filter_wiki(content))]
    words_in_keywords = sum(word in keywords for word in words)
    return safe_ratio(words_in_keywords, len(words))
def keyword_portion_query(title, content):
    word_set = {word.decode('utf-8') for word in wikicorpus.tokenize(wikicorpus.filter_wiki(content))}
    keywords_in_words = sum(keyword in word_set for keyword in keywords)
    return safe_ratio(keywords_in_words, len(keywords))
def _prepare_description(cls, game):
    title = game['title']
    description = game['description']['full']
    whats_cool = game['description']['whats_cool_about_it']
    text = f'{title} {description} {whats_cool}'
    return tokenize(cls._cleanhtml(text))
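# _cleanhtml above is a class helper that is not shown here; a plausible
# stand-in (an assumption, not the project's code) simply strips HTML tags
# before the text is tokenized:
import re

def _cleanhtml(raw_html):
    # Replace any HTML tag with a space so adjacent words stay separated.
    return re.sub(r'<[^>]+>', ' ', raw_html)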
def parse_page(title, text, vocab, page_entity_word_co_occur, context_entity_word_co_occur,
               word_count, entity_window_size=20, article_min_words=1, ignore_namespace=True):
    filtered_text = filter_wiki(text, promote_remaining=False, simplify_links=False)
    tok_num = 0
    # Collect words in this page
    batch_words = []
    # Presave the index of entities in this page (for entity pre-training)
    founded_entity_idx = []
    # The end of the previous link
    prev = 0
    for match in RE_P16.finditer(filtered_text):
        # Tokenize the text between the previous link and the current link
        link_start, link_end = match.span()
        if prev < link_start:
            toks = tokenize(filtered_text[prev:link_start], lower=False)
            for tok in toks:
                batch_words.append([tok])
            tok_num += len(toks)
        prev = link_end
        # Tokenize the current link
        parts = match.groups()[0].split('|')
        entity = parts[0]
        if '[' not in entity and ']' not in entity:
            try:
                interlink_text = parts[1]
            except IndexError:
                interlink_text = entity
            if interlink_text:
                link_toks = tokenize(interlink_text, lower=False)
            else:
                link_toks = []
            batch_words.append(link_toks)
            tok_num += len(link_toks)
            entity = entity.replace(' ', '_')
            if entity in vocab:
                founded_entity_idx.append([len(batch_words) - 1, entity])
    # Tokenize the remaining text
    toks = tokenize(filtered_text[prev:], lower=False)
    for tok in toks:
        batch_words.append([tok])
    tok_num += len(toks)
    # Filter out short articles and ignored namespaces
    if tok_num < article_min_words:
        return False
    if ignore_namespace:
        if any(title.startswith(name + ':') for name in IGNORED_NAMESPACES):
            return False
    # Start counting
    if title in vocab:
        if title not in page_entity_word_co_occur:
            page_entity_word_co_occur[title] = set()
        we = page_entity_word_co_occur[title]
    else:
        we = None
    for words in batch_words:
        for word in words:
            # Count occurrence #(w,)
            word_count[word] = word_count.get(word, 0) + 1
            if we is not None:
                # Count co-occurrence #(w,e) according to rule (i)
                we.add(word)
    half_entity_window_size = entity_window_size // 2
    for idx, entity in founded_entity_idx:
        if entity not in context_entity_word_co_occur:
            context_entity_word_co_occur[entity] = set()
        we = context_entity_word_co_occur[entity]
        # Look toward the left
        for j in range(max(idx - half_entity_window_size, 0), idx):
            for word in batch_words[j]:
                # Count #(w,e) according to rule (ii)
                we.add(word)
        # Look toward the right
        for j in range(idx + 1, min(idx + half_entity_window_size, len(batch_words))):
            for word in batch_words[j]:
                # Count #(w,e) according to rule (ii)
                we.add(word)
    return True
def normalizeText(self, text):
    words = tokenize(filter_wiki(text.lower()))
    text = " ".join(words)
    return text
def tokenize_gensim(text):
    return tokenize(text)
def _get_query(self, text):
    """Preprocess and tokenize text, returning it as a BOW (bag of words)."""
    return self.dictionary.doc2bow(
        wikicorpus.tokenize(wikicorpus.filter_wiki(text)))
def get_tokens(text, token_min_len=1, token_max_len=100):
    text = filter_wiki(text)
    return tokenize(text, token_min_len, token_max_len, True)
def tokenize_with_stemming(*params, **kwparams):
    tokens = tokenize(*params, **kwparams)
    tokens = cut_off_references(tokens)
    tokens = [stemmer.stem(token) for token in tokens]
    return tokens
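# tokenize_with_stemming relies on a module-level `stemmer` and a
# `cut_off_references` helper defined elsewhere in its module. A plausible
# setup, given purely as an assumption, could use gensim's Porter stemmer and
# trim everything from the first "references" token onward:
from gensim.parsing.porter import PorterStemmer

stemmer = PorterStemmer()

def cut_off_references(tokens):
    # Drop the trailing reference section, if one is detected.
    return tokens[:tokens.index('references')] if 'references' in tokens else tokens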