def preclean_xlsxParagraph(text_list):
    """Clean and tokenize a list of text entries.

    Parameters
    ----------
    text_list : [str]
        Raw text entries (e.g. paragraphs read from an xlsx file).

    Returns
    -------
    [str]
        One space-joined string of cleaned tokens per input entry.
    """
    preprocessor = TextPreprocessor()
    return [
        ' '.join(preprocessor.tokenize_text(preprocessor.clean_sentence(entry)))
        for entry in text_list
    ]
def extract_dictionary(document_paths, max_words):
    """
    Extracts a gensim Dictionary object from a set of documents.

    Parameters
    ----------
    document_paths : [str]
        List of document paths that make up the corpus.
    max_words : int
        Maximum vocabulary size; whenever the dictionary grows past this
        budget it is filtered down to roughly 90% of it.

    Returns
    -------
    dictionary : gensim.corpora.Dictionary
        Extracted dictionary (or tokenizer).
    """
    print("Extracting dictionary from corpus")
    # prune_at=None disables gensim's automatic pruning; we prune manually
    # below so the keep_n budget is under our control.
    dictionary = Dictionary(prune_at=None)
    preprocessor = TextPreprocessor()
    for document_path in tqdm(document_paths):
        with open(document_path, "r") as f:
            document = f.read()
        document = preprocessor.clean_sentence(document, alphabetic_only=True)
        words = preprocessor.tokenize_text(document)
        dictionary.add_documents([words])
        # Periodically prune to keep memory bounded while streaming the corpus.
        if len(dictionary) > max_words:
            start = time()
            dictionary.filter_extremes(no_below=10,
                                       no_above=0.5,
                                       keep_n=int(max_words * 0.9))
            print("Dictionary filtered in {} seconds".format(time() - start))
    return dictionary
class BM25Engine(SearchEngine):
    """BM25-ranked search engine over a bag-of-words corpus."""

    # Path templates for the persisted artifacts of a processed dataset.
    tokenizer_file = "data/processed/{id}/tokenizer"
    corpus_file = "data/processed/{id}/corpus"
    doc_idxs_file = "data/processed/{id}/doc_idxs.json"

    def __init__(self, tokenizer, corpus, idxs2id):
        """
        Parameters
        ----------
        tokenizer : gensim.corpora.Dictionary
            Word tokenizer.
        corpus : gensim.corpora.mmcorpus.MmCorpus
            Bag-of-words formatted corpus of documents.
        idxs2id : dict
            Maps a corpus row index (as a str key) to an external
            document id.
        """
        self.preprocessor = TextPreprocessor()
        self.tokenizer = tokenizer
        self.corpus = corpus
        self.internal_engine = BM25(self.corpus)
        self.idxs2id = idxs2id
        print("BM25 engine loaded")

    def top_k_matches(self, query, k):
        """Return the ids of the k documents best matching `query`,
        ranked by descending BM25 score."""
        clean = self.preprocessor.clean_sentence(query, alphabetic_only=True)
        word_list = self.preprocessor.tokenize_text(clean)
        bow_representation = self.tokenizer.doc2bow(word_list)
        scores = self.internal_engine.get_scores(bow_representation)
        # argsort is ascending; reverse for highest-score-first.
        top_k_idxs = np.argsort(scores)[::-1][:k]
        return [self.idxs2id[str(idx)] for idx in top_k_idxs]

    # BUG FIX: `load` takes no `self`, so without @staticmethod it breaks
    # when called on an instance. The parameter name `id` (shadowing the
    # builtin) is kept for backward compatibility with keyword callers.
    @staticmethod
    def load(id):
        """Load a persisted BM25Engine for dataset `id` from disk."""
        tokenizer = Dictionary.load(BM25Engine.tokenizer_file.format(id=id))
        corpus = MmCorpus(BM25Engine.corpus_file.format(id=id))
        with open(BM25Engine.doc_idxs_file.format(id=id), "r") as f:
            idxs2id = json.load(f)
        return BM25Engine(tokenizer, corpus, idxs2id)
class BoWHighlighter(SegmentHighlighter):
    """Highlights the paragraphs of a document whose bag-of-words cosine
    similarity with the query exceeds a precision threshold."""

    def __init__(self):
        self.preprocessor = TextPreprocessor()
        self.counter = CountVectorizer()

    def highlight(self, document, query, precision):
        """Return (highlights, scores): the paragraphs of `document`
        whose cosine similarity with `query` is above `precision`,
        together with their similarity scores.

        Parameters
        ----------
        document : str
            Raw document text; paragraphs are assumed to be separated
            by blank-line gaps.
        query : str
            Free-text query.
        precision : float
            Similarity threshold in [0, 1]; only paragraphs scoring
            strictly above it are returned.
        """
        clean_query = self.preprocessor.clean_sentence(query)
        word_list = self.preprocessor.tokenize_text(clean_query)
        # Re-join tokens so query and paragraphs share one text format.
        query_mod = ' '.join(word_list)

        # Normalize whitespace; blank-line gaps become the " \n \n "
        # paragraph separator split on below. Raw strings fix the invalid
        # escape sequences of the original patterns.
        document = re.sub(r" +", " ", document)
        document = re.sub(r"\n\s+", " \n \n ", document)
        document = re.sub(r"\n+", "\n", document)
        document = document.strip()

        highlights = []
        scores = []
        for paragraph in document.split(" \n \n "):
            if len(paragraph) == 0:
                continue
            clean_paragraph = self.preprocessor.clean_sentence(paragraph)
            corpus = self.preprocessor.tokenize_text(clean_paragraph)
            paragraph_mod = ' '.join(corpus)
            vectorizer = self.counter.fit([query_mod, paragraph_mod])
            vectors = [
                vec for vec in vectorizer.transform(
                    [query_mod, paragraph_mod]).toarray()
            ]
            norm_vec_query = np.linalg.norm(vectors[0])
            norm_vec_paragraph = np.linalg.norm(vectors[1])
            # Skip zero vectors: avoids a division-by-zero / nan score
            # (the original only guarded the paragraph norm; a nan never
            # exceeded the threshold anyway, so the result is unchanged).
            if norm_vec_query == 0 or norm_vec_paragraph == 0:
                continue
            cosine_similarity = np.dot(vectors[0], vectors[1]) / (
                norm_vec_query * norm_vec_paragraph)
            if cosine_similarity > precision:
                print("The cosine similary is: ", cosine_similarity,
                      paragraph, "\n \n")
                highlights.append(paragraph)
                scores.append(cosine_similarity)
        return highlights, scores

    # BUG FIX: decorated as @staticmethod so it also works when invoked
    # on an instance; matches the `load(id)` factory interface of the
    # other engines.
    @staticmethod
    def load(id):
        """Factory: `id` is accepted for interface parity but unused."""
        return BoWHighlighter()
def preclean_entireDoc(text):
    """Clean an entire document and return its tokens as one
    space-separated string.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        Cleaned, tokenized text re-joined with single spaces.
    """
    preprocessor = TextPreprocessor()
    cleaned = preprocessor.clean_sentence(text)
    tokens = preprocessor.tokenize_text(cleaned)
    return ' '.join(tokens)
class IterativeCorpusBuilder():
    """Iterator that streams bag-of-words vectors, one per document,
    building its own dictionary (tokenizer) from the corpus first."""

    def __init__(self, document_paths, max_words):
        """
        Parameters
        ----------
        document_paths : [str]
            Paths of the documents forming the corpus.
        max_words : int
            Maximum vocabulary size for the extracted dictionary.
        """
        self.tokenizer = IterativeCorpusBuilder.extract_dictionary(
            document_paths, max_words=max_words)
        self.document_paths = iter(document_paths)
        self.preprocessor = TextPreprocessor()
        self.clock = time()
        self.iterations = 0
        # Progress is printed every `inform_frequency` documents.
        self.inform_frequency = 1000
        # BUG FIX: was hard-coded to 2 * 10**6, silently ignoring the
        # `max_words` constructor argument.
        self.max_words = max_words

    def __next__(self):
        """Read the next document and return its bag-of-words vector.

        Raises StopIteration (propagated from the exhausted path
        iterator) when all documents have been consumed.
        """
        with open(next(self.document_paths), "r") as f:
            document = f.read()
        document = self.preprocessor.clean_sentence(document,
                                                    alphabetic_only=True)
        words = self.preprocessor.tokenize_text(document)
        bow_representation = self.tokenizer.doc2bow(words)
        # Inform progress as specified
        self.iterations += 1
        if self.iterations % self.inform_frequency == 0:
            print("{} iterations took {} seconds. {} done.".format(
                self.inform_frequency, time() - self.clock, self.iterations))
            self.clock = time()
        return bow_representation

    def __iter__(self):
        print("Building corpus term-document matrices")
        return self

    # @staticmethod added: the function takes no `self`/`cls` and is
    # called via the class in __init__; the decorator also makes it safe
    # to call on an instance.
    @staticmethod
    def extract_dictionary(document_paths, max_words):
        """
        Extracts a gensim Dictionary object from a set of documents.

        Parameters
        ----------
        document_paths : [str]
            List of document paths that make up the corpus.
        max_words : int
            Maximum vocabulary size; whenever the dictionary grows past
            this budget it is filtered down to roughly 90% of it.

        Returns
        -------
        dictionary : gensim.corpora.Dictionary
            Extracted dictionary (or tokenizer).
        """
        print("Extracting dictionary from corpus")
        # prune_at=None disables gensim's automatic pruning; pruning is
        # done manually below so the keep_n budget stays under our control.
        dictionary = Dictionary(prune_at=None)
        preprocessor = TextPreprocessor()
        for document_path in tqdm(document_paths):
            with open(document_path, "r") as f:
                document = f.read()
            document = preprocessor.clean_sentence(document,
                                                   alphabetic_only=True)
            words = preprocessor.tokenize_text(document)
            dictionary.add_documents([words])
            # Periodically prune to keep memory bounded while streaming.
            if len(dictionary) > max_words:
                start = time()
                dictionary.filter_extremes(no_below=10,
                                           no_above=0.5,
                                           keep_n=int(max_words * 0.9))
                print("Dictionary filtered in {} seconds".format(
                    time() - start))
        return dictionary