from gensim import corpora
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words

# Settings and StringUtils are project-local helpers defined elsewhere in the repo.


def clear_documents(docs):
    tokenizer = RegexpTokenizer(r'\w+')

    # create English stop words list
    en_stop = get_stop_words('en')

    # create p_stemmer of class PorterStemmer
    p_stemmer = PorterStemmer()

    # with open(f"{Settings.DIRECTORY}/data/words/{Settings.PROJECT_NAME}_{Settings.ID}", 'w') as f:
    #     for d in docs:
    #         f.write(d + "\n")
    #     f.write("\n")

    # clean each document (StringUtils.clear_text strips Java stop words)
    # and log its text before and after processing
    docs_content = []
    for doc in docs:
        directory = f"{Settings.DIRECTORY}/data/words/{Settings.PROJECT_NAME}/{doc[0]}"
        with open(directory, "w+") as f:
            f.write(f"{doc[0]}\n")
            f.write("Before processing:\n")
            f.write(f"{doc[1]}\n")

        doc_content = StringUtils.clear_text(doc[1])
        docs_content.append(doc_content)

        with open(directory, "a+") as f:
            f.write("\nAfter processing:\n")
            f.write(f"{doc_content}\n")

    # compile the cleaned documents into a list
    doc_set = docs_content

    # list for tokenized documents
    texts = []

    # loop through the document list
    for text in doc_set:
        # clean and tokenize the document string
        raw = text.lower()
        tokens = tokenizer.tokenize(raw)

        # remove stop words from the tokens
        stopped_tokens = [t for t in tokens if t not in en_stop]

        # stem the remaining tokens
        stemmed_tokens = [p_stemmer.stem(st) for st in stopped_tokens]

        # add tokens to the list
        texts.append(stemmed_tokens)

    # turn the tokenized documents into an id <-> term dictionary
    dictionary = corpora.Dictionary(texts)

    # filter out rare and overly common terms
    dictionary.filter_extremes(no_below=3, no_above=0.75, keep_n=1000)

    # convert the tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts]

    return texts, corpus, dictionary
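
# Illustrative sketch (not part of the original module): one way the texts,
# corpus, and dictionary returned above might be consumed downstream, assuming
# gensim's LdaModel is the intended topic model. The topic and pass counts are
# placeholder values, docs is the same list of (document_name, document_text)
# pairs clear_documents expects, and Settings.DIRECTORY must point at an
# existing writable path because clear_documents logs each document to disk.
def build_topic_model_example(docs, num_topics=10):
    from gensim.models import LdaModel

    texts, corpus, dictionary = clear_documents(docs)

    # train an LDA model on the bag-of-words corpus built by clear_documents
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, passes=5)
    return lda, texts, corpus, dictionary
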
def apply_tfidf_to_pair(self, source, target):
    # clean both texts, then score their TF-IDF cosine similarity
    source = StringUtils.clear_text(source)
    target = StringUtils.clear_text(target)
    return self.cosine_sim(source, target)
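
# self.cosine_sim is not shown in this section; the standalone sketch below is
# a hypothetical stand-in illustrating one common implementation: TF-IDF
# weighted cosine similarity via scikit-learn (an assumed dependency here).
def cosine_sim_example(source, target):
    from sklearn.feature_extraction.text import TfidfVectorizer

    # fit TF-IDF on just this pair of texts and compare the two vectors;
    # TfidfVectorizer L2-normalises each row, so the dot product of the two
    # rows is their cosine similarity
    tfidf = TfidfVectorizer().fit_transform([source, target])
    return (tfidf * tfidf.T).toarray()[0, 1]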