# Imports assumed by this method (PorterStemmer is taken to be nltk's;
# WordEmbeddings is defined elsewhere in this project):
from gensim import corpora, models
from nltk.stem import PorterStemmer


def __init__(self, core_term_path, pretrain=True, update=True, fasttext_corpus_path=None):
    # Build the sorted, de-duplicated list of stemmed core terms.
    p = PorterStemmer()
    with open(core_term_path, 'r') as f:
        self.core_terms = list(set(
            p.stem(word.strip()) for word in f.readlines() if len(word.strip()) > 0
        ))
    self.core_terms.sort()

    self.word_embeddings = WordEmbeddings(pretrain, update, fasttext_corpus_path)

    # Map each core term to an integer id; ids 0 and 1 appear to be reserved.
    self.core_term_dict = {}
    index = 2
    for core_term in self.core_terms:
        self.core_term_dict[core_term] = index
        index += 1

    # Note: fasttext_corpus_path defaults to None but is required below;
    # passing None will raise a TypeError on open().
    with open(fasttext_corpus_path, 'r') as f:
        fasttext_corpus_content = f.readlines()
    # Every other non-empty line of the corpus file holds one document's tokens.
    documents = [
        line.strip().split()
        for idx, line in enumerate(fasttext_corpus_content)
        if idx % 2 == 0 and len(line.strip()) > 0
    ]
    dictionary = corpora.Dictionary(documents)
    corpus = [dictionary.doc2bow(doc) for doc in documents]
    tfidf_model = models.TfidfModel(corpus)
    # Map each token string to its inverse document frequency.
    self.idfs = {dictionary[term_id]: idf for term_id, idf in tfidf_model.idfs.items()}
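# Minimal sketch of the idf-extraction step above on a toy corpus, using the
# same gensim calls; `toy_docs` and its contents are illustrative, not from
# the original code.
from gensim import corpora, models

toy_docs = [["sort", "array", "quick"], ["parse", "json", "string"], ["sort", "list"]]
toy_dictionary = corpora.Dictionary(toy_docs)
toy_corpus = [toy_dictionary.doc2bow(doc) for doc in toy_docs]
toy_tfidf = models.TfidfModel(toy_corpus)
# TfidfModel.idfs maps term id -> idf; resolve ids back to token strings.
toy_idfs = {toy_dictionary[term_id]: idf for term_id, idf in toy_tfidf.idfs.items()}
# Tokens appearing in fewer documents (e.g. "quick") get a higher idf than "sort".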
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


class TextPreprocessor:
    def __init__(self):
        self.stop = set(stopwords.words('english'))
        self.global_stemmer = PorterStemmer()

    # Stem the word, returning its base form.
    def stem(self, word):
        return self.global_stemmer.stem(word)

    # Tokenize a sentence, dropping '<stop>' markers; currently this only
    # strips '.' or ',' from tokens that end with one. Stop-word removal and
    # stemming (the commented-out branch) are disabled.
    def remove_stopwords(self, sentence):
        tokens = []
        for i in sentence.lower().split():
            # Was `if i not in '<stop>'`, which wrongly matched any token that
            # is a single character of the string '<stop>'.
            if i != '<stop>':
                if i.endswith('.'):
                    i = i.replace(".", "")
                elif i.endswith(','):
                    i = i.replace(",", "")
                tokens.append(i)
            # if i not in self.stop and i != '<stop>':
            #     i = i.replace(",", "")
            #     i = i.replace(".", "")
            #     i = self.stem(i)
            #     tokens.append(i)
        return tokens

    # Parse a file and generate the training set: a list of token lists for
    # student answers, or an id -> tokens dict otherwise.
    def parse_file(self, filename, is_student_answer):
        with open(filename) as f:
            content = f.readlines()
        if is_student_answer:
            return [self.remove_stopwords(line) for line in content]
        file_dict = {}
        for line in content:
            line_list = self.remove_stopwords(line)
            file_dict[line_list.pop(0)] = line_list
        return file_dict
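# Usage sketch for TextPreprocessor (the sentence is illustrative):
tp = TextPreprocessor()
print(tp.remove_stopwords("The cell membrane controls transport. <STOP>"))
# -> ['the', 'cell', 'membrane', 'controls', 'transport']
# '<stop>' markers are dropped and trailing '.'/',' stripped; stop-word
# removal and stemming stay disabled, matching the commented-out branch.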
import re

# Assuming gensim's remove_stopwords here, consistent with the gensim
# preprocessing used elsewhere in this project.
from gensim.parsing.preprocessing import remove_stopwords
from nltk.stem import PorterStemmer


class Tokenizer:
    def __init__(self):
        self.p = PorterStemmer()

    def parse(self, nl_path, code_path):
        return self.__combine(self.__parse_file(nl_path, True, True),
                              self.__parse_file(code_path, False, True))

    @staticmethod
    def __combine(nl_dict, code_dict):
        # Pair natural-language and code tokens that share the same numeric key.
        ret = []
        for key in sorted([int(key) for key in nl_dict.keys()]):
            ret.append((nl_dict[str(key)], code_dict[str(key)], str(key)))
        return ret

    def __parse_file(self, file_path, rm_stopwords=False, stem=False):
        # Each line has the form '<id>\t<content>'.
        ret = {}
        with open(file_path, 'r') as f:
            lines = f.readlines()
        for line in lines:
            if len(line) > 0:
                p = line.index('\t')
                idx = line[:p]
                ret[idx] = self.__get_tokens(line[p + 1:], rm_stopwords, stem)
        return ret

    def __get_tokens(self, content, rm_stopwords=False, stem=False):
        # Keep alphabetic runs only, then split camel case.
        words = [word for word in re.split('[^A-Za-z]+', content) if len(word) > 0]
        ret = []
        for word in words:
            ret += self.__camel_case_split(word)
        tmp = []
        for word in ret:
            if rm_stopwords:
                # gensim's remove_stopwords returns '' for a stop word.
                word = remove_stopwords(word)
            if len(word) > 0:
                if stem:
                    word = self.p.stem(word)
                tmp.append(word)
        return tmp

    @staticmethod
    def __camel_case_split(word):
        matches = re.finditer(
            '.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', word)
        return [m.group(0).lower() for m in matches]
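# Quick check of the camel-case split regex used by Tokenizer
# (the input string is illustrative):
import re

matches = re.finditer(
    '.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)',
    'getHTTPResponseCode')
print([m.group(0).lower() for m in matches])
# -> ['get', 'http', 'response', 'code']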
import string

import numpy as np
import pandas as pd
from nltk.stem import PorterStemmer


def get_text_similarity(url_df, dictionary, tfidf, sims, inds, sim_type):
    '''
    Calculate the tf-idf text similarity between the origin and target pages.

    :param url_df: dataframe of keywords for each url
    :param dictionary: gensim tf-idf dictionary
    :param tfidf: fitted gensim TfidfModel
    :param sims: tf-idf similarity matrix
    :param inds: index mapping each text to its position in the corpus
    :param sim_type: which text field to compare ('title' or 'body')
    :return: dataframe of urls and similarity scores
    '''
    url_df[sim_type] = url_df[sim_type].fillna('')
    translator = str.maketrans('', '', string.punctuation)
    global_stemmer = PorterStemmer()

    # Load all stopwords.
    with open('/Users/thyde/Documents/cloned_proj_moat/project_moat/stopwords.txt') as f:
        stopwords = f.read().split()

    # Parse each text into a list of word stems.
    texts = [[global_stemmer.stem(word)
              for word in text.translate(translator).lower().split()
              if word not in stopwords]
             for text in url_df[sim_type].values]

    # Score each text against the origin document.
    sim_scores = []
    for text in texts:
        vec_bow = dictionary.doc2bow(text)
        vec_tfidf = tfidf[vec_bow]
        res = sims[vec_tfidf]
        try:
            origin_text = url_df[url_df['origin'] == url_df['url']][sim_type].values[0]
            sim_scores.append(res[inds[origin_text]])
        except KeyError:
            sim_scores.append(np.nan)

    return pd.DataFrame([url_df['url'].values, sim_scores],
                        index=['url', 'title_similarity']).T
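# Hedged sketch of the gensim objects this function expects (toy data; in the
# real pipeline they come from title_sim_construction below):
from gensim import corpora, models, similarities

toy_texts = [["stock", "market", "crash"], ["bond", "yield", "curve"]]
toy_dict = corpora.Dictionary(toy_texts)
toy_corpus = [toy_dict.doc2bow(t) for t in toy_texts]
toy_tfidf = models.TfidfModel(toy_corpus)
toy_sims = similarities.MatrixSimilarity(toy_tfidf[toy_corpus],
                                         num_features=len(toy_dict))

query = toy_dict.doc2bow(["stock", "crash"])
res = toy_sims[toy_tfidf[query]]  # cosine similarity of the query to every document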
import pickle as pkl
import string

from gensim import corpora, models
from nltk.stem import PorterStemmer


def title_sim_construction(url_list, list_name, collection):
    '''
    :param url_list: list of urls to get title similarity for
    :param list_name: list name for purposes of saving pkl files
    :param collection: mongo collection to query
    :return: nothing, just saves files
    '''
    # Classic stopwords textfile; will be in the repo.
    with open('/Users/thyde/Downloads/stopwords.txt') as f:
        stopwords = f.read().split()
    translator = str.maketrans('', '', string.punctuation)
    global_stemmer = PorterStemmer()

    docs = list(collection.find({'url': {'$in': url_list}}))

    # Pull out title text and create an index dictionary.
    title_text = [doc['title'] for doc in docs]
    title_ind = {title: i for i, title in enumerate(title_text)}

    # Parse the words in each title into stems.
    texts = [[global_stemmer.stem(word)
              for word in title.translate(translator).lower().split()
              if word not in stopwords]
             for title in title_text]

    # Create the gensim corpus and tf-idf model.
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    # Save the artifacts.
    pkl.dump(dictionary, open('{}_title_dictionary.pkl'.format(list_name), 'wb'))
    pkl.dump(tfidf, open('{}_title_tfidf.pkl'.format(list_name), 'wb'))
    pkl.dump(corpus_tfidf, open('{}_title_corpus.pkl'.format(list_name), 'wb'))
    pkl.dump(title_ind, open('{}_title_ind.pkl'.format(list_name), 'wb'))
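# Re-loading the saved artifacts and building the similarity index expected by
# get_text_similarity above ('mylist' stands in for an actual list_name):
import pickle as pkl
from gensim import similarities

dictionary = pkl.load(open('mylist_title_dictionary.pkl', 'rb'))
tfidf = pkl.load(open('mylist_title_tfidf.pkl', 'rb'))
corpus_tfidf = pkl.load(open('mylist_title_corpus.pkl', 'rb'))
inds = pkl.load(open('mylist_title_ind.pkl', 'rb'))
sims = similarities.MatrixSimilarity(corpus_tfidf, num_features=len(dictionary))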
from nltk.stem import PorterStemmer


def preprocess_text(self, text):
    """Apply preprocessing to a single text document.

    This should perform tokenization in addition to any other desired
    preprocessing steps.

    Args:
        text (str): document text read from plain-text file.

    Returns:
        iterable of str: tokens produced from `text` as a result of
            preprocessing.
    """
    # Character-level filters run on the raw string, then the tokenizer
    # splits it, then token-level filters run on the token stream.
    for character_filter in self.character_filters:
        text = character_filter(text)

    tokens = self.tokenizer(text)
    for token_filter in self.token_filters:
        tokens = token_filter(tokens)

    if self.stem:
        p = PorterStemmer()
        tokens = [p.stem(token) for token in tokens]

    return tokens
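# Illustrative wiring for preprocess_text; the host class name and the
# concrete filter choices are assumptions, only the attribute names come
# from the method above:
class SimplePreprocessor:
    def __init__(self):
        self.character_filters = [str.lower]
        self.tokenizer = str.split
        self.token_filters = [lambda tokens: [t for t in tokens if t.isalpha()]]
        self.stem = True

    preprocess_text = preprocess_text  # reuse the method defined above


print(SimplePreprocessor().preprocess_text("Running cats and dogs."))
# -> ['run', 'cat', 'and']   ("dogs." fails isalpha after str.split)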
from gensim.parsing.preprocessing import preprocess_string
from nltk.stem import PorterStemmer

# dbFacebookSaved, model, model2, model3, read_db_data_to_article and
# adding_weight_to_dict are module-level names defined elsewhere in this
# project. The embedding models use the pre-4.0 gensim API (`.vocab`).


def searchIndbFacebookSaved(search_value):
    # Drop common filler words from the query. str.replace returns a new
    # string, so it must be reassigned (the original discarded the result);
    # replacing with " " rather than "" avoids gluing neighbouring words.
    for x in "and or it is the a".split():
        search_value = search_value.replace(" " + x + " ", " ")

    result = dbFacebookSaved.query.filter(
        dbFacebookSaved.title.ilike("%" + search_value.replace(" ", "%") + "%"))
    idList = [
        result.order_by(dbFacebookSaved.date)[count - 1].id
        for count in range(result.count(), 0, -1)
    ]
    idDict = dict()
    idDict = adding_weight_to_dict(idDict, idList, 1)
    print(".ilike")
    print(idDict)

    stemmer = PorterStemmer()
    search_value = search_value.split()
    search_valueRaw = list(search_value)

    # For multi-word queries, expand the query with terms similar to the sum
    # of the word vectors (fastText model `model3`).
    if len(search_value) > 1:
        sumVector = model3['car'] * 0  # zero vector of the right dimensionality
        for searchTerm in search_valueRaw:
            if searchTerm.lower() in model3.vocab:
                sumVector = sumVector + model3[searchTerm.lower()]
        similarList = model3.similar_by_vector(sumVector)
        print("similarList (sumVector)")
        print(similarList)
        print("New search value after sumVec:")
        search_value += [
            similarList[i][0] for i in range(min(5, len(similarList)))
            if similarList[i][1] >= 0.72 and similarList[i][0] not in search_value
        ]
        print(search_value)

    # Expand each raw query term with its nearest neighbours from the
    # word2vec model (`model`) and the fastText CBOW model (`model2`).
    search_valueR = []
    for searchTerm in search_valueRaw:
        for i, mdl in enumerate([model, model2]):
            if searchTerm.lower() in mdl.vocab:
                similarList = mdl.most_similar(searchTerm.lower())
                listLengh = 3 if i == 0 else 5
                scoreThreshold = 0.5 if i == 0 else 0.55
                tempText = (" from gensim_word2vec for relating to " if i == 0
                            else " from fasttext(CBOW) for relating to ")
                for j in range(min(listLengh, len(similarList))):
                    if (similarList[j][1] >= scoreThreshold
                            and similarList[j][0] not in search_value):
                        search_value.append(similarList[j][0])
                        search_valueR.append(similarList[j][0])
                        print("append " + similarList[j][0] + tempText + searchTerm)

    print(search_value)
    for word in search_value:
        # Skip a word when a different word with the same stem is already in
        # the query.
        if word == stemmer.stem(word) or stemmer.stem(word) not in search_value:
            result = dbFacebookSaved.query.filter(
                dbFacebookSaved.title.contains(word))
            resultKwd = dbFacebookSaved.query.filter(
                dbFacebookSaved.keywords.contains(word))
            resultSummary = dbFacebookSaved.query.filter(
                dbFacebookSaved.summary.contains(word))

            # Down-weight words that preprocess away entirely, and words that
            # were added by query expansion rather than typed by the user.
            weight = 1
            if len(preprocess_string(word)) == 0:
                weight = 0.1
            elif word in search_valueR:
                weight = 0.5

            # Title matches count fully, keyword matches at half weight.
            idList = [
                read_db_data_to_article(
                    result.order_by(dbFacebookSaved.date)[count - 1])['id']
                for count in range(result.count(), 0, -1)
            ]
            idDict = adding_weight_to_dict(idDict, idList, 1 * weight)
            print(".title.contains(" + word + ")")
            print(idDict)

            idList = [
                read_db_data_to_article(
                    resultKwd.order_by(dbFacebookSaved.date)[count - 1])['id']
                for count in range(resultKwd.count(), 0, -1)
            ]
            idDict = adding_weight_to_dict(idDict, idList, 0.5 * weight)
            print(".keywords.contains(" + word + ")")
            print(idDict)

            # Summary matches add 0.2 * weight per occurrence, capped at 0.6.
            idList = []
            for count in range(resultSummary.count(), 0, -1):
                if (resultSummary.order_by(dbFacebookSaved.date)[count - 1].id not in idList
                        and len(preprocess_string(word)) > 0):
                    article = read_db_data_to_article(
                        resultSummary.order_by(dbFacebookSaved.date)[count - 1])
                    idList.append(article['id'])
                    cumsum = 0
                    # preprocess_string is a gensim function that preprocesses
                    # a string, e.g. people -> peopl, Oranges -> orang.
                    word = preprocess_string(word)[0]
                    for w in article['text']:
                        if len(preprocess_string(w)) > 0:
                            w = preprocess_string(w)
                            if cumsum <= 0.6 and word in w:
                                idDict[article['id']] = (
                                    idDict.get(article['id'], 0) + 0.2 * weight)
                                cumsum = cumsum + 0.2 * weight
            print(".summary.contains(" + word + ")")
            print(idDict)
        else:
            print("ignore " + word + " for " + stemmer.stem(word))
    return idDict
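# adding_weight_to_dict is called above but not defined in this file; a
# plausible implementation consistent with its call sites, not the confirmed
# original:
def adding_weight_to_dict(idDict, idList, weight):
    # Accumulate `weight` onto every article id in idList.
    for article_id in idList:
        idDict[article_id] = idDict.get(article_id, 0) + weight
    return idDict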