def prepare_index(dictionary, model, tfidf, documents):
    if not os.path.isfile('soft_cosine.index'):
        similarity_index = WordEmbeddingSimilarityIndex(model.wv)
        similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary, tfidf)
        index = SoftCosineSimilarity(
            tfidf[[dictionary.doc2bow(document) for document in documents]],
            similarity_matrix)
        index.save('soft_cosine.index')
    return SoftCosineSimilarity.load('soft_cosine.index')
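
# Illustrative usage sketch for prepare_index() above, assuming the gensim 3.x API
# (Word2Vec(..., size=...), WordEmbeddingSimilarityIndex importable from gensim.models).
# The toy documents, model settings and query are hypothetical, not from the original code.
import os
from gensim.corpora import Dictionary
from gensim.models import TfidfModel, Word2Vec, WordEmbeddingSimilarityIndex
from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix

documents = [["soft", "cosine", "similarity"],
             ["cosine", "distance", "between", "vectors"],
             ["word", "embeddings", "capture", "meaning"]]
model = Word2Vec(documents, size=20, min_count=1)            # toy embeddings
dictionary = Dictionary(documents)
tfidf = TfidfModel(dictionary=dictionary)
index = prepare_index(dictionary, model, tfidf, documents)   # builds the cache on first call, loads it afterwards
query_bow = tfidf[dictionary.doc2bow(["embedding", "similarity"])]
print(index[query_bow])                                      # similarity of the query to each document
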
def get_score(self, query_string):
    if isinstance(query_string, list):
        query_string = " ".join(query_string)
    query = preprocess(query_string)
    '''
    Finally, we calculate the soft cosine similarity between the query and each of
    the documents. Unlike the regular cosine similarity (which would return zero for
    vectors with no overlapping terms), the soft cosine similarity considers word
    similarity as well.
    '''
    # Compute Soft Cosine Measure between the query and the documents.
    # From: https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/soft_cosine_tutorial.ipynb
    query_tf = self.tfidf[self.dictionary.doc2bow(query)]
    index = SoftCosineSimilarity(
        self.tfidf[[
            self.dictionary.doc2bow(document) for document in self.corpus
        ]],
        self.similarity_matrix)
    doc_similarity_scores = index[query_tf]
    # Output the sorted similarity scores and documents
    print("Mentee values: {}".format(query_string))
    return doc_similarity_scores
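
# A small, self-contained illustration of the point made in the docstring above:
# two texts that share no terms have a plain bag-of-words cosine similarity of
# exactly 0, while the soft cosine measure can still be positive because related
# terms (as judged by the word vectors) count as partially similar. Sketch only;
# the toy corpus and Word2Vec settings are arbitrary assumptions, gensim 3.x API.
from gensim.corpora import Dictionary
from gensim.models import Word2Vec, WordEmbeddingSimilarityIndex
from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix

texts = [["the", "cat", "sat", "on", "the", "mat"],
         ["a", "dog", "lay", "on", "a", "rug"],
         ["cats", "and", "dogs", "are", "pets"]]
w2v = Word2Vec(texts, size=10, min_count=1, seed=1)
dictionary = Dictionary(texts)
matrix = SparseTermSimilarityMatrix(WordEmbeddingSimilarityIndex(w2v.wv), dictionary)
index = SoftCosineSimilarity([dictionary.doc2bow(t) for t in texts], matrix)
# "dog rug" shares no term with texts[0] ("the cat sat on the mat"), so its plain
# cosine similarity to it is 0, yet its soft cosine score is typically non-zero.
print(index[dictionary.doc2bow(["dog", "rug"])])
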
def createW2VecIndex(reference_dict):
    from gensim.corpora import Dictionary
    from gensim.models import Word2Vec, WordEmbeddingSimilarityIndex
    from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix
    print("Prepare Word2Vec model")
    import time
    t1 = time.time()
    corpus = []
    #reference = []
    for term in reference_dict:
        corpus.append(word_tokenize(term))
        #reference.append(term)
    model = Word2Vec(corpus, size=20, min_count=1)  # train word-vectors
    termsim_index = WordEmbeddingSimilarityIndex(model.wv)
    dictionary = Dictionary(corpus)
    bow_corpus = [dictionary.doc2bow(document) for document in corpus]
    similarity_matrix = SparseTermSimilarityMatrix(
        termsim_index, dictionary)  # construct similarity matrix
    docsim_index = SoftCosineSimilarity(bow_corpus, similarity_matrix, num_best=3)
    t2 = time.time()
    print(" W2v index and dictionary in ", (t2 - t1) / 60, " minutes")
    import pickle
    with open("./models/W2VecIndexes.bin", 'wb') as f:
        pickle.dump((docsim_index, dictionary), f)
    return docsim_index, dictionary
def compute_sim_matrix(self):
    '''
    if(self.model_type.lower() == "fasttext"):
        model = FastText(self.questions)
    else:
        model = Word2Vec(self.questions)
    '''
    self.dictionary = Dictionary(self.questions)
    self.tfidf = TfidfModel(dictionary=self.dictionary)
    word2vec_model = Word2Vec(self.questions,
                              workers=cpu_count(),
                              min_count=5,
                              size=300,
                              seed=12345)
    sim_index = WordEmbeddingSimilarityIndex(word2vec_model.wv)
    sim_matrix = SparseTermSimilarityMatrix(sim_index,
                                            self.dictionary,
                                            self.tfidf,
                                            nonzero_limit=100)
    bow_corpus = [
        self.dictionary.doc2bow(document) for document in self.questions
    ]
    tfidf_corpus = [self.tfidf[bow] for bow in bow_corpus]
    self.docsim_index = SoftCosineSimilarity(tfidf_corpus,
                                             sim_matrix,
                                             num_best=10)
def get_sim_index(wv_model, bow_corpus, dictionary):
    termsim_index = WordEmbeddingSimilarityIndex(wv_model.wv)
    similarity_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary)
    docsim_index = SoftCosineSimilarity(bow_corpus, similarity_matrix, num_best=10)
    return docsim_index
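
# Hypothetical usage of get_sim_index() above. Because the index is built with
# num_best=10, a query returns up to ten (document_position, similarity) pairs
# rather than a dense score vector. The toy sentences and Word2Vec settings are
# assumptions (gensim 3.x API); imports mirror what get_sim_index() itself needs.
from gensim.corpora import Dictionary
from gensim.models import Word2Vec, WordEmbeddingSimilarityIndex
from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix

sentences = [["machine", "learning", "with", "python"],
             ["deep", "learning", "for", "text"],
             ["cooking", "recipes", "for", "beginners"]]
wv_model = Word2Vec(sentences, size=20, min_count=1)
dictionary = Dictionary(sentences)
bow_corpus = [dictionary.doc2bow(s) for s in sentences]

docsim_index = get_sim_index(wv_model, bow_corpus, dictionary)
print(docsim_index[dictionary.doc2bow(["text", "learning"])])  # list of (position, score) pairs
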
def softcossim(query, documents):
    # Compute Soft Cosine Measure between the query and the documents.
    query = tfidf[dictionary.doc2bow(query)]
    index = SoftCosineSimilarity(
        tfidf[[dictionary.doc2bow(document) for document in documents]],
        similarity_matrix)
    similarities = index[query]
    return similarities
def get_embedding_files(self, num_best=10):
    """
    Get the dictionary, bow_corpus, similarity matrix and docsim index
    pre-trained on all image tags.
    """
    # embeddings
    try:
        with open(f'{constants.EMBEDDING_DIR}/soft_cosine.pkl', "rb") as f:
            self.dictionary, self.bow_corpus, self.similarity_matrix, _ = pickle.load(f)
        self.docsim_index = SoftCosineSimilarity(self.bow_corpus,
                                                 self.similarity_matrix,
                                                 num_best=num_best)
    except FileNotFoundError:
        print('no file found, training word2vec to get bow_corpus, '
              'similarity matrix and docsim index')
        # read in all tags
        try:
            with open(f'{constants.DATA_DIR}/all_img_tags.pkl', 'rb') as fp:
                all_img_tags_lower = pickle.load(fp)
        except FileNotFoundError:
            print(f'no file found at {constants.DATA_DIR}/all_img_tags.pkl')
            # without the tags there is nothing to train on, so re-raise
            raise
        model = Word2Vec(all_img_tags_lower, size=20, min_count=1)  # train word2vec
        termsim_index = WordEmbeddingSimilarityIndex(model.wv)
        self.dictionary = Dictionary(all_img_tags_lower)
        self.bow_corpus = [
            self.dictionary.doc2bow(document) for document in all_img_tags_lower
        ]
        self.similarity_matrix = SparseTermSimilarityMatrix(
            termsim_index, self.dictionary)  # construct similarity matrix
        # 10 (default) most similar image tag vectors
        self.docsim_index = SoftCosineSimilarity(self.bow_corpus,
                                                 self.similarity_matrix,
                                                 num_best=num_best)
        print(f'Saving bow_corpus, similarity matrix and docsim index to '
              f'{constants.EMBEDDING_DIR}')
        with open(f'{constants.EMBEDDING_DIR}/soft_cosine.pkl', "wb") as f:
            pickle.dump((self.dictionary, self.bow_corpus,
                         self.similarity_matrix, self.docsim_index), f)
def _softcossim(self, query: str, documents: list):
    # Compute Soft Cosine Measure between the query and each of the documents.
    query = self.tfidf[self.dictionary.doc2bow(query)]
    index = SoftCosineSimilarity(
        self.tfidf[[
            self.dictionary.doc2bow(document) for document in documents
        ]],
        self.similarity_matrix)
    similarities = index[query]
    return similarities
def similarity(self, query, documents):
    """Calculate the soft cosine similarity between the query and all documents."""
    bow_query = self.dictionary.doc2bow(query)
    bow_docs = [
        self.dictionary.doc2bow(document) for document in documents
    ]
    index = SoftCosineSimilarity(bow_docs, self.matrix)
    similarities = index[bow_query]
    return similarities
def __init__(self, initializer):
    tfidf = initializer.getTfIdf()
    dictionary = initializer.getDictionary()
    query = tfidf[dictionary.doc2bow(initializer.getPreprocessedNews())]
    preprocessed_documents = initializer.getPreprocessedDocuments()
    index = SoftCosineSimilarity(
        tfidf[[
            dictionary.doc2bow(document)
            for document in preprocessed_documents
        ]],
        initializer.getSimilarityMatrix())
    similarities = index[query]
    self.scores = similarities[1:len(similarities)]
def __init__(self, cut_off=0.2, cleanup_urls=True, nltk_tokenizer=False):
    super().__init__(cleanup_urls=cleanup_urls, nltk_tokenizer=nltk_tokenizer)
    terms_idx = WordEmbeddingSimilarityIndex(self.w2vmodel.wv)
    self.dictionary = Dictionary(self.corpus)
    bow = [self.dictionary.doc2bow(doc) for doc in self.corpus]
    similarity_matrix = SparseTermSimilarityMatrix(terms_idx, self.dictionary)
    self.softcosinesimilarity = SoftCosineSimilarity(
        bow, similarity_matrix, num_best=10
    )
def glove_score_1v1(query_string, documents): # query_string = 'Leticia has 3+ years of experience in data science. She has a background in applied mathematics and computer science and currently works as a data scientist at Ørsted. In her work, she builds condition-based algorithms to predict when their offshore wind turbines are going to fail in order to optimize daily operations. Leticia has an international upbringing and has lived in 9 different countries, and she is driven by a great work environment with diversity in the workplace. Leticia wants to become a mentor to help students in their transition to professional life and share their own experiences of studying and working abroad and succeeding as a woman in a male-dominated field. Leticia would prefer a mentee that has ambition and drive, such that she has a better understanding of where he or she wants to go and how she can help in the best way.' # documents = ['I would describe myself as being dedicated and curious. I am very interested in data analytics and operations research, specially in connection with logistics and planning. For my Bachelor thesis I did a simulation project with Copenhagen Malmö Port on how to optimise the logistics operations at their container-terminal, which really sparked my interest in this area. I am always interesting in learning new things and I try to take advantage of the great opportunities offered through my studies at DTU - like this mentorship or having the opportunity to go abroad for a semester. Last year I spent a semester at Hong Kong University of Science and Technology which was a big experience both academically and personally. Currently, I am working as a student assistant in Danmarks Nationalbank, and even though it is interesting getting an insight into the financial world and having to apply my skills to a different area, at some time, I would like to try something more related to my studies. I would like to be part of the program to gain more knowledge of what it is like working in the industry as a data analyst or engineer - preferably working with logistics, data analytics or operations research. I know very few engineers outside the academic world at DTU, so I would appreciate a mentor who could share some of their experiences and tips on transitioning from student to professional. I am leaning towards specialising in prescriptive analytics, so I would also be very interested in learning more about how optimisation methods and simulation studies are actually applied to real-world problems. What I hope to achieve as a mentee is to be more prepared for working in the industry and get advice on how to make smart choices regarding my studies. I would also appreciate some advice on whether to take another semester abroad during my Masters or gain more work-experience.', # 'My greatest ambition is to leave the world in a better state for humans to experience the quality of life than it was when I entered it. This reason lead me to choose scientific studies - general engineering in Paris at first, and then Applied Mathematics in DTU - in the hope to use technologys leverage for maximum impact. Disclaimer: I am currently not looking for a position as I am to continue working for Tomorrow, the fantastic company I am already working for I nevertheless am very interested to get some insights, from a mentor that went through a similar line of study, into how they decided on starting to work straight away vs continue in the academic world by applying for a PhD. 
I am also eager to learn more about what it actually means to be a professional "data scientist". How much research/theory is actually useful in day-to-day operations and what level of freedom they can have in their decisions and organisation. I am also curious to learn more about career path for data scientist. The popularity of this position is fairly recent and for this reason, career evolution for a data scientist is still rather obscure to me.'] # 'I would describe myself as focused, structured and vigorous. My main interest is overall concrete technology. It is from the mixing recipes to the maintaining of old structures to "cure" its sickness. The topic of my bachelor project was about testing the different national and international test methods for alkali silica reactions (ASR). To find out the most optimal methods, to catch that sand and stone which could develop ASR. My master thesis is about testing if mine tailings could be used as a substitute for fly ash, which soon not will be available at the same amount as earlier. In my free time, I have been doing a lot of volunteering. I have been a coach for a handball team for 11-12 year old girls for two years. I learned a lot about coaching, planning and taught the girls to be team players. Further I have been part of the organizing committee for the study start and the council for my study line for three years. Where I further developed my competencies planning, leading and get things done. I usually take the lead when things need to be done, but I dont know if Im suited for management. I hope to get a closer look at "the real life", to get ready when I finish my thesis in January. I want to a mentee to get knowledge about the "life" after university. I would prefer a mentor who works with civil engineering, but a mentor who can taught me difference between consulting and entrepreneur firms, so I can find out what is right for me, would be a nice. I still don\'t know what exactly I can be, but I would appreciate some advice. I hope to achieve a way into the business, which could help me find a job after my thesis.'] # From: https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/soft_cosine_tutorial.ipynb # Preprocess the documents, including the query string corpus = [preprocess(document) for document in documents] query = preprocess(query_string) ''' Then we create a similarity matrix, that contains the similarity between each pair of words, weighted using the term frequency: ''' # Load the model: this is a big file, can take a while to download and open glove = api.load("glove-wiki-gigaword-50") similarity_index = WordEmbeddingSimilarityIndex(glove) # Build the term dictionary, TF-idf model print("Everything has been initialized") dictionary = Dictionary(corpus + [query]) tfidf = TfidfModel(dictionary=dictionary) # Create the term similarity matrix. similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary, tfidf) ''' Finally, we calculate the soft cosine similarity between the query and each of the documents. Unlike the regular cosine similarity (which would return zero for vectors with no overlapping terms), the soft cosine similarity considers word similarity as well. ''' # Compute Soft Cosine Measure between the query and the documents. 
    # From: https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/soft_cosine_tutorial.ipynb
    query_tf = tfidf[dictionary.doc2bow(query)]
    index = SoftCosineSimilarity(
        tfidf[[dictionary.doc2bow(document) for document in corpus]],
        similarity_matrix)
    doc_similarity_scores = index[query_tf]

    # Output the sorted similarity scores and documents
    sorted_indexes = np.argsort(doc_similarity_scores)[::-1]
    count = 0
    print("Mentee values: {}".format(query_string))
    for idx in sorted_indexes:
        count += 1
        if count > 10:
            break
        print(f'{idx} \t {doc_similarity_scores[idx]:0.3f} \t {documents[idx]}')
    return doc_similarity_scores
def find_similarity(search_w, corpus_w):
    rv = {}
    rv['result'] = []
    bmatch = False
    #Tokenize the sentence into words
    #search_tokens = [word for word in search_w.split()]
    #corpus_tokens = [word for word in corpus_w.split()]
    search_tokens = search_w
    corpus_tokens = corpus_w
    #print(search_tokens)
    #print(corpus_tokens)
    #print("-----")
    #cp = []
    #for c in corpus_tokens:
    #    cp.append([c])
    #corpus_tokens = cp
    search_tokens = [search_w]
    print(corpus_tokens)
    print(search_tokens)

    # Prepare a dictionary and a corpus.
    #documents = [svc_tokens, specs_tokens]
    dictionary = corpora.Dictionary(corpus_tokens)
    termsim_index = WordEmbeddingSimilarityIndex(model.wv)
    bow_corpus = [dictionary.doc2bow(doc) for doc in corpus_tokens]
    similarity_matrix = SparseTermSimilarityMatrix(
        termsim_index, dictionary)  # construct similarity matrix
    docsim_index = SoftCosineSimilarity(bow_corpus, similarity_matrix, num_best=10)

    # Compute soft cosine similarity
    for t in search_tokens:
        #print("looking for %s" %(t.split()))
        for e in t.split(','):
            match = {}
            e = e.strip()
            lkup = [e]
            try:
                result = docsim_index[dictionary.doc2bow(lkup)]
            except:
                result = [(0, 0)]
            print(f"looking for {lkup}, result {result}")
            if len(result) and result[0][1] > 0.5:
                match['word'] = e.split()
                match['value'] = str(result)
                rv['result'].append(match)
                bmatch = True
    #print(docsim_index[dictionary.doc2bow(search_tokens)])
    return rv if bmatch else None
def compute_msg_dist_matrix(data):
    lst_notifications = data
    # print(lst_notifications)
    # tokenise each message, then train word-vectors on the token lists
    data_2 = [d.split() for d in lst_notifications]
    #print(data)
    model = Word2Vec(data_2, min_count=1)  # train word-vectors
    termsim_index = WordEmbeddingSimilarityIndex(model.wv)
    dictionary = Dictionary(data_2)
    bow_corpus = [dictionary.doc2bow(document) for document in data_2]
    similarity_matrix = SparseTermSimilarityMatrix(
        termsim_index, dictionary)  # construct similarity matrix
    docsim_index = SoftCosineSimilarity(bow_corpus, similarity_matrix)
    # iterating the index yields one row of similarities per document; stacking the
    # rows gives a square similarity matrix, and 1 - similarity is the distance
    soft_cosine_distance_matrix = 1 - np.array(list(docsim_index))
    return soft_cosine_distance_matrix
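
# One way the distance matrix returned by compute_msg_dist_matrix() above can be
# used: agglomerative clustering of the messages with a precomputed metric. The
# message list, cluster count and linkage are illustrative assumptions; the keyword
# is affinity= on older scikit-learn releases (newer ones call it metric=), and the
# gensim/numpy imports mirror what compute_msg_dist_matrix() itself relies on.
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from gensim.corpora import Dictionary
from gensim.models import Word2Vec, WordEmbeddingSimilarityIndex
from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix

messages = ["disk usage above 90 percent on host a",
            "disk almost full on host b",
            "login failed for user admin",
            "authentication failure for user root"]
distance_matrix = compute_msg_dist_matrix(messages)
clustering = AgglomerativeClustering(n_clusters=2,
                                     affinity="precomputed",
                                     linkage="average")
labels = clustering.fit_predict(distance_matrix)
print(labels)  # e.g. the disk alerts in one cluster, the auth failures in the other
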
def computeDocumentSimilarityIndex(self, corpus):
    """
    Compute the similarity matrix of the model

    Args:
        corpus: dictionary to use to create index

    Returns:
        SoftCosineSimilarity instance
    """
    if self.wordEmbedding is None:
        self.wordEmbedding = WordEmbeddingSimilarityIndex(self.model)
    # create similarity matrix, update flags
    simMatrix = SparseTermSimilarityMatrix(self.wordEmbedding, corpus)
    return SoftCosineSimilarity([x.sentence for x in self.itemScores], simMatrix)
def softcos(defns, return_centers=False):
    keys = list(defns.keys())
    if len(defns) == 1:
        return unclusterable_default(keys, return_centers=return_centers)
    dictionary, bow_corpus = mk_dictionary_bow_corpus(defns.values())
    if len(dictionary) == 0:
        return unclusterable_default(keys, return_centers=return_centers)
    similarity_index = WordEmbeddingSimilarityIndex(vecs.get_en())
    similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary)
    index = SoftCosineSimilarity(bow_corpus, similarity_matrix)
    affinities = np.zeros((len(defns), len(defns)))
    for row, similarities in enumerate(index):
        affinities[row] = similarities
    return graph_clust_grouped(affinities, keys, return_centers)
def calculate_distance(self, query_string, documents):
    def preprocess(doc):
        # Tokenize, clean up input document string
        doc = sub(r'<img[^<>]+(>|$)', " image_token ", doc)
        doc = sub(r'<[^<>]+(>|$)', " ", doc)
        doc = sub(r'\[img_assist[^]]*?\]', " ", doc)
        doc = sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', " url_token ", doc)
        return [token for token in simple_preprocess(doc, min_len=0, max_len=float("inf"))
                if token not in STOPWORDS]

    # Preprocess the documents, including the query string
    corpus = [preprocess(document) for document in documents]
    query = preprocess(query_string)

    # Load the model: this is a big file, can take a while to download and open
    similarity_index = WordEmbeddingSimilarityIndex(glove)

    # Build the term dictionary, TF-idf model
    dictionary = Dictionary(corpus + [query])
    tfidf = TfidfModel(dictionary=dictionary)

    # Create the term similarity matrix.
    similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary, tfidf)

    query_tf = tfidf[dictionary.doc2bow(query)]
    index = SoftCosineSimilarity(
        tfidf[[dictionary.doc2bow(document) for document in corpus]],
        similarity_matrix)
    doc_similarity_scores = index[query_tf]

    # Output the sorted similarity scores and documents
    sorted_indexes = np.argsort(doc_similarity_scores)[::-1]
    if len(documents) > 1:
        for idx in sorted_indexes:
            print(f'{idx} \t {doc_similarity_scores[idx]:0.3f} \t {documents[idx]}')
    # print(doc_similarity_scores)
    return doc_similarity_scores
def get_score(self, query_string):
    if isinstance(query_string, list):
        query_string = " ".join(query_string)
    query = preprocess(query_string)
    print("Everything has been initialized")
    dictionary = Dictionary(self.corpus + [query])
    tfidf = TfidfModel(dictionary=dictionary)

    # Create the term similarity matrix.
    similarity_matrix = SparseTermSimilarityMatrix(self.similarity_index,
                                                   dictionary, tfidf)
    '''
    Finally, we calculate the soft cosine similarity between the query and each of
    the documents. Unlike the regular cosine similarity (which would return zero for
    vectors with no overlapping terms), the soft cosine similarity considers word
    similarity as well.
    '''
    # Compute Soft Cosine Measure between the query and the documents.
    # From: https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/soft_cosine_tutorial.ipynb
    query_tf = tfidf[dictionary.doc2bow(query)]
    index = SoftCosineSimilarity(
        tfidf[[dictionary.doc2bow(document) for document in self.corpus]],
        similarity_matrix)
    doc_similarity_scores = index[query_tf]

    # Output the sorted similarity scores and documents
    sorted_indexes = np.argsort(doc_similarity_scores)[::-1]
    count = 0
    print("Mentee values: {}".format(query_string))
    for idx in sorted_indexes:
        count += 1
        if count > 10:
            break
        # print(f'{idx} \t {doc_similarity_scores[idx]:0.3f} \t {self.documents[idx]}')
    return doc_similarity_scores
def create_model(storage_client, json_in, video_id):
    """
    Create soft cosine similarity model

    Keyword arguments:
    storage_client -- a Storage instance
    json_in -- json returned from the YouTube Captions API
    video_id -- the Youtube video_id

    Returns:
    - A Soft Cosine Measure model
    - The dictionary of terms computed
    """
    video_id = video_id.lower()
    # check if bucket exists
    if blob_exists(storage_client, video_id):
        # retrieve blob from bucket
        bucket = storage_client.bucket(bucket_name)
        blob = bucket.blob(video_id)  # The blob's name is the video ID
        # download the storage pickle as a binary string
        blob_str = blob.download_as_string()
        dictionary, index = pickle.loads(blob_str)
        return dictionary, index

    # download stop_words and glove
    stop_words, glove = download_resources()
    # Create Glove similarity Index
    similarity_index = WordEmbeddingSimilarityIndex(glove)
    # parse json captions into document form
    documents = processInput(json_in)
    # create a corpus from documents
    corpus = [preprocess(document, stop_words) for document in documents]
    # create dictionary from documents
    dictionary = Dictionary(corpus)
    tfidf = TfidfModel(dictionary=dictionary)
    # create a term similarity matrix
    similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary, tfidf)
    # Compute Soft Cosine Measure between documents
    index = SoftCosineSimilarity(
        tfidf[[dictionary.doc2bow(document) for document in corpus]],
        similarity_matrix)

    # save index and dictionary
    storage_client = storage.Client()
    # create a binary pickle representation
    bin_tuple = pickle.dumps((dictionary, index))
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(video_id)
    # save to storage
    blob.upload_from_string(bin_tuple)
    if debug_messages:
        print("Binary model with name {} and dictionary uploaded.".format(video_id))
    return dictionary, index
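
# Hypothetical follow-up to create_model() above: querying the returned index to find
# the caption segments most related to a search phrase. `storage_client`, `captions_json`
# and "some_video_id" are placeholders, and the query tokens are illustrative. Note that
# create_model() does not return its TfidfModel, so the plain bag-of-words query used
# here only approximates the TF-IDF weighting the index was built with.
dictionary, index = create_model(storage_client, captions_json, "some_video_id")

query_tokens = ["neural", "network", "training"]
scores = index[dictionary.doc2bow(query_tokens)]  # numpy array, one score per caption segment
best_segment = int(scores.argmax())
print(best_segment, scores[best_segment])
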
def main(chosen_model_no=0, num_items_displayed=10, use_spacy=False, use_soft_cosine_similarity=False, num_topics=None, no_below=5, no_above=0.5, normalize_vectors=False): logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) if num_topics is None: num_topics = 100 possible_model_names = [ 'tf_idf', # 0 'lsi_bow', 'lsi_tf_idf', # 1, 2 'rp_bow', 'rp_tf_idf', # 3, 4 'lda_bow', 'lda_tf_idf', # 5, 6 'hdp_bow', 'hdp_tf_idf', # 7, 8 'word2vec', # 9 ] chosen_model_name = possible_model_names[chosen_model_no] print(chosen_model_name) game_names, _ = load_game_names(include_genres=False, include_categories=False) steam_tokens = load_tokens() nlp = spacy.load('en_core_web_lg') documents = list(steam_tokens.values()) dct = Dictionary(documents) print(len(dct)) dct.filter_extremes(no_below=no_below, no_above=no_above) print(len(dct)) corpus = [dct.doc2bow(doc) for doc in documents] # Pre-processing pre_process_corpus_with_tf_idf = chosen_model_name.endswith('_tf_idf') tfidf_model = TfidfModel(corpus, id2word=dct, normalize=normalize_vectors) if pre_process_corpus_with_tf_idf: # Caveat: the leading underscore is important. Do not use this pre-processing if the chosen model is Tf-Idf! print('Corpus as Tf-Idf') pre_processed_corpus = tfidf_model[corpus] else: print('Corpus as Bag-of-Words') pre_processed_corpus = corpus # Model model = None wv = None index2word_set = None if chosen_model_name == 'tf_idf': print('Term Frequency * Inverse Document Frequency (Tf-Idf)') model = tfidf_model elif chosen_model_name.startswith('lsi'): print('Latent Semantic Indexing (LSI/LSA)') model = LsiModel(pre_processed_corpus, id2word=dct, num_topics=num_topics) elif chosen_model_name.startswith('rp'): print('Random Projections (RP)') model = RpModel(pre_processed_corpus, id2word=dct, num_topics=num_topics) elif chosen_model_name.startswith('lda'): print('Latent Dirichlet Allocation (LDA)') model = LdaModel(pre_processed_corpus, id2word=dct, num_topics=num_topics) elif chosen_model_name.startswith('hdp'): print('Hierarchical Dirichlet Process (HDP)') model = HdpModel(pre_processed_corpus, id2word=dct) elif chosen_model_name == 'word2vec': use_a_lot_of_ram = False if use_a_lot_of_ram: model = None print('Loading Word2Vec based on Google News') # Warning: this takes a lot of time and uses a ton of RAM! 
wv = KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin.gz', binary=True) else: if use_spacy: print('Using Word2Vec with spaCy') else: print('Training Word2Vec') model = Word2Vec(documents) wv = model.wv if not use_spacy: wv.init_sims(replace=normalize_vectors) index2word_set = set(wv.index2word) else: print('No model specified.') model = None if chosen_model_name != 'word2vec': if not use_soft_cosine_similarity: index = MatrixSimilarity(model[pre_processed_corpus], num_best=10, num_features=len(dct)) else: w2v_model = Word2Vec(documents) similarity_index = WordEmbeddingSimilarityIndex(w2v_model.wv) similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dct, tfidf_model, nonzero_limit=100) index = SoftCosineSimilarity(model[pre_processed_corpus], similarity_matrix) else: index = None query_app_ids = load_benchmarked_app_ids(append_hard_coded_app_ids=True) app_ids = list(int(app_id) for app_id in steam_tokens.keys()) matches_as_app_ids = [] for query_count, query_app_id in enumerate(query_app_ids): print('[{}/{}] Query appID: {} ({})'.format(query_count + 1, len(query_app_ids), query_app_id, get_app_name(query_app_id, game_names))) query = steam_tokens[str(query_app_id)] if use_spacy: spacy_query = Doc(nlp.vocab, query) else: spacy_query = None if chosen_model_name != 'word2vec': vec_bow = dct.doc2bow(query) if pre_process_corpus_with_tf_idf: pre_preoccessed_vec = tfidf_model[vec_bow] else: pre_preoccessed_vec = vec_bow vec_lsi = model[pre_preoccessed_vec] sims = index[vec_lsi] if use_soft_cosine_similarity: sims = enumerate(sims) similarity_scores_as_tuples = [(str(app_ids[i]), sim) for (i, sim) in sims] similarity_scores = reformat_similarity_scores_for_doc2vec(similarity_scores_as_tuples) else: if use_spacy: similarity_scores = {} for app_id in steam_tokens: reference_sentence = steam_tokens[app_id] spacy_reference = Doc(nlp.vocab, reference_sentence) similarity_scores[app_id] = spacy_query.similarity(spacy_reference) else: query_sentence = filter_out_words_not_in_vocabulary(query, index2word_set) similarity_scores = {} counter = 0 num_games = len(steam_tokens) for app_id in steam_tokens: counter += 1 if (counter % 1000) == 0: print('[{}/{}] appID = {} ({})'.format(counter, num_games, app_id, game_names[app_id])) reference_sentence = steam_tokens[app_id] reference_sentence = filter_out_words_not_in_vocabulary(reference_sentence, index2word_set) try: similarity_scores[app_id] = wv.n_similarity(query_sentence, reference_sentence) except ZeroDivisionError: similarity_scores[app_id] = 0 similar_app_ids = print_most_similar_sentences(similarity_scores, num_items_displayed=num_items_displayed, verbose=False) matches_as_app_ids.append(similar_app_ids) print_ranking(query_app_ids, matches_as_app_ids, only_print_banners=True) return
print(now(), 'loaded speech data')
np.seterr(divide='ignore', invalid='ignore')
dictionary = Dictionary(documents)
tfidf = TfidfModel(dictionary=dictionary)
wv = KeyedVectors.load("word2vec_100_3_polish.bin")
print(now(), 'loaded model')
similarity_index = WordEmbeddingSimilarityIndex(wv)
similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary)
print(now(), 'created similarity matrix')
# num_best is needed so that queries return (document, score) pairs, as used below
index = SoftCosineSimilarity(
    [dictionary.doc2bow(document) for document in documents],
    similarity_matrix, num_best=10)
print(now(), 'created index')
# index.save('soft_cosine.index')
# index = SoftCosineSimilarity.load('soft_cosine.index')
while True:
    try:
        query = input("Query: ").lower().split()
        # query = tfidf[dictionary.doc2bow(query)]
        similarities = index[dictionary.doc2bow(query)]
        result_list = [partie[i] for i in [a[0] for a in similarities]]
        score_list = [a[1] for a in similarities]
        results = [' '.join(each) for each in result_list]
        for score, result in zip(score_list, results):
            # assumed continuation: the original snippet is truncated here
            print(score, result)
    except KeyboardInterrupt:
        # assumed: allow Ctrl+C to exit the query loop
        break
def fit( self, path_to_model, source, target, sourcetext="text", sourcedate="publication_date", targettext="text", targetdate="publication_date", keyword_source=None, keyword_target=None, keyword_source_must=False, keyword_target_must=False, condition_source=None, condition_target=None, days_before=None, days_after=None, merge_weekend=False, threshold=None, from_time=None, to_time=None, to_csv=False, destination="comparisons", to_pajek=False, filter_above=0.5, filter_below=5, ): """ path_to_model = Supply a pre-trained word2vec model. Information on how to train such a model can be found here: https://rare-technologies.com/word2vec-tutorial/ source/target = doctype of source/target (can also be a list of multiple doctypes) sourcetext/targettext = field where text of target/source can be found (defaults to 'text') sourcdate/targetedate = field where date of source/target can be found (defaults to 'publication_date') keyword_source/_target = optional: specify keywords that need to be present in the textfield; list or string (lowercase) keyword_source/_target_must = optional: In case of a list, do all keywords need to appear in the text (logical AND) or does at least one of the words need to be in the text (logical OR). Defaults to False (logical OR) condition_source/target = optional: supply the field and its value as a dict as a condition for analysis, e.g. {'topic':1} (defaults to None) days_before = days target is before source (e.g. 2); days_after = days target is after source (e.g. 2) -> either both or none should be supplied. Additionally, merge_weekend = True will merge articles published on Saturday and Sunday. threshold = threshold to determine at which point similarity is sufficient; if supplied only the rows who pass it are included in the dataset from_time, to_time = optional: specifying a date range to filter source and target articles. Supply the date in the yyyy-MM-dd format. to_csv = if True save the resulting data in a csv file - otherwise a pandas dataframe is returned destination = optional: where should the resulting datasets be saved? 
(defaults to 'comparisons' folder) to_pajek = if True save - in addition to csv/pickle - the result (source, target and similarity score) as pajek file to be used in the Infomap method (defaults to False) - not available in combination with days_before/days_after parameters filter_above = Words occuring in more than this fraction of all documents will be filtered filter_below = Words occuring in less than this absolute number of docments will be filtered """ now = time.localtime() logger.info( "The results of the similarity analysis could be inflated when not using the recommended text processing steps (stopword removal, punctuation removal, stemming) beforehand" ) # Load the pretrained model (different ways depending on how the model was saved) logger.info("Loading word embeddings...") try: softcosine_model = gensim.models.Word2Vec.load(path_to_model) except: softcosine_model = gensim.models.keyedvectors.KeyedVectors.load_word2vec_format( path_to_model, binary=True) logger.info("Done") # Construct source and target queries for elasticsearch if isinstance(source, list): # multiple doctypes source_query = { "query": { "bool": { "filter": { "bool": { "must": [{ "terms": { "doctype": source } }] } } } } } elif isinstance(source, str): # one doctype source_query = { "query": { "bool": { "filter": { "bool": { "must": [{ "term": { "doctype": source } }] } } } } } if isinstance(target, list): # multiple doctypes target_query = { "query": { "bool": { "filter": { "bool": { "must": [{ "terms": { "doctype": target } }] } } } } } elif isinstance(target, str): # one doctype target_query = { "query": { "bool": { "filter": { "bool": { "must": [{ "term": { "doctype": target } }] } } } } } # Change query if date range was specified source_range = {"range": {sourcedate: {}}} target_range = {"range": {targetdate: {}}} if from_time: source_range["range"][sourcedate].update({"gte": from_time}) target_range["range"][targetdate].update({"gte": from_time}) if to_time: source_range["range"][sourcedate].update({"lte": to_time}) target_range["range"][targetdate].update({"lte": to_time}) if from_time or to_time: source_query["query"]["bool"]["filter"]["bool"]["must"].append( source_range) target_query["query"]["bool"]["filter"]["bool"]["must"].append( target_range) # Change query if keywords were specified if isinstance(keyword_source, str) == True: source_query["query"]["bool"]["filter"]["bool"]["must"].append( {"term": { sourcetext: keyword_source }}) elif isinstance(keyword_source, list) == True: if keyword_source_must == True: for item in keyword_source: source_query["query"]["bool"]["filter"]["bool"][ "must"].append({"term": { sourcetext: item }}) elif keyword_source_must == False: source_query["query"]["bool"]["should"] = [] source_query["query"]["bool"]["minimum_should_match"] = 1 for item in keyword_source: source_query["query"]["bool"]["should"].append( {"term": { sourcetext: item }}) if isinstance(keyword_target, str) == True: target_query["query"]["bool"]["filter"]["bool"]["must"].append( {"term": { targettext: keyword_target }}) elif isinstance(keyword_target, list) == True: if keyword_target_must == True: for item in keyword_target: target_query["query"]["bool"]["filter"]["bool"][ "must"].append({"term": { targettext: item }}) elif keyword_target_must == False: target_query["query"]["bool"]["should"] = [] target_query["query"]["bool"]["minimum_should_match"] = 1 for item in keyword_target: target_query["query"]["bool"]["should"].append( {"term": { targettext: item }}) # Change query if condition_target 
or condition_source is specified if isinstance(condition_target, dict) == True: target_query["query"]["bool"]["filter"]["bool"]["must"].append( {"match": condition_target}) if isinstance(condition_source, dict) == True: source_query["query"]["bool"]["filter"]["bool"]["must"].append( {"match": condition_source}) # Retrieve source and target articles as generators source_query = scroll_query(source_query) target_query = scroll_query(target_query) # Make generators into lists and filter out those who do not have the specified keys (preventing KeyError) target_query = [ a for a in target_query if targettext in a["_source"].keys() and targetdate in a["_source"].keys() ] source_query = [ a for a in source_query if sourcetext in a["_source"].keys() and sourcedate in a["_source"].keys() ] # Target and source texts (split) target_text = [] for doc in target_query: target_text.append(doc["_source"][targettext].split()) source_text = [] for doc in source_query: source_text.append(doc["_source"][sourcetext].split()) logger.info("Preparing dictionary") dictionary = Dictionary(source_text + target_text) logger.info( "Removing all tokens that occur in less than {} documents or in more than {:.1f}% or all documents from dictionary" .format(filter_below, filter_above * 100)) dictionary.filter_extremes(no_below=filter_below, no_above=filter_above) logger.info("Preparing tfidf model") tfidf = TfidfModel(dictionary=dictionary) logger.info("Preparing soft cosine similarity matrix") similarity_matrix = softcosine_model.wv.similarity_matrix( dictionary, tfidf) # extract additional information from sources source_dates = [doc["_source"][sourcedate] for doc in source_query] source_ids = [doc["_id"] for doc in source_query] source_doctype = [doc["_source"]["doctype"] for doc in source_query] source_dict = dict(zip(source_ids, source_dates)) source_dict2 = dict(zip(source_ids, source_doctype)) # extract information from targets target_ids = [doc["_id"] for doc in target_query] target_dates = [doc["_source"][targetdate] for doc in target_query] target_dict = dict(zip(target_ids, target_dates)) target_doctype = [doc["_source"]["doctype"] for doc in target_query] target_dict2 = dict(zip(target_ids, target_doctype)) # If specified, comparisons compare docs within sliding date window if days_before != None or days_after != None: logger.info("Performing sliding window comparisons...") # merge queries including identifier key for i in source_query: i.update({"identifier": "source"}) for i in target_query: i.update({"identifier": "target"}) source_query.extend(target_query) # sourcedate and targetdate need to be the same key (bc everything is done for sourcedate) if targetdate is not sourcedate: logger.info( "Make sure that sourcedate and targetdate are the same key." 
) else: # convert dates into datetime objects for a in source_query: if isinstance(a["_source"][sourcedate], datetime.date) == True: pass # is already datetime object else: a["_source"][sourcedate] = [ int(i) for i in a["_source"][sourcedate][:10].split("-") ] a["_source"][sourcedate] = datetime.date( a["_source"][sourcedate][0], a["_source"][sourcedate][1], a["_source"][sourcedate][2], ) # sort query by date source_query.sort(key=lambda item: item["_source"][sourcedate]) # create list of all possible dates d1 = source_query[0]["_source"][sourcedate] d2 = source_query[-1]["_source"][sourcedate] delta = d2 - d1 date_list = [] for i in range(delta.days + 1): date_list.append(d1 + datetime.timedelta(i)) # create list of docs grouped by date (dates without docs are empty lists) grouped_query = [] for d in date_list: dt = [] for a in source_query: if a["_source"][sourcedate] == d: dt.append(a) grouped_query.append(dt) # Optional: merges saturday and sunday into one weekend group # Checks whether group is Sunday, then merge together with previous (saturday) group. if merge_weekend == True: grouped_query_new = [] for group in grouped_query: # if empty, append empty list if not group: grouped_query_new.append([]) # if group is sunday, extend previous (saturday) list, except when it is the first day in the data. elif group[0]["_source"][sourcedate].weekday() == 6: if not grouped_query_new: grouped_query_new.append(group) else: grouped_query_new[-1].extend(group) # for all other weekdays, append new list else: grouped_query_new.append(group) grouped_query = grouped_query_new # Sliding window starts here... How it works: # A sliding window cuts the documents into groups that should be compared to each other based on their publication dates. A list of source documents published on the reference date is created. For each of the target dates in the window, the source list is compared to the targets, the information is put in a dataframe, and the dataframe is added to a list. This process is repeated for each window. We end up with a list of dataframes, which are eventually merged together into one dataframe. len_window = days_before + days_after + 1 source_pos = ( days_before ) # source position is equivalent to days_before (e.g. 2 days before, means 3rd day is source with the index position [2]) n_window = 0 for e in tqdm(self.window(grouped_query, n=len_window)): n_window += 1 df_window = [] source_texts = [] source_ids = [] if not "source" in [ l2["identifier"] for l2 in e[source_pos] ]: pass else: for doc in e[source_pos]: try: if doc["identifier"] == "source": # create sourcetext list to compare against source_texts.append( doc["_source"][sourcetext].split()) # extract additional information source_ids.append(doc["_id"]) except: logger.error( "This does not seem to be a valid document" ) print(doc) # create index of source texts query = tfidf[[ dictionary.doc2bow(d) for d in source_texts ]] # iterate through targets for d in e: target_texts = [] target_ids = [] for doc in d: try: if doc["identifier"] == "target": target_texts.append( doc["_source"][targettext].split()) # extract additional information target_ids.append(doc["_id"]) except: logger.error( "This does not seem to be a valid document" ) print(doc) # do comparison if len(target_ids) == 0: logger.warning( "Empty list of target ids. Skipping comparisons." 
) continue index = SoftCosineSimilarity( tfidf[[ dictionary.doc2bow(d) for d in target_texts ]], similarity_matrix, ) try: sims = index[query] except: logger.warning( "There was a problem calculating the similarities, skipping this one" ) print(target_ids) sims = [] # make dataframe df_temp = (pd.DataFrame( sims, columns=target_ids, index=source_ids).stack().reset_index()) df_window.append(df_temp) df = pd.concat(df_window, ignore_index=True) df.columns = ["source", "target", "similarity"] df["source_date"] = df["source"].map(source_dict) df["target_date"] = df["target"].map(target_dict) df["source_doctype"] = df["source"].map(source_dict2) df["target_doctype"] = df["target"].map(target_dict2) # Optional: if threshold is specified if threshold: df = df.loc[df["similarity"] >= threshold] # Make exports folder if it does not exist yet if not os.path.exists(destination): os.mkdir(destination) # Optional: save as csv file if to_csv == True: df.to_csv( os.path.join( destination, r"INCA_softcosine_{source}_{target}_{now.tm_year}_{now.tm_mon}_{now.tm_mday}_{now.tm_hour}_{now.tm_min}_{now.tm_sec}_{n_window}.csv" .format( now=now, target=target, source=source, n_window=n_window, ), )) # Otherwise: save as pickle file else: df.to_pickle( os.path.join( destination, r"INCA_softcosine_{source}_{target}_{now.tm_year}_{now.tm_mon}_{now.tm_mday}_{now.tm_hour}_{now.tm_min}_{now.tm_sec}_{n_window}.pkl" .format( now=now, target=target, source=source, n_window=n_window, ), )) # Optional: save as pajek file not for days_before/days_after if to_pajek == True: logger.info( "Does not save as Pajek file with days_before/days_after because of the size of the files." ) # Same procedure as above, but without specifying a time frame (thus: comparing all sources to all targets) else: # Create index out of target texts logger.info("Preparing the index out of target texts...") index = SoftCosineSimilarity( tfidf[[dictionary.doc2bow(d) for d in target_text]], similarity_matrix) # Retrieve source IDs and make generator to compute similarities between each source and the index logger.info("Preparing the query out of source texts...") query = tfidf[[dictionary.doc2bow(d) for d in source_text]] query_generator = (item for item in query) # Retrieve similarities # Makes a separate dataframe for each source doc and saves this. 
logger.info("Starting comparisons...") i = 0 s_ids = 0 for doc in query_generator: i += 1 # count each round of comparisons # if doc is empty (which may happen due to pruning) # then we skip this comparison if len(doc) == 0: s_ids += 1 logger.info("Skipped one empty document") continue # make comparison sims = index[doc] # make dataframe df = pd.DataFrame([sims]).transpose() logger.debug("Created dataframe of shape {}".format(df.shape)) logger.debug("Length of target_id list: {}".format( len(target_ids))) df["target"] = target_ids df["source"] = source_ids[s_ids] df.columns = ["similarity", "target", "source"] df["source_date"] = df["source"].map(source_dict) df["target_date"] = df["target"].map(target_dict) df["source_doctype"] = df["source"].map(source_dict2) df["target_doctype"] = df["target"].map(target_dict2) df = df.set_index("source") # Optional: if threshold is specified if threshold: df = df.loc[df["similarity"] >= threshold] # Make exports folder if it does not exist yet if not "comparisons" in os.listdir("."): os.mkdir("comparisons") # Optional: save as csv file if to_csv == True: df.to_csv( os.path.join( destination, r"INCA_softcosine_{source}_{target}_{now.tm_year}_{now.tm_mon}_{now.tm_mday}_{now.tm_hour}_{now.tm_min}_{now.tm_sec}_{i}.csv" .format(now=now, target=target, source=source, i=i), )) # Otherwise: save as pickle file else: df.to_pickle( os.path.join( destination, r"INCA_softcosine_{source}_{target}_{now.tm_year}_{now.tm_mon}_{now.tm_mday}_{now.tm_hour}_{now.tm_min}_{now.tm_sec}_{i}.pkl" .format(now=now, target=target, source=source, i=i), )) # Optional: additionally save as pajek file if to_pajek == True: G = nx.Graph() # change int to str (necessary for pajek format) df["similarity"] = df["similarity"].apply(str) # change column name to 'weights' to faciliate later analysis df.rename({"similarity": "weight"}, axis=1, inplace=True) # notes and weights from dataframe G = nx.from_pandas_edgelist(df, source="source", target="target", edge_attr="weight") # write to pajek nx.write_pajek( G, os.path.join( destination, r"INCA_softcosine_{source}_{target}_{now.tm_year}_{now.tm_mon}_{now.tm_mday}_{now.tm_hour}_{now.tm_min}_{now.tm_sec}_{i}.net" .format(now=now, target=target, source=source, i=i), ), ) s_ids += 1 # move one doc down in source_ids logger.info("Done with source " + str(i) + " out of " + str(len(source_text)))
# Load the model: this is a big file, can take a while to download and open
glove = api.load("glove-wiki-gigaword-50")
similarity_index = WordEmbeddingSimilarityIndex(glove)

# Build the term dictionary, TF-idf model
dictionary = Dictionary(corpus + [query])
tfidf = TfidfModel(dictionary=dictionary)

# Create the term similarity matrix.
similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary, tfidf)

# Compute Soft Cosine Measure between the query and the documents.
# From: https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/soft_cosine_tutorial.ipynb
query_tf = tfidf[dictionary.doc2bow(query)]
index = SoftCosineSimilarity(
    tfidf[[dictionary.doc2bow(document) for document in corpus]],
    similarity_matrix)
doc_similarity_scores = index[query_tf]

# Output the sorted similarity scores and documents
sorted_indexes = np.argsort(doc_similarity_scores)[::-1]
results = []

if __name__ == "__main__":
    for idx in sorted_indexes:
        print(f'{idx} \t {doc_similarity_scores[idx]:0.3f} \t {documents[idx]}')
        # results.append(f'{idx} \t {doc_similarity_scores[idx]:0.3f} \t {documents[idx]}')
termsim_index = WordEmbeddingSimilarityIndex(model.wv)

# Create Corpus List
corpus_list = []
for data in dataset:
    docs = ""
    for sentence in data['gejala']:
        docs += " " + sentence
    corpus_list.append(docs)

# Create token list for all document corpus
corpus_list_token = [preprocess(doc) for doc in corpus_list]
dictionary = Dictionary(corpus_list_token)
bow_corpus = [dictionary.doc2bow(document) for document in corpus_list_token]

# Create Term similarity matrix
similarity_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary)

# Compute Soft Cosine Similarity
docsim_index = SoftCosineSimilarity(bow_corpus, similarity_matrix, num_best=10)

text = "Diare (10 hingga 12 kali per hari) Diare disertai darah.Kram pada perut.Buang air besar yang kental.Gas dalam perut.Gejala yang umum seperti demam, sakit punggung, dan lelah."
predict = predict_decease(text, docsim_index, dictionary)
print(predict)

# sentence_end = re.compile(r'''[.!?]['"]?\s{1,2}(?=)''')
# input_sentences = re.split(r"\.|\?|\!",text)
# print(docs_similarity(input_sentences, [sentence for sentence in predict['data']['gejala']], model))
def main():
    tfidf = None
    word2vec = None
    similarityMatrix = None
    browndict = {}
    corporadict = None
    word2vec = None

    choice = ""
    while choice != "exit":
        choice = ""
        while choice not in ["tfidf", "word2vec", "exit"]:
            choice = input("TF-IDF or Word2Vec? [TFIDF, Word2Vec, Exit]\n>").lower()
        if choice == "exit":
            break

        catType = ""
        while catType not in ["within", "between", "return"]:
            catType = input("Within or between clusters? [Within, Between, Return]\n>").lower()
        if catType == "return":
            break

        # get all of the words for each document per category
        texts = []
        if catType == "within":
            for c in brown.categories():
                words = NormalizeWords(brown.words(categories=c))
                texts.append(words)
                # build a dictionary for me to use later
                browndict[c] = words
        elif catType == "between":
            for c in brown.categories():
                words = NormalizeWords(brown.words(categories=c))
                texts.append(words[:len(words) // 2])
                texts.append(words[len(words) // 2:])
                # build a dictionary for me to use later
                browndict[c + "1/2"] = words[:len(words) // 2]
                browndict[c + "2/2"] = words[len(words) // 2:]

        # create the corpora dictionary built from gensim
        corporadict = corpora.Dictionary(texts)

        # create a corpus for the training
        corpus = []
        for line in texts:
            corpus.append(corporadict.doc2bow(line))

        if choice == "tfidf":
            # create the tfidf model from our built corpus
            tfidf = TfidfModel(corpus=corpus)
            # build the similarity matrix
            similarityMatrix = MatrixSimilarity(corpus, num_features=len(corporadict))
        elif choice == "word2vec":
            word2vec = Word2Vec(brown.sents())
            # build the term similarity matrix from our model's word-vectors
            termSimilarityIndex = WordEmbeddingSimilarityIndex(word2vec.wv)
            # build sparse similarity matrix
            sparseSimiliarityMatrix = SparseTermSimilarityMatrix(termSimilarityIndex, corporadict)
            # build similarity word-vector
            WV_SimilarityMatrix = SoftCosineSimilarity(corpus, sparseSimiliarityMatrix)

        maxes = {}
        if choice == "tfidf":
            # Print out the code
            keys = list(browndict.keys())
            for i in range(len(keys) - 1):
                # Convert to a bag of words and to a tfidf vector, then query it.
                query_bow = corporadict.doc2bow(browndict[keys[i]])
                query_tfidf = tfidf[query_bow]
                # Get the similarity of every cluster
                query_similarity = similarityMatrix[query_tfidf]
                for j in range(i + 1, len(query_similarity)):
                    sim = query_similarity[j]
                    print(keys[i], "and", keys[j], "have a similarity of:", sim)
                print("")
        elif choice == "word2vec":
            keys = list(browndict.keys())
            for i in range(len(keys) - 1):
                # Convert to a bag of words and query it
                query_bow = corporadict.doc2bow(browndict[keys[i]])
                # Get the similarity of every cluster
                query_similarity = WV_SimilarityMatrix[query_bow]
                for j in range(i + 1, len(query_similarity)):
                    sim = query_similarity[j]
                    print(keys[i], "and", keys[j], "have a similarity of:", sim)
                print("")
# Tail of the data-loading helper used below; the signature is inferred from the
# calls further down, and the code that builds `text` (the raw newsgroup posts) is
# not part of this excerpt.
def clean(cat, subset):
    # Read the data into a pandas dataframe
    df = pd.DataFrame([text.data]).T
    df['text'] = df[0]
    df = df[df['text'].map(type) == str]
    df.dropna(axis=0, inplace=True, subset=['text'])
    df = df.sample(frac=1.0)
    df.reset_index(drop=True, inplace=True)
    corpus = df['text'].apply(apply_all)  # preprocessed tokenized corpus
    bow = [dictionary.doc2bow(doc) for doc in corpus]  # transform into a gensim bow
    bow = [i for i in bow if len(i) > 0]  # remove empty lists
    return bow


train_ath = clean(cat='soc.religion.christian', subset='test')  # reference set
docsim_index = SoftCosineSimilarity(train_ath, similarity_matrix)  # SCM with ref set for later use

atheism_test = clean('soc.religion.christian', subset='test')  # query set 1: sim
windows_test = clean('comp.windows.x', subset='test')  # query set 2: diff

SCM_ath_ath = docsim_index[atheism_test]  # ref vs q1
SCM_ath_win = docsim_index[windows_test]  # ref vs q2
ath_ath = SCM_ath_ath.flatten()
ath_win = SCM_ath_win.flatten()

fig = plt.figure(1, figsize=(10, 5))
sns.distplot(ath_win, bins=50, color='red', label='Christian-WindowsX')  # atheism-windows
sns.distplot(ath_ath, bins=50, color='green', label='Christian-Christian')  # atheism-atheism
# dictionary, index and article_ids are used if past_behavior (SoftCosine) recommender is used
try:
    lda_model = gensim.models.LdaModel.load("put path to model here")
except:
    lda_model = None
try:
    lda_dict = gensim.corpora.Dictionary.load("/put path to dict here")
except:
    lda_dict = None
try:
    dictionary = gensim.corpora.Dictionary.load("put path to dict here")
except:
    dictionary = None
try:
    index = SoftCosineSimilarity.load('put path to index here')
except:
    index = None
try:
    article_ids = pickle.load(open('put path to article ids here', 'rb'))
except:
    article_ids = None

login.login_view = 'login'

if not app.debug:
    if app.config['MAIL_SERVER']:
        auth = None
        if app.config['MAIL_USERNAME'] or app.config['MAIL_PASSWORD']:
            auth = (app.config['MAIL_USERNAME'], app.config['MAIL_PASSWORD'])
        secure = None
        if app.config['MAIL_USE_TLS']:
# Tail of the bag-of-words helper used below; the signature is inferred from the calls further down.
def get_bow(corpus):
    # preprocessed tokenized corpus
    bow = [dictionary.doc2bow(doc) for doc in corpus]  # transform into a gensim bow
    bow = [i for i in bow if len(i) > 0]  # remove empty lists
    return bow

#%%
test1BOW = get_bow(df_test1['tokenized'])

train1 = df['Dominant_Topic'] == 1
dftrain1 = df[train1]
dftrain1.reset_index(drop=True, inplace=True)
train1BOW = get_bow(dftrain1['Text'])

#%%
testvstraining_1 = SoftCosineSimilarity(train1BOW, similarity_matrix)
scs_topic1 = testvstraining_1[test1BOW]

#%%
test4BOW = get_bow(df_test4['tokenized'])
scs_topic1vstopic4 = testvstraining_1[test4BOW]

#%%
fig = plt.figure(1, figsize=(10, 5))
sns.distplot(scs_topic1, bins=50, color='red', label='TrainingvsTest (Topic1)')  # atheism-windows
sns.distplot(scs_topic1vstopic4, bins=50, color='green', label='Topic1vsTopic4')  # atheism-atheism
def _preprocess_dataset(
        self, level: str
) -> Tuple[List[Document], List[Document], np.ndarray, List[Tuple[int, float]]]:
    LOGGER.info('Preprocessing {} ({})'.format(self.dataset, level))
    if level == 'validation':
        pivot = int(round(len(self.dataset.train_documents) * 0.8))
        train_documents = self.dataset.train_documents[:pivot]
        test_documents = self.dataset.train_documents[pivot:]
    elif level == 'test':
        train_documents = self.dataset.train_documents
        test_documents = self.dataset.test_documents
    else:
        message = 'Expected validation or test level, but got {}'
        raise ValueError(message.format(level))

    cache_path = self.model.cache_dir / 'text_classification'
    cache_path.mkdir(exist_ok=True)
    method_parameters = TEXT_CLASSIFICATION_METHOD_PARAMETERS[self.method]

    if self.method == 'scm':
        train_corpus = [document.words for document in train_documents]
        dictionary = Dictionary(train_corpus, prune_at=None)
        tfidf = TfidfModel(dictionary=dictionary, smartirs='nfn')
        termsim_index = WordEmbeddingSimilarityIndex(
            self.model.vectors, **method_parameters['similarity_index'])
        cache_path = cache_path / '{}-{}-{}-{}'.format(
            self.dataset.name, self.dataset.split_idx, self.method, level)
        try:
            similarity_matrix = SparseTermSimilarityMatrix.load(str(cache_path), mmap='r')
        except IOError:
            similarity_matrix = SparseTermSimilarityMatrix(
                termsim_index, dictionary, tfidf,
                **method_parameters['similarity_matrix'])
            similarity_matrix.save(str(cache_path))
        train_corpus = [dictionary.doc2bow(document) for document in train_corpus]
        train_corpus = tfidf[train_corpus]
        similarity_model = SoftCosineSimilarity(train_corpus, similarity_matrix)
        test_corpus = (document.words for document in test_documents)
        test_corpus = [dictionary.doc2bow(document) for document in test_corpus]
        test_corpus = tfidf[test_corpus]
    elif self.method == 'wmd':
        train_corpus = [document.words for document in train_documents]
        cache_path = cache_path / '{}-{}'.format(self.dataset.name, self.method)
        cache_path = cache_path.with_suffix('.shelf')
        similarity_model = ParallelCachingWmdSimilarity(
            train_corpus, self.model.vectors, cache_path)
        test_corpus = [document.words for document in test_documents]
    else:
        message = 'Preprocessing for method {} not yet implemented'.format(self.method)
        raise ValueError(message)

    with np.errstate(all='ignore'):
        similarities = similarity_model[test_corpus]

    expected_shape = (len(test_documents), len(train_documents))
    if similarities.shape != expected_shape:
        message = 'Expected similarities with shape {}, but received shape {}'
        raise ValueError(message.format(expected_shape, similarities.shape))

    return (train_documents, test_documents, similarities, test_corpus)
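
# Sketch of the downstream step such a similarities matrix is typically computed for:
# k-nearest-neighbour text classification, where each test document takes the majority
# label of its k most similar training documents. Pure numpy; the toy scores, labels
# and k are illustrative, and `similarities` is assumed to have shape
# (n_test, n_train) as checked at the end of _preprocess_dataset() above.
from collections import Counter
import numpy as np

def knn_predict(similarities, train_labels, k=3):
    predictions = []
    for row in similarities:                  # one row of scores per test document
        top_k = np.argsort(row)[::-1][:k]     # indices of the k most similar training docs
        votes = Counter(train_labels[i] for i in top_k)
        predictions.append(votes.most_common(1)[0][0])
    return predictions

similarities = np.array([[0.9, 0.1, 0.2],
                         [0.2, 0.8, 0.7]])    # 2 test docs vs. 3 training docs
train_labels = ["sports", "politics", "politics"]
print(knn_predict(similarities, train_labels, k=1))  # ['sports', 'politics']
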