def clean_compute_similarity(d1, d2):
    """Return the soft cosine similarity between two raw text documents.

    Both texts are passed through ``remove_stopwords`` and split on
    whitespace. A dictionary is built from just these two token lists, a
    term similarity matrix is derived from ``fasttext_model300`` for that
    dictionary, and the two bag-of-words vectors are compared.

    :param d1: first document, as a string.
    :param d2: second document, as a string.
    :return: the value returned by ``softcossim`` for the two documents.
    """
    tokens_a = remove_stopwords(d1).split()
    tokens_b = remove_stopwords(d2).split()

    # The vocabulary only contains the words of the two documents compared.
    vocabulary = corpora.Dictionary([tokens_a, tokens_b])
    term_similarities = fasttext_model300.similarity_matrix(
        vocabulary,
        tfidf=None,
        threshold=0.0,
        exponent=2.0,
        nonzero_limit=100,
    )

    # doc2bow turns each token list into sparse (word id, count) pairs.
    bow_a = vocabulary.doc2bow(tokens_a)
    bow_b = vocabulary.doc2bow(tokens_b)
    return softcossim(bow_a, bow_b, term_similarities)
def soft_cosine_similarity(data_source, data_target, max_sources=1000):
    """Print, for a sample of source sentences, the closest target sentence.

    Every sentence is stripped, lower-cased and split on whitespace. The
    source sentences are shuffled, and for each of the first
    ``max_sources`` of them the target sentence with the highest soft
    cosine similarity is reported. A summary (number of scores equal to 1,
    the list of best scores and their average) is printed at the end.

    :param data_source: iterable of source sentences (strings).
    :param data_target: iterable of target sentences (strings).
    :param max_sources: upper bound on how many shuffled source sentences
        are evaluated; ``None`` evaluates all of them. Previously the loop
        always ran 1000 times and raised ``IndexError`` on smaller inputs.
    :raises ValueError: if ``data_target`` is empty.
    :return: None; all results are printed.
    """
    source_tokens = [s.strip().lower().split() for s in data_source]
    target_tokens = [s.strip().lower().split() for s in data_target]
    if not target_tokens:
        # Fail before the expensive matrix construction instead of crashing
        # later when the closest target sentence is looked up.
        raise ValueError('data_target must contain at least one sentence')

    # Only the local token list is shuffled; the caller's data is untouched.
    random.shuffle(source_tokens)

    # The vocabulary has to cover both sides so every word gets an id.
    dictionary = corpora.Dictionary(source_tokens + target_tokens)
    print('Making similarity matrix')
    similarity_matrix = fasttext_model300.similarity_matrix(
        dictionary,
        tfidf=None,
        threshold=0.0,
        exponent=2.0,
        nonzero_limit=100,
    )

    print('Processing target data')
    target_bow_list = []
    for k, target_sent in enumerate(target_tokens):
        if k % 100 == 0:
            print(k)
        target_bow_list.append(dictionary.doc2bow(target_sent))

    if max_sources is None:
        limit = len(source_tokens)
    else:
        limit = min(max_sources, len(source_tokens))

    overlap_list = []
    same_counter = 0
    for i in range(limit):
        print('Iteration ', i)
        max_overlap = -100
        overlapper = 0
        source_bow = dictionary.doc2bow(source_tokens[i])
        for j, target_bow in enumerate(target_bow_list):
            distance = softcossim(source_bow, target_bow, similarity_matrix)
            # NOTE(review): exact float comparison kept from the original.
            if distance == 1:
                same_counter += 1
            if distance > max_overlap:
                max_overlap = distance
                overlapper = j
        overlap_list.append(max_overlap)
        print('Source: ', source_tokens[i])
        print('Closest sentence: ', target_tokens[overlapper])
        print('Overlap: ', max_overlap)

    print('Perfect matches: ', same_counter)
    if not overlap_list:
        # Nothing was evaluated, so there is no average to report.
        print(overlap_list)
        return
    avg_overlap = sum(overlap_list) / len(overlap_list)
    print(overlap_list)
    print('Average distance: ', avg_overlap)
def test_distributions(self):
    """Check ``softcossim`` on two bag-of-words vectors that share one term."""
    # Terms 0 and 1 ("hello" / "hi") are half-similar; term 2 ("world")
    # is only similar to itself.
    term_similarities = csc_matrix([[1, 0.5, 0], [0.5, 1, 0], [0, 0, 1]])
    hello_world = [(0, 1.0), (2, 1.0)]
    hi_world = [(1, 1.0), (2, 1.0)]

    actual = matutils.softcossim(hello_world, hi_world, term_similarities)

    self.assertAlmostEqual(0.75, actual)
def soft_cosine_similarity_matrix(sentences):
    """Build the pairwise soft cosine similarity table for ``sentences``.

    Cell ``(row, col)`` holds ``softcossim(sentences[row], sentences[col],
    similarity_matrix)`` rounded to two decimals. ``similarity_matrix`` is
    read from the enclosing (module) scope.

    :param sentences: indexable collection of bag-of-words vectors.
    :return: a ``pandas.DataFrame`` with one row and one column per sentence.
    """
    size = len(sentences)
    rows = []
    for row in range(size):
        scores = []
        for col in range(size):
            score = softcossim(sentences[row], sentences[col], similarity_matrix)
            scores.append(round(score, 2))
        rows.append(scores)
    return pd.DataFrame(rows)
def get_category_seed_similarity(self, sentence, seeds, similarity_matrix):
    """Return the mean soft cosine similarity between a sentence and seeds.

    Each seed word is converted to a one-word bag-of-words vector with
    ``self.dictionary`` and compared against the sentence vector.

    :param sentence: tokenised sentence (list of words).
    :param seeds: sized collection of seed words for one category.
    :param similarity_matrix: term similarity matrix passed to ``softcossim``.
    :return: the average of the per-seed similarities, or ``0.0`` when
        ``seeds`` is empty (previously a ``ZeroDivisionError``).
    """
    length = len(seeds)
    if length == 0:
        # No seeds means nothing to average.
        return 0.0

    sentence_d2b = self.dictionary.doc2bow(sentence)
    total = sum(
        softcossim(sentence_d2b, self.dictionary.doc2bow([word]), similarity_matrix)
        for word in seeds
    )
    return total / length
def test_distributions(self):
    """Check ``softcossim`` on two bag-of-words vectors that share one term."""
    # Terms 0 and 1 ("hello" / "hi") are half-similar; term 2 ("world")
    # is only similar to itself.
    term_similarities = csc_matrix([[1, 0.5, 0], [0.5, 1, 0], [0, 0, 1]])
    hello_world = [(0, 1.0), (2, 1.0)]
    hi_world = [(1, 1.0), (2, 1.0)]

    actual = matutils.softcossim(hello_world, hi_world, term_similarities)

    self.assertAlmostEqual(0.75, actual)
def test_inputs(self):
    """Check ``softcossim`` on empty vectors and on matrix input formats."""
    first, second = [], []

    # Empty documents score 0.0 with both a CSC and a CSR similarity matrix.
    for sparse_format in (csc_matrix, csr_matrix):
        empty_matrix = sparse_format((0, 0))
        score = matutils.softcossim(first, second, empty_matrix)
        self.assertEqual(0.0, score)

    # A similarity matrix that is not in a supported sparse format is rejected.
    with self.assertRaises(ValueError):
        matutils.softcossim(first, second, np.matrix([]))
def test_inputs(self):
    """Check ``softcossim`` on empty vectors and on matrix input formats."""
    first, second = [], []

    # Empty documents score 0.0 with both a CSC and a CSR similarity matrix.
    for sparse_format in (csc_matrix, csr_matrix):
        empty_matrix = sparse_format((0, 0))
        score = matutils.softcossim(first, second, empty_matrix)
        self.assertEqual(0.0, score)

    # A similarity matrix that is not in a supported sparse format is rejected.
    with self.assertRaises(ValueError):
        matutils.softcossim(first, second, np.matrix([]))
def create_soft_cossim_matrix(sentences, title, des):
    """Rank every sentence by its soft cosine similarity to the first one.

    A single-column table is built in which row ``r`` holds
    ``softcossim(sentences[r], sentences[0], similarity_matrix)`` rounded to
    two decimals; the rows are then sorted by that column in descending
    order. ``similarity_matrix`` is read from the enclosing (module) scope.

    :param sentences: indexable collection of bag-of-words vectors.
    :param title: unused by this function.
    :param des: unused by this function.
    :return: a ``pandas.DataFrame`` with one column (``0``), best match first.
    """
    scores = []
    for row in range(len(sentences)):
        score = softcossim(sentences[row], sentences[0], similarity_matrix)
        scores.append([round(score, 2)])

    ranked = pd.DataFrame(scores).sort_values(by=0, ascending=False)
    return pd.DataFrame(ranked)
def get_similarity(self, first_document, second_document):
    """Return the soft cosine similarity of two documents.

    Both documents are tokenised with ``simple_preprocess``; the dictionary
    and the term similarity matrix are built from these two documents only.

    :param first_document: first document, as a string.
    :param second_document: second document, as a string.
    :return: the value returned by ``softcossim`` for the two documents.
    """
    first_tokens = simple_preprocess(first_document)
    second_tokens = simple_preprocess(second_document)

    vocabulary = corpora.Dictionary([first_tokens, second_tokens])
    # The double-underscore attribute is subject to Python name mangling
    # when this method lives inside a class body, so it is left untouched.
    term_similarities = self.__fasttext_model.similarity_matrix(
        vocabulary,
        tfidf=None,
        threshold=0.0,
        exponent=2.0,
        nonzero_limit=100,
    )

    first_bow = vocabulary.doc2bow(first_tokens)
    second_bow = vocabulary.doc2bow(second_tokens)
    return softcossim(first_bow, second_bow, term_similarities)
def is_duplicate(text1, text2, threshold):
    """Decide whether two texts are duplicates of each other.

    Both texts are preprocessed, converted to bag-of-words vectors with the
    module-level ``dictionary`` and compared using the module-level
    ``similarity_matrix``.

    :param text1: first raw text.
    :param text2: second raw text.
    :param threshold: minimum soft cosine similarity that counts as duplicate.
    :return: ``True`` when the similarity is at least ``threshold``.
    """
    # Use the arguments themselves. The previous version read
    # data['text1'] / data['text2'] from a global and ignored its inputs.
    preprocessed_text1 = preprocessor.preprocess_text(text1)
    preprocessed_text2 = preprocessor.preprocess_text(text2)

    bow1 = dictionary.doc2bow(preprocessed_text1)
    bow2 = dictionary.doc2bow(preprocessed_text2)

    softcossim_similarity = softcossim(bow1, bow2, similarity_matrix)
    return softcossim_similarity >= threshold
def soft_cosine(tokens, stem=False, lemma=False):
    """
    Score each pair of consecutive paragraphs with soft cosine similarity.

    Every entry of ``tokens`` is joined into one string, and each paragraph
    is compared with the one that follows it using fasttext-based term
    similarities. The scores are printed, collected in a dataframe and
    optionally written to CSV.

    :param tokens: collection of token lists supporting ``.apply`` and
        integer indexing (presumably a pandas Series; confirm with caller)
    :param stem: if true, the scores are saved to
        ``softcosine_stem_output15.csv``
    :param lemma: if true, the scores are saved to
        ``softcosine_lemma_output15.csv``
    :return: none
    """
    paragraphs = tokens.apply(' '.join)

    scores = []
    left_labels = []
    right_labels = []
    for position in range(len(paragraphs) - 1):
        first_words = simple_preprocess(paragraphs[position])
        second_words = simple_preprocess(paragraphs[position + 1])

        # Vocabulary and term similarities are rebuilt for every pair.
        vocabulary = corpora.Dictionary([first_words, second_words])
        term_similarities = fasttext_model300.similarity_matrix(
            vocabulary,
            tfidf=None,
            threshold=0.0,
            exponent=2.0,
            nonzero_limit=100,
        )

        score = softcossim(
            vocabulary.doc2bow(first_words),
            vocabulary.doc2bow(second_words),
            term_similarities,
        )
        print(score)

        scores.append(score)
        # Paragraph numbers are reported 1-based.
        left_labels.append(str(position + 1))
        right_labels.append(str(position + 2))

    df_softcos = pd.DataFrame(scores, columns=['Soft cosine'])
    df_softcos['token 1'] = left_labels
    df_softcos['token 2'] = right_labels
    print(df_softcos)

    if stem:
        df_softcos.to_csv(r'softcosine_stem_output15.csv', index=None, header=True)
    if lemma:
        df_softcos.to_csv(r'softcosine_lemma_output15.csv', index=None, header=True)
def cosine_similarity(reference, hypothesis, model):
    """Return the soft cosine similarity between a hypothesis and a reference.

    Both strings are split on whitespace. The term similarity matrix comes
    from the embedding model stored under ``model`` in ``emb_models``.

    :param reference: reference text, as a string.
    :param hypothesis: hypothesis text, as a string.
    :param model: key selecting an entry of ``emb_models``.
    :return: the value returned by ``softcossim``.
    """
    reference_tokens = reference.split()
    hypothesis_tokens = hypothesis.split()

    vocabulary = corpora.Dictionary([hypothesis_tokens, reference_tokens])
    term_similarities = emb_models[model].similarity_matrix(vocabulary)

    hypothesis_bow = vocabulary.doc2bow(hypothesis_tokens)
    reference_bow = vocabulary.doc2bow(reference_tokens)
    return softcossim(hypothesis_bow, reference_bow, term_similarities)
def soft_cosine_similarity(text_1, text_2, corpus):
    """Return the soft cosine similarity of two tokenised texts.

    A dictionary is built from ``corpus`` and a Word2Vec model is trained on
    the same corpus for every call; its vectors are turned into the
    similarity matrix handed to ``softcossim``.

    :param text_1: first tokenised text.
    :param text_2: second tokenised text.
    :param corpus: tokenised documents used for the dictionary and training.
    :return: the value returned by ``softcossim``.
    """
    vocabulary = Dictionary(corpus)
    bow_1 = vocabulary.doc2bow(text_1)
    bow_2 = vocabulary.doc2bow(text_2)

    embedding_model = Word2Vec(
        corpus, workers=cpu_count(), min_count=1, size=300, seed=12345)

    # NOTE(review): this wraps a MatrixSimilarity index in csr_matrix, and
    # nothing here aligns Word2Vec's word order with the dictionary's term
    # ids. Verify this yields a valid term similarity matrix.
    embedding_corpus = Dense2Corpus(embedding_model.wv.syn0.T)
    term_similarities = sparse.csr_matrix(MatrixSimilarity(embedding_corpus))

    return softcossim(bow_1, bow_2, term_similarities)
def deriveSoftCosineSimilarityMatrix(allDict, limit=None,
                                     weName="glove-wiki-gigaword-50",
                                     simThreshold=0.3):
    """Build a labelled pairwise soft cosine similarity table of documents.

    Documents and their ids come from ``getDocList`` (called with the custom
    stop word list and ``with_ids=True``). The term similarity matrix is
    built from the embedding model named ``weName``, weighted by a TF-IDF
    model fitted on the bag-of-words vectors.

    :param allDict: document source forwarded to ``getDocList``.
    :param limit: document limit forwarded to ``getDocList``.
    :param weName: embedding name forwarded to ``getWordEmbeddingModel``.
    :param simThreshold: ``threshold`` passed to ``similarity_matrix``.
    :return: ``pandas.DataFrame`` indexed by document id on both axes, with
        similarities rounded to two decimals.
    """
    ids = []
    documents = []
    # Single pass, in case getDocList returns a one-shot iterator.
    for doc_id, text in getDocList(allDict, limit,
                                   stop_list=getCustomStopWords(),
                                   with_ids=True):
        documents.append(text)
        ids.append(doc_id)

    model = getWordEmbeddingModel(weName=weName)

    # Word ids for every word occurring in any document.
    dictionary = corpora.Dictionary(
        [simple_preprocess(text) for text in documents])

    # One bag-of-words vector per document.
    sentences = [dictionary.doc2bow(simple_preprocess(text))
                 for text in documents]

    tf_idf = models.TfidfModel(sentences)

    similarity_matrix = model.similarity_matrix(
        dictionary,
        tfidf=tf_idf,
        threshold=simThreshold,
        exponent=2.0,
        nonzero_limit=100,
    )

    # Cell (row, col) compares document `row` with document `col`.
    size = len(sentences)
    theMatrix = []
    for row in range(size):
        theMatrix.append([
            round(softcossim(sentences[row], sentences[col], similarity_matrix), 2)
            for col in range(size)
        ])

    return pd.DataFrame(theMatrix, index=ids, columns=ids)
def similarity(path1, path2):
    '''
    Return the cosine and soft cosine similarities of two text files.

    Newlines are removed from both files. The plain cosine similarity is
    computed on token counts with English stop words dropped; the soft
    cosine similarity uses fasttext-based term similarities over a
    dictionary built from the two files. The dictionary is printed.

    :param path1: path of the first text file.
    :param path2: path of the second text file.
    :return: list of two strings, ``'Cosine Similarity: ...'`` and
        ``'Soft Cosine Similarity: ...'``.
    '''
    # Context managers close the files; they were previously left open.
    with open(path1) as first_file:
        one = first_file.read().replace('\n', '')
    with open(path2) as second_file:
        two = second_file.read().replace('\n', '')
    lst = [one, two]

    # Token counts with English stop words removed.
    vectorizer = CountVectorizer(stop_words='english')
    sparse_matrix = vectorizer.fit_transform(lst)
    dense_matrix = sparse_matrix.todense()
    # NOTE(review): get_feature_names() was removed in scikit-learn 1.2
    # (replaced by get_feature_names_out()); confirm the pinned version.
    df = pd.DataFrame(dense_matrix,
                      columns=vectorizer.get_feature_names(),
                      index=['1', '2'])
    cos_similarity = cosine_similarity(df, df)[0][1]

    # Tokenise once and reuse for both the dictionary and the vectors.
    tokens_one = simple_preprocess(one)
    tokens_two = simple_preprocess(two)
    dictionary = corpora.Dictionary([tokens_one, tokens_two])
    print(dictionary)

    similarity_matrix = fasttext_model300.similarity_matrix(
        dictionary,
        tfidf=None,
        threshold=0.0,
        exponent=2.0,
        nonzero_limit=100,
    )

    bow_one = dictionary.doc2bow(tokens_one)
    bow_two = dictionary.doc2bow(tokens_two)
    soft_similarity = softcossim(bow_one, bow_two, similarity_matrix)

    return [
        'Cosine Similarity: ' + str(cos_similarity),
        'Soft Cosine Similarity: ' + str(soft_similarity)
    ]
def compare_sentences(sentence1, sentence2, model=word2vec_model300):
    """Return the soft cosine similarity of two whitespace-split sentences.

    :param sentence1: first sentence, as a string.
    :param sentence2: second sentence, as a string.
    :param model: embedding model providing ``similarity_matrix``; defaults
        to ``word2vec_model300`` as bound when the function is defined.
    :return: the value returned by ``softcossim``.
    """
    words_1 = sentence1.split()
    words_2 = sentence2.split()

    vocabulary = corpora.Dictionary([words_1, words_2])
    bow_1 = vocabulary.doc2bow(words_1)
    bow_2 = vocabulary.doc2bow(words_2)

    term_similarities = model.similarity_matrix(
        vocabulary,
        tfidf=None,
        threshold=0.0,
        exponent=2.0,
        nonzero_limit=100,
    )
    return softcossim(bow_1, bow_2, term_similarities)
def make_dist_vec(vector, doc_vectors, similarity_matrix, length=5):
    """Return the highest soft cosine similarities between a vector and others.

    Entries of ``doc_vectors`` that compare equal to ``vector`` are skipped.
    The remaining similarities are sorted in descending order and the first
    ``length + 1`` of them are returned.

    :param vector: bag-of-words vector to compare from.
    :param doc_vectors: iterable of bag-of-words vectors to compare against.
    :param similarity_matrix: term similarity matrix passed to ``softcossim``.
    :param length: the result holds at most ``length + 1`` scores.
    :return: list of similarity scores, largest first.
    """
    scores = [
        softcossim(vector, candidate, similarity_matrix)
        for candidate in doc_vectors
        if not vector == candidate
    ]
    ranked = sorted(scores, reverse=True)
    return ranked[:length + 1]
def softCosineSimilarityTest(numtestdocs=20, weName="glove-wiki-gigaword-50"):
    """Build a labelled pairwise soft cosine similarity table for test docs.

    :param numtestdocs: ``limit`` passed to ``getDocList``.
    :param weName: embedding name forwarded to ``getWordEmbeddingModel``.
    :return: ``pandas.DataFrame`` of similarities rounded to two decimals.
        Rows and columns are labelled with the document position followed by
        the first 15 characters of the document and a tab.
    """
    documents = getDocList(limit=numtestdocs)
    model = getWordEmbeddingModel(weName=weName)

    # Word ids for every word occurring in any document.
    dictionary = corpora.Dictionary(
        [simple_preprocess(text) for text in documents])

    similarity_matrix = model.similarity_matrix(
        dictionary,
        tfidf=None,
        threshold=0.0,
        exponent=2.0,
        nonzero_limit=100,
    )

    # One bag-of-words vector per document.
    sentences = [dictionary.doc2bow(simple_preprocess(text))
                 for text in documents]

    # Cell (row, col) compares document `row` with document `col`.
    size = len(sentences)
    theMatrix = []
    for row in range(size):
        theMatrix.append([
            round(softcossim(sentences[row], sentences[col], similarity_matrix), 2)
            for col in range(size)
        ])

    # Labels identifying rows and columns.
    names = [str(position) + " " + text[:15] + "\t"
             for position, text in enumerate(documents)]

    return pd.DataFrame(theMatrix, index=names, columns=names)
def get_similarities(self, query):
    """Get similarity between `query` and current index instance.

    Warnings
    --------
    Do not use this function directly; use the self[query] syntax instead.

    Parameters
    ----------
    query : {list of (int, number), iterable of list of (int, number), :class:`numpy.ndarray`}
        Document, collection of documents, or array of positions in
        `self.corpus`.

    Return
    ------
    :class:`numpy.ndarray`
        Similarities to every corpus document: one row per query document,
        or a single row when there is exactly one query document.

    """
    if isinstance(query, numpy.ndarray):
        # An array holds corpus positions; swap them for the documents.
        query = [self.corpus[index] for index in query]

    # Anything that is not a non-empty collection whose first item is a
    # list is treated as one single document.
    is_batch = bool(query) and isinstance(query[0], list)
    queries = query if is_batch else [query]

    rows = []
    for query_document in queries:
        scores = [
            matutils.softcossim(document, query_document, self.similarity_matrix)
            for document in self.corpus
        ]
        rows.append(numpy.array(scores))

    if len(rows) == 1:
        return rows[0]
    return numpy.array(rows)
def get_similarities(self, query):
    """Get similarity between `query` and this index.

    Warnings
    --------
    Do not use this function directly; use the `self[query]` syntax instead.

    Parameters
    ----------
    query : {list of (int, number), iterable of list of (int, number)}
        Document or collection of documents.

    Return
    ------
    :class:`numpy.ndarray`
        Similarities to every corpus document: one row per query document
        when `query` is a corpus, otherwise the row of the first query
        document.

    """
    is_corpus, query = utils.is_corpus(query)

    if is_corpus:
        query_documents = query
    elif isinstance(query, numpy.ndarray):
        # An array holds corpus positions; swap them for the documents.
        query_documents = [self.corpus[index] for index in query]
    else:
        query_documents = [query]

    rows = []
    for query_document in query_documents:
        scores = [
            matutils.softcossim(query_document, corpus_document, self.similarity_matrix)
            for corpus_document in self.corpus
        ]
        rows.append(numpy.array(scores))

    return numpy.array(rows) if is_corpus else rows[0]
def similarity(quest, faq=faq):
    """Return the FAQ entry most similar to a question, and its position.

    The FAQ entries are cleaned once, turned into bag-of-words vectors and
    compared with the cleaned question by soft cosine similarity, using
    term similarities from ``w2v_model``.

    :param quest: the user question.
    :param faq: indexable collection of FAQ entries; defaults to the
        module-level ``faq`` as bound when the function is defined.
    :return: tuple ``(faq_entry, index)`` of the best match. On ties the
        first entry with the highest score wins.
    :raises IndexError: if ``faq`` yields no entries.
    """
    # Clean the FAQ and build the dictionary and corpus once. The earlier
    # version repeated this work and discarded the first result.
    faq_tokens = cleaner(faq)
    dictionary = corpora.Dictionary(faq_tokens)
    corpus = [dictionary.doc2bow(entry) for entry in faq_tokens]

    similarity_matrix = w2v_model.similarity_matrix(dictionary)

    # The question vector does not change between comparisons.
    question_bow = dictionary.doc2bow(cleanq(quest))
    similarities = [
        softcossim(question_bow, entry_bow, similarity_matrix)
        for entry_bow in corpus
    ]
    if not similarities:
        # Same exception type the original raised on an empty FAQ.
        raise IndexError('faq is empty; there is no entry to match against')

    best_index = similarities.index(max(similarities))
    return (faq[best_index], best_index)
def get_similarities(self, query):
    """Get similarity between `query` and current index instance.

    Warnings
    --------
    Do not use this function directly; use the self[query] syntax instead.

    Parameters
    ----------
    query : {list of (int, number), iterable of list of (int, number), :class:`numpy.ndarray`}
        Document, collection of documents, or array of positions in
        `self.corpus`.

    Return
    ------
    :class:`numpy.ndarray`
        Similarities to every corpus document: one row per query document,
        or a single row when there is exactly one query document.

    """
    if isinstance(query, numpy.ndarray):
        # An array holds corpus positions; swap them for the documents.
        query = [self.corpus[index] for index in query]

    # Anything that is not a non-empty collection whose first item is a
    # list is treated as one single document.
    is_batch = bool(query) and isinstance(query[0], list)
    queries = query if is_batch else [query]

    rows = []
    for query_document in queries:
        scores = [
            matutils.softcossim(document, query_document, self.similarity_matrix)
            for document in self.corpus
        ]
        rows.append(numpy.array(scores))

    if len(rows) == 1:
        return rows[0]
    return numpy.array(rows)
def get_similarities(self, query):
    """Get similarity between `query` and this index.

    Warnings
    --------
    Do not use this function directly; use the `self[query]` syntax instead.

    Parameters
    ----------
    query : {list of (int, number), iterable of list of (int, number)}
        Document or collection of documents.

    Return
    ------
    :class:`numpy.ndarray`
        Similarities to every corpus document: one row per query document
        when `query` is a corpus, otherwise the row of the first query
        document.

    """
    is_corpus, query = utils.is_corpus(query)

    if is_corpus:
        query_documents = query
    elif isinstance(query, numpy.ndarray):
        # An array holds corpus positions; swap them for the documents.
        query_documents = [self.corpus[index] for index in query]
    else:
        query_documents = [query]

    rows = []
    for query_document in query_documents:
        scores = [
            matutils.softcossim(query_document, corpus_document, self.similarity_matrix)
            for corpus_document in self.corpus
        ]
        rows.append(numpy.array(scores))

    return numpy.array(rows) if is_corpus else rows[0]
def softCosine(self, model, documents):
    """
    Returns the soft cosine similarity of the first two documents.

    The dictionary and the term similarity matrix are built from all of
    `documents`; only the first two bag-of-words vectors are compared.

    Credit and additional information:
    https://www.machinelearningplus.com/nlp/cosine-similarity/

    @param model: A set of pretrained word embeddings, such as
        GoogleNews-vectors-negative300.bin.
    @param documents: A size 2 array of strings.
        Example: ['This is a short sentence.', 'One. Two sentences here.']
    """
    # Word ids for every word of the given documents.
    vocabulary = corpora.Dictionary(
        [simple_preprocess(text) for text in documents])

    term_similarities = model.similarity_matrix(
        vocabulary,
        tfidf=None,
        threshold=0.0,
        exponent=2.0,
        nonzero_limit=100,
    )

    # One bag-of-words vector per document.
    bows = [vocabulary.doc2bow(simple_preprocess(text)) for text in documents]

    return softcossim(bows[0], bows[1], term_similarities)
# Read a document from standard input and infer its vector with the model.
print("Model loaded Input the document")
s = input()
s = s.lower()
test_data = word_tokenize(s)
v1 = model.infer_vector(test_data)
print("V1_infer", v1)

# Find the entry of `data` with the highest soft cosine similarity to the
# input. The query vector is built once, and each document is scored once;
# previously both were recomputed inside the comparison and the assignment.
# NOTE: `max` shadows the builtin; the name is kept because it is a
# module-level binding that code outside this fragment may read.
query_bow = dictionary.doc2bow(s.split())
max = 0
idx = 0
for position, d in enumerate(data):
    score = softcossim(query_bow,
                       dictionary.doc2bow(d.lower().split()),
                       similarity_matrix)
    # Strict comparison: on ties the earliest document wins.
    if score > max:
        max = score
        idx = position
print(max)
print(data[idx])

# to find most similar doc using tags
#similar_doc = model.docvecs.most_similar('1')
#print("work")
#print(similar_doc)
corpus = [dictionary.doc2bow(document) for document in documents] # Convert the sentences into bag-of-words vectors. question1 = dictionary.doc2bow(question1) question2 = dictionary.doc2bow(question2) question3 = dictionary.doc2bow(question3) question4 = dictionary.doc2bow(question4) import gensim.downloader as api w2v_model = api.load("glove-wiki-gigaword-50") similarity_matrix = w2v_model.similarity_matrix(dictionary) from gensim.matutils import softcossim similarity = softcossim(question1, question2, similarity_matrix) print('similarity = %.4f' % similarity) """The similarity for the 1st pair is relative large, this means soft cosine thinks these two sentence are very similar.""" similarity = softcossim(question3, question4, similarity_matrix) print('similarity = %.4f' % similarity) """On the other hand, the similarity for the 2nd pair is very small, this means soft cosine thinks this pair are not similar. ### FuzzyWuzzy We have covered some basics on Fuzzy String Matching in Python, let's have a quick peak on whether FuzzyWuzzy can help with our question dedupe problem. """ from fuzzywuzzy import fuzz
exponent=2.0, nonzero_limit=100) # Convert sentences into bag-of-words vectors. sentence_1 = dictionary.doc2bow(sentence_1) sentence_2 = dictionary.doc2bow(sentence_2) sentence_3 = dictionary.doc2bow(sentence_3) sentence_4 = dictionary.doc2bow(sentence_4) print(sentence_1) print(sentence_3) print(sentence_3) print(sentence_4) # Soft cosine similarity print(softcossim(sentence_1, sentence_2, similarity_matrix)) print(softcossim(sentence_1, sentence_3, similarity_matrix)) print(softcossim(sentence_2, sentence_3, similarity_matrix)) print(softcossim(sentence_2, sentence_4, similarity_matrix)) # In[21]: print(dataset['SYNONYM_VALUE']) # In[86]: # Testing Gensim with the actual KOIOS data # Step 1 - Clean data (Removing stopwords and punctuation) from gensim.parsing.preprocessing import remove_stopwords from gensim.utils import simple_preprocess
# Term similarities for the existing dictionary, from the fasttext model.
similarity_matrix = fasttext_model300.similarity_matrix(
    dictionary,
    tfidf=None,
    threshold=0.0,
    exponent=2.0,
    nonzero_limit=100,
)

# Convert the sentences into bag-of-words vectors.
sentences = [
    dictionary.doc2bow(simple_preprocess(text))
    for text in (doc_trump, doc_election, doc_putin,
                 doc_soup, doc_noodles, doc_dosa)
]
sent_1, sent_2, sent_3, sent_4, sent_5, sent_6 = sentences

print(softcossim(sent_1, sent_2, similarity_matrix))


def create_soft_cossim_matrix(sentences):
    """Build the pairwise soft cosine similarity table for ``sentences``.

    Cell ``(row, col)`` holds ``softcossim(sentences[row], sentences[col],
    similarity_matrix)`` rounded to two decimals, using the module-level
    ``similarity_matrix``.

    :param sentences: indexable collection of bag-of-words vectors.
    :return: ``pandas.DataFrame`` with one row and column per sentence.
    """
    size = len(sentences)
    rows = []
    for row in range(size):
        rows.append([
            round(softcossim(sentences[row], sentences[col], similarity_matrix), 2)
            for col in range(size)
        ])
    return pd.DataFrame(rows)


print(create_soft_cossim_matrix(sentences))
print('#################################################################')
print('')

# Soft cosine scores of the training pairs, split by their 'duplicate' flag.
duplicates = []
none_duplicates = []

train = get_train()
for data in tqdm(train, desc='Calculating similarities'):
    # Each training item is read through the keys 'text1', 'text2' and
    # 'duplicate'.
    preprocessed_text1 = preprocessor.preprocess_text(data['text1'])
    preprocessed_text2 = preprocessor.preprocess_text(data['text2'])

    bow1 = dictionary.doc2bow(preprocessed_text1)
    bow2 = dictionary.doc2bow(preprocessed_text2)

    softcossim_similarity = softcossim(bow1, bow2, similarity_matrix)

    if data['duplicate']:
        duplicates.append(softcossim_similarity)
    else:
        none_duplicates.append(softcossim_similarity)

# NOTE(review): presumably a pause so the progress bar output settles
# before the summary is printed; confirm it is still needed.
sleep(2)
print('Mean duplicates:', mean(duplicates))
print()
# The mean is computed a second time here and kept for later use.
mean_duplicates = mean(duplicates)

test = get_test()
correct = 0
non_col = "Objective: To investigate the clinicopathologic and molecular features of the rare cribriform morular variant of papillary thyroid carcinoma (CMV-PTC). Methods: The clinicopathologic data of 10 patients with CMV-PTC were retrospectively reviewed. Immunohistochemical (IHC) staining was done using LSAB method. DNA sequencing for APC were applied using Sanger method. BRAF V600E mutation was examined using ARMS method. The cytological, morphological, IHC and molecular features were analyzed. Results: All patients were female at an average age of 27 years old. The tumors were mostly located in the right lobe of thyroid. Fine needle aspiration cytology was performed in three patients; two were diagnosed as suspicious for PTC and one as PTC. Nine tumors presented as solitary nodule and two as multiple nodules in both lobes. Infiltration was demonstrated in three cases. The average size was 2.6 cm. The neoplastic cells were arranged in papillary, cribriform, solid and glandular patterns, with rare or without colloid inside the lumen. The number of morula varied, ranging from zero to many. The neoplastic cells were variably enlarged, showing round, oval or spindle shape. Nuclear irregularity was identified as irregular membrane, nuclear grooves or pseudoinclusion, but no typical ground glass feature. Peculiar nuclear clearing could be observed in the morular cells. IHC staining showed the neoplastic cells were negative for thyroglobulin and p63, but positive for TTF1, cytokeratin 19 and estrogen receptor. Diffuse staining with cytokeratin was seen in the neoplastic cells and the morula. Specific cytoplasmic and nuclear staining of β-catenin was seen in the neoplastic cells but not the morula. Ki-67 proliferation index was 1%-30%. No recurrence or metastasis was observed. One patient was demonstrated to harbor both somatic and germline mutations of the APC gene, who was found to have adenomatous polyposis and her mother died of colonic carcinoma. 
No BRAF V600E mutation was detected. Conclusions: CMV-PTC is rare and shows atypical cytological and clinicopathological features, and it is easily misdiagnosed.TG, TTF1, ER and β-catenin are specific IHC markers for CMV-PTC. The morula is negative for cytokeratin 19, in contrast to squamous metaplasia. Although CMV-PTC has indolent clinical behavior, a definite diagnosis is necessary to rule out the possibility of APC gene mutation and related extra-thyroidal neoplasm, such as FAP and Gardner syndrome." non_col = re.sub( r"(?<=\w[^\d])\.|\.(?=[^\d])|\(|\)|\[|\]|,(?= )|((?<=[^\w])-|-(?=[^\w]))|:|\?|\;", " ", non_col) non_col = remove_stopwords(non_col) non_col = stem_text(non_col).split() non_col = trigram[bigram[non_col]] col_1 = dictionary.doc2bow(col_1) col_2 = dictionary.doc2bow(col_2) non_col = dictionary.doc2bow(non_col) similarity = softcossim(col_1, col_2, similarity_matrix) print('similarity = %.4f' % similarity) similarity = softcossim(col_1, non_col, similarity_matrix) print('similarity = %.4f' % similarity) #print non_col #inferred_docvec = model.infer_vector(trigram[bigram[non_col]],steps=5000) ''' #age_test = stem_text("16 years-old").split() ngrams = trigram[bigram[query_doc]] for item in ngrams:
dictionary.doc2bow(document) for document in all_documents_stop_removed ] print("document loaded and corpus created.") size = len(all_documents_stop_removed) Matrix = [0] * size for i in range(size): Matrix[i] = [0] * size model = KeyedVectors.load_word2vec_format( '/home/mostafa/Desktop/WMD/wiki.fa.vec', binary=False) similarity_matrix = model.similarity_matrix(dictionary) print('model loaded') for i in range(1, len(all_documents_stop_removed)): for j in range(0, i): # print i,",",j doc_i = all_documents_stop_removed[i] doc_j = all_documents_stop_removed[j] doc_i = dictionary.doc2bow(doc_i) doc_j = dictionary.doc2bow(doc_j) similarity = softcossim(doc_i, doc_j, similarity_matrix) # print similarity Matrix[i][j] = similarity b = Matrix[i] min_distance = np.amax(b) print str(i) + "\t" + str(khabarID[i]) + "\t" + str(min_distance) # print i
def preprocess(words):
    """Drop common words and one trailing punctuation mark from each token.

    A token ending in one of ``. ! ? , ; :`` loses that final character.
    The token is then kept unless its lower-cased form is a common word.
    Empty tokens no longer raise ``IndexError``; they pass through
    unchanged, exactly like a token that consisted of a single punctuation
    mark.

    :param words: iterable of tokens (strings).
    :return: list of the kept tokens, in their original order and case.
    """
    # A set gives constant-time membership tests.
    common_words = {
        "habitat", "stay", "just", "the", "is", "of", "and", "for",
        "anything", "it", "a", "an", "in", "if", "that", "to", "here",
        "find", "your", "you", "more", "become", "some", "individuals",
        "can", "all", "about", "regardless", "we", "so", "be", "as", "ever",
    }
    punctuation = (".", "!", "?", ",", ";", ":")

    output = []
    for word in words:
        # str.endswith accepts a tuple and is safe on the empty string.
        token = word[:-1] if word.endswith(punctuation) else word
        if token.lower() not in common_words:
            output.append(token)
    return output


# The vocabulary covers every document; only the first three are compared.
dictionary = corpora.Dictionary([preprocess(doc) for doc in documents])
similarity_matrix = fast_text_model.similarity_matrix(
    dictionary,
    tfidf=None,
    threshold=0.0,
    exponent=2.0,
    nonzero_limit=100,
)

first_sentence = dictionary.doc2bow(preprocess(documents[0]))
second_sentence = dictionary.doc2bow(preprocess(documents[1]))
third_sentence = dictionary.doc2bow(preprocess(documents[2]))

print(softcossim(first_sentence, second_sentence, similarity_matrix))
print(softcossim(first_sentence, third_sentence, similarity_matrix))
# The last two calls compare the same pair with the arguments swapped.
print(softcossim(second_sentence, third_sentence, similarity_matrix))
print(softcossim(third_sentence, second_sentence, similarity_matrix))