def test_distributions(self):
    # checking bag of words as inputs
    vec_1 = [(2, 0.1), (3, 0.4), (4, 0.1), (5, 0.1), (1, 0.1), (7, 0.2)]
    vec_2 = [(1, 0.1), (3, 0.8), (4, 0.1)]
    result = matutils.hellinger(vec_1, vec_2)
    expected = 0.185241936534
    self.assertAlmostEqual(expected, result)

    # checking ndarray, csr_matrix as inputs
    vec_1 = numpy.array([[1, 0.3], [0, 0.4], [2, 0.3]])
    vec_2 = csr_matrix([[1, 0.4], [0, 0.2], [2, 0.2]])
    result = matutils.hellinger(vec_1, vec_2)
    expected = 0.160618030536
    self.assertAlmostEqual(expected, result)

    # checking ndarray, list as inputs
    vec_1 = numpy.array([0.6, 0.1, 0.1, 0.2])
    vec_2 = [0.2, 0.2, 0.1, 0.5]
    result = matutils.hellinger(vec_1, vec_2)
    expected = 0.309742984153
    self.assertAlmostEqual(expected, result)

    # testing LDA distribution vectors
    numpy.random.seed(0)
    model = self.class_(self.corpus, id2word=dictionary, num_topics=2, passes=100)
    lda_vec1 = model[[(1, 2), (2, 3)]]
    lda_vec2 = model[[(2, 2), (1, 3)]]
    result = matutils.hellinger(lda_vec1, lda_vec2)
    expected = 1.0406845281146034e-06
    self.assertAlmostEqual(expected, result)
def calculate_hellinger_predictions(users_profiles, papers_topics, fold, splits, lag=500):
    print("Calculating predictions based on hellinger distance...")
    s_time = time.time()
    predictions = np.zeros((users_profiles.shape[0], papers_topics.shape[0]))
    step = 0
    if splits is not None:
        print("Calculating for test items only...")
        for i, u in enumerate(users_profiles):
            # Get the test items, calculate the predictions for the test items only
            test_items = np.array(splits[i, fold])
            for j in test_items:
                predictions[i, j] = 1 - hellinger(u, papers_topics[j])
            step += 1
            if step % lag == 0:
                print("{} users done, time since prediction calculation: {:5.2f} minutes"
                      .format(step, (time.time() - s_time) / 60))
    else:
        print("Calculating for all items...")
        for i, u in enumerate(users_profiles):
            for j, p in enumerate(papers_topics):
                predictions[i, j] = 1 - hellinger(u, p)
            step += 1
            if step % lag == 0:
                print("{} users done, time since prediction calculation: {:5.2f} minutes"
                      .format(step, (time.time() - s_time) / 60))
    return predictions
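# A minimal, self-contained sketch (toy values, not taken from the code above) of
# the scoring rule used in calculate_hellinger_predictions: each (user, paper)
# pair is scored as 1 - Hellinger distance, so identical topic distributions
# score 1.0 and maximally different ones score 0.0.
import numpy as np
from gensim.matutils import hellinger

user_profile = np.array([0.7, 0.2, 0.1])   # hypothetical user topic distribution
paper_topics = np.array([0.6, 0.3, 0.1])   # hypothetical paper topic distribution
print(1 - hellinger(user_profile, paper_topics))  # close to 1.0 for similar distributions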
def main():
    args = parser.parse_args()
    # dialect = ['pa', 'sy']
    dialect = [args.dialect_one, args.dialect_two]
    folder = args.corpus_folder + '/'  # e.g. clean_data/comparable/msa/, clean_data/comparable/egypt/
    corpus_files = [folder + dialect[0] + '.txt', folder + dialect[1] + '.txt']
    dictionary, corpus = models.build_comparable_ldamodel_training(folder, dialect)
    lda_model = models.build_ldamodel(corpus, dictionary)
    folders = [folder + dialect[0] + '/', folder + dialect[1] + '/']
    hellinger_summation = 0
    jaccard_summation = 0
    for file in os.listdir(folders[0]):
        try:
            extension = os.path.splitext(file)[1]
            if extension == '.txt':
                first_filepath = os.path.join(folders[0], file)
                second_filepath = os.path.join(folders[1], file)
                with open(first_filepath, encoding='utf-8') as f:
                    first_dialect = f.read().split()
                with open(second_filepath, encoding='utf-8') as f:
                    second_dialect = f.read().split()
                bow_first_dialect = lda_model.id2word.doc2bow(first_dialect)
                bow_second_dialect = lda_model.id2word.doc2bow(second_dialect)
                # get the LDA topic distributions for both documents
                lda_bow_first_dialect = lda_model[bow_first_dialect]
                lda_bow_second_dialect = lda_model[bow_second_dialect]
                print('Hellinger distance between 1 and 2')
                print(hellinger(lda_bow_first_dialect, lda_bow_second_dialect))
                hellinger_summation += hellinger(lda_bow_first_dialect, lda_bow_second_dialect)
                print('Jaccard distance')
                print(jaccard(bow_first_dialect, bow_second_dialect))
                jaccard_summation += jaccard(bow_first_dialect, bow_second_dialect)
        except Exception:
            pass
    # 10197: number of document pairs compared, so these are average distances
    print('total hellinger = ', hellinger_summation / 10197)
    print('Total JC = ', jaccard_summation / 10197)
def test_distributions(self):
    # checking bag of words as inputs
    vec_1 = [(2, 0.1), (3, 0.4), (4, 0.1), (5, 0.1), (1, 0.1), (7, 0.2)]
    vec_2 = [(1, 0.1), (3, 0.8), (4, 0.1)]
    result = matutils.hellinger(vec_1, vec_2)
    expected = 0.185241936534
    self.assertAlmostEqual(expected, result)

    # checking ndarray, csr_matrix as inputs
    vec_1 = np.array([[1, 0.3], [0, 0.4], [2, 0.3]])
    vec_2 = csr_matrix([[1, 0.4], [0, 0.2], [2, 0.2]])
    result = matutils.hellinger(vec_1, vec_2)
    expected = 0.160618030536
    self.assertAlmostEqual(expected, result)

    # checking ndarray, list as inputs
    vec_1 = np.array([0.6, 0.1, 0.1, 0.2])
    vec_2 = [0.2, 0.2, 0.1, 0.5]
    result = matutils.hellinger(vec_1, vec_2)
    expected = 0.309742984153
    self.assertAlmostEqual(expected, result)

    # testing LDA distribution vectors
    np.random.seed(0)
    model = self.class_(self.corpus, id2word=dictionary, num_topics=2, passes=100)
    lda_vec1 = model[[(1, 2), (2, 3)]]
    lda_vec2 = model[[(2, 2), (1, 3)]]
    result = matutils.hellinger(lda_vec1, lda_vec2)
    expected = 1.0406845281146034e-06
    self.assertAlmostEqual(expected, result)
def comparable_corpus_distance(folder, dialect):
    dictionary, corpus = models.build_comparable_ldamodel_training(folder, dialect)
    lda_model = models.build_ldamodel(corpus, dictionary)
    folders = [folder + dialect[0] + '/', folder + dialect[1] + '/']
    hellinger_summation = 0
    jaccard_summation = 0
    for file in os.listdir(folders[0]):
        try:
            extension = os.path.splitext(file)[1]
            if extension == '.txt':
                first_filepath = os.path.join(folders[0], file)
                second_filepath = os.path.join(folders[1], file)
                with open(first_filepath, encoding='utf-8') as f:
                    first_dialect = f.read().split()
                with open(second_filepath, encoding='utf-8') as f:
                    second_dialect = f.read().split()
                bow_first_dialect = lda_model.id2word.doc2bow(first_dialect)
                bow_second_dialect = lda_model.id2word.doc2bow(second_dialect)
                # get the LDA topic distributions for both documents
                lda_bow_first_dialect = lda_model[bow_first_dialect]
                lda_bow_second_dialect = lda_model[bow_second_dialect]
                print('Hellinger distance between 1 and 2')
                print(hellinger(lda_bow_first_dialect, lda_bow_second_dialect))
                hellinger_summation += hellinger(lda_bow_first_dialect, lda_bow_second_dialect)
                print('Jaccard distance')
                print(jaccard(bow_first_dialect, bow_second_dialect))
                jaccard_summation += jaccard(bow_first_dialect, bow_second_dialect)
        except Exception:
            pass
    # 10197: number of document pairs compared, so these are average distances
    print('total hellinger = ', hellinger_summation / 10197)
    print('Total JC = ', jaccard_summation / 10197)
def get_vector_similarity_hellinger(self, vec1, vec2, model):
    """Get similarity between two vectors."""
    dist = matutils.hellinger(
        matutils.sparse2full(vec1, model.num_topics),
        matutils.sparse2full(vec2, model.num_topics),
    )
    sim = 1.0 / (1.0 + dist)
    return sim
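# A usage sketch of the transform above, assuming vec1/vec2 are sparse
# (topic_id, probability) lists as returned by model[bow]; the values here are
# illustrative. sparse2full pads missing topics with zeros, and 1 / (1 + dist)
# maps a [0, 1] Hellinger distance to a (0.5, 1] similarity score.
from gensim import matutils

vec1 = [(0, 0.9), (1, 0.1)]   # hypothetical topic distribution of document 1
vec2 = [(0, 0.2), (2, 0.8)]   # hypothetical topic distribution of document 2
num_topics = 3

dist = matutils.hellinger(
    matutils.sparse2full(vec1, num_topics),
    matutils.sparse2full(vec2, num_topics),
)
print(1.0 / (1.0 + dist))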
def compute_hellinger(dist01, dist02):
    # each distribution is a list of (probability, word) pairs; align both
    # over the union of words before taking the Hellinger distance
    unique_words = set([x[1] for x in dist01] + [x[1] for x in dist02])
    dict_dist01 = {x[1]: x[0] for x in dist01}
    dict_dist02 = {x[1]: x[0] for x in dist02}
    vec01 = [dict_dist01.get(x, 0) for x in unique_words]
    vec02 = [dict_dist02.get(x, 0) for x in unique_words]
    return hellinger(vec01, vec02)
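# A minimal usage sketch for compute_hellinger, assuming each input is a list
# of (probability, word) pairs; the words and probabilities are made up.
from gensim.matutils import hellinger

dist01 = [(0.6, 'bank'), (0.4, 'water')]
dist02 = [(0.7, 'bank'), (0.3, 'finance')]
print(compute_hellinger(dist01, dist02))  # aligns on {'bank', 'water', 'finance'} first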
def test_inputs(self):
    # checking empty inputs
    vec_1 = []
    vec_2 = []
    result = matutils.hellinger(vec_1, vec_2)
    expected = 0.0
    self.assertEqual(expected, result)

    # checking np array and list input
    vec_1 = np.array([])
    vec_2 = []
    result = matutils.hellinger(vec_1, vec_2)
    expected = 0.0
    self.assertEqual(expected, result)

    # checking scipy csr matrix and list input
    vec_1 = csr_matrix([])
    vec_2 = []
    result = matutils.hellinger(vec_1, vec_2)
    expected = 0.0
    self.assertEqual(expected, result)
def distance(text, doc_topic_probs):
    # predict the topic distribution for the input text
    topic, x = predict_topic(text)
    # alternative: dists = euclidean_distances(x, doc_topic_probs)[0]
    return hellinger(x[0], doc_topic_probs[0])
def CheckOverlaps(self, dist_tolerance, parsed_dict, topic_dict):
    overlaps_graph = {}
    overlaps_print = {}
    for doc1 in parsed_dict:
        for index_doc1, topics_doc1 in enumerate(parsed_dict[doc1]):
            for doc2 in parsed_dict:
                if doc1 == doc2:
                    break
                for index_doc2, topics_doc2 in enumerate(parsed_dict[doc2]):
                    dist = hellinger(topics_doc1, topics_doc2)
                    if dist <= dist_tolerance:
                        doc1_topic_graph = doc1 + ': Topic ' + str(index_doc1 + 1)
                        doc2_topic_graph = doc2 + ': Topic ' + str(index_doc2 + 1)
                        doc1_topic_print = self.GetNestedElement(topic_dict, doc1, index_doc1)
                        doc2_topic_print = self.GetNestedElement(topic_dict, doc2, index_doc2)
                        try:
                            overlaps_graph[doc1_topic_graph] += [(doc2_topic_graph, dist)]
                            overlaps_print[(doc1_topic_graph, doc1_topic_print)] += \
                                [(doc2_topic_graph, doc2_topic_print)]
                        except KeyError:
                            overlaps_graph[doc1_topic_graph] = [(doc2_topic_graph, dist)]
                            overlaps_print[(doc1_topic_graph, doc1_topic_print)] = \
                                [(doc2_topic_graph, doc2_topic_print)]
    return overlaps_graph, overlaps_print
def Hellinger_similiarity(self, corpus, corpus_model_user_description, num_best=5):
    """Implements Hellinger similarity using gensim modules."""
    length = len(corpus_model_user_description)
    queryXhotel = np.zeros((length, len(corpus)))
    print('It takes some time')
    for i in range(length):
        for j in range(len(corpus)):
            queryXhotel[i][j] = hellinger(corpus_model_user_description[i], corpus[j])
        print(i)  # progress
    # np.save('hellinger_similiarity', queryXhotel)
    accuracy_array = self.make_accuracy_array(queryXhotel, num_best, bol=False)
    return accuracy_array
def corpus_distance(folder, dialect, corpus_files):
    dictionary, corpus = models.build_ldamodel_training(folder, dialect)
    # dictionary, corpus = premodel.upload_data(dialect)
    lda_model = models.build_ldamodel(corpus, dictionary)
    # now load the two dialects to test the distance between them
    with open(corpus_files[0], encoding='utf-8') as f:
        first_dialect = f.read().split()
    with open(corpus_files[1], encoding='utf-8') as f:
        second_dialect = f.read().split()
    # convert these to bag-of-words format
    bow_first_dialect = lda_model.id2word.doc2bow(first_dialect)
    bow_second_dialect = lda_model.id2word.doc2bow(second_dialect)
    # get the LDA topic distributions for both
    lda_bow_first_dialect = lda_model[bow_first_dialect]
    lda_bow_second_dialect = lda_model[bow_second_dialect]
    print('Hellinger distance between 1 and 2')
    print(hellinger(lda_bow_first_dialect, lda_bow_second_dialect))
    print('Jaccard distance')
    print(jaccard(bow_first_dialect, bow_second_dialect))
    print('kullback_leibler between 1 to 2')
    # print(kullback_leibler(lda_bow_first_dialect, lda_bow_second_dialect))
    print('kullback_leibler between 2 to 1')
def test_distributions(self):
    # checking bag-of-words inputs of different lengths
    vec_1 = [(2, 0.1), (3, 0.4), (4, 0.1), (5, 0.1), (1, 0.1), (7, 0.2)]
    vec_2 = [(1, 0.1), (3, 0.8), (4, 0.1)]
    result = matutils.hellinger(vec_1, vec_2)
    expected = 0.484060507634
    self.assertAlmostEqual(expected, result)

    # checking that bag-of-words inputs give the same distance in both argument orders
    vec_1 = [(2, 0.1), (3, 0.4), (4, 0.1), (5, 0.1), (1, 0.1), (7, 0.2)]
    vec_2 = [(1, 0.1), (3, 0.8), (4, 0.1), (8, 0.1), (10, 0.8), (9, 0.1)]
    result = matutils.hellinger(vec_1, vec_2)
    result_symmetric = matutils.hellinger(vec_2, vec_1)
    expected = 0.856921568786
    self.assertAlmostEqual(expected, result)
    self.assertAlmostEqual(expected, result_symmetric)

    # checking ndarray, csr_matrix as inputs
    vec_1 = np.array([[1, 0.3], [0, 0.4], [2, 0.3]])
    vec_2 = csr_matrix([[1, 0.4], [0, 0.2], [2, 0.2]])
    result = matutils.hellinger(vec_1, vec_2)
    expected = 0.160618030536
    self.assertAlmostEqual(expected, result)

    # checking ndarray, list as inputs
    vec_1 = np.array([0.6, 0.1, 0.1, 0.2])
    vec_2 = [0.2, 0.2, 0.1, 0.5]
    result = matutils.hellinger(vec_1, vec_2)
    expected = 0.309742984153
    self.assertAlmostEqual(expected, result)

    # testing LDA distribution vectors
    np.random.seed(0)
    model = self.class_(self.corpus, id2word=common_dictionary, num_topics=2, passes=100)
    lda_vec1 = model[[(1, 2), (2, 3)]]
    lda_vec2 = model[[(2, 2), (1, 3)]]
    result = matutils.hellinger(lda_vec1, lda_vec2)
    expected = 1.0406845281146034e-06
    self.assertAlmostEqual(expected, result)
bow_water = model.id2word.doc2bow(doc_water)
bow_finance = model.id2word.doc2bow(doc_finance)
bow_bank = model.id2word.doc2bow(doc_bank)

lda_bow_water = model[bow_water]
lda_bow_finance = model[bow_finance]
lda_bow_bank = model[bow_bank]

tfidf_bow_water = tfidf[bow_water]
tfidf_bow_finance = tfidf[bow_finance]
tfidf_bow_bank = tfidf[bow_bank]

from gensim.matutils import kullback_leibler, jaccard, hellinger

hellinger(lda_bow_water, lda_bow_finance)
hellinger(lda_bow_finance, lda_bow_bank)
hellinger(lda_bow_bank, lda_bow_water)
hellinger(lda_bow_finance, lda_bow_water)

kullback_leibler(lda_bow_water, lda_bow_bank)
kullback_leibler(lda_bow_bank, lda_bow_water)

jaccard(bow_water, bow_bank)
jaccard(doc_water, doc_bank)
jaccard(['word'], ['word'])


def make_topics_bow(topic):
    # takes the string returned by model.show_topics()
    # split on strings to get topics and the probabilities
# In[36]:

# testing the model on this research paper
test_doc = []
f = open('/Users/Moukthika/Desktop/ultimate_test.txt', 'r', encoding='utf8')
test_doc.append(f.read())

for d in test_doc:
    doc_words = d.split(" ")
    doc_words = dictionary.doc2bow(doc_words)
    doc_words = ldaseq[doc_words]
    print(doc_words)

# testing the model on another document not in the corpus
test_doc2 = []
p = open('/Users/Moukthika/Desktop/pdf_extract/99.txt', 'r', encoding='utf8')
test_doc2.append(p.read())

for d1 in test_doc2:
    doc2_words = d1.split(" ")
    doc2_words = dictionary.doc2bow(doc2_words)
    doc2_words = ldaseq[doc2_words]
    print(doc2_words)

# In[37]:

# comparing the above two documents
hellinger(doc_words, doc2_words)
""" dis1 = self.get_topic_distrb(doc1_tk) dis2 = self.get_topic_distrb(doc2_tk) # return 1 - matutils.hellinger(dis1, dis2) return matutils.cossim(dis1, dis2) def get_model_name(self): return "LDA" if __name__ == "__main__": docs = [ 'this is a test', 'test assure quality', 'test is important', ] lda = LDA(fo_lang_code="en") new_doc1 = ["software", 'quality', 'rely', 'test'] new_doc2 = ["quality", "is", "important"] new_doc3 = ["i", "have", "a", "pretty", "dog"] lda.train(docs) dis1 = lda.get_topic_distrb(new_doc1) dis2 = lda.get_topic_distrb(new_doc2) dis3 = lda.get_topic_distrb(new_doc3) print(dis1) print(dis2) print(dis3) print(matutils.hellinger(dis1, dis2)) print(matutils.hellinger(dis1, dis3))
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized,
                                     dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

# Hellinger distance
H_distance = []
for i in range(1747, len(data_reg)):
    temp_distance = []
    for j in range(len(data_news)):
        lda_doc1 = lda_model[corpus_news[j]]
        lda_reg1 = lda_model[corpus_reg[i]]
        temp_distance.append(hellinger(lda_doc1[0], lda_reg1[0]))
    H_distance.append(temp_distance)

H_dist = DataFrame(H_distance)
H_dist.to_csv('Hellinger_distance.csv')


# DTM
def BasicCleanText(raw_text):
    cleantextprep = str(raw_text)
    expression = "[^a-zA-Z0-9 ]"  # keep only letters, numbers and whitespace
    cleantextCAP = re.sub(expression, '', cleantextprep)  # apply regex
    cleantext = cleantextCAP.lower()  # lower case
    # Tokenization
lda_bow_water = model[bow_water]
lda_bow_finance = model[bow_finance]
lda_bow_bank = model[bow_bank]

###############################################################################
# Hellinger
# ---------
#
# We're now ready to apply our distance metrics. These metrics return a value
# between 0 and 1, where values closer to 0 indicate a smaller 'distance' and
# therefore a larger similarity.
#
# Let's start with the popular Hellinger distance.
#
# The Hellinger distance metric gives an output in the range [0, 1] for two
# probability distributions, with values closer to 0 meaning they are more
# similar.
#
from gensim.matutils import hellinger

print(hellinger(lda_bow_water, lda_bow_finance))
print(hellinger(lda_bow_finance, lda_bow_bank))

###############################################################################
# Makes sense, right? In the first example, Document 1 and Document 2 are
# hardly similar, so we get a value of roughly 0.5.
#
# In the second case, the documents are a lot more similar semantically, so
# the trained model gives them a much lower distance value.
#

###############################################################################
# Kullback–Leibler
# ----------------
#
# Let's run similar examples with Kullback–Leibler.
#
from gensim.matutils import kullback_leibler
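###############################################################################
# As a quick sanity check (a toy illustration, not part of the corpus above):
# identical distributions give a Hellinger distance of 0, and distributions
# with disjoint support give 1.
#
import numpy as np
from gensim.matutils import hellinger

p = np.array([0.5, 0.5, 0.0])
q = np.array([0.5, 0.5, 0.0])
r = np.array([0.0, 0.0, 1.0])

print(hellinger(p, q))  # 0.0 -- identical distributions
print(hellinger(p, r))  # 1.0 -- disjoint support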
def hellinger_distance(self, doc_bow, bow_corpus):
    scores = [(i, hellinger(doc_bow, document)) for i, document in enumerate(bow_corpus)]
    return heapq.nsmallest(100, scores, key=lambda x: x[1])
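# A self-contained sketch of the ranking idea above, using toy dense topic
# distributions in place of real LDA outputs: heapq.nsmallest keeps the
# documents with the smallest Hellinger distance from the query.
import heapq
from gensim.matutils import hellinger

query = [0.7, 0.2, 0.1]
corpus_distributions = [[0.6, 0.3, 0.1], [0.1, 0.1, 0.8], [0.7, 0.2, 0.1]]
scores = [(i, hellinger(query, doc)) for i, doc in enumerate(corpus_distributions)]
print(heapq.nsmallest(2, scores, key=lambda x: x[1]))  # two closest documents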
def phish_extraction():
    # feature vector: keys '1'..'177', all initialised to 0
    result = {str(i): 0 for i in range(1, 178)}

    interhref = []
    exterhref = []
    interlog = []
    exterlog = []
    chain = []
    title = []
    text = []
    chainurl(chain)
    starturl = chain[0]
    landurl = chain[-1]
    interandextern(landurl, interhref, exterhref, "file/href.txt")
    interandextern(landurl, interlog, exterlog, "file/logged.txt")
    loaddata(title, 'file/title.txt')
    loaddata(text, 'file/text.txt')

    feature_1 = []
    f1_8feature(feature_1, starturl)
    f1_8feature(feature_1, landurl)
    f1_3_8feature(feature_1, interhref)
    f1_3_8feature(feature_1, interlog)
    f1_3_8feature(feature_1, exterhref)
    f1_3_8feature(feature_1, exterlog)

    ##
    # Feature 2 calculating
    ##
    start = list(getfreeurl(starturl))
    land = list(getfreeurl(landurl))
    startrdn = list(getrdn(starturl))
    landrdn = list(getrdn(landurl))
    intlog = []
    intlink = []
    intrdn = []
    extrdn = []
    extlog = []
    extlink = []
    for var in interhref:
        intlink.append(getfreeurl(var))
        intrdn.append(getrdn(var))
    for var in interlog:
        intlog.append(getfreeurl(var))
        intrdn.append(getrdn(var))
    for var in exterhref:
        extlink.append(getfreeurl(var))
    for var in exterlog:
        extlog.append(getfreeurl(var))
        extrdn.append(getrdn(var))

    # you can use any corpus, this is just illustratory
    texts = [
        text, title, start, land, startrdn, landrdn,
        intlog, intlink, intrdn, extrdn, extlog, extlink
    ]
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    import numpy
    numpy.random.seed(1)  # setting random seed to get the same results each time
    from gensim.models import ldamodel
    model = ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=2)  # , minimum_probability=1e-8
    model.show_topics()

    from gensim.matutils import hellinger
    feature_2 = []
    for combo in combinations(texts, 2):  # 2 for pairs, 3 for triplets, etc.
        # get the LDA topic distributions for this pair
        bow0 = model.id2word.doc2bow(combo[0])
        bow1 = model.id2word.doc2bow(combo[1])
        lda_bow0 = model[bow0]
        lda_bow1 = model[bow1]
        feature_2.append(hellinger(lda_bow0, lda_bow1))

    # feature_2.append(binaryfeatures(intrdn, title))
    feature_2.append(binaryfeatures(extrdn, title))

    ##
    # f3 features calculating
    ##
    feature_3n4 = []
    startmld = getmld(starturl)
    landmld = getmld(landurl)
    mlds = [startmld, landmld]
    startrdn = getrdn(starturl)
    landrdn = getrdn(landurl)
    rdns = [startrdn, landrdn]

    compare = [text, title, intlog, extlog, intlink, extlink]
    for i in range(2):
        for j in range(6):
            if mlds[i] in compare[j]:
                feature_3n4.append(1)
            else:
                feature_3n4.append(0)

    compare = [title, intlog, extlog, intlink, extlink]
    # flatten each field to a single string for the substring tests below
    compare = [" ".join(str(x) for x in c) for c in compare]
    for i in range(2):
        for j in range(5):
            if compare[j] in mlds[i]:
                feature_3n4.append(1)
            else:
                feature_3n4.append(0)

    for m in range(2):
        for n in range(5):
            if compare[n] in rdns[m] and compare[n] not in mlds[m]:
                feature_3n4.append(1)
            else:
                feature_3n4.append(0)
    ##
    # f3 features calculated
    ##

    ##
    # f4 features calculating
    ##
    if getrdn(starturl) in getrdn(landurl):
        feature_3n4.append(1)
    else:
        feature_3n4.append(0)

    if len(chain) > 2:
        feature_3n4.append(len(chain) - 2)
    else:
        feature_3n4.append(0)

    feature_3n4.append(len(interlog))
    feature_3n4.append(len(interhref))
    feature_3n4.append(len(exterlog))
    feature_3n4.append(len(exterhref))

    count = 0
    for comp in interlog:
        if getrdn(starturl) in getrdn(comp):
            count += 1
    feature_3n4.append(count)

    count = 0
    for comp in interhref:
        if getrdn(starturl) in getrdn(comp):
            count += 1
    feature_3n4.append(count)

    count = 0
    if len(chain) > 2:
        for comp in chain[1:len(chain) - 1]:  # check later
            if getrdn(starturl) in getrdn(comp):
                count += 1
    feature_3n4.append(count)

    count = 0
    if len(chain) > 2:
        for comp in chain[1:len(chain) - 1]:  # check later
            if getrdn(landurl) in getrdn(comp):
                count += 1
    feature_3n4.append(count)

    count = 0
    for comp in exterlog:  # check later
        if getrdn(starturl) in getrdn(comp):
            count += 1
    feature_3n4.append(count)

    count = 0
    for comp in exterlog:  # check later
        if getrdn(starturl) in getrdn(comp):
            count += 1
    feature_3n4.append(count)
    ##
    # f4 features calculated
    ##

    ##
    # f5 features calculation
    ##
    feature_5 = []
    with open('file/input.txt') as file:
        word = file.read().split()
    feature_5.append(len(word))

    with open('file/img.txt') as file:
        word = file.read().split()
    feature_5.append(len(word))
    with open('file/iframe.txt') as file:
        word = file.read().split()
    feature_5.append(len(word))

    with open('file/text.txt') as file:
        word = file.read().split()
    feature_5.append(len(word))

    with open('file/title.txt') as file:
        word = file.read().split()
    feature_5.append(len(word))

    res = feature_1 + feature_2 + feature_3n4 + feature_5
    for i in range(len(res)):
        result[str(i)] = res[i]
    return result
from gensim.models import ldamodel
model = ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=2)  # , minimum_probability=1e-8
model.show_topics()

from gensim.matutils import hellinger
for combo in combinations(texts, 2):  # 2 for pairs, 3 for triplets, etc.
    # get the LDA topic distributions for this pair
    bow0 = model.id2word.doc2bow(combo[0])
    bow1 = model.id2word.doc2bow(combo[1])
    lda_bow0 = model[bow0]
    lda_bow1 = model[bow1]
    print_wo(hellinger(lda_bow0, lda_bow1), ",")

# print_wo(binaryfeatures(intrdn, title), ",")
print_wo(binaryfeatures(extrdn, title), ",")

##
# f3 features calculating
##
startmld = getmld(starturl)
landmld = getmld(landurl)
mlds = [startmld, landmld]
def hellinger_distance(self, x, y):
    """Return the Hellinger distance between two lists."""
    return hellinger(x, y)
def similarity(vec1, vec2):
    """Similarity between two vectors."""
    dist = matutils.hellinger(
        matutils.sparse2full(vec1, atmodel.num_topics),
        matutils.sparse2full(vec2, atmodel.num_topics),
    )
    sim = 1.0 / (1.0 + dist)
    return sim
    'worth', 'hoke', 'happy', 'foot', 'tv', 'weed',
    'hard paint', 'good luck', 'olga', 'hair', 'gas',
    'sex', 'especially', 'pretty', 'hope', 'basically',
    'dream', 'hit', 'bit', 'ben krenke', 'weird', 'saying',
    'okay', 'doesnt', 'understand', 'f**k', 'job', 'hard',
    'night', 'weekend', 'f****d', 'sorry', 'school', 'cheap',
    'literally', 'crazy', 'mom', 'year_old', 'home', 'year',
    'old', 'bitch', 'song']

# bt_2 topics
bt_2_topics = ['black pipe', 'bike', 'hard', 'suck', 'dick', 'fart noise',
               'life', 'buy', 'ride', 'sorry', 'nah', 'working', 'easy',
               'worst', 'comment', 'win', 'pissed', 'interview', 'bad',
               'high', 'game', 'rib', 'drink', 'fast', 'dog', 'smoke weed',
               'kit', 'happy birthday', 'apparently', 'lol', 'cleveland', 'sweet',
               'hang', 'summer', 'get paper', 'good', 'jesus christ', 'idea',
               'gay', 'dumb', 'jesus', 'sound', 'god damn', 'house', 'health insurance',
               'stock', 'set nickname']

# Transforms user topics to bag of words
bt_1_bow = bt_1_ldamodel.id2word.doc2bow(bt_1_topics)
bt_1 = bt_1_ldamodel[bt_1_bow]
bt_2_bow = bt_2_ldamodel.id2word.doc2bow(bt_2_topics)
bt_2 = bt_2_ldamodel[bt_2_bow]

# Computes the Hellinger distance between bt_1 & bt_2 (closer to 0 = more similar)
from gensim.matutils import hellinger
print('Hellinger distance between bt_1 & bt_2:', hellinger(bt_1, bt_2))
def phish_extraction(url, driver):
    try:
        result = {}
        interhref = []
        exterhref = []
        interlog = []
        exterlog = []
        title = []
        text = []
        chain, logged, href, img, iframe, _input, title, text, flag = web_scrapping(url, driver)
        if type(chain) is list and len(chain) > 0:
            starturl = chain[0]
            landurl = chain[-1]
        else:
            starturl = chain
            landurl = chain
        if flag == 0:
            return False
        interandextern(landurl, interhref, exterhref, href)
        interandextern(landurl, interlog, exterlog, logged)

        feature_1 = []
        f1_8feature(feature_1, starturl)
        f1_8feature(feature_1, landurl)
        f1_3_8feature(feature_1, interhref)
        f1_3_8feature(feature_1, interlog)
        f1_3_8feature(feature_1, exterlog)
        f1_3_8feature(feature_1, exterhref)

        ##
        # Feature 2 calculating
        ##
        start = list(getfreeurl(starturl))
        land = list(getfreeurl(landurl))
        startrdn = list(getrdn(starturl))
        landrdn = list(getrdn(landurl))
        intlog = []
        intlink = []
        intrdn = []
        extrdn = []
        extlog = []
        extlink = []
        for var in interhref:
            intlink.append(getfreeurl(var))
            intrdn.append(getrdn(var))
        for var in interlog:
            intlog.append(getfreeurl(var))
            intrdn.append(getrdn(var))
        for var in exterhref:
            extlink.append(getfreeurl(var))
        for var in exterlog:
            extlog.append(getfreeurl(var))
            extrdn.append(getrdn(var))

        # you can use any corpus, this is just illustratory
        texts = [
            text, title, start, land, startrdn, landrdn,
            intlog, intlink, intrdn, extrdn, extlog, extlink
        ]
        dictionary = Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]

        import numpy
        numpy.random.seed(1)  # setting random seed to get the same results each time

        from gensim.models import ldamodel
        model = ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=2)  # , minimum_probability=1e-8
        model.show_topics()

        from gensim.matutils import hellinger
        feature_2 = []
        for combo in combinations(texts, 2):  # 2 for pairs, 3 for triplets, etc.
            # get the LDA topic distributions for this pair
            bow0 = model.id2word.doc2bow(combo[0])
            bow1 = model.id2word.doc2bow(combo[1])
            lda_bow0 = model[bow0]
            lda_bow1 = model[bow1]
            feature_2.append(hellinger(lda_bow0, lda_bow1))

        # feature_2.append(binaryfeatures(intrdn, title))
        feature_2.append(binaryfeatures(extrdn, title))

        ##
        # f3 features calculating
        ##
        feature_3n4 = []
        startmld = getmld(starturl)
        landmld = getmld(landurl)
        mlds = [startmld, landmld]
        startrdn = getrdn(starturl)
        landrdn = getrdn(landurl)
        rdns = [startrdn, landrdn]

        compare = [text, title, intlog, extlog, intlink, extlink]
        for i in range(2):
            for j in range(6):
                if mlds[i] in compare[j]:
                    feature_3n4.append(1)
                else:
                    feature_3n4.append(0)

        compare = [title, intlog, extlog, intlink, extlink]
        # flatten each field to a single string for the substring tests below
        compare = [" ".join(str(x) for x in c) for c in compare]
        for i in range(2):
            for j in range(5):
                if compare[j] in mlds[i]:
                    feature_3n4.append(1)
                else:
                    feature_3n4.append(0)

        for m in range(2):
            for n in range(5):
                if compare[n] in rdns[m] and compare[n] not in mlds[m]:
                    feature_3n4.append(1)
                else:
                    feature_3n4.append(0)
        ##
        # f3 features calculated
        ##

        ##
        # f4 features calculating
        ##
        if getrdn(starturl) in getrdn(landurl):
            feature_3n4.append(1)
        else:
            feature_3n4.append(0)

        if len(chain) > 2:
            feature_3n4.append(len(chain) - 2)
        else:
            feature_3n4.append(0)

        feature_3n4.append(len(interlog))
        feature_3n4.append(len(interhref))
        feature_3n4.append(len(exterlog))
        feature_3n4.append(len(exterhref))

        count = 0
        for comp in interlog:
            if getrdn(starturl) in getrdn(comp):
                count += 1
        feature_3n4.append(count)

        count = 0
        for comp in interhref:
            if getrdn(starturl) in getrdn(comp):
                count += 1
        feature_3n4.append(count)

        count = 0
        if len(chain) > 2:
            for comp in chain[1:len(chain) - 1]:  # check later
                if getrdn(starturl) in getrdn(comp):
                    count += 1
        feature_3n4.append(count)

        count = 0
        if len(chain) > 2:
            for comp in chain[1:len(chain) - 1]:  # check later
                if getrdn(landurl) in getrdn(comp):
                    count += 1
        feature_3n4.append(count)

        count = 0
        for comp in exterlog:  # check later
            if getrdn(starturl) in getrdn(comp):
                count += 1
        feature_3n4.append(count)

        count = 0
        for comp in exterlog:  # check later
            if getrdn(starturl) in getrdn(comp):
                count += 1
        feature_3n4.append(count)
        ##
        # f4 features calculated
        ##

        ##
        # f5 features calculation
        ##
        feature_5 = []
        feature_5.append(len(_input))
        feature_5.append(len(img))
        feature_5.append(len(iframe))
        feature_5.append(len(text))
        feature_5.append(len(title))

        res = feature_1 + feature_2 + feature_3n4 + feature_5
        if flag == 1:
            for i in range(len(res)):
                result[str(i)] = res[i] if res[i] is not None else 0
        else:
            return False
    except Exception as e:
        trace_back = sys.exc_info()[2]
        line = trace_back.tb_lineno
        print(format(line), e)
        return False
    return result
filecontent = ''
for word in file:
    filecontent = filecontent + word + ' '
documents.append(filecontent)

stoplist = set(stopwords.words('english'))
texts = [
    [word for word in document.lower().split() if word not in stoplist]
    for document in documents
]
basetext = []
for word_list in texts:
    for item in word_list:
        basetext.append(item)

bow_1 = lda.id2word.doc2bow(basetext)
lda_1 = lda[bow_1]
print("******************", filename)
print("hellinger", hellinger(lda_1, lda_2))
print("kullback_leibler", kullback_leibler(lda_1, lda_2))
print("jaccard", jaccard(lda_1, lda_2))
file.close()

# dictionary = corpora.Dictionary(texts)
# corpus = [dictionary.doc2bow(text) for text in texts]
# lda1 = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary,
#                                        num_topics=5, update_every=1,
#                                        chunksize=10000, passes=5)
cluster_model = func(corpus, dictionary, words)
print(clist)
print(model.show_topics(cluster_model))
model_list2.append(cluster_model)
model_list3.append(model.show_topics(cluster_model))

print('\n========')
print('Comparison between Cluster topics and Version topics')
print('========\n')

distances = []

def print_cluster_version(i, j, mylist):
    print('\n========')
    print('version', version[i])
    print('cluster\t', mylist[j])

for i in model_list2:
    for j in model_list:
        dist = hellinger(i, j)
        if dist > 0.5:
            print("\ntopic comparison distance between version and reviews")
            print(dist)
            print_cluster_version(model_list.index(j), model_list2.index(i), model_list3)
            distances.append(dist)