def compareDocuments(modelName, similarityOutput):
    """Write pairwise doc2vec similarities for articles published close together.

    Loads the Doc2Vec model stored at ``modelName`` and walks every pair of
    its document tags.  A pair is kept when both tags are keys of the dict
    returned by ``createESDict()``, their entries differ at index 1 (the
    publication, per the original comments) and their entries at index 2
    (publication dates) are less than a day and a half apart.  For each kept
    pair one line ``"<label1> <label2> <similarity>"`` is written, where the
    similarity comes from ``compareVectors`` applied to the two model vectors.

    Args:
        modelName: path handed to ``Doc2Vec.load``.
        similarityOutput: path of the UTF-8 text file to create/overwrite.
    """
    d2vModel = gensim.models.doc2vec.Doc2Vec.load(modelName)
    esDict = createESDict()

    # a day and a half, in seconds
    period = 60 * 60 * 24 * 1.5

    # number of cross-publication pairs seen so far; used for progress only
    i = 0

    # the context manager guarantees the output is flushed and closed even if
    # a lookup or comparison raises part-way through the (very long) loop
    with open(similarityOutput, 'w', encoding='utf8') as txtSimilarity:
        for label1, label2 in combinations(d2vModel.docvecs.doctags, r=2):
            # only care if the labels are in the esDict
            if label1 not in esDict or label2 not in esDict:
                continue
            # and if the publications are different
            if esDict[label1][1] == esDict[label2][1]:
                continue

            i += 1
            if i % 500000 == 0:
                print(i)

            pubDate1 = esDict[label1][2]
            pubDate2 = esDict[label2][2]
            if abs(pubDate1 - pubDate2).total_seconds() < period:
                similarity = compareVectors(d2vModel[label1], d2vModel[label2])
                txtSimilarity.write(
                    label1 + " " + label2 + " " + str(similarity) + "\n")
def getRelevantArticles(modelName, similarityThreshold):
    """Return paths of articles that have a close match in another publication.

    For every document tag of the Doc2Vec model at ``modelName`` that is also
    a key of the dict returned by ``createESDict()``, the model's most similar
    documents are fetched and reduced to those that are in that dict and whose
    entry at index 1 (the publication) differs from the article's own.  The
    article is kept when that reduced list is non-empty and the similarity of
    its first element is not below ``similarityThreshold``.

    Returns:
        A list of strings ``"<publication>/<tag with the first
        len(publication) characters removed>"``, in doctag iteration order.
    """
    model = gensim.models.doc2vec.Doc2Vec.load(modelName)
    indexed = createESDict()

    paths = []
    for tag in model.docvecs.doctags:
        if tag not in indexed:
            continue

        # drop neighbours from the same publication and those that are not
        # indexed in elastic at all
        neighbours = [
            candidate
            for candidate in model.docvecs.most_similar(tag)
            if candidate[0] in indexed
            and indexed[candidate[0]][1] != indexed[tag][1]
        ]

        # only the best remaining neighbour and its cosine similarity matter
        if neighbours and not (neighbours[0][1] < similarityThreshold):
            publication = indexed[tag][1]
            paths.append(indexed[tag][1] + "/" + tag[len(publication):])

    return paths
def getMostUniquePublications(modelName, similarityThreshold):
    """Count, per publication, all articles and the ones without a close match.

    Iterates the document tags of the Doc2Vec model at ``modelName`` that are
    also keys of the dict returned by ``createESDict()``.  Every such article
    adds one to its publication's total (entry index 1 is the publication).
    The article additionally counts as "unique" when, after discarding most
    similar documents that are not in the dict or that share its publication,
    nothing is left or the first remaining similarity is below
    ``similarityThreshold``.

    Returns:
        A tuple ``(uniquePerPublication, totalPerPublication)`` of plain dicts
        mapping publication to a count.  Publications with no unique article
        are absent from the first dict.
    """
    model = gensim.models.doc2vec.Doc2Vec.load(modelName)
    indexed = createESDict()

    uniqueCounts = {}
    totalCounts = {}

    for tag in model.docvecs.doctags:
        if tag not in indexed:
            continue

        publication = indexed[tag][1]

        # every indexed article counts towards its publication's total
        totalCounts[publication] = totalCounts.get(publication, 0) + 1

        # ignore neighbours from the same publication and neighbours that
        # are not even indexed in elastic
        neighbours = [
            candidate
            for candidate in model.docvecs.most_similar(tag)
            if candidate[0] in indexed
            and indexed[candidate[0]][1] != publication
        ]

        # unique: no foreign neighbour at all, or the best one is too far away
        if not neighbours or neighbours[0][1] < similarityThreshold:
            uniqueCounts[publication] = uniqueCounts.get(publication, 0) + 1

    return uniqueCounts, totalCounts
def compareDocumentsW2V(dmD2VModelPath, similarityOutput):
    """Write pairwise word-embedding similarities for articles published close together.

    Loads a model through ``doc2VecWrapper`` from ``dmD2VModelPath`` and, for
    every document tag that is a key of the dict returned by
    ``createESDict()``, builds a document embedding from the entry at index 3
    (used as the article content) via ``createDocumentEmbeddingW2V``.  It then
    walks every pair of document tags and keeps a pair when both tags are in
    that dict, their entries differ at index 1 (the publication) and their
    entries at index 2 (publication dates) are less than a day and a half
    apart.  For each kept pair one line ``"<label1> <label2> <similarity>"``
    is written, the similarity being ``compareVectors`` of the two embeddings.

    Args:
        dmD2VModelPath: path handed to ``wrapper.loadDoc2VecModel``.
        similarityOutput: path of the UTF-8 text file to create/overwrite.
    """
    wrapper = doc2VecWrapper()
    wrapper.loadDoc2VecModel(dmD2VModelPath)

    # init lemmas & stopwords
    lemmaDict = initLematizer()
    stopwords = createStopwordsSet()

    esDict = createESDict()

    # a day and a half, in seconds
    period = 60 * 60 * 24 * 1.5

    # the context manager guarantees the output is closed even if building an
    # embedding or comparing a pair raises; the file is still opened before
    # the embedding step, so a bad output path fails early as it did before
    with open(similarityOutput, 'w', encoding='utf8') as txtSimilarity:
        # memorize word embeddings per label
        wordEmbedding = {}
        for label in wrapper.d2vModel.docvecs.doctags:
            if label in esDict:
                content = esDict[label][3]
                wordEmbedding[label] = wrapper.createDocumentEmbeddingW2V(
                    content, lemmaDict, stopwords)

        print("finished generating word embeddings for documents")
        print("starting pairwise comparison")

        # number of cross-publication pairs seen so far; used for progress only
        i = 0
        for label1, label2 in combinations(wrapper.d2vModel.docvecs.doctags, r=2):
            # only care if the labels are in the esDict
            if label1 not in esDict or label2 not in esDict:
                continue
            # and if the publications are different
            if esDict[label1][1] == esDict[label2][1]:
                continue

            i += 1
            if i % 500000 == 0:
                print(i)

            pubDate1 = esDict[label1][2]
            pubDate2 = esDict[label2][2]
            if abs(pubDate1 - pubDate2).total_seconds() < period:
                similarity = compareVectors(wordEmbedding[label1],
                                            wordEmbedding[label2])
                txtSimilarity.write(
                    label1 + " " + label2 + " " + str(similarity) + "\n")
def createBTInput(timeThreshold, similarityThreshold,
                  similarityFile='similarity.txt', rankingOutput='ranking.csv'):
    """Turn a pairwise similarity file into a publication-vs-publication CSV.

    Each non-blank line of ``similarityFile`` is expected to hold three
    whitespace-separated fields: two article labels and their similarity.
    Both labels are looked up in the dict returned by ``createESDict()``
    (index 1 = publication, index 2 = publication date).  A pair is written to
    ``rankingOutput`` when the publications differ, the dates are less than
    ``timeThreshold`` seconds apart and the similarity is strictly greater
    than ``similarityThreshold``.

    Every output row is ``"<id1>,<id2>,<winner>"`` where the ids come from
    the module-level ``publicationsDict`` and ``winner`` is ``1`` when the
    first article's date is later than the second's, otherwise ``2``.

    Args:
        timeThreshold: maximum date distance in seconds (exclusive).
        similarityThreshold: minimum similarity (exclusive).
        similarityFile: path of the UTF-8 input file.
        rankingOutput: path of the UTF-8 CSV file to create/overwrite.

    Raises:
        KeyError: if a label is missing from the elastic dict or a
            publication is missing from ``publicationsDict``.
        ValueError: if a non-blank line does not have exactly three fields or
            its third field is not a float.
    """
    esDict = createESDict()

    # both handles are released on any exit path; previously the input file
    # was never closed and the output leaked whenever a line failed to parse
    with open(rankingOutput, 'w', encoding='utf8') as csvOutput, \
            open(similarityFile, encoding='utf8') as similarityLines:
        for i, line in enumerate(similarityLines):
            # progress output, counted per line read (including skipped ones)
            if i % 500000 == 0:
                print(i)

            fields = line.split()
            if not fields:
                # tolerate blank lines (e.g. a trailing newline) instead of
                # crashing on the three-way unpack below
                continue

            label1, label2, similarity = fields
            similarity = float(similarity)

            publication1 = esDict[label1][1]
            publication2 = esDict[label2][1]

            # ignore if the similar articles come from the same publication
            # shouldn't fall through this too many times :)
            if publication1 == publication2:
                continue

            pubDate1 = esDict[label1][2]
            pubDate2 = esDict[label2][2]

            # filter using thresholds
            if abs(pubDate1 - pubDate2).total_seconds() < timeThreshold \
                    and similarity > similarityThreshold:
                winner = 1 if pubDate1 > pubDate2 else 2
                # NOTE(review): publicationsDict is defined outside this
                # block; its values must be str for the concatenation to work
                csvOutput.write(publicationsDict[publication1] + "," +
                                publicationsDict[publication2] + "," +
                                str(winner) + "\n")
encoding='utf8') for i, articleLabel in enumerate(d2vModel.docvecs.doctags): #check if data in elastic if articleLabel in esDict: publication, title, headline, _ = extractData(esDict, articleLabel) #write in plot relevant info, consider adding pub date as well? metaPlot.write("%s\t%s\t%s\n" % (title, publication, headline)) tensorReplacement.write(tensorLines[i]) metaPlot.close() tensorPlot.close() if __name__ == "__main__": #doc2vec path for dm = 1, distributed memory model used dmD2VModelPath = 'doc2vec_models/classic, dm + dm_concat = 0/' #doc2vec path for dm = 0, distributed bag of words model used dbowD2VModelPath = 'doc2vec_models/dbow + dm_concat = 0/' #load doc2vec model and save it as word2vec for the word2vec2tensor.py #d2vModel = gensim.models.doc2vec.Doc2Vec.load(dbowD2VModelPath + 'doc2vec_model') #d2vModel.save_word2vec_format(dbowD2VModelPath + 'doc_tensor.w2v', doctag_vec=True, word_vec=False) esDict = createESDict() createD2VTensorflowTSV(esDict, dbowD2VModelPath) #createW2VTensorflowTSV(esDict, dmD2VModelPath + "doc2vec_model")