def compareDocuments(modelName, similarityOutput):
    """Write doc2vec similarities for cross-publication article pairs.

    Loads the Doc2Vec model stored at ``modelName`` and walks every pair of
    document tags that are both keys of the dict returned by
    ``createESDict()``. Pairs whose entries name the same publication
    (entry index 1) are ignored. For the remaining pairs, the publication
    dates (entry index 2) are compared; when they are less than a day and a
    half apart, the two document vectors are scored with ``compareVectors``
    and a ``"<label1> <label2> <similarity>"`` line is written.

    Args:
        modelName: path handed to ``Doc2Vec.load``.
        similarityOutput: path of the UTF-8 text file that is (over)written.

    Note:
        The date entries are assumed to be datetime-like, since their
        difference must provide ``total_seconds()`` -- confirm against
        ``createESDict``.
    """
    d2vModel = gensim.models.doc2vec.Doc2Vec.load(modelName)
    esDict = createESDict()

    #a day and a half, in seconds
    period = 60 * 60 * 24 * 1.5

    #only care about labels that are in the esDict; filtering once here
    #yields the same pairs, in the same order, as filtering every
    #combination of the full tag set, without iterating the useless pairs
    indexedLabels = [
        label for label in d2vModel.docvecs.doctags if label in esDict
    ]

    #counts the cross-publication pairs examined so far (progress output)
    i = 0

    #the context manager closes the file even if a comparison raises
    with open(similarityOutput, 'w', encoding='utf8') as txtSimilarity:
        for label1, label2 in combinations(indexedLabels, r=2):
            entry1 = esDict[label1]
            entry2 = esDict[label2]

            #skip pairs coming from the same publication
            if entry1[1] == entry2[1]:
                continue

            i += 1
            if i % 500000 == 0:
                print(i)

            if abs(entry1[2] - entry2[2]).total_seconds() < period:
                similarity = compareVectors(d2vModel[label1],
                                            d2vModel[label2])
                txtSimilarity.write(label1 + " " + label2 + " " +
                                    str(similarity) + "\n")
def getRelevantArticles(modelName, similarityThreshold):
    """Collect paths of articles that have a close cross-publication match.

    For every document tag of the Doc2Vec model at ``modelName`` that is
    also a key of the dict returned by ``createESDict()``, the neighbours
    returned by ``docvecs.most_similar`` (default arguments) are reduced to
    those present in the dict and belonging to another publication (entry
    index 1). The article is kept when the first remaining neighbour has a
    similarity that is not below ``similarityThreshold``.

    Args:
        modelName: path handed to ``Doc2Vec.load``.
        similarityThreshold: minimum similarity of the best remaining
            neighbour.

    Returns:
        A list of strings ``publication + "/" + suffix`` where ``suffix`` is
        the tag with its first ``len(publication)`` characters removed
        (presumably the tag starts with the publication name -- verify).
    """
    model = gensim.models.doc2vec.Doc2Vec.load(modelName)
    esDict = createESDict()

    paths = []
    for tag in model.docvecs.doctags:
        #articles that are not indexed in elastic are of no interest
        if tag not in esDict:
            continue

        ownPublication = esDict[tag][1]

        #keep only indexed neighbours that come from another publication
        candidates = []
        for neighbour in model.docvecs.most_similar(tag):
            neighbourTag = neighbour[0]
            if neighbourTag in esDict and \
                    esDict[neighbourTag][1] != ownPublication:
                candidates.append(neighbour)

        #nothing left, or the best remaining neighbour is too dissimilar
        if not candidates or candidates[0][1] < similarityThreshold:
            continue

        paths.append(ownPublication + "/" + tag[len(ownPublication):])

    return paths
def getMostUniquePublications(modelName, similarityThreshold):
    """Count, per publication, all articles and the "unique" ones.

    An article (a document tag of the Doc2Vec model at ``modelName`` that is
    also a key of the dict returned by ``createESDict()``) counts as unique
    when, after reducing its ``docvecs.most_similar`` neighbours to indexed
    articles of other publications, either nothing is left or the first
    remaining neighbour's similarity is below ``similarityThreshold``.

    Args:
        modelName: path handed to ``Doc2Vec.load``.
        similarityThreshold: similarity below which the best remaining
            neighbour no longer counts as a match.

    Returns:
        A tuple ``(uniqueCounts, totalCounts)`` of dicts keyed by
        publication (entry index 1 of the esDict values). Publications
        without any unique article are absent from the first dict.
    """
    model = gensim.models.doc2vec.Doc2Vec.load(modelName)
    esDict = createESDict()

    uniqueCounts = {}
    totalCounts = {}

    for tag in model.docvecs.doctags:
        if tag not in esDict:
            continue

        publication = esDict[tag][1]

        #every indexed article contributes to its publication's total
        totalCounts[publication] = totalCounts.get(publication, 0) + 1

        #drop neighbours from the same publication and those that are not
        #indexed in elastic
        crossPublication = [
            neighbour for neighbour in model.docvecs.most_similar(tag)
            if neighbour[0] in esDict
            and esDict[neighbour[0]][1] != publication
        ]

        #unique: no neighbour left, or the best one is not similar enough
        if not crossPublication or \
                crossPublication[0][1] < similarityThreshold:
            uniqueCounts[publication] = uniqueCounts.get(publication, 0) + 1

    return uniqueCounts, totalCounts
def compareDocumentsW2V(dmD2VModelPath, similarityOutput):
    """Write word-embedding similarities for cross-publication pairs.

    Loads a model through ``doc2VecWrapper`` and builds one embedding per
    document tag that is also a key of the dict returned by
    ``createESDict()``, by passing the entry's content (entry index 3) to
    ``createDocumentEmbeddingW2V``. Then every pair of those tags that
    belongs to different publications (entry index 1) and whose publication
    dates (entry index 2) are less than a day and a half apart is scored
    with ``compareVectors`` and written as a
    ``"<label1> <label2> <similarity>"`` line.

    Args:
        dmD2VModelPath: path handed to ``wrapper.loadDoc2VecModel``.
        similarityOutput: path of the UTF-8 text file that is (over)written.

    Note:
        The date entries are assumed to be datetime-like, since their
        difference must provide ``total_seconds()`` -- confirm against
        ``createESDict``.
    """
    wrapper = doc2VecWrapper()
    wrapper.loadDoc2VecModel(dmD2VModelPath)

    #init lemmas & stopwords
    lemmaDict = initLematizer()
    stopwords = createStopwordsSet()
    esDict = createESDict()

    #a day and a half, in seconds
    period = 60 * 60 * 24 * 1.5

    #opened before the expensive embedding step so that a bad output path
    #fails early; the context manager closes it even if a later step raises
    with open(similarityOutput, 'w', encoding='utf8') as txtSimilarity:
        #only care about labels that are in the esDict; filtering once
        #yields the same pairs, in the same order, as filtering every
        #combination of the full tag set
        indexedLabels = [
            label for label in wrapper.d2vModel.docvecs.doctags
            if label in esDict
        ]

        #memorize word embeddings per label
        wordEmbedding = {}
        for label in indexedLabels:
            content = esDict[label][3]
            wordEmbedding[label] = wrapper.createDocumentEmbeddingW2V(
                content, lemmaDict, stopwords)
        print("finished generating word embeddings for documents")

        print("starting pairwise comparison")
        #counts the cross-publication pairs examined so far (progress output)
        i = 0
        for label1, label2 in combinations(indexedLabels, r=2):
            entry1 = esDict[label1]
            entry2 = esDict[label2]

            #skip pairs coming from the same publication
            if entry1[1] == entry2[1]:
                continue

            i += 1
            if i % 500000 == 0:
                print(i)

            if abs(entry1[2] - entry2[2]).total_seconds() < period:
                similarity = compareVectors(wordEmbedding[label1],
                                            wordEmbedding[label2])
                txtSimilarity.write(label1 + " " + label2 + " " +
                                    str(similarity) + "\n")
def createBTInput(timeThreshold,
                  similarityThreshold,
                  similarityFile='similarity.txt',
                  rankingOutput='ranking.csv'):
    """Turn a pairwise similarity file into a CSV of publication match-ups.

    Each line of ``similarityFile`` is expected to hold three
    whitespace-separated fields: two labels and a similarity. Both labels
    are looked up in the dict returned by ``createESDict()``. A row
    ``"<publicationsDict[pub1]>,<publicationsDict[pub2]>,<winner>"`` is
    written when the two publications (entry index 1) differ, the
    publication dates (entry index 2) are less than ``timeThreshold``
    seconds apart and the similarity exceeds ``similarityThreshold``.
    ``winner`` is 1 when the first date is strictly later than the second,
    otherwise 2.

    ``publicationsDict`` is a module-level name defined outside this
    function; its values are concatenated with strings here.

    Args:
        timeThreshold: maximum distance between publication dates, in
            seconds (exclusive).
        similarityThreshold: minimum similarity (exclusive).
        similarityFile: path of the UTF-8 input file.
        rankingOutput: path of the UTF-8 CSV file that is (over)written.

    Raises:
        KeyError: if a label from the file is missing from the esDict.
        ValueError: if a non-blank line does not hold exactly three fields
            or its similarity is not a float.
    """
    esDict = createESDict()

    #the input is opened first so that a missing similarity file does not
    #truncate an existing ranking; both handles are closed by the context
    #manager even when a line fails to parse
    with open(similarityFile, encoding='utf8') as txtSimilarity, \
            open(rankingOutput, 'w', encoding='utf8') as csvOutput:
        for i, line in enumerate(txtSimilarity):
            if i % 500000 == 0:
                print(i)

            fields = line.split()
            #tolerate blank lines (e.g. a trailing newline) instead of
            #failing on the three-way unpack below
            if not fields:
                continue

            label1, label2, similarity = fields
            similarity = float(similarity)

            entry1 = esDict[label1]
            entry2 = esDict[label2]
            publication1 = entry1[1]
            publication2 = entry2[1]

            #ignore if the similar articles come from the same publication
            #shouldn't fall through this too many times :)
            if publication1 == publication2:
                continue

            pubDate1 = entry1[2]
            pubDate2 = entry2[2]

            #filter using thresholds
            if abs(pubDate1 - pubDate2).total_seconds() < timeThreshold \
                    and similarity > similarityThreshold:

                #NOTE(review): the later-published article is marked as
                #the winner and ties go to 2 -- confirm this is intended
                winner = 1 if pubDate1 > pubDate2 else 2
                csvOutput.write(publicationsDict[publication1] + "," +
                                publicationsDict[publication2] + "," +
                                str(winner) + "\n")
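# Example #6 / 0 -- snippet-separator residue left over from extraction; the
# code below is the tail of a function whose beginning is not included here.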
                             encoding='utf8')

    for i, articleLabel in enumerate(d2vModel.docvecs.doctags):
        #check if data in elastic
        if articleLabel in esDict:

            publication, title, headline, _ = extractData(esDict, articleLabel)

            #write in plot relevant info, consider adding pub date as well?
            metaPlot.write("%s\t%s\t%s\n" % (title, publication, headline))
            tensorReplacement.write(tensorLines[i])

    metaPlot.close()
    tensorPlot.close()


if __name__ == "__main__":
    # Script entry point: builds the esDict once and passes it, together
    # with the dbow model directory, to createD2VTensorflowTSV.

    #doc2vec path for dm = 1, distributed memory model used
    #(only referenced by the commented-out createW2VTensorflowTSV call below)
    dmD2VModelPath = 'doc2vec_models/classic, dm + dm_concat = 0/'

    #doc2vec path for dm = 0, distributed bag of words model used
    dbowD2VModelPath = 'doc2vec_models/dbow + dm_concat = 0/'

    #load doc2vec model and save it as word2vec for the word2vec2tensor.py
    #(one-off conversion step, kept disabled)
    #d2vModel = gensim.models.doc2vec.Doc2Vec.load(dbowD2VModelPath + 'doc2vec_model')
    #d2vModel.save_word2vec_format(dbowD2VModelPath + 'doc_tensor.w2v', doctag_vec=True, word_vec=False)

    esDict = createESDict()
    createD2VTensorflowTSV(esDict, dbowD2VModelPath)
    #alternative export for the dm model, currently disabled
    #createW2VTensorflowTSV(esDict, dmD2VModelPath + "doc2vec_model")