Example #1
0
    def GetTfidf(self, dictionary, corpus):
        """Return the second row obtained by iterating a similarity index.

        A ``Similarity`` index is built from ``corpus`` under the output
        prefix ``'t_index'`` with ``len(dictionary)`` features, and the row
        at position 1 of its iteration is returned.

        Args:
            dictionary: Sized collection; only its length is used here.
            corpus: Indexable collection of documents; ``corpus[0]`` must
                exist.

        Returns:
            ``list(enumerate(row))`` for the row at position 1, i.e. a list
            of ``(position, value)`` pairs, or ``None`` when iteration
            yields fewer than two rows.
        """
        tfidf_model = models.TfidfModel(corpus)
        # NOTE(review): this transformed first document is never used below;
        # it is kept so the method still touches corpus[0] exactly as before.
        first_doc_weighted = tfidf_model[corpus[0]]
        # NOTE(review): the index is built from `corpus` directly, not from
        # the model's output -- confirm that this is intended.
        sim_index = Similarity('t_index', corpus, len(dictionary))
        # Presumably each row holds one indexed document's similarities to
        # every indexed document (gensim semantics) -- TODO confirm.
        for position, row in enumerate(sim_index):
            if position == 1:
                return list(enumerate(row))
        return None

    def GetSimilarities(self, dictionary, corpus):
        """Run the three similarity helpers and return two of their results.

        ``GetWord2Vec`` is called first and its return value is discarded;
        ``GetLsm`` and ``GetTfidf`` are then called in that order, all with
        the same ``dictionary`` and ``corpus``.

        Returns:
            A 2-tuple ``(GetTfidf result, GetLsm result)`` -- note the tuple
            order is the reverse of the call order.
        """
        # Invoked only for whatever it does internally; nothing is kept.
        self.GetWord2Vec(dictionary, corpus)
        lsm_result = self.GetLsm(dictionary, corpus)
        tfidf_result = self.GetTfidf(dictionary, corpus)
        return tfidf_result, lsm_result


# Script entry point: load documents via Reader, then construct a Similar
# object from them.
if __name__ == '__main__':
    # Reader is given two hard-coded, machine-specific CSV paths
    # (train.csv and test.csv).
    reader = Reader(r"C:\Users\CCrowe\Documents\Kaggle\Quora\train.csv",
                    r"C:\Users\CCrowe\Documents\Kaggle\Quora\test.csv")
    documents = reader.GetDocuments()
    print("documents retrieved")
    # NOTE(review): `texts` is not assigned anywhere in the visible part of
    # this file -- confirm it is defined elsewhere, otherwise this line
    # raises NameError.
    similar = Similar(documents, texts)