# Example #1
class TestWord2VecAnnoyIndexer(unittest.TestCase):
    """Exercise AnnoyIndexer approximate lookups against a Word2Vec model."""

    def setUp(self):
        # Annoy is an optional dependency: skip the whole class if missing.
        try:
            import annoy  # noqa: F401
        except ImportError:
            raise unittest.SkipTest("Annoy library is not available")

        from gensim.similarities.index import AnnoyIndexer

        self.model = word2vec.Word2Vec(texts, min_count=1)
        self.model.init_sims()
        self.index = AnnoyIndexer(self.model, 10)
        self.vector = self.model.syn0norm[0]

    def testVectorIsSimilarToItself(self):
        # The nearest neighbour of a normalized vector must be the vector's
        # own word, with similarity exactly 1.0.
        expected_word = self.model.index2word[0]
        hit, score = self.index.most_similar(self.vector, 1)[0]

        self.assertEqual(hit, expected_word)
        self.assertEqual(score, 1.0)

    def testApproxNeighborsMatchExact(self):
        # Annoy-backed and exact searches should agree on the top-5 ranking.
        via_index = self.model.most_similar([self.vector], topn=5, indexer=self.index)
        via_exact = self.model.most_similar(positive=[self.vector], topn=5)

        self.assertEqual(
            [pair[0] for pair in via_index],
            [pair[0] for pair in via_exact],
        )
# Example #2
class TestDoc2VecAnnoyIndexer(unittest.TestCase):
    """Check that Annoy-backed doc lookups agree with exact Doc2Vec search."""

    def setUp(self):
        # Skip every test in this class when annoy cannot be imported.
        try:
            import annoy  # noqa: F401
        except ImportError:
            raise unittest.SkipTest("Annoy library is not available")

        from gensim.similarities.index import AnnoyIndexer

        self.model = doc2vec.Doc2Vec(sentences, min_count=1)
        self.model.init_sims()
        self.index = AnnoyIndexer(self.model, 300)
        self.vector = self.model.docvecs.doctag_syn0norm[0]

    def testDocumentIsSimilarToItself(self):
        # Document 0's normalized vector must return document 0 itself.
        top_hit = self.index.most_similar(self.vector, 1)[0]

        self.assertEqual(top_hit[0], 0)
        self.assertEqual(top_hit[1], 1.0)

    def testApproxNeighborsMatchExact(self):
        # The approximate top-5 must coincide with the exact top-5 here.
        annoy_hits = self.model.docvecs.most_similar(
            [self.vector], topn=5, indexer=self.index)
        exact_hits = self.model.docvecs.most_similar(
            positive=[self.vector], topn=5)

        self.assertEqual(
            [hit[0] for hit in annoy_hits],
            [hit[0] for hit in exact_hits],
        )
# Example #3
def create_sim_dict(file, vectors, min_sim=0.55, topn=10, num_trees=200):
    """Build {word: [(neighbor, similarity), ...]} via Annoy and pickle it.

    Only neighbours with similarity strictly above ``min_sim`` are kept.
    The result is written to ``file`` as a pickle.
    """
    indexer = AnnoyIndexer(vectors, num_trees=num_trees)
    sim_dict = {
        word: [hit for hit in indexer.most_similar(vectors.get_vector(word), topn)
               if hit[1] > min_sim]
        for word in messages.pbar(vectors.vocab)
    }
    with open(file, 'wb') as fileout:
        pickle.dump(sim_dict, fileout)
class TestDoc2VecAnnoyIndexer(unittest.TestCase):
    """AnnoyIndexer tests for Doc2Vec: similarity plus save/load round-trip."""

    def setUp(self):
        # The optional annoy package gates the entire class.
        try:
            import annoy  # noqa:F401
        except ImportError:
            raise unittest.SkipTest("Annoy library is not available")

        from gensim.similarities.index import AnnoyIndexer

        self.model = doc2vec.Doc2Vec(sentences, min_count=1)
        self.model.init_sims()
        self.index = AnnoyIndexer(self.model, 300)
        self.vector = self.model.docvecs.doctag_syn0norm[0]

    def testDocumentIsSimilarToItself(self):
        # A document must be its own nearest neighbour with similarity 1.0.
        doc_id, score = self.index.most_similar(self.vector, 1)[0]

        self.assertEqual(doc_id, 0)
        self.assertEqual(score, 1.0)

    def testApproxNeighborsMatchExact(self):
        # Approximate and exact rankings should agree for the top 5 docs.
        via_annoy = self.model.docvecs.most_similar(
            [self.vector], topn=5, indexer=self.index)
        via_exact = self.model.docvecs.most_similar(
            positive=[self.vector], topn=5)

        self.assertEqual(
            [entry[0] for entry in via_annoy],
            [entry[0] for entry in via_exact],
        )

    def testSave(self):
        # Saving produces the index file and the companion '.d' metadata file.
        fname = testfile()
        self.index.save(fname)

        self.assertTrue(os.path.exists(fname))
        self.assertTrue(os.path.exists(fname + '.d'))

    def testLoadNotExist(self):
        # Loading from a path that does not exist must raise IOError.
        from gensim.similarities.index import AnnoyIndexer

        self.test_index = AnnoyIndexer()
        self.assertRaises(IOError, self.test_index.load, fname='test-index')

    def testSaveLoad(self):
        # A saved index reloaded into a fresh AnnoyIndexer must be equivalent.
        from gensim.similarities.index import AnnoyIndexer

        fname = testfile()
        self.index.save(fname)

        self.index2 = AnnoyIndexer()
        self.index2.load(fname)
        self.index2.model = self.model

        self.assertEqual(self.index.index.f, self.index2.index.f)
        self.assertEqual(self.index.labels, self.index2.labels)
        self.assertEqual(self.index.num_trees, self.index2.num_trees)
# Example #5
class TestDoc2VecAnnoyIndexer(unittest.TestCase):
    """Doc2Vec/Annoy integration: similarity checks and persistence tests."""

    def setUp(self):
        # Bail out of the whole class when annoy is not installed.
        try:
            import annoy  # noqa: F401
        except ImportError:
            raise unittest.SkipTest("Annoy library is not available")

        from gensim.similarities.index import AnnoyIndexer

        self.model = doc2vec.Doc2Vec(sentences, min_count=1)
        self.model.init_sims()
        self.index = AnnoyIndexer(self.model, 300)
        self.vector = self.model.docvecs.doctag_syn0norm[0]

    def testDocumentIsSimilarToItself(self):
        # Self-similarity: the closest document is the query document itself.
        nearest = self.index.most_similar(self.vector, 1)
        doc_label, doc_score = nearest[0]

        self.assertEqual(doc_label, 0)
        self.assertEqual(doc_score, 1.0)

    def testApproxNeighborsMatchExact(self):
        # Compare Annoy-backed ranking with the exact brute-force ranking.
        approx = self.model.docvecs.most_similar(
            [self.vector], topn=5, indexer=self.index)
        exact = self.model.docvecs.most_similar(
            positive=[self.vector], topn=5)

        self.assertEqual([a[0] for a in approx], [e[0] for e in exact])

    def testSave(self):
        # Both the index file and its '.d' side file must be written.
        target = testfile()
        self.index.save(target)

        self.assertTrue(os.path.exists(target))
        self.assertTrue(os.path.exists(target + '.d'))

    def testLoadNotExist(self):
        # Loading a missing index must raise IOError.
        from gensim.similarities.index import AnnoyIndexer

        self.test_index = AnnoyIndexer()
        self.assertRaises(IOError, self.test_index.load, fname='test-index')

    def testSaveLoad(self):
        # Round-trip: save, load into a new indexer, compare structure.
        from gensim.similarities.index import AnnoyIndexer

        target = testfile()
        self.index.save(target)

        self.index2 = AnnoyIndexer()
        self.index2.load(target)
        self.index2.model = self.model

        self.assertEqual(self.index.index.f, self.index2.index.f)
        for attr in ('labels', 'num_trees'):
            self.assertEqual(getattr(self.index, attr), getattr(self.index2, attr))
# Example #6
class TestWord2VecAnnoyIndexer(unittest.TestCase):
    """Word2Vec/Annoy tests covering similarity queries and persistence."""

    def setUp(self):
        # annoy is optional; the class is skipped entirely without it.
        try:
            import annoy  # noqa: F401
        except ImportError:
            raise unittest.SkipTest("Annoy library is not available")

        from gensim.similarities.index import AnnoyIndexer

        self.model = word2vec.Word2Vec(texts, min_count=1)
        self.model.init_sims()
        self.index = AnnoyIndexer(self.model, 10)
        self.vector = self.model.wv.syn0norm[0]

    def testVectorIsSimilarToItself(self):
        # A word's own normalized vector must rank that word first, sim 1.0.
        expected = self.model.index2word[0]
        word, similarity = self.index.most_similar(self.vector, 1)[0]

        self.assertEqual(word, expected)
        self.assertEqual(similarity, 1.0)

    def testApproxNeighborsMatchExact(self):
        # Approximate and exact top-5 word rankings should be identical.
        approx = self.model.most_similar(
            [self.vector], topn=5, indexer=self.index)
        exact = self.model.most_similar(
            positive=[self.vector], topn=5)

        self.assertEqual([pair[0] for pair in approx],
                         [pair[0] for pair in exact])

    def testSave(self):
        # Saving must write both the 'index' file and its '.d' metadata.
        self.index.save('index')
        for path in ('index', 'index.d'):
            self.assertTrue(os.path.exists(path))

    def testLoadNotExist(self):
        # IOError is expected when the index file does not exist.
        from gensim.similarities.index import AnnoyIndexer

        self.test_index = AnnoyIndexer()
        self.assertRaises(IOError, self.test_index.load, fname='test-index')

    def testSaveLoad(self):
        # Save then reload; the reloaded index must match structurally.
        from gensim.similarities.index import AnnoyIndexer

        self.index.save('index')

        self.index2 = AnnoyIndexer()
        self.index2.load('index')
        self.index2.model = self.model

        self.assertEqual(self.index.index.f, self.index2.index.f)
        self.assertEqual(self.index.labels, self.index2.labels)
        self.assertEqual(self.index.num_trees, self.index2.num_trees)
# Example #7
class TestWord2VecAnnoyIndexer(unittest.TestCase):
    """Similarity and save/load tests for AnnoyIndexer over Word2Vec."""

    def setUp(self):
        # Without the optional annoy package none of these tests can run.
        try:
            import annoy  # noqa: F401
        except ImportError:
            raise unittest.SkipTest("Annoy library is not available")

        from gensim.similarities.index import AnnoyIndexer

        self.model = word2vec.Word2Vec(texts, min_count=1)
        self.model.init_sims()
        self.index = AnnoyIndexer(self.model, 10)
        self.vector = self.model.syn0norm[0]

    def testVectorIsSimilarToItself(self):
        # Querying with a word's vector must return that word at sim 1.0.
        label = self.model.index2word[0]
        best = self.index.most_similar(self.vector, 1)[0]

        self.assertEqual(best[0], label)
        self.assertEqual(best[1], 1.0)

    def testApproxNeighborsMatchExact(self):
        # Exact search is the ground truth for the approximate search.
        approx_hits = self.model.most_similar([self.vector], topn=5,
                                              indexer=self.index)
        exact_hits = self.model.most_similar(positive=[self.vector], topn=5)

        self.assertEqual([h[0] for h in approx_hits],
                         [h[0] for h in exact_hits])

    def testSave(self):
        # The save writes 'index' plus the '.d' companion file.
        self.index.save('index')

        self.assertTrue(os.path.exists('index'))
        self.assertTrue(os.path.exists('index.d'))

    def testLoadNotExist(self):
        # A missing file must surface as IOError from load().
        from gensim.similarities.index import AnnoyIndexer

        self.test_index = AnnoyIndexer()
        self.assertRaises(IOError, self.test_index.load, fname='test-index')

    def testSaveLoad(self):
        # Persist the index, reload it, and compare the key attributes.
        from gensim.similarities.index import AnnoyIndexer

        self.index.save('index')

        self.index2 = AnnoyIndexer()
        self.index2.load('index')
        self.index2.model = self.model

        self.assertEqual(self.index.index.f, self.index2.index.f)
        self.assertEqual(self.index.labels, self.index2.labels)
        self.assertEqual(self.index.num_trees, self.index2.num_trees)
# Example #8
def predict(text):
    """Infer a document vector for *text* and print its 2 nearest neighbours.

    Loads the saved Doc2Vec model and Annoy index from ../models/, prints the
    per-word vectors and the inferred vector for debugging, then prints the
    top-2 approximate matches from the index.
    """
    model = doc2vec.Doc2Vec.load('../models/doc2vec.model')
    indexer = AnnoyIndexer()
    indexer.load('../models/dv_index')
    indexer.model = model
    # Per-word vectors are printed only for debugging; inference below uses
    # the raw token sequence. NOTE(review): model[word] raises KeyError for
    # out-of-vocabulary tokens — confirm callers only pass known vocabulary.
    new_vec = [model[word] for word in transform_text(text, strip=False)]
    print(new_vec)
    sv = model.infer_vector(transform_text(text, strip=False))
    print(sv)
    print(indexer.most_similar(sv, 2))
class Recommander(object):
    """Recommend similar papers, patents, projects and experts for a text.

    Wraps a word2vec model plus three pre-built Annoy indexes (papers,
    patents, projects) and per-type feature tables used for attribute
    filtering. NOTE(review): this class is Python 2 code (`print`
    statements, `ConfigParser`); it will not run unmodified on Python 3.
    """

    def __init__(self, vec_file, pap, pat, pro):
        # vec_file: binary word2vec vectors; pap/pat/pro: saved Annoy index paths.
        # self.wm = gensim.models.KeyedVectors.load_word2vec_format(vec_file,binary=True)
        self.wm = gensim.models.word2vec.Word2Vec.load_word2vec_format(
            vec_file, binary=True)
        self.paper_index = AnnoyIndexer()
        self.paper_index.load(pap)
        self.patent_index = AnnoyIndexer()
        self.patent_index.load(pat)
        self.project_index = AnnoyIndexer()
        self.project_index.load(pro)
        self.t2v = Convert2Vec(self.wm)
        self.cuttor = FilterCut()
        self.db = DB()
        self.featureIndex = self.buildFeatureIndex()

    def buildFeatureIndex(self):
        """Load the three per-type feature files into one lookup dict.

        Returns {'paper'|'patent'|'project': parsed feature index} — see
        loadFeature for the per-type structure.
        """
        # NOTE(review): the file handles are never closed; tolerable on this
        # one-shot startup path, but `with` blocks would be safer.
        paperFeature = open(
            "/testdata400/data/recommender/data0828/feature/paper_feature.txt",
            'r')
        patentFeature = open(
            "/testdata400/data/recommender/data0828/feature/patent_feature.txt",
            'r')
        projectFeature = open(
            "/testdata400/data/recommender/data0828/feature/project_feature.txt",
            'r')
        featureIndex = {}
        featureIndex['paper'] = self.loadFeature(paperFeature)
        featureIndex['patent'] = self.loadFeature(patentFeature)
        featureIndex['project'] = self.loadFeature(projectFeature)
        return featureIndex

    def loadFeature(self, file):
        """Parse a tab-separated feature file into attribute -> value -> [doc ids].

        Each line is split on tabs; column 0 is the doc id and columns 1-4
        are indexed under 'field', 'type', 'province' and 'unit' respectively.
        """
        # NOTE(review): `file` shadows the builtin and is rebound to the line
        # list; kept as-is to preserve the existing call interface.
        file = file.readlines()
        index = {}
        index['field'] = {}
        index['type'] = {}
        index['province'] = {}
        index['unit'] = {}
        for line in file:
            feature = line.split('\t')
            if feature[1] not in index['field']:
                index['field'][feature[1]] = []
            index['field'][feature[1]].append(feature[0])
            if feature[2] not in index['type']:
                index['type'][feature[2]] = []
            index['type'][feature[2]].append(feature[0])
            if feature[3] not in index['province']:
                index['province'][feature[3]] = []
            index['province'][feature[3]].append(feature[0])
            if feature[4] not in index['unit']:
                index['unit'][feature[4]] = []
            index['unit'][feature[4]].append(feature[0])
        return index

    # Filter papers, projects and patents by feature attributes.
    def filter(self, typee, topDocs, filterParams, topN):
        """Keep at most topN entries of topDocs whose features match filterParams.

        filterParams is [field, type, province, unit]; an empty string or
        '-1' in a slot disables that filter. topDocs is a list of
        (doc_id, similarity) pairs; original ordering is preserved.
        """
        topDocIds = [i for i, j in topDocs]
        if not (filterParams[0] == '' or filterParams[0] == '-1' or typee
                == 'project'):  # field; projects have no type and are not filtered; '' or '-1' means no filtering
            if filterParams[0] not in self.featureIndex[typee]['field']:
                topDocIds = []
            topDocIds = list(
                set(topDocIds).intersection(
                    self.featureIndex[typee]['field'][filterParams[0]]))
        if not (filterParams[1] == '' or filterParams[1] == '-1'):  # type
            if filterParams[1] not in self.featureIndex[typee]['type']:
                topDocIds = []
            topDocIds = list(
                set(topDocIds).intersection(
                    self.featureIndex[typee]['type'][filterParams[1]]))
        if not (filterParams[2] == '' or filterParams[2] == '-1'):  # province
            if filterParams[2] not in self.featureIndex[typee]['province']:
                topDocIds = []
            topDocIds = list(
                set(topDocIds).intersection(
                    self.featureIndex[typee]['province'][filterParams[2]]))
        if not (filterParams[3] == '' or filterParams[3] == '-1'):  # unit
            if filterParams[3] not in self.featureIndex[typee]['unit']:
                topDocIds = []
            topDocIds = list(
                set(topDocIds).intersection(
                    self.featureIndex[typee]['unit'][filterParams[3]]))
        result = []
        # Walk topDocs in original order so similarity ranking is preserved.
        for i in topDocs:
            if i[0] in topDocIds:
                result.append(i)
            if len(result) == topN:
                break
        return result

    # Variant that does not filter by region and returns all matching
    # documents, not just the top-n.
    # def filterForExpert(self, typee, topDocs, filterParams):
    #     topDocIds = [i for i,j in topDocs]
    #     if not (filterParams[0] == '' or filterParams[
    #         0] == '-1' or typee == 'project'):  # field; projects have no type and are not filtered; '' or '-1' means no filtering
    #         if filterParams[0] not in self.featureIndex[typee]['field']:
    #             topDocIds = []
    #         topDocIds = list(set(topDocIds).intersection(self.featureIndex[typee]['field'][filterParams[0]]))
    #     if not (filterParams[1] == '' or filterParams[1] == '-1'):  # type
    #         if filterParams[1] not in self.featureIndex[typee]['type']:
    #             topDocIds = []
    #         topDocIds = list(set(topDocIds).intersection(self.featureIndex[typee]['type'][filterParams[1]]))
    #     if not (filterParams[3] == '' or filterParams[3] == '-1'):  # unit
    #         if filterParams[3] not in self.featureIndex[typee]['unit']:
    #             topDocIds = []
    #         topDocIds = list(set(topDocIds).intersection(self.featureIndex[typee]['unit'][filterParams[3]]))
    #     result = []
    #
    #     topDocsMap = {}
    #     for i in range(len(topDocs)):
    #         topDocsMap[topDocs[i][0]]=topDocs[i][1]
    #     for id in topDocIds:
    #         listTemp = [id,topDocsMap[id]]
    #         result.append(listTemp)
    #     return result

    def most_similar_paper(self, text, topn=10):
        """Return the topn papers most similar to *text* via the Annoy index."""
        vec = self.t2v.text2v(text, self.cuttor)
        return self.paper_index.most_similar(vec, topn)

    def most_similar_patent(self, text, topn=10):
        """Return the topn patents most similar to *text* via the Annoy index."""
        vec = self.t2v.text2v(text, self.cuttor)
        return self.patent_index.most_similar(vec, topn)

    def most_similar_project(self, text, topn=10):
        """Return the topn projects most similar to *text* via the Annoy index."""
        vec = self.t2v.text2v(text, self.cuttor)
        return self.project_index.most_similar(vec, topn)

    def getSimExpertsIds(self, topDocs):
        """Collect candidate experts (authors) from the matched documents.

        topDocs maps a type ('paper'/'patent'/'project') to a list of
        (doc_id, similarity) pairs. Returns (expertMap, expertInfoOut):
        expertMap maps expert id -> list of weighted similarities;
        expertInfoOut maps expert id -> per-document debug entries of
        [type+rank, weighted similarity, author position].
        """
        expertInfoOut = {}
        expertMap = {}
        # Weight by author position: first author counts most.
        authorSeqWeiht = [1.0, 0.85, 0.7, 0.5]
        for typee in topDocs:
            order = {}
            order[typee] = {}
            k = 0
            for i, j in topDocs[typee]:
                order[typee][i] = k
                k = k + 1
            ids = [i for i, j in topDocs[typee]]
            docExpertIds = self.db.getAuthors(typee, ids)
            for id in docExpertIds:
                if not self.db.idInDB(typee, id):
                    print "docId:" + id + "is not in db"
                    continue
                expertIds = docExpertIds[id]
                qs = 1.0
                sim = qs
                # Look up this document's similarity among the matched docs.
                for i, j in topDocs[typee]:
                    if i == id:
                        sim = j * sim
                        break
                for i in range(len(expertIds)):
                    if i >= 4:  # consider only the first 4 authors of a work
                        break
                    if expertIds[i] not in expertInfoOut:
                        expertInfoOut[expertIds[i]] = []
                    expertInfoOut[expertIds[i]].append([
                        typee + str(order[typee][id]), sim * authorSeqWeiht[i],
                        i
                    ])
                    if expertIds[i] not in expertMap:
                        expertMap[expertIds[i]] = []
                    expertMap[expertIds[i]].append(sim * authorSeqWeiht[i])
        return expertMap, expertInfoOut

    # Experts are extracted from works. Some experts rank high without the
    # province filter but low once it is applied; to avoid that, works are
    # first matched without the region filter, experts extracted from them
    # and then filtered by region; if fewer than topN remain, the remaining
    # experts are taken from the region-filtered works.
    #
    # This function needs refactoring, but the requirements will most likely
    # change, so it is left as-is for now.
    def most_similar_expert(self, topPapers, topPatents, topProjects,
                            filterParams, expertTopN):
        """Rank experts from the filtered top papers/patents/projects.

        Reads LEN and COE from config.ini, scores each expert from the
        similarities of their works, prints a debug log via printOut and
        returns the top expertTopN (expert_id, score) pairs.
        """
        file = open("config.ini", 'r')
        config = ConfigParser.ConfigParser()
        config.readfp(file)
        LEN = int(config.get('global', 'len'))  # how many of an expert's works to score
        COE = float(config.get('global', 'coe'))  # coefficient applied to similarities from the second work onward
        topDocs = {}
        topDocs['paper'] = self.filter('paper', topPapers, filterParams, 50)
        topDocs['patent'] = self.filter('patent', topPatents, filterParams, 50)
        topDocs['project'] = self.filter('project', topProjects, filterParams,
                                         15)
        expertMap, expertInfoOut = self.getSimExpertsIds(
            topDocs)  # expert id is the key; list of work similarities is the value
        expertScoreMap = {}  # expert id is the key; the score is the value
        for expert in expertMap:
            # Best work counts fully; up to LEN-1 further works count at COE.
            expertMap[expert].sort(reverse=True)
            sim = expertMap[expert][0]
            for i in range(1, len(expertMap[expert])):
                if i >= LEN:
                    break
                sim = sim + COE * expertMap[expert][i]
            expertScoreMap[expert] = sim
        result = sorted(expertScoreMap.items(),
                        key=lambda item: item[1],
                        reverse=True)[0:expertTopN]
        out = []
        for i in result:
            if i[0] in expertInfoOut:
                out.append({i[0]: expertInfoOut[i[0]]})
                # out[i[0]]=expertInfoOut[i[0]]
        self.printOut(out, LEN)
        return result

    def printOut(self, out, l):
        """Write a timestamped debug log of the per-expert document details.

        out is a list of {expert_id: [doc-info entries]}; l caps how many
        entries per expert are logged (same LEN as in most_similar_expert).
        """
        name = str('log/' + time.strftime("%Y-%m-%d %H-%M-%S" +
                                          ".txt", time.localtime()))
        print name
        output = open(name, 'w')
        # First pass: keep only each expert's l best entries by similarity.
        # NOTE(review): `list` shadows the builtin; kept to avoid code changes.
        for expert in out:
            for i in expert:
                list = expert[i]
                expert[i] = sorted(list, key=lambda doc: doc[1],
                                   reverse=True)[0:l]
        # Second pass: write id, doc order, similarities and author positions.
        for expert in out:
            for i in expert:
                # print i  # author id
                output.write(i + '\n')
                list = expert[i]  # the doc-info entries for this expert
                docOrder = ''
                for j in list:
                    docOrder = docOrder + j[0] + '                  '
                # print docOrder
                output.write(docOrder + '\n')
                sim = ''
                for j in list:
                    sim = sim + str(j[1]) + '             '
                # print sim
                output.write(sim + '\n')
                expertOrder = ''
                for j in list:
                    expertOrder = expertOrder + str(
                        j[2]) + '                            '
                # print expertOrder
                output.write(expertOrder + '\n')
                output.write("\n")
        output.close()

    # def most_similar_expert(self, text, topDocs):
    #     expertMap = self.getSimExpertsIds(topDocs)  # expert id is the key; list of work similarities is the value
    #     expertScoreMap = {}  # expert id is the key; the score is the value
    #     for expert in expertMap:
    #         expertMap[expert].sort(reverse=True)
    #         sim = expertMap[expert][0]
    #         for i in range(1, len(expertMap[expert])):
    #             if i >= 4:
    #                 break
    #             sim = sim + 0.04 * expertMap[expert][i]
    #         expertScoreMap[expert] = sim
    #     return sorted(expertScoreMap.items(), key=lambda item: item[1], reverse=True)

    def get_model(self):
        """Return the loaded word2vec model."""
        return self.wm

    def get_cuttor(self):
        """Return the tokenizer/cutter instance."""
        return self.cuttor