Example #1
    def load_train_features(self):
        """Dump the semantic text embedding of every training paper."""
        print("start to dump train features")

        features = {}
        out_feature_path = self.args['feature_train_path']
        for name in tqdm(self.train_author_data):
            # collect every paper id published under this author name
            pubs = []
            for authorid in self.train_author_data[name]:
                for pid in self.train_author_data[name][authorid]:
                    pubs.append(pid)

            name_pubs_raw = {pid: self.train_pub_data[pid] for pid in pubs}
            # write the relation files for this name, then load the
            # per-paper semantic text embeddings generated alongside them
            save_relation(name_pubs_raw, name)
            ptext_emb = load_data('gene', 'ptext_emb.pkl')

            for pid in pubs:
                features[pid] = ptext_emb[pid]
        # one column per paper id; rows are the embedding dimensions
        train_dataframe = pd.DataFrame(features)
        train_dataframe.to_pickle(out_feature_path)
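The dumped table can be read straight back with pandas. A minimal sketch, where 'data/feature_train.pkl' is a hypothetical stand-in for the configured self.args['feature_train_path']:

    import pandas as pd

    # Columns of the pickled DataFrame are paper ids; each column holds one
    # paper's embedding vector ('data/feature_train.pkl' is hypothetical).
    features = pd.read_pickle('data/feature_train.pkl')

    pid = features.columns[0]
    vec = features[pid].to_numpy()   # the paper's embedding as a numpy array
    print(pid, vec.shape)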
Example #2
    def load_train_local_features(self):
        """Dump the random-walk (local) embedding of every training paper."""
        print("start to dump train local features")

        local_features = {}
        out_feature_path = self.args['feature_local_train_path']
        for name in tqdm(self.train_author_data):
            # collect every paper id published under this author name
            pubs = []
            for authorid in self.train_author_data[name]:
                for pid in self.train_author_data[name][authorid]:
                    pubs.append(pid)

            name_pubs_raw = {pid: self.train_pub_data[pid] for pid in pubs}
            save_relation(name_pubs_raw, name)
            # build the heterogeneous graph from the relation files just written
            mpg = MetaPathGenerator()
            mpg.read_data("gene")

            rw_num = 10  # number of independent random-walk + word2vec runs
            cp = set()   # indices of papers that never appear in a walk

            for k in range(rw_num):
                # write a fresh corpus of weighted meta-path random walks
                mpg.generate_WMRW("gene/RW.txt", 5, 20)
                sentences = word2vec.Text8Corpus(r'gene/RW.txt')
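                # NOTE: the calls below assume the gensim 3.x API; in
                # gensim >= 4.0 'size' became 'vector_size' and word lookup
                # moved to model.wv ('pid in model.wv', 'model.wv[pid]').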
                model = word2vec.Word2Vec(sentences,
                                          size=128,
                                          negative=25,
                                          min_count=1,
                                          window=10)
                for i, pid in enumerate(pubs):
                    if pid in model:
                        emb = model[pid]
                    else:
                        # paper never appeared in the walk corpus
                        cp.add(i)
                        emb = np.zeros(128)
                    local_features.setdefault(pid, []).append(emb)

        # average the rw_num embeddings collected for each paper
        for pid in local_features:
            local_features[pid] = np.mean(np.array(local_features[pid]), axis=0)

        train_dataframe = pd.DataFrame(local_features)
        train_dataframe.to_pickle(out_feature_path)
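Each paper ends up with rw_num embeddings, one per random-walk/word2vec run, and its final local feature is their element-wise mean. A minimal self-contained sketch of just that pooling step, with a made-up paper id and random vectors:

    import numpy as np

    rw_num, dim = 10, 128
    rng = np.random.default_rng(0)

    # One 128-d vector per run for a single (hypothetical) paper id 'p1'.
    local_features = {'p1': [rng.normal(size=dim) for _ in range(rw_num)]}

    # Element-wise mean over the runs, as in the loop above.
    for pid in local_features:
        local_features[pid] = np.mean(np.array(local_features[pid]), axis=0)

    print(local_features['p1'].shape)  # (128,)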
Example #3
    def train_val(self):
        """Cluster the validation papers per author name, dump and score the result."""
        result = {}

        for name in tqdm(self.val_author_data):

            # collect all papers published under this author name
            pubs = list(self.val_author_data[name])

            name_pubs_raw = {pid: self.val_pub_data[pid] for pid in pubs}
            # write the relation files and per-paper text embeddings for this name
            save_relation(name_pubs_raw, name)

            # build the heterogeneous graph from the relation files just written
            mpg = MetaPathGenerator()
            mpg.read_data("gene")

            all_embs = []
            rw_num = 10  # number of independent random-walk + word2vec runs
            cp = set()   # indices of papers that never appear in a walk
            for k in range(rw_num):
                # write a fresh corpus of weighted meta-path random walks
                mpg.generate_WMRW("gene/RW.txt", 5, 20)
                sentences = word2vec.Text8Corpus(r'gene/RW.txt')
                # train word2vec over the walk corpus to embed each paper
                model = word2vec.Word2Vec(sentences,
                                          size=128,
                                          negative=25,
                                          min_count=1,
                                          window=10)
                embs = []
                for i, pid in enumerate(pubs):
                    if pid in model:
                        embs.append(model[pid])
                    else:
                        # paper never appeared in the walk corpus
                        cp.add(i)
                        embs.append(np.zeros(128))
                all_embs.append(embs)
            all_embs = np.array(all_embs)
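            # all_embs has shape (rw_num, len(pubs), 128):
            # one 128-d walk embedding per paper per run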

            # semantic features were precomputed into self.val_features; tcp
            # holds indices of papers flagged during text embedding (they join
            # the outlier set below)
            tcp = load_data('gene', 'tcp.pkl')

            tembs = [self.val_features[pid] for pid in pubs]

            # average the cosine-distance matrix of the walk embeddings
            # over the rw_num runs
            sk_sim = np.zeros((len(pubs), len(pubs)))
            for k in range(rw_num):
                sk_sim = sk_sim + pairwise_distances(all_embs[k],
                                                     metric="cosine")
            sk_sim = sk_sim / rw_num

            # cosine-distance matrix of the semantic features
            t_sim = pairwise_distances(tembs, metric="cosine")

            # fuse the two views, down-weighting the semantic distances
            w = 0.25
            sim = (sk_sim + w * t_sim) / (1 + w)
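            # (weighted mean: with w = 0.25, a pair at sk_sim = 0.1 and
            # t_sim = 0.5 fuses to (0.1 + 0.25 * 0.5) / 1.25 = 0.18, so the
            # walk distance dominates)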

            # density-based clustering over the precomputed fused distances;
            # label -1 marks noise points
            pre = DBSCAN(eps=0.2, min_samples=3,
                         metric="precomputed").fit_predict(sim)
            pre = np.array(pre)

            # set of outlier papers: DBSCAN noise points plus papers missing
            # from the walks (cp) or flagged during text embedding (tcp)
            outlier = {i for i in range(len(pre)) if pre[i] == -1}
            outlier |= cp
            outlier |= set(tcp)

            # threshold-based similarity matching: attach each outlier paper
            # to the cluster of its most similar non-outlier paper, or open a
            # new cluster if no pair score reaches the threshold
            paper_pair = generate_pair(pubs, outlier)
            paper_pair1 = paper_pair.copy()
            K = len(set(pre))
            for i in range(len(pre)):
                if i not in outlier:
                    continue
                j = np.argmax(paper_pair[i])
                while j in outlier:
                    paper_pair[i][j] = -1
                    j = np.argmax(paper_pair[i])
                if paper_pair[i][j] >= 1.5:
                    pre[i] = pre[j]
                else:
                    pre[i] = K
                    K = K + 1

            # also merge outlier pairs whose original pair score clears the threshold
            for ii, i in enumerate(outlier):
                for jj, j in enumerate(outlier):
                    if jj <= ii:
                        continue
                    if paper_pair1[i][j] >= 1.5:
                        pre[j] = pre[i]

            # group paper ids by predicted cluster label
            result[name] = []
            for label in set(pre):
                oneauthor = [pubs[idx] for idx, p in enumerate(pre) if p == label]
                result[name].append(oneauthor)

        # dump the predicted clusters and score them against the ground truth
        json.dump(result,
                  open(self.args['val_result'], 'w', encoding='utf-8'),
                  indent=4)
        f1 = f1_score(result, self.args)
        print("f1:", f1)