Beispiel #1
0
    def __init__(self, directory=None, train_size=0.3, textweight=0.8, size=300, seed=1, workers=1, passes=10, dm=0, min_count=3):
        """Build the initial Doc2Vec and DeepWalk models, then run ``self.train``.

        Args:
            directory: Handed to ``net.readNetworkData`` and ``net.getdeepwalks``,
                and forwarded to ``self.train``.
            train_size: Passed to ``train_test_split``. When it is greater than 0
                the training documents get an extra ``'Label_' + labels`` tag and
                both initial models are evaluated on the split.
            textweight: Forwarded to ``self.train`` as ``weight``. A value above
                0.5 makes the Doc2Vec model ``self.model``; otherwise the
                Word2Vec model is used.
            size: Forwarded as ``size`` to both trainers.
            seed: Used as ``random_state`` for the train/test split.
            workers: Forwarded as ``workers`` to both trainers.
            passes: Forwarded to both trainers and to ``self.train``.
            dm: Forwarded to ``net.trainDoc2Vec``.
            min_count: Forwarded to ``net.trainDoc2Vec``.
        """
        # Load the corpus; the middle return value is not used in this method.
        documents, _doc_index, class_labels = net.readNetworkData(directory)
        n_classes = len(class_labels)
        print('%d documents, %d classes, training ratio=%f' % (len(documents), n_classes, train_size))
        print('%d classes' % n_classes)

        # Assemble the Doc2Vec training corpus.
        use_labels = train_size > 0
        if not use_labels:
            # No label information available: purely unsupervised learning.
            corpus = documents[:]
        else:
            print('Adding Label Information')
            train, test = train_test_split(documents, train_size=train_size, random_state=seed)

            # Inject supervision into the training documents: each one gets its
            # class label as an additional tag, so the label is trained as one
            # more document ID alongside the document's own tags.
            # The documents are modified in place, so `train` carries the extra
            # tag when it is evaluated further down.
            for doc in train:
                doc.tags.append('Label_' + doc.labels)
            corpus = list(train) + list(test)

        # Text model first, then the structure model trained on the walks.
        doc_model = net.trainDoc2Vec(corpus, workers=workers, size=size, dm=dm, passes=passes, min_count=min_count)

        raw_walks, _net_walks = net.getdeepwalks(directory, number_walks=20, walk_length=8)
        walk_model = net.trainWord2Vec(raw_walks, buildvoc=1, passes=passes, size=size, workers=workers)

        if use_labels:
            # Report how the two freshly initialised models perform on the split.
            print('Initialize Doc2Vec Model With Supervised Information...')
            Evaluation.evaluationEmbedModelFromTrainTest(doc_model, train, test, classifierStr='SVM')
            print('Initialize Deep Walk Model')
            Evaluation.evaluationEmbedModelFromTrainTest(walk_model, train, test, classifierStr='SVM')

        # Publish both models on the instance before the joint training step.
        self.d2v = doc_model
        self.w2v = walk_model

        self.train(doc_model, walk_model, directory, corpus, passes=passes, weight=textweight)

        # The final model is whichever side the text weight favours.
        self.model = doc_model if textweight > 0.5 else walk_model
Beispiel #2
0
# Evaluate the Doc2Vec baseline with an SVM classifier on the train/test split.
# `doc2vec_model`, `train`, `test` and `Evaluation` come from earlier in the
# script (not visible in this excerpt).
# NOTE(review): judging by the target names, the helper returns
# (accuracy, macro-F1, micro-F1) -- confirm against Evaluation.
print('Classification Performance on Doc2Vec Model')
doc2vec_acc, doc2vec_macro_f1, doc_2vec_micro_f1 = \
     Evaluation.evaluationEmbedModelFromTrainTest(doc2vec_model, train, test, classifierStr='SVM')

print("##################")

#### Baseline 2, Deep Walk Model
print("##################")
print("Baseline 2, Deep Walk Model")
# Generate walks for the network stored under `directory`
# (number_walks=20, walk_length=8); only `raw_walks` is used in this excerpt.
raw_walks, netwalks = net.getdeepwalks(directory,
                                       number_walks=20,
                                       walk_length=8)
# Train a Word2Vec model on the walks.
# NOTE(review): sg=1 presumably selects skip-gram as in gensim -- confirm in
# net.trainWord2Vec.
deepwalk_model = net.trainWord2Vec(raw_walks,
                                   buildvoc=1,
                                   sg=1,
                                   passes=passes,
                                   size=numFea,
                                   workers=cores)
print('Classification Performance on DeepWalk Model')
# NOTE(review): the DeepWalk scores are stored in the doc2vec_* names, which
# overwrites the Doc2Vec results assigned above. This looks like a copy-paste
# leftover; check that nothing later reads these names before renaming them.
doc2vec_acc, doc2vec_macro_f1, doc_2vec_micro_f1 = \
    Evaluation.evaluationEmbedModelFromTrainTest(deepwalk_model, train, test, classifierStr='SVM')

print("##################")

### Baseline 3, D2V+DW
print("##################")
print("Baseline 3, Simple Combination of DeepWalk + Doc2Vec")

# Look up each document's Doc2Vec vector by its first tag, separately for the
# training and the test documents.
d2v_train_vecs = [doc2vec_model.docvecs[doc.tags[0]] for doc in train]
d2v_test_vecs = [doc2vec_model.docvecs[doc.tags[0]] for doc in test]