예제 #1
0
def checkIdenticals():
    """Report 2011 records of the unlabelled data that also occur in the labelled data.

    Loads the labelled ("old") data via ``ptd.getDataWithMeta()`` and the
    unlabelled ("new") data via ``ptd.getUnlabelledData()``, restricts both to
    publication year 2011 and matches records on their ``WOS`` value. Each
    new record whose ``WOS`` value occurs in the old data is printed and
    counted once, even if the old data contains that value several times.

    Afterwards the list form of the unlabelled data
    (``ptd.getUnlabelledDataAsList()``) is filtered to drop every entry whose
    ``"WOS"`` key matched, and the sizes before and after are printed.

    The function only prints diagnostics; it returns ``None``.
    """
    old = ptd.getDataWithMeta()
    # NOTE(review): the old data is filtered with the integer 2011 while the
    # new data is filtered with the string "2011" -- presumably the two
    # sources store Publication_year with different dtypes; confirm.
    old_2011_wos = old[old.Publication_year == 2011].WOS.tolist()

    new = ptd.getUnlabelledData()
    print("len of new data: {}".format(len(new)))
    new_2011_wos = new[new.Publication_year == "2011"].WOS.tolist()

    print("old length 2011: {}".format(len(old_2011_wos)))
    print("new length 2011: {}".format(len(new_2011_wos)))

    # The parenthesised single-argument form is valid both as a Python 2
    # print statement and as a Python 3 print() call.
    print(old_2011_wos[:5])
    print(new_2011_wos[:5])

    # Set lookup replaces the nested scan over the old list. Values that are
    # not equal to themselves (NaN for a missing ID) are left out so they can
    # never match, exactly as with a plain ``==`` comparison.
    old_ids = {wos for wos in old_2011_wos if wos == wos}

    identical = []
    for wos in new_2011_wos:
        if wos in old_ids:
            # Both lines show the same ID: the new record and its old twin.
            print("{}\n{}\n".format(wos, wos))
            identical.append(wos)

    print("Number of identical papers = {}".format(len(identical)))

    new_data = ptd.getUnlabelledDataAsList()

    # These sizes describe the unlabelled ("new") data, so label them as such.
    print("len of new before: {}".format(len(new_data)))
    identical_ids = set(identical)
    new_data_after = [dic for dic in new_data if dic["WOS"] not in identical_ids]

    print("len of new after: {}".format(len(new_data_after)))
예제 #2
0
#################
#    Parameters #
#################

# NOTE(review): presumably toggles writing results to disk; it is not read
# anywhere in the visible part of the script -- confirm against the rest.
store_to_file = 0


################
#    Load Data #
################

print("Loading data...")
# Concatenate the training, validation and test sets (in that order) into a
# single labelled data set; the unlabelled data is loaded separately.
train_data = pd.concat(
    [
        ptd.getTrainingData(),
        ptd.getValidationData(),
        ptd.getTestData(),
    ]
)
unlabelled_data = ptd.getUnlabelledData()


#########################
#   Train classifier    #
#########################

print("Training classifier")
# LinearSVC instance with its C parameter fixed at 1.178.
best_classifier = LinearSVC(C=1.178)

pipeline = Pipeline([('vect', CountVectorizer(decode_error='ignore',
                                              analyzer='word',
                                              ngram_range=(1, 2),
                                              stop_words= None,
                                              max_features=None)),