def checkIdenticals():
    """Find 2011 papers whose WOS ID appears in both the labelled ("old")
    and unlabelled ("new") data sets, then report how many unlabelled
    records remain once those duplicates are removed.

    Reads data via the project module ``ptd``; prints diagnostics and
    returns nothing. The unlabelled data itself is not modified.
    """
    old = ptd.getDataWithMeta()
    # NOTE(review): old data is filtered with int 2011 but new data with the
    # string "2011" — presumably the two sources store the year with
    # different dtypes; confirm against the loaders before unifying.
    old_2011 = old[old.Publication_year == 2011]
    old_2011_wos = old_2011.WOS.tolist()

    new = ptd.getUnlabelledData()
    print("len of new data: {}".format(len(new)))
    new_2011 = new[new.Publication_year == "2011"]
    new_2011_wos = new_2011.WOS.tolist()

    print("old length 2011: {}".format(len(old_2011_wos)))
    print("new length 2011: {}".format(len(new_2011_wos)))
    # Fixed: these were Python 2 `print expr` statements, a syntax error
    # under Python 3 and inconsistent with the print() calls above.
    print(old_2011_wos[:5])
    print(new_2011_wos[:5])

    # Set membership replaces the original O(n*m) nested loop; behavior is
    # the same assuming WOS IDs are unique within the old 2011 slice.
    old_2011_set = set(old_2011_wos)
    identical = []
    for wos in new_2011_wos:
        if wos in old_2011_set:
            print("{}\n{}\n".format(wos, wos))
            identical.append(wos)
    print("Number of identical papers = {}".format(len(identical)))

    new_data = ptd.getUnlabelledDataAsList()
    # NOTE(review): message says "old" but this is the unlabelled ("new")
    # data — kept verbatim to preserve runtime output.
    print("len of old before: {}".format(len(new_data)))
    identical_set = set(identical)
    new_data_after = [dic for dic in new_data if dic["WOS"] not in identical_set]
    print("len of old after: {}".format(len(new_data_after)))
# ---------------------------------------------------------------------------
# Script section: parameters, data loading, and classifier setup.
#   * store_to_file — 0/1 flag controlling whether results are persisted
#     (consumed later in the file; usage not visible in this chunk).
#   * train_data — concatenation of the project's training, validation and
#     test splits (all three are used for final training here).
#   * unlabelled_data — the data the trained classifier will label.
#   * best_classifier — LinearSVC with C=1.178, presumably the value found
#     by an earlier hyper-parameter search; confirm against tuning code.
#   * pipeline — a CountVectorizer (word uni+bigrams, no stop-word removal,
#     unlimited vocabulary) feeding further steps; the Pipeline([...]) call
#     continues past the end of this chunk, so it is left untouched here.
# ---------------------------------------------------------------------------
################# # Parameters # ################# store_to_file = 0 ################ # Load Data # ################ print("Loading data...") train_data = pd.concat([ptd.getTrainingData(), ptd.getValidationData(), ptd.getTestData()]) unlabelled_data = ptd.getUnlabelledData() ######################### # Train classifier # ######################### print("Training classifier") best_classifier = LinearSVC(C=1.178) pipeline = Pipeline([('vect', CountVectorizer(decode_error='ignore', analyzer='word', ngram_range=(1, 2), stop_words= None, max_features=None)),