def run_MLP(word2vec_src):
    """
    Run the MLP + word embedding experiment.

    Loads the word2vec model, builds the train and test frames with
    ``load_vec``, fits a default ``MLPClassifier`` on the "Output" column
    against the "LinkTypeId" column, then prints the classification
    report, the accuracy computed by ``get_acc`` from the confusion matrix
    and the time spent in ``fit``.

    :param word2vec_src: str, path of the word2vec model
    :return: None
    """
    print("# word2vec:", word2vec_src)

    embedding = gensim.models.Word2Vec.load(word2vec_src)
    data = PaperData(word2vec=embedding)
    train_frame = load_vec(data, data.train_data, use_pkl=False)
    test_frame = load_vec(data, data.test_data, use_pkl=False)

    features_train = train_frame.loc[:, "Output"].tolist()
    labels_train = train_frame.loc[:, "LinkTypeId"].tolist()
    features_test = test_frame.loc[:, "Output"].tolist()
    labels_test = test_frame.loc[:, "LinkTypeId"].tolist()

    # Only the fit call is timed; prediction and reporting are excluded.
    classifier = MLPClassifier()
    fit_started = timeit.default_timer()
    classifier.fit(features_train, labels_train)
    fit_finished = timeit.default_timer()

    predictions = classifier.predict(features_test)
    report = metrics.classification_report(
        labels_test, predictions, labels=["1", "2", "3", "4"], digits=3)
    print(report)
    confusion = metrics.confusion_matrix(
        labels_test, predictions, labels=["1", "2", "3", "4"])
    print("accuracy ", get_acc(confusion))
    print("Model training time: ", fit_finished - fit_started)
def run_SVM(word2vec_src):
    """
    Run the SVM + word embedding experiment (the baseline method).

    Fits an RBF-kernel ``svm.SVC`` (gamma=0.005) on the "Output" column of
    the training frame against its "LinkTypeId" column, then prints the
    classification report and the accuracy computed by ``get_acc`` from
    the confusion matrix on the test frame.

    :param word2vec_src: str, path of the word2vec model
    :return: None
    """
    print("# word2vec:", word2vec_src)

    embedding = gensim.models.Word2Vec.load(word2vec_src)
    data = PaperData(word2vec=embedding)
    train_frame = load_vec(data, data.train_data, use_pkl=False)
    test_frame = load_vec(data, data.test_data, use_pkl=False)

    features_train = train_frame.loc[:, "Output"].tolist()
    labels_train = train_frame.loc[:, "LinkTypeId"].tolist()
    features_test = test_frame.loc[:, "Output"].tolist()
    labels_test = test_frame.loc[:, "LinkTypeId"].tolist()

    classifier = svm.SVC(kernel="rbf", gamma=0.005)
    classifier.fit(features_train, labels_train)
    predictions = classifier.predict(features_test)

    # NOTE(review): a disabled target_names argument listed the labels as
    # "Duplicates", "DirectLink", "IndirectLink", "Isolated" -- presumably
    # the meaning of "1".."4" in that order; confirm before re-enabling.
    report = metrics.classification_report(
        labels_test, predictions, labels=["1", "2", "3", "4"], digits=3)
    print(report)
    confusion = metrics.confusion_matrix(
        labels_test, predictions, labels=["1", "2", "3", "4"])
    print("accuracy ", get_acc(confusion))
def run_tuning_MLP(word2vec_src, repeats=1, fold=2, tuning=True):
    """
    Run the MLP learner over repeated stratified folds, optionally tuned.

    For every fold the training frame is split into a train part and a
    tune part. When ``tuning`` is true, ``tune_learner`` picks parameters
    on that split; otherwise the learner runs with no extra parameters.
    The learner is then built on the train part and the full test frame,
    and its ``learn`` result is accumulated. Timing and results are
    printed at the end.

    :param word2vec_src: str, path of word2vec model
    :param repeats: int, number of repeats
    :param fold: int, number of folds
    :param tuning: boolean, tuning or not.
    :return: None
    """
    print("# word2vec:", word2vec_src)
    word2vec_model = gensim.models.Word2Vec.load(word2vec_src)
    data = PaperData(word2vec=word2vec_model)
    # NOTE(review): other functions in this file call load_vec with
    # use_pkl=False rather than file_name=False -- confirm which keyword
    # load_vec actually expects.
    train_pd = load_vec(data, data.train_data, file_name=False)
    print(train_pd)
    test_pd = load_vec(data, data.test_data, file_name=False)
    learner = SK_MLP
    # Index 6 selects the optimisation goal, i.e. "Macro_F".
    goal = {
        0: "PD",
        1: "PF",
        2: "PREC",
        3: "ACC",
        4: "F",
        5: "G",
        6: "Macro_F",
        7: "Micro_F"
    }[6]
    print(goal)
    F = {}
    clfs = []

    # The test split does not depend on the fold, so extract it once.
    test_X = test_pd.loc[:, "Output"].values
    test_Y = test_pd.loc[:, "LinkTypeId"].values

    start = timeit.default_timer()
    for _ in range(repeats):  # repeat n times here
        # NOTE(review): this call shape (labels and fold count given to the
        # constructor, object iterated directly) matches the legacy
        # sklearn.cross_validation.StratifiedKFold -- confirm the import.
        kf = StratifiedKFold(train_pd.loc[:, "LinkTypeId"].values, fold,
                             shuffle=True)
        for train_index, tune_index in kf:
            print(train_pd)
            print(train_index)
            # Fold indices are row positions, so select with .iloc; the
            # former .ix indexer is deprecated/removed in pandas and was
            # label-based on integer indexes.
            train_data = train_pd.iloc[train_index]
            print(train_data)
            tune_data = train_pd.iloc[tune_index]
            train_X = train_data.loc[:, "Output"].values
            train_Y = train_data.loc[:, "LinkTypeId"].values
            tune_X = tune_data.loc[:, "Output"].values
            tune_Y = tune_data.loc[:, "LinkTypeId"].values
            if tuning:
                params, _ = tune_learner(learner, train_X, train_Y,
                                         tune_X, tune_Y, goal)
            else:
                params = {}
            clf = learner(train_X, train_Y, test_X, test_Y, goal)
            F = clf.learn(F, **params)
            clfs.append(clf)
    stop = timeit.default_timer()
    print("Model training time: ", stop - start)
    # NOTE(review): run_tuning_SVM calls print_results(clfs) with a single
    # argument -- confirm print_results accepts both call shapes.
    print_results(clfs, stop, start)
def preprocess_data(word2vec_src):
    """
    Load the word2vec model and build the training and testing frames.

    :param word2vec_src: the source file of word2vec
    :return: tuple ``(train_pd, test_pd)`` -- the results of ``load_vec``
        for the training data and the testing data respectively
    """
    print("# word2vec:", word2vec_src)
    paper = PaperData(word2vec=gensim.models.Word2Vec.load(word2vec_src))
    train_frame = load_vec(paper, paper.train_data, use_pkl=False)
    test_frame = load_vec(paper, paper.test_data, use_pkl=False)
    return train_frame, test_frame
def run_tuning_SVM(word2vec_src, repeats=1, fold=5, tuning=True):
    """
    Run the SVM learner over repeated stratified folds, optionally tuned.

    For every fold the training frame is split into a train part and a
    tune part. When ``tuning`` is true, ``tune_learner`` picks parameters
    on that split; otherwise the learner runs with no extra parameters.
    The learner is then built on the train part and the full test frame,
    and its ``learn`` result is accumulated. Wall-clock timestamps are
    printed along the way as coarse progress markers.

    :param word2vec_src: str, path of word2vec model
    :param repeats: int, number of repeats
    :param fold: int, number of folds
    :param tuning: boolean, tuning or not.
    :return: None
    """
    print(time.time())
    print("# word2vec:", word2vec_src)
    word2vec_model = gensim.models.Word2Vec.load(word2vec_src)
    print(time.time())
    data = PaperData(word2vec=word2vec_model)
    # NOTE(review): other functions in this file call load_vec with
    # use_pkl=False rather than file_name=False -- confirm which keyword
    # load_vec actually expects.
    train_pd = load_vec(data, data.train_data, file_name=False)
    test_pd = load_vec(data, data.test_data, file_name=False)
    print(time.time())
    learner = SK_SVM
    # Index 6 selects the optimisation goal, i.e. "Macro_F".
    goal = {0: "PD", 1: "PF", 2: "PREC", 3: "ACC", 4: "F", 5: "G",
            6: "Macro_F", 7: "Micro_F"}[6]
    F = {}
    clfs = []

    # The test split does not depend on the fold, so extract it once.
    test_X = test_pd.loc[:, "Output"].values
    test_Y = test_pd.loc[:, "LinkTypeId"].values

    # range (not the Python-2-only xrange) keeps this runnable on Python 3
    # and consistent with run_tuning_MLP.
    for _ in range(repeats):  # repeat n times here
        # NOTE(review): this call shape (labels and fold count given to the
        # constructor, object iterated directly) matches the legacy
        # sklearn.cross_validation.StratifiedKFold -- confirm the import.
        kf = StratifiedKFold(train_pd.loc[:, "LinkTypeId"].values, fold,
                             shuffle=True)
        print("Stratified")
        print(time.time())
        for train_index, tune_index in kf:
            # Fold indices are row positions, so select with .iloc; the
            # former .ix indexer is deprecated/removed in pandas and was
            # label-based on integer indexes.
            train_data = train_pd.iloc[train_index]
            tune_data = train_pd.iloc[tune_index]
            train_X = train_data.loc[:, "Output"].values
            train_Y = train_data.loc[:, "LinkTypeId"].values
            tune_X = tune_data.loc[:, "Output"].values
            tune_Y = tune_data.loc[:, "LinkTypeId"].values
            if tuning:
                params, _ = tune_learner(learner, train_X, train_Y,
                                         tune_X, tune_Y, goal)
            else:
                params = {}
            print("Tuning Done...now running")
            print("********************")
            print(params)
            print("********************")
            clf = learner(train_X, train_Y, test_X, test_Y, goal)
            F = clf.learn(F, **params)
            clfs.append(clf)
    print_results(clfs)
def run_kmeans_mp(word2vec_src):
    """
    Cluster the training data with KMeans, train per-cluster models in a
    process pool and evaluate them on the test data.

    Steps:
      1. Choose the number of clusters with ``optimalK`` and fit KMeans on
         the "Output" column of the training frame.
      2. Tag ``data.train_data`` with a 'clabel' column and submit one
         ``target_model`` job per cluster to the pool; each job reports
         back through ``queue``.
      3. Assign every test row to a cluster, predict each test cluster
         with the models trained on the matching training cluster, and
         pass the collected results to ``total_summary``.

    Each queue response is indexed as a sequence whose last element is
    used as the sort key (presumably the cluster number) and whose other
    elements are the trained models.

    :param word2vec_src: str, path of the word2vec model
    :return: None
    """
    print("# word2vec:", word2vec_src)
    word2vec_model = gensim.models.Word2Vec.load(word2vec_src)
    data = PaperData(word2vec=word2vec_model)
    train_pd = load_vec(data, data.train_data, use_pkl=False)
    test_pd = load_vec(data, data.test_data, use_pkl=False)
    train_X = train_pd.loc[:, "Output"].tolist()

    # Change the target here; it is used both for dispatching the worker
    # jobs and for choosing how predictions are obtained below.
    target_model = run_tuning_KNN_C

    # NOTE(review): the queue is passed to pool workers as a task argument,
    # so it has to be picklable (e.g. a multiprocessing.Manager().Queue());
    # confirm what Queue is bound to in this module.
    queue = Queue()
    pool = multiprocessing.Pool()
    try:
        start = timeit.default_timer()
        num_clusters = optimalK(pd.DataFrame(train_X))
        stop = timeit.default_timer()
        print("Found optimal k: " + str(num_clusters))

        clf = KMeans(n_clusters=num_clusters, init='k-means++',
                     max_iter=200, n_init=1)
        start0 = timeit.default_timer()
        clf.fit(train_X)
        stop0 = timeit.default_timer()

        s1 = timeit.default_timer()
        data.train_data['clabel'] = clf.labels_
        s2 = timeit.default_timer()
        print("Inter - ", (s2 - s1))

        start1 = timeit.default_timer()
        for label in range(num_clusters):
            cluster = data.train_data.loc[data.train_data['clabel'] == label]
            print("Thread No", label)
            # NOTE(review): the AsyncResult is discarded, so an exception
            # in a worker is not surfaced here and the queue.get() below
            # would then block -- consider keeping the handles.
            pool.apply_async(target_model, (
                word2vec_src,
                cluster,
                queue,
                label,
                test_pd,
            ))

        # One response is expected per submitted cluster job.
        svm_models = [queue.get() for _ in range(num_clusters)]
        print(svm_models)
        # Responses arrive in completion order; sort them on their last
        # element so that svm_models[k] belongs to cluster k.
        svm_models = sorted(svm_models, key=lambda th: th[-1])
        stop1 = timeit.default_timer()
        print(svm_models)
    finally:
        # Always release the worker processes, even if a step above fails.
        pool.close()
        pool.join()

    svm_results = []  # maintain a list of svm results
    test_X = test_pd.loc[:, "Output"].tolist()
    data.test_data['clabel'] = clf.predict(test_X)

    # These targets return wrapper objects exposing the fitted estimator
    # as ``.learner``; other targets are called with ``.predict`` directly.
    wraps_learner = target_model in (run_tuning_SVM_C, run_tuning_KNN_C)

    # Every response holds its models followed by one trailing element, so
    # the model count is the response length minus one. Read it from the
    # last response explicitly instead of through a leaked loop variable.
    models_per_cluster = len(svm_models[-1]) - 1 if svm_models else 0

    avg_predicted = []
    avg_cluster_Y = []
    for i in range(models_per_cluster):
        total_predicted = []
        total_cluster_Y = []
        for label in range(num_clusters):
            cluster = data.test_data.loc[data.test_data['clabel'] == label]
            svm_model = svm_models[label][i]
            cluster_X = cluster.loc[:, "Output"].tolist()
            cluster_Y = cluster.loc[:, "LinkTypeId"].tolist()
            total_cluster_Y = np.append(total_cluster_Y, cluster_Y)
            avg_cluster_Y = np.append(avg_cluster_Y, cluster_Y)
            if wraps_learner:
                predicted_C = svm_model.learner.predict(cluster_X)
            else:
                predicted_C = svm_model.predict(cluster_X)
            total_predicted = np.append(total_predicted, predicted_C)
            avg_predicted = np.append(avg_predicted, predicted_C)
        # Result for the i-th model across all clusters.
        svm_results.append(results_SVM_C(total_predicted, total_cluster_Y))
    # Result over the predictions of every model pooled together.
    svm_results.append(results_SVM_C(avg_predicted, avg_cluster_Y))
    # call the helper method to summarize the svm results
    total_summary(svm_results, test_pd.shape[0], start0, start1, stop0,
                  stop1, start, stop)