def run_MLP(word2vec_src):
    """
    Run the MLP + word embedding experiment.

    Loads the word2vec model stored at ``word2vec_src``, vectorizes the
    train and test splits of ``PaperData`` through ``load_vec``, fits a
    default ``MLPClassifier`` on the "Output" column against "LinkTypeId",
    and prints the classification report, the accuracy computed by
    ``get_acc`` from the confusion matrix, and the time spent in ``fit``.

    :param word2vec_src: str, path of the word2vec model
    :return: None
    """

    def features_and_labels(frame):
        # Feature vectors live in "Output", class labels in "LinkTypeId".
        return (frame.loc[:, "Output"].tolist(),
                frame.loc[:, "LinkTypeId"].tolist())

    print("# word2vec:", word2vec_src)
    classifier = MLPClassifier()
    embedding = gensim.models.Word2Vec.load(word2vec_src)
    paper_data = PaperData(word2vec=embedding)
    train_frame = load_vec(paper_data, paper_data.train_data, use_pkl=False)
    test_frame = load_vec(paper_data, paper_data.test_data, use_pkl=False)
    train_X, train_Y = features_and_labels(train_frame)
    test_X, test_Y = features_and_labels(test_frame)

    # Only the fit call is timed.
    fit_started = timeit.default_timer()
    classifier.fit(train_X, train_Y)
    fit_finished = timeit.default_timer()

    predictions = classifier.predict(test_X)
    # The same label order is used for the report and the confusion matrix.
    link_labels = ["1", "2", "3", "4"]
    report = metrics.classification_report(test_Y,
                                           predictions,
                                           labels=link_labels,
                                           digits=3)
    print(report)
    confusion = metrics.confusion_matrix(test_Y,
                                         predictions,
                                         labels=link_labels)
    print("accuracy  ", get_acc(confusion))
    print("Model training time: ", fit_finished - fit_started)
# Example #2
# 0
def run_SVM(word2vec_src):
    """
    Run the SVM + word embedding experiment (the baseline method).

    Loads the word2vec model stored at ``word2vec_src``, vectorizes the
    train and test splits of ``PaperData`` through ``load_vec``, fits an
    RBF-kernel ``svm.SVC`` (gamma=0.005) on the "Output" column against
    "LinkTypeId", and prints the classification report followed by the
    accuracy computed by ``get_acc`` from the confusion matrix.

    :param word2vec_src: str, path of the word2vec model
    :return: None
    """
    print("# word2vec:", word2vec_src)
    classifier = svm.SVC(kernel="rbf", gamma=0.005)
    embedding = gensim.models.Word2Vec.load(word2vec_src)
    paper_data = PaperData(word2vec=embedding)
    train_frame = load_vec(paper_data, paper_data.train_data, use_pkl=False)
    test_frame = load_vec(paper_data, paper_data.test_data, use_pkl=False)

    # Feature vectors live in "Output", class labels in "LinkTypeId".
    train_X = train_frame.loc[:, "Output"].tolist()
    train_Y = train_frame.loc[:, "LinkTypeId"].tolist()
    test_X = test_frame.loc[:, "Output"].tolist()
    test_Y = test_frame.loc[:, "LinkTypeId"].tolist()

    classifier.fit(train_X, train_Y)
    predictions = classifier.predict(test_X)

    # The same label order is used for the report and the confusion matrix.
    link_labels = ["1", "2", "3", "4"]
    report = metrics.classification_report(test_Y, predictions,
                                           labels=link_labels,
                                           digits=3)
    print(report)

    confusion = metrics.confusion_matrix(test_Y, predictions,
                                         labels=link_labels)
    print("accuracy  ", get_acc(confusion))
def run_tuning_MLP(word2vec_src, repeats=1, fold=2, tuning=True):
    """
    Tune and evaluate the SK_MLP learner over repeated stratified folds.

    For every repeat, the training frame is split into ``fold`` stratified
    folds on "LinkTypeId". In each fold the learner parameters are
    optionally tuned with ``tune_learner`` on the train/tune portions, then
    a learner is built from the fold's training portion plus the full test
    set and run through ``learn``. All learners are collected and passed to
    ``print_results`` together with the timer readings.

    :param word2vec_src:str, path of word2vec model
    :param repeats:int, number of repeats
    :param fold: int,number of folds
    :param tuning: boolean, tuning or not.
    :return: None
    """
    print("# word2vec:", word2vec_src)
    word2vec_model = gensim.models.Word2Vec.load(word2vec_src)
    data = PaperData(word2vec=word2vec_model)
    train_pd = load_vec(data, data.train_data, file_name=False)
    print(train_pd)
    test_pd = load_vec(data, data.test_data, file_name=False)
    learner = SK_MLP
    # Optimisation goal; index 6 selects "Macro_F".
    goal = {
        0: "PD",
        1: "PF",
        2: "PREC",
        3: "ACC",
        4: "F",
        5: "G",
        6: "Macro_F",
        7: "Micro_F"
    }[6]
    print(goal)
    F = {}  # accumulator threaded through every learn() call
    clfs = []
    start = timeit.default_timer()
    # The test split does not depend on the fold, so extract it once.
    test_X = test_pd.loc[:, "Output"].values
    test_Y = test_pd.loc[:, "LinkTypeId"].values
    for _ in range(repeats):  # repeat n times here
        # NOTE(review): this call form (labels, n_folds) with direct
        # iteration looks like the legacy sklearn.cross_validation API --
        # confirm against the file's imports.
        kf = StratifiedKFold(train_pd.loc[:, "LinkTypeId"].values,
                             fold,
                             shuffle=True)
        for train_index, tune_index in kf:
            print(train_pd)
            print(train_index)
            # Fold indices are positions into the label array, so rows must
            # be selected by position; a label-based lookup would pick the
            # wrong rows whenever the frame's index is not 0..n-1.
            train_data = train_pd.iloc[train_index]
            print(train_data)
            tune_data = train_pd.iloc[tune_index]
            train_X = train_data.loc[:, "Output"].values
            train_Y = train_data.loc[:, "LinkTypeId"].values
            tune_X = tune_data.loc[:, "Output"].values
            tune_Y = tune_data.loc[:, "LinkTypeId"].values
            if tuning:
                params, _ = tune_learner(learner, train_X, train_Y,
                                         tune_X, tune_Y, goal)
            else:
                # No tuning: run the learner with its default parameters.
                params = {}
            clf = learner(train_X, train_Y, test_X, test_Y, goal)
            F = clf.learn(F, **params)
            clfs.append(clf)
    stop = timeit.default_timer()
    print("Model training time: ", stop - start)
    print_results(clfs, stop, start)
# Example #4
# 0
def preprocess_data(word2vec_src):
    """
    Load a word2vec model and vectorize the training and testing data.

    :param word2vec_src: the source file of the word2vec model
    :return: tuple ``(train_pd, test_pd)`` holding the ``load_vec`` results
             for the training split and the testing split, in that order
    """
    print("# word2vec:", word2vec_src)
    embedding = gensim.models.Word2Vec.load(word2vec_src)
    paper_data = PaperData(word2vec=embedding)

    # Vectorize the training split first, then the testing split.
    train_frame = load_vec(paper_data, paper_data.train_data, use_pkl=False)
    return train_frame, load_vec(paper_data,
                                 paper_data.test_data,
                                 use_pkl=False)
# Example #5
# 0
def run_tuning_SVM(word2vec_src, repeats=1,
                   fold=5,
                   tuning=True):
    """
    Tune and evaluate the SK_SVM learner over repeated stratified folds.

    For every repeat, the training frame is split into ``fold`` stratified
    folds on "LinkTypeId". In each fold the learner parameters are
    optionally tuned with ``tune_learner`` on the train/tune portions, then
    a learner is built from the fold's training portion plus the full test
    set and run through ``learn``. ``print_results`` is called at the end
    of every repeat with all learners collected so far. Timestamps are
    printed along the way as coarse progress markers.

    :param word2vec_src: str, path of word2vec model
    :param repeats: int, number of repeats
    :param fold: int, number of folds
    :param tuning: boolean, tuning or not.
    :return: None
    """
    print(time.time())
    print("# word2vec:", word2vec_src)
    word2vec_model = gensim.models.Word2Vec.load(word2vec_src)
    print(time.time())
    data = PaperData(word2vec=word2vec_model)
    train_pd = load_vec(data, data.train_data, file_name=False)
    test_pd = load_vec(data, data.test_data, file_name=False)
    print(time.time())
    learner = SK_SVM
    # Optimisation goal; index 6 selects "Macro_F".
    goal = {0: "PD", 1: "PF", 2: "PREC", 3: "ACC", 4: "F", 5: "G",
            6: "Macro_F", 7: "Micro_F"}[6]
    F = {}  # accumulator threaded through every learn() call
    clfs = []
    # The test split does not depend on the fold, so extract it once.
    test_X = test_pd.loc[:, "Output"].values
    test_Y = test_pd.loc[:, "LinkTypeId"].values
    # range works on both Python 2 and 3 (xrange is a NameError on Python 3).
    for _ in range(repeats):  # repeat n times here
        # NOTE(review): this call form (labels, n_folds) with direct
        # iteration looks like the legacy sklearn.cross_validation API --
        # confirm against the file's imports.
        kf = StratifiedKFold(train_pd.loc[:, "LinkTypeId"].values, fold,
                             shuffle=True)
        print("Stratified")
        print(time.time())
        for train_index, tune_index in kf:
            # Fold indices are positions into the label array, so rows must
            # be selected by position; a label-based lookup would pick the
            # wrong rows whenever the frame's index is not 0..n-1.
            train_data = train_pd.iloc[train_index]
            tune_data = train_pd.iloc[tune_index]
            train_X = train_data.loc[:, "Output"].values
            train_Y = train_data.loc[:, "LinkTypeId"].values
            tune_X = tune_data.loc[:, "Output"].values
            tune_Y = tune_data.loc[:, "LinkTypeId"].values
            if tuning:
                params, _ = tune_learner(learner, train_X, train_Y, tune_X,
                                         tune_Y, goal)
            else:
                # No tuning: run the learner with its default parameters.
                params = {}
            print("Tuning Done...now running")
            print("********************")
            print(params)
            print("********************")
            clf = learner(train_X, train_Y, test_X, test_Y, goal)
            F = clf.learn(F, **params)
            clfs.append(clf)
        # Reported once per repeat, over every learner collected so far.
        print_results(clfs)
def run_kmeans_mp(word2vec_src):
    """
    Cluster the training data with KMeans and tune one model set per
    cluster in parallel worker processes, then evaluate on the test data.

    Steps:
      1. Load the word2vec model and vectorize the train/test splits.
      2. Pick the number of clusters with ``optimalK`` and fit KMeans on the
         training "Output" vectors.
      3. Submit one ``target_model`` task per training cluster to a
         multiprocessing pool and read one response per cluster from the
         shared queue. Each response is indexed as a sequence whose last
         element is the sort key (presumably the cluster number -- confirm
         against the worker) and whose other elements are models.
      4. Assign test rows to clusters, predict each cluster with the model
         at the same position in that cluster's response, and summarise
         the results with ``results_SVM_C`` / ``total_summary``.

    :param word2vec_src: str, path of the word2vec model
    :return: None
    """
    print("# word2vec:", word2vec_src)
    word2vec_model = gensim.models.Word2Vec.load(word2vec_src)
    data = PaperData(word2vec=word2vec_model)
    train_pd = load_vec(data, data.train_data, use_pkl=False)
    test_pd = load_vec(data, data.test_data, use_pkl=False)
    train_X = train_pd.loc[:, "Output"].tolist()
    # NOTE(review): the queue is passed to pool workers as a task argument;
    # confirm that the Queue imported by this file can be pickled that way
    # (a multiprocessing.Manager().Queue() can).
    queue = Queue()
    pool = multiprocessing.Pool()

    # Time the search for the number of clusters.
    start = timeit.default_timer()
    numClusters = optimalK(pd.DataFrame(train_X))
    stop = timeit.default_timer()
    print("Found optimal k: " + str(numClusters))
    clf = KMeans(n_clusters=numClusters,
                 init='k-means++',
                 max_iter=200,
                 n_init=1)

    # Time the clustering itself.
    start0 = timeit.default_timer()
    clf.fit(train_X)
    stop0 = timeit.default_timer()

    svm_models = []  # one response per cluster, read from the queue
    s1 = timeit.default_timer()
    data.train_data['clabel'] = clf.labels_
    s2 = timeit.default_timer()
    print("Inter - ", (s2 - s1))

    # Time the parallel per-cluster tuning.
    start1 = timeit.default_timer()
    # Change the target here; it selects both the worker that is submitted
    # and how the returned models are queried during evaluation.
    target_model = run_tuning_KNN_C
    for cluster_id in range(numClusters):
        cluster = data.train_data.loc[data.train_data['clabel'] == cluster_id]
        print("Thread No", cluster_id)
        pool.apply_async(target_model, (
            word2vec_src,
            cluster,
            queue,
            cluster_id,
            test_pd,
        ))
    # No more tasks will be submitted; lets the workers exit when done.
    pool.close()

    # Block until every cluster has reported back.
    for _ in range(numClusters):
        svm_models.append(queue.get())
    print(svm_models)
    # Order the responses by their last element so that svm_models[k] can
    # be looked up by cluster number below.
    svm_models = sorted(svm_models, key=lambda th: th[-1])
    stop1 = timeit.default_timer()
    print(svm_models)
    # All responses are drained, so the workers can be reaped safely.
    pool.join()

    svm_results = []  # maintain a list of svm results
    test_X = test_pd.loc[:, "Output"].tolist()
    predicted = clf.predict(test_X)
    data.test_data['clabel'] = predicted
    avg_predicted = []
    avg_cluster_Y = []
    # These targets return wrappers exposing the fitted estimator as
    # `.learner`; any other target is expected to return estimators directly.
    uses_learner_attr = target_model in (run_tuning_SVM_C, run_tuning_KNN_C)
    # The last response determines how many models each response carries
    # (every element except the trailing sort key). Index it explicitly
    # instead of relying on a loop variable leaked from the loop above.
    models_per_cluster = len(svm_models[-1]) - 1
    for i in range(models_per_cluster):
        total_predicted = []
        total_cluster_Y = []
        for cluster_id in range(numClusters):
            cluster = data.test_data.loc[data.test_data['clabel'] == cluster_id]
            svm_model = svm_models[cluster_id][i]
            cluster_X = cluster.loc[:, "Output"].tolist()
            cluster_Y = cluster.loc[:, "LinkTypeId"].tolist()
            total_cluster_Y = np.append(total_cluster_Y, cluster_Y)
            avg_cluster_Y = np.append(avg_cluster_Y, cluster_Y)
            if uses_learner_attr:
                predicted_C = svm_model.learner.predict(cluster_X)
            else:
                predicted_C = svm_model.predict(cluster_X)
            total_predicted = np.append(total_predicted, predicted_C)
            avg_predicted = np.append(avg_predicted, predicted_C)
        # One result entry per model position, pooled over all clusters.
        svm_results.append(results_SVM_C(total_predicted, total_cluster_Y))
    # Final entry pools the predictions of every model position together.
    svm_results.append(results_SVM_C(avg_predicted, avg_cluster_Y))
    # call the helper method to summarize the svm results
    total_summary(svm_results, test_pd.shape[0], start0, start1, stop0, stop1,
                  start, stop)