def main(_):
    #os.environ['CUDA_VISIBLE_DEVICES'] = ''  # uncomment to hide GPUs and force CPU-only execution

    if FLAGS.dataset == "bibsonomy-clean":
        word2vec_model_path = FLAGS.word2vec_model_path_bib
        traning_data_path = FLAGS.training_data_path_bib
        FLAGS.sequence_length = 300
        FLAGS.ave_labels_per_doc = 11.59

    elif FLAGS.dataset == "zhihu-sample":
        word2vec_model_path = FLAGS.word2vec_model_path_zhihu
        traning_data_path = FLAGS.training_data_path_zhihu
        FLAGS.sequence_length = 100
        FLAGS.ave_labels_per_doc = 2.45

    elif FLAGS.dataset == "citeulike-a-clean":
        word2vec_model_path = FLAGS.word2vec_model_path_cua
        traning_data_path = FLAGS.training_data_path_cua
        FLAGS.sequence_length = 300
        FLAGS.ave_labels_per_doc = 11.6

    elif FLAGS.dataset == "citeulike-t-clean":
        word2vec_model_path = FLAGS.word2vec_model_path_cut
        traning_data_path = FLAGS.training_data_path_cut
        FLAGS.sequence_length = 300
        FLAGS.ave_labels_per_doc = 7.68

    # 1. create trainlist, validlist and testlist
    trainX, trainY, testX, testY = None, None, None, None
    vocabulary_word2index, vocabulary_index2word = create_voabulary(
        word2vec_model_path,
        name_scope=FLAGS.dataset + "-lda")
    vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(
        voabulary_label=traning_data_path, name_scope=FLAGS.dataset + "-lda")
    num_classes = len(vocabulary_word2index_label)
    print(vocabulary_index2word_label[0], vocabulary_index2word_label[1])

    vocab_size = len(vocabulary_word2index)
    print("vocab_size:", vocab_size)

    # choosing whether to use k-fold cross-validation or hold-out validation
    if FLAGS.kfold == -1:  # hold-out
        train, valid, test = load_data_multilabel_new(
            vocabulary_word2index,
            vocabulary_word2index_label,
            keep_label_percent=FLAGS.keep_label_percent,
            valid_portion=FLAGS.valid_portion,
            test_portion=FLAGS.test_portion,
            multi_label_flag=FLAGS.multi_label_flag,
            traning_data_path=traning_data_path)
        # here train, test are tuples; turn train into trainlist.
        trainlist, validlist, testlist = list(), list(), list()
        trainlist.append(train)
        validlist.append(valid)
        testlist.append(test)
    else:  # k-fold
        trainlist, validlist, testlist = load_data_multilabel_new_k_fold(
            vocabulary_word2index,
            vocabulary_word2index_label,
            keep_label_percent=FLAGS.keep_label_percent,
            kfold=FLAGS.kfold,
            test_portion=FLAGS.test_portion,
            multi_label_flag=FLAGS.multi_label_flag,
            traning_data_path=traning_data_path)
        # here trainlist, testlist are list of tuples.
    # get and pad the test data: there is only one test set, shared across all k training/validation folds
    assert len(testlist) == 1
    testX, testY = testlist[0]
    testX = pad_sequences(testX, maxlen=FLAGS.sequence_length,
                          value=0.)  # padding to max length

    # 2. initialise per-run (per-fold) result variables and LDA settings

    num_runs = len(trainlist)
    #validation results variables
    valid_acc_th, valid_prec_th, valid_rec_th, valid_fmeasure_th, valid_hamming_loss_th = \
        [0] * num_runs, [0] * num_runs, [0] * num_runs, [0] * num_runs, [0] * num_runs  # initialise the validation result lists
    final_valid_acc_th, final_valid_prec_th, final_valid_rec_th, final_valid_fmeasure_th, final_valid_hamming_loss_th = 0.0, 0.0, 0.0, 0.0, 0.0
    min_valid_acc_th, min_valid_prec_th, min_valid_rec_th, min_valid_fmeasure_th, min_valid_hamming_loss_th = 0.0, 0.0, 0.0, 0.0, 0.0
    max_valid_acc_th, max_valid_prec_th, max_valid_rec_th, max_valid_fmeasure_th, max_valid_hamming_loss_th = 0.0, 0.0, 0.0, 0.0, 0.0
    std_valid_acc_th, std_valid_prec_th, std_valid_rec_th, std_valid_fmeasure_th, std_valid_hamming_loss_th = 0.0, 0.0, 0.0, 0.0, 0.0
    #testing results variables
    test_acc_th, test_prec_th, test_rec_th, test_fmeasure_th, test_hamming_loss_th = \
        [0] * num_runs, [0] * num_runs, [0] * num_runs, [0] * num_runs, [0] * num_runs  # initialise the testing result lists
    final_test_acc_th, final_test_prec_th, final_test_rec_th, final_test_fmeasure_th, final_test_hamming_loss_th = 0.0, 0.0, 0.0, 0.0, 0.0
    min_test_acc_th, min_test_prec_th, min_test_rec_th, min_test_fmeasure_th, min_test_hamming_loss_th = 0.0, 0.0, 0.0, 0.0, 0.0
    max_test_acc_th, max_test_prec_th, max_test_rec_th, max_test_fmeasure_th, max_test_hamming_loss_th = 0.0, 0.0, 0.0, 0.0, 0.0
    std_test_acc_th, std_test_prec_th, std_test_rec_th, std_test_fmeasure_th, std_test_hamming_loss_th = 0.0, 0.0, 0.0, 0.0, 0.0
    #output variables
    output_valid = ""
    output_test = ""
    output_csv_valid = "fold,hamming_loss,acc,prec,rec,f1"
    output_csv_test = "fold,hamming_loss,acc,prec,rec,f1"

    time_train = [0] * num_runs  # get time spent in training
    num_run = 0

    mallet_path = FLAGS.mallet_path
    num_topics = FLAGS.num_topics
    alpha = 50 / num_topics  # common LDA heuristic: alpha = 50 / T (Griffiths & Steyvers, 2004)
    iterations = FLAGS.iterations
    k_num_doc = FLAGS.k_num_doc

    remove_pad_id = True  # drop PAD tokens (index 0) when reconstructing documents for LDA
    remove_dot = True  # also drop '.' tokens
    docs_test = generateLDAdocFromIndex(testX,
                                        vocabulary_index2word,
                                        remove_pad_id=remove_pad_id,
                                        remove_dot=remove_dot)

    for trainfold in trainlist:
        # get training and validation data
        trainX, trainY = trainfold
        trainX = pad_sequences(trainX, maxlen=FLAGS.sequence_length, value=0.)
        # generate training data for gensim MALLET wrapper for LDA
        docs = generateLDAdocFromIndex(trainX,
                                       vocabulary_index2word,
                                       remove_pad_id=remove_pad_id,
                                       remove_dot=remove_dot)
        #print(docs[10])
        id2word = corpora.Dictionary(docs)
        corpus = [id2word.doc2bow(text) for text in docs]
        #print(corpus[10])
        # generate validation data for gensim MALLET wrapper for LDA
        validX, validY = validlist[num_run]
        validX = pad_sequences(validX, maxlen=FLAGS.sequence_length, value=0.)
        docs_valid = generateLDAdocFromIndex(validX,
                                             vocabulary_index2word,
                                             remove_pad_id=remove_pad_id,
                                             remove_dot=remove_dot)
        corpus_valid = [id2word.doc2bow(text) for text in docs_valid]
        # generate testing data for gensim MALLET wrapper for LDA
        corpus_test = [id2word.doc2bow(text) for text in docs_test]
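        # note: doc2bow (above) silently ignores tokens absent from the training dictionary,
        # so validation/test documents are represented only by words seen during training.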

        # training
        start_time_train = time.time()
        print('start training fold', str(num_run))

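        # note: gensim.models.wrappers (including LdaMallet) was removed in gensim 4.0,
        # so this script requires gensim < 4.0 plus a local MALLET installation.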
        model = gensim.models.wrappers.LdaMallet(mallet_path,
                                                 corpus=corpus,
                                                 num_topics=num_topics,
                                                 alpha=alpha,
                                                 id2word=id2word,
                                                 iterations=iterations)
        pprint(model.show_topics(formatted=False))

        print('num_run', str(num_run), 'train done.')

        time_train[num_run] = time.time() - start_time_train
        print("--- training of fold %s took %s seconds ---" %
              (num_run, time_train[num_run]))

        # represent each document as a topic vector
        #mat_train = np.array(model[corpus]) # this will cause an Error with large num_topics, e.g. 1000 or higher.
        #Thus, we turn the MALLET LDA model to a native Gensim LDA model
        model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(model)
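        # caveat: the converted native model re-infers document-topic distributions with
        # gensim's variational inference, so they may differ slightly from MALLET's
        # Gibbs-sampling estimates.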
        mat_train = np.array(
            model.get_document_topics(corpus, minimum_probability=0.0))
        #print(len(model[corpus[0]]))
        #print(len(model[corpus[1]]))
        #print(len(model[corpus[2]]))
        #print(mat_train.shape)
        mat_train = mat_train[:, :, 1]  # keep only the probabilities: one row of topic probabilities per training document
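        # mat_train rows are the training documents' topic distributions; do_eval_lda
        # (a project helper) presumably labels each evaluation document from its
        # k_num_doc most similar training documents in this topic space.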

        # evaluate on training data
        #if num_run == 0 and FLAGS.kfold != -1: # do this only for the first fold in k-fold cross-validation to save time
        #    acc, prec, rec, f_measure, hamming_loss = do_eval_lda(model, k_num_doc, mat_train, trainY, corpus, trainY, vocabulary_index2word_label, hamming_q=FLAGS.ave_labels_per_doc)
        #    print('training:', acc, prec, rec, f_measure, hamming_loss)

        # validation
        valid_acc_th[num_run], valid_prec_th[num_run], valid_rec_th[
            num_run], valid_fmeasure_th[num_run], valid_hamming_loss_th[
                num_run] = do_eval_lda(model,
                                       k_num_doc,
                                       mat_train,
                                       trainY,
                                       corpus_valid,
                                       validY,
                                       vocabulary_index2word_label,
                                       hamming_q=FLAGS.ave_labels_per_doc)
        print(
            "LDA==>Run %d Validation Accuracy: %.3f\tValidation Hamming Loss: %.3f\tValidation Precision: %.3f\tValidation Recall: %.3f\tValidation F-measure: %.3f"
            % (num_run, valid_acc_th[num_run], valid_hamming_loss_th[num_run],
               valid_prec_th[num_run], valid_rec_th[num_run],
               valid_fmeasure_th[num_run]))
        output_valid = output_valid + "\n" + "LDA==>Run %d Validation Accuracy: %.3f\tValidation Hamming Loss: %.3f\tValidation Precision: %.3f\tValidation Recall: %.3f\tValidation F-measure: %.3f" % (
            num_run, valid_acc_th[num_run], valid_hamming_loss_th[num_run],
            valid_prec_th[num_run], valid_rec_th[num_run], valid_fmeasure_th[
                num_run]) + "\n"  # also output the results of each run.
        output_csv_valid += "\n" + ",".join(
            str(v) for v in (num_run, valid_hamming_loss_th[num_run],
                             valid_acc_th[num_run], valid_prec_th[num_run],
                             valid_rec_th[num_run], valid_fmeasure_th[num_run]))

        start_time_test = time.time()
        # evaluate on testing data
        test_acc_th[num_run], test_prec_th[num_run], test_rec_th[
            num_run], test_fmeasure_th[num_run], test_hamming_loss_th[
                num_run] = do_eval_lda(model,
                                       k_num_doc,
                                       mat_train,
                                       trainY,
                                       corpus_test,
                                       testY,
                                       vocabulary_index2word_label,
                                       hamming_q=FLAGS.ave_labels_per_doc)
        print(
            "LDA==>Run %d Test Accuracy: %.3f\tTest Hamming Loss: %.3f\tTest Precision: %.3f\tTest Recall: %.3f\tTest F-measure: %.3f"
            % (num_run, test_acc_th[num_run], test_hamming_loss_th[num_run],
               test_prec_th[num_run], test_rec_th[num_run],
               test_fmeasure_th[num_run]))
        output_test = output_test + "\n" + "LDA==>Run %d Test Accuracy: %.3f\tTest Hamming Loss: %.3f\tTest Precision: %.3f\tTest Recall: %.3f\tTest F-measure: %.3f" % (
            num_run, test_acc_th[num_run], test_hamming_loss_th[num_run],
            test_prec_th[num_run], test_rec_th[num_run], test_fmeasure_th[
                num_run]) + "\n"  # also output the results of each run.
        output_csv_test += "\n" + ",".join(
            str(v) for v in (num_run, test_hamming_loss_th[num_run],
                             test_acc_th[num_run], test_prec_th[num_run],
                             test_rec_th[num_run], test_fmeasure_th[num_run]))

        print("--- testing of fold %s took %s seconds ---" %
              (num_run, time.time() - start_time_test))

        prediction_str = ""
        # output final predictions for qualitative analysis
        if FLAGS.report_rand_pred:
            prediction_str = display_for_qualitative_evaluation(
                model,
                k_num_doc,
                mat_train,
                trainY,
                corpus_test,
                testX,
                testY,
                vocabulary_index2word,
                vocabulary_index2word_label,
                hamming_q=FLAGS.ave_labels_per_doc)
        # update the num_run
        num_run = num_run + 1

    print('\n--Final Results--\n')

    # report min, max, std, average for the validation results
    min_valid_acc_th = min(valid_acc_th)
    min_valid_prec_th = min(valid_prec_th)
    min_valid_rec_th = min(valid_rec_th)
    min_valid_fmeasure_th = min(valid_fmeasure_th)
    min_valid_hamming_loss_th = min(valid_hamming_loss_th)

    max_valid_acc_th = max(valid_acc_th)
    max_valid_prec_th = max(valid_prec_th)
    max_valid_rec_th = max(valid_rec_th)
    max_valid_fmeasure_th = max(valid_fmeasure_th)
    max_valid_hamming_loss_th = max(valid_hamming_loss_th)

    if FLAGS.kfold != -1:  # stdev is only defined for multiple runs; hold-out has a single run
        std_valid_acc_th = statistics.stdev(valid_acc_th)
        std_valid_prec_th = statistics.stdev(valid_prec_th)
        std_valid_rec_th = statistics.stdev(valid_rec_th)
        std_valid_fmeasure_th = statistics.stdev(valid_fmeasure_th)
        std_valid_hamming_loss_th = statistics.stdev(valid_hamming_loss_th)

    final_valid_acc_th = sum(valid_acc_th) / num_runs
    final_valid_prec_th = sum(valid_prec_th) / num_runs
    final_valid_rec_th = sum(valid_rec_th) / num_runs
    final_valid_fmeasure_th = sum(valid_fmeasure_th) / num_runs
    final_valid_hamming_loss_th = sum(valid_hamming_loss_th) / num_runs

    print(
        "LDA==>Final Validation results Validation Accuracy: %.3f ± %.3f (%.3f - %.3f)\tValidation Hamming Loss: %.3f ± %.3f (%.3f - %.3f)\tValidation Precision: %.3f ± %.3f (%.3f - %.3f)\tValidation Recall: %.3f ± %.3f (%.3f - %.3f)\tValidation F-measure: %.3f ± %.3f (%.3f - %.3f)"
        % (final_valid_acc_th, std_valid_acc_th, min_valid_acc_th,
           max_valid_acc_th, final_valid_hamming_loss_th,
           std_valid_hamming_loss_th, min_valid_hamming_loss_th,
           max_valid_hamming_loss_th, final_valid_prec_th, std_valid_prec_th,
           min_valid_prec_th, max_valid_prec_th, final_valid_rec_th,
           std_valid_rec_th, min_valid_rec_th, max_valid_rec_th,
           final_valid_fmeasure_th, std_valid_fmeasure_th,
           min_valid_fmeasure_th, max_valid_fmeasure_th))
    #output the result to a file
    output_valid = output_valid + "\n" + "LDA==>Final Validation results Validation Accuracy: %.3f ± %.3f (%.3f - %.3f)\tValidation Hamming Loss: %.3f ± %.3f (%.3f - %.3f)\tValidation Precision: %.3f ± %.3f (%.3f - %.3f)\tValidation Recall: %.3f ± %.3f (%.3f - %.3f)\tValidation F-measure: %.3f ± %.3f (%.3f - %.3f)" % (
        final_valid_acc_th, std_valid_acc_th, min_valid_acc_th,
        max_valid_acc_th, final_valid_hamming_loss_th,
        std_valid_hamming_loss_th, min_valid_hamming_loss_th,
        max_valid_hamming_loss_th, final_valid_prec_th, std_valid_prec_th,
        min_valid_prec_th, max_valid_prec_th, final_valid_rec_th,
        std_valid_rec_th, min_valid_rec_th, max_valid_rec_th,
        final_valid_fmeasure_th, std_valid_fmeasure_th, min_valid_fmeasure_th,
        max_valid_fmeasure_th) + "\n"
    output_csv_valid += "\n" + "average," + ",".join(
        "%s±%s" % (round(mean, 3), round(std, 3)) for mean, std in (
            (final_valid_hamming_loss_th, std_valid_hamming_loss_th),
            (final_valid_acc_th, std_valid_acc_th),
            (final_valid_prec_th, std_valid_prec_th),
            (final_valid_rec_th, std_valid_rec_th),
            (final_valid_fmeasure_th, std_valid_fmeasure_th)))

    # report min, max, std, average for the testing results
    min_test_acc_th = min(test_acc_th)
    min_test_prec_th = min(test_prec_th)
    min_test_rec_th = min(test_rec_th)
    min_test_fmeasure_th = min(test_fmeasure_th)
    min_test_hamming_loss_th = min(test_hamming_loss_th)

    max_test_acc_th = max(test_acc_th)
    max_test_prec_th = max(test_prec_th)
    max_test_rec_th = max(test_rec_th)
    max_test_fmeasure_th = max(test_fmeasure_th)
    max_test_hamming_loss_th = max(test_hamming_loss_th)

    if FLAGS.kfold != -1:  # stdev is only defined for multiple runs; hold-out has a single run
        std_test_acc_th = statistics.stdev(test_acc_th)
        std_test_prec_th = statistics.stdev(test_prec_th)
        std_test_rec_th = statistics.stdev(test_rec_th)
        std_test_fmeasure_th = statistics.stdev(test_fmeasure_th)
        std_test_hamming_loss_th = statistics.stdev(test_hamming_loss_th)

    final_test_acc_th = sum(test_acc_th) / num_runs
    final_test_prec_th = sum(test_prec_th) / num_runs
    final_test_rec_th = sum(test_rec_th) / num_runs
    final_test_fmeasure_th = sum(test_fmeasure_th) / num_runs
    final_test_hamming_loss_th = sum(test_hamming_loss_th) / num_runs

    print(
        "LDA==>Final Test results Test Accuracy: %.3f ± %.3f (%.3f - %.3f)\tTest Hamming Loss: %.3f ± %.3f (%.3f - %.3f)\tTest Precision: %.3f ± %.3f (%.3f - %.3f)\tTest Recall: %.3f ± %.3f (%.3f - %.3f)\tTest F-measure: %.3f ± %.3f (%.3f - %.3f)"
        %
        (final_test_acc_th, std_test_acc_th, min_test_acc_th, max_test_acc_th,
         final_test_hamming_loss_th, std_test_hamming_loss_th,
         min_test_hamming_loss_th, max_test_hamming_loss_th,
         final_test_prec_th, std_test_prec_th, min_test_prec_th,
         max_test_prec_th, final_test_rec_th, std_test_rec_th, min_test_rec_th,
         max_test_rec_th, final_test_fmeasure_th, std_test_fmeasure_th,
         min_test_fmeasure_th, max_test_fmeasure_th))
    #output the result to a file
    output_test = output_test + "\n" + "LDA==>Final Test results Test Accuracy: %.3f ± %.3f (%.3f - %.3f)\tTest Hamming Loss: %.3f ± %.3f (%.3f - %.3f)\tTest Precision: %.3f ± %.3f (%.3f - %.3f)\tTest Recall: %.3f ± %.3f (%.3f - %.3f)\tTest F-measure: %.3f ± %.3f (%.3f - %.3f)" % (
        final_test_acc_th, std_test_acc_th, min_test_acc_th, max_test_acc_th,
        final_test_hamming_loss_th, std_test_hamming_loss_th,
        min_test_hamming_loss_th, max_test_hamming_loss_th, final_test_prec_th,
        std_test_prec_th, min_test_prec_th, max_test_prec_th,
        final_test_rec_th, std_test_rec_th, min_test_rec_th, max_test_rec_th,
        final_test_fmeasure_th, std_test_fmeasure_th, min_test_fmeasure_th,
        max_test_fmeasure_th) + "\n"
    output_csv_test += "\n" + "average," + ",".join(
        "%s±%s" % (round(mean, 3), round(std, 3)) for mean, std in (
            (final_test_hamming_loss_th, std_test_hamming_loss_th),
            (final_test_acc_th, std_test_acc_th),
            (final_test_prec_th, std_test_prec_th),
            (final_test_rec_th, std_test_rec_th),
            (final_test_fmeasure_th, std_test_fmeasure_th)))

    setting = "dataset:" + str(FLAGS.dataset) + "\nT: " + str(
        FLAGS.num_topics) + "\nk: " + str(FLAGS.k_num_doc) + ' \ni: ' + str(
            FLAGS.iterations)
    print("--- The whole program took %s seconds ---" %
          (time.time() - start_time))
    time_used = "--- The whole program took %s seconds ---" % (time.time() -
                                                               start_time)
    if FLAGS.kfold != -1:
        print("--- The average training took %s ± %s seconds ---" %
              (sum(time_train) / num_runs, statistics.stdev(time_train)))
        average_time_train = "--- The average training took %s ± %s seconds ---" % (
            sum(time_train) / num_runs, statistics.stdev(time_train))
    else:
        print("--- The average training took %s ± %s seconds ---" %
              (sum(time_train) / num_runs, 0))
        average_time_train = "--- The average training took %s ± %s seconds ---" % (
            sum(time_train) / num_runs, 0)

    # output setting configuration, results, prediction and time used
    output_to_file(
        'lda ' + str(FLAGS.dataset) + " T" + str(FLAGS.num_topics) + ' k' +
        str(FLAGS.k_num_doc) + ' i' + str(FLAGS.iterations) + ' gp_id' +
        str(FLAGS.marking_id) + '.txt',
        setting + '\n' + output_valid + '\n' + output_test + '\n' +
        prediction_str + '\n' + time_used + '\n' + average_time_train)
    # output structured evaluation results
    output_to_file(
        'lda ' + str(FLAGS.dataset) + " T" + str(FLAGS.num_topics) + ' k' +
        str(FLAGS.k_num_doc) + ' i' + str(FLAGS.iterations) + ' gp_id' +
        str(FLAGS.marking_id) + ' valid.csv', output_csv_valid)
    output_to_file(
        'lda ' + str(FLAGS.dataset) + " T" + str(FLAGS.num_topics) + ' k' +
        str(FLAGS.k_num_doc) + ' i' + str(FLAGS.iterations) + ' gp_id' +
        str(FLAGS.marking_id) + ' test.csv', output_csv_test)
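
# ---------------------------------------------------------------------------
# Classifier Chains (CC) variant of the same experimental pipeline: documents
# are represented with pre-trained word embeddings and classified with an
# ensemble of classifier chains instead of LDA. Note that this is a second
# `def main(_)`; if both functions are kept in one module, this definition
# shadows the one above.
# ---------------------------------------------------------------------------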
def main(_):
    #os.environ['CUDA_VISIBLE_DEVICES'] = ''  # uncomment to hide GPUs and force CPU-only execution
    
    if FLAGS.dataset == "bibsonomy-clean":
        word2vec_model_path = FLAGS.word2vec_model_path_bib
        traning_data_path = FLAGS.training_data_path_bib
        FLAGS.sequence_length = 300
        FLAGS.ave_labels_per_doc = 11.59
        
    elif FLAGS.dataset == "zhihu-sample":
        word2vec_model_path = FLAGS.word2vec_model_path_zhihu
        traning_data_path = FLAGS.training_data_path_zhihu
        FLAGS.sequence_length = 100
        FLAGS.ave_labels_per_doc = 2.45
        
    elif FLAGS.dataset == "citeulike-a-clean":
        word2vec_model_path = FLAGS.word2vec_model_path_cua
        traning_data_path = FLAGS.training_data_path_cua
        FLAGS.sequence_length = 300
        FLAGS.ave_labels_per_doc = 11.6
        
    elif FLAGS.dataset == "citeulike-t-clean":
        word2vec_model_path = FLAGS.word2vec_model_path_cut
        traning_data_path = FLAGS.training_data_path_cut
        FLAGS.sequence_length = 300
        FLAGS.ave_labels_per_doc = 7.68
        
    # 1. create trainlist, validlist and testlist 
    trainX, trainY, testX, testY = None, None, None, None
    vocabulary_word2index, vocabulary_index2word = create_voabulary(word2vec_model_path,name_scope=FLAGS.dataset + "-svm")
    vocabulary_word2index_label,vocabulary_index2word_label = create_voabulary_label(voabulary_label=traning_data_path, name_scope=FLAGS.dataset + "-svm")
    num_classes=len(vocabulary_word2index_label)
    print(vocabulary_index2word_label[0],vocabulary_index2word_label[1])

    vocab_size = len(vocabulary_word2index)
    print("vocab_size:",vocab_size)

    # choosing whether to use k-fold cross-validation or hold-out validation
    if FLAGS.kfold == -1: # hold-out
        train, valid, test = load_data_multilabel_new(vocabulary_word2index, vocabulary_word2index_label,keep_label_percent=FLAGS.keep_label_percent,valid_portion=FLAGS.valid_portion,test_portion=FLAGS.test_portion,multi_label_flag=FLAGS.multi_label_flag,traning_data_path=traning_data_path) 
        # here train, test are tuples; turn train into trainlist.
        trainlist, validlist, testlist = list(), list(), list()
        trainlist.append(train)
        validlist.append(valid)
        testlist.append(test)
    else: # k-fold
        trainlist, validlist, testlist = load_data_multilabel_new_k_fold(vocabulary_word2index, vocabulary_word2index_label,keep_label_percent=FLAGS.keep_label_percent,kfold=FLAGS.kfold,test_portion=FLAGS.test_portion,multi_label_flag=FLAGS.multi_label_flag,traning_data_path=traning_data_path)
        # here trainlist, testlist are list of tuples.
    # get and pad the test data: there is only one test set, shared across all k training/validation folds
    assert len(testlist) == 1
    testX, testY = testlist[0]
    testX = pad_sequences(testX, maxlen=FLAGS.sequence_length, value=0.)  # padding to max length

    # 2. get the word_embedding matrix: shape (vocab_size, embed_size)
    word2vec_model = word2vec.load(word2vec_model_path, kind='bin')
    word2vec_dict = {}
    for word, vector in zip(word2vec_model.vocab, word2vec_model.vectors):
        word2vec_dict[word] = vector
    word_embedding_2dlist = [[]] * vocab_size  # list of per-word embedding vectors; every slot is reassigned below
    word_embedding_2dlist[0] = np.zeros(FLAGS.embed_size)  # index 0 is the 'PAD' token: an all-zero vector
    bound = np.sqrt(6.0) / np.sqrt(vocab_size)  # bound for randomly initialised vectors
    count_exist = 0
    count_not_exist = 0
    for i in range(1, vocab_size):  # loop over each word in the vocabulary
        word = vocabulary_index2word[i]
        embedding = word2vec_dict.get(word)  # pre-trained vector (a numpy array), or None if absent
        if embedding is not None:  # the word has a pre-trained embedding
            word_embedding_2dlist[i] = embedding
            count_exist = count_exist + 1
        else:  # no pre-trained embedding: initialise the word with random values
            word_embedding_2dlist[i] = np.random.uniform(-bound, bound, FLAGS.embed_size)
            count_not_exist = count_not_exist + 1
    word_embedding_final = np.array(word_embedding_2dlist)  # convert to a 2-D array
    print('embedding per word:', word_embedding_final)
    print('embedding per word, shape:', word_embedding_final.shape)
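    # report embedding coverage (count_exist / count_not_exist are otherwise unused)
    print('words with a pre-trained embedding:', count_exist, '; randomly initialised:', count_not_exist)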

    # 3. transform trainlist into feature matrices: x_train, x_test of size (n_samples, n_features)
    
    num_runs = len(trainlist)
    #validation results variables
    valid_acc_th,valid_prec_th,valid_rec_th,valid_fmeasure_th,valid_hamming_loss_th =[0]*num_runs,[0]*num_runs,[0]*num_runs,[0]*num_runs,[0]*num_runs # initialise the result lists
    final_valid_acc_th,final_valid_prec_th,final_valid_rec_th,final_valid_fmeasure_th,final_valid_hamming_loss_th = 0.0,0.0,0.0,0.0,0.0
    min_valid_acc_th,min_valid_prec_th,min_valid_rec_th,min_valid_fmeasure_th,min_valid_hamming_loss_th = 0.0,0.0,0.0,0.0,0.0
    max_valid_acc_th,max_valid_prec_th,max_valid_rec_th,max_valid_fmeasure_th,max_valid_hamming_loss_th = 0.0,0.0,0.0,0.0,0.0
    std_valid_acc_th,std_valid_prec_th,std_valid_rec_th,std_valid_fmeasure_th,std_valid_hamming_loss_th = 0.0,0.0,0.0,0.0,0.0
    #testing results variables
    test_acc_th,test_prec_th,test_rec_th,test_fmeasure_th,test_hamming_loss_th = [0]*num_runs,[0]*num_runs,[0]*num_runs,[0]*num_runs,[0]*num_runs # initialise the testing result lists
    final_test_acc_th,final_test_prec_th,final_test_rec_th,final_test_fmeasure_th,final_test_hamming_loss_th = 0.0,0.0,0.0,0.0,0.0
    min_test_acc_th,min_test_prec_th,min_test_rec_th,min_test_fmeasure_th,min_test_hamming_loss_th = 0.0,0.0,0.0,0.0,0.0
    max_test_acc_th,max_test_prec_th,max_test_rec_th,max_test_fmeasure_th,max_test_hamming_loss_th = 0.0,0.0,0.0,0.0,0.0
    std_test_acc_th,std_test_prec_th,std_test_rec_th,std_test_fmeasure_th,std_test_hamming_loss_th = 0.0,0.0,0.0,0.0,0.0
    #output variables
    output_valid = ""
    output_test = ""
    output_csv_valid = "fold,hamming_loss,acc,prec,rec,f1"
    output_csv_test = "fold,hamming_loss,acc,prec,rec,f1"
        
    time_train = [0]*num_runs # get time spent in training    
    num_run = 0
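    # get_embedded_words (project helper) presumably aggregates (e.g. averages) the
    # word_embedding_final rows of each document's tokens into one dense feature vector.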
    testX_embedded = get_embedded_words(testX,word_embedding_final,vocab_size)
    print('testX_embedded:',testX_embedded)
    print('testX_embedded shape:',testX_embedded.shape)
    
    for trainfold in trainlist:
        # get training and validation data
        trainX,trainY=trainfold
        trainX = pad_sequences(trainX, maxlen=FLAGS.sequence_length, value=0.)
        trainX_embedded = get_embedded_words(trainX,word_embedding_final,vocab_size)
        print('trainX_embedded:',trainX_embedded)
        print('trainX_embedded shape:',trainX_embedded.shape)
        # code for debugging with less training data
        # debugging_num=1000
        # print('trainX_embedded_for_debugging:',trainX_embedded[1:debugging_num].shape) # for quick debugging
        # trainX_embedded = trainX_embedded[1:debugging_num] # for quick debugging
        # trainY = trainY[1:debugging_num] # for quick debugging
        
        validX,validY=validlist[num_run]
        validX = pad_sequences(validX, maxlen=FLAGS.sequence_length, value=0.)
        validX_embedded = get_embedded_words(validX,word_embedding_final,vocab_size)
        print('validX_embedded:',validX_embedded)
        print('validX_embedded shape:',validX_embedded.shape)
        
        # ** training **
        start_time_train = time.time()
        print('start training fold',str(num_run))
        
        trainY_int = np.asarray(trainY).astype(int)
        #print(type(trainY_int)) # <class 'numpy.ndarray'>
        #print(np.asarray(trainY).astype(int) == 1)
        
        # check trainY and remove labels that are positive for no training instance:
        # scikit-learn base estimators cannot be fitted on a target column containing a single class.
        one_class_label_list = list() # indices of labels not associated with any training instance
        #print(trainY_int.shape)
        #print(sum(trainY_int[:,2]))
        for k in range(num_classes):
            if sum(trainY_int[:,k]) == 0:
                #print(k)
                one_class_label_list.append(k)
        # delete the labels that are not associated with any instance in the training data
        trainY_int_pruned = np.delete(trainY_int, one_class_label_list, 1)
        print(trainY_int_pruned.shape)
        #print(len(one_class_label_list),one_class_label_list)
        # train an ensemble of classifier chains; train_cc (project helper) presumably fits
        # FLAGS.num_chains sklearn.multioutput.ClassifierChain models with a
        # LogisticRegression base estimator and random label orders.
        chains = train_cc(trainX_embedded,trainY_int_pruned,num_chains=FLAGS.num_chains)
        time_train[num_run] = time.time() - start_time_train
        print("--- training of fold %s took %s seconds ---" % (num_run,time_train[num_run]))
        
        # evaluate on training data (previously computed but never reported)
        acc, prec, rec, f_measure, hamming_loss = do_eval_chains(chains,one_class_label_list,trainX_embedded,np.asarray(trainY),hamming_q=FLAGS.ave_labels_per_doc)
        print('training:', acc, prec, rec, f_measure, hamming_loss)
        
        # evaluate on validation data
        valid_acc_th[num_run],valid_prec_th[num_run],valid_rec_th[num_run],valid_fmeasure_th[num_run],valid_hamming_loss_th[num_run] = do_eval_chains(chains,one_class_label_list,validX_embedded,validY,hamming_q=FLAGS.ave_labels_per_doc)
        print("CC==>Run %d Validation Accuracy: %.3f\tValidation Hamming Loss: %.3f\tValidation Precision: %.3f\tValidation Recall: %.3f\tValidation F-measure: %.3f" % (num_run,valid_acc_th[num_run],valid_hamming_loss_th[num_run],valid_prec_th[num_run],valid_rec_th[num_run],valid_fmeasure_th[num_run]))
        output_valid = output_valid + "\n" + "CC==>Run %d Validation Accuracy: %.3f\tValidation Hamming Loss: %.3f\tValidation Precision: %.3f\tValidation Recall: %.3f\tValidation F-measure: %.3f" % (num_run,valid_acc_th[num_run],valid_hamming_loss_th[num_run],valid_prec_th[num_run],valid_rec_th[num_run],valid_fmeasure_th[num_run]) + "\n" # also output the results of each run.
        output_csv_valid = output_csv_valid + "\n" + str(num_run) + "," + str(valid_hamming_loss_th[num_run]) + "," + str(valid_acc_th[num_run]) + "," + str(valid_prec_th[num_run]) + "," + str(valid_rec_th[num_run]) + "," + str(valid_fmeasure_th[num_run])
        
        start_time_test = time.time()
        # evaluate on testing data
        test_acc_th[num_run],test_prec_th[num_run],test_rec_th[num_run],test_fmeasure_th[num_run],test_hamming_loss_th[num_run] = do_eval_chains(chains,one_class_label_list,testX_embedded,testY,hamming_q=FLAGS.ave_labels_per_doc)
        print("CC==>Run %d Test Accuracy: %.3f\tTest Hamming Loss: %.3f\tTest Precision: %.3f\tTest Recall: %.3f\tTest F-measure: %.3f" % (num_run,test_acc_th[num_run],test_hamming_loss_th[num_run],test_prec_th[num_run],test_rec_th[num_run],test_fmeasure_th[num_run]))
        output_test = output_test + "\n" + "CC==>Run %d Test Accuracy: %.3f\tTest Hamming Loss: %.3f\tTest Precision: %.3f\tTest Recall: %.3f\tTest F-measure: %.3f" % (num_run,test_acc_th[num_run],test_hamming_loss_th[num_run],test_prec_th[num_run],test_rec_th[num_run],test_fmeasure_th[num_run]) + "\n" # also output the results of each run.
        output_csv_test = output_csv_test + "\n" + str(num_run) + "," + str(test_hamming_loss_th[num_run]) + "," + str(test_acc_th[num_run]) + "," + str(test_prec_th[num_run]) + "," + str(test_rec_th[num_run]) + "," + str(test_fmeasure_th[num_run])
        
        print("--- testing of fold %s took %s seconds ---" % (num_run, time.time() - start_time_test))
        
        prediction_str = ""
        # output final predictions for qualitative analysis
        if FLAGS.report_rand_pred:
            prediction_str = display_for_qualitative_evaluation_chains(chains, one_class_label_list,testX_embedded,testX,testY,vocabulary_index2word,vocabulary_index2word_label)
        # update the num_run
        num_run = num_run + 1
    
    print('\n--Final Results--\n')
    
    # report min, max, std, average for the validation results
    min_valid_acc_th = min(valid_acc_th)
    min_valid_prec_th = min(valid_prec_th)
    min_valid_rec_th = min(valid_rec_th)
    min_valid_fmeasure_th = min(valid_fmeasure_th)
    min_valid_hamming_loss_th = min(valid_hamming_loss_th)
    
    max_valid_acc_th = max(valid_acc_th)
    max_valid_prec_th = max(valid_prec_th)
    max_valid_rec_th = max(valid_rec_th)
    max_valid_fmeasure_th = max(valid_fmeasure_th)
    max_valid_hamming_loss_th = max(valid_hamming_loss_th)
    
    if FLAGS.kfold != -1: # stdev is only defined for multiple runs; hold-out has a single run
        std_valid_acc_th = statistics.stdev(valid_acc_th)
        std_valid_prec_th = statistics.stdev(valid_prec_th)
        std_valid_rec_th = statistics.stdev(valid_rec_th)
        std_valid_fmeasure_th = statistics.stdev(valid_fmeasure_th)
        std_valid_hamming_loss_th = statistics.stdev(valid_hamming_loss_th)
    
    final_valid_acc_th = sum(valid_acc_th)/num_runs
    final_valid_prec_th = sum(valid_prec_th)/num_runs
    final_valid_rec_th = sum(valid_rec_th)/num_runs
    final_valid_fmeasure_th = sum(valid_fmeasure_th)/num_runs
    final_valid_hamming_loss_th = sum(valid_hamming_loss_th)/num_runs
    
    print("CC==>Final Validation results Validation Accuracy: %.3f ± %.3f (%.3f - %.3f)\tValidation Hamming Loss: %.3f ± %.3f (%.3f - %.3f)\tValidation Precision: %.3f ± %.3f (%.3f - %.3f)\tValidation Recall: %.3f ± %.3f (%.3f - %.3f)\tValidation F-measure: %.3f ± %.3f (%.3f - %.3f)" % (final_valid_acc_th,std_valid_acc_th,min_valid_acc_th,max_valid_acc_th,final_valid_hamming_loss_th,std_valid_hamming_loss_th,min_valid_hamming_loss_th,max_valid_hamming_loss_th,final_valid_prec_th,std_valid_prec_th,min_valid_prec_th,max_valid_prec_th,final_valid_rec_th,std_valid_rec_th,min_valid_rec_th,max_valid_rec_th,final_valid_fmeasure_th,std_valid_fmeasure_th,min_valid_fmeasure_th,max_valid_fmeasure_th))
    #output the result to a file
    output_valid = output_valid + "\n" + "CC==>Final Validation results Validation Accuracy: %.3f ± %.3f (%.3f - %.3f)\tValidation Hamming Loss: %.3f ± %.3f (%.3f - %.3f)\tValidation Precision: %.3f ± %.3f (%.3f - %.3f)\tValidation Recall: %.3f ± %.3f (%.3f - %.3f)\tValidation F-measure: %.3f ± %.3f (%.3f - %.3f)" % (final_valid_acc_th,std_valid_acc_th,min_valid_acc_th,max_valid_acc_th,final_valid_hamming_loss_th,std_valid_hamming_loss_th,min_valid_hamming_loss_th,max_valid_hamming_loss_th,final_valid_prec_th,std_valid_prec_th,min_valid_prec_th,max_valid_prec_th,final_valid_rec_th,std_valid_rec_th,min_valid_rec_th,max_valid_rec_th,final_valid_fmeasure_th,std_valid_fmeasure_th,min_valid_fmeasure_th,max_valid_fmeasure_th) + "\n"
    output_csv_valid = output_csv_valid + "\n" + "average" + "," + str(round(final_valid_hamming_loss_th,3)) + "±" + str(round(std_valid_hamming_loss_th,3)) + "," + str(round(final_valid_acc_th,3)) + "±" + str(round(std_valid_acc_th,3)) + "," + str(round(final_valid_prec_th,3)) + "±" + str(round(std_valid_prec_th,3)) + "," + str(round(final_valid_rec_th,3)) + "±" + str(round(std_valid_rec_th,3)) + "," + str(round(final_valid_fmeasure_th,3)) + "±" + str(round(std_valid_fmeasure_th,3))
    
    # report min, max, std, average for the testing results
    min_test_acc_th = min(test_acc_th)
    min_test_prec_th = min(test_prec_th)
    min_test_rec_th = min(test_rec_th)
    min_test_fmeasure_th = min(test_fmeasure_th)
    min_test_hamming_loss_th = min(test_hamming_loss_th)
    
    max_test_acc_th = max(test_acc_th)
    max_test_prec_th = max(test_prec_th)
    max_test_rec_th = max(test_rec_th)
    max_test_fmeasure_th = max(test_fmeasure_th)
    max_test_hamming_loss_th = max(test_hamming_loss_th)
    
    if FLAGS.kfold != -1: # stdev is only defined for multiple runs; hold-out has a single run
        std_test_acc_th = statistics.stdev(test_acc_th)
        std_test_prec_th = statistics.stdev(test_prec_th)
        std_test_rec_th = statistics.stdev(test_rec_th)
        std_test_fmeasure_th = statistics.stdev(test_fmeasure_th)
        std_test_hamming_loss_th = statistics.stdev(test_hamming_loss_th)
    
    final_test_acc_th = sum(test_acc_th)/num_runs
    final_test_prec_th = sum(test_prec_th)/num_runs
    final_test_rec_th = sum(test_rec_th)/num_runs
    final_test_fmeasure_th = sum(test_fmeasure_th)/num_runs
    final_test_hamming_loss_th = sum(test_hamming_loss_th)/num_runs
    
    print("SVM==>Final Test results Test Accuracy: %.3f ± %.3f (%.3f - %.3f)\tTest Hamming Loss: %.3f ± %.3f (%.3f - %.3f)\tTest Precision: %.3f ± %.3f (%.3f - %.3f)\tTest Recall: %.3f ± %.3f (%.3f - %.3f)\tTest F-measure: %.3f ± %.3f (%.3f - %.3f)" % (final_test_acc_th,std_test_acc_th,min_test_acc_th,max_test_acc_th,final_test_hamming_loss_th,std_test_hamming_loss_th,min_test_hamming_loss_th,max_test_hamming_loss_th,final_test_prec_th,std_test_prec_th,min_test_prec_th,max_test_prec_th,final_test_rec_th,std_test_rec_th,min_test_rec_th,max_test_rec_th,final_test_fmeasure_th,std_test_fmeasure_th,min_test_fmeasure_th,max_test_fmeasure_th))
    #output the result to a file
    output_test = output_test + "\n" + "SVM==>Final Test results Test Accuracy: %.3f ± %.3f (%.3f - %.3f)\tTest Hamming Loss: %.3f ± %.3f (%.3f - %.3f)\tTest Precision: %.3f ± %.3f (%.3f - %.3f)\tTest Recall: %.3f ± %.3f (%.3f - %.3f)\tTest F-measure: %.3f ± %.3f (%.3f - %.3f)" % (final_test_acc_th,std_test_acc_th,min_test_acc_th,max_test_acc_th,final_test_hamming_loss_th,std_test_hamming_loss_th,min_test_hamming_loss_th,max_test_hamming_loss_th,final_test_prec_th,std_test_prec_th,min_test_prec_th,max_test_prec_th,final_test_rec_th,std_test_rec_th,min_test_rec_th,max_test_rec_th,final_test_fmeasure_th,std_test_fmeasure_th,min_test_fmeasure_th,max_test_fmeasure_th) + "\n"
    output_csv_test = output_csv_test + "\n" + "average" + "," + str(round(final_test_hamming_loss_th,3)) + "±" + str(round(std_test_hamming_loss_th,3)) + "," + str(round(final_test_acc_th,3)) + "±" + str(round(std_test_acc_th,3)) + "," + str(round(final_test_prec_th,3)) + "±" + str(round(std_test_prec_th,3)) + "," + str(round(final_test_rec_th,3)) + "±" + str(round(std_test_rec_th,3)) + "," + str(round(final_test_fmeasure_th,3)) + "±" + str(round(std_test_fmeasure_th,3))
    
    setting = "dataset:" + str(FLAGS.dataset) + "\nC: " + str(FLAGS.C) + "\ngamma: " + str(FLAGS.gamma)
    print("--- The whole program took %s seconds ---" % (time.time() - start_time))
    time_used = "--- The whole program took %s seconds ---" % (time.time() - start_time)
    if FLAGS.kfold != -1:
        print("--- The average training took %s ± %s seconds ---" % (sum(time_train)/num_runs,statistics.stdev(time_train)))
        average_time_train = "--- The average training took %s ± %s seconds ---" % (sum(time_train)/num_runs,statistics.stdev(time_train))
    else:
        print("--- The average training took %s ± %s seconds ---" % (sum(time_train)/num_runs,0))
        average_time_train = "--- The average training took %s ± %s seconds ---" % (sum(time_train)/num_runs,0)

    # output setting configuration, results, prediction and time used
    output_to_file('cc ' + str(FLAGS.dataset) + ' chains' + str(FLAGS.num_chains) + ' gp_id' + str(FLAGS.marking_id) + '.txt',setting + '\n' + output_valid + '\n' + output_test + '\n' + prediction_str + '\n' + time_used + '\n' + average_time_train)
    # output structured evaluation results
    output_to_file('cc ' + str(FLAGS.dataset) + ' chains' + str(FLAGS.num_chains) + ' gp_id' + str(FLAGS.marking_id) + ' valid.csv',output_csv_valid)
    output_to_file('cc ' + str(FLAGS.dataset) + ' chains' + str(FLAGS.num_chains) + ' gp_id' + str(FLAGS.marking_id) + ' test.csv',output_csv_test)