def main(_):
    """Train and evaluate a MALLET LDA topic model for multi-label classification.

    Workflow (all configuration comes from the module-level ``FLAGS`` object):
      1. Pick dataset-specific word2vec/training paths and per-doc label counts.
      2. Build word and label vocabularies, load data either hold-out
         (``FLAGS.kfold == -1``) or k-fold.
      3. Per training fold: build a gensim corpus, train a MALLET LDA model,
         convert it to a native gensim model, and score validation and test
         sets with ``do_eval_lda`` (k-NN over document-topic vectors).
      4. Aggregate min/max/mean/stdev over folds and write a text report plus
         two CSV files via ``output_to_file``.

    NOTE(review): relies on module-level names not defined here
    (``FLAGS``, ``start_time``, ``create_voabulary``, ``do_eval_lda``, ...);
    it is runnable only inside its original script.
    """
    #os.environ['CUDA_VISIBLE_DEVICES'] = ''
    # Dataset-specific configuration; sequence_length and ave_labels_per_doc
    # are mutated on FLAGS in place so downstream helpers see them.
    if FLAGS.dataset == "bibsonomy-clean":
        word2vec_model_path = FLAGS.word2vec_model_path_bib
        traning_data_path = FLAGS.training_data_path_bib  # (sic) project-wide spelling
        FLAGS.sequence_length = 300
        FLAGS.ave_labels_per_doc = 11.59
    elif FLAGS.dataset == "zhihu-sample":
        word2vec_model_path = FLAGS.word2vec_model_path_zhihu
        traning_data_path = FLAGS.training_data_path_zhihu
        FLAGS.sequence_length = 100
        FLAGS.ave_labels_per_doc = 2.45
    elif FLAGS.dataset == "citeulike-a-clean":
        word2vec_model_path = FLAGS.word2vec_model_path_cua
        traning_data_path = FLAGS.training_data_path_cua
        FLAGS.sequence_length = 300
        FLAGS.ave_labels_per_doc = 11.6
    elif FLAGS.dataset == "citeulike-t-clean":
        word2vec_model_path = FLAGS.word2vec_model_path_cut
        traning_data_path = FLAGS.training_data_path_cut
        FLAGS.sequence_length = 300
        FLAGS.ave_labels_per_doc = 7.68
    # NOTE(review): an unknown FLAGS.dataset falls through with
    # word2vec_model_path/traning_data_path unbound -> NameError below.

    # 1. create trainlist, validlist and testlist
    trainX, trainY, testX, testY = None, None, None, None
    vocabulary_word2index, vocabulary_index2word = create_voabulary(
        word2vec_model_path, name_scope=FLAGS.dataset + "-lda")  #simple='simple'
    vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(
        voabulary_label=traning_data_path, name_scope=FLAGS.dataset + "-lda")
    num_classes = len(vocabulary_word2index_label)
    print(vocabulary_index2word_label[0], vocabulary_index2word_label[1])
    vocab_size = len(vocabulary_word2index)
    print("vocab_size:", vocab_size)

    # choosing whether to use k-fold cross-validation or hold-out validation
    if FLAGS.kfold == -1:  # hold-out
        train, valid, test = load_data_multilabel_new(
            vocabulary_word2index,
            vocabulary_word2index_label,
            keep_label_percent=FLAGS.keep_label_percent,
            valid_portion=FLAGS.valid_portion,
            test_portion=FLAGS.test_portion,
            multi_label_flag=FLAGS.multi_label_flag,
            traning_data_path=traning_data_path)
        # here train, test are tuples; turn train into trainlist.
        trainlist, validlist, testlist = list(), list(), list()
        trainlist.append(train)
        validlist.append(valid)
        testlist.append(test)
    else:  # k-fold
        trainlist, validlist, testlist = load_data_multilabel_new_k_fold(
            vocabulary_word2index,
            vocabulary_word2index_label,
            keep_label_percent=FLAGS.keep_label_percent,
            kfold=FLAGS.kfold,
            test_portion=FLAGS.test_portion,
            multi_label_flag=FLAGS.multi_label_flag,
            traning_data_path=traning_data_path)
        # here trainlist, testlist are list of tuples.

    # get and pad testing data: there is only one testing data, but kfold training and validation data
    assert len(testlist) == 1
    testX, testY = testlist[0]
    testX = pad_sequences(testX, maxlen=FLAGS.sequence_length, value=0.)  # padding to max length

    # 3. transform trainlist to the format. x_train, x_test: training and test feature matrices of size (n_samples, n_features)
    #print(len(trainlist))
    #trainX,trainY = trainlist[0]
    #trainX = pad_sequences(trainX, maxlen=FLAGS.sequence_length, value=0.)
    #print(len(trainX))
    #print(len(trainX[0]))
    #print(trainX[0])
    #print(len(trainY))
    #print(len(trainY[0]))
    #print(trainY[0])
    #print(np.asarray(trainY).shape)

    num_runs = len(trainlist)  # one run per fold (1 for hold-out)
    #validation results variables
    valid_acc_th, valid_prec_th, valid_rec_th, valid_fmeasure_th, valid_hamming_loss_th = [0] * num_runs, [0] * num_runs, [0] * num_runs, [0] * num_runs, [0] * num_runs  # initialise the result lists
    final_valid_acc_th, final_valid_prec_th, final_valid_rec_th, final_valid_fmeasure_th, final_valid_hamming_loss_th = 0.0, 0.0, 0.0, 0.0, 0.0
    min_valid_acc_th, min_valid_prec_th, min_valid_rec_th, min_valid_fmeasure_th, min_valid_hamming_loss_th = 0.0, 0.0, 0.0, 0.0, 0.0
    max_valid_acc_th, max_valid_prec_th, max_valid_rec_th, max_valid_fmeasure_th, max_valid_hamming_loss_th = 0.0, 0.0, 0.0, 0.0, 0.0
    std_valid_acc_th, std_valid_prec_th, std_valid_rec_th, std_valid_fmeasure_th, std_valid_hamming_loss_th = 0.0, 0.0, 0.0, 0.0, 0.0
    #testing results variables
    test_acc_th, test_prec_th, test_rec_th, test_fmeasure_th, test_hamming_loss_th = [0] * num_runs, [0] * num_runs, [0] * num_runs, [0] * num_runs, [0] * num_runs  # initialise the testing result lists
    final_test_acc_th, final_test_prec_th, final_test_rec_th, final_test_fmeasure_th, final_test_hamming_loss_th = 0.0, 0.0, 0.0, 0.0, 0.0
    min_test_acc_th, min_test_prec_th, min_test_rec_th, min_test_fmeasure_th, min_test_hamming_loss_th = 0.0, 0.0, 0.0, 0.0, 0.0
    max_test_acc_th, max_test_prec_th, max_test_rec_th, max_test_fmeasure_th, max_test_hamming_loss_th = 0.0, 0.0, 0.0, 0.0, 0.0
    std_test_acc_th, std_test_prec_th, std_test_rec_th, std_test_fmeasure_th, std_test_hamming_loss_th = 0.0, 0.0, 0.0, 0.0, 0.0
    #output variables
    output_valid = ""
    output_test = ""
    output_csv_valid = "fold,hamming_loss,acc,prec,rec,f1"
    output_csv_test = "fold,hamming_loss,acc,prec,rec,f1"

    time_train = [0] * num_runs  # get time spent in training
    num_run = 0

    # LDA hyper-parameters; alpha = 50/T is the common heuristic prior.
    mallet_path = FLAGS.mallet_path
    num_topics = FLAGS.num_topics
    alpha = 50 / num_topics
    iterations = FLAGS.iterations
    k_num_doc = FLAGS.k_num_doc  # k for the k-nearest-document label transfer in do_eval_lda

    remove_pad_id = True
    remove_dot = True
    # Test documents are tokenised once here; per-fold they are re-encoded
    # with that fold's dictionary (id2word) below.
    docs_test = generateLDAdocFromIndex(testX, vocabulary_index2word, remove_pad_id=remove_pad_id, remove_dot=remove_dot)

    for trainfold in trainlist:
        # get training and validation data
        trainX, trainY = trainfold
        trainX = pad_sequences(trainX, maxlen=FLAGS.sequence_length, value=0.)
        # generate training data for gensim MALLET wrapper for LDA
        docs = generateLDAdocFromIndex(trainX, vocabulary_index2word, remove_pad_id=remove_pad_id, remove_dot=remove_dot)
        #print(docs[10])
        id2word = corpora.Dictionary(docs)
        corpus = [id2word.doc2bow(text) for text in docs]
        #print(corpus[10])

        # generate validation data for gensim MALLET wrapper for LDA
        validX, validY = validlist[num_run]
        validX = pad_sequences(validX, maxlen=FLAGS.sequence_length, value=0.)
        docs_valid = generateLDAdocFromIndex(validX, vocabulary_index2word, remove_pad_id=remove_pad_id, remove_dot=remove_dot)
        corpus_valid = [id2word.doc2bow(text) for text in docs_valid]

        # generate testing data for gensim MALLET wrapper for LDA
        corpus_test = [id2word.doc2bow(text) for text in docs_test]

        # training
        start_time_train = time.time()
        print('start training fold', str(num_run))
        model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, alpha=alpha, id2word=id2word, iterations=iterations)
        pprint(model.show_topics(formatted=False))
        print('num_run', str(num_run), 'train done.')
        time_train[num_run] = time.time() - start_time_train
        print("--- training of fold %s took %s seconds ---" % (num_run, time_train[num_run]))

        # represent each document as a topic vector
        #mat_train = np.array(model[corpus]) # this will cause an Error with large num_topics, e.g. 1000 or higher.
        #Thus, we turn the MALLET LDA model to a native Gensim LDA model
        model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(model)
        mat_train = np.array(model.get_document_topics(corpus, minimum_probability=0.0))
        #print(len(model[corpus[0]]))
        #print(len(model[corpus[1]]))
        #print(len(model[corpus[2]]))
        #print(mat_train.shape)
        # NOTE(review): assumes get_document_topics yields, for every document,
        # a full-length list of (topic_id, prob) pairs, i.e. mat_train has
        # shape (n_docs, num_topics, 2) -- minimum_probability=0.0 is what
        # makes the length uniform. [:, :, 1] keeps the probabilities.
        mat_train = mat_train[:, :, 1]  # documents in training set as a matrix of topic probabilities

        # evaluate on training data
        #if num_run == 0 and FLAGS.kfold != -1: # do this only for the first fold in k-fold cross-validation to save time
        #    acc, prec, rec, f_measure, hamming_loss = do_eval_lda(model, k_num_doc, mat_train, trainY, corpus, trainY, vocabulary_index2word_label, hamming_q=FLAGS.ave_labels_per_doc)
        #    print('training:', acc, prec, rec, f_measure, hamming_loss)

        # validation
        valid_acc_th[num_run], valid_prec_th[num_run], valid_rec_th[num_run], valid_fmeasure_th[num_run], valid_hamming_loss_th[num_run] = do_eval_lda(model, k_num_doc, mat_train, trainY, corpus_valid, validY, vocabulary_index2word_label, hamming_q=FLAGS.ave_labels_per_doc)
        print("LDA==>Run %d Validation Accuracy: %.3f\tValidation Hamming Loss: %.3f\tValidation Precision: %.3f\tValidation Recall: %.3f\tValidation F-measure: %.3f" % (num_run, valid_acc_th[num_run], valid_hamming_loss_th[num_run], valid_prec_th[num_run], valid_rec_th[num_run], valid_fmeasure_th[num_run]))
        output_valid = output_valid + "\n" + "LDA==>Run %d Validation Accuracy: %.3f\tValidation Hamming Loss: %.3f\tValidation Precision: %.3f\tValidation Recall: %.3f\tValidation F-measure: %.3f" % (num_run, valid_acc_th[num_run], valid_hamming_loss_th[num_run], valid_prec_th[num_run], valid_rec_th[num_run], valid_fmeasure_th[num_run]) + "\n"  # also output the results of each run.
        output_csv_valid = output_csv_valid + "\n" + str(num_run) + "," + str(valid_hamming_loss_th[num_run]) + "," + str(valid_acc_th[num_run]) + "," + str(valid_prec_th[num_run]) + "," + str(valid_rec_th[num_run]) + "," + str(valid_fmeasure_th[num_run])

        start_time_test = time.time()
        # evaluate on testing data
        test_acc_th[num_run], test_prec_th[num_run], test_rec_th[num_run], test_fmeasure_th[num_run], test_hamming_loss_th[num_run] = do_eval_lda(model, k_num_doc, mat_train, trainY, corpus_test, testY, vocabulary_index2word_label, hamming_q=FLAGS.ave_labels_per_doc)
        print("LDA==>Run %d Test Accuracy: %.3f\tTest Hamming Loss: %.3f\tTest Precision: %.3f\tTest Recall: %.3f\tTest F-measure: %.3f" % (num_run, test_acc_th[num_run], test_hamming_loss_th[num_run], test_prec_th[num_run], test_rec_th[num_run], test_fmeasure_th[num_run]))
        output_test = output_test + "\n" + "LDA==>Run %d Test Accuracy: %.3f\tTest Hamming Loss: %.3f\tTest Precision: %.3f\tTest Recall: %.3f\tTest F-measure: %.3f" % (num_run, test_acc_th[num_run], test_hamming_loss_th[num_run], test_prec_th[num_run], test_rec_th[num_run], test_fmeasure_th[num_run]) + "\n"  # also output the results of each run.
        output_csv_test = output_csv_test + "\n" + str(num_run) + "," + str(test_hamming_loss_th[num_run]) + "," + str(test_acc_th[num_run]) + "," + str(test_prec_th[num_run]) + "," + str(test_rec_th[num_run]) + "," + str(test_fmeasure_th[num_run])
        print("--- testing of fold %s took %s seconds ---" % (num_run, time.time() - start_time_test))

        prediction_str = ""
        # output final predictions for qualitative analysis
        # NOTE(review): prediction_str is re-bound every fold, so the report
        # written after the loop only contains the LAST fold's predictions;
        # it is also unbound after the loop if trainlist is empty.
        if FLAGS.report_rand_pred == True:
            prediction_str = display_for_qualitative_evaluation(model, k_num_doc, mat_train, trainY, corpus_test, testX, testY, vocabulary_index2word, vocabulary_index2word_label, hamming_q=FLAGS.ave_labels_per_doc)
        # update the num_run
        num_run = num_run + 1

    print('\n--Final Results--\n')
    #print('C', FLAGS.C, 'gamma', FLAGS.gamma)

    # report min, max, std, average for the validation results
    min_valid_acc_th = min(valid_acc_th)
    min_valid_prec_th = min(valid_prec_th)
    min_valid_rec_th = min(valid_rec_th)
    min_valid_fmeasure_th = min(valid_fmeasure_th)
    min_valid_hamming_loss_th = min(valid_hamming_loss_th)
    max_valid_acc_th = max(valid_acc_th)
    max_valid_prec_th = max(valid_prec_th)
    max_valid_rec_th = max(valid_rec_th)
    max_valid_fmeasure_th = max(valid_fmeasure_th)
    max_valid_hamming_loss_th = max(valid_hamming_loss_th)
    # stdev needs >= 2 data points, so it is skipped for hold-out (single run);
    # the std_* values then keep their 0.0 initialisation.
    if FLAGS.kfold != -1:
        std_valid_acc_th = statistics.stdev(valid_acc_th)  # to change
        std_valid_prec_th = statistics.stdev(valid_prec_th)
        std_valid_rec_th = statistics.stdev(valid_rec_th)
        std_valid_fmeasure_th = statistics.stdev(valid_fmeasure_th)
        std_valid_hamming_loss_th = statistics.stdev(valid_hamming_loss_th)
    final_valid_acc_th = sum(valid_acc_th) / num_runs
    final_valid_prec_th = sum(valid_prec_th) / num_runs
    final_valid_rec_th = sum(valid_rec_th) / num_runs
    final_valid_fmeasure_th = sum(valid_fmeasure_th) / num_runs
    final_valid_hamming_loss_th = sum(valid_hamming_loss_th) / num_runs

    print("LDA==>Final Validation results Validation Accuracy: %.3f ± %.3f (%.3f - %.3f)\tValidation Hamming Loss: %.3f ± %.3f (%.3f - %.3f)\tValidation Precision: %.3f ± %.3f (%.3f - %.3f)\tValidation Recall: %.3f ± %.3f (%.3f - %.3f)\tValidation F-measure: %.3f ± %.3f (%.3f - %.3f)" % (final_valid_acc_th, std_valid_acc_th, min_valid_acc_th, max_valid_acc_th, final_valid_hamming_loss_th, std_valid_hamming_loss_th, min_valid_hamming_loss_th, max_valid_hamming_loss_th, final_valid_prec_th, std_valid_prec_th, min_valid_prec_th, max_valid_prec_th, final_valid_rec_th, std_valid_rec_th, min_valid_rec_th, max_valid_rec_th, final_valid_fmeasure_th, std_valid_fmeasure_th, min_valid_fmeasure_th, max_valid_fmeasure_th))
    #output the result to a file
    output_valid = output_valid + "\n" + "LDA==>Final Validation results Validation Accuracy: %.3f ± %.3f (%.3f - %.3f)\tValidation Hamming Loss: %.3f ± %.3f (%.3f - %.3f)\tValidation Precision: %.3f ± %.3f (%.3f - %.3f)\tValidation Recall: %.3f ± %.3f (%.3f - %.3f)\tValidation F-measure: %.3f ± %.3f (%.3f - %.3f)" % (final_valid_acc_th, std_valid_acc_th, min_valid_acc_th, max_valid_acc_th, final_valid_hamming_loss_th, std_valid_hamming_loss_th, min_valid_hamming_loss_th, max_valid_hamming_loss_th, final_valid_prec_th, std_valid_prec_th, min_valid_prec_th, max_valid_prec_th, final_valid_rec_th, std_valid_rec_th, min_valid_rec_th, max_valid_rec_th, final_valid_fmeasure_th, std_valid_fmeasure_th, min_valid_fmeasure_th, max_valid_fmeasure_th) + "\n"
    output_csv_valid = output_csv_valid + "\n" + "average" + "," + str(round(final_valid_hamming_loss_th, 3)) + "±" + str(round(std_valid_hamming_loss_th, 3)) + "," + str(round(final_valid_acc_th, 3)) + "±" + str(round(std_valid_acc_th, 3)) + "," + str(round(final_valid_prec_th, 3)) + "±" + str(round(std_valid_prec_th, 3)) + "," + str(round(final_valid_rec_th, 3)) + "±" + str(round(std_valid_rec_th, 3)) + "," + str(round(final_valid_fmeasure_th, 3)) + "±" + str(round(std_valid_fmeasure_th, 3))

    # report min, max, std, average for the testing results
    min_test_acc_th = min(test_acc_th)
    min_test_prec_th = min(test_prec_th)
    min_test_rec_th = min(test_rec_th)
    min_test_fmeasure_th = min(test_fmeasure_th)
    min_test_hamming_loss_th = min(test_hamming_loss_th)
    max_test_acc_th = max(test_acc_th)
    max_test_prec_th = max(test_prec_th)
    max_test_rec_th = max(test_rec_th)
    max_test_fmeasure_th = max(test_fmeasure_th)
    max_test_hamming_loss_th = max(test_hamming_loss_th)
    if FLAGS.kfold != -1:
        std_test_acc_th = statistics.stdev(test_acc_th)  # to change
        std_test_prec_th = statistics.stdev(test_prec_th)
        std_test_rec_th = statistics.stdev(test_rec_th)
        std_test_fmeasure_th = statistics.stdev(test_fmeasure_th)
        std_test_hamming_loss_th = statistics.stdev(test_hamming_loss_th)
    final_test_acc_th = sum(test_acc_th) / num_runs
    final_test_prec_th = sum(test_prec_th) / num_runs
    final_test_rec_th = sum(test_rec_th) / num_runs
    final_test_fmeasure_th = sum(test_fmeasure_th) / num_runs
    final_test_hamming_loss_th = sum(test_hamming_loss_th) / num_runs

    print("LDA==>Final Test results Test Accuracy: %.3f ± %.3f (%.3f - %.3f)\tTest Hamming Loss: %.3f ± %.3f (%.3f - %.3f)\tTest Precision: %.3f ± %.3f (%.3f - %.3f)\tTest Recall: %.3f ± %.3f (%.3f - %.3f)\tTest F-measure: %.3f ± %.3f (%.3f - %.3f)" % (final_test_acc_th, std_test_acc_th, min_test_acc_th, max_test_acc_th, final_test_hamming_loss_th, std_test_hamming_loss_th, min_test_hamming_loss_th, max_test_hamming_loss_th, final_test_prec_th, std_test_prec_th, min_test_prec_th, max_test_prec_th, final_test_rec_th, std_test_rec_th, min_test_rec_th, max_test_rec_th, final_test_fmeasure_th, std_test_fmeasure_th, min_test_fmeasure_th, max_test_fmeasure_th))
    #output the result to a file
    output_test = output_test + "\n" + "LDA==>Final Test results Test Accuracy: %.3f ± %.3f (%.3f - %.3f)\tTest Hamming Loss: %.3f ± %.3f (%.3f - %.3f)\tTest Precision: %.3f ± %.3f (%.3f - %.3f)\tTest Recall: %.3f ± %.3f (%.3f - %.3f)\tTest F-measure: %.3f ± %.3f (%.3f - %.3f)" % (final_test_acc_th, std_test_acc_th, min_test_acc_th, max_test_acc_th, final_test_hamming_loss_th, std_test_hamming_loss_th, min_test_hamming_loss_th, max_test_hamming_loss_th, final_test_prec_th, std_test_prec_th, min_test_prec_th, max_test_prec_th, final_test_rec_th, std_test_rec_th, min_test_rec_th, max_test_rec_th, final_test_fmeasure_th, std_test_fmeasure_th, min_test_fmeasure_th, max_test_fmeasure_th) + "\n"
    output_csv_test = output_csv_test + "\n" + "average" + "," + str(round(final_test_hamming_loss_th, 3)) + "±" + str(round(std_test_hamming_loss_th, 3)) + "," + str(round(final_test_acc_th, 3)) + "±" + str(round(std_test_acc_th, 3)) + "," + str(round(final_test_prec_th, 3)) + "±" + str(round(std_test_prec_th, 3)) + "," + str(round(final_test_rec_th, 3)) + "±" + str(round(std_test_rec_th, 3)) + "," + str(round(final_test_fmeasure_th, 3)) + "±" + str(round(std_test_fmeasure_th, 3))

    setting = "dataset:" + str(FLAGS.dataset) + "\nT: " + str(FLAGS.num_topics) + "\nk: " + str(FLAGS.k_num_doc) + ' \ni: ' + str(FLAGS.iterations)
    # NOTE(review): start_time is a module-level global set at script start -- confirm.
    print("--- The whole program took %s seconds ---" % (time.time() - start_time))
    time_used = "--- The whole program took %s seconds ---" % (time.time() - start_time)
    if FLAGS.kfold != -1:
        print("--- The average training took %s ± %s seconds ---" % (sum(time_train) / num_runs, statistics.stdev(time_train)))
        average_time_train = "--- The average training took %s ± %s seconds ---" % (sum(time_train) / num_runs, statistics.stdev(time_train))
    else:
        print("--- The average training took %s ± %s seconds ---" % (sum(time_train) / num_runs, 0))
        average_time_train = "--- The average training took %s ± %s seconds ---" % (sum(time_train) / num_runs, 0)

    # output setting configuration, results, prediction and time used
    output_to_file(
        'lda ' + str(FLAGS.dataset) + " T" + str(FLAGS.num_topics) + ' k' + str(FLAGS.k_num_doc) + ' i' + str(FLAGS.iterations) + ' gp_id' + str(FLAGS.marking_id) + '.txt',
        setting + '\n' + output_valid + '\n' + output_test + '\n' + prediction_str + '\n' + time_used + '\n' + average_time_train)
    # output structured evaluation results
    output_to_file(
        'lda ' + str(FLAGS.dataset) + " T" + str(FLAGS.num_topics) + ' k' + str(FLAGS.k_num_doc) + ' i' + str(FLAGS.iterations) + ' gp_id' + str(FLAGS.marking_id) + ' valid.csv',
        output_csv_valid)
    output_to_file(
        'lda ' + str(FLAGS.dataset) + " T" + str(FLAGS.num_topics) + ' k' + str(FLAGS.k_num_doc) + ' i' + str(FLAGS.iterations) + ' gp_id' + str(FLAGS.marking_id) + ' test.csv',
        output_csv_test)
def main(_): #os.environ['CUDA_VISIBLE_DEVICES'] = '' if FLAGS.dataset == "bibsonomy-clean": word2vec_model_path = FLAGS.word2vec_model_path_bib traning_data_path = FLAGS.training_data_path_bib FLAGS.sequence_length = 300 FLAGS.ave_labels_per_doc = 11.59 elif FLAGS.dataset == "zhihu-sample": word2vec_model_path = FLAGS.word2vec_model_path_zhihu traning_data_path = FLAGS.training_data_path_zhihu FLAGS.sequence_length = 100 FLAGS.ave_labels_per_doc = 2.45 elif FLAGS.dataset == "citeulike-a-clean": word2vec_model_path = FLAGS.word2vec_model_path_cua traning_data_path = FLAGS.training_data_path_cua FLAGS.sequence_length = 300 FLAGS.ave_labels_per_doc = 11.6 elif FLAGS.dataset == "citeulike-t-clean": word2vec_model_path = FLAGS.word2vec_model_path_cut traning_data_path = FLAGS.training_data_path_cut FLAGS.sequence_length = 300 FLAGS.ave_labels_per_doc = 7.68 # 1. create trainlist, validlist and testlist trainX, trainY, testX, testY = None, None, None, None vocabulary_word2index, vocabulary_index2word = create_voabulary(word2vec_model_path,name_scope=FLAGS.dataset + "-svm") #simple='simple' vocabulary_word2index_label,vocabulary_index2word_label = create_voabulary_label(voabulary_label=traning_data_path, name_scope=FLAGS.dataset + "-svm") num_classes=len(vocabulary_word2index_label) print(vocabulary_index2word_label[0],vocabulary_index2word_label[1]) vocab_size = len(vocabulary_word2index) print("vocab_size:",vocab_size) # choosing whether to use k-fold cross-validation or hold-out validation if FLAGS.kfold == -1: # hold-out train, valid, test = load_data_multilabel_new(vocabulary_word2index, vocabulary_word2index_label,keep_label_percent=FLAGS.keep_label_percent,valid_portion=FLAGS.valid_portion,test_portion=FLAGS.test_portion,multi_label_flag=FLAGS.multi_label_flag,traning_data_path=traning_data_path) # here train, test are tuples; turn train into trainlist. 
trainlist, validlist, testlist = list(), list(), list() trainlist.append(train) validlist.append(valid) testlist.append(test) else: # k-fold trainlist, validlist, testlist = load_data_multilabel_new_k_fold(vocabulary_word2index, vocabulary_word2index_label,keep_label_percent=FLAGS.keep_label_percent,kfold=FLAGS.kfold,test_portion=FLAGS.test_portion,multi_label_flag=FLAGS.multi_label_flag,traning_data_path=traning_data_path) # here trainlist, testlist are list of tuples. # get and pad testing data: there is only one testing data, but kfold training and validation data assert len(testlist) == 1 testX, testY = testlist[0] testX = pad_sequences(testX, maxlen=FLAGS.sequence_length, value=0.) # padding to max length # 2. get word_embedding matrix: shape (21425,100) word2vec_model = word2vec.load(word2vec_model_path, kind='bin') word2vec_dict = {} for word, vector in zip(word2vec_model.vocab, word2vec_model.vectors): word2vec_dict[word] = vector word_embedding_2dlist = [[]] * vocab_size # create an empty word_embedding list: which is a list of list, i.e. a list of word, where each word is a list of values as an embedding vector. word_embedding_2dlist[0] = np.zeros(FLAGS.embed_size) # assign empty for first word:'PAD' bound = np.sqrt(6.0) / np.sqrt(vocab_size) # bound for random variables. count_exist = 0; count_not_exist = 0 for i in range(1, vocab_size): # loop each word word = vocabulary_index2word[i] # get a word embedding = None try: embedding = word2vec_dict[word] # try to get vector:it is an array. except Exception: embedding = None if embedding is not None: # the 'word' exist a embedding word_embedding_2dlist[i] = embedding; count_exist = count_exist + 1 # assign array to this word. else: # no embedding for this word word_embedding_2dlist[i] = np.random.uniform(-bound, bound, FLAGS.embed_size); count_not_exist = count_not_exist + 1 # init a random value for the word. word_embedding_final = np.array(word_embedding_2dlist) # covert to 2d array. 
print('embedding per word:',word_embedding_final) print('embedding per word, shape:',word_embedding_final.shape) # 3. transform trainlist to the format. x_train, x_test: training and test feature matrices of size (n_samples, n_features) #print(len(trainlist)) #trainX,trainY = trainlist[0] #trainX = pad_sequences(trainX, maxlen=FLAGS.sequence_length, value=0.) #print(len(trainX)) #print(len(trainX[0])) #print(trainX[0]) #print(len(trainY)) #print(len(trainY[0])) #print(trainY[0]) #print(np.asarray(trainY).shape) num_runs = len(trainlist) #validation results variables valid_acc_th,valid_prec_th,valid_rec_th,valid_fmeasure_th,valid_hamming_loss_th =[0]*num_runs,[0]*num_runs,[0]*num_runs,[0]*num_runs,[0]*num_runs # initialise the result lists final_valid_acc_th,final_valid_prec_th,final_valid_rec_th,final_valid_fmeasure_th,final_valid_hamming_loss_th = 0.0,0.0,0.0,0.0,0.0 min_valid_acc_th,min_valid_prec_th,min_valid_rec_th,min_valid_fmeasure_th,min_valid_hamming_loss_th = 0.0,0.0,0.0,0.0,0.0 max_valid_acc_th,max_valid_prec_th,max_valid_rec_th,max_valid_fmeasure_th,max_valid_hamming_loss_th = 0.0,0.0,0.0,0.0,0.0 std_valid_acc_th,std_valid_prec_th,std_valid_rec_th,std_valid_fmeasure_th,std_valid_hamming_loss_th = 0.0,0.0,0.0,0.0,0.0 #testing results variables test_acc_th,test_prec_th,test_rec_th,test_fmeasure_th,test_hamming_loss_th = [0]*num_runs,[0]*num_runs,[0]*num_runs,[0]*num_runs,[0]*num_runs # initialise the testing result lists final_test_acc_th,final_test_prec_th,final_test_rec_th,final_test_fmeasure_th,final_test_hamming_loss_th = 0.0,0.0,0.0,0.0,0.0 min_test_acc_th,min_test_prec_th,min_test_rec_th,min_test_fmeasure_th,min_test_hamming_loss_th = 0.0,0.0,0.0,0.0,0.0 max_test_acc_th,max_test_prec_th,max_test_rec_th,max_test_fmeasure_th,max_test_hamming_loss_th = 0.0,0.0,0.0,0.0,0.0 std_test_acc_th,std_test_prec_th,std_test_rec_th,std_test_fmeasure_th,std_test_hamming_loss_th = 0.0,0.0,0.0,0.0,0.0 #output variables output_valid = "" output_test = "" 
output_csv_valid = "fold,hamming_loss,acc,prec,rec,f1" output_csv_test = "fold,hamming_loss,acc,prec,rec,f1" time_train = [0]*num_runs # get time spent in training num_run = 0 testX_embedded = get_embedded_words(testX,word_embedding_final,vocab_size) print('testX_embedded:',testX_embedded) print('testX_embedded:',testX_embedded.shape) for trainfold in trainlist: # get training and validation data trainX,trainY=trainfold trainX = pad_sequences(trainX, maxlen=FLAGS.sequence_length, value=0.) trainX_embedded = get_embedded_words(trainX,word_embedding_final,vocab_size) print('trainX_embedded:',trainX_embedded) print('trainX_embedded:',trainX_embedded.shape) # code for debugging with less training data # debugging_num=1000 # print('trainX_embedded_for_debugging:',trainX_embedded[1:debugging_num].shape) # for quick debugging # trainX_embedded = trainX_embedded[1:debugging_num] # for quick debugging # trainY = trainY[1:debugging_num] # for quick debugging validX,validY=validlist[num_run] validX = pad_sequences(validX, maxlen=FLAGS.sequence_length, value=0.) validX_embedded = get_embedded_words(validX,word_embedding_final,vocab_size) print('validX_embedded:',validX_embedded) print('validX_embedded:',validX_embedded.shape) # ** training ** start_time_train = time.time() print('start training fold',str(num_run)) trainY_int = np.asarray(trainY).astype(int) #print(type(trainY_int)) # <class 'numpy.ndarray'> #print(np.asarray(trainY).astype(int) == 1) #check trainY and remove labels that are False for all training instances one_class_label_list = list() # the list of labels that are not associated with any training instances. 
#print(trainY_int.shape) #print(sum(trainY_int[:,2])) for k in range(num_classes): if sum(trainY_int[:,k]) == 0: #print(k) one_class_label_list.append(k) # to delete the labels not associated to any labels in the training data trainY_int_pruned =np.delete(trainY_int,one_class_label_list,1) print(trainY_int_pruned.shape) #print(len(one_class_label_list),one_class_label_list) # base_lr = LogisticRegression() # #base_svm = SVC(kernel='rbf',C=FLAGS.C,gamma=FLAGS.gamma,probability=False) # #model = train_svm(trainX_embedded,np.asarray(trainY).astype(int)) # chains = [ClassifierChain(base_lr, order='random', random_state=i) for i in range(3)] # count_chain=0 # for chain in chains: # chain.fit(trainX_embedded,trainY_int_pruned == 1) # print('chain',count_chain,'out of',3,'done') # count_chain=count_chain+1 # print('num_run',str(num_run),'train done.') chains = train_cc(trainX_embedded,trainY_int_pruned,num_chains=FLAGS.num_chains) time_train[num_run] = time.time() - start_time_train print("--- training of fold %s took %s seconds ---" % (num_run,time_train[num_run])) # evaluate on training data #acc, prec, rec, f_measure, hamming_loss = do_eval(model,trainX_embedded,np.asarray(trainY),hamming_q=FLAGS.ave_labels_per_doc) acc, prec, rec, f_measure, hamming_loss = do_eval_chains(chains,one_class_label_list,trainX_embedded,np.asarray(trainY),hamming_q=FLAGS.ave_labels_per_doc) #print('training:', acc, prec, rec, f_measure, hamming_loss) #pp = model.predict_proba(trainX_embedded) #print('pp',pp) #print('pp:',pp.shape) #print('pp_sum',np.sum(pp,0)) #print('pp_sum',np.sum(pp,1)) # evaluate on validation data #valid_acc_th[num_run],valid_prec_th[num_run],valid_rec_th[num_run],valid_fmeasure_th[num_run],valid_hamming_loss_th[num_run] = do_eval(model,validX_embedded,validY,hamming_q=FLAGS.ave_labels_per_doc) valid_acc_th[num_run],valid_prec_th[num_run],valid_rec_th[num_run],valid_fmeasure_th[num_run],valid_hamming_loss_th[num_run] = 
do_eval_chains(chains,one_class_label_list,validX_embedded,validY,hamming_q=FLAGS.ave_labels_per_doc) #print('validation:', acc, prec, rec, f_measure, hamming_loss) print("CC==>Run %d Validation Accuracy: %.3f\tValidation Hamming Loss: %.3f\tValidation Precision: %.3f\tValidation Recall: %.3f\tValidation F-measure: %.3f" % (num_run,valid_acc_th[num_run],valid_hamming_loss_th[num_run],valid_prec_th[num_run],valid_rec_th[num_run],valid_fmeasure_th[num_run])) output_valid = output_valid + "\n" + "CC==>Run %d Validation Accuracy: %.3f\tValidation Hamming Loss: %.3f\tValidation Precision: %.3f\tValidation Recall: %.3f\tValidation F-measure: %.3f" % (num_run,valid_acc_th[num_run],valid_hamming_loss_th[num_run],valid_prec_th[num_run],valid_rec_th[num_run],valid_fmeasure_th[num_run]) + "\n" # also output the results of each run. output_csv_valid = output_csv_valid + "\n" + str(num_run) + "," + str(valid_hamming_loss_th[num_run]) + "," + str(valid_acc_th[num_run]) + "," + str(valid_prec_th[num_run]) + "," + str(valid_rec_th[num_run]) + "," + str(valid_fmeasure_th[num_run]) start_time_test = time.time() # evaluate on testing data #test_acc_th[num_run],test_prec_th[num_run],test_rec_th[num_run],test_fmeasure_th[num_run],test_hamming_loss_th[num_run] = do_eval(model,testX_embedded,testY,hamming_q=FLAGS.ave_labels_per_doc) test_acc_th[num_run],test_prec_th[num_run],test_rec_th[num_run],test_fmeasure_th[num_run],test_hamming_loss_th[num_run] = do_eval_chains(chains,one_class_label_list,testX_embedded,testY,hamming_q=FLAGS.ave_labels_per_doc) #print('testing:', acc, prec, rec, f_measure, hamming_loss) print("CC==>Run %d Test Accuracy: %.3f\tTest Hamming Loss: %.3f\tTest Precision: %.3f\tTest Recall: %.3f\tTest F-measure: %.3f" % (num_run,test_acc_th[num_run],test_hamming_loss_th[num_run],test_prec_th[num_run],test_rec_th[num_run],test_fmeasure_th[num_run])) output_test = output_test + "\n" + "CC==>Run %d Test Accuracy: %.3f\tTest Hamming Loss: %.3f\tTest Precision: %.3f\tTest 
Recall: %.3f\tTest F-measure: %.3f" % (num_run,test_acc_th[num_run],test_hamming_loss_th[num_run],test_prec_th[num_run],test_rec_th[num_run],test_fmeasure_th[num_run]) + "\n" # also output the results of each run. output_csv_test = output_csv_test + "\n" + str(num_run) + "," + str(test_hamming_loss_th[num_run]) + "," + str(test_acc_th[num_run]) + "," + str(test_prec_th[num_run]) + "," + str(test_rec_th[num_run]) + "," + str(test_fmeasure_th[num_run]) print("--- testing of fold %s took %s seconds ---" % (num_run, time.time() - start_time_test)) prediction_str = "" # output final predictions for qualitative analysis if FLAGS.report_rand_pred == True: #prediction_str = display_for_qualitative_evaluation(model, testX_embedded,testX,testY,vocabulary_index2word,vocabulary_index2word_label) prediction_str = display_for_qualitative_evaluation_chains(chains, one_class_label_list,testX_embedded,testX,testY,vocabulary_index2word,vocabulary_index2word_label) # update the num_run num_run = num_run + 1 print('\n--Final Results--\n') #print('C', FLAGS.C, 'gamma', FLAGS.gamma) # report min, max, std, average for the validation results min_valid_acc_th = min(valid_acc_th) min_valid_prec_th = min(valid_prec_th) min_valid_rec_th = min(valid_rec_th) min_valid_fmeasure_th = min(valid_fmeasure_th) min_valid_hamming_loss_th = min(valid_hamming_loss_th) max_valid_acc_th = max(valid_acc_th) max_valid_prec_th = max(valid_prec_th) max_valid_rec_th = max(valid_rec_th) max_valid_fmeasure_th = max(valid_fmeasure_th) max_valid_hamming_loss_th = max(valid_hamming_loss_th) if FLAGS.kfold != -1: std_valid_acc_th = statistics.stdev(valid_acc_th) # to change std_valid_prec_th = statistics.stdev(valid_prec_th) std_valid_rec_th = statistics.stdev(valid_rec_th) std_valid_fmeasure_th = statistics.stdev(valid_fmeasure_th) std_valid_hamming_loss_th = statistics.stdev(valid_hamming_loss_th) final_valid_acc_th = sum(valid_acc_th)/num_runs final_valid_prec_th = sum(valid_prec_th)/num_runs final_valid_rec_th = 
sum(valid_rec_th)/num_runs final_valid_fmeasure_th = sum(valid_fmeasure_th)/num_runs final_valid_hamming_loss_th = sum(valid_hamming_loss_th)/num_runs print("CC==>Final Validation results Validation Accuracy: %.3f ± %.3f (%.3f - %.3f)\tValidation Hamming Loss: %.3f ± %.3f (%.3f - %.3f)\tValidation Precision: %.3f ± %.3f (%.3f - %.3f)\tValidation Recall: %.3f ± %.3f (%.3f - %.3f)\tValidation F-measure: %.3f ± %.3f (%.3f - %.3f)" % (final_valid_acc_th,std_valid_acc_th,min_valid_acc_th,max_valid_acc_th,final_valid_hamming_loss_th,std_valid_hamming_loss_th,min_valid_hamming_loss_th,max_valid_hamming_loss_th,final_valid_prec_th,std_valid_prec_th,min_valid_prec_th,max_valid_prec_th,final_valid_rec_th,std_valid_rec_th,min_valid_rec_th,max_valid_rec_th,final_valid_fmeasure_th,std_valid_fmeasure_th,min_valid_fmeasure_th,max_valid_fmeasure_th)) #output the result to a file output_valid = output_valid + "\n" + "CC==>Final Validation results Validation Accuracy: %.3f ± %.3f (%.3f - %.3f)\tValidation Hamming Loss: %.3f ± %.3f (%.3f - %.3f)\tValidation Precision: %.3f ± %.3f (%.3f - %.3f)\tValidation Recall: %.3f ± %.3f (%.3f - %.3f)\tValidation F-measure: %.3f ± %.3f (%.3f - %.3f)" % (final_valid_acc_th,std_valid_acc_th,min_valid_acc_th,max_valid_acc_th,final_valid_hamming_loss_th,std_valid_hamming_loss_th,min_valid_hamming_loss_th,max_valid_hamming_loss_th,final_valid_prec_th,std_valid_prec_th,min_valid_prec_th,max_valid_prec_th,final_valid_rec_th,std_valid_rec_th,min_valid_rec_th,max_valid_rec_th,final_valid_fmeasure_th,std_valid_fmeasure_th,min_valid_fmeasure_th,max_valid_fmeasure_th) + "\n" output_csv_valid = output_csv_valid + "\n" + "average" + "," + str(round(final_valid_hamming_loss_th,3)) + "±" + str(round(std_valid_hamming_loss_th,3)) + "," + str(round(final_valid_acc_th,3)) + "±" + str(round(std_valid_acc_th,3)) + "," + str(round(final_valid_prec_th,3)) + "±" + str(round(std_valid_prec_th,3)) + "," + str(round(final_valid_rec_th,3)) + "±" + 
str(round(std_valid_rec_th,3)) + "," + str(round(final_valid_fmeasure_th,3)) + "±" + str(round(std_valid_fmeasure_th,3)) # report min, max, std, average for the testing results min_test_acc_th = min(test_acc_th) min_test_prec_th = min(test_prec_th) min_test_rec_th = min(test_rec_th) min_test_fmeasure_th = min(test_fmeasure_th) min_test_hamming_loss_th = min(test_hamming_loss_th) max_test_acc_th = max(test_acc_th) max_test_prec_th = max(test_prec_th) max_test_rec_th = max(test_rec_th) max_test_fmeasure_th = max(test_fmeasure_th) max_test_hamming_loss_th = max(test_hamming_loss_th) if FLAGS.kfold != -1: std_test_acc_th = statistics.stdev(test_acc_th) # to change std_test_prec_th = statistics.stdev(test_prec_th) std_test_rec_th = statistics.stdev(test_rec_th) std_test_fmeasure_th = statistics.stdev(test_fmeasure_th) std_test_hamming_loss_th = statistics.stdev(test_hamming_loss_th) final_test_acc_th = sum(test_acc_th)/num_runs final_test_prec_th = sum(test_prec_th)/num_runs final_test_rec_th = sum(test_rec_th)/num_runs final_test_fmeasure_th = sum(test_fmeasure_th)/num_runs final_test_hamming_loss_th = sum(test_hamming_loss_th)/num_runs print("SVM==>Final Test results Test Accuracy: %.3f ± %.3f (%.3f - %.3f)\tTest Hamming Loss: %.3f ± %.3f (%.3f - %.3f)\tTest Precision: %.3f ± %.3f (%.3f - %.3f)\tTest Recall: %.3f ± %.3f (%.3f - %.3f)\tTest F-measure: %.3f ± %.3f (%.3f - %.3f)" % (final_test_acc_th,std_test_acc_th,min_test_acc_th,max_test_acc_th,final_test_hamming_loss_th,std_test_hamming_loss_th,min_test_hamming_loss_th,max_test_hamming_loss_th,final_test_prec_th,std_test_prec_th,min_test_prec_th,max_test_prec_th,final_test_rec_th,std_test_rec_th,min_test_rec_th,max_test_rec_th,final_test_fmeasure_th,std_test_fmeasure_th,min_test_fmeasure_th,max_test_fmeasure_th)) #output the result to a file output_test = output_test + "\n" + "SVM==>Final Test results Test Accuracy: %.3f ± %.3f (%.3f - %.3f)\tTest Hamming Loss: %.3f ± %.3f (%.3f - %.3f)\tTest Precision: %.3f ± %.3f 
(%.3f - %.3f)\tTest Recall: %.3f ± %.3f (%.3f - %.3f)\tTest F-measure: %.3f ± %.3f (%.3f - %.3f)" % (final_test_acc_th,std_test_acc_th,min_test_acc_th,max_test_acc_th,final_test_hamming_loss_th,std_test_hamming_loss_th,min_test_hamming_loss_th,max_test_hamming_loss_th,final_test_prec_th,std_test_prec_th,min_test_prec_th,max_test_prec_th,final_test_rec_th,std_test_rec_th,min_test_rec_th,max_test_rec_th,final_test_fmeasure_th,std_test_fmeasure_th,min_test_fmeasure_th,max_test_fmeasure_th) + "\n" output_csv_test = output_csv_test + "\n" + "average" + "," + str(round(final_test_hamming_loss_th,3)) + "±" + str(round(std_test_hamming_loss_th,3)) + "," + str(round(final_test_acc_th,3)) + "±" + str(round(std_test_acc_th,3)) + "," + str(round(final_test_prec_th,3)) + "±" + str(round(std_test_prec_th,3)) + "," + str(round(final_test_rec_th,3)) + "±" + str(round(std_test_rec_th,3)) + "," + str(round(final_test_fmeasure_th,3)) + "±" + str(round(std_test_fmeasure_th,3)) setting = "dataset:" + str(FLAGS.dataset) + "\nC: " + str(FLAGS.C) + "\ngamma: " + str(FLAGS.gamma) print("--- The whole program took %s seconds ---" % (time.time() - start_time)) time_used = "--- The whole program took %s seconds ---" % (time.time() - start_time) if FLAGS.kfold != -1: print("--- The average training took %s ± %s seconds ---" % (sum(time_train)/num_runs,statistics.stdev(time_train))) average_time_train = "--- The average training took %s ± %s seconds ---" % (sum(time_train)/num_runs,statistics.stdev(time_train)) else: print("--- The average training took %s ± %s seconds ---" % (sum(time_train)/num_runs,0)) average_time_train = "--- The average training took %s ± %s seconds ---" % (sum(time_train)/num_runs,0) # output setting configuration, results, prediction and time used output_to_file('svm ' + str(FLAGS.dataset) + " C " + str(FLAGS.C) + ' gamma' + str(FLAGS.gamma) + ' gp_id' + str(FLAGS.marking_id) + ' gp_id' ... 
# NOTE(review): .txt report = settings + validation summary + test summary + optional random predictions + timing; the two CSVs below hold the per-run rows plus the "average" row built above.
time_used + '\n' + average_time_train) # output structured evaluation results output_to_file('svm ' + str(FLAGS.dataset) + " C " + str(FLAGS.C) + ' gamma' + str(FLAGS.gamma) + ' gp_id' + str(FLAGS.marking_id) + ' valid.csv',output_csv_valid) output_to_file('svm ' + str(FLAGS.dataset) + " C " + str(FLAGS.C) + ' gamma' + str(FLAGS.gamma) + ' gp_id' + str(FLAGS.marking_id) + ' test.csv',output_csv_test)
# NOTE(review): this span is the reporting tail of main(). It aggregates the per-run
# (per-fold) validation/test metric lists into min/max/mean, computes stdev only when
# FLAGS.kfold != -1 (hold-out runs once, and statistics.stdev raises StatisticsError
# on fewer than two data points), prints the summaries, and writes one .txt report
# plus two CSV files via output_to_file.
# NOTE(review): the validation summary line is prefixed "CC==>" (classifier chains —
# matches display_for_qualitative_evaluation_chains above) while the test summary is
# prefixed "SVM==>" and the output filenames start with 'svm ' — presumably a
# copy-paste leftover from a sibling script; confirm which model this file runs
# before trusting the labels in the reports.
# NOTE(review): std_test_* / std_valid_* are interpolated into the final prints even
# when kfold == -1 (stdev branch skipped); std_valid_* are initialised to 0.0 earlier
# in main, and this assumes std_test_* are too — TODO confirm in the unseen section.