import argparse
import os

import numpy as np

# NOTE: the imports below are project-local modules; the exact module paths are
# assumed here and may need to be adjusted to match the repo layout.
from utils import load_pickle, generate_label_embedding_from_file_2
from data_loader import TrainDataLoader2, TestDataLoader2, DataLoader5
from models import LSTM, biLSTM, CNN, XML_CNN
from solver import ModelSolver


def main():
    parse = argparse.ArgumentParser()
    # ---------- environment setting: which gpu -------
    parse.add_argument('-gpu', '--gpu', type=str, default='0', help='which gpu to use: 0 or 1')
    # ---------- folder path of train/test data -------
    parse.add_argument('-folder', '--folder_path', type=str,
                       default='datasets/Amazon-670K/data/deeplearning_data/adjacent_labels/',
                       help='path to train/test data')
    # ---------- vocab and word embeddings --------
    # parse.add_argument('-vocab', '--vocab_path', type=str, default='vocab.6B.300d.pkl', help='path to the vocab')
    parse.add_argument('-word_embeddings', '--word_embedding_path', type=str, default='word_emb.6B.300d.npy',
                       help='path to the word embeddings')
    # ---------- model ----------
    parse.add_argument('-max_seq_len', '--max_seq_len', type=int, default=500, help='maximum sequence length')
    parse.add_argument('-model', '--model', type=str, default='CNN', help='model: LSTM, biLSTM, CNN')
    parse.add_argument('-pretrained_model', '--pretrained_model_path', type=str, default=None,
                       help='path to the pretrained model')
    parse.add_argument('-cal_metrics', '--cal_metrics', type=int, default=1,
                       help='whether to calculate wts_p and wts_ndcg for baseline results')
    parse.add_argument('-alpha', '--alpha', type=float, default=0.2,
                       help='trade-off parameter between baseline score and refinement score')
    # ---------- params for CNN ------------
    parse.add_argument('-num_filters', '--num_filters', type=int, default=32, help='number of filters in CNN')
    parse.add_argument('-pooling_units', '--pooling_units', type=int, default=64, help='number of pooling units')
    parse.add_argument('-dropout_keep_prob', '--dropout_keep_prob', type=float, default=0.5,
                       help='keep probability in dropout layer')
    filter_sizes = [2, 4, 8]
    # ---------- training parameters --------
    # parse.add_argument('-if_use_all_true', '--if_use_all_true', type=int, default=0, help='if use all true labels for training')
    parse.add_argument('-if_output_all_labels', '--if_output_all_labels', type=int, default=0,
                       help='whether to output all labels')
    parse.add_argument('-n_epochs', '--n_epochs', type=int, default=10, help='number of epochs')
    parse.add_argument('-batch_size', '--batch_size', type=int, default=256, help='batch size for training')
    parse.add_argument('-batch_pid_size', '--batch_pid_size', type=int, default=5, help='batch pid size for testing')
    # parse.add_argument('-num_candidate', '--num_candidate', type=int, default=20, help='number of candidate labels')
    # parse.add_argument('-topk', '--topk', type=int, default=6, help='k in competitive layer')
    parse.add_argument('-show_batches', '--show_batches', type=int, default=1000,
                       help='report progress every this many batches')
    parse.add_argument('-lr', '--learning_rate', type=float, default=0.0001, help='learning rate')
    parse.add_argument('-update_rule', '--update_rule', type=str, default='adam', help='update rule')
    # ------ train or predict -------
    parse.add_argument('-train', '--train', type=int, default=1, help='whether to train')
    parse.add_argument('-test', '--test', type=int, default=0, help='whether to test')
    parse.add_argument('-predict', '--predict', type=int, default=0, help='whether to predict')
    args = parse.parse_args()

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    print('-------------- load vocab and word embeddings ---------------')
    word_embeddings = np.load('datasets/material/' + args.word_embedding_path)
    word_embedding_dim = word_embeddings.shape[-1]
    print('word_embeddings shape: ' + str(word_embeddings.shape))
    # prepend the '<PAD>' embedding (all zeros) at index 0
    word_embeddings = np.concatenate((np.zeros((1, word_embedding_dim)), word_embeddings), axis=0)
    print('after adding PAD embedding, word_embeddings shape: ' + str(word_embeddings.shape))

    print('-------------- load label embeddings ------------------------')
    all_labels, label_embeddings = generate_label_embedding_from_file_2(args.folder_path + 'label.embeddings')
    label_embeddings = np.array(label_embeddings)
    label_dict = dict(zip(all_labels, range(len(all_labels))))
    print('number of labels: ' + str(len(all_labels)))
    # dimension of one label embedding; shape[-1] is used instead of indexing by
    # label value, since label IDs are not necessarily valid row indices
    label_embedding_dim = label_embeddings.shape[-1]

    print('-------------- load label propensity ------------------------')
    label_prop = load_pickle(args.folder_path + 'inv_prop_dict.pkl')

    print('-------------- load train/test data -------------------------')
    train_doc = load_pickle(args.folder_path + 'train_doc_wordID.pkl')
    test_doc = load_pickle(args.folder_path + 'test_doc_wordID.pkl')
    train_label = load_pickle(args.folder_path + 'train_label.pkl')
    test_label = load_pickle(args.folder_path + 'test_label.pkl')

    print('-------------- load candidate labels ------------------------')
    if 'sleec' in args.model:
        candidate_type = 'sleec'
    elif 'pfastrexml' in args.model:
        candidate_type = 'pfastrexml'
    elif 'pfastxml' in args.model:
        candidate_type = 'pfastxml'
    elif 'fastxml' in args.model:
        candidate_type = 'fastxml'
    else:
        raise ValueError('no candidate type found in model name: ' + args.model)
    print('candidate from: ' + candidate_type)
    candidate_folder_path = args.folder_path + candidate_type + '_candidate/'
    train_candidate_label = load_pickle(candidate_folder_path + 'train_candidate_label.pkl')
    test_candidate_label = load_pickle(candidate_folder_path + 'test_candidate_label.pkl')

    print('============== create train/test data loader ...')
    if 'XML' not in args.model:
        train_loader = TrainDataLoader2(train_doc, train_label, train_candidate_label, label_dict, label_prop,
                                        10, 10, max_seq_len=args.max_seq_len)
        max_seq_len = train_loader.max_seq_len
        # max_seq_len = args.max_seq_len
        # train_loader = {}
        print('max_seq_len: ' + str(max_seq_len))
        test_loader = TestDataLoader2(test_doc, test_label, test_candidate_label, label_dict, label_prop,
                                      max_seq_len=max_seq_len, if_cal_metrics=args.cal_metrics)
        # test_loader = DataLoader3(test_doc, test_label, test_candidate_label, label_dict, args.batch_size,
        #                           given_seq_len=True, max_seq_len=max_seq_len)

    # ----------------------- train ------------------------
    print('============== build model ...')
    # 'XML' is checked first so that model names such as 'XML_CNN_pfastxml'
    # do not fall into the plain 'CNN' branch
    if 'XML' in args.model:
        print('build XML-CNN model ...')
        train_loader = DataLoader5(train_doc, train_label, all_labels, args.batch_size,
                                   given_seq_len=False, max_seq_len=args.max_seq_len)
        max_seq_len = train_loader.max_seq_len
        test_loader = DataLoader5(test_doc, test_label, all_labels, args.batch_size,
                                  given_seq_len=True, max_seq_len=max_seq_len)
        # XML_CNN: max_seq_len, word_embedding, filter_sizes, label_output_dim, hidden_dim, args
        # args.num_filters, args.pooling_units, args.batch_size, args.dropout_keep_prob
        model = XML_CNN(max_seq_len, word_embeddings, filter_sizes, len(all_labels), 128, args)
        args.if_output_all_labels = 1
        args.if_use_seq_len = 0
    elif 'biLSTM' in args.model:
        print('build biLSTM model ...')
        # biLSTM: max_seq_len, word_embedding_dim, hidden_dim, label_embedding_dim, num_classify_hidden, args
        model = biLSTM(max_seq_len, word_embedding_dim, 64, label_embedding_dim, 32, args)
        args.if_use_seq_len = 1
    elif 'LSTM' in args.model:
        print('build LSTM model ...')
        # LSTM: max_seq_len, word_embedding_dim, hidden_dim, label_embedding_dim, num_classify_hidden, args
        model = LSTM(max_seq_len, word_embedding_dim, 64, label_embedding_dim, 32, args)
        args.if_use_seq_len = 1
    elif 'CNN' in args.model:
        print('build CNN model ...')
        # CNN: sequence_length, word_embeddings, filter_sizes, label_embeddings, num_classify_hidden, args
        # args.num_filters, args.pooling_units, args.batch_size, args.dropout_keep_prob
        model = CNN(max_seq_len, word_embeddings, filter_sizes, label_embeddings, 32, args)
        args.if_use_seq_len = 0

    print('================= model solver ...')
    # solver: __init__(self, model, train_data, test_data, **kwargs)
    solver = ModelSolver(model, train_loader, test_loader,
                         if_use_seq_len=args.if_use_seq_len,
                         if_output_all_labels=args.if_output_all_labels,
                         show_batches=args.show_batches,
                         n_epochs=args.n_epochs,
                         batch_size=args.batch_size,
                         update_rule=args.update_rule,
                         learning_rate=args.learning_rate,
                         pretrained_model=args.pretrained_model_path,
                         model_path=args.folder_path + args.model + '/',
                         test_path=args.folder_path + args.model + '/')
    # train
    if args.train:
        print('================= begin training...')
        solver.train(args.folder_path + args.model + '/outcome.txt')
    # test
    if args.test:
        print('================= begin testing...')
        solver.test(args.folder_path + args.model + '/' + args.pretrained_model_path,
                    args.folder_path + args.model + '/test_outcome.txt')
    # predict
    if args.predict:
        print('================= begin predicting...')
        predict_path = args.folder_path + 'model_save/' + args.model + '/'
        solver.predict(trained_model_path=predict_path,
                       output_file_path=predict_path + 'predict_outcome.txt',
                       k=10, emb_saved=1, can_saved=1)
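
# A minimal usage sketch (an addition, not part of the original script): the
# entry-point guard and the example invocations below assume this file is run
# directly. The file name `train_rank.py` is hypothetical; the flags match the
# parser in main() above, and the model name must embed a candidate type
# (sleec / fastxml / pfastxml / pfastrexml) so the candidate labels can be found.
#
#   python train_rank.py -gpu 0 -model CNN_pfastxml -n_epochs 10 -train 1
#   python train_rank.py -gpu 0 -model CNN_pfastxml -pretrained_model model-10 -train 0 -test 1
if __name__ == '__main__':
    main()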
# =====================================================================
# Second entry point, assumed to live in a separate script: an NN model
# on datasets such as EUR-Lex. Imports mirror the first script; the
# project-local module paths are again assumptions.
# =====================================================================
import argparse
import os

import numpy as np

from utils import load_pickle
from data_loader import DataLoader_all
from models import NN
from solver import ModelSolver


def main():
    parse = argparse.ArgumentParser()
    # ---------- environment setting: which gpu -------
    parse.add_argument('-gpu', '--gpu', type=str, default='0', help='which gpu to use: 0 or 1')
    # ---------- folder path of train/test data -------
    parse.add_argument('-valid_labels', '--valid_labels', type=int, default=0,
                       help='whether to remove invalid labels')
    parse.add_argument('-data', '--data', type=str, default='eurlex', help='dataset')
    # ---------- model ----------
    parse.add_argument('-word_embedding_dim', '--word_embedding_dim', type=int, default=100,
                       help='dim of word embedding')
    parse.add_argument('-aggr_type', '--aggr_type', type=str, default='sum',
                       help='aggregation type in embedding layer')
    parse.add_argument('-vocab_size', '--vocab_size', type=int, default=5000, help='vocabulary size')
    parse.add_argument('-max_seq_len', '--max_seq_len', type=int, default=500, help='maximum sequence length')
    parse.add_argument('-model', '--model', type=str, default='NN', help='model: NN, LSTM, biLSTM, CNN')
    parse.add_argument('-pretrained_model', '--pretrained_model_path', type=str, default=None,
                       help='path to the pretrained model')
    parse.add_argument('-dropout_keep_prob', '--dropout_keep_prob', type=float, default=0.5,
                       help='keep probability in dropout layer')
    parse.add_argument('-saved_dis', '--saved_dis', type=int, default=0,
                       help='whether to use saved dis matrix')
    parse.add_argument('-use_sne', '--use_sne', type=int, default=1,
                       help='whether to use sne regularization')
    parse.add_argument('-ac_lbl_ratio', '--ac_lbl_ratio', type=float, default=0.5,
                       help='ratio of active labels in sne regularization')
    parse.add_argument('-use_propensity', '--use_propensity', type=int, default=1,
                       help='whether to use propensity loss')
    parse.add_argument('-use_bi_inter', '--use_bi_inter', type=int, default=0,
                       help='whether to add bi-interaction layer')
    parse.add_argument('-use_comp', '--use_comp', type=int, default=0,
                       help='whether to add competitive layer')
    parse.add_argument('-topk', '--topk', type=int, default=10, help='top k neurons in competitive layer')
    parse.add_argument('-factor', '--factor', type=float, default=0.01, help='factor in competitive layer')
    parse.add_argument('-lamb', '--lamb', type=float, default=0.002, help='lambda for weight regularization')
    # ---------- training parameters --------
    parse.add_argument('-n_epochs', '--n_epochs', type=int, default=10, help='number of epochs')
    parse.add_argument('-batch_size', '--batch_size', type=int, default=32, help='batch size for training')
    parse.add_argument('-lr', '--learning_rate', type=float, default=0.002, help='learning rate')
    parse.add_argument('-update_rule', '--update_rule', type=str, default='adam', help='update rule')
    # ------ train or predict -------
    parse.add_argument('-train', '--train', type=int, default=1, help='whether to train')
    parse.add_argument('-test', '--test', type=int, default=0, help='whether to test')
    parse.add_argument('-predict', '--predict', type=int, default=0, help='whether to predict')
    args = parse.parse_args()

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    print('-------------- load labels ------------------------')
    # default folder: 'datasets/eurlex/trn_tst_data/'
    if args.valid_labels:
        args.folder_path = 'datasets/' + args.data + '/trn_tst_data/valid_label_data/'
    else:
        args.folder_path = 'datasets/' + args.data + '/trn_tst_data/all_label_data/'
    # label_prop_dict = load_pickle(args.folder_path + 'inv_prop_dict.pkl')
    label_prop = load_pickle(args.folder_path + 'inv_prop.pkl')
    label_dict = load_pickle(args.folder_path + 'label_dict.pkl')
    all_labels = load_pickle(args.folder_path + 'index_label.pkl')
    num_labels = len(all_labels)
    print('number of labels: ' + str(num_labels))
    print('maximum label: ' + str(np.max(all_labels)))
    print('minimum label: ' + str(np.min(all_labels)))

    print('-------------- load train/test data -------------------------')
    train_doc = load_pickle(args.folder_path + 'train_doc_wordID.pkl')
    test_doc = load_pickle(args.folder_path + 'test_doc_wordID.pkl')
    train_label = load_pickle(args.folder_path + 'train_label.pkl')
    test_label = load_pickle(args.folder_path + 'test_label.pkl')

    print('============== create train/test data loader ...')
    train_loader = DataLoader_all(train_doc, train_label, label_dict, label_prop,
                                  batch_size=args.batch_size, max_seq_len=args.max_seq_len,
                                  ac_lbl_ratio=args.ac_lbl_ratio, folder_path=args.folder_path)
    test_loader = DataLoader_all(test_doc, test_label, label_dict, label_prop,
                                 batch_size=args.batch_size, max_seq_len=args.max_seq_len,
                                 folder_path=args.folder_path)

    print('============== build model ...')
    print('build NN model ...')
    model = NN(args.max_seq_len, args.vocab_size, args.word_embedding_dim, num_labels, label_prop, 32, args)
    args.if_use_seq_len = 1

    print('================= model solver ...')
    solver = ModelSolver(model, train_loader, test_loader,
                         n_epochs=args.n_epochs,
                         batch_size=args.batch_size,
                         update_rule=args.update_rule,
                         learning_rate=args.learning_rate,
                         pretrained_model=args.pretrained_model_path,
                         model_path=args.folder_path + args.model + '/',
                         log_path=args.folder_path + args.model + '/',
                         test_path=args.folder_path + args.model + '/',
                         use_sne=args.use_sne,
                         saved_dis=args.saved_dis)
    # train
    if args.train:
        print('================= begin training...')
        solver.train(args.folder_path + args.model + '/outcome.txt')
    # test
    if args.test:
        print('================= begin testing...')
        solver.test(args.folder_path + args.model + '/' + args.pretrained_model_path,
                    args.folder_path + args.model + '/test_outcome.txt')
    # predict
    if args.predict:
        print('================= begin predicting...')
        predict_path = args.folder_path + 'model_save/' + args.model + '/'
        solver.predict(trained_model_path=predict_path,
                       output_file_path=predict_path + 'predict_outcome.txt',
                       k=10, emb_saved=1, can_saved=1)
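
# A minimal usage sketch for this entry point as well (an addition): the file
# name `train_nn.py` is hypothetical, and the flags correspond to the parser
# above, e.g. -data selects the dataset folder, -valid_labels switches between
# the all-label and valid-label splits, and -use_sne toggles the SNE regularizer.
#
#   python train_nn.py -gpu 0 -data eurlex -model NN -use_sne 1 -n_epochs 10 -train 1
if __name__ == '__main__':
    main()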