import codecs
import os
import pickle
import time

import torch
import torch.nn as nn
import torch.optim as optim

# Project-local modules. The exact import paths below are assumptions based on
# the usual LM-LSTM-CRF-style layout; adjust them to match this repository.
import model.utils as utils
from model.utils import reorder_list
from model.lm_lstm_crf import LM_LSTM_CRF
from model.crf import CRFLoss_vb, CRFRepack_WC
from model.evaluator import eval_wc
from model.predictor import predict_wc


def get_multi_task_model(
        prefix='multi_task_tagger/data/pkl_files/num_gen_cas_tense_lem/50k_',
        args_file='multi_task_tagger/data/pkl_files/num_gen_cas_tense_lem/50k_all_data.pkl',
        order_pkl_file='num-gen-cas-tense-lem',
        order='num-gen-cas-tense-lem',
        num_tasks=5,
        output_directory='test_data/poetry/',
        eva_matrix='a',
        word_dim=200,
        char_dim=30,
        char_layers=1,
        word_layers=1,
        small_crf=True,
        checkpoint='multi_task_tagger/checkpoints_new/layered_num_gen_cas_tense_lem_15_May/',
        word_hidden=256,
        batch_size=16,
        char_hidden=100,
        load_check_point=True,
        do_eval=True,
        checkpoint_file='multi_task_tagger/checkpoints_new/layered_num_gen_cas_tense_lem_15_May/cwlm_lstm_crf_cas_2.model',
        out_files=None):
    # Map each task name to its index in the pickled files, then work out the
    # order in which the tasks should be loaded.
    assert len(order_pkl_file.split('-')) == len(order.split('-'))
    pkl_order_dict = {}
    pkl_orders = order_pkl_file.split('-')
    for i, o in enumerate(pkl_orders):
        pkl_order_dict[o] = i
    order_list = order.split('-')
    reorder_index = []
    for o in order_list:
        reorder_index.append(pkl_order_dict[o])
    print("Re-ordering list is")
    print(reorder_index)

    save_filename = args_file
    num_files = num_tasks
    print("Number of tasks : " + str(num_tasks))
    print("Order of the tasks is " + order)
    print("CRF type -- " + str(small_crf))

    file_prefix = prefix
    new_dataset_loader = []
    new_dev_dataset_loader = []
    new_test_dataset_loader = []
    test_word = []
    test_lines = []

    # print('Loading --out files for annotation....')
    # for i in range(num_tasks):
    #     with codecs.open(out_files[i], 'r', 'utf-8') as f:
    #         test_lines0 = f.readlines()
    #     test_lines.append(test_lines0)
    # for i in range(num_tasks):
    #     test_word0 = utils.read_features_sentences(test_lines[i])
    #     print(test_word0[0])
    #     print("Number of docs : " + str(len(test_word0)))
    #     test_word.append(test_word0)
    # test_word = reorder_list(test_word, reorder_index)

    # Build train/dev/test DataLoaders for each task, in the requested order.
    # for i in range(num_files):
    for i in reorder_index:
        dataset, forw_corp, back_corp = utils.load_data_pkl_file(
            file_prefix + 'train_' + str(i) + ".pkl")
        dev_dataset, forw_dev, back_dev = utils.load_data_pkl_file(
            file_prefix + 'dev_' + str(i) + ".pkl")
        test_dataset, forw_test, back_test = utils.load_data_pkl_file(
            file_prefix + 'test_' + str(i) + ".pkl")
        new_dataset_loader.append([
            torch.utils.data.DataLoader(tup, batch_size, shuffle=True,
                                        drop_last=False, num_workers=40)
            for tup in dataset
        ])
        new_dev_dataset_loader.append([
            torch.utils.data.DataLoader(tup, 50, shuffle=False, drop_last=False)
            for tup in dev_dataset
        ])
        new_test_dataset_loader.append([
            torch.utils.data.DataLoader(tup, 50, shuffle=False, drop_last=False)
            for tup in test_dataset
        ])

    # Load the saved data dictionary: vocabularies, label maps, embeddings and
    # the loaders cached at training time.
    print('Loading data dictionary from file ' + save_filename)
    with open(save_filename, 'rb') as fp:
        d = pickle.load(fp)
    args = d['args']
    args.gpu = 0
    label_maps = d['label_maps']
    char_map = d['char_map']
    f_map = d['f_map']
    file_num = d['file_num']
    in_doc_words = d['in_doc_words']
    embedding_tensor = d['embedding_tensor']
    dataset_loader = d['dataset_loader']
    dev_dataset_loader = d['dev_dataset_loader']
    test_dataset_loader = d['test_dataset_loader']
    forw_corp = d['forw_corp']
    back_corp = d['back_corp']
    forw_dev = d['forw_dev']
    back_dev = d['back_dev']
    forw_test = d['forw_test']
    back_test = d['back_test']
    file_num = num_tasks
    print('Shape of Embedding Tensor')
    print(embedding_tensor.shape)

    # Reorder label_maps to match the requested task order.
    label_maps = reorder_list(label_maps, reorder_index)

    # Override the saved args with the values passed to this function.
    args.checkpoint = checkpoint
    args.word_hidden = word_hidden
    args.char_hidden = char_hidden
    args.word_dim = word_dim
    args.char_dim = char_dim
    args.char_layers = char_layers
    args.word_layers = word_layers
    args.small_crf = small_crf
    args.eva_matrix = eva_matrix
    args.load_check_point = load_check_point
    args.do_eval = do_eval
    args.checkpoint_file = checkpoint_file
    print(args.word_hidden)
    print("Will save checkpoint in " + str(args.checkpoint))

    inv_f_map = {}
    for k, v in f_map.items():
        inv_f_map[v] = k
    # print(inv_f_map[6430])
    print(f_map['<unk>'])

    args.output_annotation = True
    print("Number of files : " + str(file_num))
    dataset_loader = new_dataset_loader
    dev_dataset_loader = new_dev_dataset_loader
    test_dataset_loader = new_test_dataset_loader

    if args.gpu >= 0:
        torch.cuda.set_device(args.gpu)
    print(args)
    args.batch_size = batch_size

    # Build the multi-task LM-LSTM-CRF model (one CRF layer per task).
    print('building model')
    print(label_maps)
    label_maps_sizes = [len(lmap) for lmap in label_maps]
    print(label_maps_sizes)
    print('File_num ' + str(file_num))
    ner_model = LM_LSTM_CRF(label_maps_sizes, len(char_map), args.char_dim,
                            args.char_hidden, args.char_layers, args.word_dim,
                            args.word_hidden, args.word_layers, len(f_map),
                            args.drop_out, file_num, large_CRF=args.small_crf,
                            if_highway=args.high_way, in_doc_words=in_doc_words,
                            highway_layers=args.highway_layers)

    # Restore model weights from the checkpoint, or initialise from scratch.
    if args.load_check_point:
        print(args.checkpoint_file)
        if os.path.isfile(args.checkpoint_file):
            print("loading checkpoint: '{}'".format(args.checkpoint_file))
            checkpoint_file = torch.load(args.checkpoint_file)
        else:
            raise FileNotFoundError('File not found')
        ner_model.load_state_dict(checkpoint_file['state_dict'])
    else:
        if not args.rand_embedding:
            ner_model.load_pretrained_word_embedding(embedding_tensor)
        ner_model.rand_init(init_word_embedding=args.rand_embedding)

    if args.update == 'sgd':
        optimizer = optim.SGD(ner_model.parameters(), lr=args.lr,
                              momentum=args.momentum)
    elif args.update == 'adam':
        optimizer = optim.Adam(ner_model.parameters(), lr=args.lr)
    if args.load_check_point and args.load_opt:
        optimizer.load_state_dict(checkpoint_file['optimizer'])

    # One CRF loss per task.
    crit_lm = nn.CrossEntropyLoss()
    crit_ner_list = nn.ModuleList()
    for i in range(file_num):
        ith_label_map = label_maps[i]
        crit_ner = CRFLoss_vb(len(ith_label_map), ith_label_map['<start>'],
                              ith_label_map['<pad>'])
        crit_ner_list.append(crit_ner)

    # Move everything to the GPU if one is configured, and build one CRF
    # repacker per task.
    if args.gpu >= 0:
        if_cuda = True
        print('device: ' + str(args.gpu))
        torch.cuda.set_device(args.gpu)
        crit_lm.cuda()
        for i in range(file_num):
            crit_ner_list[i].cuda()
        ner_model.cuda()
        packer_list = []
        for i in range(file_num):
            packer = CRFRepack_WC(len(label_maps[i]), True)
            packer_list.append(packer)
    else:
        if_cuda = False
        packer_list = []
        for i in range(file_num):
            packer = CRFRepack_WC(len(label_maps[i]), False)
            packer_list.append(packer)

    # Training-time bookkeeping carried over from the training script
    # (not used when this function only loads a model for inference).
    tot_length = sum(map(lambda t: len(t), dataset_loader))
    best_f1 = []
    for i in range(file_num):
        best_f1.append(float('-inf'))
    best_pre = []
    for i in range(file_num):
        best_pre.append(float('-inf'))
    best_rec = []
    for i in range(file_num):
        best_rec.append(float('-inf'))
    best_acc = []
    for i in range(file_num):
        best_acc.append(float('-inf'))
    track_list = list()
    start_time = time.time()
    print('Num of epochs : ' + str(args.epoch))
    epoch_list = range(args.start_epoch, args.start_epoch + args.epoch)
    patience_count = 0

    # One evaluator and one predictor per task.
    evaluator_list = []
    predictor_list = []
    for i in range(file_num):
        evaluator = eval_wc(packer_list[i], label_maps[i], args.eva_matrix)
        predictor = predict_wc(if_cuda, f_map, char_map, label_maps[i],
                               f_map['<eof>'], char_map['\n'],
                               label_maps[i]['<pad>'], label_maps[i]['<start>'],
                               True, args.batch_size, args.caseless)  # NEW
        evaluator_list.append(evaluator)
        predictor_list.append(predictor)

    return predictor_list, ner_model
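

# --- Usage sketch (illustrative only) ---------------------------------------
# A minimal example of how get_multi_task_model() might be called with its
# default paths. Putting the model in eval mode and looping over the returned
# predictors is an assumption about intended use: the per-task decoding API
# depends on this project's predict_wc implementation and is not shown here.
if __name__ == '__main__':
    predictor_list, ner_model = get_multi_task_model()
    ner_model.eval()  # inference only; no gradient updates intended here
    print('Loaded model with ' + str(len(predictor_list)) + ' per-task predictors')
    # Each element of predictor_list decodes one morphological task
    # (num, gen, cas, tense, lem); see predict_wc for its output methods.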