def train_pos(args):
    ######################################################
    #
    # Data preprocessing
    #
    ######################################################
    datasets = {
        args.datasetName:                                    #Name of the dataset
            {'columns': {0: 'tokens', 1: 'POS', 2: 'chunk_BIO'},  #CoNLL format for the input data. Column 0 contains tokens, column 1 contains POS and column 2 contains chunk information using BIO encoding
             'label': 'POS',                                 #Which column we like to predict
             'evaluate': True,                               #Should we evaluate on this task? Set true always for single task setups
             'commentSymbol': None}                          #Lines in the input data starting with this string will be skipped. Can be used to skip comments
    }

    # :: Path on your computer to the word embeddings. Embeddings by Komninos et al. will be downloaded automatically ::
    embeddingsPath = args.embeddings  # e.g. 'komninos_english_embeddings.gz'

    # :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
    pickleFile = perpareDataset(embeddingsPath, datasets)

    ######################################################
    #
    # The training of the network starts here
    #
    ######################################################

    #Load the embeddings and the dataset
    embeddings, mappings, data = loadDatasetPickle(pickleFile)

    # Some network hyperparameters
    params = {'classifier': ['CRF'], 'LSTM-Size': [100], 'dropout': (0.25, 0.25)}

    model = BiLSTM(params)
    model.setMappings(mappings, embeddings)
    model.setDataset(datasets, data)
    model.modelSavePath = args.model_save + '/[ModelName]_[Epoch].h5'
    model.fit(epochs=25)

    fpath = args.model_save + '/' + args.datasetName + '_1.h5'
    save_dir, model_init = os.path.split(fpath)
    print(save_dir)
    print(model_init)
    # remove trained files except from the last file
    remove_except_last_model(save_dir, model_init)
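# remove_except_last_model() is a project-specific helper that is not defined in these snippets.
# The sketch below is only an assumed, minimal implementation of what such a cleanup step could
# look like (keep the newest checkpoint matching the model's name prefix, delete the rest); the
# real helper may behave differently.
import os

def remove_except_last_model(save_dir, model_init):
    """Assumed behaviour: delete every checkpoint in save_dir sharing model_init's name prefix
    (everything before the trailing _<epoch>), keeping only the most recently modified one."""
    prefix = os.path.splitext(model_init)[0].rsplit('_', 1)[0]
    checkpoints = [os.path.join(save_dir, name) for name in os.listdir(save_dir)
                   if name.startswith(prefix) and name.endswith('.h5')]
    for path in sorted(checkpoints, key=os.path.getmtime)[:-1]:
        os.remove(path)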
# Data preprocessing
#
######################################################
datasets = read_dict(args.input_dataset_conf)
print("DATASET CONF {} {}".format(type(datasets), datasets))
target_task = get_target_task(datasets)
print("TARGET TASK {} {}".format(type(target_task), target_task))
aux_task = get_auxiliary_task(datasets)
print("AUX TASK {} {}".format(type(aux_task), aux_task))

prepare_training_data(datasets)

# :: Path on your computer to the word embeddings. Embeddings by Komninos et al. will be downloaded
# automatically (dependency-based embeddings in the spirit of Levy et al.:
# https://levyomer.wordpress.com/2014/04/25/dependency-based-word-embeddings/) ::
embeddingsPath = 'komninos_english_embeddings.gz'

# :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
pickleFile = perpareDataset(embeddingsPath, datasets)

######################################################
#
# The training of the network starts here
#
######################################################

#Load the embeddings and the dataset
embeddings, mappings, data = loadDatasetPickle(pickleFile)

# Some network hyperparameters
#params = {'classifier': ['CRF'], 'LSTM-Size': [100], 'dropout': (0.25, 0.25), 'charEmbeddings': 'CNN',
#          'customClassifier': {'unidep_pos': ['Softmax'], 'conll2000_chunking': [('LSTM', 50), 'CRF']}}
# TODO: Replace customClassifier with the main task + auxiliary task
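# read_dict(args.input_dataset_conf) above is expected to return a dataset dictionary; its exact
# contents are not shown here. The example below is an illustrative assumption modelled on the
# dataset dictionaries used elsewhere in these scripts (task names, columns and flags are
# placeholders, not values taken from the actual configuration files).
example_dataset_conf = {
    'unidep_pos': {                                  # hypothetical target task
        'columns': {0: 'tokens', 1: 'POS'},
        'label': 'POS',
        'evaluate': True,
        'commentSymbol': None,
    },
    'conll2000_chunking': {                          # hypothetical auxiliary task
        'columns': {0: 'tokens', 1: 'POS', 2: 'chunk_BIO'},
        'label': 'chunk_BIO',
        'evaluate': False,
        'commentSymbol': None,
    },
}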
datasets = {
    'conll2000_chunking':                                    #Name of the dataset
        {'columns': {0: 'tokens', 1: 'POS', 2: 'chunk_BIO'}, #CoNLL format for the input data. Column 0 contains tokens, column 1 contains POS and column 2 contains chunk information using BIO encoding
         'label': 'chunk_BIO',                               #Which column we like to predict
         'evaluate': True,                                   #Should we evaluate on this task? Set true always for single task setups
         'commentSymbol': None}                              #Lines in the input data starting with this string will be skipped. Can be used to skip comments
}

# :: Transform datasets to a pickle file ::
pickleFile = perpareDataset(datasets)

# :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
embeddings_file = 'embeddings/komninos_english_embeddings.gz'
elmo_options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'
elmo_weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
elmo_mode = 'weighted_average'

#Which GPU to use for ELMo. -1 for CPU
if torch.cuda.is_available():
    elmo_cuda_device = 0
else:
    elmo_cuda_device = -1

embLookup = ELMoWordEmbeddings(embeddings_file, elmo_options_file, elmo_weight_file, elmo_mode, elmo_cuda_device)
}

# :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
embeddings_file = 'embeddings/bioEmbeddings.txt'
elmo_options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'
elmo_weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
elmo_mode = 'average'
elmo_cuda_device = 0  #Which GPU to use. -1 for CPU

embLookup = ELMoWordEmbeddings(embeddings_file, elmo_options_file, elmo_weight_file, elmo_mode, elmo_cuda_device)
# You can use a cache to precompute the ELMo embeddings once. See Create_ELMo_Cache.py for an example.
embLookup.loadCache('embeddings/elmo_cache_deid.pkl')

pickleFile = perpareDataset(datasets, embLookup)

######################################################
#
# The training of the network starts here
#
######################################################

#Load the embeddings and the dataset
mappings, data = loadDatasetPickle(pickleFile)

# Some network hyperparameters
params = {'classifier': ['CRF'],
          'LSTM-Size': [100, 100],
          'dropout': (0.5, 0.5)
# for k in k_shot:
#     datasets[args.mod+'__'+k] = seed

if args.k_shot not in ('1.0', '2.0', '4.0', '8.0', '16.0'):
    print("k_shot doesn't exist")
    exit()
datasets[args.mod + '__' + args.k_shot] = seed
# datasets = {}
# datasets[args.mod+'__'+args.shot] = seeds[args.mod]

# :: Path on your computer to the word embeddings. Embeddings by Reimers et al. will be downloaded automatically ::
embeddingsPath = '/datastore/liu121/nosqldb2/emnlp_ukplab/skipgram'

# :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
pickleFile = perpareDataset(embeddingsPath, datasets, args.k_shot)
print('data prepare successful: %s' % pickleFile)

######################################################
#
# The training of the network starts here
#
######################################################

#Load the embeddings and the dataset
embeddings, mappings, data = loadDatasetPickle(pickleFile)
# print('mappings type: ', type(mappings))
# for key in mappings:
#     print(key)
#     print(mappings[key])
#     print('===============')
# USER ACTION NEEDED
# specify name of small data set (target data!)
dataSmallName = "Stab201X"
dataSmallColumns = dataSmallName + 'TARGET_BIO'
dataSetFiles = [(dataSmallName, {0: 'tokens', 1: dataSmallColumns})]

# USER ACTION NEEDED
# put embeddings here (GloVe and [Komninos & Manandhar 2016])
embeddingsPath = dname + "/embeddings"

# USER ACTION NEEDED
# make sure that dataPath contains the small train,dev,test files for dataName and
# the full size train,dev,test for all other datasets (as created by splitsFullData.py)
# if needed, change path name
dataPath = dname + "/data_multiTask"

for dataAllName in os.listdir(dataPath):
    if dataAllName != dataSmallName:
        dataAllColumns = dataAllName + '_BIO'
        dataSetFiles.append((dataAllName, {0: 'tokens', 1: dataAllColumns}))
print(dataSetFiles)

for embeddingsName in os.listdir(embeddingsPath):
    if embeddingsName == "glove.txt" or embeddingsName == "wiki_extvec":
        embeddingsFull = embeddingsPath + "/" + embeddingsName
        if os.path.isfile(embeddingsFull):
            print(embeddingsName)
            pickleFile = perpareDataset(embeddingsFull, dataSetFiles)
    'model_conseil_doctrine':                                #Name of the dataset
        {'columns': {0: 'tokens', 1: 'NER_BIO'},
         # 'columns': {0: 'tokens', 1: 'is_name', 2: 'NER_BIO'},
         #CoNLL format for the input data. Column 0 contains tokens and column 1 contains the NER tags using BIO encoding
         'label': 'NER_BIO',                                 #Which column we like to predict
         'evaluate': True,                                   #Should we evaluate on this task? Set true always for single task setups
         'commentSymbol': None,
         }
}

# :: Path on your computer to the word embeddings ::
# embeddingsPath = '/home/pavel/code/conseil_detat/anonymisation_software/train/embeddings.vec'
embeddingsPath = 'jurinet_parsed_100.vec.gz'
# embeddingsPath = 'embeddings.vec'

# :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
pickleFile = perpareDataset(embeddingsPath, datasets, useExistent=False)

######################################################
#
# The training of the network starts here
#
######################################################

#Load the embeddings and the dataset
embeddings, mappings, data = loadDatasetPickle(pickleFile)

# Some network hyperparameters
params = {'classifier': ['CRF'], 'LSTM-Size': [100, 100], 'dropout': (0.5, 0.5),
          'charEmbeddings': 'LSTM', 'optimizer': 'adam',
          'featureNames': ['tokens', 'casing']}
datasets = read_dict(args.input_dataset_conf)
print("{} {}".format(type(datasets), datasets))

# :: Needed for simulating the low resource scenarios ::
if args.nb_sentence is not None:
    datasets[list(datasets.keys())[0]]['nb_sentence'] = args.nb_sentence

prepare_training_data(datasets)

# :: Path on your computer to the word embeddings. Embeddings by Komninos et al. will be downloaded automatically ::
embeddingsPath = 'komninos_english_embeddings.gz'

# :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
pickleFile = perpareDataset(embeddingsPath, datasets, reducePretrainedEmbeddings=True)

######################################################
#
# The training of the network starts here
#
######################################################

#Load the embeddings and the dataset
embeddings, mappings, data = loadDatasetPickle(pickleFile)

# Some network hyperparameters
params = read_dict(args.param_conf)

if args.tune == 0:
def pickleData(embeddingsPath, datasetName, dataColumns):
    datasetFiles = [
        (datasetName, dataColumns),
    ]
    pickleFile = perpareDataset(embeddingsPath, datasetFiles)
    return pickleFile
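# Hypothetical usage of pickleData(); the embeddings file, dataset name and column mapping below
# are illustrative assumptions, not values taken from this repository.
# pickleFile = pickleData('komninos_english_embeddings.gz', 'unidep_pos', {0: 'tokens', 1: 'POS'})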
def train_pos(args):
    ######################################################
    #
    # Data preprocessing
    #
    ######################################################
    datasets = {
        args.datasetName:                                    #Name of the dataset
            {'columns': {0: 'tokens', 1: 'POS', 2: 'chunk_BIO'},  #CoNLL format for the input data. Column 0 contains tokens, column 1 contains POS and column 2 contains chunk information using BIO encoding
             'label': 'POS',                                 #Which column we like to predict
             'evaluate': True,                               #Should we evaluate on this task? Set true always for single task setups
             'commentSymbol': None}                          #Lines in the input data starting with this string will be skipped. Can be used to skip comments
    }

    # :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
    embeddings_file = None
    elmo_options_file = args.elmo_options
    elmo_weight_file = args.elmo_weights
    elmo_mode = 'weighted_average'
    #elmo_options_file = 'pretrained/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'
    #elmo_weight_file = 'pretrained/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
    elmo_cuda_device = args.cuda_device  #Which GPU to use. -1 for CPU

    embLookup = ELMoWordEmbeddings(embeddings_file, elmo_options_file, elmo_weight_file, elmo_mode, elmo_cuda_device)
    # You can use a cache to precompute the ELMo embeddings once. See Create_ELMo_Cache.py for an example.
    embLookup.loadCache(args.pkl_path)

    pickleFile = perpareDataset(datasets, embLookup)

    ######################################################
    #
    # The training of the network starts here
    #
    ######################################################

    #Load the embeddings and the dataset
    mappings, data = loadDatasetPickle(pickleFile)

    # Some network hyperparameters
    params = {'classifier': ['CRF'], 'LSTM-Size': [100, 100], 'dropout': (0.5, 0.5)}

    model = ELMoBiLSTM(embLookup, params)
    model.setMappings(mappings)
    model.setDataset(datasets, data)
    model.modelSavePath = args.model_save + "/[ModelName]_[Epoch].h5"
    model.fit(epochs=25)

    fpath = args.model_save + '/' + args.datasetName + '_1.h5'
    save_dir, model_init = os.path.split(fpath)
    print(save_dir)
    print(model_init)
    # remove trained files except from the last file
    remove_except_last_model(save_dir, model_init)
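# The cache loaded above via embLookup.loadCache(args.pkl_path) has to be precomputed once; the
# scripts point to Create_ELMo_Cache.py for the real procedure. The sketch below only illustrates
# the idea and relies on assumed method names (addToCache, storeCache) that should be checked
# against Create_ELMo_Cache.py before use.
def build_elmo_cache(embLookup, sentences, cache_path):
    for tokens in sentences:              # each entry is a list of token strings for one sentence
        embLookup.addToCache(tokens)      # assumed API: computes and caches the ELMo vectors
    embLookup.storeCache(cache_path)      # assumed API: persists the cache to a pickle file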
def main():
    pkl_path = perpareDataset(embeddingsPath, datasets_config)
    data_holder, task2id, id2task, num_feat, num_voc, num_char, tgt_dict, embeddings = Dataloader_elmo1.multitask_dataloader(
        pkl_path, num_task=num_task, batch_size=BATCH_SIZE)
    para = model_para
    task2label = {"conll2000": "chunk", "unidep": "POS", "conll2003": "NER"}
    logger = Logger('./logs/' + str(args.gpu))
    para["id2task"] = id2task
    para["n_feats"] = num_feat
    para["n_vocs"] = num_voc
    para["n_tasks"] = num_task
    para["out_size"] = [len(tgt_dict[task2label[id2task[ids]]]) for ids in range(num_task)]
    para["n_chars"] = num_char

    model = Model_s.build_model_cnn(para)
    model.Word_embeddings.apply_weights(embeddings)

    params = list(filter(lambda p: p.requires_grad, model.parameters()))
    num_params = sum(p.numel() for p in model.parameters())
    print(model)

    def lr_decay(optimizer, epoch, decay_rate=0.05, init_lr=0.015):
        lr = init_lr / (1 + decay_rate * epoch)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
        return optimizer

    def exp_lr_decay(optimizer, epoch, decay_rate=0.05, init_lr=0.015):
        lr = init_lr * decay_rate ** epoch
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
        return optimizer

    if args.optim == "adam":
        model_optim = optim_custorm.adam(
            para["d_hid"],
            DenseSparseAdam(params, lr=0.0015, betas=(0.9, 0.98), eps=1e-9))
        args.decay = None
    elif args.optim == "sgd":
        model_optim = optim.SGD(params, lr=0.015, momentum=args.momentum, weight_decay=1e-8)

    if args.mode == "train":
        best_F1 = 0
        if not para["crf"]:
            calculate_loss = nn.NLLLoss()
        else:
            calculate_loss = None
        print("Start training...")
        print('-' * 60)
        KLLoss = None
        start_point = time.time()
        for epoch_idx in range(NUM_EPOCH):
            if args.optim == "sgd":
                if args.decay == "exp":
                    model_optim = exp_lr_decay(model_optim, epoch_idx)
                elif args.decay == "normal":
                    model_optim = lr_decay(model_optim, epoch_idx)
            Pre, Rec, F1, loss_list = run_epoch(model, data_holder, model_optim, calculate_loss,
                                                KLLoss, para, epoch_idx, id2task, logger)
            use_time = time.time() - start_point
            print("Time using: %f mins" % (use_time / 60))
            if not best_F1 or best_F1 < F1:
                best_F1 = F1
                Model_s.save_model(model_path, model, para)
                print('*' * 60)
                print("Save model with average Pre: %f, Rec: %f, F1: %f on dev set." % (Pre, Rec, F1))
                save_idx = epoch_idx
                print('*' * 60)
                print("save model at epoch:", save_idx)
    elif args.mode == "finetune":
        para_path = os.path.join(path, 'para.pkl')
        with open(para_path, "rb") as f:     # "rb": the saved parameters are read back with pickle.load
            para_save = pickle.load(f)
        model = Model_s.build_model(para_save)
        model = Model_s.read_model(model_path, model)
        params = list(filter(lambda p: p.requires_grad, model.parameters()))
        model_optim = optim_custorm.adam(
            para["d_hid"], 1, 800, torch.optim.SGD(params, lr=0.0, momentum=0.9))
    else:
        para_path = os.path.join(model_path, 'para.pkl')
        with open(para_path, "rb") as f:
            para_save = pickle.load(f)
        model = Model_s.build_model_cnn(para_save)
        model = Model_s.read_model(model_path, model)
        prec_list_test, rec_list_test, f1_list_test, acc_list_test = infer(model, data_holder, "test")
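# For reference, the inverse-time schedule in lr_decay() (lr = init_lr / (1 + decay_rate * epoch))
# with the defaults init_lr = 0.015 and decay_rate = 0.05 yields, for example:
#   epoch 0  -> 0.01500
#   epoch 10 -> 0.01000
#   epoch 20 -> 0.00750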
# for k in k_shot:
#     datasets[args.mod+'__'+k] = seed
# if args.k_shot != '16.0' and args.k_shot != '1.0' and args.k_shot != '2.0' and args.k_shot != '4.0' and args.k_shot != '8.0':
#     print('k_shot doesn\'t exist')
#     exit()
datasets[args.mod + '__' + args.k_shot] = seed
# datasets = {}
# datasets[args.mod+'__'+args.shot] = seeds[args.mod]

# :: Path on your computer to the word embeddings. Embeddings by Reimers et al. will be downloaded automatically ::
embeddingsPath = '/datastore/liu121/nosqldb2/emnlp_ukplab/skipgram'

# :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
pickleFile_train, pickleFile_dev, pickleFile_test = perpareDataset(embeddingsPath, datasets, args.k_shot)
print('data prepare successful: %s, %s, and %s' % (pickleFile_train, pickleFile_dev, pickleFile_test))

######################################################
#
# The training of the network starts here
#
######################################################

#Load the embeddings and the dataset
embeddings, mappings, data_train = loadDatasetPickle(pickleFile_train)
embeddings, mappings, data_dev = loadDatasetPickle(pickleFile_dev)
embeddings, mappings, data_test = loadDatasetPickle(pickleFile_test)
# print('mappings type: ', type(mappings))
# for key in mappings: