nbf = 64     # No. Conv Filters
flen = 17    # Conv Filter length
nlstm = 100  # No. LSTM layers
ndrop = 0.1  # LSTM layer dropout

parser = argparse.ArgumentParser(description='AMPscanner')
parser.add_argument('checkpoint', help='checkpoint directory')
parser.add_argument('log_path', nargs='?', default='AMPscanner.log',
                    help='Path to the log file')
args = parser.parse_args()

checkpoint = args.checkpoint
log_path = args.log_path

# Collect every fine-tuned checkpoint except the untrained epoch-0 snapshot.
finetune_models = glob(os.path.join(checkpoint, '*.pth'))
finetune_models = [p for p in finetune_models if p.find('epoch=000') < 0]

# data_dir, dictionary_path, vocabulary_size, test_path and batch_size are
# assumed to be defined earlier in this script (not shown here).
dictionary_path = dictionary_path if data_dir is None else join(data_dir, dictionary_path)
dictionary = IndexDictionary.load(dictionary_path=dictionary_path,
                                  vocabulary_size=vocabulary_size)
vocabulary_size = len(dictionary)

test_path = test_path if data_dir is None else join(data_dir, test_path)
test_dataset = ClassificationDataset(data_path=test_path, dictionary=dictionary)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=ClassificationDataset.collate_function)

BESTS = [0., 0., 0., 0., 0., 0.]
BESTS_MODEL = ''

for finetune_model in finetune_models:
    print('=' * 35)
    print('Model: {}'.format(finetune_model))
def finetuneSigunet(pretrained_checkpoint, data_dir, train_path, val_path,
                    dictionary_path, vocabulary_size, batch_size, max_len,
                    epochs, lr, clip_grads, device, layers_count, hidden_size,
                    heads_count, d_ff, dropout_prob, log_output, checkpoint_dir,
                    print_every, save_every, config, run_name=None, **_):

    random.seed(0)
    np.random.seed(0)
    torch.manual_seed(0)

    train_path = train_path if data_dir is None else join(data_dir, train_path)
    val_path = val_path if data_dir is None else join(data_dir, val_path)
    dictionary_path = dictionary_path if data_dir is None else join(data_dir, dictionary_path)

    run_name = run_name if run_name is not None else make_run_name(
        RUN_NAME_FORMAT, phase='finetune', config=config)

    logger = make_logger(run_name, log_output)
    logger.info('Run name : {run_name}'.format(run_name=run_name))
    logger.info(config)

    logger.info('Constructing dictionaries...')
    dictionary = IndexDictionary.load(dictionary_path=dictionary_path,
                                      vocabulary_size=vocabulary_size)
    vocabulary_size = len(dictionary)
    logger.info('dictionary vocabulary : {vocabulary_size} tokens'.format(
        vocabulary_size=vocabulary_size))

    logger.info('Loading datasets...')
    train_dataset = Seq2SeqDataset(data_path=train_path, dictionary=dictionary)
    val_dataset = Seq2SeqDataset(data_path=val_path, dictionary=dictionary)
    logger.info('Train dataset size : {dataset_size}'.format(
        dataset_size=len(train_dataset)))

    logger.info('Building model...')
    pretrained_model = build_model(layers_count, hidden_size, heads_count, d_ff,
                                   dropout_prob, max_len, vocabulary_size,
                                   forward_encoded=True)
    pretrained_model = stateLoading(pretrained_model, pretrained_checkpoint)
    # pretrained_model = disable_grad(pretrained_model)

    model = sigunet(model=pretrained_model, m=28, n=4, kernel_size=7,
                    pool_size=2, threshold=0.1, device=device)

    logger.info(model)
    logger.info('{parameters_count} parameters'.format(
        parameters_count=sum([p.nelement() for p in model.parameters()])))

    # Have not figured this out yet
    metric_functions = [Seq2Seq_Metric]

    train_dataloader = DataLoader(train_dataset,
                                  batch_size=batch_size,
                                  collate_fn=Seq2SeqDataset.collate_function)
    val_dataloader = DataLoader(val_dataset,
                                batch_size=batch_size,
                                collate_fn=Seq2SeqDataset.collate_function)

    optimizer = Adam(model.parameters(), lr=lr)
    scheduler = ReduceLROnPlateau(optimizer, 'min', verbose=True, patience=3)

    checkpoint_dir = make_checkpoint_dir(checkpoint_dir, run_name, config)

    logger.info('Start training...')
    trainer = Trainer(loss_model=model,
                      train_dataloader=train_dataloader,
                      val_dataloader=val_dataloader,
                      metric_functions=metric_functions,
                      optimizer=optimizer,
                      clip_grads=clip_grads,
                      logger=logger,
                      checkpoint_dir=checkpoint_dir,
                      print_every=print_every,
                      save_every=save_every,
                      device=device,
                      scheduler=scheduler,
                      monitor='train_loss')

    trainer.run(epochs=epochs)

    return trainer
import os
import sys

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), os.pardir)))
sys.path.append(os.getcwd())

from torch.utils.data import DataLoader
from tqdm import tqdm

from bert.preprocess.dictionary import IndexDictionary
from bert.train.utils.convert import convert_to_tensor
from bert.train.datasets.seq2seq import Seq2SeqDataset
from bert.train.utils.collate import seq2seq_collate_function

dictionary = IndexDictionary.load(dictionary_path='dic/dic.txt', vocabulary_size=100)
vocabulary_size = len(dictionary)

dataset = Seq2SeqDataset('data/seq2seq/example.txt', dictionary)
dataloader = DataLoader(dataset, batch_size=16, collate_fn=seq2seq_collate_function)

for inputs, targets, batch_count in tqdm(dataloader):
    inputs = convert_to_tensor(inputs, None)
    targets = convert_to_tensor(targets, None)

    # Both input tensors and the targets must share the same (batch, length) shape.
    assert inputs[0].shape[0] == inputs[1].shape[0] == targets.shape[0]
    assert inputs[0].shape[1] == inputs[1].shape[1] == targets.shape[1]
def pretrain(data_dir, train_path, val_path, dictionary_path, dataset_limit,
             vocabulary_size, batch_size, max_len, epochs, clip_grads, device,
             layers_count, hidden_size, heads_count, d_ff, dropout_prob,
             log_output, checkpoint_dir, print_every, save_every, config,
             run_name=None, pretrained_model=None, **_):

    random.seed(0)
    np.random.seed(0)
    torch.manual_seed(0)

    train_path = train_path if data_dir is None else join(data_dir, train_path)
    val_path = val_path if data_dir is None else join(data_dir, val_path)
    dictionary_path = dictionary_path if data_dir is None else join(data_dir, dictionary_path)

    run_name = run_name if run_name is not None else make_run_name(
        RUN_NAME_FORMAT, phase='pretrain', config=config)

    logger = make_logger(run_name, log_output)
    logger.info('Run name : {run_name}'.format(run_name=run_name))
    logger.info(config)

    logger.info('Constructing dictionaries...')
    dictionary = IndexDictionary.load(dictionary_path=dictionary_path,
                                      vocabulary_size=vocabulary_size)
    vocabulary_size = len(dictionary)
    logger.info('dictionary vocabulary : {vocabulary_size} tokens'.format(
        vocabulary_size=vocabulary_size))

    logger.info('Loading datasets...')
    train_dataset = PairedDataset(data_path=train_path, dictionary=dictionary,
                                  dataset_limit=dataset_limit)
    val_dataset = PairedDataset(data_path=val_path, dictionary=dictionary,
                                dataset_limit=dataset_limit)
    logger.info('Train dataset size : {dataset_size}'.format(
        dataset_size=len(train_dataset)))

    logger.info('Building model...')
    model = build_model(layers_count, hidden_size, heads_count, d_ff,
                        dropout_prob, max_len, vocabulary_size)

    if pretrained_model is not None:
        # Load the weights
        logger.info('Load the weights from %s' % pretrained_model)
        model = stateLoading(model, pretrained_model)

    logger.info(model)
    logger.info('{parameters_count} parameters'.format(
        parameters_count=sum([p.nelement() for p in model.parameters()])))

    loss_model = MLMLossModel(model)
    if torch.cuda.device_count() > 1:
        loss_model = DataParallel(loss_model, output_device=1)

    metric_functions = [mlm_accuracy, nsp_accuracy]

    train_dataloader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=pretraining_collate_function)
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=pretraining_collate_function)

    optimizer = Adam(model.parameters())
    scheduler = ReduceLROnPlateau(optimizer, 'min', patience=3, verbose=True)

    checkpoint_dir = make_checkpoint_dir(checkpoint_dir, run_name, config)

    logger.info('Start training...')
    trainer = Trainer(
        loss_model=loss_model,
        train_dataloader=train_dataloader,
        val_dataloader=val_dataloader,
        metric_functions=metric_functions,
        optimizer=optimizer,
        clip_grads=clip_grads,
        logger=logger,
        checkpoint_dir=checkpoint_dir,
        print_every=print_every,
        save_every=save_every,
        device=device,
        scheduler=scheduler,
        monitor='val_loss',
        comment=run_name)

    trainer.run(epochs=epochs)

    return trainer
def finetune(pretrained_checkpoint, data_dir, train_path, val_path,
             dictionary_path, vocabulary_size, batch_size, max_len, epochs, lr,
             clip_grads, device, layers_count, hidden_size, heads_count, d_ff,
             dropout_prob, log_output, checkpoint_dir, print_every, save_every,
             config, run_name=None, **_):

    random.seed(0)
    np.random.seed(0)
    torch.manual_seed(0)

    train_path = train_path if data_dir is None else join(data_dir, train_path)
    val_path = val_path if data_dir is None else join(data_dir, val_path)
    dictionary_path = dictionary_path if data_dir is None else join(data_dir, dictionary_path)

    run_name = run_name if run_name is not None else make_run_name(
        RUN_NAME_FORMAT, phase='finetune', config=config)

    logger = make_logger(run_name, log_output)
    logger.info('Run name : {run_name}'.format(run_name=run_name))
    logger.info(config)

    logger.info('Constructing dictionaries...')
    dictionary = IndexDictionary.load(dictionary_path=dictionary_path,
                                      vocabulary_size=vocabulary_size)
    vocabulary_size = len(dictionary)
    logger.info('dictionary vocabulary : {vocabulary_size} tokens'.format(
        vocabulary_size=vocabulary_size))

    logger.info('Loading datasets...')
    train_dataset = SST2IndexedDataset(data_path=train_path, dictionary=dictionary)
    val_dataset = SST2IndexedDataset(data_path=val_path, dictionary=dictionary)
    logger.info('Train dataset size : {dataset_size}'.format(
        dataset_size=len(train_dataset)))

    logger.info('Building model...')
    pretrained_model = build_model(layers_count, hidden_size, heads_count, d_ff,
                                   dropout_prob, max_len, vocabulary_size)
    pretrained_model.load_state_dict(
        torch.load(pretrained_checkpoint, map_location='cpu')['state_dict'])

    model = FineTuneModel(pretrained_model, hidden_size, num_classes=2)

    logger.info(model)
    logger.info('{parameters_count} parameters'.format(
        parameters_count=sum([p.nelement() for p in model.parameters()])))

    loss_model = ClassificationLossModel(model)
    metric_functions = [classification_accuracy]

    train_dataloader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=classification_collate_function)
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=classification_collate_function)

    optimizer = Adam(model.parameters(), lr=lr)

    checkpoint_dir = make_checkpoint_dir(checkpoint_dir, run_name, config)

    logger.info('Start training...')
    trainer = Trainer(
        loss_model=loss_model,
        train_dataloader=train_dataloader,
        val_dataloader=val_dataloader,
        metric_functions=metric_functions,
        optimizer=optimizer,
        clip_grads=clip_grads,
        logger=logger,
        checkpoint_dir=checkpoint_dir,
        print_every=print_every,
        save_every=save_every,
        device=device)

    trainer.run(epochs=epochs)

    return trainer
def pretrain(data_dir, train_path, val_path, dictionary_path, dataset_limit,
             vocabulary_size, batch_size, max_len, epochs, clip_grads, device,
             layers_count, hidden_size, heads_count, d_ff, dropout_prob,
             log_output, checkpoint_dir, print_every, save_every, config,
             run_name=None, **_):

    random.seed(0)
    np.random.seed(0)
    torch.manual_seed(0)

    train_path = train_path if data_dir is None else join(data_dir, train_path)
    val_path = val_path if data_dir is None else join(data_dir, val_path)
    dictionary_path = dictionary_path if data_dir is None else join(data_dir, dictionary_path)

    run_name = run_name if run_name is not None else make_run_name(
        RUN_NAME_FORMAT, phase='pretrain', config=config)

    logger = make_logger(run_name, log_output)
    logger.info('Run name : {run_name}'.format(run_name=run_name))
    logger.info(config)

    logger.info('Constructing dictionaries...')
    dictionary = IndexDictionary.load(dictionary_path=dictionary_path,
                                      vocabulary_size=vocabulary_size)
    vocabulary_size = len(dictionary)
    logger.info(f'dictionary vocabulary : {vocabulary_size} tokens')

    logger.info('Loading datasets...')
    train_dataset = PairedDataset(data_path=train_path, dictionary=dictionary,
                                  dataset_limit=dataset_limit)
    val_dataset = PairedDataset(data_path=val_path, dictionary=dictionary,
                                dataset_limit=dataset_limit)
    logger.info('Train dataset size : {dataset_size}'.format(
        dataset_size=len(train_dataset)))

    logger.info('Building model...')
    model = build_model(layers_count, hidden_size, heads_count, d_ff,
                        dropout_prob, max_len, vocabulary_size)

    logger.info(model)
    logger.info('{parameters_count} parameters'.format(
        parameters_count=sum([p.nelement() for p in model.parameters()])))

    loss_model = MLMNSPLossModel(model)
    if torch.cuda.device_count() > 1:
        loss_model = DataParallel(loss_model, output_device=1)

    metric_functions = [mlm_accuracy, nsp_accuracy]

    train_dataloader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        collate_fn=pretraining_collate_function)
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        collate_fn=pretraining_collate_function)

    optimizer = NoamOptimizer(model.parameters(),
                              d_model=hidden_size,
                              factor=2,
                              warmup_steps=10000,
                              betas=(0.9, 0.999),
                              weight_decay=0.01)

    checkpoint_dir = make_checkpoint_dir(checkpoint_dir, run_name, config)

    logger.info('Start training...')
    trainer = Trainer(
        loss_model=loss_model,
        train_dataloader=train_dataloader,
        val_dataloader=val_dataloader,
        metric_functions=metric_functions,
        optimizer=optimizer,
        clip_grads=clip_grads,
        logger=logger,
        checkpoint_dir=checkpoint_dir,
        print_every=print_every,
        save_every=save_every,
        device=device)

    trainer.run(epochs=epochs)

    return trainer
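# The NoamOptimizer above is configured with d_model=hidden_size, factor=2 and
# warmup_steps=10000. Assuming it follows the standard "Noam" schedule from the
# Transformer paper (this project's implementation is not shown here), the
# learning rate at a given step would be computed roughly as in the sketch
# below; this is illustrative, not the project's actual optimizer code.


def noam_learning_rate(step, d_model, factor=2, warmup_steps=10000):
    """Standard Noam schedule: linear warmup, then inverse square-root decay."""
    step = max(step, 1)  # avoid 0 ** -0.5 at the very first step
    return factor * (d_model ** -0.5) * min(step ** -0.5, step * warmup_steps ** -1.5)


# Example: with d_model=768 the rate peaks at the end of warmup,
# noam_learning_rate(10000, d_model=768) ~= 7.2e-4, then decays.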
def teacher(data_dir, data_path, dictionary_path, dataset_limit,
            vocabulary_size, batch_size, max_len, device, layers_count,
            hidden_size, heads_count, d_ff, dropout_prob, log_output,
            checkpoint, config, run_name=None, confidence=0.4,
            pseudo_data_path='pseudo_data.txt', **_):

    random.seed(0)
    np.random.seed(0)
    torch.manual_seed(0)

    data_path = data_path if data_dir is None else join(data_dir, data_path)
    dictionary_path = dictionary_path if data_dir is None else join(data_dir, dictionary_path)

    run_name = run_name if run_name is not None else make_run_name(
        RUN_NAME_FORMAT, phase='teacher', config=config)

    logger = make_logger(run_name, log_output)
    logger.info('Run name : {run_name}'.format(run_name=run_name))
    logger.info(config)

    logger.info('Constructing dictionaries...')
    dictionary = IndexDictionary.load(dictionary_path=dictionary_path,
                                      vocabulary_size=vocabulary_size)
    vocabulary_size = len(dictionary)
    logger.info('dictionary vocabulary : {vocabulary_size} tokens'.format(
        vocabulary_size=vocabulary_size))

    logger.info('Loading datasets...')
    dataset = PairedDataset(data_path=data_path, dictionary=dictionary,
                            dataset_limit=dataset_limit)
    logger.info('dataset size : {dataset_size}'.format(dataset_size=len(dataset)))

    logger.info('Building model...')
    model = build_model(layers_count, hidden_size, heads_count, d_ff,
                        dropout_prob, max_len, vocabulary_size)
    model = stateLoading(model, checkpoint)
    model = disable_grad(model)
    model.to(device=device)

    logger.info(model)
    logger.info('{parameters_count} parameters'.format(
        parameters_count=sum([p.nelement() for p in model.parameters()])))

    if torch.cuda.device_count() > 1:
        model = DataParallel(model, output_device=1)

    dataloader = DataLoader(dataset,
                            batch_size=batch_size,
                            shuffle=False,
                            collate_fn=pretraining_collate_function)

    true_sequences = []
    predicted_sequences = []
    hints = []
    confidences = []

    for inputs, targets, batch_count in tqdm(dataloader, ncols=60):
        inputs = convert_to_tensor(inputs, device)
        targets = convert_to_tensor(targets, device)

        token_predictions, _ = model(inputs)
        token_predictions = F.softmax(token_predictions, dim=-1)
        token_targets = targets[0]
        indexed_sequences, _ = inputs

        for t, p, g in zip(indexed_sequences, token_predictions, token_targets):
            tmp_input = [dictionary.index_to_token(i.item()) for i in t]
            tmp_pred = [(dictionary.index_to_token(torch.argmax(i).item()),
                         torch.max(i).item()) for i in p]
            tmp_target = [dictionary.index_to_token(i.item()) for i in g]

            # Drop the leading special token.
            tmp_input = tmp_input[1:]
            tmp_pseudo = tmp_input.copy()
            tmp_pred = tmp_pred[1:]
            tmp_target = tmp_target[1:]
            tmp_hint = ['='] * len(tmp_input)

            prob_num = 0.
            prob_denom = 0

            # Replace each masked position with the ground-truth token in the
            # "true" sequence and the model's prediction in the pseudo sequence,
            # accumulating the prediction confidence over masked positions.
            while MASK_TOKEN in tmp_input:
                index = tmp_input.index(MASK_TOKEN)
                tmp_input[index] = tmp_target[index]
                p = tmp_pred[index]
                tmp_pseudo[index] = p[0]
                prob_num += p[1]
                prob_denom += 1
                tmp_hint[index] = '*'

            if prob_denom == 0:
                continue

            # Keep only sequences whose mean masked-token confidence is high enough.
            prob = prob_num / prob_denom
            if prob > confidence:
                true_sequences.append(' '.join(tmp_input).replace(PAD_TOKEN, ''))
                predicted_sequences.append(' '.join(tmp_pseudo).replace(PAD_TOKEN, ''))
                hints.append(' '.join(tmp_hint))
                confidences.append(prob)

    with open(pseudo_data_path, 'w') as f:
        for t, p, h, c in zip(true_sequences, predicted_sequences, hints, confidences):
            f.write('%s\n' % t)
            f.write('%s\n' % p)
            f.write('%s\n' % h)
            f.write('confidence: %s\n' % c)
            f.write('-\n')
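# The helper below is a small, self-contained sketch for reading back the file
# that teacher() writes: each record is five lines (ground-truth sequence,
# pseudo-labelled sequence, a hint line of '=' / '*' marks, a "confidence: ..."
# line, and a '-' separator). The function name and the returned dict layout
# are illustrative additions, not part of the original project code.


def read_pseudo_data(pseudo_data_path='pseudo_data.txt'):
    """Parse the records produced by teacher() into a list of dicts."""
    records = []
    with open(pseudo_data_path) as f:
        lines = [line.rstrip('\n') for line in f]
    # Records are written as fixed groups of five lines.
    for i in range(0, len(lines) - 4, 5):
        true_seq, pseudo_seq, hint, conf_line, sep = lines[i:i + 5]
        assert sep == '-', 'unexpected record separator: %r' % sep
        records.append({
            'true': true_seq,
            'pseudo': pseudo_seq,
            'hint': hint,
            'confidence': float(conf_line.split('confidence:')[1]),
        })
    return records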
def finetuneAMPscanner(pretrained_checkpoint, data_dir, train_path, val_path,
                       dictionary_path, vocabulary_size, batch_size, max_len,
                       epochs, lr, clip_grads, device, layers_count, hidden_size,
                       heads_count, d_ff, dropout_prob, log_output,
                       checkpoint_dir, print_every, save_every,
                       embedding_vector_length, nbf, flen, nlstm, ndrop, config,
                       run_name=None, fixed_length=None, **_):

    random.seed(0)
    np.random.seed(0)
    torch.manual_seed(0)

    train_path = train_path if data_dir is None else join(data_dir, train_path)
    val_path = val_path if data_dir is None else join(data_dir, val_path)
    dictionary_path = dictionary_path if data_dir is None else join(data_dir, dictionary_path)

    run_name = run_name if run_name is not None else make_run_name(
        RUN_NAME_FORMAT, phase='finetune', config=config)

    logger = make_logger(run_name, log_output)
    logger.info('Run name : {run_name}'.format(run_name=run_name))
    logger.info(config)

    logger.info('Constructing dictionaries...')
    dictionary = IndexDictionary.load(dictionary_path=dictionary_path,
                                      vocabulary_size=vocabulary_size)
    vocabulary_size = len(dictionary)
    logger.info('dictionary vocabulary : {vocabulary_size} tokens'.format(
        vocabulary_size=vocabulary_size))

    logger.info('Loading datasets...')
    train_dataset = ClassificationDataset(data_path=train_path, dictionary=dictionary)
    val_dataset = ClassificationDataset(data_path=val_path, dictionary=dictionary)
    logger.info('Train dataset size : {dataset_size}'.format(
        dataset_size=len(train_dataset)))

    logger.info('Building model...')
    pretrained_model = build_model(layers_count, hidden_size, heads_count, d_ff,
                                   dropout_prob, max_len, vocabulary_size,
                                   forward_encoded=True)
    pretrained_model = stateLoading(pretrained_model, pretrained_checkpoint)
    pretrained_model = disable_grad(pretrained_model)
    pretrained_model.eval()

    model = AMPscanner(model=pretrained_model,
                       embedding_vector_length=embedding_vector_length,
                       nbf=nbf,
                       flen=flen,
                       nlstm=nlstm,
                       ndrop=ndrop)

    logger.info(model)
    logger.info('{parameters_count} parameters'.format(
        parameters_count=sum([p.nelement() for p in model.parameters()])))

    # Have not figured this out yet
    eva = evaluator()
    metric_functions = [eva.MCC]

    train_dataloader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=ClassificationDataset.collate_function)
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=ClassificationDataset.collate_function)

    optimizer = Adam(model.parameters(), lr=lr)
    scheduler = ReduceLROnPlateau(optimizer, 'min', verbose=True, patience=3)

    checkpoint_dir = make_checkpoint_dir(checkpoint_dir, run_name, config)

    logger.info('Start training...')
    trainer = Trainer(loss_model=model,
                      train_dataloader=train_dataloader,
                      val_dataloader=val_dataloader,
                      metric_functions=metric_functions,
                      optimizer=optimizer,
                      clip_grads=clip_grads,
                      logger=logger,
                      checkpoint_dir=checkpoint_dir,
                      print_every=print_every,
                      save_every=save_every,
                      device=device,
                      scheduler=scheduler,
                      monitor='val_loss',
                      comment='AMPscanner_Reproduce')

    trainer.run(epochs=epochs)

    return trainer
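# For reference, the nbf / flen / nlstm / ndrop hyperparameters passed to
# AMPscanner above describe a Conv1D -> MaxPool -> LSTM -> sigmoid head in the
# style of the original AMP Scanner v2 model (nlstm acting as the LSTM hidden
# size). The module below is a minimal illustrative sketch of such a head over
# per-residue encoder features of size embedding_vector_length; the class name,
# pool_size default and layer arrangement are assumptions for exposition, not
# the project's actual AMPscanner implementation.

import torch
import torch.nn as nn


class AMPscannerHeadSketch(nn.Module):
    def __init__(self, embedding_vector_length, nbf=64, flen=17, nlstm=100,
                 ndrop=0.1, pool_size=5):
        super().__init__()
        self.conv = nn.Conv1d(embedding_vector_length, nbf, kernel_size=flen,
                              padding=flen // 2)
        self.pool = nn.MaxPool1d(pool_size)
        self.lstm = nn.LSTM(nbf, nlstm, batch_first=True)
        self.drop = nn.Dropout(ndrop)
        self.out = nn.Linear(nlstm, 1)

    def forward(self, features):
        # features: (batch, seq_len, embedding_vector_length) encoder outputs
        x = self.conv(features.transpose(1, 2))   # -> (batch, nbf, seq_len)
        x = torch.relu(x)
        x = self.pool(x).transpose(1, 2)          # -> (batch, seq_len', nbf)
        _, (h_n, _) = self.lstm(x)                # final LSTM hidden state
        logits = self.out(self.drop(h_n[-1]))     # -> (batch, 1)
        return torch.sigmoid(logits)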