Example no. 1
import os
import argparse
from glob import glob
from os.path import join

from torch.utils.data import DataLoader

from bert.preprocess.dictionary import IndexDictionary
# ClassificationDataset and the config values used below (data_dir, test_path,
# dictionary_path, vocabulary_size, batch_size) are defined elsewhere in the project.

nbf = 64        # No. Conv Filters
flen = 17       # Conv Filter length
nlstm = 100     # No. LSTM layers
ndrop = 0.1     # LSTM layer dropout

parser = argparse.ArgumentParser(description='AMPscanner')
parser.add_argument('checkpoint', help='checkpoint directory')
parser.add_argument('log_path', nargs='?', default='AMPscanner.log', help='path of the log file')
args = parser.parse_args()
checkpoint = args.checkpoint
log_path = args.log_path
finetune_models = glob(os.path.join(checkpoint, '*.pth'))
# Skip epoch-0 checkpoints.
finetune_models = [p for p in finetune_models if 'epoch=000' not in p]

dictionary_path = dictionary_path if data_dir is None else join(data_dir, dictionary_path)
dictionary = IndexDictionary.load(dictionary_path=dictionary_path,
                                  vocabulary_size=vocabulary_size)
vocabulary_size = len(dictionary)

test_path = test_path if data_dir is None else join(data_dir, test_path)
test_dataset = ClassificationDataset(data_path=test_path, dictionary=dictionary)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=ClassificationDataset.collate_function)
BESTS = [0., 0., 0., 0., 0., 0.]
BESTS_MODEL = ''

for finetune_model in finetune_models:
    print('=' * 35)
    print('Model: {}'.format(finetune_model))
def finetuneSigunet(pretrained_checkpoint,
                    data_dir,
                    train_path,
                    val_path,
                    dictionary_path,
                    vocabulary_size,
                    batch_size,
                    max_len,
                    epochs,
                    lr,
                    clip_grads,
                    device,
                    layers_count,
                    hidden_size,
                    heads_count,
                    d_ff,
                    dropout_prob,
                    log_output,
                    checkpoint_dir,
                    print_every,
                    save_every,
                    config,
                    run_name=None,
                    **_):

    random.seed(0)
    np.random.seed(0)
    torch.manual_seed(0)

    train_path = train_path if data_dir is None else join(data_dir, train_path)
    val_path = val_path if data_dir is None else join(data_dir, val_path)
    dictionary_path = dictionary_path if data_dir is None else join(
        data_dir, dictionary_path)

    run_name = run_name if run_name is not None else make_run_name(
        RUN_NAME_FORMAT, phase='finetune', config=config)
    logger = make_logger(run_name, log_output)
    logger.info('Run name : {run_name}'.format(run_name=run_name))
    logger.info(config)

    logger.info('Constructing dictionaries...')
    dictionary = IndexDictionary.load(dictionary_path=dictionary_path,
                                      vocabulary_size=vocabulary_size)
    vocabulary_size = len(dictionary)
    #logger.info(f'dictionary vocabulary : {vocabulary_size} tokens')
    logger.info('dictionary vocabulary : {vocabulary_size} tokens'.format(
        vocabulary_size=vocabulary_size))

    logger.info('Loading datasets...')
    train_dataset = Seq2SeqDataset(data_path=train_path, dictionary=dictionary)
    val_dataset = Seq2SeqDataset(data_path=val_path, dictionary=dictionary)
    logger.info('Train dataset size : {dataset_size}'.format(
        dataset_size=len(train_dataset)))

    logger.info('Building model...')
    pretrained_model = build_model(layers_count,
                                   hidden_size,
                                   heads_count,
                                   d_ff,
                                   dropout_prob,
                                   max_len,
                                   vocabulary_size,
                                   forward_encoded=True)
    pretrained_model = stateLoading(pretrained_model, pretrained_checkpoint)
    # pretrained_model = disable_grad(pretrained_model)

    model = sigunet(model=pretrained_model,
                    m=28,
                    n=4,
                    kernel_size=7,
                    pool_size=2,
                    threshold=0.1,
                    device=device)

    logger.info(model)
    logger.info('{parameters_count} parameters'.format(
        parameters_count=sum([p.nelement() for p in model.parameters()])))

    # Have not figured this out yet
    metric_functions = [Seq2Seq_Metric]

    train_dataloader = DataLoader(train_dataset,
                                  batch_size=batch_size,
                                  collate_fn=Seq2SeqDataset.collate_function)

    val_dataloader = DataLoader(val_dataset,
                                batch_size=batch_size,
                                collate_fn=Seq2SeqDataset.collate_function)

    optimizer = Adam(model.parameters(), lr=lr)
    scheduler = ReduceLROnPlateau(optimizer, 'min', verbose=True, patience=3)

    checkpoint_dir = make_checkpoint_dir(checkpoint_dir, run_name, config)

    logger.info('Start training...')
    trainer = Trainer(loss_model=model,
                      train_dataloader=train_dataloader,
                      val_dataloader=val_dataloader,
                      metric_functions=metric_functions,
                      optimizer=optimizer,
                      clip_grads=clip_grads,
                      logger=logger,
                      checkpoint_dir=checkpoint_dir,
                      print_every=print_every,
                      save_every=save_every,
                      device=device,
                      scheduler=scheduler,
                      monitor='train_loss')

    trainer.run(epochs=epochs)
    return trainer
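
# --- Hedged usage sketch (not from the original source): the Trainer above is assumed to
# call scheduler.step(<monitored loss>) once per epoch; the same Adam + ReduceLROnPlateau
# pattern is shown here on a toy model so the interaction can be run in isolation.
import torch
from torch.nn import Linear, MSELoss
from torch.optim import Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau

toy_model = Linear(8, 1)
toy_optimizer = Adam(toy_model.parameters(), lr=1e-3)
toy_scheduler = ReduceLROnPlateau(toy_optimizer, 'min', patience=3)

criterion = MSELoss()
x, y = torch.randn(32, 8), torch.randn(32, 1)
for epoch in range(10):
    toy_optimizer.zero_grad()
    loss = criterion(toy_model(x), y)
    loss.backward()
    toy_optimizer.step()
    # ReduceLROnPlateau lowers the learning rate after `patience` epochs without improvement.
    toy_scheduler.step(loss.item())
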
import os, sys
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), os.pardir)))
sys.path.append(os.getcwd())

from torch.utils.data import DataLoader
from tqdm import tqdm

from bert.preprocess.dictionary import IndexDictionary
from bert.train.utils.convert import convert_to_tensor

from bert.train.datasets.seq2seq import Seq2SeqDataset
from bert.train.utils.collate import seq2seq_collate_function

dictionary = IndexDictionary.load(dictionary_path='dic/dic.txt',
                                  vocabulary_size=100)
vocabulary_size = len(dictionary)
dataset = Seq2SeqDataset('data/seq2seq/example.txt', dictionary)
dataloader = DataLoader(dataset,
                        batch_size=16,
                        collate_fn=seq2seq_collate_function)

for inputs, targets, batch_count in tqdm(dataloader):
    inputs = convert_to_tensor(inputs, None)
    targets = convert_to_tensor(targets, None)
    assert inputs[0].shape[0] == inputs[1].shape[0] == targets.shape[0]
    assert inputs[0].shape[1] == inputs[1].shape[1] == targets.shape[1]
Example no. 4
def pretrain(data_dir, train_path, val_path, dictionary_path,
             dataset_limit, vocabulary_size, batch_size, max_len, epochs, clip_grads, device,
             layers_count, hidden_size, heads_count, d_ff, dropout_prob,
             log_output, checkpoint_dir, print_every, save_every, config, run_name=None, pretrained_model=None, **_):

    random.seed(0)
    np.random.seed(0)
    torch.manual_seed(0)

    train_path = train_path if data_dir is None else join(data_dir, train_path)
    val_path = val_path if data_dir is None else join(data_dir, val_path)
    dictionary_path = dictionary_path if data_dir is None else join(data_dir, dictionary_path)

    run_name = run_name if run_name is not None else make_run_name(RUN_NAME_FORMAT, phase='pretrain', config=config)
    logger = make_logger(run_name, log_output)
    logger.info('Run name : {run_name}'.format(run_name=run_name))
    logger.info(config)

    logger.info('Constructing dictionaries...')
    dictionary = IndexDictionary.load(dictionary_path=dictionary_path,
                                      vocabulary_size=vocabulary_size)
    vocabulary_size = len(dictionary)
    #logger.info(f'dictionary vocabulary : {vocabulary_size} tokens')
    logger.info('dictionary vocabulary : {vocabulary_size} tokens'.format(vocabulary_size=vocabulary_size))

    logger.info('Loading datasets...')
    train_dataset = PairedDataset(data_path=train_path, dictionary=dictionary, dataset_limit=dataset_limit)
    val_dataset = PairedDataset(data_path=val_path, dictionary=dictionary, dataset_limit=dataset_limit)
    logger.info('Train dataset size : {dataset_size}'.format(dataset_size=len(train_dataset)))

    logger.info('Building model...')
    model = build_model(layers_count, hidden_size, heads_count, d_ff, dropout_prob, max_len, vocabulary_size)
    if pretrained_model is not None:
        # Warm-start from an existing checkpoint.
        logger.info('Loading weights from %s' % pretrained_model)
        model = stateLoading(model, pretrained_model)

    logger.info(model)
    logger.info('{parameters_count} parameters'.format(
        parameters_count=sum([p.nelement() for p in model.parameters()])))

    loss_model = MLMLossModel(model)
    if torch.cuda.device_count() > 1:
        loss_model = DataParallel(loss_model, output_device=1)

    metric_functions = [mlm_accuracy, nsp_accuracy]

    train_dataloader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=pretraining_collate_function)

    val_dataloader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=pretraining_collate_function)

    optimizer = Adam(model.parameters())
    scheduler = ReduceLROnPlateau(optimizer, 'min', patience=3, verbose=True)

    checkpoint_dir = make_checkpoint_dir(checkpoint_dir, run_name, config)

    logger.info('Start training...')
    trainer = Trainer(
        loss_model=loss_model,
        train_dataloader=train_dataloader,
        val_dataloader=val_dataloader,
        metric_functions=metric_functions,
        optimizer=optimizer,
        clip_grads=clip_grads,
        logger=logger,
        checkpoint_dir=checkpoint_dir,
        print_every=print_every,
        save_every=save_every,
        device=device,
        scheduler=scheduler,
        monitor='val_loss',
        comment=run_name
    )

    trainer.run(epochs=epochs)
    return trainer
Example no. 5
def finetune(pretrained_checkpoint,
             data_dir, train_path, val_path, dictionary_path,
             vocabulary_size, batch_size, max_len, epochs, lr, clip_grads, device,
             layers_count, hidden_size, heads_count, d_ff, dropout_prob,
             log_output, checkpoint_dir, print_every, save_every, config, run_name=None, **_):

    random.seed(0)
    np.random.seed(0)
    torch.manual_seed(0)

    train_path = train_path if data_dir is None else join(data_dir, train_path)
    val_path = val_path if data_dir is None else join(data_dir, val_path)
    dictionary_path = dictionary_path if data_dir is None else join(data_dir, dictionary_path)

    run_name = run_name if run_name is not None else make_run_name(RUN_NAME_FORMAT, phase='finetune', config=config)
    logger = make_logger(run_name, log_output)
    logger.info('Run name : {run_name}'.format(run_name=run_name))
    logger.info(config)

    logger.info('Constructing dictionaries...')
    dictionary = IndexDictionary.load(dictionary_path=dictionary_path,
                                      vocabulary_size=vocabulary_size)
    vocabulary_size = len(dictionary)
    #logger.info(f'dictionary vocabulary : {vocabulary_size} tokens')
    logger.info('dictionary vocabulary : {vocabulary_size} tokens'.format(vocabulary_size=vocabulary_size))

    logger.info('Loading datasets...')
    train_dataset = SST2IndexedDataset(data_path=train_path, dictionary=dictionary)
    val_dataset = SST2IndexedDataset(data_path=val_path, dictionary=dictionary)
    logger.info('Train dataset size : {dataset_size}'.format(dataset_size=len(train_dataset)))

    logger.info('Building model...')
    pretrained_model = build_model(layers_count, hidden_size, heads_count, d_ff, dropout_prob, max_len, vocabulary_size)
    pretrained_model.load_state_dict(torch.load(pretrained_checkpoint, map_location='cpu')['state_dict'])

    model = FineTuneModel(pretrained_model, hidden_size, num_classes=2)

    logger.info(model)
    logger.info('{parameters_count} parameters'.format(
        parameters_count=sum([p.nelement() for p in model.parameters()])))

    loss_model = ClassificationLossModel(model)
    metric_functions = [classification_accuracy]

    train_dataloader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=classification_collate_function)

    val_dataloader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=classification_collate_function)

    optimizer = Adam(model.parameters(), lr=lr)

    checkpoint_dir = make_checkpoint_dir(checkpoint_dir, run_name, config)

    logger.info('Start training...')
    trainer = Trainer(
        loss_model=loss_model,
        train_dataloader=train_dataloader,
        val_dataloader=val_dataloader,
        metric_functions=metric_functions,
        optimizer=optimizer,
        clip_grads=clip_grads,
        logger=logger,
        checkpoint_dir=checkpoint_dir,
        print_every=print_every,
        save_every=save_every,
        device=device
    )

    trainer.run(epochs=epochs)
    return trainer
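
# --- Hedged sketch (not from the original source): the checkpoint layout assumed by the
# load_state_dict call above, i.e. a dict with a 'state_dict' entry saved via torch.save.
# The toy module and file name are illustrative only.
import torch
from torch.nn import Linear

toy = Linear(4, 2)
torch.save({'state_dict': toy.state_dict()}, 'toy_checkpoint.pth')

restored = Linear(4, 2)
restored.load_state_dict(
    torch.load('toy_checkpoint.pth', map_location='cpu')['state_dict'])
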
Example no. 6
def pretrain(data_dir, train_path, val_path, dictionary_path,
             dataset_limit, vocabulary_size, batch_size, max_len, epochs, clip_grads, device,
             layers_count, hidden_size, heads_count, d_ff, dropout_prob,
             log_output, checkpoint_dir, print_every, save_every, config, run_name=None, **_):

    random.seed(0)
    np.random.seed(0)
    torch.manual_seed(0)

    train_path = train_path if data_dir is None else join(data_dir, train_path)
    val_path = val_path if data_dir is None else join(data_dir, val_path)
    dictionary_path = dictionary_path if data_dir is None else join(data_dir, dictionary_path)

    run_name = run_name if run_name is not None else make_run_name(RUN_NAME_FORMAT, phase='pretrain', config=config)
    logger = make_logger(run_name, log_output)
    logger.info('Run name : {run_name}'.format(run_name=run_name))
    logger.info(config)

    logger.info('Constructing dictionaries...')
    dictionary = IndexDictionary.load(dictionary_path=dictionary_path,
                                      vocabulary_size=vocabulary_size)
    vocabulary_size = len(dictionary)
    logger.info(f'dictionary vocabulary : {vocabulary_size} tokens')

    logger.info('Loading datasets...')
    train_dataset = PairedDataset(data_path=train_path, dictionary=dictionary, dataset_limit=dataset_limit)
    val_dataset = PairedDataset(data_path=val_path, dictionary=dictionary, dataset_limit=dataset_limit)
    logger.info('Train dataset size : {dataset_size}'.format(dataset_size=len(train_dataset)))

    logger.info('Building model...')
    model = build_model(layers_count, hidden_size, heads_count, d_ff, dropout_prob, max_len, vocabulary_size)

    logger.info(model)
    logger.info('{parameters_count} parameters'.format(
        parameters_count=sum([p.nelement() for p in model.parameters()])))

    loss_model = MLMNSPLossModel(model)
    if torch.cuda.device_count() > 1:
        loss_model = DataParallel(loss_model, output_device=1)

    metric_functions = [mlm_accuracy, nsp_accuracy]

    train_dataloader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        collate_fn=pretraining_collate_function)

    val_dataloader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        collate_fn=pretraining_collate_function)

    optimizer = NoamOptimizer(model.parameters(),
                              d_model=hidden_size, factor=2, warmup_steps=10000, betas=(0.9, 0.999), weight_decay=0.01)

    checkpoint_dir = make_checkpoint_dir(checkpoint_dir, run_name, config)

    logger.info('Start training...')
    trainer = Trainer(
        loss_model=loss_model,
        train_dataloader=train_dataloader,
        val_dataloader=val_dataloader,
        metric_functions=metric_functions,
        optimizer=optimizer,
        clip_grads=clip_grads,
        logger=logger,
        checkpoint_dir=checkpoint_dir,
        print_every=print_every,
        save_every=save_every,
        device=device
    )

    trainer.run(epochs=epochs)
    return trainer
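
# --- Hedged sketch (not from the original source): NoamOptimizer is presumed to implement the
# standard Transformer warm-up schedule, lr(step) = factor * d_model^-0.5 *
# min(step^-0.5, step * warmup_steps^-1.5); shown here as a plain function.
def noam_lr(step, d_model=128, factor=2, warmup_steps=10000):
    step = max(step, 1)
    return factor * d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)

# The rate ramps up linearly during warm-up and then decays as the inverse square root of the step.
print(noam_lr(100), noam_lr(10000), noam_lr(100000))
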
def teacher(data_dir,
            data_path,
            dictionary_path,
            dataset_limit,
            vocabulary_size,
            batch_size,
            max_len,
            device,
            layers_count,
            hidden_size,
            heads_count,
            d_ff,
            dropout_prob,
            log_output,
            checkpoint,
            config,
            run_name=None,
            confidence=0.4,
            pseudo_data_path='pseudo_data.txt',
            **_):

    random.seed(0)
    np.random.seed(0)
    torch.manual_seed(0)

    data_path = data_path if data_dir is None else join(data_dir, data_path)
    dictionary_path = dictionary_path if data_dir is None else join(
        data_dir, dictionary_path)

    run_name = run_name if run_name is not None else make_run_name(
        RUN_NAME_FORMAT, phase='teacher', config=config)
    logger = make_logger(run_name, log_output)
    logger.info('Run name : {run_name}'.format(run_name=run_name))
    logger.info(config)

    logger.info('Constructing dictionaries...')
    dictionary = IndexDictionary.load(dictionary_path=dictionary_path,
                                      vocabulary_size=vocabulary_size)
    vocabulary_size = len(dictionary)
    #logger.info(f'dictionary vocabulary : {vocabulary_size} tokens')
    logger.info('dictionary vocabulary : {vocabulary_size} tokens'.format(
        vocabulary_size=vocabulary_size))

    logger.info('Loading datasets...')
    dataset = PairedDataset(data_path=data_path,
                            dictionary=dictionary,
                            dataset_limit=dataset_limit)
    logger.info(
        'dataset size : {dataset_size}'.format(dataset_size=len(dataset)))

    logger.info('Building model...')
    model = build_model(layers_count, hidden_size, heads_count, d_ff,
                        dropout_prob, max_len, vocabulary_size)
    model = stateLoading(model, checkpoint)
    model = disable_grad(model)
    model.to(device=device)

    logger.info(model)
    logger.info('{parameters_count} parameters'.format(
        parameters_count=sum([p.nelement() for p in model.parameters()])))

    if torch.cuda.device_count() > 1:
        model = DataParallel(model, output_device=1)

    dataloader = DataLoader(dataset,
                            batch_size=batch_size,
                            shuffle=False,
                            collate_fn=pretraining_collate_function)

    true_sequences = []
    predicted_sequences = []
    hints = []
    confidences = []
    for inputs, targets, batch_count in tqdm(dataloader, ncols=60):
        inputs = convert_to_tensor(inputs, device)
        targets = convert_to_tensor(targets, device)

        token_predictions, _ = model(inputs)
        token_predictions = F.softmax(token_predictions, dim=-1)
        token_targets = targets[0]
        indexed_sequences, _ = inputs
        for t, p, g in zip(indexed_sequences, token_predictions,
                           token_targets):
            tmp_input = [dictionary.index_to_token(i.item()) for i in t]
            tmp_pred = [(dictionary.index_to_token(torch.argmax(i).item()),
                         torch.max(i).item()) for i in p]
            tmp_target = [dictionary.index_to_token(i.item()) for i in g]
            tmp_input = tmp_input[1:]
            tmp_pseudo = tmp_input.copy()
            tmp_pred = tmp_pred[1:]
            tmp_target = tmp_target[1:]
            tmp_hint = ['='] * len(tmp_input)
            prob_num = 0.
            prob_denom = 0
            while MASK_TOKEN in tmp_input:
                index = tmp_input.index(MASK_TOKEN)
                tmp_input[index] = tmp_target[index]
                p = tmp_pred[index]
                tmp_pseudo[index] = p[0]
                prob_num += p[1]
                prob_denom += 1
                tmp_hint[index] = '*'
            if prob_denom == 0:
                continue
            prob = prob_num / prob_denom

            if prob > confidence:
                true_sequences.append(' '.join(tmp_input).replace(
                    PAD_TOKEN, ''))
                predicted_sequences.append(' '.join(tmp_pseudo).replace(
                    PAD_TOKEN, ''))
                hints.append(' '.join(tmp_hint))
                confidences.append(prob)
            # print(' '.join(tmp_input).replace(PAD_TOKEN, ''))
            # print(' '.join(tmp_pseudo).replace(PAD_TOKEN, ''))

    with open(pseudo_data_path, 'w') as f:
        for t, p, h, c in zip(true_sequences, predicted_sequences, hints,
                              confidences):
            f.write('%s\n' % t)
            f.write('%s\n' % p)
            f.write('%s\n' % h)
            f.write('confidence: %s\n' % c)
            f.write('-\n')
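
# --- Hedged sketch (not from the original source): the per-token prediction/confidence
# extraction used in the teacher loop above, isolated on a toy distribution. argmax gives
# the predicted token index and max gives the probability that is later thresholded
# against `confidence`.
import torch
import torch.nn.functional as F

logits = torch.randn(3, 5)          # [sequence_length, vocabulary_size]
probs = F.softmax(logits, dim=-1)
predicted = [(torch.argmax(p).item(), torch.max(p).item()) for p in probs]
for index, prob in predicted:
    print(index, round(prob, 3))
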
Example no. 8
def finetuneAMPscanner(pretrained_checkpoint,
                       data_dir,
                       train_path,
                       val_path,
                       dictionary_path,
                       vocabulary_size,
                       batch_size,
                       max_len,
                       epochs,
                       lr,
                       clip_grads,
                       device,
                       layers_count,
                       hidden_size,
                       heads_count,
                       d_ff,
                       dropout_prob,
                       log_output,
                       checkpoint_dir,
                       print_every,
                       save_every,
                       embedding_vector_length,
                       nbf,
                       flen,
                       nlstm,
                       ndrop,
                       config,
                       run_name=None,
                       fixed_length=None,
                       **_):

    random.seed(0)
    np.random.seed(0)
    torch.manual_seed(0)

    train_path = train_path if data_dir is None else join(data_dir, train_path)
    val_path = val_path if data_dir is None else join(data_dir, val_path)
    dictionary_path = dictionary_path if data_dir is None else join(
        data_dir, dictionary_path)

    run_name = run_name if run_name is not None else make_run_name(
        RUN_NAME_FORMAT, phase='finetune', config=config)
    logger = make_logger(run_name, log_output)
    logger.info('Run name : {run_name}'.format(run_name=run_name))
    logger.info(config)

    logger.info('Constructing dictionaries...')
    dictionary = IndexDictionary.load(dictionary_path=dictionary_path,
                                      vocabulary_size=vocabulary_size)
    vocabulary_size = len(dictionary)
    #logger.info(f'dictionary vocabulary : {vocabulary_size} tokens')
    logger.info('dictionary vocabulary : {vocabulary_size} tokens'.format(
        vocabulary_size=vocabulary_size))

    logger.info('Loading datasets...')
    train_dataset = ClassificationDataset(data_path=train_path,
                                          dictionary=dictionary)
    val_dataset = ClassificationDataset(data_path=val_path,
                                        dictionary=dictionary)
    logger.info('Train dataset size : {dataset_size}'.format(
        dataset_size=len(train_dataset)))

    logger.info('Building model...')
    pretrained_model = build_model(layers_count,
                                   hidden_size,
                                   heads_count,
                                   d_ff,
                                   dropout_prob,
                                   max_len,
                                   vocabulary_size,
                                   forward_encoded=True)
    pretrained_model = stateLoading(pretrained_model, pretrained_checkpoint)
    pretrained_model = disable_grad(pretrained_model)
    pretrained_model.eval()

    model = AMPscanner(model=pretrained_model,
                       embedding_vector_length=embedding_vector_length,
                       nbf=nbf,
                       flen=flen,
                       nlstm=nlstm,
                       ndrop=ndrop)

    logger.info(model)
    logger.info('{parameters_count} parameters'.format(
        parameters_count=sum([p.nelement() for p in model.parameters()])))

    # Monitor the Matthews correlation coefficient (MCC) as the validation metric.
    eva = evaluator()
    metric_functions = [eva.MCC]

    train_dataloader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=ClassificationDataset.collate_function)

    val_dataloader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=ClassificationDataset.collate_function)

    optimizer = Adam(model.parameters(), lr=lr)
    scheduler = ReduceLROnPlateau(optimizer, 'min', verbose=True, patience=3)

    checkpoint_dir = make_checkpoint_dir(checkpoint_dir, run_name, config)

    logger.info('Start training...')
    trainer = Trainer(loss_model=model,
                      train_dataloader=train_dataloader,
                      val_dataloader=val_dataloader,
                      metric_functions=metric_functions,
                      optimizer=optimizer,
                      clip_grads=clip_grads,
                      logger=logger,
                      checkpoint_dir=checkpoint_dir,
                      print_every=print_every,
                      save_every=save_every,
                      device=device,
                      scheduler=scheduler,
                      monitor='val_loss',
                      comment='AMPscanner_Reproduce')

    trainer.run(epochs=epochs)
    return trainer
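
# --- Hedged sketch (not from the original source): evaluator().MCC above is assumed to report
# the Matthews correlation coefficient,
# MCC = (TP*TN - FP*FN) / sqrt((TP+FP)(TP+FN)(TN+FP)(TN+FN)), computed here from toy labels.
from math import sqrt

def mcc(y_true, y_pred):
    tp = sum(1 for t, p in zip(y_true, y_pred) if t == 1 and p == 1)
    tn = sum(1 for t, p in zip(y_true, y_pred) if t == 0 and p == 0)
    fp = sum(1 for t, p in zip(y_true, y_pred) if t == 0 and p == 1)
    fn = sum(1 for t, p in zip(y_true, y_pred) if t == 1 and p == 0)
    denom = sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    return (tp * tn - fp * fn) / denom if denom else 0.0

print(mcc([1, 0, 1, 1, 0], [1, 0, 0, 1, 0]))
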