Example #1
def test_Batch():

    b1 = Batch(None, None, 1500)
    b2 = Batch(None, None, 5)
    b3 = Batch(None, None, 123)

    batch_list = [b1, b2, b3]
    s = sorted(batch_list)
    assert s[0].batch_loss == 5
    assert s[1].batch_loss == 123
    assert s[2].batch_loss == 1500
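The test only exercises ordering, so it implies that Batch instances compare by their batch_loss attribute. A minimal sketch of a class that would satisfy this test (hypothetical, not the project's actual Batch; the first two constructor arguments are left as None above) might be:

import functools

@functools.total_ordering
class Batch:
    """Hypothetical stand-in: holds inputs, targets, and a batch loss, ordered by loss."""

    def __init__(self, inputs, targets, batch_loss):
        self.inputs = inputs
        self.targets = targets
        self.batch_loss = batch_loss

    def __eq__(self, other):
        return self.batch_loss == other.batch_loss

    def __lt__(self, other):
        # sorted() relies on this, ordering batches from smallest to largest loss
        return self.batch_loss < other.batch_loss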
Example #2
def compute_loss(model, model_info, device, data, loss_fn):

    model.eval()
    # Accumulate per-time-step losses; the second dimension (35) must match model.seq_len
    all_losses = np.empty((0, 35))

    # LOOP THROUGH MINIBATCHES
    for step, (x, y) in tqdm.tqdm(enumerate(ptb_iterator(data, model.batch_size, model.seq_len)),
                                  total=(len(data)//model.batch_size - 1)//model.seq_len):
        if model_info.model == 'TRANSFORMER':
            batch = Batch(torch.from_numpy(x).long().to(device))
            model.zero_grad()
            outputs = model.forward(batch.data, batch.mask).transpose(1,0)
        else:
            inputs = torch.from_numpy(x.astype(np.int64)).transpose(0, 1).contiguous().to(device)
            model.zero_grad()
            hidden = model.init_hidden().to(device)
            outputs, hidden = model(inputs, hidden)

        # Target
        targets = torch.from_numpy(y.astype(np.int64)).transpose(0, 1).contiguous().to(device)

        # Loss computation
        outputs = outputs.contiguous()
        losses_in_batch = []
        for output_t, target_t in zip(outputs, targets):
            losses_in_batch.append(loss_fn(output_t, target_t).item())
        all_losses = np.vstack((all_losses, losses_in_batch))
    # Average over minibatches: one mean loss value per time step
    return np.mean(all_losses, axis=0)
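Since the inner loop applies loss_fn once per time step to a (batch, vocab) slice of the outputs and the matching (batch,) slice of the targets, loss_fn is expected to reduce each slice to a scalar. A hedged usage sketch, assuming a standard cross-entropy criterion and a valid_data split (both assumptions, not taken from the original project):

import torch

# Assumed to exist in the surrounding project: model, model_info, device, valid_data.
loss_fn = torch.nn.CrossEntropyLoss()  # scalar loss per time step, averaged over the minibatch
per_timestep_loss = compute_loss(model, model_info, device, valid_data, loss_fn)

# One mean loss per time step of the sequence
print(per_timestep_loss.shape)  # e.g. (35,), matching the hard-coded width above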
Example #3
def loadexpt(cellidx, filename, method, history, fraction=1., mean_adapt=False, roll=True):
    """
    Loads an experiment from disk

    Parameters
    ----------
    cellidx : int
        Index of the cell to load

    filename : string
        Name of the hdf5 file to load

    method : string
        The key in the hdf5 file to load ('train' or 'test')

    history : int
        Number of samples of history to include in the toeplitz stimulus

    fraction : float, optional
        Fraction of the experiment to load, must be between 0 and 1. (Default: 1.0)

    mean_adapt : bool, optional
        If True, apply the photoreceptor model of mean adaptation to the stimulus. (Default: False)

    roll : bool, optional
        If True, reshape the stimulus into a rolling window of `history` samples (the Toeplitz stimulus). (Default: True)

    """

    assert fraction > 0 and fraction <= 1, "Fraction of data to load must be between 0 and 1"

    # currently only works with the Oct. 07, 15 experiment
    expt = '15-10-07'

    with notify('Loading {}ing data'.format(method)):

        # load the hdf5 file
        f = h5py.File(os.path.join(datadirs[os.uname()[1]], expt, filename + '.h5'), 'r')

        # length of the experiment
        expt_length = f[method]['time'].size
        num_samples = int(np.floor(expt_length * fraction))

        # load the stimulus
        stim = zscore(np.array(f[method]['stimulus'][:num_samples]).astype('float32'))

        # photoreceptor model of mean adaptation
        if mean_adapt:
            stim = pr_filter(10e-3, stim)

        # reshaped stimulus (nsamples, time/channel, space, space)
        if roll:
            stim_reshaped = np.rollaxis(np.rollaxis(rolling_window(stim, history, axis=0), 2), 3, 1)
        else:
            stim_reshaped = stim

        # get the response for this cell
        resp = np.array(f[method]['response/firing_rate_10ms'][cellidx, history:num_samples])

    return Batch(stim_reshaped, resp)
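For orientation, a call might look like the sketch below; the cell index, filename, and history length are illustrative values only (Example #13 passes its stimulus_type, e.g. 'naturalscene', as the filename), and the returned Batch exposes the reshaped stimulus and firing-rate response as X and y:

# Illustrative values: cell 0, natural-scene stimulus, 40 samples of history.
train = loadexpt(0, 'naturalscene', 'train', 40)
test = loadexpt(0, 'naturalscene', 'test', 40)

print(train.X.shape)  # rolled stimulus: (nsamples, time/channel, space, space)
print(train.y.shape)  # 1-D firing-rate trace for the selected cell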
Example #4
def compute_loss_one_batch(model):
    if len(model.megabatch) == 0:

        if model.megabatch_anneal == 0:
            for i in range(model.max_megabatch_size):
                if model.curr_idx < len(model.mb):
                    model.megabatch.append(model.mb[model.curr_idx][1])
                    model.curr_idx += 1
        else:
            if model.increment and model.curr_megabatch_size < model.max_megabatch_size:
                model.curr_megabatch_size += 1
                model.increment = False
                print("Increasing megabatch size to {0}".format(
                    model.curr_megabatch_size))

            for i in range(model.curr_megabatch_size):
                if model.curr_idx < len(model.mb):
                    model.megabatch.append(model.mb[model.curr_idx][1])
                    model.curr_idx += 1
                    if model.curr_idx % model.megabatch_anneal == 0:
                        model.increment = True

        megabatch = []
        for n, i in enumerate(model.megabatch):
            arr = [model.data[t] for t in i]
            example_arr = []
            for j in arr:
                example = (j[0], j[1])

                if len(example[0].embeddings) > 0 and len(
                        example[1].embeddings) > 0:
                    example_arr.append(example)
                    continue

                example[0].populate_embeddings(model.vocab, model.zero_unk,
                                               model.ngrams,
                                               model.scramble_rate)

                if not model.share_vocab:
                    example[1].populate_embeddings(model.vocab_fr,
                                                   model.zero_unk,
                                                   model.ngrams,
                                                   model.scramble_rate)
                else:
                    example[1].populate_embeddings(model.vocab, model.zero_unk,
                                                   model.ngrams,
                                                   model.scramble_rate)

                example_arr.append(example)
            megabatch.append(example_arr)

        model.megabatch = megabatch

        if len(model.megabatch) == 0:
            return None

        sents1_list = []
        sents2_list = []

        sents1_lengths_list = []
        sents2_lengths_list = []

        for j in model.megabatch:

            sents1 = [i[0] for i in j]
            sents2 = [i[1] for i in j]

            sents_1_torch, lengths_1_torch = model.torchify_batch(sents1)
            if model.gpu:
                sents_1_torch = sents_1_torch.cuda()
                lengths_1_torch = lengths_1_torch.cuda()

            sents_2_torch, lengths_2_torch = model.torchify_batch(sents2)
            if model.gpu:
                sents_2_torch = sents_2_torch.cuda()
                lengths_2_torch = lengths_2_torch.cuda()

            sents1_list.append(sents_1_torch)
            sents2_list.append(sents_2_torch)

            sents1_lengths_list.append(lengths_1_torch)
            sents2_lengths_list.append(lengths_2_torch)

        p1_sents_list, p1_lengths_list, p2_sents_list, p2_lengths_list = get_pairs_batch(
            model, sents1_list, sents1_lengths_list, sents2_list,
            sents2_lengths_list)

        model.megabatch = []
        for i in range(len(p1_sents_list)):
            new_batch = Batch()
            new_batch.g1 = sents1_list[i]
            new_batch.g1_l = sents1_lengths_list[i]

            new_batch.g2 = sents2_list[i]
            new_batch.g2_l = sents2_lengths_list[i]

            new_batch.p1 = p1_sents_list[i]
            new_batch.p1_l = p1_lengths_list[i]

            new_batch.p2 = p2_sents_list[i]
            new_batch.p2_l = p2_lengths_list[i]

            model.megabatch.append(new_batch)

    curr_batch = model.megabatch.pop(0)

    g1, g2, p1, p2 = model.forward(curr_batch)

    return model.loss_function(g1, g2, p1, p2)
Example #5
def run_epoch(model, data, is_train=False, device='cuda:0', n_devices=1):

    if is_train:
        model.train()  # Set model to training mode
        print("Training..")
        phase = 'train'
    else:
        model.eval()  # Set model to evaluate mode
        print("Evaluating..")
        phase = 'valid'

    start_time = time.time()

    loss = 0.0
    total_loss = 0.0
    total_tokens = 0
    batch_tokens = 0.0
    total_seqs = 0
    tokens = 0
    total_correct = 0.0
    n_correct = 0.0

    wer_score = 0.0
    total_wer_score = 0.0
    count = 0

    gt = []
    hyp = []

    #For progress bar
    bar = progressbar.ProgressBar(maxval=dataset_sizes[phase],
                                  widgets=[
                                      progressbar.Bar('=', '[', ']'), ' ',
                                      progressbar.Percentage()
                                  ])
    bar.start()
    j = 0
    #Loop over minibatches
    for step, (x, x_lengths, y, y_lengths, hand_regions,
               hand_lengths) in enumerate(data):

        #Update progress bar with every iter
        j += len(x)
        bar.update(j)

        y = torch.from_numpy(y).to(device)
        x = x.to(device)

        if (args.hand_query):
            hand_regions = hand_regions.to(device)
        else:
            hand_regions = None

        #NOTE: clone y to avoid overwriting it
        batch = Batch(x_lengths,
                      y_lengths,
                      hand_lengths,
                      trg=None,
                      emb_type=args.emb_type,
                      DEVICE=device,
                      fixed_padding=args.fixed_padding,
                      rel_window=args.rel_window)

        if (args.distributed):

            #Zeroing gradients
            feature_extractor.zero_grad()
            encoder.zero_grad()
            position.zero_grad()
            output_layer.zero_grad()

            src_emb, _, _ = feature_extractor(x)
            src_emb = position(src_emb)
            src_emb = encoder(src_emb, None, batch.src_mask)
            output_context = output_layer(src_emb)

            if (args.hand_query):
                hand_extractor.zero_grad()

                hand_emb = hand_extractor(hand_regions)
                hand_emb = position(hand_emb)
                hand_emb = encoder(hand_emb, None, batch.src_mask)
                output_hand = output_layer(hand_emb)

                comb_emb = encoder(src_emb, hand_emb, batch.rel_mask)
                output = output_layer(comb_emb)

            else:
                output = None
                output_hand = None

        else:
            #Zeroing gradients
            model.zero_grad()

            #Shape: (batch_size, tgt_seq_length, tgt_vocab_size)
            #NOTE: no need for trg if we don't have a decoder
            output, output_context, output_hand = model.forward(
                x, batch.src_mask, batch.rel_mask, hand_regions, args.arch)

        #CTC loss expects (Seq, batch, vocab)
        if (args.hand_query):
            output = output.transpose(0, 1)
            output_context = output_context.transpose(0, 1)
            output_hand = output_hand.transpose(0, 1)
        else:
            output = output_context.transpose(0, 1)

        x_lengths = torch.IntTensor(x_lengths)
        y_lengths = torch.IntTensor(y_lengths)

        if not is_train:

            #Run CTC beam decoder using TensorFlow
            #NOTE: the blank token in TensorFlow must be (num_classes - 1)

            #Return tuple of sentences and probs
            decodes, _ = tf.nn.ctc_beam_search_decoder(
                inputs=output.cpu().detach().numpy(),
                sequence_length=x_lengths.cpu().detach().numpy(),
                merge_repeated=False,
                beam_width=10,
                top_paths=1)
            #Get top 1 path
            #(batch, Seq)
            pred = decodes[0]

            #Transform sparse tensor to numpy
            pred = tf.sparse.to_dense(pred).numpy()

            for i in range(len(y)):

                #NOTE: we are doing log inside ctcdecoder
                #pred = (seq, beam, batch)

                ys = y[i, :y_lengths[i]]
                p = pred[i]

                hyp = ' '.join([vocab[x.item()] for x in p])
                gt = ' '.join([vocab[x.item()] for x in ys])

                total_wer_score += wer(gt, hyp, standardize=True)
                count += 1

        #output (Seq, batch, vocab_size)
        #y (batch, trg_size)
        #x_lengths (batch)
        #y_lengths (batch)

        #NOTE: CTC loss produces NaN/inf values if x_lengths < y_lengths
        #When extracting keyframes, make sure your src lengths are long enough, or simply use zero_infinity
        #Averaging the loss here

        #IMPORTANT: use PyTorch CTCLoss
        loss = ctc_loss(output, y.cpu(), x_lengths.cpu(), y_lengths.cpu())

        if (args.hand_query):
            loss += ctc_loss(output_context, y.cpu(), x_lengths.cpu(),
                             y_lengths.cpu())
            loss += ctc_loss(output_hand, y.cpu(), x_lengths.cpu(),
                             y_lengths.cpu())
            loss = loss / 3

        total_loss += loss
        total_seqs += batch.seq
        total_tokens += (y != blank_index).data.sum()
        tokens += (y != blank_index).data.sum()
        batch_tokens += (y != blank_index).data.sum()

        if is_train:

            loss.backward()

            #Weight clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)

            optimizer.step()

            if step % 100 == 0:
                elapsed = time.time() - start_time
                print(
                    "Step: %d, Loss: %f, Frame per Sec: %f, Token per sec: %f"
                    % (step, (loss / batch_tokens),
                       total_seqs * batch_size / elapsed, tokens / elapsed))

                start_time = time.time()
                total_seqs = 0
                tokens = 0

        batch_tokens = 0.0

        #Free some memory
        #NOTE: this helps a lot in avoiding CUDA out-of-memory errors
        del loss, output, output_context, output_hand, y, hand_regions, batch

    if is_train:
        print("Average Loss: %f" % (total_loss.item() / total_tokens.item()))
        return total_loss.item() / total_tokens.item()

    else:
        #Measure WER of all dataset
        print('Measuring WER..')
        print("Average WER: %f" % (total_wer_score / count))

        return total_loss.item() / total_tokens.item(), total_wer_score / count
Example #6
def run_epoch(model, data, is_train=False, device='cuda:0', n_devices=1):

    if is_train:
        model.train()  # Set model to training mode
        print("Training..")
        phase = 'train'
    else:
        model.eval()  # Set model to evaluate mode
        print("Evaluating..")
        phase = 'valid'

    start_time = time.time()

    loss = 0.0
    total_loss = 0.0
    total_tokens = 0
    total_seqs = 0
    tokens = 0
    total_correct = 0.0
    n_correct = 0.0

    total_wer_score = 0.0
    sentence_count = 0

    targets = []
    hypotheses = []

    #For progress bar
    bar = progressbar.ProgressBar(maxval=dataset_sizes[phase], widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
    bar.start()
    j = 0

    #Loop over minibatches
    for step, (x, x_lengths, y, y_lengths, gloss, gloss_lengths) in enumerate(data):

        #Update progress bar with every iter
        j += len(x)
        bar.update(j)

        if gloss is not None:
            gloss = torch.from_numpy(gloss).to(device)

        y = torch.from_numpy(y).to(device)
        x = x.to(device)

        #NOTE: clone y to avoid overwriting it
        batch = Batch(x_lengths, y_lengths, None, y.clone(), emb_type=args.emb_type, DEVICE=device, fixed_padding=args.fixed_padding, rel_window=args.rel_window)

        model.zero_grad()

        #Return tuple of (output, encoder_output)
        #output = (batch_size, tgt_seq_length, tgt_vocab_size)
        #encoder_output = (batch_size, input_seq_length, trg_vocab_size)
        if args.hybrid:
            output, encoder_output = model.forward(x, batch.trg, batch.src_mask, batch.trg_mask, batch.rel_mask, None)

            #CTC loss expects (seq, batch, vocab)
            encoder_output = encoder_output.transpose(0, 1)
        else:
            output = model.forward(x, batch.trg, batch.src_mask, batch.trg_mask, batch.rel_mask, None)

        #Produce translations for the BLEU score
        #Evaluate on dev
        if not is_train:

            x = Variable(x)

            translations = greedy_decode(model, x, None, batch.rel_mask, batch.src_mask,
                            max_len=20, start_symbol=1, device=device)

            #Loop over batch to create sentences
            for i in range(len(y)):

                ys = y[i, :]
                ys = ys[ys != 0]
                #NOTE: drop sos but keep eos
                ys = ys[1:]

                translation = translations[i]

                hyp_trans = [vocab[x.item()] for x in translation]
                gt_trans = [vocab[x.item()] for x in ys]

                translation_corpus.append(hyp_trans)
                #NOTE: references must be a list of lists (one reference per ground-truth sentence)
                reference_corpus.append([gt_trans])

        x_lengths = torch.IntTensor(x_lengths)
        y_lengths = torch.IntTensor(y_lengths)

        if gloss_lengths is not None:
            gloss_lengths = torch.IntTensor(gloss_lengths)

        #Get CTCloss of batch without averaging
        if args.hybrid:
            loss_ctc = ctc_loss(encoder_output, gloss.cpu(), x_lengths.cpu(), gloss_lengths.cpu())

        #Remove sos tokens from y
        y = y[:, 1:]

        #Predicted words with highest prob
        _, pred = torch.max(output, dim=-1)

        #NOTE: don't count padding tokens
        for i in range(y.shape[0]):
            n_correct += (pred[i, :y_lengths[i]-1] == y[i, :y_lengths[i]-1]).sum()

        #NOTE: The transformer is an auto-regressive model: it makes predictions one part at a time
        #and uses its output so far to decide what to do next.
        #Teacher forcing passes the true output to the next time step regardless of what the model predicts at the current step.

        #Input of decoder (with sos and without eos)
        #Target (without sos and with eos)

        #NOTE: pred must be same shape as y
        y = y.contiguous().view(-1)
        pred = pred.contiguous().view(-1)
        output = output.view(-1, vocab_size)

        assert y.shape == pred.shape

        #Get loss cross entropy (from decoder) of batch without averaging
        loss = loss_fn(output, y)

        if args.hybrid:
            #Joint CTC/Decoder loss
            loss = loss + loss_ctc

        total_loss += loss
        total_seqs += batch.seq
        total_tokens += batch.ntokens
        tokens += batch.ntokens
        total_correct += n_correct

        if is_train:

            loss.backward()

            #Weight clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)

            optimizer.step()

            if step % 100 == 0:
                elapsed = time.time() - start_time
                print("Step: %d, Loss: %f, Frame per Sec: %f, Token per sec: %f, Word Accuracy: %f" %
                      (step, loss / batch.ntokens, total_seqs * batch_size / elapsed, tokens / elapsed, n_correct.item() / tokens.item()))

                start_time = time.time()
                total_seqs = 0
                tokens = 0
                n_correct = 0.0

        #Free some memory
        #NOTE: this helps a lot in avoiding CUDA out-of-memory errors
        del loss, output, y

    if is_train:
        print("Total word Accuracy: %f" %
              (total_correct.item() / total_tokens.item()))
        return total_loss.item() / total_tokens.item()
    else:
        return translation_corpus, reference_corpus, total_loss.item() / total_tokens.item(), total_correct.item() / total_tokens.item()
Example #7
        #Update progress bar with every iter
        i += len(x)
        bar.update(i)

        if (args.hand_query):
            hand_regions = hand_regions.to(device)
        else:
            hand_regions = None

        y = torch.from_numpy(y).to(device)
        x = x.to(device)

        batch = Batch(x_lengths,
                      y_lengths,
                      hand_lengths,
                      trg=None,
                      DEVICE=device,
                      emb_type=args.emb_type,
                      fixed_padding=None,
                      rel_window=args.rel_window)

        #with torch.no_grad():

        output, output_context, output_hand = model.forward(
            x, batch.src_mask, batch.rel_mask, hand_regions)

        #CTC loss expects (Seq, batch, vocab)
        if (args.hand_query):
            output = output.transpose(0, 1)
            output_context = output_context.transpose(0, 1)
            output_hand = output_hand.transpose(0, 1)
        else:
Example #8
    #Loop over minibatches
    for step, (x, x_lengths, y, y_lengths, gloss,
               gloss_lengths) in enumerate(dataloader):

        #Update progress bar with every iter
        i += len(x)
        bar.update(i)

        y = torch.from_numpy(y).to(device)
        x = x.to(device)

        batch = Batch(x_lengths,
                      y_lengths,
                      y,
                      DEVICE=device,
                      emb_type='2d',
                      fixed_padding=None,
                      rel_window=args.rel_window)

        with torch.no_grad():
            #Return translation using our trained model
            translations = decoding(model,
                                    x,
                                    batch,
                                    None,
                                    start_symbol=1,
                                    max_len=args.decoding_length,
                                    method=args.decoding,
                                    n_beam=args.n_beam,
                                    device=device)
Example #9
def cnndm_test_full(args, model, logger):
    model = AutoExtSummarizer(args)
    model.to(args.device)
    model.eval()

    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
    train_dataset = CNNDMBlobNoTokens(prefix='test',
                                      data_path=args.data_dir,
                                      label_key=args.label_key)
    train_sampler = SequentialSampler(train_dataset)
    model_collate_fn = functools.partial(collate,
                                         pad_token_id=tokenizer.pad_token_id)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=1,
                                  collate_fn=single_collate,
                                  num_workers=args.num_workers)

    logger.info("***** Running CNNDM evaluation  *****")
    logger.info("  Num examples = %d", len(train_dataset))

    gold = []
    pred = []

    for batch in tqdm(train_dataloader, desc="Evaluating"):

        summary = batch[0][2]
        story = batch[0][0]

        blocks = create_labeled_blocks(args, batch[0], tokenizer)
        block_scores = []
        memory = None
        for block in blocks:
            _batch = Batch([block], pad_token_id=tokenizer.pad_token_id)
            source = _batch.src.to(args.device)
            encoder_mask = _batch.mask.to(args.device)
            clss = _batch.clss.to(args.device)
            cls_mask = _batch.mask_cls.to(args.device).bool()

            with torch.no_grad():
                sent_scores, mask, memory = model(source,
                                                  encoder_mask,
                                                  clss,
                                                  cls_mask,
                                                  memory=memory)
                #Separates padding from the scores that are actually 0
                sent_scores = sent_scores + mask.float()
                sent_scores = sent_scores.cpu().data.numpy()
                block_scores.extend(sent_scores[0])
        selected_ids = np.argsort(block_scores)[::-1]
        _pred = []
        for i in selected_ids:
            candidate = story[i].strip()
            if not _block_tri(candidate, _pred):
                _pred.append(candidate)
            if len(_pred) == 3:
                break
        pred.append(_pred)
        gold.append(summary)

    #Python ROUGE implementation
    rouge = Rouge()
    rouge_score = rouge.get_scores([" ".join(p) for p in pred],
                                   [" ".join(g) for g in gold],
                                   avg=True)
    rouge_score_formatted = format_rouge_scores(rouge_score)
    rouge_table = format_rouge_table(rouge_score)

    similarity_score = calc_sbert_similarity(pred, gold)
    #similarity_score = 0

    print(rouge_score_formatted)
    print("Similarity score(sbert): %.3f" % similarity_score)
    print(rouge_table + " & %.3f" % similarity_score)

    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)
    rouge_output_file = os.path.join(
        args.output_dir, "cnndm_test_full_results_{}_{}.txt".format(
            args.model_name,
            os.path.basename(args.model_path).split(".")[0]))
    with open(rouge_output_file, 'w', encoding="utf-8") as f:
        f.write(rouge_score_formatted)
        f.write("Similarity score(sbert): %.3f\n" % similarity_score)
        f.write(rouge_table + " & %.3f" % (similarity_score))
Example #10
def collate(batch, pad_token_id=0, device=None, is_test=False):
    return Batch(batch,
                 pad_token_id=pad_token_id,
                 device=device,
                 is_test=is_test)
Example #11
def collate(batch, pad_token_id=0, device=None):
    return Batch(batch, pad_token_id=pad_token_id, device=device)
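Both collate variants are meant to be handed to a DataLoader, typically with the pad token bound first, as in Example #9. A hedged usage sketch (the dataset and tokenizer objects are placeholders assumed to exist):

import functools
from torch.utils.data import DataLoader

# Bind the tokenizer's pad id so the DataLoader can call collate(batch) directly.
collate_fn = functools.partial(collate, pad_token_id=tokenizer.pad_token_id)

loader = DataLoader(dataset, batch_size=8, collate_fn=collate_fn)
for batch in loader:
    # Each item is a Batch built by collate(); see Example #9 for the fields it exposes.
    src, mask = batch.src, batch.mask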
Example #12
def train_full(args, model, tokenizer, writer):
    """ Fine-tune the pretrained model on the corpus. """
    set_seed(args)

    # Load the data
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_dataset = CNNDMBlobNoTokens(prefix='train',
                                      data_path=args.data_dir,
                                      label_key=args.label_key)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=1,
                                  collate_fn=single_collate,
                                  num_workers=args.num_workers)

    # Training schedule
    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = t_total // (
            len(train_dataloader) // args.gradient_accumulation_steps + 1)
    else:
        t_total = (len(train_dataloader) // args.gradient_accumulation_steps *
                   args.num_train_epochs)

    ##Bertsum optimizer and scheduler
    if args.optim == 'bertsum':
        optimizer = build_optim(args, model, None)
    else:
        #Prepare optimizer and schedule (linear warmup and decay)
        no_decay = ['bias', 'LayerNorm.weight']

        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            args.weight_decay
        }, {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.0
        }]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.lr)
        scheduler = get_linear_schedule_with_warmup(optimizer,
                                                    num_warmup_steps=t_total *
                                                    0.1,
                                                    num_training_steps=t_total)

    if 'score' in args.label_key:
        criterion = torch.nn.MSELoss(reduction='sum')
    else:
        criterion = torch.nn.BCELoss(reduction='sum')

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Train
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps)
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    model.zero_grad()
    train_iterator = trange(args.num_train_epochs, desc="Epoch", disable=True)

    global_step = 0
    tr_loss = 0.0
    logging_loss = 0.0
    start_time = time.time()
    num_docs = 0
    real_batch = []
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")

        for step, batch in enumerate(epoch_iterator):
            num_docs += 1
            blocks = create_labeled_blocks(args, batch[0], tokenizer)
            free_slots = args.train_batch_size - len(real_batch)

            real_batch.extend(blocks[:free_slots])
            if len(real_batch) == args.train_batch_size:
                _batch = Batch(real_batch, pad_token_id=tokenizer.pad_token_id)
                source, encoder_mask, target, clss, cls_mask = _batch.src, _batch.mask, _batch.labels, _batch.clss, _batch.mask_cls

                source = source.to(args.device)
                target = target.to(args.device)
                encoder_mask = encoder_mask.to(args.device)
                cls_mask = cls_mask.to(args.device).bool()
                clss = clss.to(args.device)

                model.train()
                outputs, mask, _ = model(
                    source,
                    encoder_mask,
                    clss,
                    cls_mask,
                )

                #loss = criterion(outputs,target.float())
                #sumloss = loss.sum(dim=1)
                #summask = mask.float().sum(dim=1)
                #loss = (sumloss / summask).sum()
                #loss = (sumloss / summask).mean()

                loss = criterion(outputs, target.float())

                if args.n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu parallel training

                #Only do this if mean loss
                #if args.gradient_accumulation_steps > 1:
                #   loss /= args.gradient_accumulation_steps

                if args.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

                tr_loss += loss.item()

                real_batch = []
                real_batch.extend(blocks[free_slots:])

                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.max_grad_norm:
                        if args.fp16:
                            torch.nn.utils.clip_grad_norm_(
                                amp.master_params(optimizer),
                                args.max_grad_norm)
                        else:
                            torch.nn.utils.clip_grad_norm_(
                                model.parameters(), args.max_grad_norm)
                    optimizer.step()
                    if args.optim != 'bertsum':
                        scheduler.step()
                    model.zero_grad()
                    global_step += 1

                    if args.max_steps > 0 and global_step > args.max_steps:
                        epoch_iterator.close()
                        break

                    if global_step % args.logging_steps == 0:
                        elapsed = time.time() - start_time
                        logger.info("##STEP: %i", (global_step))
                        logger.info("Unscaled loss: %f", tr_loss)
                        logger.info('Scaled loss: %f',
                                    (tr_loss /
                                     (global_step * args.train_batch_size *
                                      args.gradient_accumulation_steps)))
                        if args.optim == 'bertsum':
                            logger.info(
                                "loss: %4.2f; lr: %7.7f; %3.0f docs/s;",
                                (tr_loss - logging_loss) / args.logging_steps,
                                optimizer.learning_rate,
                                (global_step * args.train_batch_size *
                                 args.gradient_accumulation_steps) / elapsed,
                            )
                        else:
                            logger.info(
                                "loss: %4.2f; lr: %7.7f; %3.0f docs/s;",
                                (tr_loss - logging_loss) / args.logging_steps,
                                scheduler.get_lr()[0],
                                (global_step * args.train_batch_size *
                                 args.gradient_accumulation_steps) / elapsed,
                            )

                        logger.info("num docs: %f", (num_docs))
                        logger.info("num docs: %f", (num_docs / elapsed))
                        if args.optim == 'bertsum':
                            writer.add_scalar("train/lr",
                                              optimizer.learning_rate,
                                              global_step)
                        else:
                            writer.add_scalar('train/lr',
                                              scheduler.get_lr()[0],
                                              global_step)
                        writer.add_scalar('train/loss',
                                          (tr_loss - logging_loss) /
                                          args.logging_steps, global_step)
                        writer.add_scalar(
                            'train/loss_norm',
                            tr_loss / (global_step * args.train_batch_size *
                                       args.gradient_accumulation_steps),
                            global_step)
                        logging_loss = tr_loss

                    if global_step % args.eval_save_steps == 0 or global_step == 2000:
                        if not os.path.isdir(args.output_dir):
                            os.mkdir(args.output_dir)
                        checkpoint_path = os.path.join(
                            args.output_dir,
                            "model_step_{}.bin".format(global_step))
                        checkpoint = model.state_dict()
                        if args.n_gpu > 1:
                            from collections import OrderedDict
                            new_state_dict = OrderedDict()
                            for k, v in checkpoint.items():
                                name = k[7:]  # remove the 'module.' prefix added by DataParallel
                                new_state_dict[name] = v
                            checkpoint = new_state_dict
                        torch.save(checkpoint, checkpoint_path)

        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.optim == 'bertsum':
        writer.add_scalar("train/lr", optimizer.learning_rate, global_step)
    else:
        writer.add_scalar('train/lr', scheduler.get_lr()[0], global_step)
    writer.add_scalar('train/loss',
                      (tr_loss - logging_loss) / args.logging_steps,
                      global_step)
    writer.add_scalar(
        'train/loss_norm', tr_loss / (global_step * args.train_batch_size *
                                      args.gradient_accumulation_steps),
        global_step)
    logging_loss = tr_loss
    checkpoint_path = os.path.join(args.output_dir,
                                   "model_step_{}.bin".format(global_step))
    checkpoint = model.state_dict()
    if args.n_gpu > 1:
        from collections import OrderedDict
        new_state_dict = OrderedDict()
        for k, v in checkpoint.items():
            name = k[7:]  # remove the 'module.' prefix added by DataParallel
            new_state_dict[name] = v
        checkpoint = new_state_dict
    torch.save(checkpoint, checkpoint_path)
    torch.save(
        args,
        os.path.join(args.output_dir,
                     "training_arguments_{}.bin".format(global_step)))
    torch.save(
        optimizer,
        os.path.join(args.output_dir,
                     "optimizer_step_{}.bin".format(global_step)))
    return global_step, tr_loss / global_step
Example #13
    def __init__(self, cell_index, stimulus_type, loss, optimizer, mean_adapt):
        """
        Superclass for managing keras models

        Parameters
        ----------

        cell_index : int

        stimulus_type : string
            Either 'naturalscene' or 'whitenoise'

        loss : string or object, optional
            The loss function to use. (Default: poisson_loss)
            See http://keras.io/objectives/ for more information

        optimizer : string or object
            The optimizer to use. (Default: sgd)
            See http://keras.io/optimizers/ for more information

        """

        # compile the model
        with notify('Compiling'):
            self.model.compile(loss=loss, optimizer=optimizer)

        # save architecture as a json file
        self.savedir = mksavedir(prefix=str(self))
        with notify('Saving architecture'):
            with open(join(self.savedir, 'architecture.json'), 'w') as f:
                f.write(self.model.to_json())

        # function to write data to a CSV file
        self.save_csv = partial(tocsv, join(self.savedir, 'performance'))
        self.save_csv(['Epoch', 'Iteration', 'Training CC', 'Test CC'])
        # load experimental data
        self.stimulus_type = stimulus_type
        if str(self) == 'lstm':
            numTime = self.stim_shape[0]
            self.holdout = loadexpt(cell_index,
                                    self.stimulus_type,
                                    'test',
                                    self.stim_shape[1],
                                    mean_adapt=mean_adapt)
            self.training = loadexpt(cell_index,
                                     self.stimulus_type,
                                     'train',
                                     self.stim_shape[1],
                                     mean_adapt=mean_adapt)
            X_train = self.training.X
            y_train = self.training.y
            X_test = self.holdout.X
            y_test = self.holdout.y
            numTrain = (int(X_train.shape[0] / numTime)) * numTime
            numTest = (int(X_test.shape[0] / numTime)) * numTime
            X_train = X_train[:numTrain]
            y_train = y_train[:numTrain]
            X_test = X_test[:numTest]
            y_test = y_test[:numTest]
            X_train = np.reshape(
                X_train, (int(numTrain / numTime), numTime, self.stim_shape[1],
                          self.stim_shape[2], self.stim_shape[3]))
            y_train = np.reshape(y_train,
                                 (int(numTrain / numTime), numTime, 1))
            X_test = np.reshape(
                X_test, (int(numTest / numTime), numTime, self.stim_shape[1],
                         self.stim_shape[2], self.stim_shape[3]))
            y_test = np.reshape(y_test, (int(numTest / numTime), numTime, 1))
            self.training = Batch(X_train, y_train)
            self.holdout = Batch(X_test, y_test)
        else:
            self.holdout = loadexpt(cell_index,
                                    self.stimulus_type,
                                    'test',
                                    self.stim_shape[0],
                                    mean_adapt=mean_adapt)
            self.training = loadexpt(cell_index,
                                     self.stimulus_type,
                                     'train',
                                     self.stim_shape[0],
                                     mean_adapt=mean_adapt)
        # save model information to a markdown file
        if 'architecture' not in self.__dict__:
            self.architecture = 'No architecture information specified'

        metadata = [
            '# ' + str(self), '## ' + strftime('%B %d, %Y'),
            'Started training on: ' + strftime('%I:%M:%S %p'),
            '### Architecture', self.architecture, '### Stimulus',
            'Experiment 10-07-15', stimulus_type,
            'Mean adaptation: ' + str(mean_adapt),
            'Cell #{}'.format(cell_index), '### Optimization',
            str(loss),
            str(optimizer)
        ]
        tomarkdown(join(self.savedir, 'README'), metadata)
Example #14
def compute_loss_one_batch(model):
    if len(model.megabatch) == 0:

        if model.megabatch_anneal == 0:
            for i in range(model.max_megabatch_size):
                if model.curr_idx < len(model.mb):
                    model.megabatch.append(model.mb[model.curr_idx][1])
                    model.curr_idx += 1
        else:
            if model.increment and model.curr_megabatch_size < model.max_megabatch_size:
                model.curr_megabatch_size += 1
                model.increment = False
                print("Increasing megabatch size to {0}".format(model.curr_megabatch_size))

            for i in range(model.curr_megabatch_size):
                if model.curr_idx < len(model.mb):
                    model.megabatch.append(model.mb[model.curr_idx][1])
                    model.curr_idx += 1
                    if model.curr_idx % model.megabatch_anneal == 0:
                        model.increment = True

        megabatch = []
        for n, i in enumerate(model.megabatch):
            arr = [model.data[t] for t in i]
            example_arr = []
            for j in arr:
                example = (BigExample(j[0], model.vocab, model.rev_vocab, model.scramble_rate),
                           BigExample(j[1], model.vocab, model.rev_vocab, model.scramble_rate))
                if model.args.debug:
                    print("Logging Pairing: {0} {1}".format(j[0].sentence, j[1].sentence))

                example_arr.append(example)
            megabatch.append(example_arr)

        model.megabatch = megabatch

        if len(model.megabatch) == 0:
            return None

        sents1_list = []
        sents2_list = []

        sents1_lengths_list = []
        sents2_lengths_list = []

        for j in model.megabatch:

            sents1 = [i[0] for i in j]
            sents2 = [i[1] for i in j]

            sents_1_torch, lengths_1_torch = model.torchify_batch(sents1)
            if model.gpu:
                sents_1_torch = sents_1_torch.cuda()
                lengths_1_torch = lengths_1_torch.cuda()

            sents_2_torch, lengths_2_torch = model.torchify_batch(sents2)
            if model.gpu:
                sents_2_torch = sents_2_torch.cuda()
                lengths_2_torch = lengths_2_torch.cuda()

            sents1_list.append(sents_1_torch)
            sents2_list.append(sents_2_torch)

            sents1_lengths_list.append(lengths_1_torch)
            sents2_lengths_list.append(lengths_2_torch)

        p1_sents_list, p1_lengths_list, p2_sents_list, p2_lengths_list = get_pairs_batch(
            model, sents1_list, sents1_lengths_list, sents2_list, sents2_lengths_list)

        model.megabatch = []
        for i in range(len(p1_sents_list)):
            new_batch = Batch()
            new_batch.g1 = sents1_list[i]
            new_batch.g1_l = sents1_lengths_list[i]

            new_batch.g2 = sents2_list[i]
            new_batch.g2_l = sents2_lengths_list[i]

            new_batch.p1 = p1_sents_list[i]
            new_batch.p1_l = p1_lengths_list[i]

            new_batch.p2 = p2_sents_list[i]
            new_batch.p2_l = p2_lengths_list[i]

            model.megabatch.append(new_batch)

    curr_batch = model.megabatch.pop(0)

    g1, g2, p1, p2 = model.forward(curr_batch)

    return model.loss_function(g1, g2, p1, p2)