Code example #1
    def eval(self):
        params = self.params
        self.model.eval()

        lang_id1 = params.lang2id[params.src_lang]
        lang_id2 = params.lang2id[params.trg_lang]

        valid = 0
        total = 0

        for sent1, len1, sent2, len2, y, _, _ in tqdm(
                self.dataloader['valid']):
            sent1, len1 = truncate(sent1, len1, params.max_len,
                                   params.eos_index)
            sent2, len2 = truncate(sent2, len2, params.max_len,
                                   params.eos_index)
            x, lengths, positions, langs = concat_batches(sent1,
                                                          len1,
                                                          lang_id1,
                                                          sent2,
                                                          len2,
                                                          lang_id2,
                                                          params.pad_index,
                                                          params.eos_index,
                                                          reset_positions=True)

            # cuda
            x, y, lengths, positions, langs = to_cuda(x,
                                                      y,
                                                      lengths,
                                                      positions,
                                                      langs,
                                                      gpu=self.gpu)

            # forward
            output = self.model(x, lengths, positions, langs)
            predictions = output.data.max(1)[1]

            # update statistics
            valid += predictions.eq(y).sum().item()
            total += len(len1)

        # compute accuracy
        acc = 100.0 * valid / total
        scores = {'acc': acc}
        return scores
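
All of these examples funnel sentence pairs through XLM-style helpers, `concat_batches` above all. As a reading aid, here is a minimal sketch of what a `concat_batches`-like helper has to do, inferred purely from the call sites in these examples; the names, shapes, and details are assumptions, not XLM's actual implementation.

import torch

def concat_batches_sketch(x1, len1, lang1_id, x2, len2, lang2_id,
                          pad_index, eos_index, reset_positions=True):
    # Pack two padded (slen, bs) LongTensor batches into one sequence per
    # pair, with per-token position and language ids, as the call sites
    # above expect. eos_index is kept only for signature parity.
    lengths = len1 + len2
    slen, bs = int(lengths.max()), lengths.size(0)

    x = x1.new_full((slen, bs), pad_index)
    positions = torch.arange(slen).unsqueeze(1).repeat(1, bs)
    langs = x1.new_full((slen, bs), lang1_id)

    for i in range(bs):
        l1, l2 = int(len1[i]), int(len2[i])
        x[:l1, i] = x1[:l1, i]
        x[l1:l1 + l2, i] = x2[:l2, i]
        langs[l1:l1 + l2, i] = lang2_id
        if reset_positions:
            # restart position ids at the start of the second sentence
            positions[l1:, i] -= l1
    return x, lengths, positions, langs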
Code example #2
def run(model,
        params,
        dico,
        data,
        split,
        src_lang,
        trg_lang,
        gen_type="src2trg",
        alpha=1.,
        beta=1.,
        gamma=0.,
        uniform=False,
        iter_mult=1,
        mask_schedule="constant",
        constant_k=1,
        batch_size=8,
        gpu_id=0):
    if gen_type == "src2trg":
        ref_path = params.ref_paths[(src_lang, trg_lang, split)]
    elif gen_type == "trg2src":
        ref_path = params.ref_paths[(trg_lang, src_lang, split)]

    with open(ref_path, encoding="utf-8") as f:
        refs = [s.strip() for s in f]
    hypothesis = []
    for batch_n, batch in enumerate(
            get_iterator(params, data, split, "de", "en")):  # NOTE: iterator is hardcoded to the de-en pair
        (src_x, src_lens), (trg_x, trg_lens) = batch

        batches, batches_src_lens, batches_trg_lens, total_scores = [], [], [], []
        for i_topk_length in range(params.num_topk_lengths):

            # overwrite source/target lengths according to dataset stats if necessary
            if params.de2en_lengths is not None and params.en2de_lengths is not None:
                src_lens_item = src_lens[0].item() - 2  # remove BOS, EOS
                trg_lens_item = trg_lens[0].item() - 2  # remove BOS, EOS
                if gen_type == "src2trg":
                    if len(params.de2en_lengths[src_lens_item].keys()
                           ) < i_topk_length + 1:
                        break
                    data_trg_lens = sorted(
                        params.de2en_lengths[src_lens_item].items(),
                        key=operator.itemgetter(1))
                    # take the i_topk_length-th most likely length and add BOS, EOS
                    data_trg_lens_item = data_trg_lens[-1 - i_topk_length][0] + 2
                    # overwrite trg_lens
                    trg_lens = torch.ones_like(trg_lens) * data_trg_lens_item
                elif gen_type == "trg2src":
                    if len(params.en2de_lengths[trg_lens_item].keys()
                           ) < i_topk_length + 1:
                        break
                    data_src_lens = sorted(
                        params.en2de_lengths[trg_lens_item].items(),
                        key=operator.itemgetter(1))
                    # take the i_topk_length-th most likely length and add BOS, EOS
                    data_src_lens_item = data_src_lens[-1 - i_topk_length][0] + 2
                    # overwrite src_lens
                    src_lens = torch.ones_like(src_lens) * data_src_lens_item

            if gen_type == "src2trg":
                sent1_input = src_x
                sent2_input = create_masked_batch(trg_lens, params, dico)
                dec_len = torch.max(trg_lens).item() - 2  # cut BOS, EOS
            elif gen_type == "trg2src":
                sent1_input = create_masked_batch(src_lens, params, dico)
                sent2_input = trg_x
                dec_len = torch.max(src_lens).item() - 2  # cut BOS, EOS

            batch, lengths, positions, langs = concat_batches(
                sent1_input, src_lens, params.lang2id[src_lang],
                sent2_input, trg_lens, params.lang2id[trg_lang],
                params.pad_index, params.eos_index,
                reset_positions=True, assert_eos=True)

            if gpu_id >= 0:
                batch, lengths, positions, langs, src_lens, trg_lens = \
                    to_cuda(batch, lengths, positions, langs, src_lens, trg_lens)

            with torch.no_grad():
                batch, total_score_argmax_toks = \
                    _evaluate_batch(model, params, dico, batch,
                                    lengths, positions, langs, src_lens, trg_lens,
                                    gen_type, alpha, beta, gamma, uniform,
                                    dec_len, iter_mult, mask_schedule, constant_k)
            batches.append(batch.clone())
            batches_src_lens.append(src_lens.clone())
            batches_trg_lens.append(trg_lens.clone())
            total_scores.append(total_score_argmax_toks)

        best_score_idx = np.array(total_scores).argmax()
        batch, src_lens, trg_lens = batches[best_score_idx], batches_src_lens[
            best_score_idx], batches_trg_lens[best_score_idx]

        for batch_idx in range(batch_size):
            src_len = src_lens[batch_idx].item()
            tgt_len = trg_lens[batch_idx].item()
            if gen_type == "src2trg":
                generated = batch[src_len:src_len + tgt_len, batch_idx]
            else:
                generated = batch[:src_len, batch_idx]
            # a well-formed segment has <eos> at both ends; if extra <eos>
            # tokens were generated, cut everything after the second one
            eos_pos = (generated == params.eos_index).nonzero()
            if eos_pos.shape[0] > 2:
                generated = generated[:(eos_pos[1, 0].item() + 1)]
            hypothesis.extend(convert_to_text(
                generated.unsqueeze(1),
                torch.Tensor([generated.shape[0]]).int(),
                dico, params))

        print("Ex {0}\nRef: {1}\nHyp: {2}\n".format(
            batch_n, refs[batch_n].encode("utf-8"),
            hypothesis[-1].encode("utf-8")))

    hyp_path = os.path.join(params.hyp_path, 'decoding.txt')
    hyp_path_tok = os.path.join(params.hyp_path, 'decoding.tok.txt')

    # export sentences to the hypothesis files; restore_segmentation then
    # detokenizes hyp_path in place, while hyp_path_tok keeps the BPE form
    with open(hyp_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(hypothesis) + '\n')
    with open(hyp_path_tok, 'w', encoding='utf-8') as f:
        f.write('\n'.join(hypothesis) + '\n')
    restore_segmentation(hyp_path)

    # evaluate BLEU score
    bleu = eval_moses_bleu(ref_path, hyp_path)
    print("BLEU %s-%s; %s %s : %f" %
          (src_lang, trg_lang, hyp_path, ref_path, bleu))
    # write BLEU score result to file
    result_path = os.path.join(params.hyp_path, "result.txt")
    with open(result_path, 'w', encoding='utf-8') as f:
        f.write("BLEU %s-%s; %s %s : %f\n" %
                (src_lang, trg_lang, hyp_path, ref_path, bleu))
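
Example #2 decodes in mask-predict style: the target side starts as an all-mask batch and `_evaluate_batch` fills it in over `iter_mult` refinement iterations, with the best candidate length chosen by total score. A plausible sketch of the `create_masked_batch` helper it relies on, assuming XLM-style `params.mask_index`, `params.eos_index`, and `params.pad_index` attributes; this is a reconstruction from the call sites, not the project's code.

import torch

def create_masked_batch_sketch(lens, params, dico):
    # Build a (max_len, bs) batch that is <eos> at both sentence ends and
    # <mask> everywhere in between, so every interior position can be
    # predicted iteratively. dico is kept only for signature parity.
    slen, bs = int(lens.max()), lens.size(0)
    batch = torch.full((slen, bs), params.pad_index, dtype=torch.long)
    for i in range(bs):
        l = int(lens[i])
        batch[0, i] = params.eos_index       # XLM uses <eos> as BOS too
        batch[1:l - 1, i] = params.mask_index
        batch[l - 1, i] = params.eos_index
    return batch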
Code example #3
File: predict_pc.py / Project: gbcolborne/XLM
def main(args):
    rng = np.random.RandomState(0)

    # Make dump path
    if not os.path.exists(args.dump_path):
        os.makedirs(args.dump_path)
    else:
        if os.listdir(args.dump_path):
            m = "Directory {} is not empty.".format(args.dump_path)
            raise ValueError(m)
    write_log = len(args.log_file) > 0

    # load model parameters
    model_dir = os.path.dirname(args.load_model)
    params_path = os.path.join(model_dir, 'params.pkl')
    with open(params_path, "rb") as f:
        params = pickle.load(f)

    # load data parameters and model parameters from checkpoint
    checkpoint_path = os.path.join(model_dir, 'checkpoint.pth')
    assert os.path.isfile(checkpoint_path)
    data = torch.load(
        checkpoint_path,
        map_location=lambda storage, loc: storage.cuda(params.local_rank))
    for k, v in data["params"].items():
        params.__dict__[k] = v
    dico = Dictionary(data["dico_id2word"], data["dico_word2id"],
                      data["dico_counts"])

    # Print score
    for k, v in data["best_metrics"].items():
        print("- {}: {}".format(k, v))

    # Fix some of the params we pass to load_data
    params.debug_train = False
    params.max_vocab = -1
    params.min_count = 0
    params.tokens_per_batch = -1
    params.max_batch_size = args.batch_size
    params.batch_size = args.batch_size

    # load data
    data = load_data(args.data_path, params)

    # Print data summary
    for (src, tgt), dataset in data['para'].items():
        datatype = "Para data (%s)" % (
            "WITHOUT labels" if dataset.labels is None else "WITH labels")
        m = '{: <27} - {: >12}:{: >10}'.format(datatype, '%s-%s' % (src, tgt),
                                               len(dataset))
        print(m)

    # Fix some of the params we pass to the model builder
    params.reload_model = args.load_model

    # build model
    if params.encoder_only:
        model = build_model(params, dico)
    else:
        encoder, decoder = build_model(params, dico)
        model = encoder

    # Predict
    model = model.module if params.multi_gpu else model
    model.eval()
    start = time.time()
    for (src, tgt), dataset in data['para'].items():
        path = os.path.join(args.dump_path, "{}-{}.pred".format(src, tgt))
        scores_file = open(path, "w")
        lang1_id = params.lang2id[src]
        lang2_id = params.lang2id[tgt]
        diffs = []
        nb_written = 0
        for batch in dataset.get_iterator(False,
                                          group_by_size=False,
                                          n_sentences=-1,
                                          return_indices=False):
            (sent1, len1), (sent2, len2), labels = batch
            sent1, len1 = truncate(sent1, len1, params.max_len,
                                   params.eos_index)
            sent2, len2 = truncate(sent2, len2, params.max_len,
                                   params.eos_index)
            x, lengths, positions, langs = concat_batches(sent1,
                                                          len1,
                                                          lang1_id,
                                                          sent2,
                                                          len2,
                                                          lang2_id,
                                                          params.pad_index,
                                                          params.eos_index,
                                                          reset_positions=True)
            x, lengths, positions, langs = to_cuda(x, lengths, positions,
                                                   langs)
            with torch.no_grad():
                # Get sentence pair embedding
                h = model('fwd',
                          x=x,
                          lengths=lengths,
                          positions=positions,
                          langs=langs,
                          causal=False)[0]
                CLF_ID1, CLF_ID2 = 8, 9  # very hacky, use embeddings to make weights for the classifier
                emb = (model.module
                       if params.multi_gpu else model).embeddings.weight
                pred = F.linear(h, emb[CLF_ID1].unsqueeze(0), emb[CLF_ID2, 0])
                pred = torch.sigmoid(pred)
                pred = pred.view(-1).cpu().numpy().tolist()
            for p, l1, l2 in zip(pred, len1, len2):
                if l1.item() == 0 and l2.item() == 0:
                    scores_file.write("0.00000000\n")
                else:
                    scores_file.write("{:.8f}\n".format(p))
            nb_written += len(pred)
            if nb_written % 1000 == 0:
                elapsed = int(time.time() - start)
                lpss = elapsed % 60
                lpsm = elapsed // 60
                lpsh = lpsm // 60
                lpsm = lpsm % 60
                msg = "[{:02d}:{:02d}:{:02d} {}-{}]".format(
                    lpsh, lpsm, lpss, src, tgt)
                msg += " {}/{} ({:.2f}%) sentences processed".format(
                    nb_written, len(dataset), 100 * nb_written / len(dataset))
                print(msg)
                if write_log:
                    with open(args.log_file, "a") as fout:
                        fout.write(msg + "\n")
            # Try reversing order
            if TEST_REVERSE:
                x, lengths, positions, langs = concat_batches(
                    sent2,
                    len2,
                    lang2_id,
                    sent1,
                    len1,
                    lang1_id,
                    params.pad_index,
                    params.eos_index,
                    reset_positions=True)
                x, lengths, positions, langs = to_cuda(x, lengths, positions,
                                                       langs)
                with torch.no_grad():
                    # Get sentence pair embedding
                    h = model('fwd',
                              x=x,
                              lengths=lengths,
                              positions=positions,
                              langs=langs,
                              causal=False)[0]
                    CLF_ID1, CLF_ID2 = 8, 9  # very hacky, use embeddings to make weights for the classifier
                    emb = (model.module
                           if params.multi_gpu else model).embeddings.weight
                    pred_rev = F.linear(h, emb[CLF_ID1].unsqueeze(0),
                                        emb[CLF_ID2, 0])
                    pred_rev = torch.sigmoid(pred_rev)
                    pred_rev = pred_rev.view(-1).cpu().numpy().tolist()
                    for p, pp in zip(pred, pred_rev):
                        diffs.append(p - pp)

        if TEST_REVERSE:
            print(
                "Average absolute diff between score(l1,l2) and score(l2,l1): {}"
                .format(np.mean(np.abs(diffs))))

        scores_file.close()
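
The "very hacky" classifier in example #3 repurposes two reserved embedding rows as a one-output logistic head: row CLF_ID1 serves as the weight vector and the first scalar of row CLF_ID2 as the bias. A toy illustration of exactly what that F.linear call computes, using stand-in tensors rather than the real model:

import torch
import torch.nn.functional as F

emb = torch.randn(10, 4)       # stand-in for model.embeddings.weight
h = torch.randn(3, 4)          # stand-in for first-token hidden states
CLF_ID1, CLF_ID2 = 8, 9

# weight: (1, dim) from one embedding row; bias: one scalar from another
logit = F.linear(h, emb[CLF_ID1].unsqueeze(0), emb[CLF_ID2, 0])
prob = torch.sigmoid(logit).view(-1)   # one pair-score per batch element
print(prob.shape)                      # torch.Size([3])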
Code example #4
    def train(self):
        params = self.params
        self.model.train()

        # training variables
        losses = []

        lang_id1 = params.lang2id[params.src_lang]
        lang_id2 = params.lang2id[params.trg_lang]

        for sent1, len1, sent2, len2, y, _, _ in self.dataloader['train']:
            self.global_step += 1
            sent1, len1 = truncate(sent1, len1, params.max_len,
                                   params.eos_index)
            sent2, len2 = truncate(sent2, len2, params.max_len,
                                   params.eos_index)
            x, lengths, positions, langs = concat_batches(sent1,
                                                          len1,
                                                          lang_id1,
                                                          sent2,
                                                          len2,
                                                          lang_id2,
                                                          params.pad_index,
                                                          params.eos_index,
                                                          reset_positions=True)

            bs = len(len1)

            # cuda
            x, y, lengths, positions, langs = to_cuda(x,
                                                      y,
                                                      lengths,
                                                      positions,
                                                      langs,
                                                      gpu=self.gpu)

            # loss
            output = self.model(x, lengths, positions, langs)
            loss = self.criterion(output, y)

            # backward / optimization
            self.optimizer_e.zero_grad()
            self.optimizer_p.zero_grad()
            loss.backward()
            self.optimizer_e.step()
            self.optimizer_p.step()
            losses.append(loss.item())

            # log
            if self.global_step % self.params.report_interval == 0:
                logger.info("GPU %i - Epoch %i - Global_step %i - Loss: %.4f" %
                            (self.gpu, self.epoch, self.global_step,
                             sum(losses) / len(losses)))
                losses = []

            if self.global_step % params.eval_interval == 0:
                if self.gpu == 0:
                    logger.info("XLM - Evaluating")
                    with torch.no_grad():
                        scores = self.eval()
                        if scores['acc'] > self.best_acc:
                            self.best_acc = scores['acc']
                            torch.save(
                                self.model.module,
                                os.path.join(params.save_model,
                                             'best_acc_model.pkl'))
                            with open(
                                    os.path.join(params.save_model,
                                                 'best_acc.note'), 'a') as f:
                                f.write(str(self.best_acc) + '\n')
                        with open(os.path.join(params.save_model, 'acc.note'),
                                  'a') as f:
                            f.write(str(scores['acc']) + '\n')
                        logger.info("acc - %i " % scores['acc'])
                    self.model.train()
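
Every example calls `truncate` before `concat_batches`, so no concatenated pair can exceed twice `params.max_len`. Here is a sketch of what that helper must do, inferred from the call sites; the real helper lives in the XLM repo and may differ in detail.

import torch

def truncate_sketch(x, lengths, max_len, eos_index):
    # Clip any sentence longer than max_len and rewrite its final token
    # as <eos>, keeping the eos-based bookkeeping downstream consistent.
    if int(lengths.max()) > max_len:
        x = x[:max_len].clone()
        lengths = lengths.clone()
        for i in range(lengths.size(0)):
            if int(lengths[i]) > max_len:
                lengths[i] = max_len
                x[max_len - 1, i] = eos_index
    return x, lengths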
Code example #5
    def run_test(self):

        params = self.params
        result_path = params.test_result_path + '_{}'.format(self.gpu)
        self.model.eval()

        lang_id1 = params.lang2id[params.src_lang]
        lang_id2 = params.lang2id[params.trg_lang]

        proba_result = []
        src_text_list = []
        trg_text_list = []

        with torch.no_grad():

            for sent1, len1, sent2, len2, _, src_text, trg_text in tqdm(
                    self.dataloader['test']):
                sent1, len1 = truncate(sent1, len1, params.max_len,
                                       params.eos_index)
                sent2, len2 = truncate(sent2, len2, params.max_len,
                                       params.eos_index)
                x, lengths, positions, langs = concat_batches(
                    sent1,
                    len1,
                    lang_id1,
                    sent2,
                    len2,
                    lang_id2,
                    params.pad_index,
                    params.eos_index,
                    reset_positions=True)

                # cuda
                x, lengths, positions, langs = to_cuda(x,
                                                       lengths,
                                                       positions,
                                                       langs,
                                                       gpu=self.gpu)

                # forward
                output = self.model(x, lengths, positions, langs)
                proba = F.softmax(output, 1)[:, 1]

                proba_result.extend(proba.cpu().numpy())
                src_text_list.extend(src_text)
                trg_text_list.extend(trg_text)
                assert len(proba_result) == len(src_text_list)
                assert len(proba_result) == len(trg_text_list)

                if len(proba_result) > params.flush_frequency:
                    logger.info(" GPU %i - write out score..." % self.gpu)
                    with open(result_path, 'a') as f:
                        for i in range(len(proba_result)):
                            f.write('{}{}{}{}{}'.format(
                                src_text_list[i], params.delimeter,
                                trg_text_list[i], params.delimeter,
                                str(proba_result[i])) + os.linesep)
                        proba_result = []
                        src_text_list = []
                        trg_text_list = []

            # write out the remaining scores
            logger.info(" GPU %i - write out score..." % self.gpu)
            with open(result_path, 'a') as f:
                for i in range(len(proba_result)):
                    f.write('{}{}{}{}{}'.format(
                        src_text_list[i], params.delimeter, trg_text_list[i],
                        params.delimeter, str(proba_result[i])) + os.linesep)
                proba_result = []
                src_text_list = []
                trg_text_list = []
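
Finally, `to_cuda` appears in every example, sometimes with a `gpu=` keyword and sometimes without. A one-line sketch consistent with those call sites; this is assumed behaviour, not the project's definition.

def to_cuda_sketch(*tensors, gpu=0):
    # Move each tensor to the given CUDA device, passing None through.
    return [t if t is None else t.cuda(gpu) for t in tensors]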