Beispiel #1
0
def main():
    """Summarize the documents in ``opt.src`` and write results to ``opt.output``.

    Every input line is treated as one document whose sentences are joined
    with the ``##SENT##`` marker. Each sentence is split on single spaces and
    truncated to ``opt.max_sent_length`` tokens. Documents are buffered until
    ``opt.batch_size`` of them are available, handed to
    ``translator.translate`` and one tab-separated ``<id>, <prediction>`` line
    is written per document.

    All files are opened through an ``ExitStack`` so they are closed even if
    summarization raises part-way through.
    """
    # Local import keeps this block self-contained.
    from contextlib import ExitStack

    opt = parser.parse_args()
    seq_length = opt.max_sent_length
    logger.info(opt)
    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    translator = neusum.Summarizer(opt, logger=logger)

    srcBatch, tgtBatch = [], []
    src_raw = []

    with ExitStack() as stack:
        outF = stack.enter_context(open(opt.output, 'w', encoding='utf-8'))
        # The target file is optional; read it with the same encoding as the
        # source file instead of the locale default.
        tgtF = (stack.enter_context(open(opt.tgt, encoding='utf-8'))
                if opt.tgt else None)
        srcF = stack.enter_context(open(opt.src, encoding='utf-8'))

        # NOTE(review): addone is defined elsewhere; this loop relies on it
        # yielding a trailing None after the last line -- confirm.
        for line in addone(srcF):
            if line is not None:
                srcSents = line.strip().split('##SENT##')
                src_raw.append(srcSents)
                srcBatch.append(
                    [sent.split(' ')[:seq_length] for sent in srcSents])

                if tgtF:
                    # Kept in lockstep with the source lines, although the
                    # targets are not passed to translate() below.
                    tgtBatch.append(tgtF.readline().split(' '))

                if len(srcBatch) < opt.batch_size:
                    continue
            elif not srcBatch:
                # End-of-input marker with nothing left to flush.
                break

            predBatch, predId, _, _ = translator.translate(
                srcBatch, src_raw, None)

            for b, pred in enumerate(predBatch):
                outF.write('{0}\t{1}'.format(predId[b], pred) + '\n')
                # Flushed after every line, as in the original.
                outF.flush()

            srcBatch, tgtBatch = [], []
            src_raw = []
def main():
    """Build the training data online, set up the log-linear model and train it.

    Reads its configuration from the module-level ``opt``. The steps are:
    preprocess the training files with ``onlinePreprocess``, wrap them in a
    ``neusum.Dataset``, create and configure ``loglinear.model.LogLinear``,
    create the optimizer, optionally load dev data, and call ``trainModel``.

    Raises:
        Exception: if ``opt.online_process_data`` is falsy, because loading
            preprocessed data is not supported here.
    """
    # Guard clause: only on-the-fly preprocessing is supported.
    if not opt.online_process_data:
        raise Exception(
            'This code does not use preprocessed .pt pickle file. It has some issues with big files.'
        )

    import onlinePreprocess

    # The preprocessing module is configured through module-level attributes.
    onlinePreprocess.seq_length = opt.max_sent_length
    onlinePreprocess.max_doc_len = opt.max_doc_len
    onlinePreprocess.shuffle = int(bool(opt.process_shuffle))
    onlinePreprocess.norm_lambda = opt.norm_lambda
    dataset = onlinePreprocess.prepare_data_online(
        opt.train_src, opt.src_vocab, opt.train_tgt, opt.tgt_vocab,
        opt.train_oracle, opt.train_src_rouge, opt.train_src_section,
        opt.drop_too_short, opt.drop_too_long)

    train = dataset['train']
    trainData = neusum.Dataset(
        train['src'],
        train['src_raw'],
        train['tgt'],
        train['oracle'],
        train['src_rouge'],
        train['src_section'],
        train['src_section_raw'],
        opt.batch_size,
        opt.max_doc_len,
        opt.gpus,
        train['bert_annotation'],
        good_patterns=loglinear.Config.Keyword[opt.qtype],
        use_good=True)

    dicts = dataset['dicts']
    logger.info(' * vocabulary size. source = %d' % (dicts['src'].size()))
    logger.info(' * number of training sentences. %d' % len(train['src']))
    logger.info(' * maximum batch size. %d' % opt.batch_size)

    model = loglinear.model.LogLinear(use_gpu=bool(opt.gpus))
    model.set_rules(opt.position_weight, opt.keyword_weight,
                    loglinear.Config.Keyword[opt.qtype], opt.in_bert_weight,
                    opt.in_section_weight,
                    loglinear.Config.PossibleSection[opt.qtype],
                    opt.section_embedding, opt.pre_word_vecs_enc)

    # Place the model on GPU when at least one device id was given.
    if len(opt.gpus) < 1:
        model.cpu()
    else:
        model.cuda()

    if opt.freeze_word_vecs_enc:
        logger.warning('Not updating encoder word embedding.')

    optim = neusum.Optim(opt.optim,
                         opt.learning_rate,
                         max_grad_norm=opt.max_grad_norm,
                         max_weight_value=opt.max_weight_value,
                         lr_decay=opt.learning_rate_decay,
                         start_decay_at=opt.start_decay_at,
                         decay_bad_count=opt.halve_lr_bad_count)
    optim.set_parameters(model.parameters())

    # Dev data is loaded only when both the dev input and reference are set.
    if opt.dev_input_src and opt.dev_ref:
        summarizer = neusum.Summarizer(opt, model, dataset)
        validData = load_dev_data(
            summarizer,
            opt.dev_input_src,
            opt.dev_ref,
            opt.dev_input_src_section,
            opt.drop_too_short,
            opt.drop_too_long,
            test_bert_annotation=opt.test_bert_annotation)
    else:
        validData = None

    trainModel(model, trainData, validData, dataset, optim)
Beispiel #3
0
def main():
    """Build the training data online, assemble the NeuSum model and train it.

    Reads its configuration from the module-level ``opt``. The steps are:
    preprocess the training files with ``onlinePreprocess``, wrap them in a
    ``neusum.Dataset``, build the encoder / document encoder / pointer /
    decoder initializer, initialize parameters, load pretrained vectors,
    create the optimizer, optionally load dev data, and call ``trainModel``.

    Raises:
        Exception: if ``opt.online_process_data`` is falsy, because loading
            preprocessed data is not supported here.
        ValueError: if ``opt.dec_init`` is neither ``"simple"`` nor ``"att"``.
    """
    # Guard clause: only on-the-fly preprocessing is supported.
    if not opt.online_process_data:
        raise Exception(
            'This code does not use preprocessed .pt pickle file. It has some issues with big files.'
        )

    import onlinePreprocess

    # The preprocessing module is configured through module-level attributes.
    onlinePreprocess.seq_length = opt.max_sent_length
    onlinePreprocess.max_doc_len = opt.max_doc_len
    onlinePreprocess.shuffle = int(bool(opt.process_shuffle))
    onlinePreprocess.norm_lambda = opt.norm_lambda
    dataset = onlinePreprocess.prepare_data_online(
        opt.train_src, opt.src_vocab, opt.train_tgt, opt.tgt_vocab,
        opt.train_oracle, opt.train_src_rouge)

    train = dataset['train']
    trainData = neusum.Dataset(train['src'], train['src_raw'], train['tgt'],
                               train['oracle'], train['src_rouge'],
                               opt.batch_size, opt.max_doc_len, opt.gpus)

    dicts = dataset['dicts']
    logger.info(' * vocabulary size. source = %d; target = %d' %
                (dicts['src'].size(), dicts['tgt'].size()))
    logger.info(' * number of training sentences. %d' % len(train['src']))
    logger.info(' * maximum batch size. %d' % opt.batch_size)

    logger.info('Building model...')

    sent_encoder = neusum.Models.Encoder(opt, dicts['src'])
    doc_encoder = neusum.Models.DocumentEncoder(opt)
    pointer = neusum.Models.Pointer(opt, dicts['tgt'])

    # Pick the decoder-state initializer requested on the command line.
    init_kind = opt.dec_init
    if init_kind == "simple":
        decIniter = neusum.Models.DecInit(opt)
    elif init_kind == "att":
        decIniter = neusum.Models.DecInitAtt(opt)
    else:
        raise ValueError('Unknown decoder init method: {0}'.format(
            opt.dec_init))

    model = neusum.Models.NMTModel(sent_encoder, doc_encoder, pointer,
                                   decIniter, rouge_calculator)
    summarizer = neusum.Summarizer(opt, model, dataset)

    # Place the model on GPU when at least one device id was given.
    if len(opt.gpus) < 1:
        model.cpu()
    else:
        model.cuda()

    if opt.freeze_word_vecs_enc:
        logger.warning('Not updating encoder word embedding.')

    # Log every parameter name and initialize it: 1-D parameters are drawn
    # from a normal distribution, all others go through xavier_normal.
    for param_name, param in model.named_parameters():
        logger.info(param_name)
        if param.dim() != 1:
            xavier_normal(param, math.sqrt(3))
        else:
            param.data.normal_(0, math.sqrt(6 / (1 + param.size(0))))

    # Runs after the random init above, as in the original statement order.
    sent_encoder.load_pretrained_vectors(opt, logger)

    optim = neusum.Optim(opt.optim,
                         opt.learning_rate,
                         max_grad_norm=opt.max_grad_norm,
                         max_weight_value=opt.max_weight_value,
                         lr_decay=opt.learning_rate_decay,
                         start_decay_at=opt.start_decay_at,
                         decay_bad_count=opt.halve_lr_bad_count)
    optim.set_parameters(model.parameters())

    # Dev data is loaded only when both the dev input and reference are set.
    if opt.dev_input_src and opt.dev_ref:
        validData = load_dev_data(summarizer, opt.dev_input_src, opt.dev_ref)
    else:
        validData = None

    trainModel(model, summarizer, trainData, validData, dataset, optim)
Beispiel #4
0
        decIniter = neusum.Models.DecInit(opt)
    elif opt.dec_init == "att":
        decIniter = neusum.Models.DecInitAtt(opt)
    else:
        raise ValueError('Unknown decoder init method: {0}'.format(
            opt.dec_init))

    model = neusum.Models.NMTModel(sent_encoder, doc_encoder, pointer,
                                   decIniter)

    # load model
    logger.info('Loading trained model...')
    # model.load_state_dict(checkpoint['model'])

    model.load_state_dict(checkpoint['model'])
    summarizer = neusum.Summarizer(opt, model, dataset)

    if len(opt.gpus) >= 1:
        model.cuda()
    else:
        model.cpu()

    testData = load_dev_data(summarizer,
                             opt.dev_input_src,
                             opt.dev_ref,
                             opt.dev_input_src_section,
                             test_bert_annotation=opt.test_bert_annotation)
    model.eval()
    scores = evalModel(model, summarizer, testData, opt.output_len, 'test',
                       opt.set_postfix, opt.stripping_mode,
                       checkpoint['epoch'])