def main():
    # Imports needed by this snippet (placed inside main() as in the later examples).
    import onlinePreprocess
    from onlinePreprocess import prepare_data_online
    onlinePreprocess.seq_length = opt.max_sent_length
    onlinePreprocess.max_doc_len = opt.max_doc_len
    onlinePreprocess.shuffle = 1 if opt.process_shuffle else 0
    onlinePreprocess.norm_lambda = opt.norm_lambda
    dataset = prepare_data_online(opt.train_src, opt.src_vocab, opt.train_tgt,
                                  opt.tgt_vocab, opt.train_oracle,
                                  opt.train_src_rouge, opt.train_src_section,
                                  opt.drop_too_short, opt.drop_too_long)

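    # Bundle the preprocessed documents, oracle labels, per-sentence ROUGE scores,
    # sections, and BERT annotations into a batched training Dataset; good_patterns
    # passes the keyword list configured for opt.qtype.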
    trainData = neusum.Dataset(
        dataset['train']['src'],
        dataset['train']['src_raw'],
        dataset['train']['tgt'],
        dataset['train']['oracle'],
        dataset['train']['src_rouge'],
        dataset['train']['src_section'],
        dataset['train']['src_section_raw'],
        opt.batch_size,
        opt.max_doc_len,
        opt.gpus,
        dataset['train']['bert_annotation'],
        good_patterns=loglinear.Config.Keyword[opt.qtype],
        use_good=True)

    dicts = dataset['dicts']
    print(' * vocabulary size. source = %d' % (dicts['src'].size()))
    print(' * number of training sentences. %d' % len(dataset['train']['src']))
    print(' * maximum batch size. %d' % opt.batch_size)

    print('Building model...')

    sent_encoder = neusum.Models.Encoder(opt, dicts['src'])
    doc_encoder = neusum.Models.DocumentEncoder(opt)
    pointer = neusum.Models.Pointer(opt)
    if opt.dec_init == "simple":
        decIniter = neusum.Models.DecInit(opt)
    elif opt.dec_init == "att":
        decIniter = neusum.Models.DecInitAtt(opt)
    else:
        raise ValueError('Unknown decoder init method: {0}'.format(
            opt.dec_init))

    model = neusum.Models.NMTModel(sent_encoder, doc_encoder, pointer,
                                   decIniter)

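    # A log-linear baseline is built alongside the neural model; set_rules below
    # weights its hand-crafted features (position, keyword, BERT-annotation, and
    # section features, as suggested by the option names).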
    log_linear_model = loglinear.model.LogLinear()
    log_linear_model.set_rules(opt.position_weight, opt.keyword_weight,
                               loglinear.Config.Keyword[opt.qtype],
                               opt.in_bert_weight, opt.in_section_weight,
                               loglinear.Config.PossibleSection[opt.qtype],
                               opt.section_embedding, opt.pre_word_vecs_enc)

    get_report(model, 'Neural-based Model')
    get_report(log_linear_model, 'Log-linear Model')
Example 2
    def buildData(self, srcBatch, srcRaw, tgtRaw, oracleBatch, srcRougeBatch,
                  src_section_batch, src_section_raw, bert_annotation=None,
                  good_patterns: List[str] = None, use_good: bool = False):
        """
        Convert tokenized documents and section texts to vocabulary indices and
        wrap them in a Dataset (used in load_dev_data).
        """
        srcData = [[self.src_dict.convertToIdx(b, neusum.Constants.UNK_WORD)
                    for b in doc] for doc in srcBatch]
        srcSectionData = [[self.src_dict.convertToIdx(b, neusum.Constants.UNK_WORD)
                           for b in doc] for doc in src_section_batch]

        return neusum.Dataset(srcData, srcRaw, tgtRaw, oracleBatch, srcRougeBatch,
                              srcSectionData, src_section_raw,
                              self.opt.batch_size, self.opt.max_doc_len, self.opt.cuda,
                              bert_annotation=bert_annotation,
                              good_patterns=good_patterns, use_good=use_good)
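
    # A simpler variant of buildData without section or BERT inputs: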
    def buildData(self, srcBatch, srcRaw, tgtRaw, oracleBatch, srcRougeBatch):
        srcData = [[
            self.src_dict.convertToIdx(b, neusum.Constants.UNK_WORD)
            for b in doc
        ] for doc in srcBatch]

        return neusum.Dataset(srcData,
                              srcRaw,
                              tgtRaw,
                              oracleBatch,
                              srcRougeBatch,
                              self.opt.batch_size,
                              self.opt.max_doc_len,
                              self.opt.cuda,
                              volatile=True)
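
A minimal usage sketch for the five-argument buildData above, assuming it is a method of the summarizer object handed to load_dev_data; the sample data and the shapes of the oracle and ROUGE arguments are illustrative assumptions, not taken from the source:

# Hypothetical call (names and argument shapes assumed): each document is a list
# of sentences, and each sentence is a list of tokens.
dev_src = [[['neural', 'summarization', 'selects', 'sentences'],
            ['it', 'scores', 'each', 'candidate']]]
dev_src_raw = [['neural summarization selects sentences',
                'it scores each candidate']]
dev_tgt_raw = [['summarization scores and selects sentences']]
dev_oracle = [[0]]        # assumed: oracle sentence index per document
dev_rouge = [[0.9, 0.4]]  # assumed: per-sentence ROUGE gains as regression targets
dev_data = summarizer.buildData(dev_src, dev_src_raw, dev_tgt_raw, dev_oracle, dev_rouge)
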
def main():
    if not opt.online_process_data:
        raise Exception(
            'This code does not use a preprocessed .pt pickle file; that path has issues with big files.'
        )
        # dataset = torch.load(opt.data)
    else:
        import onlinePreprocess
        onlinePreprocess.seq_length = opt.max_sent_length
        onlinePreprocess.max_doc_len = opt.max_doc_len
        onlinePreprocess.shuffle = 1 if opt.process_shuffle else 0
        onlinePreprocess.norm_lambda = opt.norm_lambda
        from onlinePreprocess import prepare_data_online
        dataset = prepare_data_online(opt.train_src, opt.src_vocab,
                                      opt.train_tgt, opt.tgt_vocab,
                                      opt.train_oracle, opt.train_src_rouge,
                                      opt.train_src_section,
                                      opt.drop_too_short, opt.drop_too_long)

    trainData = neusum.Dataset(
        dataset['train']['src'],
        dataset['train']['src_raw'],
        dataset['train']['tgt'],
        dataset['train']['oracle'],
        dataset['train']['src_rouge'],
        dataset['train']['src_section'],
        dataset['train']['src_section_raw'],
        opt.batch_size,
        opt.max_doc_len,
        opt.gpus,
        dataset['train']['bert_annotation'],
        good_patterns=loglinear.Config.Keyword[opt.qtype],
        use_good=True)

    dicts = dataset['dicts']
    # logger.info(' * vocabulary size. source = %d; target = %d' %
    #             (dicts['src'].size(), dicts['tgt'].size()))
    logger.info(' * vocabulary size. source = %d' % (dicts['src'].size()))
    logger.info(' * number of training sentences. %d' %
                len(dataset['train']['src']))
    logger.info(' * maximum batch size. %d' % opt.batch_size)

    # sent_encoder = loglinear.model.SentEncoder(opt, dicts['src'])
    # model = loglinear.model.LogLinear(sent_encoder)
    if opt.gpus:
        model = loglinear.model.LogLinear(use_gpu=True)
    else:
        model = loglinear.model.LogLinear(use_gpu=False)

    model.set_rules(opt.position_weight, opt.keyword_weight,
                    loglinear.Config.Keyword[opt.qtype], opt.in_bert_weight,
                    opt.in_section_weight,
                    loglinear.Config.PossibleSection[opt.qtype],
                    opt.section_embedding, opt.pre_word_vecs_enc)

    if len(opt.gpus) >= 1:
        model.cuda()
    else:
        model.cpu()

    if opt.freeze_word_vecs_enc:
        logger.warning('Not updating encoder word embedding.')

    # sent_encoder.load_pretrained_vectors(opt, logger)

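    # neusum.Optim wraps the optimizer selected by opt.optim; judging by the option
    # names, it clips gradients to max_grad_norm, caps weight magnitudes at
    # max_weight_value, and decays the learning rate after halve_lr_bad_count bad
    # validation rounds.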
    optim = neusum.Optim(opt.optim,
                         opt.learning_rate,
                         max_grad_norm=opt.max_grad_norm,
                         max_weight_value=opt.max_weight_value,
                         lr_decay=opt.learning_rate_decay,
                         start_decay_at=opt.start_decay_at,
                         decay_bad_count=opt.halve_lr_bad_count)

    optim.set_parameters(model.parameters())

    validData = None
    if opt.dev_input_src and opt.dev_ref:
        summarizer = neusum.Summarizer(opt, model, dataset)
        validData = load_dev_data(
            summarizer,
            opt.dev_input_src,
            opt.dev_ref,
            opt.dev_input_src_section,
            opt.drop_too_short,
            opt.drop_too_long,
            test_bert_annotation=opt.test_bert_annotation)

    trainModel(model, trainData, validData, dataset, optim)
Example 5
def main():
    if not opt.online_process_data:
        raise Exception(
            'This code does not use a preprocessed .pt pickle file; that path has issues with big files.'
        )
        # dataset = torch.load(opt.data)
    else:
        import onlinePreprocess
        onlinePreprocess.seq_length = opt.max_sent_length
        onlinePreprocess.max_doc_len = opt.max_doc_len
        onlinePreprocess.shuffle = 1 if opt.process_shuffle else 0
        onlinePreprocess.norm_lambda = opt.norm_lambda
        from onlinePreprocess import prepare_data_online
        dataset = prepare_data_online(opt.train_src, opt.src_vocab,
                                      opt.train_tgt, opt.tgt_vocab,
                                      opt.train_oracle, opt.train_src_rouge)

    trainData = neusum.Dataset(dataset['train']['src'],
                               dataset['train']['src_raw'],
                               dataset['train']['tgt'],
                               dataset['train']['oracle'],
                               dataset['train']['src_rouge'], opt.batch_size,
                               opt.max_doc_len, opt.gpus)
    dicts = dataset['dicts']
    logger.info(' * vocabulary size. source = %d; target = %d' %
                (dicts['src'].size(), dicts['tgt'].size()))
    logger.info(' * number of training sentences. %d' %
                len(dataset['train']['src']))
    logger.info(' * maximum batch size. %d' % opt.batch_size)

    logger.info('Building model...')

    sent_encoder = neusum.Models.Encoder(opt, dicts['src'])
    doc_encoder = neusum.Models.DocumentEncoder(opt)
    pointer = neusum.Models.Pointer(opt, dicts['tgt'])
    if opt.dec_init == "simple":
        decIniter = neusum.Models.DecInit(opt)
    elif opt.dec_init == "att":
        decIniter = neusum.Models.DecInitAtt(opt)
    else:
        raise ValueError('Unknown decoder init method: {0}'.format(
            opt.dec_init))

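    # rouge_calculator is not defined in this snippet (it lives at module level in
    # the original file); presumably it is the ROUGE scorer used for sentence scoring.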
    model = neusum.Models.NMTModel(sent_encoder, doc_encoder, pointer,
                                   decIniter, rouge_calculator)
    summarizer = neusum.Summarizer(opt, model, dataset)

    if len(opt.gpus) >= 1:
        model.cuda()
    else:
        model.cpu()

    if opt.freeze_word_vecs_enc:
        logger.warning('Not updating encoder word embedding.')

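    # Parameter initialization: 1-D tensors (biases and the like) get a zero-mean
    # normal with std sqrt(6 / (1 + n)), while weight matrices get Xavier-normal
    # init with gain sqrt(3).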
    for pr_name, p in model.named_parameters():
        logger.info(pr_name)
        # p.data.uniform_(-opt.param_init, opt.param_init)
        if p.dim() == 1:
            # p.data.zero_()
            p.data.normal_(0, math.sqrt(6 / (1 + p.size(0))))
        else:
            xavier_normal(p, math.sqrt(3))
            # xavier_uniform(p)

    sent_encoder.load_pretrained_vectors(opt, logger)

    optim = neusum.Optim(opt.optim,
                         opt.learning_rate,
                         max_grad_norm=opt.max_grad_norm,
                         max_weight_value=opt.max_weight_value,
                         lr_decay=opt.learning_rate_decay,
                         start_decay_at=opt.start_decay_at,
                         decay_bad_count=opt.halve_lr_bad_count)

    optim.set_parameters(model.parameters())

    validData = None
    if opt.dev_input_src and opt.dev_ref:
        validData = load_dev_data(summarizer, opt.dev_input_src, opt.dev_ref)
    trainModel(model, summarizer, trainData, validData, dataset, optim)