Example #1
def test_text_abs(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if (pt != ''):
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)

    checkpoint = torch.load(test_from,
                            map_location=lambda storage, loc: storage)
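    # map_location keeps every checkpoint tensor on CPU; AbsSummarizer later moves the weights to the chosen device.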
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if (k in model_flags):
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    test_iter = data_loader.Dataloader(args,
                                       load_dataset(args,
                                                    'test',
                                                    shuffle=False),
                                       args.test_batch_size,
                                       device,
                                       shuffle=False,
                                       is_test=True)
    logger.info(dir(test_iter))
    tokenizer = BertTokenizer.from_pretrained(args.model_path,
                                              do_lower_case=True)
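    # BERT's reserved [unusedN] vocabulary slots double as the decoder's control tokens.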
    symbols = {
        'BOS': tokenizer.vocab['[unused1]'],
        'EOS': tokenizer.vocab['[unused2]'],
        'PAD': tokenizer.vocab['[PAD]'],
        'EOQ': tokenizer.vocab['[unused3]']
    }
    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)
Example #2
def _format_to_bert(params):
    corpus_type, json_file, args, save_file = params
    is_test = corpus_type == 'test'
    if (os.path.exists(save_file)):
        logger.info('Ignore %s' % save_file)
        return

    bert = BertData(args)

    logger.info('Processing %s' % json_file)
    with open(json_file) as f:
        jobs = json.load(f)
    datasets = []
    for d in jobs:
        d = json.loads(d)
        source, tgt, alignment = d['src'], d['tgt'], d['alignment']

        # temp code
        tgt = [tgt]
        # temp code end

        sent_labels = greedy_selection(source[:args.max_src_nsents], tgt, 3)
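        # greedy_selection produces oracle extractive labels: up to 3 source sentences picked greedily by ROUGE gain against tgt.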
        if (args.lower):
            source = [' '.join(s).lower().split() for s in source]
            tgt = [' '.join(s).lower().split() for s in tgt]
        b_data = bert.preprocess(source, tgt, alignment, sent_labels,
                                 use_bert_basic_tokenizer=args.use_bert_basic_tokenizer,
                                 is_test=is_test)
        # b_data = bert.preprocess(source, tgt, sent_labels, use_bert_basic_tokenizer=args.use_bert_basic_tokenizer)

        if (b_data is None):
            continue
        src_subtoken_idxs, sent_labels, tgt_subtoken_idxs, segments_ids, cls_ids, src_txt, tgt_txt, alignment = b_data
        b_data_dict = {
            "src": src_subtoken_idxs,
            "tgt": tgt_subtoken_idxs,
            "src_sent_labels": sent_labels,
            "segs": segments_ids,
            'clss': cls_ids,
            'src_txt': src_txt,
            "tgt_txt": tgt_txt,
            "alignment": alignment
        }
        datasets.append(b_data_dict)
    logger.info('Processed instances %d' % len(datasets))
    logger.info('Saving to %s' % save_file)
    torch.save(datasets, save_file)
    datasets = []
    gc.collect()
Example #3
def train_abs_multi(args):
    """ Spawns 1 process per GPU """
    init_logger()

    nb_gpu = args.world_size
    mp = torch.multiprocessing.get_context("spawn")
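    # Use the "spawn" start method: CUDA cannot be re-initialized in forked child processes.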

    # Create a thread to listen for errors in the child processes.
    error_queue = mp.SimpleQueue()
    error_handler = ErrorHandler(error_queue)

    # Train with multiprocessing.
    procs = []
    for i in range(nb_gpu):
        device_id = i
        procs.append(
            mp.Process(target=run, args=(args, device_id, error_queue), daemon=True)
        )
        procs[i].start()
        logger.info(" Starting process pid: %d  " % procs[i].pid)
        error_handler.add_child(procs[i].pid)
    for p in procs:
        p.join()
Example #4
def load_models_abs(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if (pt != ''):
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)

    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if (k in model_flags):
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, cache_dir=args.temp_dir)
    symbols = {'BOS': tokenizer.vocab['[unused0]'], 'EOS': tokenizer.vocab['[unused1]'],
               'PAD': tokenizer.vocab['[PAD]'], 'EOQ': tokenizer.vocab['[unused2]']}
    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    return predictor
Example #5
def validate(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if (pt != ''):
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if (k in model_flags):
            setattr(args, k, opt[k])
    print(args)

    model = ExtSummarizer(args, device, checkpoint)
    model.eval()

    valid_iter = data_loader.Dataloader(args, load_dataset(args, 'dev', shuffle=False),
                                        args.batch_size, device,
                                        shuffle=False, is_test=False)
    trainer = build_trainer(args, device_id, model, None)
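    # The cross-entropy returned below is what the validate_* loops use to rank checkpoints.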
    stats = trainer.validate(valid_iter, step)
    return stats.xent()
Example #6
    def inference_save_by_id(self, loader, cache_dir, model_file):

        datalist = CommentDataset(self.args, loader, cache_dir, self.dataname, self.device, self.tokenizer)
        data_iter = Iterator(datalist, batch_size=len(datalist), device=self.device, shuffle=False) # 1651

        if model_file:
            print('Model is loaded from ', model_file)
            if logger:
                logger.info('Model is from {}'.format(model_file))     
            model_path = os.path.join(self.savepath, model_file)
            self.model.load_state_dict(torch.load(model_path, map_location=lambda storage, loc: storage)['model'], strict=False)
        else:
            print('Not loading pretrained model...')
            if logger:
                logger.info('Not loading pretrained model ...')
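        # Reset the temp output folders before inference; the makedirs calls below recreate them.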
        try:
            print('[Warning] Going to delete the original temp folder at ', self.args.savepath)
            shutil.rmtree(self.args.savepath+'/temp')
        except Exception:
            print('Creating new temp folder')
        os.makedirs(os.path.join(self.args.savepath, 'temp'), exist_ok=True)
        os.makedirs(os.path.join(self.args.savepath, 'temp_gold'), exist_ok=True)
        self.predictor.translate(data_iter, model_file, cal_rouge=False, save=False, save_by_id=True)
Example #7
def test_text_ext(args):
    logger.info('Loading checkpoint from %s' % args.test_from)
    checkpoint = torch.load(
        args.test_from, map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if (k in model_flags):
            setattr(args, k, opt[k])

    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    device_id = 0 if device == "cuda" else -1
    logger.info('Building extractive summarizer')
    model = ExtSummarizer(args, device, checkpoint)
    model.eval()
    logger.info('Model ready for evaluation')
    logger.info('args: %s' % args)
    test_iter = data_loader.load_text(
        args, args.text_src, args.text_tgt, device)

    trainer = build_trainer(args, device_id, model, None)
    stats = trainer.test(test_iter, -1)
    logger.info('Testing finished')
    return stats
Example #8
def fix_missing_period(args):
    input_dir = os.path.abspath(args.raw_path)
    output_dir = os.path.abspath(args.save_path)
    os.makedirs(output_dir, exist_ok=True)

    logger.info("Fixing missing period in %s and saving in %s..." %
                (input_dir, output_dir))
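    # _fix_missing_period rewrites each .story file, appending a period to lines that lack terminal punctuation.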
    stories = os.listdir(input_dir)
    for s in stories:
        if (not s.endswith('story')):
            continue
        _fix_missing_period(os.path.join(input_dir, s),
                            os.path.join(output_dir, s))

    # Check that the tokenized stories directory contains the same number of files as the original directory
    num_inputs = len(os.listdir(input_dir))
    num_outputs = len(os.listdir(output_dir))
    if num_inputs != num_outputs:
        raise Exception(
            "The output directory %s contains %i files, but it should contain the same number as %s (which has %i files). Was there an error during processing?"
            % (output_dir, num_outputs, input_dir, num_inputs))
    logger.info("Successfully finished fixing missing period %s to %s.\n" %
                (input_dir, output_dir))
Example #9
    def output(self, step, num_steps, learning_rate, start):
        """Write out statistics to stdout.

        Args:
           step (int): current step
           num_steps (int): total number of steps
           learning_rate (float): current learning rate
           start (float): start time of the step
        """
        t = self.elapsed_time()
        step_fmt = "%2d" % step
        if num_steps > 0:
            step_fmt = "%s/%5d" % (step_fmt, num_steps)
        logger.info(
            ("Step %s; xent: %4.2f; " + "lr: %7.7f; %3.0f docs/s; %6.0f sec")
            % (
                step_fmt,
                self.xent(),
                learning_rate,
                self.n_docs / (t + 1e-5),
                time.time() - start,
            )
        )
        sys.stdout.flush()
Example #10
def str_format_to_bert_test(source, tgt, args, save_file):

    bert = BertData(args)

    logger.info('Processing %s' % source)
    tgt = [word_tokenize(t) for t in sent_tokenize(tgt)]
    source = [word_tokenize(t) for t in sent_tokenize(source)]

    sent_labels = greedy_selection(source[:args.max_src_nsents], tgt, 3)
    if (args.lower):
        source = [' '.join(s).lower().split() for s in source]
        tgt = [' '.join(s).lower().split() for s in tgt]
    b_data = bert.preprocess(
        source,
        tgt,
        sent_labels,
        use_bert_basic_tokenizer=args.use_bert_basic_tokenizer,
        is_test=True)

    if (b_data is None):
        return
    src_subtoken_idxs, sent_labels, tgt_subtoken_idxs, segments_ids, cls_ids, src_txt, tgt_txt = b_data
    b_data_dict = {
        "src": src_subtoken_idxs,
        "tgt": tgt_subtoken_idxs,
        "src_sent_labels": sent_labels,
        "segs": segments_ids,
        'clss': cls_ids,
        'src_txt': src_txt,
        "tgt_txt": tgt_txt
    }

    sent_labels = [0 for i in range(len(sent_labels))]
    tgt = []
    datasets = [b_data_dict]
    logger.info('Saving to %s' % save_file)
    torch.save(datasets, save_file)
Example #11
def validate(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if pt != '':
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from,
                            map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])
    print(args)

    symbols, tokenizer = get_symbol_and_tokenizer(args.encoder, args.temp_dir)
    model = AbsSummarizer(args, device, checkpoint, symbols=symbols)
    model.eval()

    valid_iter = data_loader.Dataloader(args,
                                        load_dataset(args,
                                                     'valid',
                                                     shuffle=False),
                                        args.batch_size,
                                        device,
                                        shuffle=False,
                                        is_test=False,
                                        tokenizer=tokenizer)
    valid_loss = abs_loss(model.generator,
                          symbols,
                          model.vocab_size,
                          train=False,
                          device=device)

    trainer = build_trainer(args, device_id, model, None, valid_loss)
    stats = trainer.validate(valid_iter, step)
    return stats.xent()
Example #12
def test_abs(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if (pt != ''):
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)

    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if (k in model_flags):
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    test_iter = data_loader.Dataloader(args, load_dataset(args, 'test', shuffle=False),
                                       args.test_batch_size, device,
                                       shuffle=False, is_test=True)

    tokenizer = BertData(args).tokenizer

    #tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, cache_dir=args.temp_dir)
    # tokenizer = None
    # if args.pretrained_model_type in ['bert-base-uncased', 'bert-base-multilingual-uncased']:
    #     tokenizer = BertTokenizer.from_pretrained(args.pretrained_model_type, do_lower_case=True, cache_dir=args.temp_dir)
    #
    # if not tokenizer:
    #     raise NotImplementedError("tokenizer")

    # tokenizer = add_to_vocab(tokenizer, ['[unused0]', '[unused1]', '[PAD]', '[unused2]'])
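    # convert_tokens_to_ids also resolves tokens added after pretraining, unlike a direct tokenizer.vocab lookup.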
    symbols = {'BOS': tokenizer.convert_tokens_to_ids('[unused0]'), 'EOS': tokenizer.convert_tokens_to_ids('[unused1]'),
               'PAD': tokenizer.convert_tokens_to_ids('[PAD]'), 'EOQ': tokenizer.convert_tokens_to_ids('[unused2]')}
    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)
Example #13
def train(args, device_id):
    init_logger(args.log_file)

    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    logger.info('Device ID %d' % device_id)
    logger.info('Device %s' % device)
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    if device_id >= 0:
        torch.cuda.set_device(device_id)
        torch.cuda.manual_seed(args.seed)

    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    def train_iter_fct():
        return data_loader.Dataloader(args,
                                      load_dataset(args, 'train',
                                                   shuffle=True),
                                      args.batch_size,
                                      device,
                                      shuffle=True,
                                      is_test=False)
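    # The trainer calls this factory to rebuild the single-pass training iterator whenever it is exhausted.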

    model = Summarizer(args, device, load_pretrained_bert=True)
    if args.train_from != '':
        logger.info('Loading checkpoint from %s' % args.train_from)
        checkpoint = torch.load(args.train_from,
                                map_location=lambda storage, loc: storage)
        opt = vars(checkpoint['opt'])
        for k in opt.keys():
            if (k in model_flags):
                setattr(args, k, opt[k])
        model.load_cp(checkpoint)
        optim = model_builder.build_optim(args, model, checkpoint)
    else:
        optim = model_builder.build_optim(args, model, None)

    logger.info(model)
    trainer = build_trainer(args, device_id, model, optim)
    losses, n_docs = trainer.train(train_iter_fct, args.train_steps)

    save_pickle(losses, 'losses_classifier')
    save_pickle(n_docs, 'docs_classifier')
Example #14
def validate(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if (pt != ''):
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if (k in model_flags):
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    valid_iter = data_loader.Dataloader(args, load_dataset(args, 'valid', shuffle=False),
                                        args.batch_size, device,
                                        shuffle=False, is_test=False)

    if args.bart:
        tokenizer = AutoTokenizer.from_pretrained('/home/ybai/downloads/bart', do_lower_case=True,
                                                  cache_dir=args.temp_dir, local_files_only=False)
        symbols = {'BOS': tokenizer.encoder['madeupword0000'], 'EOS': tokenizer.encoder['madeupword0001'],
                   'PAD': tokenizer.encoder['<pad>'], 'EOQ': tokenizer.encoder['madeupword0002']}
    else:
        tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased', do_lower_case=True,
                                                  cache_dir=args.temp_dir, local_files_only=True)
        symbols = {'BOS': tokenizer.vocab['[unused1]'], 'EOS': tokenizer.vocab['[unused2]'],
                   'PAD': tokenizer.vocab['[PAD]'], 'EOQ': tokenizer.vocab['[unused3]']}

    valid_loss = abs_loss(model.generator, symbols, model.vocab_size, train=False, device=device)

    trainer = build_trainer(args, device_id, model, None, valid_loss)
    stats = trainer.validate(valid_iter, step)
    return stats.xent()
Example #15
def test_text_abs(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == "-1" else "cuda"
    if pt != "":
        test_from = pt
    else:
        test_from = args.test_from
    logger.info("Loading checkpoint from %s" % test_from)

    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    opt = vars(checkpoint["opt"])
    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    test_iter = data_loader.Dataloader(
        args,
        load_dataset(args, "test", shuffle=False),
        args.test_batch_size,
        device,
        shuffle=False,
        is_test=True,
    )
    tokenizer = BertTokenizer.from_pretrained(
        "bert-base-chinese", do_lower_case=True, cache_dir=args.temp_dir
    )
    symbols = {
        "BOS": tokenizer.vocab["[unused0]"],
        "EOS": tokenizer.vocab["[unused1]"],
        "PAD": tokenizer.vocab["[PAD]"],
        "EOQ": tokenizer.vocab["[unused2]"],
    }
    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)
Example #16
def build_trainer(args, device_id, model, optim):
    """
    Simplify `Trainer` creation based on user `opt`s*
    Args:
        opt (:obj:`Namespace`): user options (usually from argument parsing)
        model (:obj:`onmt.models.NMTModel`): the model to train
        fields (dict): dict of fields
        optim (:obj:`onmt.utils.Optimizer`): optimizer used during training
        data_type (str): string describing the type of data
            e.g. "text", "img", "audio"
        model_saver(:obj:`onmt.models.ModelSaverBase`): the utility object
            used to save the model
    """

    grad_accum_count = args.accum_count
    n_gpu = args.world_size

    if device_id >= 0:
        gpu_rank = int(args.gpu_ranks[device_id])
    else:
        gpu_rank = 0
        n_gpu = 0

    print('gpu_rank %d' % gpu_rank)

    # set tensorboard.
    tensorboard_log_dir = args.model_path
    writer = SummaryWriter(tensorboard_log_dir, comment="Unmt")
    report_manager = ReportMgr(args.report_every, start_time=-1, tensorboard_writer=writer)
    # build trainer
    trainer = Trainer(args, model, optim, grad_accum_count, n_gpu, gpu_rank, report_manager)
    if (model):
        # parameters
        n_params = _tally_parameters(model)
        logger.info('* number of parameters: %d' % n_params)

    return trainer
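
A minimal sketch of the `_tally_parameters` helper assumed above (it is not shown in this listing, so this is an illustration matching the single-count usage here rather than the verbatim helper):

def _tally_parameters(model):
    # Sum the element counts of every parameter tensor in the model.
    return sum(p.nelement() for p in model.parameters())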
Example #17
def test_text_abs(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if (pt != ''):
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)

    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if (k in model_flags):
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    test_iter = data_loader.Dataloader(args, load_dataset(args, 'test', shuffle=False),
                                       args.test_batch_size, device,
                                       shuffle=False, is_test=True)
    # for Chinese tokenization
    add_token_list = ['[unused1]', '[unused2]', '[unused3]', '[unused4]', '[unused5]']
    if args.bart:
        tokenizer = AutoTokenizer.from_pretrained('bart-base', do_lower_case=True, cache_dir=args.temp_dir, local_files_only=False)
        # tokenizer = AutoTokenizer.from_pretrained('/home/ybai/downloads/bart', do_lower_case=True,
        #                                           cache_dir=args.temp_dir, local_files_only=False)
        symbols = {'BOS': tokenizer.encoder['madeupword0000'], 'EOS': tokenizer.encoder['madeupword0001'],
                   'PAD': tokenizer.encoder['<pad>'], 'EOQ': tokenizer.encoder['madeupword0002']}
    else:
        tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased', do_lower_case=True,
                                                  cache_dir=args.temp_dir, local_files_only=False, additional_special_tokens=add_token_list)
        symbols = {'BOS': tokenizer.vocab['[unused1]'], 'EOS': tokenizer.vocab['[unused2]'],
                   'PAD': tokenizer.vocab['[PAD]'], 'EOQ': tokenizer.vocab['[unused3]']}
    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)
Example #18
def test_text_abs(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if (pt != ''):
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)

    checkpoint = torch.load(test_from,
                            map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if (k in model_flags):
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    test_iter = data_loader.Dataloader(args,
                                       load_dataset(args,
                                                    'test',
                                                    shuffle=False),
                                       args.test_batch_size,
                                       device,
                                       shuffle=False,
                                       is_test=True)
    vocab = get_kobert_vocab(cachedir=args.temp_dir)
    symbols = {
        'BOS': vocab.token_to_idx['[BOS]'],
        'EOS': vocab.token_to_idx['[EOS]'],
        'PAD': vocab.token_to_idx['[PAD]'],
        'EOQ': vocab.token_to_idx['[EOS]']
    }
    predictor = build_predictor(args, vocab, symbols, model, logger)
    predictor.translate(test_iter, step)
Example #19
def _format_to_bert(params):
    corpus_type, json_file, args, save_file = params
    is_test = corpus_type == 'test'
    if (os.path.exists(save_file)):
        logger.info('Ignore %s' % save_file)
        return

    bert = BertData(args)

    logger.info('Processing %s' % json_file)
   # print("PATH is : {}".format(json_file))
   # base_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
   # if corpus_type == 'train':
   #     json_file = os.path.join(base_dir, 'merged_stories_tokenized', json_file)
   # elif corpus_type == 'valid':
   #     json_file = os.path.join(base_dir, 'merged_stories_tokenized_val', json_file)
   # elif corpus_type == 'test':
   #     json_file = os.path.join(base_dir, 'merged_stories_tokenized_test', json_file)
   # else:
   #     print("Not in in dataset")
   #     sys.exit()
    jobs = json.load(open(json_file))
    datasets = []
    for d in jobs:
        source, tgt = d['src'], d['tgt']

        sent_labels = greedy_selection(source[:args.max_src_nsents], tgt, 3)
        if (args.lower):
            source = [' '.join(s).lower().split() for s in source]
            tgt = [' '.join(s).lower().split() for s in tgt]
        b_data = bert.preprocess(source, tgt, sent_labels, use_bert_basic_tokenizer=args.use_bert_basic_tokenizer,
                                 is_test=is_test)
        # b_data = bert.preprocess(source, tgt, sent_labels, use_bert_basic_tokenizer=args.use_bert_basic_tokenizer)

        if (b_data is None):
            continue
        src_subtoken_idxs, sent_labels, tgt_subtoken_idxs, segments_ids, cls_ids, src_txt, tgt_txt = b_data
        b_data_dict = {"src": src_subtoken_idxs, "tgt": tgt_subtoken_idxs,
                       "src_sent_labels": sent_labels, "segs": segments_ids, 'clss': cls_ids,
                       'src_txt': src_txt, "tgt_txt": tgt_txt}
        datasets.append(b_data_dict)
    logger.info('Processed instances %d' % len(datasets))
    logger.info('Saving to %s' % save_file)
    torch.save(datasets, save_file)
    datasets = []
    gc.collect()
Example #20
def getTranslator():
    # set up model

    device = "cpu"

    logger.info('Loading checkpoint from %s' % args.test_from)
    checkpoint = torch.load(args.test_from,
                            map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if (k in model_flags):
            setattr(args, k, opt[k])

    print(args)

    config = BertConfig.from_json_file(args.bert_config_path)
    model = Summarizer(args,
                       device,
                       load_pretrained_bert=False,
                       bert_config=config)
    model.load_cp(checkpoint)
    model.eval()

    return build_trainer(args, -1, model, None)
Example #21
def train(args, device_id):
    init_logger(args.log_file)

    device = "cpu" if args.visible_gpus == "-1" else "cuda"
    logger.info("Device ID %d" % device_id)
    logger.info("Device %s" % device)
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    if device_id >= 0:
        torch.cuda.set_device(device_id)
        torch.cuda.manual_seed(args.seed)

    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    def train_iter_fct():
        return data_loader.Dataloader(
            args,
            load_dataset(args, "train", shuffle=True),
            args.batch_size,
            device,
            shuffle=True,
            is_test=False,
        )

    model = Summarizer(args, device, load_pretrained_bert=True)
    if args.train_from != "":
        logger.info("Loading checkpoint from %s" % args.train_from)
        checkpoint = torch.load(
            args.train_from, map_location=lambda storage, loc: storage
        )
        opt = vars(checkpoint["opt"])
        for k in opt.keys():
            if k in model_flags:
                setattr(args, k, opt[k])
        model.load_cp(checkpoint)
        optim = model_builder.build_optim(args, model, checkpoint)
    else:
        optim = model_builder.build_optim(args, model, None)

    logger.info(model)
    trainer = build_trainer(args, device_id, model, optim)
    trainer.train(train_iter_fct, args.train_steps)
Example #22
def validate(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if (pt != ''):
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from,
                            map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if (k in model_flags):
            setattr(args, k, opt[k])
    print(args)

    config = BertConfig.from_json_file(args.bert_config_path)
    model = Summarizer(args,
                       device,
                       load_pretrained_bert=False,
                       bert_config=config)
    model.load_cp(checkpoint)
    model.eval()

    valid_iter = data_loader.Dataloader(args,
                                        load_dataset(args,
                                                     'valid',
                                                     shuffle=False),
                                        args.batch_size,
                                        device,
                                        shuffle=False,
                                        is_test=False)
    trainer = build_trainer(args, device_id, model, None)
    # comet_experiment.log_parameters(config)
    with comet_experiment.test():
        stats = trainer.validate(valid_iter, step)
        return stats.xent()
Example #23
def _format_to_bert(params):
    corpus_type, json_file, args, save_file = params
    is_test = corpus_type == 'test'
    if (os.path.exists(save_file)):
        logger.info('Ignore %s' % save_file)
        return

    bert = BertData(args)

    logger.info('Processing %s' % json_file)
    with open(json_file) as f:
        jobs = json.load(f)
    datasets = []
    for d in jobs:
        source, tgt, n_tgt = d['src'], d['tgt'], d['n_tgt']

        if (args.lower):
            source = [' '.join(s).lower().split() for s in source]
            tgt = [' '.join(s).lower().split() for s in tgt]
            n_tgt = [' '.join(s).lower().split() for s in n_tgt]
        b_data = bert.preprocess(
            source,
            tgt,
            n_tgt,
            use_bert_basic_tokenizer=args.use_bert_basic_tokenizer,
            is_test=is_test)
        if (b_data is None):
            continue
        p_pair, n_pair, p_segments_ids, n_segments_ids, p_summ_mask, n_summ_mask, src_txt, tgt_txt, n_tgt_text = b_data
        b_data_dict = {
            "pos": p_pair,
            "neg": n_pair,
            "p_summ_mask": p_summ_mask,
            'n_summ_mask': n_summ_mask,
            "p_segs": p_segments_ids,
            "n_segs": n_segments_ids,
            'src_txt': src_txt,
            "tgt_txt": tgt_txt,
            "n_tgt_text": n_tgt_text
        }
        datasets.append(b_data_dict)
    logger.info('Processed instances %d' % len(datasets))
    logger.info('Saving to %s' % save_file)
    torch.save(datasets, save_file)
    datasets = []
    gc.collect()
Example #24
    def test_detector(self, loader, run_gen=False, portion='all'):
        """ Testing detector  """
        test = TreeDataset(self.args, loader, self.dataname, self.device, self.tokenizer)
        #test = CommentDataset(self.args, loader, self.dataname, self.device, self.tokenizer)

        data_iter = Iterator(test, train=False, device=self.device, 
                             batch_size=len(test) if len(test)<self.bs else self.bs,
                             sort_key=lambda x: len(x.src),
                             sort_within_batch=False)
        # Define trainer
        train_loss = abs_loss(self.args.label_num, self.maxepoch, self.device, train=False)
        trainer = build_trainer(self.args, self.model, self.optim, train_loss)

        logger.info('Test on best model (stage-1)')
        best_model = os.path.join(self.args.savepath, 'best_model.pt')

        if os.path.exists(best_model):
            try:
                self.model.load_state_dict(
                    torch.load(best_model, map_location=lambda storage, loc: storage)['model'])
            except Exception:
                self.model.load_state_dict(
                    torch.load(best_model, map_location=lambda storage, loc: storage)['model'],
                    strict=False)
                logger.info('[Warning] The keys in state dict do not strictly match')

            test_stat = trainer.testing(data_iter, tokenizer=self.tokenizer, gen_flag=False, info="Without Generated Response >>", write_type="w")
            test_stat.write_results(os.path.join(self.args.savepath, 'result_test.csv'), 'test-'+portion, self.args.label_num)

        if self.args.test_adv:

            logger.info('Test on adversarially-trained model (stage-2)')
            best_model = os.path.join(self.args.savepath, 'best_adv_model.pt')
            if os.path.exists(best_model):
                try:
                    self.model.load_state_dict(
                        torch.load(best_model, map_location=lambda storage, loc: storage)['model'])
                except Exception:
                    self.model.load_state_dict(
                        torch.load(best_model, map_location=lambda storage, loc: storage)['model'],
                        strict=False)
                    logger.info('[Warning] The keys in state dict do not strictly match')

            test_stat = trainer.testing(data_iter, tokenizer=self.tokenizer, gen_flag=True, info="\nWith Generated Response from {} >>".format(best_model.split("/")[-1]), write_type="a")
            predictor = build_predictor(self.args, self.model, self.tokenizer, self.symbols, logger)
            predictor.translate(data_iter, 'best', have_gold=False)
Example #25
def validate_abs(args, device_id):
    timestep = 0
    tensorboard_writer = SummaryWriter(args.model_path + '/valid_test', comment="Unmt")
    if (args.test_all):
        cp_files = sorted(glob.glob(os.path.join(args.model_path, 'model_step_*.pt')))
        cp_files.sort(key=os.path.getmtime)
        xent_lst = []
        for i, cp in enumerate(cp_files):
            logger.info('validate:  %s'%cp)
            step = int(cp.split('.')[-2].split('_')[-1])
            if (args.test_start_from != -1 and step < args.test_start_from):
                xent_lst.append((1e6, cp))
                continue
            xent = validate(args, device_id, cp, step)
            tensorboard_writer.add_scalar('valid/xent', xent, step)
            tensorboard_writer.flush()
            xent_lst.append((xent, cp))
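            # Early stopping: give up once 10 checkpoints pass without a new lowest validation xent.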
            max_step = xent_lst.index(min(xent_lst))
            if (i - max_step > 10):
                break
        xent_lst = sorted(xent_lst, key=lambda x: x[0])
        logger.info('PPL %s' % str(xent_lst))
        for xent, cp in xent_lst:
            step = int(cp.split('.')[-2].split('_')[-1])
            logger.info('test:  %s' % cp)
            test_abs(args, device_id, cp, step)
    else:
        while (True):
            cp_files = sorted(glob.glob(os.path.join(args.model_path, 'model_step_*.pt')))
            cp_files.sort(key=os.path.getmtime)
            if (cp_files):
                cp = cp_files[-1]
                time_of_cp = os.path.getmtime(cp)
                if (not os.path.getsize(cp) > 0):
                    time.sleep(60)
                    continue
                if (time_of_cp > timestep):
                    timestep = time_of_cp
                    step = int(cp.split('.')[-2].split('_')[-1])
                    xent = validate(args, device_id, cp, step)
                    tensorboard_writer.add_scalar('valid/xent', xent, step)
                    tensorboard_writer.flush()
                    test_abs(args, device_id, cp, step)

            cp_files = sorted(glob.glob(os.path.join(args.model_path, 'model_step_*.pt')))
            cp_files.sort(key=os.path.getmtime)
            if (cp_files):
                cp = cp_files[-1]
                time_of_cp = os.path.getmtime(cp)
                if (time_of_cp > timestep):
                    continue
            else:
                time.sleep(300)
Example #26
def validate_ext(args, device_id):
    timestep = 0
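    # timestep tracks the modification time of the newest checkpoint already evaluated by the polling loop.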
    if (args.test_all):
        cp_files = sorted(
            glob.glob(os.path.join(args.model_path, 'model_step_*.pt')))
        cp_files.sort(key=os.path.getmtime)
        xent_lst = []
        for i, cp in enumerate(cp_files):
            step = int(cp.split('.')[-2].split('_')[-1])
            xent = validate(args, device_id, cp, step)
            xent_lst.append((xent, cp))
            max_step = xent_lst.index(min(xent_lst))
            if (i - max_step > 10):
                break
        xent_lst = sorted(xent_lst, key=lambda x: x[0])[:3]
        logger.info('PPL %s' % str(xent_lst))
        logger.info(
            'Decoding and Computing ROUGE for top-3 models for DEV Set: ')
        for xent, cp in xent_lst:
            step = int(cp.split('.')[-2].split('_')[-1])
            val_ext(args, device_id, cp, step)
        logger.info(
            'Decoding and Computing ROUGE for top-3 models for TEST Set: ')
        for xent, cp in xent_lst:
            step = int(cp.split('.')[-2].split('_')[-1])
            test_ext(args, device_id, cp, step)
    else:
        while (True):
            cp_files = sorted(
                glob.glob(os.path.join(args.model_path, 'model_step_*.pt')))
            cp_files.sort(key=os.path.getmtime)
            if (cp_files):
                cp = cp_files[-1]
                time_of_cp = os.path.getmtime(cp)
                if (not os.path.getsize(cp) > 0):
                    time.sleep(60)
                    continue
                if (time_of_cp > timestep):
                    timestep = time_of_cp
                    step = int(cp.split('.')[-2].split('_')[-1])
                    validate(args, device_id, cp, step)
                    test_ext(args, device_id, cp, step)

            cp_files = sorted(
                glob.glob(os.path.join(args.model_path, 'model_step_*.pt')))
            cp_files.sort(key=os.path.getmtime)
            if (cp_files):
                cp = cp_files[-1]
                time_of_cp = os.path.getmtime(cp)
                if (time_of_cp > timestep):
                    continue
            else:
                time.sleep(300)
Example #27
def build_trainer(args, device_id, model, symbols, vocab_size, optim):
    """
    Simplify `Trainer` creation based on user `opt`s*
    Args:
        opt (:obj:`Namespace`): user options (usually from argument parsing)
        model (:obj:`onmt.models.NMTModel`): the model to train
        fields (dict): dict of fields
        optim (:obj:`onmt.utils.Optimizer`): optimizer used during training
        data_type (str): string describing the type of data
            e.g. "text", "img", "audio"
        model_saver(:obj:`onmt.models.ModelSaverBase`): the utility object
            used to save the model
    """

    device = "cpu" if args.visible_gpus == '-1' else "cuda"

    train_loss = build_loss_compute(model.generator,
                                    symbols,
                                    vocab_size,
                                    device,
                                    train=True,
                                    label_smoothing=args.label_smoothing)
    valid_loss = build_loss_compute(model.generator,
                                    symbols,
                                    vocab_size,
                                    train=False,
                                    device=device)

    shard_size = args.max_generator_batches
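    # max_generator_batches bounds how many target positions each loss shard processes, capping peak memory.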
    grad_accum_count = args.accum_count
    n_gpu = args.world_size
    if device_id >= 0:
        gpu_rank = int(args.gpu_ranks[device_id])
    else:
        gpu_rank = 0
        n_gpu = 0

    tensorboard_log_dir = args.model_path

    # writer = SummaryWriter(tensorboard_log_dir, comment="Unmt")

    # report_manager = ReportMgr(args.report_every, start_time=-1, tensorboard_writer=writer)

    trainer = Trainer(args, model, train_loss, valid_loss, optim, shard_size,
                      grad_accum_count, n_gpu, gpu_rank)

    n_params, enc, dec = _tally_parameters(model)
    logger.info('encoder: %d' % enc)
    logger.info('decoder: %d' % dec)
    logger.info('* number of parameters: %d' % n_params)

    return trainer
Example #28
def train(args, device_id):
    init_logger(args.log_file)

    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    logger.info('Device ID %d' % device_id)
    logger.info('Device %s' % device)
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    if device_id >= 0:
        torch.cuda.set_device(device_id)
        torch.cuda.manual_seed(args.seed)

    if args.train_from != '':
        logger.info('Loading checkpoint from %s' % args.train_from)
        checkpoint = torch.load(args.train_from,
                                map_location=lambda storage, loc: storage)
        opt = vars(checkpoint['opt'])
        for k in opt.keys():
            if (k in model_flags):
                setattr(args, k, opt[k])
    else:
        checkpoint = None

    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    def train_iter_fct():
        # return data_loader.AbstractiveDataloader(load_dataset('train', True), symbols, FLAGS.batch_size, device, True)
        return data_loader.Dataloader(args, load_dataset(args, 'train', shuffle=True),
                                      args.batch_size, device, shuffle=True, is_test=False)

    model = Summarizer(args, device, checkpoint)
    # optim = model_builder.build_optim(args, model.reg, checkpoint)
    optim = model_builder.build_optim(args, model, checkpoint)
    # optim = BertAdam()
    logger.info(model)
    trainer = build_trainer(args, device_id, model, optim)
    #
    trainer.train(train_iter_fct, args.train_steps)
Example #29
    def _preprocess(self, params):
        corpus_type, json_file, args, save_file = params
        is_test = corpus_type == 'test'
        if (os.path.exists(save_file)):
            logger.info('Ignore %s' % save_file)
            return

        bert = PretrainData(args)
        logger.info('Processing %s' % json_file)
        with open(json_file) as f:
            jobs = json.load(f)
        datasets = []
        for d in jobs:
            source, tgt = d['src'], d['tgt']
            # disabled: optionally skip very short test documents
            # if is_test:
            #     if len(d['src']) < 5:
            #         continue
            if (args.lower):
                source = [' '.join(s).lower().split() for s in source]
                tgt = [' '.join(s).lower().split() for s in tgt]
            b_data = bert.preprocess(source, tgt, is_test=is_test)
            if (b_data is None):
                continue
            src_subtoken_idxs, tgt_subtoken_idxs, segments_ids, src_txt, tgt_txt = b_data

            b_data_dict = {"src": src_subtoken_idxs,
                           "tgt": tgt_subtoken_idxs,
                           "segs": segments_ids,
                           "example_id": d['example_id'],
                           "src_txt": src_txt,
                           "tgt_txt": tgt_txt}
            datasets.append(b_data_dict)

        logger.info('Processed instances %d' % len(datasets))
        logger.info('Saving to %s' % save_file)
        torch.save(datasets, save_file)
        datasets = []
        gc.collect()
Example #30
def format_to_lines(args):
    corpus_mapping = {}
    for corpus_type in ['valid', 'test', 'train']:
        temp = []
        for line in open(
                pjoin(args.map_path, 'mapping_' + corpus_type + '.txt')):
            temp.append(line.strip())
        corpus_mapping[corpus_type] = {key.strip(): 1 for key in temp}
    logger.info("txt read finished")
    train_files, valid_files, test_files = [], [], []
    for f in glob.glob(pjoin(args.raw_path, '*.json')):
        real_name = f.split('/')[-1].split('.')[0]
        if (real_name in corpus_mapping['valid']):
            valid_files.append(f)
        elif (real_name in corpus_mapping['test']):
            test_files.append(f)
        elif (real_name in corpus_mapping['train']):
            train_files.append(f)
        else:
            logger.info("not in mapping file", f)
            train_files.append(f)
    logger.info("data split over")
    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
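        # Stream formatted examples from the worker pool and flush a JSON shard whenever shard_size is exceeded.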
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset.append(d)
            if (len(dataset) > args.shard_size):
                pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path,
                                                       corpus_type, p_ct)
                with open(pt_file, 'w') as save:
                    # save.write('\n'.join(dataset))
                    save.write(json.dumps(dataset))
                    p_ct += 1
                    dataset = []

        pool.close()
        pool.join()
        if (len(dataset) > 0):
            pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type,
                                                   p_ct)
            with open(pt_file, 'w') as save:
                # save.write('\n'.join(dataset))
                save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []