Example #1
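    # What this example shows: a batch `predict` method that merges the call-time
    # options into `self.args`, builds a bucketed Dataset over `data`, runs
    # `self._predict`, attaches the returned fields to the dataset, optionally
    # saves the results to `pred`, and logs the throughput.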
    def predict(self,
                data,
                pred=None,
                buckets=8,
                batch_size=5000,
                prob=False,
                **kwargs):
        args = self.args.update(locals())
        init_logger(logger, verbose=args.verbose)

        self.transform.eval()
        if args.prob:
            self.transform.append(Field('probs'))

        logger.info("Load the data")
        dataset = Dataset(self.transform, data)
        dataset.build(args.batch_size, args.buckets, shuffle=False)
        logger.info(f"\n{dataset}")

        logger.info("Make predictions on the dataset")
        start = datetime.now()
        preds = self._predict(dataset.loader)
        elapsed = datetime.now() - start

        for name, value in preds.items():
            setattr(dataset, name, value)
        if pred is not None:
            logger.info(f"Save predicted results to {pred}")
            self.transform.save(pred, dataset.sentences)
        logger.info(
            f"{elapsed}s elapsed, {len(dataset) / elapsed.total_seconds():.2f} Sents/s"
        )

        return dataset
Example #2
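    # A variant of `predict` that forces a single bucket and pickles the raw
    # prediction dict to `args.predict_output_path`; the original code for
    # attaching, saving and timing the predictions is left commented out.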
    def predict(self, data, pred=None, buckets=8, batch_size=5000, prob=False, **kwargs):
        args = self.args.update(locals())
        init_logger(logger, verbose=args.verbose)

        self.transform.eval()

        logger.info("Load the data")
        # test = {'sentences': os.path.join(data, "Testing_InputSentences.pickle"),
        #          'edu_break': os.path.join(data, "Testing_EDUBreaks.pickle"),
        #          'golden_metric': os.path.join(data, "Testing_GoldenLabelforMetric.pickle")}
        # dataset = Dataset(self.transform, test)
        dataset = Dataset(self.transform, data)
        dataset.build(args.batch_size, n_buckets=1, shuffle=False)
        logger.info(f"\n{dataset}")
        logger.info(vars(dataset))

        logger.info("Make predictions on the dataset")
        start = datetime.now()
        preds = self._predict(dataset.loader)
        # Dump the raw prediction dict for later post-processing
        import pickle
        with open(args.predict_output_path, 'wb') as f:
            pickle.dump(preds, f)
        # elapsed = datetime.now() - start

        # for name, value in preds.items():
        #     setattr(dataset, name, value)
        # if pred is not None:
        #     logger.info(f"Save predicted results to {pred}")
        #     self.transform.save(pred, dataset.sentences)
        # logger.info(f"{elapsed}s elapsed, {len(dataset) / elapsed.total_seconds():.2f} Sents/s")

        return dataset
Example #3
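# Multi-GPU training driver: spawns one child process per GPU and registers each
# pid with an ErrorHandler that listens on a shared queue for child failures.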
def train_abs_multi(args):
    """ Spawns 1 process per GPU """
    init_logger()

    nb_gpu = args.world_size
    mp = torch.multiprocessing.get_context('spawn')

    # Create a thread to listen for errors in the child processes.
    error_queue = mp.SimpleQueue()
    error_handler = ErrorHandler(error_queue)

    # Train with multiprocessing.
    procs = []
    for i in range(nb_gpu):
        device_id = i
        procs.append(
            mp.Process(target=run,
                       args=(
                           args,
                           device_id,
                           error_queue,
                       ),
                       daemon=True))
        procs[i].start()
        logger.info(" Starting process pid: %d  " % procs[i].pid)
        error_handler.add_child(procs[i].pid)
    for p in procs:
        p.join()
Example #4
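    # Evaluation entry point: builds a bucketed Dataset over `data`, runs
    # `self._evaluate` on its loader, and logs the loss, metric and throughput.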
    def evaluate(self, data, buckets=8, batch_size=5000, **kwargs):
        args = self.args.update(locals())
        init_logger(logger, verbose=args.verbose)

        self.transform.train()
        logger.info("Load the data")
        dataset = Dataset(self.transform, data)
        dataset.build(args.batch_size, args.buckets)
        logger.info(f"\n{dataset}")

        logger.info("Evaluate the dataset")
        start = datetime.now()
        loss, metric = self._evaluate(dataset.loader)
        elapsed = datetime.now() - start
        logger.info(f"loss: {loss:.4f} - {metric}")
        logger.info(
            f"{elapsed}s elapsed, {len(dataset)/elapsed.total_seconds():.2f} Sents/s"
        )

        return loss, metric
Example #5
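# K-fold training driver: redirects stdout/stderr to a log file via init_logger,
# loads the model module and fold assignments, then trains one model per fold
# and writes out-of-fold and test predictions to compressed CSV files.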
def main(model_id, folds, debug):
    MODEL_ID = model_id

    # Script parameters
    MODEL_NAME = f"model_L1A_{MODEL_ID}"
    DATA_DIR = "data"
    PATH_TO_LOG = ".log"

    print('-' * 80)
    # Init logger
    if folds is None:
        log_filename = os.path.join(PATH_TO_ROOT, PATH_TO_LOG,
                                    MODEL_NAME + '.log')
    else:
        log_filename = os.path.join(PATH_TO_ROOT, PATH_TO_LOG,
                                    MODEL_NAME + f'_{folds}.log')
    orig_stdout, orig_stderr, sys.stdout, sys.stderr = init_logger(
        sys, log_filename, timestamp=True, verbose=True)
    print(f'Logged to file: {log_filename}')

    # Read model_module
    MM = exe.load_model_module(MODEL_NAME + "_module",
                               os.path.join(PATH_TO_ROOT, MODEL_MODULE_PATH))

    # Print information
    print('Executed with arguments:')
    print(MM.ARGS)
    print('-' * 80)

    # Read dataset
    path_to_data = os.path.join(PATH_TO_ROOT, DATA_DIR)
    dset_df, annot_dict = MM.get_dset(path_to_data)

    # Add folds
    dset_df = exe.load_kfolds(
        dset_df, os.path.join(PATH_TO_ROOT, DATA_DIR, KFOLDS_FILE))

    # Get list of fold_ids
    fold_ids = dset_df.fold.dropna().unique().tolist()
    fold_ids.sort()

    # Folds to train
    train_folds = fold_ids
    if folds is not None:
        train_folds = [s for s in train_folds if s in folds]

    print('-' * 80)
    print(
        subprocess.run(['nvidia-smi'],
                       stdout=subprocess.PIPE).stdout.decode('utf-8'))

    # Iterate folds
    for fold_id in train_folds:
        fold_id = [fold_id]

        print('-' * 40)
        offold_ids = [s for s in fold_ids if s not in fold_id]
        print(
            f"TRAINING FOLDS: '{','.join(offold_ids)}' TO PREDICT FOLD '{','.join(fold_id)}'"
        )

        # Generate datasets
        datasets = {
            'train': dset_df[dset_df.train & ~dset_df.fold.isin(fold_id)],
            # 'train': dset_df[dset_df.fold.isin(fold_id)],
            'valid': dset_df[dset_df.fold.isin(fold_id)],
            'fold': dset_df[dset_df.fold.isin(fold_id)],
            'test': dset_df[dset_df.test],
        }
        if debug:
            datasets['train'] = datasets['train'][0:len(datasets['valid'])]
        print(f"Training: {len(datasets['train']):,} | "
              f"Validation: {len(datasets['valid']):,} | "
              f"OOT-Fold: {len(datasets['fold']):,} | "
              f"Test: {len(datasets['test']):,}")

        # Get data loaders
        data_loaders = MM.get_dataloaders(path_to_data, datasets)

        # Get learner
        learner = MM.get_learner(annot_dict['nb_classes'])

        if 'original_class_id' in datasets['train'].columns:
            # Compare class counts to detect whether pseudo labels replaced the originals
            train_original = datasets['train'].groupby(
                'class_id').count().iloc[:, 0].values
            train_actual = datasets['train'].groupby(
                'original_class_id').count().iloc[:, 0].values
            if any(s1 != s2 for s1, s2 in zip(train_original, train_actual)):
                print("Using non-original (pseudo) labels for training!")

        # Train
        print('-' * 40 + " Training")
        epochs = MM.args.max_train_epochs if not debug else 1
        learner.train_loader(data_loaders, epochs=epochs)

        # Output_name
        ouput_name = MM.args.ouput_name if not debug else 'debug_' + MM.args.ouput_name

        # Predict Fold
        print('-' * 40 + " Predicting Fold")
        valid_preds = learner.predict_loader(data_loaders['valid'])
        valid_preds_df = pd.DataFrame(valid_preds, index=datasets['valid'].id)
        valid_preds_df.columns = [
            annot_dict['classId_to_name'][s1] for s1 in valid_preds_df.columns
        ]
        print(f"Dataset shape: {valid_preds_df.shape}")
        print(valid_preds_df.head(2))
        filepath = os.path.join(
            PATH_TO_ROOT, OUTPUTS_PATH,
            ouput_name + f"_fold_{','.join(fold_id)}.csv.gz")
        print(f"Saving FOLD predictions: {filepath}")
        valid_preds_df.to_csv(filepath, index=True)

        # Predict Test
        print('-' * 40 + " Predicting Test")
        test_preds = learner.predict_loader(data_loaders['test'])
        test_preds_df = pd.DataFrame(test_preds, index=datasets['test'].id)
        test_preds_df.columns = [
            annot_dict['classId_to_name'][s1] for s1 in test_preds_df.columns
        ]
        print(f"Dataset shape: {test_preds_df.shape}")
        print(test_preds_df.head(2))
        filepath = os.path.join(
            PATH_TO_ROOT, OUTPUTS_PATH,
            ouput_name + f"_test_{','.join(fold_id)}.csv.gz")
        print(f"Saving TEST predictions: {filepath}")
        test_preds_df.to_csv(filepath, index=True)

        learner.clean_memory()
        del learner
Example #6
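# Prediction-only counterpart of the training driver: for each fold it loads the
# saved model and writes out-of-fold and test predictions to compressed CSVs.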
def main(model_id, folds, debug, only_test):
    MODEL_ID = model_id

    # Script parameters
    MODEL_NAME = f"model_L1A_{MODEL_ID}"
    DATA_DIR = "data"
    PATH_TO_LOG = ".log"

    print('*' * 80)
    prefix = 'debug_' if debug else ''
    # Init logger
    if folds is None:
        log_filename = os.path.join(PATH_TO_ROOT, PATH_TO_LOG,
                                    prefix + MODEL_NAME + '_predict.log')
    else:
        log_filename = os.path.join(
            PATH_TO_ROOT, PATH_TO_LOG,
            prefix + MODEL_NAME + f'_{folds}_predict.log')
    orig_stdout, orig_stderr, sys.stdout, sys.stderr = init_logger(
        sys, log_filename, timestamp=True, verbose=True)
    print(f'Logged to file: {log_filename}')

    # Read model_module
    MM = exe.load_model_module(MODEL_NAME + "_module",
                               os.path.join(PATH_TO_ROOT, MODEL_MODULE_PATH))

    # Print information
    print('Executed with arguments:')
    print(MM.ARGS)
    print('-' * 80)

    # Read dataset
    path_to_data = os.path.join(PATH_TO_ROOT, DATA_DIR)
    dset_df = MM.get_dset(path_to_data)

    # Add folds
    dset_df = exe.load_kfolds(
        dset_df, os.path.join(PATH_TO_ROOT, DATA_DIR, KFOLDS_FILE))

    # Get list of fold_ids
    fold_ids = dset_df.fold.dropna().unique().tolist()
    fold_ids.sort()

    print('-' * 80)
    print(
        subprocess.run(['nvidia-smi'],
                       stdout=subprocess.PIPE).stdout.decode('utf-8'))

    # Folds to train
    train_folds = fold_ids
    if folds is not None:
        train_folds = [s for s in train_folds if s in folds]

    # Iterate folds
    for fold_id in train_folds:
        fold_id = [fold_id]

        print('-' * 40)
        offold_ids = [s for s in fold_ids if s not in fold_id]
        print(f"PREDICT FOLD '{','.join(fold_id)}'")

        # Generate datasets
        datasets = {
            'train': dset_df[dset_df.train & ~dset_df.fold.isin(fold_id)],
            'valid': dset_df[dset_df.train & dset_df.fold.isin(fold_id)
                             & dset_df.for_validation],
            'fold': dset_df[dset_df.train & dset_df.fold.isin(fold_id)],
            'test': dset_df[dset_df.test],
        }
        if debug:
            datasets['train'] = datasets['train'][0:len(datasets['valid'])]
        print(f"Training: {len(datasets['train']):,} | "
              f"Validation: {len(datasets['valid']):,} | "
              f"OOT-Fold: {len(datasets['fold']):,} | "
              f"Test: {len(datasets['test']):,}")

        # Get data loaders
        data_loaders = MM.get_dataloaders(path_to_data, datasets)

        # Output_name
        ouput_name = MM.args.ouput_name if not debug else 'debug_' + MM.args.ouput_name

        # Load model
        filename = os.path.join(PATH_TO_ROOT, MODELS_PATH,
                                ouput_name + f"_model_{','.join(fold_id)}.tar")
        learner = MM.load_model(filename)

        if not only_test:
            # Predict Fold
            print('-' * 40 + " Predicting Fold")
            valid_preds = learner.predict_loader(data_loaders['fold'])
            valid_preds_df = pd.DataFrame(valid_preds,
                                          index=datasets['fold'].image_id)
            valid_preds_df.columns = ['wind_speed']
            print(f"Dataset shape: {valid_preds_df.shape}")
            print(valid_preds_df.head(2))
            filepath = os.path.join(
                PATH_TO_ROOT, OUTPUTS_PATH,
                ouput_name + f"_fold_{','.join(fold_id)}.csv.gz")
            print(f"Saving FOLD predictions: {filepath}")
            valid_preds_df.to_csv(filepath, index=True)

        # Predict Test
        print('-' * 40 + " Predicting Test")
        test_preds = learner.predict_loader(data_loaders['test'])
        test_preds_df = pd.DataFrame(test_preds,
                                     index=datasets['test'].image_id)
        test_preds_df.columns = ['wind_speed']
        print(f"Dataset shape: {test_preds_df.shape}")
        print(test_preds_df.head(2))
        filepath = os.path.join(
            PATH_TO_ROOT, OUTPUTS_PATH,
            ouput_name + f"_test_{','.join(fold_id)}.csv.gz")
        print(f"Saving TEST predictions: {filepath}")
        test_preds_df.to_csv(filepath, index=True)

        learner.clean_memory()
        del learner
Example #7
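# Module-level setup plus a PDF helper: derives GPU ranks from --visible_gpus,
# initialises the logger, selects the device, and defines summarize_pdf on top
# of the PyPDF2 reader API.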
test_all = False
test_start_from = -1  # type=int

train_from = ''
report_rouge = True
block_trigram = True


args = parser.parse_args()  # parse the CLI flags; a bare ArgumentParser has no `visible_gpus` attribute

args.gpu_ranks = [int(i) for i in range(len(args.visible_gpus.split(',')))]
args.world_size = len(args.gpu_ranks)
os.environ["CUDA_VISIBLE_DEVICES"] = args.visible_gpus

init_logger(args.log_file)
device = "cpu" if args.visible_gpus == '-1' else "cuda"
device_id = 0 if device == "cuda" else -1


def summarize_pdf(pdf_file, sent_percentage):
    pdf_file_obj = open(pdf_file, 'rb')
    pdf_reader = PyPDF2.PdfFileReader(pdf_file_obj)
    title = pdf_reader.getDocumentInfo().title
    summary_title = "Summary"
    if title is not None:
        summary_title = title + ' - ' + summary_title
    num_of_pages = pdf_reader.numPages
    body = ''
    for i in range(num_of_pages):
        pageobj = pdf_reader.getPage(i)
        body += pageobj.extractText()  # assumed continuation: accumulate each page's text into body
Example #8
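    # Full training loop: builds train/dev/test Datasets, wraps the model in DDP
    # when distributed, sets up Adam plus an exponential or plateau LR schedule,
    # then trains with per-epoch evaluation, best-checkpoint saving on the dev
    # metric, and a final test evaluation of the best checkpoint.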
    def train(self,
              train,
              dev,
              test,
              buckets=32,
              batch_size=5000,
              lr=8e-4,
              mu=.9,
              nu=.9,
              epsilon=1e-12,
              clip=5.0,
              decay=.75,
              decay_steps=5000,
              step_decay_factor=0.5,
              step_decay_patience=15,
              epochs=5000,
              patience=100,
              verbose=True,
              **kwargs):
        args = self.args.update(locals())
        init_logger(logger, verbose=args.verbose)

        self.transform.train()
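        # in distributed runs, split the nominal batch size evenly across workers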
        if dist.is_initialized():
            args.batch_size = args.batch_size // dist.get_world_size()
        logger.info("Load the data")
        train = Dataset(self.transform, args.train, **args)
        dev = Dataset(self.transform, args.dev)
        test = Dataset(self.transform, args.test)
        train.build(args.batch_size, args.buckets, True, dist.is_initialized())
        dev.build(args.batch_size, args.buckets)
        test.build(args.batch_size, args.buckets)
        logger.info(
            f"\n{'train:':6} {train}\n{'dev:':6} {dev}\n{'test:':6} {test}\n")

        logger.info(f"{self.model}\n")
        if dist.is_initialized():
            self.model = DDP(self.model,
                             device_ids=[dist.get_rank()],
                             find_unused_parameters=True)
        self.optimizer = Adam(self.model.parameters(), args.lr,
                              (args.mu, args.nu), args.epsilon)
        if self.args.learning_rate_schedule == 'Exponential':
            self.scheduler = ExponentialLR(self.optimizer,
                                           args.decay**(1 / args.decay_steps))
        elif self.args.learning_rate_schedule == 'Plateau':
            self.scheduler = ReduceLROnPlateau(
                self.optimizer,
                'max',
                factor=args.step_decay_factor,
                patience=args.step_decay_patience,
                verbose=True)

        elapsed = timedelta()
        best_e, best_metric = 1, Metric()
        best_metric_test = Metric()
        for epoch in range(1, args.epochs + 1):
            start = datetime.now()

            logger.info(f"Epoch {epoch} / {args.epochs}:")
            loss = self._train(train.loader)
            logger.info(f"{'train:':6} - loss: {loss:.4f}")
            loss, dev_metric = self._evaluate(dev.loader)
            logger.info(f"{'dev:':6} - loss: {loss:.4f} - {dev_metric}")
            loss, test_metric = self._evaluate(test.loader)
            logger.info(f"{'test:':6} - loss: {loss:.4f} - {test_metric}")

            t = datetime.now() - start
            # save the model if it is the best so far
            if dev_metric > best_metric:
                best_e, best_metric = epoch, dev_metric
                dev_metric_name = '_dev_LP_{:.2f}_LR_{:.2f}_LF_{:.2f}.pt'.format(
                    100 * best_metric.lp, 100 * best_metric.lr,
                    100 * best_metric.lf)
                if is_master():
                    self.save(args.path + dev_metric_name)
                logger.info(f"{t}s elapsed (saved)\n")
                keep_last_n_checkpoint(args.path + '_dev_', n=5)
            else:
                logger.info(f"{t}s elapsed\n")
            elapsed += t
            if self.args.learning_rate_schedule == 'Plateau':
                self.scheduler.step(best_metric.score)

            # if epoch - best_e >= args.patience:
            #     break
        loss, metric = self.load(args.path)._evaluate(test.loader)

        logger.info(f"Epoch {best_e} saved")
        logger.info(f"{'dev:':6} - {best_metric}")
        logger.info(f"{'test:':6} - {metric}")
        logger.info(f"{elapsed}s elapsed, {elapsed / epoch}s/epoch")
Example #9
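# Single-GPU abstractive training: seeds the RNGs, optionally resumes from a
# checkpoint or initialises BERT weights from an extractive model, then builds
# the summarizer, optimizer(s), loss and trainer and runs training.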
def train_abs_single(args, device_id):
    init_logger(args.log_file)
    logger.info(str(args))
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    logger.info('Device ID %d' % device_id)
    logger.info('Device %s' % device)
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    if device_id >= 0:
        torch.cuda.set_device(device_id)
        torch.cuda.manual_seed(args.seed)

    if args.train_from != '':
        logger.info('Loading checkpoint from %s' % args.train_from)
        checkpoint = torch.load(args.train_from,
                                map_location=lambda storage, loc: storage)
        opt = vars(checkpoint['opt'])
        for k in opt.keys():
            if (k in model_flags):
                setattr(args, k, opt[k])
    else:
        checkpoint = None

    if (args.load_from_extractive != ''):
        logger.info('Loading bert from extractive model %s' %
                    args.load_from_extractive)
        bert_from_extractive = torch.load(
            args.load_from_extractive,
            map_location=lambda storage, loc: storage)
        bert_from_extractive = bert_from_extractive['model']
    else:
        bert_from_extractive = None
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

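    # factory returning a fresh training Dataloader on every call; it is passed
    # to the trainer below instead of a fixed loader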
    def train_iter_fct():
        return data_loader.Dataloader(args,
                                      load_dataset(args, 'train',
                                                   shuffle=True),
                                      args.batch_size,
                                      device,
                                      shuffle=True,
                                      is_test=False)

    model = AbsSummarizer(args, device, checkpoint, bert_from_extractive)
    if (args.sep_optim):
        optim_bert = model_builder.build_optim_bert(args, model, checkpoint)
        optim_dec = model_builder.build_optim_dec(args, model, checkpoint)
        optim = [optim_bert, optim_dec]
    else:
        optim = [model_builder.build_optim(args, model, checkpoint)]

    logger.info(model)

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              do_lower_case=True,
                                              cache_dir=args.temp_dir)
    symbols = {
        'BOS': tokenizer.vocab['[unused0]'],
        'EOS': tokenizer.vocab['[unused1]'],
        'PAD': tokenizer.vocab['[PAD]'],
        'EOQ': tokenizer.vocab['[unused2]']
    }

    train_loss = abs_loss(model.generator,
                          symbols,
                          model.vocab_size,
                          device,
                          train=True,
                          label_smoothing=args.label_smoothing)

    trainer = build_trainer(args, device_id, model, optim, train_loss)

    trainer.train(train_iter_fct, args.train_steps)