Example #1
def main(args):
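    # Fix the PyTorch RNG seed so data shuffling and weight init are reproducible.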
    torch.manual_seed(0)

    train_fname = args.dataset_path + 'train.csv'
    test_fname = args.dataset_path + 'test.csv'

    dataloaders = make_dataloader(train_fname, test_fname)

    model = VDCNN(depth=args.depth, num_class=args.num_class)

    run_model(model, dataloaders, args.num_epochs)
Example #2
def run_reranking(new_index, sentence_in, qid, specific_ws, ref_doc, out_index, texts,
                  new_trectext_name, ranked_lists, new_feature_file, feature_dir,
                  trec_file, score_file, options):
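    """Rerank the candidates after one sentence of ref_doc has been replaced.

    The modified document is written to a new trectext file, indexed with Indri,
    converted to learning-to-rank features, scored by the ranking model, and the
    scores are written to a TREC run file, which is ordered and returned.
    """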
    new_text = update_text_doc(texts[ref_doc], sentence_in, out_index)
    create_new_trectext(ref_doc, texts, new_text, new_trectext_name)
    create_specifi_ws(qid, ranked_lists, specific_ws)
    logger.info("creating features")
    create_index(new_trectext_name, os.path.dirname(new_index), os.path.basename(new_index),
                 options.home_path, options.indri_path)
    features_file = create_features_file_diff(feature_dir, options.index_path, new_index,
                                              new_feature_file, specific_ws, options.scripts_path,
                                              options.java_path, options.swig_path, options.stopwords_file,
                                              options.queries_text_file, options.home_path)
    logger.info("creating docname index")
    docname_index = create_index_to_doc_name_dict(features_file)
    logger.info("docname index creation is completed")
    logger.info("features creation completed")
    logger.info("running ranking model on features file")
    score_file = run_model(features_file, options.home_path, options.java_path, options.jar_path, score_file,
                           options.model)
    logger.info("ranking completed")
    logger.info("retrieving scores")
    scores = retrieve_scores(docname_index, score_file)
    logger.info("scores retrieval completed")
    logger.info("creating trec_eval file")
    tmp_trec = create_trec_eval_file(scores, trec_file)
    logger.info("trec file creation is completed")
    logger.info("ordering trec file")
    final = order_trec_file(tmp_trec)
    logger.info("ranking procedure completed")
    return final
Example #3
File: TTT_rl.py Project: alvinzz/rl
def ith_nn_strat(state):
    # Closure over i: play using the i-th stored past model.
    model = self.past_models[i]
    if model is None:
        # No snapshot saved for this slot yet, so fall back to random play.
        return TTT_random(state)
    else:
        # Sample a move from the model's policy (stochastic=True).
        return utils.run_model(model,
                               self.process_state(state),
                               stochastic=True)
Example #4
def main():
    config = utils.load_yaml(args.config)
    task = config['task']
    EPOCHS = config['epoch']
    N_FOLDS = 5
    BATCH_SIZE = config['batchsize']
    IMAGE_SIZE = config['image_size']
    model_name = config['model']
    optimizer_name = config['optimizer']
    loss = config['loss']
    lr = float(config['lr'])
    n_class = config['n_class']
    lr_scheduler = config.get('lr_scheduler')
    azure_run = None
    tb_writer = None
    num_workers = 64
    experiment_name = datetime.strftime(datetime.now(), '%Y%m%d%H%M%S')

    print(f'found {torch.cuda.device_count()} gpus !!')
    try:

        if args.debug:
            print('running in debug mode')
            EPOCHS = 1
            N_FOLDS = 2
        if args.debug:
            result_dir = Path(utils.RESULT_DIR) / ('debug-' + experiment_name)
        else:
            result_dir = Path(utils.RESULT_DIR) / experiment_name
            ws = Workspace.from_config('.aml_config/config.json')
            exp = Experiment(workspace=ws, name='kaggle-aptos2019')
            azure_run = exp.start_logging()
            azure_run.log('experiment name', experiment_name)
            azure_run.log('epoch', EPOCHS)
            azure_run.log('batch size', BATCH_SIZE)
            azure_run.log('image size', IMAGE_SIZE)
            azure_run.log('model', model_name)
            azure_run.log('optimizer', optimizer_name)
            azure_run.log('loss_name', loss['name'])
            azure_run.log('lr', lr)
            azure_run.log('lr_scheduler', lr_scheduler)
            azure_run.log('task', task)
            if args.cv:
                azure_run.log('cv', N_FOLDS)
            else:
                azure_run.log('cv', 0)

        if args.multi:
            print('use multi gpu !!')

        os.mkdir(result_dir)
        print(f'created: {result_dir}')
        utils.save_yaml(result_dir / Path(args.config).name, config)

        #         if not args.debug:
        #             tb_writer = SummaryWriter(log_dir=result_dir)

        device = torch.device("cuda:0")
        config = {
            'epochs': EPOCHS,
            'multi': args.multi,
            'batch_size': BATCH_SIZE,
            'image_size': IMAGE_SIZE,
            'model_name': model_name,
            'n_class': n_class,
            'optimizer_name': optimizer_name,
            'loss': loss,
            'lr': lr,
            'lr_scheduler': lr_scheduler,
            'task': task,
            'device': device,
            'num_workers': num_workers,
        }

        print(config)

        if not args.debug:
            slack.notify_start(experiment_name, config)
        train_df = pd.read_csv(utils.TRAIN_CSV_PATH)
        if args.debug:
            train_df = train_df[:1000]
        config['df'] = train_df

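        # Stratify the folds on the diagnosis label so each split keeps the class balance.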
        skf = StratifiedKFold(n_splits=N_FOLDS, random_state=41, shuffle=True)
        indices = list(skf.split(train_df, train_df['diagnosis']))
        if not args.cv:
            print('do not use cross validation')
            indices = [indices[0]]

        # cross validation
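        # Out-of-fold predictions, filled in as each fold's validation split is scored.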
        oof_preds = np.zeros((len(train_df), n_class))
        for i_fold, (train_index, valid_index) in tqdm(enumerate(indices)):
            model_path = result_dir / f'model_fold{i_fold}'
            config['train_index'] = train_index
            config['valid_index'] = valid_index
            config['model_path'] = str(model_path)
            if azure_run:
                if i_fold == 0:
                    config['azure_run'] = azure_run
                    y_pred, y_true = utils.run_model(**config)
                else:
                    with azure_run.child_run() as child:
                        config['azure_run'] = child
                        y_pred, y_true = utils.run_model(**config)
            else:
                y_pred, y_true = utils.run_model(**config)
            if args.cv:
                oof_preds[valid_index] = y_pred
        if args.cv:
            valid_preds = oof_preds
            valid_true = train_df['diagnosis']
        else:
            valid_preds = y_pred
            valid_true = y_true
        if task == 'class':
            round_valid_preds = np.argmax(valid_preds, axis=1)
        elif task == 'reg':
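            # Regression outputs are continuous; fit cut-off thresholds that map them to discrete grades.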
            print('optimizing threshold ...')
            optR = utils.OptimizedRounder()
            optR.fit(valid_preds, valid_true)
            coef = optR.coefficients()
            print(f'best coef: {coef}')
            if azure_run:
                azure_run.log('coef', coef)
            round_valid_preds = optR.predict(valid_preds, coef)
        val_kappa = cohen_kappa_score(round_valid_preds,
                                      valid_true,
                                      weights='quadratic')

        print(f'best val kappa: {val_kappa}')
        if azure_run:
            azure_run.log('best val kappa', val_kappa)

        test_csv = pd.read_csv(utils.TEST_CSV_PATH)
        #test_tfms = utils.build_transform(size=IMAGE_SIZE, mode='test')
        test_tfms = utils.build_transform(size=IMAGE_SIZE, mode='val')
        test_dataset = RetinopathyDataset(df=test_csv,
                                          mode='test',
                                          transform=test_tfms,
                                          auto_crop=True,
                                          add_blur=True)
        test_loader = torch.utils.data.DataLoader(test_dataset,
                                                  batch_size=BATCH_SIZE,
                                                  shuffle=False,
                                                  pin_memory=True,
                                                  num_workers=num_workers)

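        # Average each fold model's predictions on the test set (simple fold ensembling).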
        test_preds = np.zeros((len(test_csv), n_class))
        for i in range(len(indices)):
            model = utils.load_pytorch_model(model_name,
                                             result_dir / f'model_fold{i}',
                                             n_class)
            test_preds += utils.predict(model,
                                        test_loader,
                                        n_class=n_class,
                                        device=device,
                                        tta=1)
        test_preds /= len(indices)
        if task == 'class':
            round_test_preds = np.argmax(test_preds, axis=1)
        elif task == 'reg':
            round_test_preds = optR.predict(test_preds, coef)
        submission_csv = pd.read_csv(utils.SAMPLE_SUBMISSION_PATH)
        submission_csv['diagnosis'] = round_test_preds
        submission_csv.to_csv(result_dir / 'submission.csv', index=False)

        print('finish!!!')
        if not args.debug:
            slack.notify_finish(experiment_name, config, val_kappa)

    except KeyboardInterrupt as e:
        if not args.debug:
            slack.notify_fail(experiment_name, config, e.__class__.__name__,
                              str(e))
    except Exception as e:
        if azure_run:
            azure_run.fail(e)
        if not args.debug:
            slack.notify_fail(experiment_name, config, e.__class__.__name__,
                              str(e))
        raise
    finally:
        if azure_run:
            azure_run.complete()
            print('close azure_run')
        if tb_writer:
            tb_writer.export_scalars_to_json(
                os.path.join(result_dir, 'all_scalars.json'))
            tb_writer.close()
            print('close tb_writer')
Example #5
def reject_row(row):  # hypothetical name: the snippet's original `def` line and docstring header are missing
    """
        - row: pandas dataframe row

    Return:
        A modified pandas dataframe row with calculated columns

    """
    row['approve'] = '0'
    row['limit'] = '%s' % np.nan
    return row

if __name__ == '__main__':
    # maximum number of clients to approve
    max_approved = 1500

    # checking for fraud
    fraud_pred = run_model(FraudEnsemble, model_suffix='*_fraud_ensemble.bin')
    fraud_pred.set_index(['ids'], inplace=True)
    fraud_pred['fraud'] = fraud_pred['fraud'].apply(pd.to_numeric)
    
    # checking for default
    default_pred = run_model(DefaultEnsemble, model_suffix='*_default_ensemble.bin')
    default_pred.set_index(['ids'], inplace=True)
    default_pred['default'] = default_pred['default'].astype(float)
    default_pred['default'] = default_pred['default'].apply(pd.to_numeric)

    # checking for spend
    spend_pred = run_model(SpendDTR, model_suffix='*_spend_dtr.bin')
    spend_pred.set_index(['ids'], inplace=True)
    spend_pred['spend_score'] = spend_pred['spend_score'].astype(int)
    spend_pred['spend_score'] = spend_pred['spend_score'].apply(pd.to_numeric)
Example #6
File: TTT.py Project: alvinzz/rl
def nn_strategy(state):
    # stochastic=False: query the trained model deterministically (no sampling).
    return utils.run_model(model, state, stochastic=False)
Example #7
        K = []

        print("Computing kernels...")
        for name in [0, 1, 2]:
            X    = np.array(pd.read_csv(f'{args.data_folder}/Xtr{name}.csv')['seq'])
            X_ev = np.array(pd.read_csv(f'{args.data_folder}/Xte{name}.csv')['seq'])

            t0 = time.time()
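            # Mismatch-kernel Gram matrices: train vs. train and train vs. evaluation sequences.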
            K_tr = Mismatch_kernel(X, X, k=default_params['k'], m=default_params['m'])
            K_te = Mismatch_kernel(X, X_ev, k=default_params['k'], m=default_params['m'])
            print(f"Finished computing mismatch kernel for dataset {name}.")

            K.append({"train": K_tr, "eval": K_te})

        preds, _ = run_model('ksvm', kernel='', K=K, sequence=True, prop_test=0.2, default_params=default_params)

        write_csv(np.arange(preds.shape[0]), preds, args.savefile)

    # If MKL is used
    else:
        default_params = {"lamb": 25, "step": .05}

        kernel_params = [(7, 3), (8, 3)]

        K = [{"train": [], "eval": []} for _ in range(3)]

        print("Computing kernels...")
        for name in [0, 1, 2]:
            X    = np.array(pd.read_csv(f'{args.data_folder}/Xtr{name}.csv')['seq'])
            X_ev = np.array(pd.read_csv(f'{args.data_folder}/Xte{name}.csv')['seq'])
Example #8
# Default parameters #

weights = args.weights
hidden_units = int(args.hidden_units) if args.hidden_units else 60
batch_size = int(args.batch_size) if args.batch_size else 32
bidirectional = bool(args.bidirectional) if args.bidirectional else True
highway = bool(args.highway) if args.highway else False
self_attention = bool(args.self_attention) if args.self_attention else False
max_pooling = bool(args.max_pooling) if args.max_pooling else False
alignment = bool(args.alignment) if args.alignment else False
shortcut = bool(args.shortcut) if args.shortcut else False

######################

# Load character list #
with open("characters.pkl", 'rb') as char:
    characters = pickle.load(char)
#########################

model = load_model(hidden_units, bidirectional, highway, self_attention, max_pooling, alignment, shortcut)

two_fold = args.dataset_train == args.dataset_test

print("Loading train dataset...")   
dataset_train = load_dataset(args.dataset_train, characters, two_fold, False)
print("Loading test dataset...")
dataset_test = load_dataset(args.dataset_test, characters, False, two_fold)

run_model(model, weights, batch_size, dataset_train, dataset_test, two_fold)