Example #1
# Shared imports for the examples below; project-specific names
# (dataset_registry, MultiModalTextModel, TabularPredictor, TextPredictor,
# ag_text_presets, stat_df, args, metric helpers, and the model-name constants)
# are assumed to be provided by the surrounding benchmark code.
import json
import os
import time

import boto3
import cpuinfo
import h2o
import numpy as np
import pandas as pd
import sklearn.model_selection
from sklearn.feature_extraction.text import CountVectorizer


def estimate_importance(dataset, model_name):
    if os.path.exists(
            os.path.join('feature_importance', dataset, model_name,
                         'importance.csv')):
        print(f'Found {dataset}, {model_name}')
        return
    model_remote_path = stat_df.loc[model_name, dataset]
    postfix = '/test_score.json'

    remote_dir_name = model_remote_path[:-len(postfix)]

    def downloadDirectoryFroms3(bucketName, remoteDirectoryName,
                                local_dir_path):
        s3_resource = boto3.resource('s3')
        bucket = s3_resource.Bucket(bucketName)
        for obj in bucket.objects.filter(Prefix=remoteDirectoryName):
            print(obj.key)
            download_path = os.path.join(local_dir_path, obj.key)
            if not os.path.exists(os.path.dirname(download_path)):
                os.makedirs(os.path.dirname(download_path), exist_ok=True)
            bucket.download_file(obj.key, download_path)

    # `download_path` is assumed to be defined at module level (the inner
    # helper's parameter of the same name is local to that helper only).
    local_dir_name = os.path.join(download_path, remote_dir_name)
    if not os.path.exists(local_dir_name):
        downloadDirectoryFroms3('automl-mm-bench', remote_dir_name,
                                download_path)
    test_dataset = dataset_registry.create(dataset, 'test')
    if model_name == MULTIMODAL_TEXT_MODEL_NAME:
        predictor = MultiModalTextModel.load(
            os.path.join(local_dir_name, 'saved_model'))
    elif model_name in (TABULAR_MODEL_NAME, STACK_ENSEMBLE_MODEL_NAME):
        predictor = TabularPredictor.load(local_dir_name)
    else:
        raise NotImplementedError
    sample_size = min(len(test_dataset.data), 1000)
    if model_name == TABULAR_MODEL_NAME:
        importance_df = predictor.feature_importance(
            test_dataset.data[test_dataset.feature_columns +
                              test_dataset.label_columns],
            subsample_size=sample_size)
    else:
        importance_df = compute_permutation_feature_importance(
            test_dataset.data[test_dataset.feature_columns],
            test_dataset.data[test_dataset.label_columns[0]],
            predict_func=predictor.predict,
            eval_metric=get_metric(test_dataset.metric),
            subsample_size=sample_size,
            num_shuffle_sets=3)
    os.makedirs(os.path.join('feature_importance', dataset, model_name),
                exist_ok=True)
    importance_df.to_csv(
        os.path.join('feature_importance', dataset, model_name,
                     'importance.csv'))
    print(importance_df)
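

# A hedged driver sketch (not part of the original): it assumes the module-level
# `stat_df` is indexed by model name with one column per dataset, mirroring the
# `stat_df.loc[model_name, dataset]` lookup inside estimate_importance.
def estimate_all_importances():
    for model_name in stat_df.index:
        for dataset in stat_df.columns:
            estimate_importance(dataset, model_name)
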
def extract_pretrained_embedding(dataset):
    hyperparameters = ag_text_presets.create('default')
    hyperparameters['models']['MultimodalTextModel']['search_space'][
        'model.num_trainable_layers'] = 0
    hyperparameters['models']['MultimodalTextModel']['search_space'][
        'model._disable_update'] = True
    hyperparameters['models']['MultimodalTextModel']['search_space'][
        'optimization.num_train_epochs'] = 1
    hyperparameters['models']['MultimodalTextModel']['search_space'][
        'preprocessing.categorical.convert_to_text'] = True
    hyperparameters['models']['MultimodalTextModel']['search_space'][
        'optimization.lr'] = 0.0
    seed = 123
    train_dataset = dataset_registry.create(dataset, 'train')
    test_dataset = dataset_registry.create(dataset, 'test')
    train_data1, tuning_data1 = sklearn.model_selection.train_test_split(
        train_dataset.data,
        test_size=0.05,
        random_state=np.random.RandomState(seed))
    column_types, inferred_problem_type = infer_column_problem_types(
        train_data1,
        tuning_data1,
        label_columns=train_dataset.label_columns,
        problem_type=train_dataset.problem_type)
    text_feature_columns = [
        col_name for col_name in train_dataset.feature_columns
        if column_types[col_name] == 'text'
    ]
    train_text_only_data = train_dataset.data[text_feature_columns +
                                              train_dataset.label_columns]
    test_text_only_data = test_dataset.data[text_feature_columns +
                                            test_dataset.label_columns]
    sampled_train_data = train_text_only_data.sample(10)
    predictor = TextPredictor(label=train_dataset.label_columns[0])
    predictor.fit(train_data=sampled_train_data,
                  column_types=column_types,
                  hyperparameters=hyperparameters)
    train_features = predictor.extract_embedding(train_text_only_data)
    test_features = predictor.extract_embedding(test_text_only_data)
    save_base_dir = f'embeddings/{dataset}/pretrain_text_embedding'
    os.makedirs(save_base_dir, exist_ok=True)
    np.save(os.path.join(save_base_dir, 'train.npy'), train_features)
    np.save(os.path.join(save_base_dir, 'test.npy'), test_features)
    with open(os.path.join(save_base_dir, 'text_columns.json'), 'w') as out_f:
        json.dump(text_feature_columns, out_f)
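

# A minimal reload sketch (assumption: the on-disk layout written above, i.e.
# train.npy / test.npy / text_columns.json under
# embeddings/<dataset>/pretrain_text_embedding).
def load_pretrained_embedding(dataset):
    base_dir = f'embeddings/{dataset}/pretrain_text_embedding'
    train_features = np.load(os.path.join(base_dir, 'train.npy'))
    test_features = np.load(os.path.join(base_dir, 'test.npy'))
    with open(os.path.join(base_dir, 'text_columns.json'), 'r') as in_f:
        text_feature_columns = json.load(in_f)
    return train_features, test_features, text_feature_columns
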
def extract_finetuned_embedding(dataset, stat_df, verify_performance):
    model_l = stat_df['Unnamed: 0']
    multimodal_idx = None
    text_only_idx = None
    for i, model in enumerate(model_l):
        if model == MULTIMODAL_TEXT_MODEL_NAME:
            multimodal_idx = i
        elif model == TEXT_MODEL_NAME:
            text_only_idx = i
    if multimodal_idx is None or text_only_idx is None:
        raise NotImplementedError("Model not found!")

    multimodal_model_remote_path = stat_df[dataset].iloc[multimodal_idx]
    text_model_remote_path = stat_df[dataset].iloc[text_only_idx]
    postfix = '/test_score.json'

    multimodal_remote_dir_name = multimodal_model_remote_path[:-len(postfix)]
    text_remote_dir_name = text_model_remote_path[:-len(postfix)]
    print(multimodal_remote_dir_name)


    def downloadDirectoryFroms3(bucketName, remoteDirectoryName, local_dir_path):
        s3_resource = boto3.resource('s3')
        bucket = s3_resource.Bucket(bucketName)
        for obj in bucket.objects.filter(Prefix=remoteDirectoryName):
            print(obj.key)
            download_path = os.path.join(local_dir_path, obj.key)
            if not os.path.exists(os.path.dirname(download_path)):
                os.makedirs(os.path.dirname(download_path), exist_ok=True)
            bucket.download_file(obj.key, download_path)


    downloadDirectoryFroms3('automl-mm-bench', multimodal_remote_dir_name, args.model_download_path)
    downloadDirectoryFroms3('automl-mm-bench', text_remote_dir_name, args.model_download_path)

    # Multimodal Embedding
    multimodal_text_nn = MultiModalTextModel.load(os.path.join(args.model_download_path,
                                                               multimodal_remote_dir_name,
                                                               'saved_model'))
    print(multimodal_text_nn)
    with open(os.path.join(args.model_download_path, multimodal_remote_dir_name,
                           'test_score.json'), 'r') as in_f:
        model_test_score = json.load(in_f)
        multimodal_loaded_score_val = list(model_test_score.values())[0]
    train_dataset = dataset_registry.create(dataset, 'train')
    test_dataset = dataset_registry.create(dataset, 'test')
    train_features = multimodal_text_nn.extract_embedding(train_dataset.data)
    test_features = multimodal_text_nn.extract_embedding(test_dataset.data)
    if verify_performance:
        multimodal_pred_score = multimodal_text_nn.evaluate(test_dataset.data)
        assert multimodal_pred_score == multimodal_loaded_score_val,\
            f"MultiModalText NN: Predicted score={multimodal_pred_score}, " \
            f"Loaded score={multimodal_loaded_score_val}, " \
            f"Dataset={dataset}"
    os.makedirs(f'embeddings/{dataset}/multimodal_embedding', exist_ok=True)
    np.save(os.path.join(f'embeddings/{dataset}/multimodal_embedding', 'train.npy'), train_features)
    np.save(os.path.join(f'embeddings/{dataset}/multimodal_embedding', 'test.npy'), test_features)

    # Text Embedding
    text_nn = MultiModalTextModel.load(os.path.join(args.model_download_path,
                                                    text_remote_dir_name, 'saved_model'))
    with open(os.path.join(args.model_download_path, text_remote_dir_name,
                           'test_score.json'), 'r') as in_f:
        model_test_score = json.load(in_f)
        text_loaded_score_val = list(model_test_score.values())[0]
    train_text_embeddings = text_nn.extract_embedding(train_dataset.data)
    test_text_embeddings = text_nn.extract_embedding(test_dataset.data)
    if verify_performance:
        text_pred_score = text_nn.evaluate(test_dataset.data)
        assert text_pred_score == text_loaded_score_val,\
            f"Text-only network: Predicted score={text_pred_score}, " \
            f"Loaded score={text_loaded_score_val}, " \
            f"Dataset={dataset}"
    os.makedirs(f'embeddings/{dataset}/tuned_text_embedding', exist_ok=True)
    np.save(os.path.join(f'embeddings/{dataset}/tuned_text_embedding', 'train.npy'), train_text_embeddings)
    np.save(os.path.join(f'embeddings/{dataset}/tuned_text_embedding', 'test.npy'), test_text_embeddings)
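

# Hedged driver sketch: extract fine-tuned embeddings for every dataset column in
# `stat_df` (assumes the CSV layout read above, where 'Unnamed: 0' holds the model
# names and each remaining column is a dataset).
def extract_all_finetuned_embeddings(stat_df, verify_performance=False):
    for dataset in stat_df.columns:
        if dataset == 'Unnamed: 0':
            continue
        extract_finetuned_embedding(dataset, stat_df, verify_performance)
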
def train_model(dataset_name,
                text_presets,
                save_dir,
                model,
                tabular_presets,
                num_gpus=None,
                get_competition_results=False,
                seed=123):
    set_seed(seed)
    if get_competition_results:
        train_dataset = dataset_registry.create(dataset_name, 'train')
        test_dataset = dataset_registry.create(dataset_name, 'competition')
    else:
        train_dataset = dataset_registry.create(dataset_name, 'train')
        test_dataset = dataset_registry.create(dataset_name, 'test')
    feature_columns = train_dataset.feature_columns
    label_columns = train_dataset.label_columns
    metric = train_dataset.metric
    problem_type = train_dataset.problem_type
    train_data1, tuning_data1 = sklearn.model_selection.train_test_split(
        train_dataset.data,
        test_size=0.05,
        random_state=np.random.RandomState(seed))
    train_data = train_dataset.data
    test_data = test_dataset.data
    column_types, inferred_problem_type = infer_column_problem_types(
        train_data1,
        tuning_data1,
        label_columns=label_columns,
        problem_type=problem_type)
    train_data = train_data[feature_columns + label_columns]
    # tuning_data = tuning_data[feature_columns + label_columns]
    if not get_competition_results:
        test_data = test_data[feature_columns + label_columns]
    train_tic = time.time()
    if model == 'ag_tabular_quick':
        MAX_NGRAM = 300
        feature_generator = AutoMLPipelineFeatureGenerator(
            vectorizer=CountVectorizer(min_df=30,
                                       ngram_range=(1, 3),
                                       max_features=MAX_NGRAM,
                                       dtype=np.uint8))
        predictor = TabularPredictor(label=label_columns[0],
                                     path=save_dir,
                                     problem_type=problem_type)
        predictor.fit(train_data,
                      time_limit=30,
                      feature_generator=feature_generator)
    elif model == 'ag_tabular_without_text':
        no_text_feature_columns = []
        for col_name in feature_columns:
            if column_types[col_name] != _TEXT:
                no_text_feature_columns.append(col_name)
        train_data = train_data[no_text_feature_columns + label_columns]
        # tuning_data = tuning_data[no_text_feature_columns + label_columns]
        test_data = test_data[no_text_feature_columns + label_columns]
        predictor = TabularPredictor(path=save_dir,
                                     label=label_columns[0],
                                     problem_type=problem_type,
                                     eval_metric=metric)
        if tabular_presets in ['best_quality']:
            predictor.fit(train_data=train_data,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS,
                          presets=tabular_presets)
        elif tabular_presets == '5fold_1stack':
            predictor.fit(train_data=train_data,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS,
                          num_bag_folds=5,
                          num_stack_levels=1)
        elif tabular_presets == 'no':
            predictor.fit(train_data=train_data,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        else:
            raise NotImplementedError
    elif model == 'ag_tabular_old':
        predictor = TabularPredictor(path=save_dir,
                                     label=label_columns[0],
                                     problem_type=problem_type,
                                     eval_metric=metric)
        if tabular_presets == 'best_quality':
            predictor.fit(train_data=train_data,
                          presets=tabular_presets,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == '5fold_1stack':
            predictor.fit(train_data=train_data,
                          num_bag_folds=5,
                          num_stack_levels=1,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == 'no':
            predictor.fit(train_data=train_data,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        else:
            raise NotImplementedError
    elif model == 'ag_text_only':
        text_feature_columns = [
            col_name for col_name in feature_columns
            if column_types[col_name] == _TEXT
        ]
        train_data = train_data[text_feature_columns + label_columns]
        test_data = test_data[text_feature_columns + label_columns]
        predictor = TextPredictor(path=save_dir,
                                  label=label_columns[0],
                                  problem_type=problem_type,
                                  eval_metric=metric)
        hparams = ag_text_presets.create(text_presets)
        if len(train_data) > 500000:
            hparams = set_epoch3(hparams)
        predictor.fit(train_data=train_data,
                      hyperparameters=hparams,
                      num_gpus=num_gpus,
                      seed=seed)
    elif model == 'ag_text_multimodal':
        predictor = TextPredictor(path=save_dir,
                                  label=label_columns[0],
                                  problem_type=problem_type,
                                  eval_metric=metric)
        hparams = ag_text_presets.create(text_presets)
        if len(train_data) > 500000:
            hparams = set_epoch3(hparams)
        predictor.fit(train_data=train_data,
                      hyperparameters=hparams,
                      num_gpus=num_gpus,
                      seed=seed)
    elif model in ('pre_embedding', 'tune_embedding_multimodal',
                   'tune_embedding_text'):
        feature_generator = AutoMLPipelineFeatureGenerator(
            enable_text_special_features=False,
            enable_text_ngram_features=False)
        pre_embedding_folder = os.path.join(_CURR_DIR,
                                            'pre_computed_embeddings')
        if model == 'pre_embedding':
            train_features = np.load(
                os.path.join(pre_embedding_folder, dataset_name,
                             'pretrain_text_embedding', 'train.npy'))
            test_features = np.load(
                os.path.join(pre_embedding_folder, dataset_name,
                             'pretrain_text_embedding', 'test.npy'))
        elif model == 'tune_embedding_multimodal':
            train_features = np.load(
                os.path.join(pre_embedding_folder, dataset_name,
                             'multimodal_embedding', 'train.npy'))
            test_features = np.load(
                os.path.join(pre_embedding_folder, dataset_name,
                             'multimodal_embedding', 'test.npy'))
        elif model == 'tune_embedding_text':
            train_features = np.load(
                os.path.join(pre_embedding_folder, dataset_name,
                             'tuned_text_embedding', 'train.npy'))
            test_features = np.load(
                os.path.join(pre_embedding_folder, dataset_name,
                             'tuned_text_embedding', 'test.npy'))
        else:
            raise NotImplementedError
        train_data = train_data.join(
            pd.DataFrame(train_features,
                         columns=[
                             f'pre_feat{i}'
                             for i in range(train_features.shape[1])
                         ]))
        train_data.reset_index(drop=True, inplace=True)
        test_data = test_data.join(
            pd.DataFrame(test_features,
                         columns=[
                             f'pre_feat{i}'
                             for i in range(test_features.shape[1])
                         ]))
        test_data.reset_index(drop=True, inplace=True)
        predictor = TabularPredictor(path=save_dir,
                                     label=label_columns[0],
                                     problem_type=problem_type,
                                     eval_metric=metric)
        if tabular_presets == 'best_quality':
            predictor.fit(train_data=train_data,
                          presets=tabular_presets,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == '5fold_1stack':
            predictor.fit(train_data=train_data,
                          num_bag_folds=5,
                          num_stack_levels=1,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == 'no':
            predictor.fit(train_data=train_data,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        else:
            raise NotImplementedError

    elif model in ('tabular_multimodal', 'tabular_multimodal_just_table'):
        if model == 'tabular_multimodal':
            MAX_NGRAM = 300
            feature_generator = AutoMLPipelineFeatureGenerator(
                vectorizer=CountVectorizer(min_df=30,
                                           ngram_range=(1, 3),
                                           max_features=MAX_NGRAM,
                                           dtype=np.uint8),
                enable_raw_text_features=True)
            hyperparameters = get_multimodal_tabular_hparam_just_gbm(
                text_presets=text_presets)
        else:
            MAX_NGRAM = 300
            feature_generator = AutoMLPipelineFeatureGenerator(
                vectorizer=CountVectorizer(min_df=30,
                                           ngram_range=(1, 3),
                                           max_features=MAX_NGRAM,
                                           dtype=np.uint8),
                enable_raw_text_features=True,
                enable_text_special_features=False,
                enable_text_ngram_features=False)
            hyperparameters = multimodal_tabular_just_table_hparam(
                text_presets=text_presets)
        predictor = TabularPredictor(path=save_dir,
                                     label=label_columns[0],
                                     problem_type=problem_type,
                                     eval_metric=metric)
        if tabular_presets == 'best_quality':
            predictor.fit(train_data=train_data,
                          presets=tabular_presets,
                          hyperparameters=hyperparameters,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == '5fold_1stack':
            predictor.fit(train_data=train_data,
                          num_bag_folds=5,
                          num_stack_levels=1,
                          hyperparameters=hyperparameters,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == '3fold_1stack':
            predictor.fit(train_data=train_data,
                          num_bag_folds=3,
                          num_stack_levels=1,
                          hyperparameters=hyperparameters,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == 'no':
            predictor.fit(train_data=train_data,
                          hyperparameters=hyperparameters,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        else:
            raise NotImplementedError
    else:
        raise NotImplementedError
    train_toc = time.time()
    inference_tic = time.time()
    predictions = predictor.predict(test_data, as_pandas=True)
    predictor.save()
    inference_toc = time.time()
    if problem_type == MULTICLASS or problem_type == BINARY:
        prediction_prob = predictor.predict_proba(test_data, as_pandas=True)
        prediction_prob.to_csv(
            os.path.join(save_dir, 'test_prediction_prob.csv'))
    predictions.to_csv(os.path.join(save_dir, 'test_prediction.csv'))
    gt = test_data[label_columns[0]]
    gt.to_csv(os.path.join(save_dir, 'ground_truth.csv'))
    if not get_competition_results:
        score = predictor.evaluate(test_data)
        with open(os.path.join(save_dir, 'test_score.json'), 'w') as of:
            json.dump({metric: score}, of)
    with open(os.path.join(save_dir, 'speed_stats.json'), 'w') as of:
        json.dump(
            {
                'train_time': train_toc - train_tic,
                'inference_time': inference_toc - inference_tic,
                'cpuinfo': cpuinfo.get_cpu_info()
            }, of)
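

# Hedged usage sketch; the dataset, preset, and save_dir values below are
# illustrative picks from the options handled by train_model, not a prescribed
# configuration.
def run_example_training():
    train_model(dataset_name='product_sentiment_machine_hack',
                text_presets='default',
                save_dir='results/product_sentiment_machine_hack/tabular_multimodal',
                model='tabular_multimodal',
                tabular_presets='5fold_1stack',
                num_gpus=1)
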
    "product_sentiment_machine_hack",
    "google_qa_question_type_reason_explanation",
    "google_qa_answer_type_reason_explanation", "women_clothing_review",
    "melbourne_airbnb", "ae_price_prediction", "mercari_price_suggestion100K",
    "jigsaw_unintended_bias100K", "imdb_genre_prediction",
    "fake_job_postings2", "kick_starter_funding", "jc_penney_products",
    "wine_reviews", "news_popularity2", "news_channel"
]

embedding_type = 'pretrain_text_embedding'
base_dir = 'embeddings'
out_dir = 'pre_embedding_data'

for dataset in dataset_l:
    print('Processing', dataset)
    train_dataset = dataset_registry.create(dataset, 'train')
    test_dataset = dataset_registry.create(dataset, 'test')
    train_features = np.load(
        os.path.join(base_dir, dataset, embedding_type, 'train.npy'))
    test_features = np.load(
        os.path.join(base_dir, dataset, embedding_type, 'test.npy'))
    with open(
            os.path.join(base_dir, dataset, embedding_type,
                         'text_columns.json'), 'r') as in_f:
        text_columns = json.load(in_f)
    other_columns = [
        col for col in train_dataset.feature_columns if col not in text_columns
    ]
    train_data = train_dataset.data[other_columns +
                                    train_dataset.label_columns]
    test_data = test_dataset.data[other_columns + test_dataset.label_columns]
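

# A possible continuation for the loop above (an assumption, not part of the
# original): join the embedding columns onto the non-text features and persist
# the combined tables under `out_dir`, mirroring the 'pre_embedding' branch of
# train_model. It could be called at the end of each loop iteration.
def join_and_save_embeddings(dataset, train_data, test_data,
                             train_features, test_features, out_dir):
    emb_columns = [f'pre_feat{i}' for i in range(train_features.shape[1])]
    train_out = train_data.reset_index(drop=True).join(
        pd.DataFrame(train_features, columns=emb_columns))
    test_out = test_data.reset_index(drop=True).join(
        pd.DataFrame(test_features, columns=emb_columns))
    os.makedirs(os.path.join(out_dir, dataset), exist_ok=True)
    train_out.to_csv(os.path.join(out_dir, dataset, 'train.csv'), index=False)
    test_out.to_csv(os.path.join(out_dir, dataset, 'test.csv'), index=False)
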
def train_baseline(dataset_name, save_dir, baseline,
                   time_limit_sec=None, w2v_epochs=None, embed_dir=None, seed=123):
    set_seed(seed)
    train_dataset = dataset_registry.create(dataset_name, 'train')
    test_dataset = dataset_registry.create(dataset_name, 'test')
    feature_columns = train_dataset.feature_columns
    label_columns = train_dataset.label_columns
    label_column = label_columns[0]
    eval_metric = train_dataset.metric
    problem_type = train_dataset.problem_type
    train_data = train_dataset.data
    test_data = test_dataset.data
    train_data = train_data[feature_columns + label_columns]
    test_data = test_data[feature_columns + label_columns]
    print("Running: ",baseline)

    # Train baseline:
    train_tic = time.time()
    if baseline == 'h2o_word2vec':
        h2o.init()
        w2v_model = train_w2v(train_data, epochs=w2v_epochs, save_dir=save_dir)
        train_data = process_w2v(train_data, w2v_model)
        test_data = process_w2v(test_data, w2v_model)
    elif baseline == 'h2o_embedding':
        train_data, test_data = get_embedded(train_data, test_data, dataset_name, embed_dir=embed_dir)

    print("Train/test data shapes: ")
    print(train_data.shape)
    print(test_data.shape)
    h2o_model = H2OBaseline()
    num_models_trained, fit_time = h2o_model.fit(train_data=train_data,
                                                 label_column=label_column,
                                                 problem_type=problem_type,
                                                 eval_metric=eval_metric,
                                                 time_limit_sec=time_limit_sec,
                                                 output_directory=save_dir)
    train_toc = time.time()
    print("H2O fit runtime: %s" % fit_time)

    # Predict with baseline:
    inference_tic = time.time()
    y_pred, y_prob, predict_time = h2o_model.predict(test_data.drop(columns=[label_column]),
                                                     pred_class_and_proba=True)
    inference_toc = time.time()
    print("H2O predict runtime: %s" % predict_time)

    # Evaluate predictions:
    # class_order = h2o_model.classes
    preds_toevaluate = y_pred
    if eval_metric is not None:
        if eval_metric == 'roc_auc':
            preds_toevaluate = y_prob.iloc[:,1]
        elif eval_metric == 'log_loss':
            preds_toevaluate = y_prob

    gt = test_data[label_column]
    gt.to_csv(os.path.join(save_dir, 'ground_truth.csv'))
    y_pred.to_csv(os.path.join(save_dir, 'h2o_test_prediction.csv'))
    if problem_type == MULTICLASS or problem_type == BINARY:
        y_prob.to_csv(os.path.join(save_dir, 'h2o_test_prediction_prob.csv'))
    if len(gt) != len(y_pred):
        print("WARNING: lengths of gt and y_pred don't match!")
        print("len(gt):", len(gt))
        print("len(y_pred):", len(y_pred))
        print("test_data.shape:", test_data.shape)
        if len(y_pred) > len(gt):
            print("WARNING: truncating predictions to the length of the test labels ...")
            y_pred = y_pred[:len(gt)]
            y_prob = y_prob[:len(gt)]

    scorer = TabularPredictor(label=label_column, problem_type=problem_type, eval_metric=eval_metric)
    # scorer.fit(train_data.sample(200), hyperparameters={'GBM': {'num_boost_round': 1}}, presets='ignore_text')
    score = scorer._learner.eval_metric(gt, preds_toevaluate)
    print("H2O score: ", score)
    with open(os.path.join(save_dir, 'test_score.json'), 'w') as of:
        json.dump({eval_metric: score}, of)
    with open(os.path.join(save_dir, 'speed_stats.json'), 'w') as of:
        json.dump({'train_time': train_toc - train_tic,
                   'inference_time': inference_toc - inference_tic,
                   'cpuinfo': cpuinfo.get_cpu_info()}, of)
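

# Hedged usage sketch; the baseline name, save_dir, and budgets below are
# illustrative values for the two baselines handled by train_baseline.
def run_example_baseline():
    train_baseline(dataset_name='product_sentiment_machine_hack',
                   save_dir='results/product_sentiment_machine_hack/h2o_word2vec',
                   baseline='h2o_word2vec',
                   time_limit_sec=3600,
                   w2v_epochs=10)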