Example #1
def create_feature_value_category(data):
    """Categorizer of transaction value based on
        distance to the average transaction value of whole dataframe.

    Distances:
        - 3: at least 100 times bigger than average.
        - 2: at least 10 times bigger than average.
        - 1: at least 2 times bigger than average.
        - 0: if none of the previous conditions are satisfied.

    Args:
        data (spark dataframe): input spark data frame.
    """
    utils.save_log('{0} :: {1}'.format(
        create_feature_value_category.__module__,
        create_feature_value_category.__name__))

    avg_value = data.agg({'Value': 'avg'}).collect()[0][0]
    # Thresholds aligned with the docstring tiers (100x, 10x, 2x average).
    data = data. \
        withColumn('ValueStrategy',
                   when(col('Value') > avg_value * 100, 3).
                   when(col('Value') > avg_value * 10, 2).
                   when(col('Value') > avg_value * 2, 1).
                   otherwise(0))

    update_list_features("numerical", ["ValueStrategy"])

    return data
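For context, a minimal self-contained sketch of the same tiering, assuming pyspark is installed; the toy frame and its values are illustrative, not from the original project:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1.0,), (30.0,), (5000.0,)], ['Value'])
avg_value = df.agg({'Value': 'avg'}).collect()[0][0]
df.withColumn('ValueStrategy',
              when(col('Value') > avg_value * 100, 3).
              when(col('Value') > avg_value * 10, 2).
              when(col('Value') > avg_value * 2, 1).
              otherwise(0)).show()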
Example #2
def smotenc_over_sampler(X_data,
                         y_data,
                         categorical_features_dims):
    """Generate oversampling for training data set using SMOTENC technique.

    Args:
        X_data (pandas data frame):
        y_data (pandas vector):
        categorical_features_dims (list):

    Returns:
        X and Y datasets balanced
    """
    utils.save_log('{0} :: {1}'.format(
        smotenc_over_sampler.__module__,
        smotenc_over_sampler.__name__))

    model = SMOTENC(categorical_features=categorical_features_dims,
                    random_state=config.random_seed,
                    n_jobs=config.num_jobs)

    X, y = model.fit_resample(X_data, y_data)

    X_smotenc = pandas.DataFrame(X,
                                 columns=features_engineering.features_list)
    y_smotenc = pandas.DataFrame(y,
                                 columns=[features_engineering.target_label])

    return X_smotenc, y_smotenc
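A hedged usage sketch of the underlying sampler; the column names, toy data, and k_neighbors value are illustrative assumptions, not from the repository:

import pandas
from imblearn.over_sampling import SMOTENC

df = pandas.DataFrame({'Amount': [10, 20, 30, 40, 50, 60],
                       'ChannelId': [0, 1, 0, 1, 0, 1],
                       'FraudResult': [0, 0, 0, 0, 1, 1]})
dims = [df.columns.get_loc('ChannelId')]  # positional indices of categoricals
sampler = SMOTENC(categorical_features=dims, random_state=42, k_neighbors=1)
X_bal, y_bal = sampler.fit_resample(df[['Amount', 'ChannelId']],
                                    df['FraudResult'])
print(y_bal.value_counts())  # classes are now balanced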
Example #3
def plot_heatmap(data_set):
    """Plot heatmap visualization using Seaborn library.

    Args:
        data_set (Pandas data frame): data set with the features to be
            plotted.
    """
    utils.save_log('{0} :: {1}'.format(plot_heatmap.__module__,
                                       plot_heatmap.__name__))

    columns_to_print = deepcopy(features_engineering.features_list)
    columns_to_print.append(features_engineering.target_label)

    corr_matrix = data_set[columns_to_print].corr()
    k = 70  # number of variables for heat-map
    cols = corr_matrix.nlargest(
        k, features_engineering.target_label)[
        features_engineering.target_label].index
    cm = numpy.corrcoef(data_set[cols].values.T)
    sns.set(font_scale=1.25, rc={'figure.figsize': (15, 15)})
    sns.heatmap(cm,
                cbar=True,
                annot=True,
                square=True,
                fmt='.3f',
                annot_kws={'size': 8},
                yticklabels=cols.values,
                xticklabels=cols.values)
    plt.show()
Example #4
def create_model(iterations=5000,
                 depth_tree=4,
                 learning_rate=0.0135,
                 reg_l2=2,
                 evaluation_metric='F1'):
    """Create a CatBoost model.

    Args:
        iterations
        depth_tree (int):
        learning_rate (int):
        reg_l2 (int):
        evaluation_metric (str):

    Returns:
        model: Isolation Forest model
    """
    utils.save_log('{0} :: {1}'.format(create_model.__module__,
                                       create_model.__name__))

    model = CatBoostClassifier(iterations=iterations,
                               depth=depth_tree,
                               learning_rate=learning_rate,
                               l2_leaf_reg=reg_l2,
                               eval_metric=evaluation_metric,
                               task_type=config.device_type,
                               random_seed=config.random_seed)

    return model
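A hedged smoke test of this factory on synthetic data (it assumes config.device_type resolves to 'CPU' and config.random_seed to an int; the toy matrix and iteration count are illustrative):

model = create_model(iterations=50)
model.fit([[0, 1], [1, 0], [0, 0], [1, 1]], [0, 1, 0, 1], verbose=False)
print(model.predict([[1, 0]]))  # labels follow the first feature, so expect 1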
Example #5
def train(data,
          features_columns_list,
          label_column,
          percentage_of_outliers,
          output_file_name='../data/model_lscp'):
    """Fit the LSCP model using the training data.
        The model weights are saved in output file.

    Args:
        data (Pandas dataframe): a matrix dataframe
        features_columns_list: list of columns to use in the train
        label_column: column name fraud identification
        percentage_of_outliers: percentage of fraud on data
        output_file_name: output file name to export IF model

    Returns:
        model: LSCP model
    """
    utils.save_log('{0} :: {1}'.format(
        train.__module__,
        train.__name__))

    if os.path.isfile(output_file_name):
        utils.save_log('Loading LSCP model.')
        with open(output_file_name, 'rb') as pickle_file:
            model = pickle.load(pickle_file)
        return model

    model = create_model(percentage_of_outliers=percentage_of_outliers)
    model.fit(data[features_columns_list], data[label_column])

    with open(output_file_name, 'wb') as file_model:
        pickle.dump(model, file_model)

    return model
Example #6
def create_model(percentage_of_outliers=0.002):
    """Create a LSCP model.

    Args:
        percentage_of_outliers: percentage of fraud on data

    Returns:
        model: LSCP model
    """
    utils.save_log('{0} :: {1}'.format(
        create_model.__module__,
        create_model.__name__))

    bagging_model = \
        get_model_bagging(percentage_of_outliers=percentage_of_outliers)

    lof_model = \
        get_model_lof(percentage_of_outliers=percentage_of_outliers)

    cblof_model = \
        get_model_cblof(percentage_of_outliers=percentage_of_outliers)

    list_of_detectors = [bagging_model, lof_model, cblof_model]
    model = LSCP(detector_list=list_of_detectors,
                 contamination=percentage_of_outliers)

    return model
Example #7
def outliers_with_isolation_forest(data, features_columns_list,
                                   label_column=None,
                                   percentage_of_outliers=None):
    """Usage of Isolation Forest model to predict outliers into the data

    Args:
        data (Pandas dataframe): a matrix dataframe
        features_columns_list: list of column names (list of features)
        label_column: target column name
        percentage_of_outliers: percentage of fraudulent items in the data

    Returns:
        data: dataframe with Isolation Forest outlier column
    """
    utils.save_log('{0} :: {1}'.format(
        outliers_with_isolation_forest.__module__,
        outliers_with_isolation_forest.__name__))

    if label_column is not None:
        isolation_forest.train(data, features_columns_list, label_column,
                               percentage_of_outliers)

        predictions = \
            isolation_forest.predict(data[features_columns_list])
    else:
        predictions = isolation_forest.predict(data)

    data['IsolationForest'] = \
        isolation_forest.normalize_vector(predictions)

    return data
Example #8
def train(X_data,
          y_data,
          categorical_features_list,
          cat_boost_file_name='../data/catBoost_model'):
    """Fit the CatBoost model using the training data.
        The model weights are saved in output file.

    Args:
        X_data: a matrix dataframe
        y_data: column outcome value to use in the train
        categorical_features_list: categorical features in X_data
        cat_boost_file_name: file name to export the trained model

    Returns:
        model: CatBoost model
    """
    utils.save_log('{0} :: {1}'.format(train.__module__, train.__name__))

    model_cat_boost = create_model()

    model_cat_boost.fit(X_data,
                        y_data,
                        verbose=False,
                        plot=True,
                        cat_features=categorical_features_list)

    model_cat_boost.save_model(fname=cat_boost_file_name)

    return model_cat_boost
Example #9
def test(*args, **kwargs):
    """Execute:
    $ python main.py \
    test \
    --path_data_test ../dataset/laser/ \
    --output_path ../output/ \
    --mode offline
    """
    utils.save_log(f'{test.__module__} :: {test.__name__}')

    model = system_configurator.set_network(kwargs['output_path'])

    img_label, X_img, y_depth, y_axis, y_quat = \
        data_engineering.extract_data(sys.argv[1],
                                      kwargs['path_data_test'])

    outcomes = model.predict(X_img)

    exporter.export_pose_prediction(img_label,
                                    y_axis,
                                    y_quat,
                                    outcomes,
                                    kwargs['output_path'])

    # print/save depth original and depth prediction
    exporter.export_depth_prediction(y_depth, outcomes, kwargs['output_path'])
Example #10
    def fit(self):
        if self.make_new_dir:
            # Make a folder to save model
            model_path = os.path.join(self.model_dir, self.model_prefix)
            if not os.path.isdir(model_path):
                os.mkdir(model_path)

            model_full_path = os.path.join(model_path, datetime.now().strftime('%Y_%m_%d_%H:%M:%S'))
            if not os.path.isdir(model_full_path):
                os.mkdir(model_full_path)
        else:
            model_full_path = self.finetune_model

        # Save config in model folder
        config_name = 'train_' + datetime.now().strftime('%Y_%m_%d_%H:%M:%S') + '.cfg'
        with open(os.path.join(model_full_path, config_name), 'w') as f:
            self.config.write(f)
        utils.save_log(model_full_path)     # Save event log

        # Build mxnet model and train
        checkpoint = mx.callback.do_checkpoint(os.path.join(model_full_path, 'test_v0'))
        model = self.build_model()

        train, val = self.get_data_iter(self.train_list_path, self.val_list_path, self.rois_dir, self.rois_siamese_dir,
                                        self.label_dir, self.image_size, self.batch_size, self.multi_thread, 'mode')

        eval_metric = CompositeEvalMetric(metrics=[Loss()])
        call_back = utils.get_callback(3)

        model.fit(
            X=train,
            eval_data=val,
            eval_metric=eval_metric,
            epoch_end_callback=checkpoint,
            batch_end_callback=call_back
        )
Example #11
def extract_data_from_file(method, path):
    utils.save_log(f'{extract_data_from_file.__module__} :: '
                   f'{extract_data_from_file.__name__}')

    images_name = []
    images_rgb = []
    images_depth = []
    images_label_axis = []
    images_label_quat = []

    file = 'dataset_train.txt' \
        if (method == 'train') \
        else 'dataset_test.txt'

    with open(path + file) as f:
        for line in f:
            (image_name, pos_x, pos_y, pos_z, quat_w, quat_p, quat_q,
             quat_r) = line.split()
            images_name.append(image_name)
            images_rgb.append(path + image_name + '.jpg')
            images_depth.append(
                path + image_name.replace('rgb', 'depth') + '.jpg')
            images_label_axis.append(
                (float(pos_x), float(pos_y), float(pos_z)))
            images_label_quat.append(
                (float(quat_w), float(quat_p), float(quat_q), float(quat_r)))

    return images_name, \
        images_rgb, \
        images_depth, \
        images_label_axis, \
        images_label_quat
Example #12
def outliers_with_lscp(data, features_columns_list, label_column=None,
                       percentage_of_outliers=None):
    """Use the LSCP model to predict outliers in the data.

    Args:
        data (Pandas dataframe): a matrix dataframe
        features_columns_list: list of column names (list of features)
        label_column: target column name
        percentage_of_outliers: percentage of fraudulent items in the data

    Returns:
        data: dataframe with LSCP outlier column
    """
    utils.save_log('{0} :: {1}'.format(outliers_with_lscp.__module__,
                                       outliers_with_lscp.__name__))

    if label_column is not None:
        lscp.train(data, features_columns_list, label_column,
                   percentage_of_outliers)

        predictions = lscp.predict(data[features_columns_list])
    else:
        predictions = lscp.predict(data)

    data['LSCP'] = predictions

    return data
Example #13
def balance_data_set(X_data,
                     y_data,
                     categorical_features_dims):
    """Usage of KNN model to predict outliers into the data

    Args:
        X_data: a matrix dataframe
        y_data: list of column names (list of features)
        categorical_features_dims: target column name

     Returns:
        Dataframe with KNN outlier column
    """
    utils.save_log('{0} :: {1}'.format(
        balance_data_set.__module__,
        balance_data_set.__name__))

    X_data_oversampled, y_data_oversampled = \
        smotenc_over_sampler(X_data,
                             y_data,
                             categorical_features_dims)

    X_data_oversampled = pandas.DataFrame(X_data_oversampled)
    y_data_oversampled = pandas.DataFrame(y_data_oversampled)

    return X_data_oversampled, y_data_oversampled
Example #14
def create_feature_average_value_for_category(data, item):
    """Create feature based on average value transaction per
        all aggregated by item.

    Args:
        data (spark dataframe): input spark data frame.
        item: type of attribute used to aggregate the data and
        compute the average.

    Returns:
        data (spark data frame): output spark data frame with
        the new feature created.
    """
    utils.save_log('{0} :: {1}'.format(
        create_feature_average_value_for_category.__module__,
        create_feature_average_value_for_category.__name__))

    column_name = 'AverageValuePer{0}'.format(item)
    aux = data.select([item, config.feature_column_value[0]]).\
        groupBy(item).\
        mean()
    aux = aux.select(col(item), col('avg(Value)').alias(column_name))
    data = data.join(aux, on=item)
    update_list_features("numerical", [column_name])
    return data
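For context, Spark names the aggregate column literally avg(Value), which is what the alias step above renames. A minimal sketch, assuming pyspark; the toy data are illustrative:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([('a', 1.0), ('a', 3.0), ('b', 2.0)],
                           ['ProductCategory', 'Value'])
aux = df.groupBy('ProductCategory').mean()
print(aux.columns)  # ['ProductCategory', 'avg(Value)']
aux = aux.select(col('ProductCategory'),
                 col('avg(Value)').alias('AverageValuePerProductCategory'))
df.join(aux, on='ProductCategory').show()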
Example #15
def get_model_bagging(percentage_of_outliers=0.002,
                      num_estimators=2,
                      combination='max'):
    """Create a Feature Bagging model.

    Args:
        percentage_of_outliers: percentage of fraud on data
        num_estimators: number of base estimators in the ensemble.
        combination: if ‘average’: take the average of all detectors
                     if ‘max’: take the maximum scores of all detectors

    Returns:
        model: Feature Bagging model
    """
    utils.save_log('{0} :: {1}'.format(
        get_model_bagging.__module__,
        get_model_bagging.__name__))

    model = FeatureBagging(contamination=percentage_of_outliers,
                           n_estimators=num_estimators,
                           combination=combination,
                           random_state=config.random_seed,
                           n_jobs=config.num_jobs)

    return model
Example #16
def train(*args, **kwargs):
    """Execute:
    $ python main.py \
    train \
    --path_data_train ../dataset/laser/ \
    --output_path ../output/
    """
    utils.save_log(f'{train.__module__} :: {train.__name__}')

    model = system_configurator.set_network(kwargs['output_path'])
    checkpointer = networks.set_checkpointer(kwargs['output_path'])
    lr_reducer = networks.set_reducer()

    _, X_img, y_depth, y_axis, y_quat = \
        data_engineering.extract_data(sys.argv[1],
                                      kwargs['path_data_train'])

    # fit the SpaceYNet model
    outcomes = model.fit(X_img,
                         [y_depth, y_axis, y_quat],
                         batch_size=config.BATCH_SIZE,
                         epochs=config.EPOCHS,
                         shuffle=True,
                         callbacks=[checkpointer, lr_reducer])

    exporter.export_curves_and_depth(outcomes,
                                     kwargs['output_path'])

    exporter.export_lr_curve(outcomes,
                             kwargs['output_path'],
                             config.EPOCHS)
Example #17
def export_lr_curve(outcomes, path, epochs):
    utils.save_log(f'{export_lr_curve.__module__} :: '
                   f'{export_lr_curve.__name__}')
    plt.figure(figsize=(8, 5))
    # epochs is a count (config.EPOCHS), so build the x-axis from it
    plt.plot(range(epochs), outcomes.history['lr'])
    plt.xlabel('epochs')
    plt.grid(True)
    plt.legend(['Learning Rate over Time'], loc=7)
    plt.savefig(path + 'acc_loss/learning_rate.png')
Example #18
def export_depth_curve(outcomes, path, epochs, flag):
    utils.save_log(f'{export_depth_curve.__module__} :: '
                   f'{export_depth_curve.__name__}')
    plt.figure(figsize=(8, 5))
    # epochs is a count, so build the x-axis from it
    plt.plot(range(epochs), outcomes.history['cls_depth_' + flag])
    plt.plot(range(epochs), outcomes.history['val_cls_depth_' + flag])
    plt.xlabel('epochs')
    plt.ylabel(flag)
    plt.grid(True)
    plt.legend(['train', 'val'], loc=7)
    plt.savefig(path + 'acc_loss/depth_' + flag + '.png')
Example #19
def update_features_dims(data):
    """Update list of features and this dimension to use in SMOTENC

    Args:
        data: dataframe
    """
    utils.save_log('{0} :: {1}'.format(update_features_dims.__module__,
                                       update_features_dims.__name__))

    return [
        data[features_list].columns.get_loc(i)
        for i in categorical_features_list
    ]
Example #20
def normalize_vector(vector):
    """Normalize the values of prediction to 0 and 1

    Args:
        vector : column predictions made by Isolation Forest

    Returns:
        vector_normalized: a column value of Isolation Forest normalized
    """
    utils.save_log('{0} :: {1}'.format(normalize_vector.__module__,
                                       normalize_vector.__name__))

    # Isolation Forest predicts 1 (inlier) or -1 (outlier);
    # this maps 1 -> 0 and -1 -> 1.
    return ((vector - 1) * -1) // 2
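A worked check of the arithmetic (the sample array is illustrative):

import numpy
#  1 -> ((1 - 1) * -1) // 2 = 0   (inlier)
# -1 -> ((-1 - 1) * -1) // 2 = 1  (outlier)
print(normalize_vector(numpy.array([1, -1, 1])))  # [0 1 0]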
Example #21
def extract_images(input_images):
    utils.save_log(f'{Processer.extract_images.__module__} :: '
                   f'{Processer.extract_images.__name__}')
    images = numpy.zeros((len(input_images), config.IMG_HEIGHT,
                          config.IMG_WIDTH, config.IMG_CHANNEL),
                         dtype=numpy.uint8)
    for i in range(len(input_images)):
        image = cv2.imread(input_images[i])
        # cv2.resize expects (width, height)
        image = cv2.resize(image, (config.IMG_WIDTH, config.IMG_HEIGHT))
        images[i] = image
    images = images.astype('float32')
    images /= 255
    return images
Example #22
    def save_history(self,
                     filePath=None,
                     savePredHist=False,
                     saveTrainHist=True,
                     saveResults=False,
                     results=None,
                     resultsLabel='',
                     historyLabel=''):

        sep = self.sep
        resultsLabel = resultsLabel if resultsLabel != '' else sep.join(
            (self.descr, 'results'))
        if filePath is not None:
            rootBasePath = filePath
            saveResFolder = os.path.join(rootBasePath, 'Logs', 'Results')
            saveHisFolder = os.path.join(rootBasePath, 'Logs', 'History')
        else:
            rootBasePath = self.rootSaveFolder
            saveResFolder = self.saveLogsFolder
            saveHisFolder = self.saveHisFolder
        folders = [saveResFolder, saveHisFolder]
        # Create the Target Directory if does not exist.
        for f in folders:
            if not os.path.exists(f):
                os.makedirs(f)
        if saveResults:
            saveResFile = os.path.join(
                saveResFolder,
                sep.join((self.defSavePrefix, resultsLabel, ".txt")))
        saveFile = os.path.join(saveHisFolder,
                                sep.join((self.defSavePrefix, "log1.txt")))

        # Save training history or predHistory as required.
        if saveTrainHist:
            utils.save_log(saveFile, self.history)
        if savePredHist:
            utils.save_log(saveFile, self.predHistory)
        # Save Results if required
        if saveResults:
            if results is not None:
                try:
                    utils.save_tensor(results, filePath=saveResFile)
                except (AttributeError, TypeError):
                    raise AssertionError(
                        'Input Results variable should be Tensor.')
            else:
                print("No Results Tensor to save is given.")
        return saveFile
Example #23
def create_features_avg_ratio_value_by_categories(data, list_of_categories):
    """Create new features relating the transaction value for
    each product category.

    Args:
        data: input spark data frame.
        list_of_categories: features to be inserted on global features list
    """
    utils.save_log('{0} :: {1}'.format(
        create_features_avg_ratio_value_by_categories.__module__,
        create_features_avg_ratio_value_by_categories.__name__))

    for item in list_of_categories:
        data = create_feature_average_value_for_category(data, item)
        data = create_feature_ratio_between_value_and_category(data, item)
    return data
Example #24
def main():
    args = parse_args()
    train_dataset, test_dataset = dataset.get_dataset(args.path,
                                                      args.use_augmentation,
                                                      args.use_fivecrop)
    train_loader = DataLoader(train_dataset,
                              args.batch,
                              True,
                              num_workers=args.worker,
                              pin_memory=True)
    test_loader = DataLoader(test_dataset,
                             args.batch,
                             False,
                             num_workers=args.worker,
                             pin_memory=True)
    if args.cuda:
        torch.cuda.set_device(0)
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    if args.model == 'ResNet18':
        mymodel = model.ResNet18(args.frozen_layers).to(device)
    elif args.model == 'ResNet34':
        mymodel = model.ResNet34(args.frozen_layers).to(device)
    elif args.model == 'ResNet50':
        mymodel = model.ResNet50(args.frozen_layers).to(device)
    elif args.model == 'DenseNet':
        mymodel = model.DenseNet().to(device)
    else:
        raise ValueError('Unknown model: ' + args.model)
    op = optim.Adam(mymodel.parameters(), lr=args.lr)
    train_losses, test_mF1s, test_precisions, test_recalls = [], [], [], []
    early = args.early
    for i in range(args.epoch):
        train_loss = train.train(mymodel, op, train_loader, i, device,
                                 args.log, utils.pos_weight)
        mF1, recall, precision = test.test(mymodel, test_loader, device,
                                           args.use_fivecrop)
        train_losses.append(train_loss)
        test_mF1s.append(mF1)
        test_precisions.append(precision)
        test_recalls.append(recall)
        early = utils.early_stop(test_mF1s, early)
        if early <= 0:
            break
    utils.save_log(mymodel, train_losses, test_mF1s, test_precisions,
                   test_recalls)
Example #25
def validate(valloader, model, criterion):
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top2 = AverageMeter()

    # switch to evaluate mode
    model.eval()

    end = time.time()
    # inference only: torch.no_grad() replaces the removed volatile=True
    # Variables, and non_blocking=True replaces async=True (async has been
    # a reserved keyword since Python 3.7)
    with torch.no_grad():
        for i, (input_data, target) in enumerate(valloader):
            if use_gpu:
                input_data = input_data.cuda(non_blocking=True)
                target = target.cuda(non_blocking=True)

            # compute output
            output = model(input_data)
            loss = criterion(output, target)

            # measure accuracy and record loss
            prec1, prec2 = accuracy(output.data, target, topk=(1, 2))
            losses.update(loss.item(), input_data.size(0))
            top1.update(prec1[0], input_data.size(0))
            top2.update(prec2[0], input_data.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % print_freq == 0:
                info = 'Testing: [{0}/{1}]\t'\
                       'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'\
                       'Loss {loss.val:.4f} ({loss.avg:.4f})\t'\
                       'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'\
                       'Prec@2 {top2.val:.3f} ({top2.avg:.3f})'.format(
                           i, len(valloader), batch_time=batch_time,
                           loss=losses, top1=top1, top2=top2)
                print(info)
                save_log(info, log_file)

    info = '* Test:  Prec@1 {top1.avg:.3f} Prec@2 {top2.avg:.3f}'.format(
        top1=top1, top2=top2)
    print(info)
    save_log(info, log_file)
    return top1.avg
Example #26
def update_list_features(list_type, list_column_name):
    """Update adding name features to the list of features

    Args:
        list_type: categorical or numerical.
        list_column_name: list of features name to be added.
            If is a unique value, need be as list as well.
    """
    utils.save_log('{0} :: {1}'.format(update_list_features.__module__,
                                       update_list_features.__name__))

    for column_name in list_column_name:
        if list_type == 'categorical':
            categorical_features_list.append(column_name)
        elif list_type == 'numerical':
            numerical_features_list.append(column_name)
        features_list.append(column_name)
Example #27
def create_feature_is_credit_debit(data):
    """ Create new column Operation based on Amount column
        Identifier if the operation is:
        - credit (-1) or debit (1)

    Args:
        data (spark data frame): input spark data frame.
    """
    utils.save_log('{0} :: {1}'.format(
        create_feature_is_credit_debit.__module__,
        create_feature_is_credit_debit.__name__))

    data = data.withColumn("Operation", when(data.Amount > 0, 1).otherwise(0))

    update_list_features("numerical", ["Operation"])

    return data
Example #28
def get_percentage_of_fraudulent_transactions(data):
    """Compute the proportion of fraudulent transactions in the data.

    Args:
        data (spark dataframe): a matrix dataframe

    Returns:
        Percentage of fraud into dataframe.
    """
    utils.save_log('{0} :: {1}'.format(
        get_percentage_of_fraudulent_transactions.__module__,
        get_percentage_of_fraudulent_transactions.__name__))

    if features_engineering.target_label in data.columns:
        features_engineering.fraudulent_percentage = \
            data.filter('FraudResult==1').count() / data.count()
    return features_engineering.fraudulent_percentage
Example #29
def get_log(self, date):
    """Parse the JSON and save the log and user data to the database."""
    data = self.get_json()
    if 'error' in data:
        logger.debug(data['error'])
    if 'logs' in data:
        # Fetch all logs with formatted timestamps
        all_logs = list(correct_time(data['logs']))
        sort_by_created_at(all_logs)
        for log in all_logs:
            first_name = log['first_name']
            second_name = log['second_name']
            user_id = log['user_id']
            message = log['message']
            created_at = log['created_at']
            save_user(first_name, second_name, user_id)
            save_log(message, created_at, user_id)
        logger.info('Writing completed')
Example #30
def predict(data, input_file_name='../data/model_if'):
    """Generate predictions using the Isolation Forest model.

    Args:
        data (Pandas dataframe): a matrix dataframe
        input_file_name: input file name of IF model

    Returns:
        predictions: Model outcomes (predictions)
    """
    utils.save_log('{0} :: {1}'.format(predict.__module__, predict.__name__))

    with open(input_file_name, 'rb') as pickle_file:
        model = pickle.load(pickle_file)

    predictions = model.predict(data)

    return predictions