def create_feature_value_category(data):
    """Categorize the transaction value based on its distance to the
    average transaction value of the whole dataframe.

    Categories:
    - 2: more than 200 times the average.
    - 1: more than 50 times the average.
    - 0: if none of the previous conditions is satisfied.

    Args:
        data (spark dataframe): input spark data frame.
    """
    utils.save_log('{0} :: {1}'.format(
        create_feature_value_category.__module__,
        create_feature_value_category.__name__))
    avg_value = data.agg({'Value': 'avg'}).collect()[0][0]
    data = data. \
        withColumn('ValueStrategy',
                   when(col('Value') > avg_value * 200, 2).
                   when(col('Value') > avg_value * 50, 1).
                   otherwise(0))
    update_list_features("numerical", ["ValueStrategy"])
    return data
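# Illustrative sketch (not part of the pipeline): shows how the
# when/otherwise chain above buckets 'Value' against the dataframe
# average. Assumes a local SparkSession; the rows are made up.
def _example_value_category():
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import when, col

    spark = SparkSession.builder.master('local[1]').getOrCreate()
    rows = [(1.0,)] * 1000 + [(100000.0,)]
    df = spark.createDataFrame(rows, ['Value'])
    avg_value = df.agg({'Value': 'avg'}).collect()[0][0]
    df = df.withColumn('ValueStrategy',
                       when(col('Value') > avg_value * 200, 2).
                       when(col('Value') > avg_value * 50, 1).
                       otherwise(0))
    df.filter('ValueStrategy > 0').show()  # the 100000.0 row is tagged 2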
def smotenc_over_sampler(X_data, y_data, categorical_features_dims):
    """Oversample the training data set using the SMOTENC technique.

    Args:
        X_data (pandas data frame): feature matrix.
        y_data (pandas vector): target vector.
        categorical_features_dims (list): indices of the categorical
            features in X_data.

    Returns:
        X and y data sets balanced
    """
    utils.save_log('{0} :: {1}'.format(
        smotenc_over_sampler.__module__,
        smotenc_over_sampler.__name__))
    model = SMOTENC(categorical_features=categorical_features_dims,
                    random_state=config.random_seed,
                    n_jobs=config.num_jobs)
    X, y = model.fit_resample(X_data, y_data)
    X_smotenc = pandas.DataFrame(X,
                                 columns=features_engineering.features_list)
    y_smotenc = pandas.DataFrame(y,
                                 columns=[features_engineering.target_label])
    return X_smotenc, y_smotenc
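# Minimal, self-contained sketch of the SMOTENC call used above, with
# made-up data (not the project's dataset). Assumes imbalanced-learn is
# installed; column names and sizes are illustrative only.
def _example_smotenc():
    import numpy
    import pandas
    from imblearn.over_sampling import SMOTENC

    rng = numpy.random.RandomState(0)
    X = pandas.DataFrame({
        'Value': rng.rand(110),              # numerical feature
        'Channel': rng.randint(0, 3, 110),   # categorical feature
    })
    y = pandas.Series([0] * 100 + [1] * 10)  # imbalanced target
    sampler = SMOTENC(categorical_features=[1], random_state=42)
    X_res, y_res = sampler.fit_resample(X, y)
    print(pandas.Series(y_res).value_counts())  # classes now balanced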
def plot_heatmap(data_set):
    """Plot a correlation heatmap using the Seaborn library.

    Args:
        data_set (Pandas data frame): data set with the features to be
            plotted.
    """
    utils.save_log('{0} :: {1}'.format(plot_heatmap.__module__,
                                       plot_heatmap.__name__))
    columns_to_print = deepcopy(features_engineering.features_list)
    columns_to_print.append(features_engineering.target_label)
    corr_matrix = data_set[columns_to_print].corr()
    k = 70  # number of variables to show in the heat-map
    cols = corr_matrix.nlargest(
        k,
        features_engineering.target_label)[features_engineering.
                                           target_label].index
    cm = numpy.corrcoef(data_set[cols].values.T)
    sns.set(font_scale=1.25, rc={'figure.figsize': (15, 15)})
    sns.heatmap(cm,
                cbar=True,
                annot=True,
                square=True,
                fmt='.3f',
                annot_kws={'size': 8},
                yticklabels=cols.values,
                xticklabels=cols.values)
    plt.show()
def create_model(iterations=5000,
                 depth_tree=4,
                 learning_rate=0.0135,
                 reg_l2=2,
                 evaluation_metric='F1'):
    """Create a CatBoost model.

    Args:
        iterations (int): maximum number of boosting iterations.
        depth_tree (int): depth of the trees.
        learning_rate (float): learning rate.
        reg_l2 (int): coefficient of the L2 regularization term.
        evaluation_metric (str): metric used for evaluation.

    Returns:
        model: CatBoost model
    """
    utils.save_log('{0} :: {1}'.format(create_model.__module__,
                                       create_model.__name__))
    model = CatBoostClassifier(iterations=iterations,
                               depth=depth_tree,
                               learning_rate=learning_rate,
                               l2_leaf_reg=reg_l2,
                               eval_metric=evaluation_metric,
                               task_type=config.device_type,
                               random_seed=config.random_seed)
    return model
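# Illustrative sketch of a CatBoostClassifier with the same kind of
# hyper-parameters, trained on tiny made-up data. Assumes the catboost
# package is installed; the values are not the project's configuration.
def _example_catboost():
    import numpy
    from catboost import CatBoostClassifier

    rng = numpy.random.RandomState(0)
    X = rng.rand(40, 3)
    y = (X[:, 0] > 0.5).astype(int)
    clf = CatBoostClassifier(iterations=50,
                             depth=4,
                             learning_rate=0.1,
                             l2_leaf_reg=2,
                             eval_metric='F1',
                             random_seed=42,
                             verbose=False)
    clf.fit(X, y)
    print(clf.predict(X[:5]))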
def train(data,
          features_columns_list,
          label_column,
          percentage_of_outliers,
          output_file_name='../data/model_lscp'):
    """Fit the LSCP model using the training data. The model weights are
    saved in the output file.

    Args:
        data (Pandas dataframe): a matrix dataframe
        features_columns_list: list of columns to use in the training
        label_column: name of the fraud identification column
        percentage_of_outliers: percentage of fraud on data
        output_file_name: output file name to export the LSCP model

    Returns:
        model: LSCP model
    """
    utils.save_log('{0} :: {1}'.format(train.__module__, train.__name__))
    if os.path.isfile(output_file_name):
        utils.save_log('Loading LSCP model.')
        with open(output_file_name, 'rb') as pickle_file:
            model = pickle.load(pickle_file)
        return model
    model = create_model(percentage_of_outliers=percentage_of_outliers)
    model.fit(data[features_columns_list], data[label_column])
    with open(output_file_name, 'wb') as file_model:
        pickle.dump(model, file_model)
    return model
def create_model(percentage_of_outliers=0.002):
    """Create a LSCP model.

    Args:
        percentage_of_outliers: percentage of fraud on data

    Returns:
        model: LSCP model
    """
    utils.save_log('{0} :: {1}'.format(create_model.__module__,
                                       create_model.__name__))
    bagging_model = \
        get_model_bagging(percentage_of_outliers=percentage_of_outliers)
    lof_model = \
        get_model_lof(percentage_of_outliers=percentage_of_outliers)
    cblof_model = \
        get_model_cblof(percentage_of_outliers=percentage_of_outliers)
    list_of_detectors = [bagging_model, lof_model, cblof_model]
    model = LSCP(detector_list=list_of_detectors,
                 contamination=percentage_of_outliers)
    return model
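# Minimal sketch of the LSCP ensemble idea on synthetic data, assuming
# the pyod package is installed. Uses two LOF detectors instead of the
# project's bagging/LOF/CBLOF trio to keep the example short.
def _example_lscp():
    import numpy
    from pyod.models.lof import LOF
    from pyod.models.lscp import LSCP

    rng = numpy.random.RandomState(42)
    X = numpy.vstack([rng.randn(200, 2),           # inliers
                      rng.randn(5, 2) * 5 + 10])   # a few injected outliers
    detectors = [LOF(n_neighbors=15), LOF(n_neighbors=30)]
    model = LSCP(detector_list=detectors, contamination=0.02)
    model.fit(X)
    print(model.labels_[-5:])  # the injected points should mostly be 1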
def outliers_with_isolation_forest(data,
                                   features_columns_list,
                                   label_column=None,
                                   percentage_of_outliers=None):
    """Use the Isolation Forest model to predict outliers in the data.

    Args:
        data (Pandas dataframe): a matrix dataframe
        features_columns_list: list of column names (list of features)
        label_column: target column name
        percentage_of_outliers: percentage of fraudulent items (fraud in
            the data)

    Returns:
        data: dataframe with the Isolation Forest outlier column
    """
    utils.save_log('{0} :: {1}'.format(
        outliers_with_isolation_forest.__module__,
        outliers_with_isolation_forest.__name__))
    if label_column is not None:
        isolation_forest.train(data,
                               features_columns_list,
                               label_column,
                               percentage_of_outliers)
        predictions = \
            isolation_forest.predict(data[features_columns_list])
    else:
        predictions = isolation_forest.predict(data)
    data['IsolationForest'] = \
        isolation_forest.normalize_vector(predictions)
    return data
def train(X_data,
          y_data,
          categorical_features_list,
          cat_boost_file_name='../data/catBoost_model'):
    """Fit the CatBoost model using the training data. The model weights
    are saved in the output file.

    Args:
        X_data: a matrix dataframe
        y_data: outcome column to use in the training
        categorical_features_list: categorical features in X_data
        cat_boost_file_name: file name to export the trained model

    Returns:
        model: CatBoost model
    """
    utils.save_log('{0} :: {1}'.format(train.__module__, train.__name__))
    model_cat_boost = create_model()
    model_cat_boost.fit(X_data,
                        y_data,
                        verbose=False,
                        plot=True,
                        cat_features=categorical_features_list)
    model_cat_boost.save_model(fname=cat_boost_file_name)
    return model_cat_boost
def test(*args, **kwargs):
    """Execute:
    $ python main.py \
        test \
        --path_data_test ../dataset/laser/ \
        --output_path ../output/ \
        --mode offline
    """
    utils.save_log(f'{test.__module__} :: {test.__name__}')
    model = system_configurator.set_network(kwargs['output_path'])
    img_label, X_img, y_depth, y_axis, y_quat = \
        data_engineering.extract_data(sys.argv[1], kwargs['path_data_test'])
    outcomes = model.predict(X_img)
    exporter.export_pose_prediction(img_label,
                                    y_axis,
                                    y_quat,
                                    outcomes,
                                    kwargs['output_path'])
    # print/save original depth and depth prediction
    exporter.export_depth_prediction(y_depth,
                                     outcomes,
                                     kwargs['output_path'])
def fit(self):
    if self.make_new_dir:
        # Make a folder to save the model
        model_path = os.path.join(self.model_dir, self.model_prefix)
        if not os.path.isdir(model_path):
            os.mkdir(model_path)
        model_full_path = os.path.join(
            model_path, datetime.now().strftime('%Y_%m_%d_%H:%M:%S'))
        if not os.path.isdir(model_full_path):
            os.mkdir(model_full_path)
    else:
        model_full_path = self.finetune_model

    # Save config in the model folder
    with open(os.path.join(model_full_path,
                           'train_' +
                           datetime.now().strftime('%Y_%m_%d_%H:%M:%S') +
                           '.cfg'), 'w') as f:
        self.config.write(f)
    utils.save_log(model_full_path)  # Save event log

    # Build the mxnet model and train
    checkpoint = mx.callback.do_checkpoint(
        os.path.join(model_full_path, 'test_v0'))
    model = self.build_model()
    train, val = self.get_data_iter(self.train_list_path,
                                    self.val_list_path,
                                    self.rois_dir,
                                    self.rois_siamese_dir,
                                    self.label_dir,
                                    self.image_size,
                                    self.batch_size,
                                    self.multi_thread,
                                    'mode')
    eval_metric = CompositeEvalMetric(metrics=[Loss()])
    call_back = utils.get_callback(3)
    model.fit(X=train,
              eval_data=val,
              eval_metric=eval_metric,
              epoch_end_callback=checkpoint,
              batch_end_callback=call_back)
def extract_data_from_file(method, path):
    utils.save_log(f'{extract_data_from_file.__module__} :: '
                   f'{extract_data_from_file.__name__}')
    images_name = []
    images_rgb = []
    images_depth = []
    images_label_axis = []
    images_label_quat = []
    file = 'dataset_train.txt' \
        if (method == 'train') \
        else 'dataset_test.txt'
    with open(path + file) as f:
        for line in f:
            (image_name, pos_x, pos_y, pos_z,
             quat_w, quat_p, quat_q, quat_r) = line.split()
            images_name.append(image_name)
            images_rgb.append(''.join(path + image_name + '.jpg'))
            images_depth.append(
                ''.join(path + image_name.replace('rgb', 'depth') + '.jpg'))
            images_label_axis.append(
                (float(pos_x), float(pos_y), float(pos_z)))
            images_label_quat.append(
                (float(quat_w), float(quat_p), float(quat_q), float(quat_r)))
    return images_name, \
        images_rgb, \
        images_depth, \
        images_label_axis, \
        images_label_quat
def outliers_with_lscp(data,
                       features_columns_list,
                       label_column=None,
                       percentage_of_outliers=None):
    """Use the LSCP model to predict outliers in the data.

    Args:
        data (Pandas dataframe): a matrix dataframe
        features_columns_list: list of column names (list of features)
        label_column: target column name
        percentage_of_outliers: percentage of fraudulent items (fraud in
            the data)

    Returns:
        data: dataframe with the LSCP outlier column
    """
    utils.save_log('{0} :: {1}'.format(outliers_with_lscp.__module__,
                                       outliers_with_lscp.__name__))
    if label_column is not None:
        lscp.train(data,
                   features_columns_list,
                   label_column,
                   percentage_of_outliers)
        predictions = lscp.predict(data[features_columns_list])
    else:
        predictions = lscp.predict(data)
    data['LSCP'] = predictions
    return data
def balance_data_set(X_data, y_data, categorical_features_dims):
    """Balance the training data set by oversampling the minority class
    with SMOTENC.

    Args:
        X_data: a matrix dataframe
        y_data: target column values
        categorical_features_dims: indices of the categorical features

    Returns:
        Oversampled (balanced) X and y dataframes
    """
    utils.save_log('{0} :: {1}'.format(
        balance_data_set.__module__,
        balance_data_set.__name__))
    X_data_oversampled, y_data_oversampled = \
        smotenc_over_sampler(X_data,
                             y_data,
                             categorical_features_dims)
    X_data_oversampled = pandas.DataFrame(X_data_oversampled)
    y_data_oversampled = pandas.DataFrame(y_data_oversampled)
    return X_data_oversampled, y_data_oversampled
def create_feature_average_value_for_category(data, item):
    """Create a feature based on the average transaction value aggregated
    by item.

    Args:
        data (spark dataframe): input spark data frame.
        item: type of attribute used to aggregate the data and compute
            the average.

    Returns:
        data (spark data frame): output spark data frame with the new
            feature created.
    """
    utils.save_log('{0} :: {1}'.format(
        create_feature_average_value_for_category.__module__,
        create_feature_average_value_for_category.__name__))
    column_name = 'AverageValuePer{0}'.format(item)
    aux = data.select([item, config.feature_column_value[0]]). \
        groupBy(item). \
        mean()
    aux = aux.select(col(item),
                     col('avg' + '(Value)').alias(column_name))
    data = data.join(aux, on=item)
    update_list_features("numerical", [column_name])
    return data
def get_model_bagging(percentage_of_outliers=0.002,
                      num_estimators=2,
                      combination='max'):
    """Create a Feature Bagging model.

    Args:
        percentage_of_outliers: percentage of fraud on data
        num_estimators: number of base estimators in the ensemble.
        combination: if 'average': take the average of all detectors
                     if 'max': take the maximum scores of all detectors

    Returns:
        model: Feature Bagging model
    """
    utils.save_log('{0} :: {1}'.format(
        get_model_bagging.__module__,
        get_model_bagging.__name__))
    model = FeatureBagging(contamination=percentage_of_outliers,
                           n_estimators=num_estimators,
                           combination=combination,
                           random_state=config.random_seed,
                           n_jobs=config.num_jobs)
    return model
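# Minimal sketch of pyod's FeatureBagging detector on made-up data,
# assuming pyod is installed. Parameter values are illustrative only.
def _example_feature_bagging():
    import numpy
    from pyod.models.feature_bagging import FeatureBagging

    rng = numpy.random.RandomState(0)
    X = numpy.vstack([rng.randn(300, 4),           # normal points
                      rng.randn(3, 4) * 6 + 12])   # injected outliers
    detector = FeatureBagging(contamination=0.01,
                              n_estimators=2,
                              combination='max',
                              random_state=0)
    detector.fit(X)
    print(detector.labels_[-3:])  # 1 marks an outlier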
def train(*args, **kwargs):
    """Execute:
    $ python main.py \
        train \
        --path_data_train ../dataset/laser/ \
        --output_path ../output/
    """
    utils.save_log(f'{train.__module__} :: {train.__name__}')
    model = system_configurator.set_network(kwargs['output_path'])
    checkpointer = networks.set_checkpointer(kwargs['output_path'])
    lr_reducer = networks.set_reducer()
    _, X_img, y_depth, y_axis, y_quat = \
        data_engineering.extract_data(sys.argv[1], kwargs['path_data_train'])
    # fit nn spaceynet
    outcomes = model.fit(X_img,
                         [y_depth, y_axis, y_quat],
                         batch_size=config.BATCH_SIZE,
                         epochs=config.EPOCHS,
                         shuffle=True,
                         callbacks=[checkpointer, lr_reducer])
    exporter.export_curves_and_depth(outcomes, kwargs['output_path'])
    exporter.export_lr_curve(outcomes, kwargs['output_path'], config.EPOCHS)
def export_lr_curve(outcomes, path, epochs):
    utils.save_log(f'{export_lr_curve.__module__} :: '
                   f'{export_lr_curve.__name__}')
    plt.figure(figsize=(8, 5))
    # `epochs` is the number of epochs, so build the x-axis from it
    plt.plot(range(epochs), outcomes.history['lr'])
    plt.xlabel('epochs')
    plt.grid(True)
    plt.legend(['Learning Rate over Time'], loc=7)
    plt.savefig(path + 'acc_loss/learning_rate.png')
def export_depth_curve(outcomes, path, epochs, flag):
    utils.save_log(f'{export_depth_curve.__module__} :: '
                   f'{export_depth_curve.__name__}')
    plt.figure(figsize=(8, 5))
    plt.plot(epochs, outcomes.history['cls_depth_' + flag])
    plt.plot(epochs, outcomes.history['val_cls_depth_' + flag])
    plt.xlabel('epochs')
    plt.ylabel(flag)
    plt.grid(True)
    plt.legend(['train', 'val'], loc=7)
    plt.savefig(path + 'acc_loss/depth_' + flag + '.png')
def update_features_dims(data):
    """Get the indices of the categorical features, as required by SMOTENC.

    Args:
        data: dataframe

    Returns:
        List with the column index of each categorical feature.
    """
    utils.save_log('{0} :: {1}'.format(update_features_dims.__module__,
                                       update_features_dims.__name__))
    return [
        data[features_list].columns.get_loc(i)
        for i in categorical_features_list
    ]
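# Tiny sketch of how column indices are resolved with get_loc, using
# made-up column names (not the project's feature list).
def _example_categorical_dims():
    import pandas

    df = pandas.DataFrame(columns=['Value', 'Channel', 'ProductCategory'])
    categorical = ['Channel', 'ProductCategory']
    dims = [df.columns.get_loc(name) for name in categorical]
    print(dims)  # [1, 2]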
def normalize_vector(vector):
    """Normalize the prediction values to 0 and 1.

    Args:
        vector: column of predictions made by Isolation Forest

    Returns:
        vector_normalized: the Isolation Forest column normalized
    """
    utils.save_log('{0} :: {1}'.format(normalize_vector.__module__,
                                       normalize_vector.__name__))
    return ((vector - 1) * -1) // 2
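# Quick check of the mapping above: Isolation Forest returns +1 for
# inliers and -1 for outliers, and this formula turns that into
# 0 (inlier) / 1 (outlier).
def _example_normalize_vector():
    import numpy

    predictions = numpy.array([1, -1, 1, -1])
    normalized = ((predictions - 1) * -1) // 2
    print(normalized)  # [0 1 0 1]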
def extract_images(input_images):
    utils.save_log(f'{Processer.extract_images.__module__} :: '
                   f'{Processer.extract_images.__name__}')
    images = numpy.zeros((len(input_images),
                          config.IMG_HEIGHT,
                          config.IMG_WIDTH,
                          config.IMG_CHANNEL),
                         dtype=numpy.uint8)
    for i in range(len(input_images)):
        image = cv2.imread(input_images[i])
        # cv2.resize expects the target size as (width, height)
        image = cv2.resize(image, (config.IMG_WIDTH, config.IMG_HEIGHT))
        images[i] = image
    images = images.astype('float32')
    images /= 255
    return images
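# Minimal, self-contained sketch of the resize-and-scale step above,
# using a synthetic image instead of files on disk. Assumes OpenCV and
# numpy are installed; the 128x96 target size is made up.
def _example_resize_and_scale():
    import cv2
    import numpy

    fake_image = numpy.random.randint(0, 256, (240, 320, 3),
                                      dtype=numpy.uint8)
    resized = cv2.resize(fake_image, (128, 96))  # (width, height)
    batch = resized[numpy.newaxis].astype('float32') / 255
    print(batch.shape)  # (1, 96, 128, 3)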
def save_history(self,
                 filePath=None,
                 savePredHist=False,
                 saveTrainHist=True,
                 saveResults=False,
                 results=None,
                 resultsLabel='',
                 historyLabel=''):
    sep = self.sep
    resultsLabel = resultsLabel if resultsLabel != '' else sep.join(
        (self.descr, 'results'))
    if filePath is not None:
        rootBasePath = filePath
        saveResFolder = os.path.join(rootBasePath, 'Logs', 'Results')
        saveHisFolder = os.path.join(rootBasePath, 'Logs', 'History')
    else:
        rootBasePath = self.rootSaveFolder
        saveResFolder = self.saveLogsFolder
        saveHisFolder = self.saveHisFolder
    folders = [saveResFolder, saveHisFolder]
    # Create the target directories if they do not exist.
    for f in folders:
        if not os.path.exists(f):
            os.makedirs(f)
    if saveResults:
        saveResFile = os.path.join(
            saveResFolder,
            sep.join((self.defSavePrefix, resultsLabel, ".txt")))
    saveFile = os.path.join(saveHisFolder,
                            sep.join((self.defSavePrefix, "log1.txt")))
    # Save training history or prediction history as required.
    if saveTrainHist:
        utils.save_log(saveFile, self.history)
    if savePredHist:
        utils.save_log(saveFile, self.predHistory)
    # Save results if required.
    if saveResults:
        if results is not None:
            try:
                utils.save_tensor(results, filePath=saveResFile)
            except (AttributeError, TypeError):
                raise AssertionError(
                    'Input Results variable should be Tensor.')
        else:
            print("No Results Tensor to save is given.")
    return saveFile
def create_features_avg_ratio_value_by_categories(data, list_of_categories):
    """Create new features relating the transaction value to each product
    category.

    Args:
        data: input spark data frame.
        list_of_categories: categories used to aggregate the data; the new
            features are inserted on the global features list.
    """
    utils.save_log('{0} :: {1}'.format(
        create_features_avg_ratio_value_by_categories.__module__,
        create_features_avg_ratio_value_by_categories.__name__))
    for item in list_of_categories:
        data = create_feature_average_value_for_category(data, item)
        data = create_feature_ratio_between_value_and_category(data, item)
    return data
def main():
    args = parse_args()
    train_dataset, test_dataset = dataset.get_dataset(args.path,
                                                      args.use_augmentation,
                                                      args.use_fivecrop)
    train_loader = DataLoader(train_dataset,
                              args.batch,
                              True,
                              num_workers=args.worker,
                              pin_memory=True)
    test_loader = DataLoader(test_dataset,
                             args.batch,
                             False,
                             num_workers=args.worker,
                             pin_memory=True)
    if args.cuda:
        torch.cuda.set_device(0)
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    if args.model == 'ResNet18':
        mymodel = model.ResNet18(args.frozen_layers).to(device)
    elif args.model == 'ResNet34':
        mymodel = model.ResNet34(args.frozen_layers).to(device)
    elif args.model == 'ResNet50':
        mymodel = model.ResNet50(args.frozen_layers).to(device)
    elif args.model == 'DenseNet':
        mymodel = model.DenseNet().to(device)
    else:
        pass
    op = optim.Adam(mymodel.parameters(), lr=args.lr)
    train_losses, test_mF1s, test_precisions, test_recalls = [], [], [], []
    early = args.early
    for i in range(args.epoch):
        train_loss = train.train(mymodel, op, train_loader, i, device,
                                 args.log, utils.pos_weight)
        mF1, recall, precision = test.test(mymodel, test_loader, device,
                                           args.use_fivecrop)
        train_losses.append(train_loss)
        test_mF1s.append(mF1)
        test_precisions.append(precision)
        test_recalls.append(recall)
        early = utils.early_stop(test_mF1s, early)
        if early <= 0:
            break
    utils.save_log(mymodel, train_losses, test_mF1s, test_precisions,
                   test_recalls)
def validate(valloader, model, criterion):
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top2 = AverageMeter()
    # switch to evaluate mode
    model.eval()
    end = time.time()
    with torch.no_grad():
        for i, (input_data, target) in enumerate(valloader):
            if use_gpu:
                # `async` is a reserved word in Python 3.7+; use non_blocking
                input_data = input_data.cuda(non_blocking=True)
                target = target.cuda(non_blocking=True)
            # compute output
            output = model(input_data)
            loss = criterion(output, target)
            # measure accuracy and record loss
            prec1, prec2 = accuracy(output.data, target, topk=(1, 2))
            losses.update(loss.item(), input_data.size(0))
            top1.update(prec1[0], input_data.size(0))
            top2.update(prec2[0], input_data.size(0))
            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            if i % print_freq == 0:
                info = 'Testing: [{0}/{1}]\t' \
                       'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' \
                       'Loss {loss.val:.4f} ({loss.avg:.4f})\t' \
                       'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' \
                       'Prec@2 {top2.val:.3f} ({top2.avg:.3f})'.format(
                           i, len(valloader), batch_time=batch_time,
                           loss=losses, top1=top1, top2=top2)
                print(info)
                save_log(info, log_file)
    info = '* Test: Prec@1 {top1.avg:.3f} Prec@2 {top2.avg:.3f}'.format(
        top1=top1, top2=top2)
    print(info)
    save_log(info, log_file)
    return top1.avg
def update_list_features(list_type, list_column_name):
    """Add feature names to the global lists of features.

    Args:
        list_type: 'categorical' or 'numerical'.
        list_column_name: list of feature names to be added. A single
            value also needs to be passed as a list.
    """
    utils.save_log('{0} :: {1}'.format(update_list_features.__module__,
                                       update_list_features.__name__))
    for column_name in list_column_name:
        if list_type == 'categorical':
            categorical_features_list.append(column_name)
        elif list_type == 'numerical':
            numerical_features_list.append(column_name)
        features_list.append(column_name)
def create_feature_is_credit_debit(data):
    """Create a new column, Operation, based on the Amount column,
    identifying whether the operation is:
    - debit (1) or credit (0)

    Args:
        data (spark data frame): input spark data frame.
    """
    utils.save_log('{0} :: {1}'.format(
        create_feature_is_credit_debit.__module__,
        create_feature_is_credit_debit.__name__))
    data = data.withColumn("Operation",
                           when(data.Amount > 0, 1).otherwise(0))
    update_list_features("numerical", ["Operation"])
    return data
def get_percentage_of_fraudulent_transactions(data):
    """Compute the proportion of fraudulent transactions in the data.

    Args:
        data (spark dataframe): a matrix dataframe

    Returns:
        Percentage of fraud in the dataframe.
    """
    utils.save_log('{0} :: {1}'.format(
        get_percentage_of_fraudulent_transactions.__module__,
        get_percentage_of_fraudulent_transactions.__name__))
    if features_engineering.target_label in data.columns:
        features_engineering.fraudulent_percentage = \
            data.filter('FraudResult==1').count() / data.count()
    return features_engineering.fraudulent_percentage
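# Illustrative sketch of the fraud-ratio computation on a tiny Spark
# frame with made-up rows; assumes a local SparkSession is available.
def _example_fraud_percentage():
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.master('local[1]').getOrCreate()
    df = spark.createDataFrame([(0,), (0,), (0,), (1,)], ['FraudResult'])
    ratio = df.filter('FraudResult == 1').count() / df.count()
    print(ratio)  # 0.25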
def get_log(self, date):
    """Parse the JSON and save the log and user data to the database."""
    data = self.get_json()
    if 'error' in data:
        logger.debug(data['error'])
    if 'logs' in data:
        # Get all logs with formatted time
        all_logs = list(correct_time(data['logs']))
        sort_by_created_at(all_logs)
        for log in all_logs:
            first_name = log['first_name']
            second_name = log['second_name']
            user_id = log['user_id']
            message = log['message']
            created_at = log['created_at']
            save_user(first_name, second_name, user_id)
            save_log(message, created_at, user_id)
        logger.info('Writing finished')
def predict(data, input_file_name='../data/model_if'):
    """Generate predictions using the Isolation Forest model.

    Args:
        data (Pandas dataframe): a matrix dataframe
        input_file_name: input file name of the IF model

    Returns:
        predictions: model outcomes (predictions)
    """
    utils.save_log('{0} :: {1}'.format(predict.__module__,
                                       predict.__name__))
    with open(input_file_name, 'rb') as pickle_file:
        model = pickle.load(pickle_file)
    predictions = model.predict(data)
    return predictions