def main(args):
    """Module entry point: load a fitted SGT model, score sequences, save embeddings.

    Args:
        args: parsed CLI namespace; reads input_dir, model_input_dir,
            sequence_column, identifier_column and output_dir.
    """
    seq_col = args.sequence_column
    id_col = args.identifier_column
    logger.debug(f'input-dir {args.input_dir}')
    logger.debug(f'model input dir {args.model_input_dir}')
    logger.debug(f'sequence-column {seq_col}')
    logger.debug(f'identifier-column {id_col}')
    logger.debug(f'output-dir {args.output_dir}')
    sgt = load_model_from_directory(args.model_input_dir,
                                    model_loader=joblib_loader).data
    input_df = load_data_frame_from_directory(args.input_dir).data
    if input_df[seq_col].isnull().sum().sum() > 0:
        # Sequences with missing values cannot be embedded; abort early.
        # BUG FIX: message previously read 'column<name> ...' with no space.
        logger.error(f'column {seq_col} contains missing values')
        sys.exit(1)
    embedding_df = score(input_df, sgt, seq_col, id_col)
    # BUG FIX: the original was print('f embedding shape{...}') -- the 'f'
    # was inside the quotes, so the braces were printed literally instead of
    # being interpolated. Also switched print -> logger for consistency with
    # the rest of this file's diagnostics.
    logger.debug(f'embedding shape {embedding_df.shape}')
    logger.debug(embedding_df.head())
    save_data_frame_to_directory(
        save_to=args.output_dir,
        data=embedding_df,
        schema=DataFrameSchema.data_frame_to_dict(embedding_df))
def detect(timestamp, data_to_detect, detect_mode, batch_size, threshold=0.3, sensitivity=99):
    """Run spectral-residual anomaly detection on every series column.

    Each column of ``data_to_detect`` is paired with ``timestamp`` and fed
    through ``sr_detect``.  With a single column the raw detection frame is
    returned; with several columns each per-column result has its columns
    suffixed with the source column name and the results are concatenated
    side by side.
    """
    def _detect_single(series):
        # Build the two-column frame that sr_detect expects.
        frame = pd.DataFrame(columns=['timestamp', 'value'])
        frame['timestamp'] = timestamp
        frame['value'] = series
        return sr_detect(frame, detect_mode, batch_size, threshold, sensitivity)

    column_length = len(data_to_detect.columns)
    if column_length == 1:
        logger.debug('single column to detect')
        return _detect_single(data_to_detect.iloc[:, 0])

    logger.debug(f'detect {column_length} columns')
    pieces = []
    for col in data_to_detect.columns:
        result = _detect_single(data_to_detect[col])
        # Disambiguate result columns by tagging them with the source column.
        result.columns = [f'{rc}_{col}' for rc in result.columns]
        pieces.append(result)
    return pd.concat(pieces, axis=1)
def invoke(input_path, detect_mode, timestamp_column, value_column, batch_size,
           threshold, sensitivity, appendMode, compute_stats_in_visualization,
           output_path):
    """Validate the input series and run SR anomaly detection end to end.

    Loads the input DataFrameDirectory, checks point count, batch size,
    timestamp validity/order and value ranges, runs ``sr_detector.detect``,
    optionally appends the original columns, and saves the result.

    Raises:
        UserError: when the series is too short, the batch size is invalid,
            timestamps are missing/unsorted/duplicated, or values are
            non-numeric, non-finite, or out of range.
    """
    data_frame_directory = load_data_frame_from_directory(input_path)
    logger.debug(f"Shape of loaded DataFrame: {data_frame_directory.data.shape}")
    if data_frame_directory.data.shape[0] < MIN_POINTS:
        raise UserError(NotEnoughPoints.format(MIN_POINTS))
    # batch_size <= 0 means "whole series in one batch", so only positive
    # values smaller than the minimum are rejected.
    if 0 < batch_size < MIN_POINTS:
        raise UserError(InvalidBatchSize.format(MIN_POINTS))
    query_string = unquote(timestamp_column)
    timestamp_column_selector = ColumnSelection(query_string)
    timestamp = timestamp_column_selector.select_dataframe_directory(
        data_frame_directory).data
    timestamps = pd.to_datetime(timestamp.iloc[:, 0].values)
    if np.any(np.isnat(timestamps)):
        raise UserError(InvalidTimestamps)
    res = is_timestamp_ascending(timestamps)
    if res == -1:
        raise UserError(InvalidSeriesOrder)
    elif res == -2:
        raise UserError(DuplicateSeriesTimestamp)
    query_string = unquote(value_column)
    data_column_selector = ColumnSelection(query_string)
    data_columns = data_column_selector.select_dataframe_directory(
        data_frame_directory).data
    for col in data_columns.columns:
        try:
            float_data = data_columns[col].apply(float)
        except Exception as e:
            # FIX: chain the original cause so conversion failures stay
            # diagnosable (the exception was previously discarded).
            raise UserError(InvalidValueFormat.format(col)) from e
        if not np.all(np.isfinite(float_data)):
            raise UserError(InvalidSeriesValue.format(col))
        if np.any(np.less(float_data, VALUE_LOWER_BOUND)) or \
                np.any(np.greater(float_data, VALUE_UPPER_BOUND)):
            raise UserError(ValueOverflow.format(col))
        data_columns[col] = float_data
    result = sr_detector.detect(timestamps, data_columns,
                                detect_mode=detect_mode, batch_size=batch_size,
                                threshold=threshold, sensitivity=sensitivity)
    # FIX: idiomatic truthiness test instead of "is True" (appendMode is a
    # bool produced by str2bool at the CLI layer).
    if appendMode:
        # Keep the original input columns alongside the detection results.
        result = pd.merge(data_frame_directory.data, result,
                          left_index=True, right_index=True)
    save_data_frame_to_directory(
        output_path, result,
        compute_stats_in_visualization=compute_stats_in_visualization)
def main(args=None):
    """Module entry point: compute a correlation matrix and persist it."""
    input_dir = args.input_dir
    corr_type = args.correlation_method
    # Echo the received parameters for traceability.
    logger.debug(f'input-dir {input_dir}')
    logger.debug(f'correlation-method {corr_type}')
    logger.debug(f'output-dir {args.output_dir}')

    frame = load_data_frame_from_directory(args.input_dir).data
    correlations = ComputeCorrelationModule(corr_type).compute(frame)
    logger.debug(f'correlation matrix shape {correlations.shape}')

    save_data_frame_to_directory(
        save_to=args.output_dir,
        data=correlations,
        schema=DataFrameSchema.data_frame_to_dict(correlations))
# NOTE(review): incomplete script fragment -- it is cut mid-call at
# `recall_at_k( rating_true,` so it cannot be reformatted or rewritten safely.
# It parses recommender-evaluation CLI arguments, loads the true and predicted
# rating frames, logs every parameter, then starts computing recall@k.
# Left byte-identical pending the missing tail of the chunk.
parser.add_argument("--score-result", help="Result of the computation.") args, _ = parser.parse_known_args() rating_true = load_data_frame_from_directory(args.rating_true).data rating_pred = load_data_frame_from_directory(args.rating_pred).data col_user = args.col_user col_item = args.col_item col_rating = args.col_rating col_prediction = args.col_prediction relevancy_method = args.relevancy_method k = args.k threshold = args.threshold logger.debug(f"Received parameters:") logger.debug(f"User: {col_user}") logger.debug(f"Item: {col_item}") logger.debug(f"Rating: {col_rating}") logger.debug(f"Prediction: {col_prediction}") logger.debug(f"Relevancy: {relevancy_method}") logger.debug(f"K: {k}") logger.debug(f"Threshold: {threshold}") logger.debug(f"Rating True path: {args.rating_true}") logger.debug(f"Shape of loaded DataFrame: {rating_true.shape}") logger.debug(f"Rating Pred path: {args.rating_pred}") logger.debug(f"Shape of loaded DataFrame: {rating_pred.shape}") eval_recall = recall_at_k( rating_true,
# NOTE(review): incomplete script fragment -- it begins with a stray ')'
# closing an add_argument call from outside this view, and the rank-0 branch
# appears to continue past the end of the chunk (rank 0 loads data that the
# other ranks wait to receive, but no comm.send is visible here).
# MPI hello-world: non-zero ranks block on comm.recv from rank 0.
# Left byte-identical pending the surrounding context.
) args, _ = parser.parse_known_args() logger.info(f"Hello world MPI from {PACKAGE_NAME} {VERSION}") comm = MPI.COMM_WORLD size = comm.Get_size() rank = comm.Get_rank() str_param = args.string_parameter int_param = args.int_parameter bool_param = args.boolean_parameter enum_param = args.enum_parameter logger.debug(f"Received parameters:") logger.debug(f" {str_param}") logger.debug(f" {int_param}") logger.debug(f" {bool_param}") logger.debug(f" {enum_param}") if rank > 0: logger.debug(f"I'm rank {rank}/{size}, wait for data.") data = comm.recv(source=0, tag=rank) logger.debug(f"Received shape of loaded DataFrame: {data} ") else: logger.debug(f"I'm rank 0/{size}, load and dump.") logger.debug(f"Input path: {args.input_path}") data_frame_directory = load_data_frame_from_directory(args.input_path)
# NOTE(review): incomplete script fragment -- cut mid-call at both ends
# (starts inside an add_argument call, ends inside predict_ratings(...)),
# so it cannot be safely reformatted. SAR scoring: parses flags, loads the
# trained model and dataset, and dispatches on ScoreType.
# SUSPECTED BUG to confirm with full context: recommend_items is passed
# `remove_seen=args.remove_seen_items` (the raw CLI string) instead of the
# parsed boolean `remove_seen_items` computed just above via strtobool.
help='Remove items seen in training from recommendation') parser.add_argument('--score-result', help='Ratings or items to output') args, _ = parser.parse_known_args() logger.info(f"Arguments: {args}") sort_top_k = strtobool(args.sort_top_k) if args.sort_top_k else None remove_seen_items = strtobool( args.remove_seen_items) if args.remove_seen_items else None normalize = strtobool(args.normalize) if args.normalize else None sar_model = load_model_from_directory(args.trained_model, model_loader=joblib_loader).data dataset_to_score = load_data_frame_from_directory( args.dataset_to_score).data logger.debug(f"Shape of loaded DataFrame: {dataset_to_score.shape}") score_sar_module = ScoreSARModule(model=sar_model, input_data=dataset_to_score) score_type = ScoreType(args.score_type) if score_type == ScoreType.ITEM_RECOMMENDATION: score_result = score_sar_module.recommend_items( ranking_metric=RankingMetric(args.ranking_metric), top_k=args.top_k, sort_top_k=sort_top_k, remove_seen=args.remove_seen_items, normalize=normalize) elif score_type == ScoreType.RATING_PREDICTION: score_result = score_sar_module.predict_ratings( items_to_predict=ItemSet(args.items_to_predict),
# Tail of the SR anomaly-detection CLI: final arguments, parameter echo,
# and the call into invoke().
parser.add_argument(
    '--compute-stats-in-visualization',
    type=str2bool,
    default=False,
    help='Enable this parameter to get stats visualization.'
)
parser.add_argument(
    '--output-path',
    help='Output Dataframe path'
)
args, _ = parser.parse_known_args()

logger.info(f"Hello world from {PACKAGE_NAME} {VERSION}")
logger.debug("Received parameters:")
logger.debug(f"input: {args.input_path}")
logger.debug(f"detect mode: {args.detect_mode}")
logger.debug(f"timestamp column: {args.timestamp_column}")
logger.debug(f"value column: {args.value_column}")
logger.debug(f"batch size: {args.batch_size}")
logger.debug(f"threshold: {args.threshold}")
logger.debug(f"sensitivity: {args.sensitivity}")
logger.debug(f"appendMode: {args.append_mode}")
# BUG FIX: this value was previously logged under the duplicate label
# "appendMode", hiding the real compute-stats setting in the logs.
logger.debug(f"computeStatsInVisualization: {args.compute_stats_in_visualization}")
logger.debug(f"output path: {args.output_path}")

invoke(args.input_path, args.detect_mode, args.timestamp_column,
       args.value_column, args.batch_size, args.threshold, args.sensitivity,
       args.append_mode, args.compute_stats_in_visualization,
       args.output_path)
def main(args):
    """Score an input frame with a previously trained PCA module.

    Args:
        args: parsed CLI namespace; reads input_dir, model_input_dir
            and output_dir.
    """
    # Echo the received parameters for traceability.
    logger.debug(f'input-dir {args.input_dir}')
    logger.debug(f'model input dir {args.model_input_dir}')
    logger.debug(f'output-dir {args.output_dir}')

    frame = load_data_frame_from_directory(args.input_dir).data
    logger.debug(f'{frame.describe()}\n shape{frame.shape} ')

    module = load_model_from_directory(args.model_input_dir,
                                       model_loader=pcamodule_loader).data
    logger.debug(module.pca_instance)

    scored = score(module, frame)
    logger.debug(f'output shape {scored.shape}')
    save_data_frame_to_directory(
        save_to=args.output_dir,
        data=scored,
        schema=DataFrameSchema.data_frame_to_dict(scored))
# SAR training head: column-mapping CLI arguments, data load, model build/fit.
parser.add_argument("--output-model", help="The output model directory.")
parser.add_argument("--col-user", type=str, help="A string parameter.")
parser.add_argument("--col-item", type=str, help="A string parameter.")
parser.add_argument("--col-rating", type=str, help="A string parameter.")
parser.add_argument("--col-timestamp", type=str, help="A string parameter.")
parser.add_argument("--normalize", type=str)
parser.add_argument("--time-decay", type=str)
args, _ = parser.parse_known_args()

# Load ratings and force the rating column to be numeric before fitting.
input_df = load_data_frame_from_directory(args.input_path).data
input_df[args.col_rating] = input_df[args.col_rating].astype(float)
logger.debug(f"Shape of loaded DataFrame: {input_df.shape}")
logger.debug(f"Cols of DataFrame: {input_df.columns}")

model = SAR(
    col_user=args.col_user,
    col_item=args.col_item,
    col_rating=args.col_rating,
    col_timestamp=args.col_timestamp,
    normalize=strtobool(args.normalize),
    timedecay_formula=strtobool(args.time_decay),
)
start_time = time.time()
model.fit(input_df)
def main(args=None):
    """Compute SGT sequence embeddings; save both embeddings and the model."""
    seq_col = args.sequence_column
    id_col = args.identifier_column
    length_sensitive = args.length_sensitive
    kappa = args.kappa

    # Echo every received parameter for traceability.
    for message in (f'input-dir {args.input_dir}',
                    f'sequence-column {seq_col}',
                    f'identifier-column {id_col}',
                    f'length-sensitive {length_sensitive}',
                    f'kappa {kappa}',
                    f'output-dir {args.output_dir}',
                    f'model output dir {args.model_output_dir}'):
        logger.debug(message)

    input_df = load_data_frame_from_directory(args.input_dir).data
    if input_df[seq_col].isnull().sum().sum() > 0:
        # Sequences with missing values cannot be embedded; abort early.
        logger.debug(f'column {seq_col} contains missing values ')
        sys.exit(1)

    embedding_df, sgt = compute_embeddings(input_df, seq_col, kappa,
                                           length_sensitive, id_col)
    logger.debug(f'embedding shape {embedding_df.shape}')

    save_data_frame_to_directory(
        save_to=args.output_dir,
        data=embedding_df,
        schema=DataFrameSchema.data_frame_to_dict(embedding_df))
    save_model_to_directory(save_to=args.model_output_dir,
                            model_dumper=sgt_dumper(data=sgt))
# NOTE(review): incomplete script fragment -- it begins with a stray ')'
# closing an add_argument call from outside this view and is cut mid-call at
# `python_stratified_split(...,seed=args.seed,`. Stratified train/test split:
# parses split parameters, loads the frame, logs parameters, starts the split.
# Left byte-identical pending the missing tail of the chunk.
) parser.add_argument( "--output-test", help="The output test data directory.", ) args, _ = parser.parse_known_args() input_df = load_data_frame_from_directory(args.input_path).data ratio = args.ratio col_user = args.col_user col_item = args.col_item seed = args.seed logger.debug(f"Received parameters:") logger.debug(f"Ratio: {ratio}") logger.debug(f"User: {col_user}") logger.debug(f"Item: {col_item}") logger.debug(f"Seed: {seed}") logger.debug(f"Input path: {args.input_path}") logger.debug(f"Shape of loaded DataFrame: {input_df.shape}") logger.debug(f"Cols of DataFrame: {input_df.columns}") output_train, output_test = python_stratified_split( input_df, ratio=args.ratio, col_user=args.col_user, col_item=args.col_item, seed=args.seed,
def main(args):
    """Compute pairwise textual similarity for one text column.

    Embeds the chosen column with the selected transformer, writes the
    similarity matrix to sim_dir and the raw embeddings to embedding_dir.

    Args:
        args: parsed CLI namespace; reads transformer, input_dir,
            column_name, distance, sim_dir and embedding_dir.
    """
    transformer = SUPPORTED_TRANSFORMERS[args.transformer]
    logger.debug(f'input-dir {args.input_dir}')
    logger.debug(f'column {args.column_name}')
    logger.debug(f'distance {args.distance}')
    logger.debug(f'transformer {transformer}')
    logger.debug(f'sim-dir {args.sim_dir}')
    input_df = load_data_frame_from_directory(args.input_dir).data
    if input_df[args.column_name].isnull().sum().sum() > 0:
        # Missing text cannot be embedded; fail fast.
        # BUG FIX: message previously read 'column<name> ... values ' with a
        # missing space after 'column' and a stray trailing space.
        logger.debug(f'column {args.column_name} contains missing values')
        sys.exit(1)
    sts = TextualSimilarity(transformer=transformer,
                            distance_func=args.distance)
    embedding_df, sim_df = sts.fit_transform(input_df[args.column_name].values)
    # Prepend the original text column so each similarity row is identifiable.
    sim_df.insert(0, args.column_name, input_df[args.column_name])
    logger.debug(f'similarity matrix shape {sim_df.shape}')
    logger.debug(f'embedding shape {embedding_df.shape}')
    save_data_frame_to_directory(
        save_to=args.sim_dir,
        data=sim_df,
        schema=DataFrameSchema.data_frame_to_dict(sim_df))
    save_data_frame_to_directory(
        save_to=args.embedding_dir,
        data=embedding_df,
        schema=DataFrameSchema.data_frame_to_dict(embedding_df))
def main(args):
    """Fit a PCA module on the input frame; save the transform and the model.

    Args:
        args: parsed CLI namespace with the transformer parameters plus
            input_dir, output_dir and model_output_dir.
    """
    # Echo the received parameters for traceability.
    logger.debug(f'input-dir {args.input_dir}')
    logger.debug(f'output-dir {args.output_dir}')
    logger.debug(f'model output dir {args.model_output_dir}')

    frame = load_data_frame_from_directory(args.input_dir).data
    logger.debug(f'{frame.describe()}\n shape{frame.shape} ')

    module = PCAModule(args)
    logger.debug(module.pca_instance)
    transformed = module.fit_transform(frame)
    module.log_metrics(frame.columns)
    logger.debug(f'output shape {transformed.shape}')

    save_data_frame_to_directory(
        save_to=args.output_dir,
        data=transformed,
        schema=DataFrameSchema.data_frame_to_dict(transformed))
    save_model_to_directory(save_to=args.model_output_dir,
                            model_dumper=pca_module_dumper(data=module))
# NOTE(review): incomplete script fragment -- it begins inside an
# add_argument call ('--enum-parameter', ...), so it cannot be reformatted
# safely. Hello-world passthrough module: parses parameters, logs them,
# loads a DataFrame and saves it unchanged to the output path.
# Left byte-identical pending the missing head of the chunk.
'--enum-parameter', type=str, help='A enum parameter.', ) parser.add_argument( '--output-path', help='The output directory.', ) args, _ = parser.parse_known_args() logger.info(f"Hello world from {PACKAGE_NAME} {VERSION}") str_param = args.string_parameter int_param = args.int_parameter bool_param = args.boolean_parameter enum_param = args.enum_parameter logger.debug(f"Received parameters:") logger.debug(f" {str_param}") logger.debug(f" {int_param}") logger.debug(f" {bool_param}") logger.debug(f" {enum_param}") logger.debug(f"Input path: {args.input_path}") data_frame_directory = load_data_frame_from_directory(args.input_path) logger.debug(f"Shape of loaded DataFrame: {data_frame_directory.data.shape}") logger.debug(f"Output path: {args.output_path}") save_data_frame_to_directory(args.output_path, data_frame_directory.data)
def main():
    """CLI entry point for the spectral-residual anomaly-detection module.

    Parses the command line, echoes every parameter to the debug log, and
    delegates to invoke().
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--input-path', help='Input Dataframe path')
    parser.add_argument('--detect-mode',
                        choices=['AnomalyOnly', 'AnomalyAndMargin'],
                        help='Specify the detect mode.')
    parser.add_argument('--timestamp-column',
                        help='Choose the column that contains timestamps.')
    parser.add_argument('--value-column',
                        help='Choose the column that contains values.')
    parser.add_argument(
        '--batch-size',
        type=int,
        # FIX: corrected "perfomed" typo in user-facing help text.
        help='This parameter specifies the size of each batch that the '
             'detection is performed.')
    parser.add_argument(
        '--threshold',
        type=float,
        help='This parameter specifies the threshold anomaly score that a '
             'point is judged as anomaly.')
    parser.add_argument(
        '--sensitivity',
        type=float,
        help='This parameter is used in AnomalyAndMargin mode to control '
             'the width of margin.')
    parser.add_argument(
        '--append-mode',
        type=str2bool,
        default=False,
        # BUG FIX: help text was copy-pasted from --sensitivity; it now
        # describes what append mode actually does (see invoke(), which
        # merges the input columns into the result when this is set).
        help='Enable this parameter to append the input columns to the '
             'detection results.')
    parser.add_argument(
        '--compute-stats-in-visualization',
        type=str2bool,
        default=False,
        help='Enable this parameter to get stats visualization.')
    parser.add_argument('--output-path', help='Output Dataframe path')
    args, _ = parser.parse_known_args()

    logger.info(f"Hello world from {PACKAGE_NAME} {VERSION}")
    logger.debug("Received parameters:")
    logger.debug(f"input: {args.input_path}")
    logger.debug(f"detect mode: {args.detect_mode}")
    logger.debug(f"timestamp column: {args.timestamp_column}")
    logger.debug(f"value column: {args.value_column}")
    logger.debug(f"batch size: {args.batch_size}")
    logger.debug(f"threshold: {args.threshold}")
    logger.debug(f"sensitivity: {args.sensitivity}")
    logger.debug(f"appendMode: {args.append_mode}")
    # BUG FIX: this value was previously logged under the duplicate label
    # "appendMode", hiding the real compute-stats setting in the logs.
    logger.debug(f"computeStatsInVisualization: {args.compute_stats_in_visualization}")
    logger.debug(f"output path: {args.output_path}")

    invoke(args.input_path, args.detect_mode, args.timestamp_column,
           args.value_column, args.batch_size, args.threshold,
           args.sensitivity, args.append_mode,
           args.compute_stats_in_visualization, args.output_path)