Example #1
def main(args):
    '''
        Module entry point function
    '''

    seq_col = args.sequence_column
    id_col = args.identifier_column

    logger.debug(f'input-dir {args.input_dir}')
    logger.debug(f'model input dir {args.model_input_dir}')
    logger.debug(f'sequence-column {seq_col}')
    logger.debug(f'identifier-column {id_col}')
    logger.debug(f'output-dir {args.output_dir}')

    sgt = load_model_from_directory(args.model_input_dir,
                                    model_loader=joblib_loader).data
    input_df = load_data_frame_from_directory(args.input_dir).data

    if input_df[seq_col].isnull().sum() > 0:
        print(f'column {seq_col} contains missing values')
        sys.exit(1)

    embedding_df = score(input_df, sgt, seq_col, id_col)
    print(f'embedding shape {embedding_df.shape}')
    print(embedding_df.head())

    save_data_frame_to_directory(
        save_to=args.output_dir,
        data=embedding_df,
        schema=DataFrameSchema.data_frame_to_dict(embedding_df))
Example #2
def detect(timestamp,
           data_to_detect,
           detect_mode,
           batch_size,
           threshold=0.3,
           sensitivity=99):

    column_length = len(data_to_detect.columns)
    if column_length == 1:
        logger.debug('single column to detect')

        frame = pd.DataFrame(columns=['timestamp', 'value'])
        frame['timestamp'] = timestamp
        frame['value'] = data_to_detect.iloc[:, 0]
        output = sr_detect(frame, detect_mode, batch_size, threshold,
                           sensitivity)
    else:
        logger.debug(f'detect {column_length} columns')
        output = pd.DataFrame()

        for col in data_to_detect.columns:
            frame = pd.DataFrame(columns=['timestamp', 'value'])
            frame['timestamp'] = timestamp
            frame['value'] = data_to_detect[col]
            result = sr_detect(frame, detect_mode, batch_size, threshold,
                               sensitivity)
            result.columns = [f'{rc}_{col}' for rc in result.columns]
            output = pd.concat((output, result), axis=1)

    return output
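sr_detect is imported from elsewhere in the module and not shown here. As a rough, hypothetical sketch of the spectral-residual saliency idea it presumably builds on (Ren et al., KDD 2019), not the module's actual implementation; detect_mode, batch_size and sensitivity are ignored for brevity:

import numpy as np
import pandas as pd

EPS = 1e-8

def sr_detect_sketch(frame, threshold=0.3, window=3):
    # Saliency map from the spectral residual of the log-amplitude spectrum.
    values = frame['value'].to_numpy(dtype=float)
    fft = np.fft.fft(values)
    amp = np.abs(fft)
    log_amp = np.log(amp + EPS)
    # Subtract an average-filtered copy of the log spectrum; the residual
    # emphasizes frequencies that stand out from their neighborhood.
    residual = log_amp - np.convolve(log_amp, np.ones(window) / window,
                                     mode='same')
    saliency = np.abs(np.fft.ifft(np.exp(residual) * fft / (amp + EPS)))
    # Score each point relative to the mean saliency; flag points above threshold.
    score = (saliency - saliency.mean()) / (saliency.mean() + EPS)
    return pd.DataFrame({'isAnomaly': score > threshold, 'score': score},
                        index=frame.index)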
Example #3
def invoke(input_path, detect_mode, timestamp_column, value_column, batch_size,
           threshold, sensitivity, append_mode, compute_stats_in_visualization,
           output_path):
    data_frame_directory = load_data_frame_from_directory(input_path)

    logger.debug(f"Shape of loaded DataFrame: {data_frame_directory.data.shape}")

    if data_frame_directory.data.shape[0] < MIN_POINTS:
        raise UserError(NotEnoughPoints.format(MIN_POINTS))

    if 0 < batch_size < MIN_POINTS:
        raise UserError(InvalidBatchSize.format(MIN_POINTS))

    query_string = unquote(timestamp_column)
    timestamp_column_selector = ColumnSelection(query_string)
    timestamp = timestamp_column_selector.select_dataframe_directory(data_frame_directory).data

    timestamps = pd.to_datetime(timestamp.iloc[:, 0].values)

    if np.any(np.isnat(timestamps)):
        raise UserError(InvalidTimestamps)

    res = is_timestamp_ascending(timestamps)
    if res == -1:
        raise UserError(InvalidSeriesOrder)
    elif res == -2:
        raise UserError(DuplicateSeriesTimestamp)


    query_string = unquote(value_column)
    data_column_selector = ColumnSelection(query_string)
    data_columns = data_column_selector.select_dataframe_directory(data_frame_directory).data

    for col in data_columns.columns:
        try:
            float_data = data_columns[col].apply(float)
        except Exception as e:
            raise UserError(InvalidValueFormat.format(col)) from e

        if not np.all(np.isfinite(float_data)):
            raise UserError(InvalidSeriesValue.format(col))

        if np.any(np.less(float_data, VALUE_LOWER_BOUND)) or np.any(np.greater(float_data, VALUE_UPPER_BOUND)):
            raise UserError(ValueOverflow.format(col))

        data_columns[col] = float_data

    result = sr_detector.detect(timestamps, data_columns, detect_mode=detect_mode,
                                batch_size=batch_size, threshold=threshold, sensitivity=sensitivity)

    if append_mode:
        result = pd.merge(data_frame_directory.data, result, left_index=True, right_index=True)

    save_data_frame_to_directory(output_path, result, compute_stats_in_visualization=compute_stats_in_visualization)
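The is_timestamp_ascending helper is not shown in the snippet. A minimal sketch consistent with how its return codes are consumed above (-1 for an out-of-order series, -2 for duplicate timestamps):

import numpy as np

def is_timestamp_ascending(timestamps):
    # Return 0 for a strictly ascending series, -1 if any timestamp
    # decreases, and -2 if consecutive timestamps are duplicated.
    diffs = np.diff(np.asarray(timestamps))
    if np.any(diffs < np.timedelta64(0)):
        return -1
    if np.any(diffs == np.timedelta64(0)):
        return -2
    return 0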
Example #4
def main(args):
    '''
        Module entry function
    '''
    input_dir = args.input_dir
    corr_type = args.correlation_method

    logger.debug(f'input-dir {input_dir}')
    logger.debug(f'correlation-method {corr_type}')
    logger.debug(f'output-dir {args.output_dir}')
    input_df = load_data_frame_from_directory(args.input_dir).data

    corr_df = ComputeCorrelationModule(corr_type).compute(input_df)
    logger.debug(f'correlation matrix shape {corr_df.shape}')

    save_data_frame_to_directory(
        save_to=args.output_dir,
        data=corr_df,
        schema=DataFrameSchema.data_frame_to_dict(corr_df))
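ComputeCorrelationModule is defined elsewhere. A minimal sketch, assuming it simply wraps pandas.DataFrame.corr for the methods pandas supports ('pearson', 'kendall', 'spearman'):

import pandas as pd

class ComputeCorrelationModule:
    SUPPORTED_METHODS = ('pearson', 'kendall', 'spearman')

    def __init__(self, corr_type):
        if corr_type not in self.SUPPORTED_METHODS:
            raise ValueError(f'unsupported correlation method: {corr_type}')
        self.corr_type = corr_type

    def compute(self, input_df: pd.DataFrame) -> pd.DataFrame:
        # Correlate numeric columns only; pandas excludes NaNs pairwise.
        return input_df.select_dtypes('number').corr(method=self.corr_type)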
    parser.add_argument("--score-result", help="Result of the computation.")

    args, _ = parser.parse_known_args()

    rating_true = load_data_frame_from_directory(args.rating_true).data
    rating_pred = load_data_frame_from_directory(args.rating_pred).data

    col_user = args.col_user
    col_item = args.col_item
    col_rating = args.col_rating
    col_prediction = args.col_prediction
    relevancy_method = args.relevancy_method
    k = args.k
    threshold = args.threshold

    logger.debug(f"Received parameters:")
    logger.debug(f"User:       {col_user}")
    logger.debug(f"Item:       {col_item}")
    logger.debug(f"Rating:     {col_rating}")
    logger.debug(f"Prediction: {col_prediction}")
    logger.debug(f"Relevancy:  {relevancy_method}")
    logger.debug(f"K:          {k}")
    logger.debug(f"Threshold:  {threshold}")

    logger.debug(f"Rating True path: {args.rating_true}")
    logger.debug(f"Shape of loaded DataFrame: {rating_true.shape}")
    logger.debug(f"Rating Pred path: {args.rating_pred}")
    logger.debug(f"Shape of loaded DataFrame: {rating_pred.shape}")

    eval_recall = recall_at_k(
        rating_true,
        rating_pred,
        col_user=col_user,
        col_item=col_item,
        col_rating=col_rating,
        col_prediction=col_prediction,
        relevancy_method=relevancy_method,
        k=k,
        threshold=threshold,
    )
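For intuition, recall@k is the fraction of a user's relevant items that appear among the top-k recommendations. A toy check of the idea, not the recommenders implementation:

true_items = {'a', 'b', 'c', 'd'}    # items the user actually interacted with
top_3_recs = ['a', 'x', 'c']         # hypothetical top-3 recommendations
recall_at_3 = len(true_items.intersection(top_3_recs)) / len(true_items)
print(recall_at_3)                   # 2 relevant hits out of 4 -> 0.5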

Example #6
    args, _ = parser.parse_known_args()

    logger.info(f"Hello world MPI from {PACKAGE_NAME} {VERSION}")

    comm = MPI.COMM_WORLD
    size = comm.Get_size()
    rank = comm.Get_rank()

    str_param = args.string_parameter
    int_param = args.int_parameter
    bool_param = args.boolean_parameter
    enum_param = args.enum_parameter

    logger.debug(f"Received parameters:")
    logger.debug(f"    {str_param}")
    logger.debug(f"    {int_param}")
    logger.debug(f"    {bool_param}")
    logger.debug(f"    {enum_param}")

    if rank > 0:
        logger.debug(f"I'm rank {rank}/{size}, wait for data.")
        data = comm.recv(source=0, tag=rank)
        logger.debug(f"Received shape of loaded DataFrame: {data} ")
    else:
        logger.debug(f"I'm rank 0/{size}, load and dump.")

        logger.debug(f"Input path: {args.input_path}")
        data_frame_directory = load_data_frame_from_directory(args.input_path)
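        # Hypothetical continuation (the snippet is cut off here): rank 0 sends
        # the DataFrame shape to every other rank, matching the
        # comm.recv(source=0, tag=rank) calls above.
        for dest in range(1, size):
            comm.send(data_frame_directory.data.shape, dest=dest, tag=dest)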
Example #7
        help='Remove items seen in training from recommendation')
    parser.add_argument('--score-result', help='Ratings or items to output')

    args, _ = parser.parse_known_args()

    logger.info(f"Arguments: {args}")
    sort_top_k = strtobool(args.sort_top_k) if args.sort_top_k else None
    remove_seen_items = strtobool(
        args.remove_seen_items) if args.remove_seen_items else None
    normalize = strtobool(args.normalize) if args.normalize else None

    sar_model = load_model_from_directory(args.trained_model,
                                          model_loader=joblib_loader).data
    dataset_to_score = load_data_frame_from_directory(
        args.dataset_to_score).data
    logger.debug(f"Shape of loaded DataFrame: {dataset_to_score.shape}")

    score_sar_module = ScoreSARModule(model=sar_model,
                                      input_data=dataset_to_score)

    score_type = ScoreType(args.score_type)
    if score_type == ScoreType.ITEM_RECOMMENDATION:
        score_result = score_sar_module.recommend_items(
            ranking_metric=RankingMetric(args.ranking_metric),
            top_k=args.top_k,
            sort_top_k=sort_top_k,
            remove_seen=remove_seen_items,
            normalize=normalize)
    elif score_type == ScoreType.RATING_PREDICTION:
        score_result = score_sar_module.predict_ratings(
            items_to_predict=ItemSet(args.items_to_predict))
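ScoreType and RankingMetric are enums defined elsewhere in the module. A hypothetical sketch of ScoreType consistent with how it is constructed from a CLI string above; the member values are assumptions:

from enum import Enum

class ScoreType(Enum):
    ITEM_RECOMMENDATION = 'Item recommendation'   # assumed CLI value
    RATING_PREDICTION = 'Rating prediction'       # assumed CLI value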
Example #8
    parser.add_argument(
        '--compute-stats-in-visualization', type=str2bool, default=False,
        help='Enable this parameter to get stats visualization.'
    )

    parser.add_argument(
        '--output-path',
        help='Output Dataframe path'
    )

    args, _ = parser.parse_known_args()

    logger.info(f"Hello world from {PACKAGE_NAME} {VERSION}")

    logger.debug("Received parameters:")
    logger.debug(f"input: {args.input_path}")
    logger.debug(f"detect mode: {args.detect_mode}")
    logger.debug(f"timestamp column: {args.timestamp_column}")
    logger.debug(f"value column: {args.value_column}")
    logger.debug(f"batch size: {args.batch_size}")
    logger.debug(f"threshold: {args.threshold}")
    logger.debug(f"sensitivity: {args.sensitivity}")
    logger.debug(f"appendMode: {args.append_mode}")
    logger.debug(f"appendMode: {args.compute_stats_in_visualization}")
    logger.debug(f"output path: {args.output_path}")

    invoke(args.input_path, args.detect_mode, args.timestamp_column,
           args.value_column, args.batch_size, args.threshold,
           args.sensitivity, args.append_mode,
           args.compute_stats_in_visualization, args.output_path)
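The str2bool used as an argparse type above is a small helper; plain type=bool would treat any non-empty string, including 'False', as truthy. A common sketch:

import argparse

def str2bool(value):
    # Map common textual spellings of booleans onto real bools for argparse.
    if isinstance(value, bool):
        return value
    if value.lower() in ('true', 't', 'yes', 'y', '1'):
        return True
    if value.lower() in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError(f'boolean value expected, got {value!r}')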
Example #9
def main(args):
    '''
    Module entry function

    args:
        args: list, user parameters
    '''

    logger.debug(f'input-dir {args.input_dir}')
    logger.debug(f'model input dir {args.model_input_dir}')

    logger.debug(f'output-dir {args.output_dir}')

    input_df = load_data_frame_from_directory(args.input_dir).data
    logger.debug(f'{input_df.describe()}\nshape {input_df.shape}')

    pca_module = load_model_from_directory(args.model_input_dir,
                                           model_loader=pcamodule_loader).data

    logger.debug(pca_module.pca_instance)

    output_df = score(pca_module, input_df)

    logger.debug(f'output shape {output_df.shape}')
    save_data_frame_to_directory(
        save_to=args.output_dir,
        data=output_df,
        schema=DataFrameSchema.data_frame_to_dict(output_df))
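The score helper is imported from the training counterpart of this module. A minimal sketch, assuming pca_module.pca_instance is a fitted scikit-learn PCA and that output columns are simply numbered (both assumptions):

import pandas as pd

def score(pca_module, input_df):
    # Project the input onto the principal components fitted at training time.
    transformed = pca_module.pca_instance.transform(input_df)
    return pd.DataFrame(transformed,
                        columns=[f'PC{i}' for i in range(transformed.shape[1])],
                        index=input_df.index)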
Example #10
    parser.add_argument("--output-model", help="The output model directory.")
    parser.add_argument("--col-user", type=str, help="A string parameter.")
    parser.add_argument("--col-item", type=str, help="A string parameter.")
    parser.add_argument("--col-rating", type=str, help="A string parameter.")
    parser.add_argument("--col-timestamp",
                        type=str,
                        help="A string parameter.")
    parser.add_argument("--normalize", type=str)
    parser.add_argument("--time-decay", type=str)

    args, _ = parser.parse_known_args()

    input_df = load_data_frame_from_directory(args.input_path).data
    input_df[args.col_rating] = input_df[args.col_rating].astype(float)

    logger.debug(f"Shape of loaded DataFrame: {input_df.shape}")
    logger.debug(f"Cols of DataFrame: {input_df.columns}")

    model = SAR(
        col_user=args.col_user,
        col_item=args.col_item,
        col_rating=args.col_rating,
        col_timestamp=args.col_timestamp,
        normalize=strtobool(args.normalize),
        timedecay_formula=strtobool(args.time_decay),
    )

    start_time = time.time()

    model.fit(input_df)
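    # Plausible continuation (the snippet ends here): log elapsed training time
    # using the start_time captured above.
    logger.debug(f"Training time: {time.time() - start_time:.2f} seconds")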
Example #11
def main(args):
    '''
        Module entry point function
    '''

    seq_col = args.sequence_column
    id_col = args.identifier_column
    length_sensitive = args.length_sensitive
    kappa = args.kappa

    logger.debug(f'input-dir {args.input_dir}')
    logger.debug(f'sequence-column {seq_col}')
    logger.debug(f'identifier-column {id_col}')
    logger.debug(f'length-sensitive {length_sensitive}')
    logger.debug(f'kappa {args.kappa}')
    logger.debug(f'output-dir {args.output_dir}')
    logger.debug(f'model output dir {args.model_output_dir}')

    input_df = load_data_frame_from_directory(args.input_dir).data

    if input_df[seq_col].isnull().sum() > 0:
        logger.debug(f'column {seq_col} contains missing values')
        sys.exit(1)

    embedding_df, sgt = compute_embeddings(input_df, seq_col, kappa,
                                           length_sensitive, id_col)

    logger.debug(f'embedding shape {embedding_df.shape}')

    save_data_frame_to_directory(
        save_to=args.output_dir,
        data=embedding_df,
        schema=DataFrameSchema.data_frame_to_dict(embedding_df))

    save_model_to_directory(save_to=args.model_output_dir,
                            model_dumper=sgt_dumper(data=sgt))
Example #12
    )
    parser.add_argument(
        "--output-test",
        help="The output test data directory.",
    )

    args, _ = parser.parse_known_args()

    input_df = load_data_frame_from_directory(args.input_path).data

    ratio = args.ratio
    col_user = args.col_user
    col_item = args.col_item
    seed = args.seed

    logger.debug(f"Received parameters:")
    logger.debug(f"Ratio:    {ratio}")
    logger.debug(f"User:    {col_user}")
    logger.debug(f"Item:    {col_item}")
    logger.debug(f"Seed:    {seed}")

    logger.debug(f"Input path: {args.input_path}")
    logger.debug(f"Shape of loaded DataFrame: {input_df.shape}")
    logger.debug(f"Cols of DataFrame: {input_df.columns}")

    output_train, output_test = python_stratified_split(
        input_df,
        ratio=args.ratio,
        col_user=args.col_user,
        col_item=args.col_item,
        seed=args.seed,
    )
Example #13
def main(args):
    '''
        Module entry function
    '''

    transformer = SUPPORTED_TRANSFORMERS[args.transformer]

    logger.debug(f'input-dir {args.input_dir}')
    logger.debug(f'column {args.column_name}')
    logger.debug(f'distance {args.distance}')
    logger.debug(f'transformer {transformer}')
    logger.debug(f'sim-dir {args.sim_dir}')

    input_df = load_data_frame_from_directory(args.input_dir).data

    if input_df[args.column_name].isnull().sum() > 0:
        logger.debug(f'column {args.column_name} contains missing values')
        sys.exit(1)

    sts = TextualSimilarity(transformer=transformer,
                            distance_func=args.distance)
    embedding_df, sim_df = sts.fit_transform(input_df[args.column_name].values)

    sim_df.insert(0, args.column_name, input_df[args.column_name])

    logger.debug(f'similarity matrix shape {sim_df.shape}')
    logger.debug(f'embedding shape {embedding_df.shape}')

    save_data_frame_to_directory(
        save_to=args.sim_dir,
        data=sim_df,
        schema=DataFrameSchema.data_frame_to_dict(sim_df))

    save_data_frame_to_directory(
        save_to=args.embedding_dir,
        data=embedding_df,
        schema=DataFrameSchema.data_frame_to_dict(embedding_df))
Example #14
def main(args):
    '''
    Module entry function

    args:
        args: list, transformer parameters requested by user
    '''

    logger.debug(f'input-dir {args.input_dir}')
    logger.debug(f'output-dir {args.output_dir}')
    logger.debug(f'model output dir {args.model_output_dir}')

    input_df = load_data_frame_from_directory(args.input_dir).data
    logger.debug(f'{input_df.describe()}\nshape {input_df.shape}')

    pca_module = PCAModule(args)
    logger.debug(pca_module.pca_instance)

    output_df = pca_module.fit_transform(input_df)
    pca_module.log_metrics(input_df.columns)

    logger.debug(f'output shape {output_df.shape}')
    save_data_frame_to_directory(
        save_to=args.output_dir,
        data=output_df,
        schema=DataFrameSchema.data_frame_to_dict(output_df))

    save_model_to_directory(save_to=args.model_output_dir,
                            model_dumper=pca_module_dumper(data=pca_module))
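PCAModule itself is not shown. A minimal self-contained sketch, assuming args carries an n_components attribute (an assumption; the real parameter set is not visible in the snippet):

import logging

import pandas as pd
from sklearn.decomposition import PCA

logger = logging.getLogger(__name__)

class PCAModule:
    def __init__(self, args):
        self.pca_instance = PCA(n_components=args.n_components)

    def fit_transform(self, input_df: pd.DataFrame) -> pd.DataFrame:
        transformed = self.pca_instance.fit_transform(input_df)
        return pd.DataFrame(transformed,
                            columns=[f'PC{i}' for i in range(transformed.shape[1])],
                            index=input_df.index)

    def log_metrics(self, columns):
        # Report the share of variance captured by each principal component.
        logger.debug(f'fitted on columns: {list(columns)}')
        for i, ratio in enumerate(self.pca_instance.explained_variance_ratio_):
            logger.debug(f'PC{i} explained variance ratio: {ratio:.4f}')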
Example #15
        '--enum-parameter', type=str,
        help='An enum parameter.',
    )
    parser.add_argument(
        '--output-path',
        help='The output directory.',
    )

    args, _ = parser.parse_known_args()

    logger.info(f"Hello world from {PACKAGE_NAME} {VERSION}")

    str_param = args.string_parameter
    int_param = args.int_parameter
    bool_param = args.boolean_parameter
    enum_param = args.enum_parameter

    logger.debug(f"Received parameters:")
    logger.debug(f"    {str_param}")
    logger.debug(f"    {int_param}")
    logger.debug(f"    {bool_param}")
    logger.debug(f"    {enum_param}")

    logger.debug(f"Input path: {args.input_path}")
    data_frame_directory = load_data_frame_from_directory(args.input_path)

    logger.debug(f"Shape of loaded DataFrame: {data_frame_directory.data.shape}")

    logger.debug(f"Output path: {args.output_path}")
    save_data_frame_to_directory(args.output_path, data_frame_directory.data)
Example #16
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('--input-path', help='Input Dataframe path')

    parser.add_argument('--detect-mode',
                        choices=['AnomalyOnly', 'AnomalyAndMargin'],
                        help='Specify the detect mode.')

    parser.add_argument('--timestamp-column',
                        help='Choose the column that contains timestamps.')

    parser.add_argument('--value-column',
                        help='Choose the column that contains values.')

    parser.add_argument(
        '--batch-size',
        type=int,
        help='The size of each batch that the detection is performed on.')

    parser.add_argument(
        '--threshold',
        type=float,
        help='The anomaly score above which a point is judged to be an anomaly.')

    parser.add_argument(
        '--sensitivity',
        type=float,
        help='Used in AnomalyAndMargin mode to control the width of the margin.')

    parser.add_argument(
        '--append-mode',
        type=str2bool,
        default=False,
        help='Enable this parameter to append the detection results to the '
        'input columns.')

    parser.add_argument(
        '--compute-stats-in-visualization',
        type=str2bool,
        default=False,
        help='Enable this parameter to get stats visualization.')

    parser.add_argument('--output-path', help='Output Dataframe path')

    args, _ = parser.parse_known_args()

    logger.info(f"Hello world from {PACKAGE_NAME} {VERSION}")

    logger.debug("Received parameters:")
    logger.debug(f"input: {args.input_path}")
    logger.debug(f"detect mode: {args.detect_mode}")
    logger.debug(f"timestamp column: {args.timestamp_column}")
    logger.debug(f"value column: {args.value_column}")
    logger.debug(f"batch size: {args.batch_size}")
    logger.debug(f"threshold: {args.threshold}")
    logger.debug(f"sensitivity: {args.sensitivity}")
    logger.debug(f"appendMode: {args.append_mode}")
    logger.debug(f"appendMode: {args.compute_stats_in_visualization}")
    logger.debug(f"output path: {args.output_path}")

    invoke(args.input_path, args.detect_mode, args.timestamp_column,
           args.value_column, args.batch_size, args.threshold,
           args.sensitivity, args.append_mode,
           args.compute_stats_in_visualization, args.output_path)
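The snippet defines main() but ends before showing how it is invoked; the conventional entry-point guard would be:

if __name__ == '__main__':
    main()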