Example #1
def main(args):
    '''
    Module entry function

    args:
        args: list, user parameters
    '''

    logger.debug(f'input-dir {args.input_dir}')
    logger.debug(f'model input dir {args.model_input_dir}')

    logger.debug(f'output-dir {args.output_dir}')

    input_df = load_data_frame_from_directory(args.input_dir).data
    logger.debug(f'{input_df.describe()}\n shape {input_df.shape}')

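    # Load the trained PCA model from the model directory with its custom loader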
    pca_module = load_model_from_directory(args.model_input_dir,
                                           model_loader=pcamodule_loader).data

    logger.debug(pca_module.pca_instance)

    output_df = score(pca_module, input_df)

    logger.debug(f'output shape {output_df.shape}')
    save_data_frame_to_directory(
        save_to=args.output_dir,
        data=output_df,
        schema=DataFrameSchema.data_frame_to_dict(output_df))
Example #2
def main(args):
    '''
        Module entry point function
    '''

    seq_col = args.sequence_column
    id_col = args.identifier_column

    logger.debug(f'input-dir {args.input_dir}')
    logger.debug(f'model input dir {args.model_input_dir}')
    logger.debug(f'sequence-column {seq_col}')
    logger.debug(f'identifier-column {id_col}')
    logger.debug(f'output-dir {args.output_dir}')

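    # Load the fitted SGT model and the input DataFrame from their directories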
    sgt = load_model_from_directory(args.model_input_dir,
                                    model_loader=joblib_loader).data
    input_df = load_data_frame_from_directory(args.input_dir).data

    if input_df[seq_col].isnull().sum() > 0:
        print(f'column {seq_col} contains missing values')
        sys.exit(1)

    embedding_df = score(input_df, sgt, seq_col, id_col)
    print(f'embedding shape {embedding_df.shape}')
    print(embedding_df.head())

    save_data_frame_to_directory(
        save_to=args.output_dir,
        data=embedding_df,
        schema=DataFrameSchema.data_frame_to_dict(embedding_df))
Example #3
def main(args):
    '''
    Module entry function

    args:
        args: list, transformer parameters requested by the user
    '''

    logger.debug(f'input-dir {args.input_dir}')
    logger.debug(f'output-dir {args.output_dir}')
    logger.debug(f'model output dir {args.model_output_dir}')

    input_df = load_data_frame_from_directory(args.input_dir).data
    logger.debug(f'{input_df.describe()}\n shape {input_df.shape}')

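    # Fit the PCA transformer on the input data and log its metrics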
    pca_module = PCAModule(args)
    logger.debug(pca_module.pca_instance)

    output_df = pca_module.fit_transform(input_df)
    pca_module.log_metrics(input_df.columns)

    logger.debug(f'output shape {output_df.shape}')
    save_data_frame_to_directory(
        save_to=args.output_dir,
        data=output_df,
        schema=DataFrameSchema.data_frame_to_dict(output_df))

    save_model_to_directory(save_to=args.model_output_dir,
                            model_dumper=pca_module_dumper(data=pca_module))
Example #4
 def test_empty_input(self):
     df = pd.DataFrame()
     save_data_frame_to_directory(self.__input_path, df)
     self.assertRaisesRegexp(
         UserError,
         "The dataset should contain at leaslt 12 points to run this module.",
         invoker.invoke, self.__input_path, self.__detect_mode,
         self.__timestamp_column, self.__value_column, self.__batch_size,
         self.__threshold, self.__sensitivity, self.__append_mode,
         self.compute_stats_in_visualization, self.__output_path)
Example #5
 def test_invalid_timestamp(self):
     df = pd.DataFrame()
     df['timestamp'] = 'invalid'
     df['value'] = np.ones(20)
     save_data_frame_to_directory(self.__input_path, df)
     self.assertRaisesRegexp(
         UserError, "The timestamp column specified is malformed.",
         invoker.invoke, self.__input_path, self.__detect_mode,
         self.__timestamp_column, self.__value_column, self.__batch_size,
         self.__threshold, self.__sensitivity, self.__append_mode,
         self.compute_stats_in_visualization, self.__output_path)
Example #6
def invoke(input_path, detect_mode, timestamp_column, value_column, batch_size, threshold, sensitivity,
            appendMode, compute_stats_in_visualization, output_path):
    data_frame_directory = load_data_frame_from_directory(input_path)

    logger.debug(f"Shape of loaded DataFrame: {data_frame_directory.data.shape}")

    if data_frame_directory.data.shape[0] < MIN_POINTS:
        raise UserError(NotEnoughPoints.format(MIN_POINTS))

    if 0 < batch_size < MIN_POINTS:
        raise UserError(InvalidBatchSize.format(MIN_POINTS))

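    # Resolve the timestamp column from the URL-encoded column selection and validate it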
    query_string = unquote(timestamp_column)
    timestamp_column_selector = ColumnSelection(query_string)
    timestamp = timestamp_column_selector.select_dataframe_directory(data_frame_directory).data

    timestamps = pd.to_datetime(timestamp.iloc[:, 0].values)

    if np.any(np.isnat(timestamps)):
        raise UserError(InvalidTimestamps)

    res = is_timestamp_ascending(timestamps)
    if res == -1:
        raise UserError(InvalidSeriesOrder)
    elif res == -2:
        raise UserError(DuplicateSeriesTimestamp)


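    # Resolve the value column(s) and check that every series is numeric, finite, and within bounds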
    query_string = unquote(value_column)
    data_column_selector = ColumnSelection(query_string)
    data_columns = data_column_selector.select_dataframe_directory(data_frame_directory).data

    for col in data_columns.columns:
        try:
            float_data = data_columns[col].apply(float)
        except Exception as e:
            raise UserError(InvalidValueFormat.format(col))

        if not np.all(np.isfinite(float_data)):
            raise UserError(InvalidSeriesValue.format(col))

        if np.any(np.less(float_data, VALUE_LOWER_BOUND)) or np.any(np.greater(float_data, VALUE_UPPER_BOUND)):
            raise UserError(ValueOverflow.format(col))

        data_columns[col] = float_data

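    # Run the detector over the validated timestamps and value columns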
    result = sr_detector.detect(timestamps, data_columns, detect_mode=detect_mode,
                                batch_size=batch_size, threshold=threshold, sensitivity=sensitivity)

    if appendMode is True:
        result = pd.merge(data_frame_directory.data, result, left_index=True, right_index=True)

    save_data_frame_to_directory(output_path, result, compute_stats_in_visualization=compute_stats_in_visualization)
Example #7
 def test_invalid_series_value(self):
     df = pd.DataFrame()
     timestamps = pd.date_range(start='2020-01-01', periods=20, freq='1D')
     df['timestamp'] = timestamps
     df['value'] = np.nan
     save_data_frame_to_directory(self.__input_path, df)
     self.assertRaisesRegexp(
         UserError, 'The data in column "value" contains nan values.',
         invoker.invoke, self.__input_path, self.__detect_mode,
         self.__timestamp_column, self.__value_column, self.__batch_size,
         self.__threshold, self.__sensitivity, self.__append_mode,
         self.compute_stats_in_visualization, self.__output_path)
Example #8
 def test_value_column_missing(self):
     df = pd.DataFrame()
     timestamps = pd.date_range(start='2020-01-01', periods=20, freq='1D')
     df['timestamp'] = timestamps
     df['missed'] = np.sin(np.linspace(1, 10, 20))
     save_data_frame_to_directory(self.__input_path, df)
     self.assertRaisesRegexp(
         Exception, 'Column with name or index "value" not found.',
         invoker.invoke, self.__input_path, self.__detect_mode,
         self.__timestamp_column, self.__value_column, self.__batch_size,
         self.__threshold, self.__sensitivity, self.__append_mode,
         self.compute_stats_in_visualization, self.__output_path)
Example #9
 def test_duplicate_series(self):
     df = pd.DataFrame()
     df['value'] = np.ones(20)
     df['timestamp'] = '2020-01-01'
     save_data_frame_to_directory(self.__input_path, df)
     self.assertRaisesRegexp(
         UserError,
         "The timestamp column specified has duplicated timestamps.",
         invoker.invoke, self.__input_path, self.__detect_mode,
         self.__timestamp_column, self.__value_column, self.__batch_size,
         self.__threshold, self.__sensitivity, self.__append_mode,
         self.compute_stats_in_visualization, self.__output_path)
Example #10
 def test_value_overflow(self):
     df = pd.DataFrame()
     timestamps = pd.date_range(start='2020-01-01', periods=20, freq='1D')
     df['timestamp'] = timestamps
     df['value'] = 1e200
     save_data_frame_to_directory(self.__input_path, df)
     self.assertRaisesRegexp(
         UserError,
         'The magnitude of data in column "value" exceeds limitation.',
         invoker.invoke, self.__input_path, self.__detect_mode,
         self.__timestamp_column, self.__value_column, self.__batch_size,
         self.__threshold, self.__sensitivity, self.__append_mode,
         self.compute_stats_in_visualization, self.__output_path)
Example #11
 def test_not_enough_points(self):
     df = pd.DataFrame()
     timestamps = pd.date_range(start='2020-01-01', periods=10, freq='1D')
     df['timestamp'] = timestamps
     df['value'] = np.sin(np.linspace(1, 10, 10))
     save_data_frame_to_directory(self.__input_path, df)
     self.assertRaisesRegexp(
         UserError,
         "The dataset should contain at leaslt 12 points to run this module.",
         invoker.invoke, self.__input_path, self.__detect_mode,
         self.__timestamp_column, self.__value_column, self.__batch_size,
         self.__threshold, self.__sensitivity, self.__append_mode,
         self.compute_stats_in_visualization, self.__output_path)
Example #12
 def test_invalid_batch_size(self):
     df = pd.DataFrame()
     timestamps = pd.date_range(start='2020-01-01', periods=20, freq='1D')
     df['timestamp'] = timestamps
     df['value'] = np.sin(np.linspace(1, 10, 20))
     save_data_frame_to_directory(self.__input_path, df)
     self.assertRaisesRegexp(
         UserError,
         'The "batchSize" parameter should be at least 12 or 0 that indicates to run all data in a batch',
         invoker.invoke, self.__input_path, self.__detect_mode,
         self.__timestamp_column, self.__value_column, 5, self.__threshold,
         self.__sensitivity, self.__append_mode,
         self.compute_stats_in_visualization, self.__output_path)
Example #13
 def test_invalid_series_order(self):
     df = pd.DataFrame()
     timestamps = pd.date_range(start='2020-01-01', periods=20,
                                freq='1D')[::-1]
     df['timestamp'] = timestamps
     df['value'] = np.ones(20)
     save_data_frame_to_directory(self.__input_path, df)
     self.assertRaisesRegexp(
         UserError,
         "The timestamp column specified is not in ascending order.",
         invoker.invoke, self.__input_path, self.__detect_mode,
         self.__timestamp_column, self.__value_column, self.__batch_size,
         self.__threshold, self.__sensitivity, self.__append_mode,
         self.compute_stats_in_visualization, self.__output_path)
Example #14
def image_to_df(image_path, output_path):
    imgs = []
    encoder = img2base64
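    # Encode every supported image file in the directory as a base64 string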
    for f in os.listdir(image_path):
        _, ext = os.path.splitext(f)
        if ext not in IMG_EXTS:
            continue
        print(f"Loading image {f}")
        imgs.append(encoder(os.path.join(image_path, f)))

    if not imgs:
        raise FileNotFoundError(f"No valid image file in path: {image_path}")

    os.makedirs(output_path, exist_ok=True)
    df = pd.DataFrame({'image_string': imgs})
    save_data_frame_to_directory(output_path, data=df)
Example #15
 def testAnomalyAndMargin(self):
     df = pd.DataFrame()
     df['timestamp'] = pd.date_range(start='2020-01-01', periods=200, freq='1D')
     df['value'] = np.sin(np.linspace(1, 20, 200))
     save_data_frame_to_directory(self.__input_path, df)
     invoker.invoke(self.__input_path, "AnomalyAndMargin", self.__timestamp_column, self.__value_column,
                     self.__batch_size, self.__threshold, self.__sensitivity, self.__append_mode,
                     self.compute_stats_in_visualization, self.__output_path)
     result = load_data_frame_from_directory(self.__output_path).data
     self.assertEqual(result.shape[0], 200)
     self.assertTrue('value' in result.columns)
     self.assertTrue('isAnomaly' in result.columns)
     self.assertTrue('score' in result.columns)
     self.assertTrue('expectedValue' in result.columns)
     self.assertTrue('upperBoundary' in result.columns)
     self.assertTrue('lowerBoundary' in result.columns)
Example #16
def entrance(trained_model: str,
             dataset: str,
             scored_dataset: str,
             append_score_columns_to_output: str = "true"):
    logger.info(
        f"append_score_columns_to_output = {append_score_columns_to_output}")
    params = {
        constants.APPEND_SCORE_COLUMNS_TO_OUTPUT_KEY:
        append_score_columns_to_output
    }
    score_module = BuiltinScoreModule(trained_model, params)
    any_directory = AnyDirectory.load(dataset)
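    # Dispatch on the loaded directory type: tabular data versus images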
    if any_directory.type == "DataFrameDirectory":
        input_dfd = DataFrameDirectory.load(dataset)
        logger.info(f"input_dfd =\n{input_dfd}")
        output_df = score_module.run(input_dfd)
    elif any_directory.type == "ImageDirectory":
        image_directory = ImageDirectory.load(dataset)
        output_df = score_module.run(image_directory)
    else:
        raise Exception(f"Unsupported directory type: {type(any_directory)}.")

    logger.info(f"output_df =\n{output_df}")
    logger.info(f"dumping to DFD {scored_dataset}")

    # TODO: Support other task types
    if score_module.model.task_type == TaskType.MultiClassification:
        predict_df = output_df
        _LABEL_NAME = 'label'
        score_columns = schema_utils.generate_score_column_meta(
            predict_df=predict_df)
        if score_module.model.label_column_name in predict_df.columns:
            label_column_name = score_module.model.label_column_name
        else:
            label_column_name = None
        meta_data = DataFrameSchema(
            column_attributes=DataFrameSchema.generate_column_attributes(
                df=predict_df),
            score_column_names=score_columns,
            label_column_name=label_column_name)
        save_data_frame_to_directory(scored_dataset,
                                     data=predict_df,
                                     schema=meta_data.to_dict())
    else:
        ioutils.save_dfd(output_df, scored_dataset)
Example #17
def main(args=None):
    '''
        Module entry function
    '''
    input_dir = args.input_dir
    corr_type = args.correlation_method

    logger.debug(f'input-dir {input_dir}')
    logger.debug(f'correlation-method {corr_type}')
    logger.debug(f'output-dir {args.output_dir}')
    input_df = load_data_frame_from_directory(args.input_dir).data

    corr_df = ComputeCorrelationModule(corr_type).compute(input_df)
    logger.debug(f'correlation matrix shape {corr_df.shape}')

    save_data_frame_to_directory(
        save_to=args.output_dir,
        data=corr_df,
        schema=DataFrameSchema.data_frame_to_dict(corr_df))
Example #18
def main(args):
    '''
        Module entry function
    '''

    transformer = SUPPORTED_TRANSFORMERS[args.transformer]

    logger.debug(f'input-dir {args.input_dir}')
    logger.debug(f'column {args.column_name}')
    logger.debug(f'distance {args.distance}')
    logger.debug(f'transformer {transformer}')
    logger.debug(f'sim-dir {args.sim_dir}')

    input_df = load_data_frame_from_directory(args.input_dir).data

    if input_df[args.column_name].isnull().sum() > 0:
        logger.debug(f'column {args.column_name} contains missing values')
        sys.exit(1)

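    # Compute embeddings and the pairwise similarity matrix for the selected text column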
    sts = TextualSimilarity(transformer=transformer,
                            distance_func=args.distance)
    embedding_df, sim_df = sts.fit_transform(input_df[args.column_name].values)

    sim_df.insert(0, args.column_name, input_df[args.column_name])

    logger.debug(f'similarity matrix shape {sim_df.shape}')
    logger.debug(f'embedding shape {embedding_df.shape}')

    save_data_frame_to_directory(
        save_to=args.sim_dir,
        data=sim_df,
        schema=DataFrameSchema.data_frame_to_dict(sim_df))

    save_data_frame_to_directory(
        save_to=args.embedding_dir,
        data=embedding_df,
        schema=DataFrameSchema.data_frame_to_dict(embedding_df))
Example #19
def main(args=None):
    '''
      Module entry point function
    '''

    seq_col = args.sequence_column
    id_col = args.identifier_column
    length_sensitive = args.length_sensitive
    kappa = args.kappa

    logger.debug(f'input-dir {args.input_dir}')
    logger.debug(f'sequence-column {seq_col}')
    logger.debug(f'identifier-column {id_col}')
    logger.debug(f'length-sensitive {length_sensitive}')
    logger.debug(f'kappa {args.kappa}')
    logger.debug(f'output-dir {args.output_dir}')
    logger.debug(f'model output dir {args.model_output_dir}')

    input_df = load_data_frame_from_directory(args.input_dir).data

    if input_df[seq_col].isnull().sum() > 0:
        logger.debug(f'column {seq_col} contains missing values')
        sys.exit(1)

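    # Compute SGT embeddings for the sequence column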
    embedding_df, sgt = compute_embeddings(input_df, seq_col, kappa,
                                           length_sensitive, id_col)

    logger.debug(f'embedding shape {embedding_df.shape}')

    save_data_frame_to_directory(
        save_to=args.output_dir,
        data=embedding_df,
        schema=DataFrameSchema.data_frame_to_dict(embedding_df))

    save_model_to_directory(save_to=args.model_output_dir,
                            model_dumper=sgt_dumper(data=sgt))
    logger.debug(f"Rating True path: {args.rating_true}")
    logger.debug(f"Shape of loaded DataFrame: {rating_true.shape}")
    logger.debug(f"Rating Pred path: {args.rating_pred}")
    logger.debug(f"Shape of loaded DataFrame: {rating_pred.shape}")

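    # Compute recall@k between the ground-truth and predicted ratings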
    eval_recall = recall_at_k(
        rating_true,
        rating_pred,
        col_user=col_user,
        col_item=col_item,
        col_rating=col_rating,
        col_prediction=col_prediction,
        relevancy_method=relevancy_method,
        k=k,
        threshold=threshold,
    )

    logger.debug(f"Score: {eval_recall}")

    # Log to AzureML dashboard
    run = Run.get_context()
    run.parent.log("Recall at {}".format(k), eval_recall)

    score_result = pd.DataFrame({"recall_at_k": [eval_recall]})
    save_data_frame_to_directory(
        args.score_result,
        score_result,
        schema=DataFrameSchema.data_frame_to_dict(score_result),
    )
Example #21
    int_param = args.int_parameter
    bool_param = args.boolean_parameter
    enum_param = args.enum_parameter

    logger.debug(f"Received parameters:")
    logger.debug(f"    {str_param}")
    logger.debug(f"    {int_param}")
    logger.debug(f"    {bool_param}")
    logger.debug(f"    {enum_param}")

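    # Rank 0 loads and saves the DataFrame, then sends its shape to every other rank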
    if rank > 0:
        logger.debug(f"I'm rank {rank}/{size}, wait for data.")
        data = comm.recv(source=0, tag=rank)
        logger.debug(f"Received shape of loaded DataFrame: {data} ")
    else:
        logger.debug(f"I'm rank 0/{size}, load and dump.")

        logger.debug(f"Input path: {args.input_path}")
        data_frame_directory = load_data_frame_from_directory(args.input_path)

        logger.debug(f"Shape of loaded DataFrame: {data_frame_directory.data.shape}")

        logger.debug(f"Output path: {args.output_path}")
        save_data_frame_to_directory(args.output_path, data_frame_directory.data)

        for i in range(1, size):
            data = data_frame_directory.data.shape
            logger.debug(f"Send shape to rank {i}")
            comm.send(data, dest=i, tag=i)

Example #22
def write_prediction_dataframe(dir_path, dataframe):
    print("Writing predictions back...")
    os.makedirs(dir_path, exist_ok=True)
    save_data_frame_to_directory(dir_path, dataframe)
Example #23
def invoke(input_path, detect_mode, timestamp_column, value_column, batch_size,
           threshold, sensitivity, appendMode, compute_stats_in_visualization,
           output_path):
    df = load_data_frame_from_directory(input_path).data
    logging.info(f"Shape of loaded DataFrame: {df.shape}")

    if df.shape[0] < MIN_POINTS:
        raise Exception(NotEnoughPoints.format(MIN_POINTS))

    if 0 < batch_size < MIN_POINTS:
        raise Exception(InvalidBatchSize.format(MIN_POINTS))

    if timestamp_column not in list(df.columns):
        raise Exception(ColumnNotFoundError.format(timestamp_column))

    if value_column not in list(df.columns):
        raise Exception(ColumnNotFoundError.format(value_column))

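    # Parse the timestamp column and validate that it is well-formed, ascending, and unique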
    timestamp = pd.DataFrame(df, columns=[timestamp_column])
    timestamps = pd.to_datetime(timestamp.iloc[:, 0].values)

    if np.any(np.isnat(timestamps)):
        raise Exception(InvalidTimestamps)

    res = is_timestamp_ascending(timestamps)

    if res == -1:
        raise Exception(InvalidSeriesOrder)
    elif res == -2:
        raise Exception(DuplicateSeriesTimestamp)

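    # Check that every value series is numeric, finite, and within the allowed magnitude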
    data_columns = pd.DataFrame(df, columns=[value_column])

    for col in data_columns:
        try:
            float_data = data_columns[col].apply(float)
        except Exception as e:
            raise Exception(InvalidValueFormat.format(col))

        if not np.all(np.isfinite(float_data)):
            raise Exception(InvalidSeriesValue.format(col))

        if np.any(np.less(float_data, VALUE_LOWER_BOUND)) or np.any(
                np.greater(float_data, VALUE_UPPER_BOUND)):
            raise Exception(ValueOverflow.format(col))

        data_columns[col] = float_data

    result = sr_detector.detect(timestamps,
                                data_columns,
                                detect_mode=detect_mode,
                                batch_size=batch_size,
                                threshold=threshold,
                                sensitivity=sensitivity)

    if appendMode is True:
        result = pd.merge(df, result, left_index=True, right_index=True)

    save_data_frame_to_directory(
        output_path,
        result,
        compute_stats_in_visualization=compute_stats_in_visualization)
Example #24
args = parser.parse_args()

print("Argument 1(raw data id): %s" % args.raw_data)
print("Argument 2(columns to keep): %s" %
      str(args.useful_columns.strip("[]").split(";")))
print("Argument 3(columns renaming mapping): %s" %
      str(args.columns.strip("{}").split(";")))
print("Argument 4(output cleansed taxi data path): %s" % args.output_cleanse)

run = Run.get_context()
raw_data = Dataset.get_by_id(run.experiment.workspace, id=args.raw_data)

# The following steps remove null data from the dataset,
# which helps improve machine learning model accuracy.

useful_columns = [
    s.strip().strip("'") for s in args.useful_columns.strip("[]").split(";")
]
columns = get_dict(args.columns)

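# Drop fully-null rows, rename columns per the mapping, and keep only the useful columns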
new_df = (raw_data.to_pandas_dataframe().dropna(how='all').rename(
    columns=columns))[useful_columns]

new_df.reset_index(inplace=True, drop=True)

if args.output_cleanse is not None:
    os.makedirs(args.output_cleanse, exist_ok=True)
    print("%s created" % args.output_cleanse)
    save_data_frame_to_directory(args.output_cleanse, new_df)
Example #25
import argparse
import pandas as pd
from azureml.studio.core.io.data_frame_directory import save_data_frame_to_directory

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', default='inputdir')
    parser.add_argument('--output', default='outputdfd')
    args, _ = parser.parse_known_args()
    df = pd.read_parquet(args.input)
    save_data_frame_to_directory(args.output,
                                 data=df,
                                 compute_stats_in_visualization=True)
    print(f"Dataframe is saved to {args.output}")
    print(df)
Example #26
                    type=str,
                    help="Path where contains model file.")
parser.add_argument("--Model_FileName",
                    type=str,
                    help="Name of the model file.")
parser.add_argument("--Evaluation_Output", type=str, help="Evaluation result")
args = parser.parse_args()

## Load data from DataFrameDirectory to Pandas DataFrame
evaluation_df = load_data_frame_from_directory(args.Evaluation_Data).data

## Prepare evaluation data
evaluation_df_features = evaluation_df[[
    c for c in evaluation_df.columns if c != args.Lable_Col
]]
evaluation_df_lable = evaluation_df[args.Lable_Col]

## Load model
xg_reg = xgb.XGBRegressor()
xg_reg.load_model(args.Model_Path + "/" + args.Model_FileName)

## Evaluation
preds = xg_reg.predict(evaluation_df_features)
rmse = np.sqrt(mean_squared_error(evaluation_df_lable, preds))
print("RMSE: %f" % (rmse))

## Output evaluation result
evaluation_result_df = pd.DataFrame(np.array([rmse]), columns=['RMSE Result'])
os.makedirs(args.Evaluation_Output, exist_ok=True)
save_data_frame_to_directory(args.Evaluation_Output, evaluation_result_df)
Example #27
 def inference(self, data_path, save_path):
     os.makedirs(save_path, exist_ok=True)
     input = load_data_frame_from_directory(data_path).data
     df = self.run(input)
     save_data_frame_to_directory(save_path, data=df)
Example #28
from textclscnn.args_util import preprocess_args

nltk.download('punkt')


class DataPreprocessor(object):
    def __init__(self, vocab_path, text_column):
        self.vocab_path = vocab_path
        self.text_column = text_column
        self.rule = re.compile(r"[^\u4e00-\u9fa5]")
        self.cut = word_tokenize
        with open(self.vocab_path + '/' + 'word2id.pkl', 'rb') as f:
            self.word2id = pickle.load(f)

    def process(self, data_frame: pd.DataFrame):
        out_df = data_frame.copy()
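        # Map each token to its vocabulary id; unknown or NUL tokens map to 0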
        out_df['text_id'] = data_frame[self.text_column].apply(lambda text: [
            self.word2id[word] if word != '\x00' and word in self.word2id else
            0 for word in word_tokenize(text)
        ])
        print(f'first 5 lines of processed df: {out_df.head()}')
        return out_df


if __name__ == '__main__':
    args = preprocess_args()
    processor = DataPreprocessor(args.input_vocab, args.text_column)
    data_frame = load_data_frame_from_directory(args.input_data).data
    save_data_frame_to_directory(args.output_data,
                                 data=processor.process(data_frame))
Example #29
    logger.debug(f"Ratio:    {ratio}")
    logger.debug(f"User:    {col_user}")
    logger.debug(f"Item:    {col_item}")
    logger.debug(f"Seed:    {seed}")

    logger.debug(f"Input path: {args.input_path}")
    logger.debug(f"Shape of loaded DataFrame: {input_df.shape}")
    logger.debug(f"Cols of DataFrame: {input_df.columns}")

    output_train, output_test = python_stratified_split(
        input_df,
        ratio=args.ratio,
        col_user=args.col_user,
        col_item=args.col_item,
        seed=args.seed,
    )

    logger.debug(f"Output path: {args.output_train}")
    logger.debug(f"Output path: {args.output_test}")

    save_data_frame_to_directory(
        args.output_train,
        output_train,
        schema=DataFrameSchema.data_frame_to_dict(output_train),
    )
    save_data_frame_to_directory(
        args.output_test,
        output_test,
        schema=DataFrameSchema.data_frame_to_dict(output_test),
    )
Example #30
        plt.ylim([0, 1.1])
        plt.ylabel('score')
        plt.title('Scores')

        return f2_plt

    def evaluation(self, df_true, df_predict, df_prob, output_eval_dir):
        run = Run.get_context()

        f1_plt = self.prcurve(df_true, df_predict, df_prob)
        run.log_image("precision/recall curve", plot=f1_plt)
        f1_plt.savefig(os.path.join(output_eval_dir, 'precision_recall.png'))

        f2_plt = self.scores(df_true, df_predict)
        run.log_image("scores", plot=f2_plt)
        f2_plt.savefig(os.path.join(output_eval_dir, 'scores.png'))


if __name__ == '__main__':
    args = predict_args()
    predictor = Predictor(args.trained_model)
    df = load_data_frame_from_directory(args.predict_path).data
    out_df = predictor.predict(df)
    save_data_frame_to_directory(args.predict_result_path, data=out_df)
    label_column = predictor.label_column
    print(f'label column {label_column}')
    if label_column in df.columns:
        print(f"Got actual label column {label_column}, evaluating:")
        predictor.evaluation(df[label_column], out_df['Scored Label'],
                             out_df['Scored Prob'], args.predict_result_path)