def main(args):
    '''Module entry function.

    args: argparse.Namespace, user parameters
    '''
    logger.debug(f'input-dir {args.input_dir}')
    logger.debug(f'model input dir {args.model_input_dir}')
    logger.debug(f'output-dir {args.output_dir}')
    input_df = load_data_frame_from_directory(args.input_dir).data
    logger.debug(f'{input_df.describe()}\n shape {input_df.shape}')
    pca_module = load_model_from_directory(args.model_input_dir,
                                           model_loader=pcamodule_loader).data
    logger.debug(pca_module.pca_instance)
    output_df = score(pca_module, input_df)
    logger.debug(f'output shape {output_df.shape}')
    save_data_frame_to_directory(
        save_to=args.output_dir,
        data=output_df,
        schema=DataFrameSchema.data_frame_to_dict(output_df))
def main(args):
    '''Module entry point function.'''
    seq_col = args.sequence_column
    id_col = args.identifier_column
    logger.debug(f'input-dir {args.input_dir}')
    logger.debug(f'model input dir {args.model_input_dir}')
    logger.debug(f'sequence-column {seq_col}')
    logger.debug(f'identifier-column {id_col}')
    logger.debug(f'output-dir {args.output_dir}')
    sgt = load_model_from_directory(args.model_input_dir,
                                    model_loader=joblib_loader).data
    input_df = load_data_frame_from_directory(args.input_dir).data
    if input_df[seq_col].isnull().sum() > 0:
        print(f'column {seq_col} contains missing values')
        sys.exit(1)
    embedding_df = score(input_df, sgt, seq_col, id_col)
    print(f'embedding shape {embedding_df.shape}')
    print(embedding_df.head())
    save_data_frame_to_directory(
        save_to=args.output_dir,
        data=embedding_df,
        schema=DataFrameSchema.data_frame_to_dict(embedding_df))
def main(args):
    '''Module entry function.

    args: argparse.Namespace, transformer parameters requested by the user
    '''
    logger.debug(f'input-dir {args.input_dir}')
    logger.debug(f'output-dir {args.output_dir}')
    logger.debug(f'model output dir {args.model_output_dir}')
    input_df = load_data_frame_from_directory(args.input_dir).data
    logger.debug(f'{input_df.describe()}\n shape {input_df.shape}')
    pca_module = PCAModule(args)
    logger.debug(pca_module.pca_instance)
    output_df = pca_module.fit_transform(input_df)
    pca_module.log_metrics(input_df.columns)
    logger.debug(f'output shape {output_df.shape}')
    save_data_frame_to_directory(
        save_to=args.output_dir,
        data=output_df,
        schema=DataFrameSchema.data_frame_to_dict(output_df))
    save_model_to_directory(save_to=args.model_output_dir,
                            model_dumper=pca_module_dumper(data=pca_module))
def test_empty_input(self):
    df = pd.DataFrame()
    save_data_frame_to_directory(self.__input_path, df)
    self.assertRaisesRegex(
        UserError,
        "The dataset should contain at leaslt 12 points to run this module.",
        invoker.invoke, self.__input_path, self.__detect_mode,
        self.__timestamp_column, self.__value_column, self.__batch_size,
        self.__threshold, self.__sensitivity, self.__append_mode,
        self.compute_stats_in_visualization, self.__output_path)
def test_invalid_timestamp(self):
    df = pd.DataFrame()
    df['timestamp'] = 'invalid'
    df['value'] = np.ones(20)
    save_data_frame_to_directory(self.__input_path, df)
    self.assertRaisesRegex(
        UserError,
        "The timestamp column specified is malformed.",
        invoker.invoke, self.__input_path, self.__detect_mode,
        self.__timestamp_column, self.__value_column, self.__batch_size,
        self.__threshold, self.__sensitivity, self.__append_mode,
        self.compute_stats_in_visualization, self.__output_path)
def invoke(input_path, detect_mode, timestamp_column, value_column,
           batch_size, threshold, sensitivity, appendMode,
           compute_stats_in_visualization, output_path):
    data_frame_directory = load_data_frame_from_directory(input_path)
    logger.debug(f"Shape of loaded DataFrame: {data_frame_directory.data.shape}")
    if data_frame_directory.data.shape[0] < MIN_POINTS:
        raise UserError(NotEnoughPoints.format(MIN_POINTS))
    if 0 < batch_size < MIN_POINTS:
        raise UserError(InvalidBatchSize.format(MIN_POINTS))

    # Resolve the timestamp column from the (URL-encoded) column selection
    # and validate that it is well-formed, ascending, and duplicate-free.
    query_string = unquote(timestamp_column)
    timestamp_column_selector = ColumnSelection(query_string)
    timestamp = timestamp_column_selector.select_dataframe_directory(
        data_frame_directory).data
    timestamps = pd.to_datetime(timestamp.iloc[:, 0].values)
    if np.any(np.isnat(timestamps)):
        raise UserError(InvalidTimestamps)
    res = is_timestamp_ascending(timestamps)
    if res == -1:
        raise UserError(InvalidSeriesOrder)
    elif res == -2:
        raise UserError(DuplicateSeriesTimestamp)

    # Validate the value columns and normalize them to finite floats.
    query_string = unquote(value_column)
    data_column_selector = ColumnSelection(query_string)
    data_columns = data_column_selector.select_dataframe_directory(
        data_frame_directory).data
    for col in data_columns.columns:
        try:
            float_data = data_columns[col].apply(float)
        except Exception:
            raise UserError(InvalidValueFormat.format(col))
        if not np.all(np.isfinite(float_data)):
            raise UserError(InvalidSeriesValue.format(col))
        if np.any(np.less(float_data, VALUE_LOWER_BOUND)) or \
                np.any(np.greater(float_data, VALUE_UPPER_BOUND)):
            raise UserError(ValueOverflow.format(col))
        data_columns[col] = float_data

    result = sr_detector.detect(timestamps, data_columns,
                                detect_mode=detect_mode,
                                batch_size=batch_size,
                                threshold=threshold,
                                sensitivity=sensitivity)
    if appendMode is True:
        result = pd.merge(data_frame_directory.data, result,
                          left_index=True, right_index=True)
    save_data_frame_to_directory(
        output_path, result,
        compute_stats_in_visualization=compute_stats_in_visualization)
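# A minimal sketch of the is_timestamp_ascending helper used above. The real
# implementation lives elsewhere in the module; the return codes here are
# inferred from the call sites (-1: out of order, -2: duplicates, 0: ok).
import numpy as np

def is_timestamp_ascending_sketch(timestamps):
    diffs = np.diff(np.asarray(timestamps))
    if np.any(diffs < np.timedelta64(0)):
        return -1  # timestamps not in ascending order
    if np.any(diffs == np.timedelta64(0)):
        return -2  # duplicated timestamps
    return 0  # strictly ascending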
def test_invalid_series_value(self):
    df = pd.DataFrame()
    timestamps = pd.date_range(start='2020-01-01', periods=20, freq='1D')
    df['timestamp'] = timestamps
    df['value'] = np.nan
    save_data_frame_to_directory(self.__input_path, df)
    self.assertRaisesRegex(
        UserError,
        'The data in column "value" contains nan values.',
        invoker.invoke, self.__input_path, self.__detect_mode,
        self.__timestamp_column, self.__value_column, self.__batch_size,
        self.__threshold, self.__sensitivity, self.__append_mode,
        self.compute_stats_in_visualization, self.__output_path)
def test_value_column_missing(self):
    df = pd.DataFrame()
    timestamps = pd.date_range(start='2020-01-01', periods=20, freq='1D')
    df['timestamp'] = timestamps
    df['missed'] = np.sin(np.linspace(1, 10, 20))
    save_data_frame_to_directory(self.__input_path, df)
    self.assertRaisesRegex(
        Exception,
        'Column with name or index "value" not found.',
        invoker.invoke, self.__input_path, self.__detect_mode,
        self.__timestamp_column, self.__value_column, self.__batch_size,
        self.__threshold, self.__sensitivity, self.__append_mode,
        self.compute_stats_in_visualization, self.__output_path)
def test_duplicate_series(self):
    df = pd.DataFrame()
    df['value'] = np.ones(20)
    df['timestamp'] = '2020-01-01'
    save_data_frame_to_directory(self.__input_path, df)
    self.assertRaisesRegex(
        UserError,
        "The timestamp column specified has duplicated timestamps.",
        invoker.invoke, self.__input_path, self.__detect_mode,
        self.__timestamp_column, self.__value_column, self.__batch_size,
        self.__threshold, self.__sensitivity, self.__append_mode,
        self.compute_stats_in_visualization, self.__output_path)
def test_value_overflow(self):
    df = pd.DataFrame()
    timestamps = pd.date_range(start='2020-01-01', periods=20, freq='1D')
    df['timestamp'] = timestamps
    df['value'] = 1e200
    save_data_frame_to_directory(self.__input_path, df)
    self.assertRaisesRegex(
        UserError,
        'The magnitude of data in column "value" exceeds limitation.',
        invoker.invoke, self.__input_path, self.__detect_mode,
        self.__timestamp_column, self.__value_column, self.__batch_size,
        self.__threshold, self.__sensitivity, self.__append_mode,
        self.compute_stats_in_visualization, self.__output_path)
def test_not_enough_points(self):
    df = pd.DataFrame()
    timestamps = pd.date_range(start='2020-01-01', periods=10, freq='1D')
    df['timestamp'] = timestamps
    df['value'] = np.sin(np.linspace(1, 10, 10))
    save_data_frame_to_directory(self.__input_path, df)
    self.assertRaisesRegex(
        UserError,
        "The dataset should contain at leaslt 12 points to run this module.",
        invoker.invoke, self.__input_path, self.__detect_mode,
        self.__timestamp_column, self.__value_column, self.__batch_size,
        self.__threshold, self.__sensitivity, self.__append_mode,
        self.compute_stats_in_visualization, self.__output_path)
def test_invalid_batch_size(self):
    df = pd.DataFrame()
    timestamps = pd.date_range(start='2020-01-01', periods=20, freq='1D')
    df['timestamp'] = timestamps
    df['value'] = np.sin(np.linspace(1, 10, 20))
    save_data_frame_to_directory(self.__input_path, df)
    self.assertRaisesRegex(
        UserError,
        'The "batchSize" parameter should be at least 12 or 0 that indicates to run all data in a batch',
        invoker.invoke, self.__input_path, self.__detect_mode,
        self.__timestamp_column, self.__value_column, 5,
        self.__threshold, self.__sensitivity, self.__append_mode,
        self.compute_stats_in_visualization, self.__output_path)
def test_invalid_series_order(self):
    df = pd.DataFrame()
    timestamps = pd.date_range(start='2020-01-01', periods=20, freq='1D')[::-1]
    df['timestamp'] = timestamps
    df['value'] = np.ones(20)
    save_data_frame_to_directory(self.__input_path, df)
    self.assertRaisesRegex(
        UserError,
        "The timestamp column specified is not in ascending order.",
        invoker.invoke, self.__input_path, self.__detect_mode,
        self.__timestamp_column, self.__value_column, self.__batch_size,
        self.__threshold, self.__sensitivity, self.__append_mode,
        self.compute_stats_in_visualization, self.__output_path)
def image_to_df(image_path, output_path):
    imgs = []
    encoder = img2base64
    for f in os.listdir(image_path):
        _, ext = os.path.splitext(f)
        if ext not in IMG_EXTS:
            continue
        print(f"Loading image {f}")
        imgs.append(encoder(os.path.join(image_path, f)))
    if not imgs:
        raise FileNotFoundError(f"No valid image file in path: {image_path}")
    os.makedirs(output_path, exist_ok=True)
    df = pd.DataFrame({'image_string': imgs})
    save_data_frame_to_directory(output_path, data=df)
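# A minimal sketch of the img2base64 encoder referenced above, assuming it
# base64-encodes the raw file bytes into a string (the actual encoder is
# imported from elsewhere and may differ, e.g. by resizing or re-encoding).
import base64

def img2base64_sketch(image_file_path):
    with open(image_file_path, 'rb') as f:
        return base64.b64encode(f.read()).decode('utf-8')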
def testAnomalyAndMargin(self):
    df = pd.DataFrame()
    df['timestamp'] = pd.date_range(start='2020-01-01', periods=200, freq='1D')
    df['value'] = np.sin(np.linspace(1, 20, 200))
    save_data_frame_to_directory(self.__input_path, df)
    invoker.invoke(self.__input_path, "AnomalyAndMargin",
                   self.__timestamp_column, self.__value_column,
                   self.__batch_size, self.__threshold, self.__sensitivity,
                   self.__append_mode, self.compute_stats_in_visualization,
                   self.__output_path)
    result = load_data_frame_from_directory(self.__output_path).data
    self.assertEqual(result.shape[0], 200)
    self.assertIn('value', result.columns)
    self.assertIn('isAnomaly', result.columns)
    self.assertIn('score', result.columns)
    self.assertIn('expectedValue', result.columns)
    self.assertIn('upperBoundary', result.columns)
    self.assertIn('lowerBoundary', result.columns)
def entrance(trained_model: str,
             dataset: str,
             scored_dataset: str,
             append_score_columns_to_output: str = "true"):
    logger.info(
        f"append_score_columns_to_output = {append_score_columns_to_output}")
    params = {
        constants.APPEND_SCORE_COLUMNS_TO_OUTPUT_KEY:
            append_score_columns_to_output
    }
    score_module = BuiltinScoreModule(trained_model, params)
    any_directory = AnyDirectory.load(dataset)
    if any_directory.type == "DataFrameDirectory":
        input_dfd = DataFrameDirectory.load(dataset)
        logger.info(f"input_dfd =\n{input_dfd}")
        output_df = score_module.run(input_dfd)
    elif any_directory.type == "ImageDirectory":
        image_directory = ImageDirectory.load(dataset)
        output_df = score_module.run(image_directory)
    else:
        raise Exception(f"Unsupported directory type: {any_directory.type}.")
    logger.info(f"output_df =\n{output_df}")
    logger.info(f"dumping to DFD {scored_dataset}")
    # TODO: Support other task types
    if score_module.model.task_type == TaskType.MultiClassification:
        predict_df = output_df
        score_columns = schema_utils.generate_score_column_meta(
            predict_df=predict_df)
        if score_module.model.label_column_name in predict_df.columns:
            label_column_name = score_module.model.label_column_name
        else:
            label_column_name = None
        meta_data = DataFrameSchema(
            column_attributes=DataFrameSchema.generate_column_attributes(
                df=predict_df),
            score_column_names=score_columns,
            label_column_name=label_column_name)
        save_data_frame_to_directory(scored_dataset,
                                     data=predict_df,
                                     schema=meta_data.to_dict())
    else:
        ioutils.save_dfd(output_df, scored_dataset)
def main(args=None):
    '''Module entry function.'''
    input_dir = args.input_dir
    corr_type = args.correlation_method
    logger.debug(f'input-dir {input_dir}')
    logger.debug(f'correlation-method {corr_type}')
    logger.debug(f'output-dir {args.output_dir}')
    input_df = load_data_frame_from_directory(input_dir).data
    corr_df = ComputeCorrelationModule(corr_type).compute(input_df)
    logger.debug(f'correlation matrix shape {corr_df.shape}')
    save_data_frame_to_directory(
        save_to=args.output_dir,
        data=corr_df,
        schema=DataFrameSchema.data_frame_to_dict(corr_df))
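# A minimal sketch of what ComputeCorrelationModule might do, assuming it
# wraps pandas' built-in correlation; the real module may add column
# filtering or NaN handling beyond this.
import pandas as pd

class ComputeCorrelationModuleSketch:
    def __init__(self, correlation_method='pearson'):
        # pandas supports 'pearson', 'kendall', and 'spearman'
        self.correlation_method = correlation_method

    def compute(self, df: pd.DataFrame) -> pd.DataFrame:
        # Correlate numeric columns only and return the matrix as a DataFrame
        return df.corr(method=self.correlation_method, numeric_only=True)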
def main(args):
    '''Module entry function.'''
    transformer = SUPPORTED_TRANSFORMERS[args.transformer]
    logger.debug(f'input-dir {args.input_dir}')
    logger.debug(f'column {args.column_name}')
    logger.debug(f'distance {args.distance}')
    logger.debug(f'transformer {transformer}')
    logger.debug(f'sim-dir {args.sim_dir}')
    input_df = load_data_frame_from_directory(args.input_dir).data
    if input_df[args.column_name].isnull().sum() > 0:
        logger.debug(f'column {args.column_name} contains missing values')
        sys.exit(1)
    sts = TextualSimilarity(transformer=transformer,
                            distance_func=args.distance)
    embedding_df, sim_df = sts.fit_transform(input_df[args.column_name].values)
    sim_df.insert(0, args.column_name, input_df[args.column_name])
    logger.debug(f'similarity matrix shape {sim_df.shape}')
    logger.debug(f'embedding shape {embedding_df.shape}')
    save_data_frame_to_directory(
        save_to=args.sim_dir,
        data=sim_df,
        schema=DataFrameSchema.data_frame_to_dict(sim_df))
    save_data_frame_to_directory(
        save_to=args.embedding_dir,
        data=embedding_df,
        schema=DataFrameSchema.data_frame_to_dict(embedding_df))
def main(args=None):
    '''Module entry point function.'''
    seq_col = args.sequence_column
    id_col = args.identifier_column
    length_sensitive = args.length_sensitive
    kappa = args.kappa
    logger.debug(f'input-dir {args.input_dir}')
    logger.debug(f'sequence-column {seq_col}')
    logger.debug(f'identifier-column {id_col}')
    logger.debug(f'length-sensitive {length_sensitive}')
    logger.debug(f'kappa {kappa}')
    logger.debug(f'output-dir {args.output_dir}')
    logger.debug(f'model output dir {args.model_output_dir}')
    input_df = load_data_frame_from_directory(args.input_dir).data
    if input_df[seq_col].isnull().sum() > 0:
        logger.debug(f'column {seq_col} contains missing values')
        sys.exit(1)
    embedding_df, sgt = compute_embeddings(input_df, seq_col, kappa,
                                           length_sensitive, id_col)
    logger.debug(f'embedding shape {embedding_df.shape}')
    save_data_frame_to_directory(
        save_to=args.output_dir,
        data=embedding_df,
        schema=DataFrameSchema.data_frame_to_dict(embedding_df))
    save_model_to_directory(save_to=args.model_output_dir,
                            model_dumper=sgt_dumper(data=sgt))
logger.debug(f"Rating True path: {args.rating_true}") logger.debug(f"Shape of loaded DataFrame: {rating_true.shape}") logger.debug(f"Rating Pred path: {args.rating_pred}") logger.debug(f"Shape of loaded DataFrame: {rating_pred.shape}") eval_recall = recall_at_k( rating_true, rating_pred, col_user=col_user, col_item=col_item, col_rating=col_rating, col_prediction=col_prediction, relevancy_method=relevancy_method, k=k, threshold=threshold, ) logger.debug(f"Score: {eval_recall}") # Log to AzureML dashboard run = Run.get_context() run.parent.log("Recall at {}".format(k), eval_recall) score_result = pd.DataFrame({"recall_at_k": [eval_recall]}) save_data_frame_to_directory( args.score_result, score_result, schema=DataFrameSchema.data_frame_to_dict(score_result), )
int_param = args.int_parameter
bool_param = args.boolean_parameter
enum_param = args.enum_parameter
logger.debug("Received parameters:")
logger.debug(f"  {str_param}")
logger.debug(f"  {int_param}")
logger.debug(f"  {bool_param}")
logger.debug(f"  {enum_param}")

if rank > 0:
    # Non-root ranks wait to receive the DataFrame shape from rank 0.
    logger.debug(f"I'm rank {rank}/{size}, wait for data.")
    data = comm.recv(source=0, tag=rank)
    logger.debug(f"Received shape of loaded DataFrame: {data}")
else:
    # Rank 0 loads the input, dumps it to the output, and sends the shape
    # to every other rank.
    logger.debug(f"I'm rank 0/{size}, load and dump.")
    logger.debug(f"Input path: {args.input_path}")
    data_frame_directory = load_data_frame_from_directory(args.input_path)
    logger.debug(f"Shape of loaded DataFrame: {data_frame_directory.data.shape}")
    logger.debug(f"Output path: {args.output_path}")
    save_data_frame_to_directory(args.output_path, data_frame_directory.data)
    for i in range(1, size):
        data = data_frame_directory.data.shape
        logger.debug(f"Send shape to rank {i}")
        comm.send(data, dest=i, tag=i)
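# The comm/rank/size names above assume an mpi4py setup along these lines
# (a sketch; the actual initialization lives outside this fragment):
from mpi4py import MPI

comm = MPI.COMM_WORLD   # communicator spanning all launched ranks
rank = comm.Get_rank()  # this process's rank id (0 is the root)
size = comm.Get_size()  # total number of ranks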
def write_prediction_dataframe(dir_path, dataframe):
    print("Writing predictions back...")
    os.makedirs(dir_path, exist_ok=True)
    save_data_frame_to_directory(dir_path, dataframe)
def invoke(input_path, detect_mode, timestamp_column, value_column,
           batch_size, threshold, sensitivity, appendMode,
           compute_stats_in_visualization, output_path):
    df = load_data_frame_from_directory(input_path).data
    logging.info(f"Shape of loaded DataFrame: {df.shape}")
    if df.shape[0] < MIN_POINTS:
        raise Exception(NotEnoughPoints.format(MIN_POINTS))
    if 0 < batch_size < MIN_POINTS:
        raise Exception(InvalidBatchSize.format(MIN_POINTS))
    if timestamp_column not in df.columns:
        raise Exception(ColumnNotFoundError.format(timestamp_column))
    if value_column not in df.columns:
        raise Exception(ColumnNotFoundError.format(value_column))

    # Parse and validate the timestamp column.
    timestamp = pd.DataFrame(df, columns=[timestamp_column])
    timestamps = pd.to_datetime(timestamp.iloc[:, 0].values)
    if np.any(np.isnat(timestamps)):
        raise Exception(InvalidTimestamps)
    res = is_timestamp_ascending(timestamps)
    if res == -1:
        raise Exception(InvalidSeriesOrder)
    elif res == -2:
        raise Exception(DuplicateSeriesTimestamp)

    # Validate the value column and normalize it to finite floats.
    data_columns = pd.DataFrame(df, columns=[value_column])
    for col in data_columns:
        try:
            float_data = data_columns[col].apply(float)
        except Exception:
            raise Exception(InvalidValueFormat.format(col))
        if not np.all(np.isfinite(float_data)):
            raise Exception(InvalidSeriesValue.format(col))
        if np.any(np.less(float_data, VALUE_LOWER_BOUND)) or \
                np.any(np.greater(float_data, VALUE_UPPER_BOUND)):
            raise Exception(ValueOverflow.format(col))
        data_columns[col] = float_data

    result = sr_detector.detect(timestamps, data_columns,
                                detect_mode=detect_mode,
                                batch_size=batch_size,
                                threshold=threshold,
                                sensitivity=sensitivity)
    if appendMode is True:
        result = pd.merge(df, result, left_index=True, right_index=True)
    save_data_frame_to_directory(
        output_path, result,
        compute_stats_in_visualization=compute_stats_in_visualization)
args = parser.parse_args()

print("Argument 1(raw data id): %s" % args.raw_data)
print("Argument 2(columns to keep): %s" %
      str(args.useful_columns.strip("[]").split(";")))
print("Argument 3(columns renaming mapping): %s" %
      str(args.columns.strip("{}").split(";")))
print("Argument 4(output cleansed taxi data path): %s" % args.output_cleanse)

run = Run.get_context()
raw_data = Dataset.get_by_id(run.experiment.workspace, id=args.raw_data)

# These functions ensure that null data is removed from the dataset,
# which will help increase machine learning model accuracy.
useful_columns = [
    s.strip().strip("'") for s in args.useful_columns.strip("[]").split(";")
]
columns = get_dict(args.columns)

new_df = (raw_data.to_pandas_dataframe()
          .dropna(how='all')
          .rename(columns=columns))[useful_columns]
new_df.reset_index(inplace=True, drop=True)

if args.output_cleanse is not None:
    os.makedirs(args.output_cleanse, exist_ok=True)
    print("%s created" % args.output_cleanse)
    save_data_frame_to_directory(args.output_cleanse, new_df)
import argparse

import pandas as pd
from azureml.studio.core.io.data_frame_directory import save_data_frame_to_directory

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', default='inputdir')
    parser.add_argument('--output', default='outputdfd')
    args, _ = parser.parse_known_args()

    df = pd.read_parquet(args.input)
    save_data_frame_to_directory(args.output, data=df,
                                 compute_stats_in_visualization=True)
    print(f"Dataframe is saved to {args.output}")
    print(df)
type=str, help="Path where contains model file.") parser.add_argument("--Model_FileName", type=str, help="Name of the model file.") parser.add_argument("--Evaluation_Output", type=str, help="Evaluation result") args = parser.parse_args() ## Load data from DataFrameDirectory to Pandas DataFrame evaluation_df = load_data_frame_from_directory(args.Evaluation_Data).data ## Prepare evaluation data evaluation_df_features = evaluation_df[[ c for c in evaluation_df.columns if c != args.Lable_Col ]] evaluation_df_lable = evaluation_df[args.Lable_Col] ## Load model xg_reg = xgb.XGBRegressor() xg_reg.load_model(args.Model_Path + "/" + args.Model_FileName) ## Evaluation preds = xg_reg.predict(evaluation_df_features) rmse = np.sqrt(mean_squared_error(evaluation_df_lable, preds)) print("RMSE: %f" % (rmse)) ## Output evaluation result evaluation_result_df = pd.DataFrame(np.array([rmse]), columns=['RMSE Result']) os.makedirs(args.Evaluation_Output, exist_ok=True) save_data_frame_to_directory(args.Evaluation_Output, evaluation_result_df)
def inference(self, data_path, save_path):
    os.makedirs(save_path, exist_ok=True)
    input_df = load_data_frame_from_directory(data_path).data
    df = self.run(input_df)
    save_data_frame_to_directory(save_path, data=df)
from textclscnn.args_util import preprocess_args

nltk.download('punkt')


class DataPreprocessor(object):
    def __init__(self, vocab_path, text_column):
        self.vocab_path = vocab_path
        self.text_column = text_column
        self.rule = re.compile(r"[^\u4e00-\u9fa5]")
        self.cut = word_tokenize
        with open(self.vocab_path + '/' + 'word2id.pkl', 'rb') as f:
            self.word2id = pickle.load(f)

    def process(self, data_frame: pd.DataFrame):
        # Map each token to its vocabulary id; unknown tokens map to 0.
        out_df = data_frame.copy()
        out_df['text_id'] = data_frame[self.text_column].apply(lambda text: [
            self.word2id[word]
            if word != '\x00' and word in self.word2id else 0
            for word in word_tokenize(text)
        ])
        print(f'first 5 lines of processed df: {out_df.head()}')
        return out_df


if __name__ == '__main__':
    args = preprocess_args()
    processor = DataPreprocessor(args.input_vocab, args.text_column)
    data_frame = load_data_frame_from_directory(args.input_data).data
    save_data_frame_to_directory(args.output_data,
                                 data=processor.process(data_frame))
logger.debug(f"Ratio: {ratio}") logger.debug(f"User: {col_user}") logger.debug(f"Item: {col_item}") logger.debug(f"Seed: {seed}") logger.debug(f"Input path: {args.input_path}") logger.debug(f"Shape of loaded DataFrame: {input_df.shape}") logger.debug(f"Cols of DataFrame: {input_df.columns}") output_train, output_test = python_stratified_split( input_df, ratio=args.ratio, col_user=args.col_user, col_item=args.col_item, seed=args.seed, ) logger.debug(f"Output path: {args.output_train}") logger.debug(f"Output path: {args.output_test}") save_data_frame_to_directory( args.output_train, output_train, schema=DataFrameSchema.data_frame_to_dict(output_train), ) save_data_frame_to_directory( args.output_test, output_test, schema=DataFrameSchema.data_frame_to_dict(output_test), )
        plt.ylim([0, 1.1])
        plt.ylabel('score')
        plt.title('Scores')
        return f2_plt

    def evaluation(self, df_true, df_predict, df_prob, output_eval_dir):
        run = Run.get_context()
        f1_plt = self.prcurve(df_true, df_predict, df_prob)
        run.log_image("precision/recall curve", plot=f1_plt)
        f1_plt.savefig(os.path.join(output_eval_dir, 'precision_recall.png'))
        f2_plt = self.scores(df_true, df_predict)
        run.log_image("scores", plot=f2_plt)
        f2_plt.savefig(os.path.join(output_eval_dir, 'scores.png'))


if __name__ == '__main__':
    args = predict_args()
    predictor = Predictor(args.trained_model)
    df = load_data_frame_from_directory(args.predict_path).data
    out_df = predictor.predict(df)
    save_data_frame_to_directory(args.predict_result_path, data=out_df)
    label_column = predictor.label_column
    print(f'label column {label_column}')
    if label_column in df.columns:
        print(f"Got actual label column {label_column}, evaluating:")
        predictor.evaluation(df[label_column], out_df['Scored Label'],
                             out_df['Scored Prob'], args.predict_result_path)