def main(args):
    """Module entry function: score a dataset with a previously fitted PCA model.

    args:
        args: list, user parameters (input_dir, model_input_dir, output_dir)
    """
    logger.debug(f'input-dir {args.input_dir}')
    logger.debug(f'model input dir {args.model_input_dir}')
    logger.debug(f'output-dir {args.output_dir}')

    # Load the input data frame and the serialized PCA module.
    df = load_data_frame_from_directory(args.input_dir).data
    logger.debug(f'{df.describe()}\n shape{df.shape} ')
    model = load_model_from_directory(
        args.model_input_dir, model_loader=pcamodule_loader).data
    logger.debug(model.pca_instance)

    # Project the input through the fitted PCA and persist the result.
    scored_df = score(model, df)
    logger.debug(f'output shape {scored_df.shape}')
    save_data_frame_to_directory(
        save_to=args.output_dir,
        data=scored_df,
        schema=DataFrameSchema.data_frame_to_dict(scored_df))
def main(args):
    """Module entry point function: embed sequences with a fitted SGT model.

    Loads a joblib-serialized SGT model and an input data frame, validates the
    sequence column for missing values (exits with status 1 if any are found),
    computes embeddings via ``score`` and saves them to the output directory.

    args:
        args: user parameters (input_dir, model_input_dir, sequence_column,
              identifier_column, output_dir)
    """
    seq_col = args.sequence_column
    id_col = args.identifier_column
    logger.debug(f'input-dir {args.input_dir}')
    logger.debug(f'model input dir {args.model_input_dir}')
    logger.debug(f'sequence-column {seq_col}')
    logger.debug(f'identifier-column {id_col}')
    logger.debug(f'output-dir {args.output_dir}')
    sgt = load_model_from_directory(args.model_input_dir,
                                    model_loader=joblib_loader).data
    input_df = load_data_frame_from_directory(args.input_dir).data
    # Abort early: embeddings cannot be computed over missing sequence values.
    # Uses logger.debug + sys.exit(1), consistent with the sibling modules.
    if input_df[seq_col].isnull().sum().sum() > 0:
        logger.debug(f'column {seq_col} contains missing values')
        sys.exit(1)
    embedding_df = score(input_df, sgt, seq_col, id_col)
    # BUG FIX: the original was print('f embedding shape{...}') — the f prefix
    # sat inside the quotes, so the literal text was printed uninterpolated.
    logger.debug(f'embedding shape {embedding_df.shape}')
    logger.debug(embedding_df.head())
    save_data_frame_to_directory(
        save_to=args.output_dir,
        data=embedding_df,
        schema=DataFrameSchema.data_frame_to_dict(embedding_df))
def main(args):
    """Module entry function: fit a PCA transformer and emit data + model.

    args:
        args: list, transformer parameters requested by user
    """
    logger.debug(f'input-dir {args.input_dir}')
    logger.debug(f'output-dir {args.output_dir}')
    logger.debug(f'model output dir {args.model_output_dir}')

    df = load_data_frame_from_directory(args.input_dir).data
    logger.debug(f'{df.describe()}\n shape{df.shape} ')

    # Build the PCA wrapper from user parameters, fit it and transform the data.
    transformer = PCAModule(args)
    logger.debug(transformer.pca_instance)
    transformed_df = transformer.fit_transform(df)
    transformer.log_metrics(df.columns)
    logger.debug(f'output shape {transformed_df.shape}')

    # Persist both the transformed frame and the fitted model.
    save_data_frame_to_directory(
        save_to=args.output_dir,
        data=transformed_df,
        schema=DataFrameSchema.data_frame_to_dict(transformed_df))
    save_model_to_directory(save_to=args.model_output_dir,
                            model_dumper=pca_module_dumper(data=transformer))
def entrance(trained_model: str, dataset: str, scored_dataset: str,
             append_score_columns_to_output: str = "true"):
    """Score *dataset* with a trained built-in model and save the result.

    Dispatches on the directory type tag: DataFrameDirectory and ImageDirectory
    inputs are supported; anything else raises. For multi-class models the
    output is saved with score/label column metadata, otherwise via
    ``ioutils.save_dfd``.

    :param trained_model: directory holding the trained model.
    :param dataset: input directory (data frame or image directory).
    :param scored_dataset: output directory for the scored data frame.
    :param append_score_columns_to_output: "true"/"false" flag forwarded to the
        score module's parameters.
    """
    logger.info(
        f"append_score_columns_to_output = {append_score_columns_to_output}")
    params = {
        constants.APPEND_SCORE_COLUMNS_TO_OUTPUT_KEY:
            append_score_columns_to_output
    }
    score_module = BuiltinScoreModule(trained_model, params)
    any_directory = AnyDirectory.load(dataset)
    if any_directory.type == "DataFrameDirectory":
        input_dfd = DataFrameDirectory.load(dataset)
        logger.info(f"input_dfd =\n{input_dfd}")
        output_df = score_module.run(input_dfd)
    elif any_directory.type == "ImageDirectory":
        image_directory = ImageDirectory.load(dataset)
        output_df = score_module.run(image_directory)
    else:
        # BUG FIX: report the type *tag* dispatched on above; the original
        # interpolated type(any_directory), which is always the AnyDirectory
        # class here and never names the actual unsupported kind.
        raise Exception(f"Unsupported directory type: {any_directory.type}.")
    logger.info(f"output_df =\n{output_df}")
    logger.info(f"dumping to DFD {scored_dataset}")
    # TODO: Support other task types
    if score_module.model.task_type == TaskType.MultiClassification:
        predict_df = output_df
        # (removed unused local _LABEL_NAME = 'label')
        score_columns = schema_utils.generate_score_column_meta(
            predict_df=predict_df)
        # Only record the label column in the schema if it survived scoring.
        if score_module.model.label_column_name in predict_df.columns:
            label_column_name = score_module.model.label_column_name
        else:
            label_column_name = None
        meta_data = DataFrameSchema(
            column_attributes=DataFrameSchema.generate_column_attributes(
                df=predict_df),
            score_column_names=score_columns,
            label_column_name=label_column_name)
        save_data_frame_to_directory(scored_dataset, data=predict_df,
                                     schema=meta_data.to_dict())
    else:
        ioutils.save_dfd(output_df, scored_dataset)
def main(args):
    """Module entry function: compute pairwise textual similarity.

    Embeds the requested text column with the chosen transformer, derives a
    similarity matrix, and saves both the matrix and the embeddings. Exits
    with status 1 if the column contains missing values.
    """
    transformer = SUPPORTED_TRANSFORMERS[args.transformer]
    logger.debug(f'input-dir {args.input_dir}')
    logger.debug(f'column {args.column_name}')
    logger.debug(f'distance {args.distance}')
    logger.debug(f'transformer {transformer}')
    logger.debug(f'sim-dir {args.sim_dir}')

    df = load_data_frame_from_directory(args.input_dir).data
    # Guard clause: similarity is undefined over missing text entries.
    if df[args.column_name].isnull().sum().sum() > 0:
        logger.debug(f'column{args.column_name} contains missing values ')
        sys.exit(1)

    sts = TextualSimilarity(transformer=transformer,
                            distance_func=args.distance)
    embedding_df, sim_df = sts.fit_transform(df[args.column_name].values)
    # Prepend the original text so each similarity row is self-describing.
    sim_df.insert(0, args.column_name, df[args.column_name])
    logger.debug(f'similarity matrix shape {sim_df.shape}')
    logger.debug(f'embedding shape {embedding_df.shape}')

    for directory, frame in ((args.sim_dir, sim_df),
                             (args.embedding_dir, embedding_df)):
        save_data_frame_to_directory(
            save_to=directory,
            data=frame,
            schema=DataFrameSchema.data_frame_to_dict(frame))
def main(args=None):
    """Module entry function: compute a correlation matrix for the input frame.

    NOTE(review): the args=None default would fail on attribute access if main
    were ever called without arguments — presumably argparse always supplies
    args; confirm against the caller.
    """
    input_dir = args.input_dir
    corr_type = args.correlation_method
    logger.debug(f'input-dir {input_dir}')
    logger.debug(f'correlation-method {corr_type}')
    logger.debug(f'output-dir {args.output_dir}')

    frame = load_data_frame_from_directory(args.input_dir).data
    correlation = ComputeCorrelationModule(corr_type).compute(frame)
    logger.debug(f'correlation matrix shape {correlation.shape}')

    save_data_frame_to_directory(
        save_to=args.output_dir,
        data=correlation,
        schema=DataFrameSchema.data_frame_to_dict(correlation))
def main(args=None):
    """Module entry point function: fit an SGT model and emit embeddings.

    Computes sequence-graph-transform embeddings for the sequence column and
    saves both the embedding frame and the fitted model. Exits with status 1
    if the sequence column contains missing values.
    """
    seq_col = args.sequence_column
    id_col = args.identifier_column
    length_sensitive = args.length_sensitive
    kappa = args.kappa
    logger.debug(f'input-dir {args.input_dir}')
    logger.debug(f'sequence-column {seq_col}')
    logger.debug(f'identifier-column {id_col}')
    logger.debug(f'length-sensitive {length_sensitive}')
    logger.debug(f'kappa {args.kappa}')
    logger.debug(f'output-dir {args.output_dir}')
    logger.debug(f'model output dir {args.model_output_dir}')

    frame = load_data_frame_from_directory(args.input_dir).data
    # Guard clause: SGT cannot embed sequences with missing entries.
    if frame[seq_col].isnull().sum().sum() > 0:
        logger.debug(f'column {seq_col} contains missing values ')
        sys.exit(1)

    embedding_df, sgt = compute_embeddings(frame, seq_col, kappa,
                                           length_sensitive, id_col)
    logger.debug(f'embedding shape {embedding_df.shape}')

    save_data_frame_to_directory(
        save_to=args.output_dir,
        data=embedding_df,
        schema=DataFrameSchema.data_frame_to_dict(embedding_df))
    save_model_to_directory(save_to=args.model_output_dir,
                            model_dumper=sgt_dumper(data=sgt))
# Evaluate recall@k for the loaded true/predicted rating frames.
# NOTE(review): rating_true/rating_pred and the col_* / k / threshold
# parameters are defined earlier in this script, outside this fragment.
logger.debug(f"Rating True path: {args.rating_true}")
logger.debug(f"Shape of loaded DataFrame: {rating_true.shape}")
logger.debug(f"Rating Pred path: {args.rating_pred}")
logger.debug(f"Shape of loaded DataFrame: {rating_pred.shape}")

metric_kwargs = dict(
    col_user=col_user,
    col_item=col_item,
    col_rating=col_rating,
    col_prediction=col_prediction,
    relevancy_method=relevancy_method,
    k=k,
    threshold=threshold,
)
recall_value = recall_at_k(rating_true, rating_pred, **metric_kwargs)
logger.debug(f"Score: {recall_value}")

# Log to AzureML dashboard
run = Run.get_context()
run.parent.log("Recall at {}".format(k), recall_value)

result_frame = pd.DataFrame({"recall_at_k": [recall_value]})
save_data_frame_to_directory(
    args.score_result,
    result_frame,
    schema=DataFrameSchema.data_frame_to_dict(result_frame),
)
# Split the loaded frame into train/test partitions, stratified per user.
# NOTE(review): input_df and the ratio/col_user/col_item/seed locals are
# defined earlier in this script, outside this fragment.
logger.debug(f"Ratio: {ratio}")
logger.debug(f"User: {col_user}")
logger.debug(f"Item: {col_item}")
logger.debug(f"Seed: {seed}")
logger.debug(f"Input path: {args.input_path}")
logger.debug(f"Shape of loaded DataFrame: {input_df.shape}")
logger.debug(f"Cols of DataFrame: {input_df.columns}")

train_df, test_df = python_stratified_split(
    input_df,
    ratio=args.ratio,
    col_user=args.col_user,
    col_item=args.col_item,
    seed=args.seed,
)

logger.debug(f"Output path: {args.output_train}")
logger.debug(f"Output path: {args.output_test}")

# Persist both partitions with their derived schemas.
for destination, partition in ((args.output_train, train_df),
                               (args.output_test, test_df)):
    save_data_frame_to_directory(
        destination,
        partition,
        schema=DataFrameSchema.data_frame_to_dict(partition),
    )
def build_column_attributes(self):
    """Generate column attributes from self.df, cache and return them."""
    attributes = DataFrameSchema.generate_column_attributes(df=self.df)
    self.column_attributes = attributes
    return attributes