def save_prediction_outputs(
        postprocessed_output,
        experiment_dir_name,
        skip_output_types=None
):
    if skip_output_types is None:
        skip_output_types = set()
    # one CSV per (output feature, output type) pair: <field>_<type>.csv
    csv_filename = os.path.join(experiment_dir_name, '{}_{}.csv')
    for output_field, outputs in postprocessed_output.items():
        for output_type, values in outputs.items():
            if output_type not in skip_output_types:
                save_csv(
                    csv_filename.format(output_field, output_type),
                    values
                )
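# A minimal usage sketch for the signature above (hedged: the feature name,
# values, and 'results' directory are illustrative, not from the source).
# postprocessed_output maps output_field -> {output_type: values}.
example_output = {
    'category_1': {
        'predictions': ['a', 'b', 'a'],
        'probabilities': [[0.9, 0.1], [0.2, 0.8], [0.7, 0.3]],
    }
}
# Writes results/category_1_predictions.csv; probabilities are skipped.
save_prediction_outputs(
    example_output,
    'results',
    skip_output_types={'probabilities'},
)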
def save_prediction_outputs(
    postprocessed_output,
    output_features,
    output_directory,
    backend,
):
    postprocessed_output, column_shapes = flatten_df(postprocessed_output, backend)
    postprocessed_output.to_parquet(os.path.join(output_directory, PREDICTIONS_PARQUET_FILE_NAME))
    save_json(os.path.join(output_directory, PREDICTIONS_SHAPES_FILE_NAME), column_shapes)
    if not backend.df_engine.partitioned:
        # csv can only be written out for unpartitioned df format (i.e., pandas)
        postprocessed_dict = convert_to_dict(postprocessed_output, output_features)
        csv_filename = os.path.join(output_directory, "{}_{}.csv")
        for output_field, outputs in postprocessed_dict.items():
            for output_type, values in outputs.items():
                save_csv(csv_filename.format(output_field, output_type), values)
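# A hedged sketch of reading the artifacts written above back into memory.
# PREDICTIONS_PARQUET_FILE_NAME and PREDICTIONS_SHAPES_FILE_NAME are the module
# constants referenced in the function; the 'results' directory is illustrative.
import json

import pandas as pd

output_directory = 'results'
# Flattened predictions frame (multi-dimensional columns were collapsed by flatten_df).
predictions_df = pd.read_parquet(
    os.path.join(output_directory, PREDICTIONS_PARQUET_FILE_NAME)
)
# Original column shapes, needed to restore the flattened columns later.
with open(os.path.join(output_directory, PREDICTIONS_SHAPES_FILE_NAME)) as f:
    column_shapes = json.load(f)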
def cli_synthesize_dataset(dataset_size: int, features: List[dict], output_path: str) -> None:
    """Synthesizes a dataset for testing purposes.

    :param dataset_size: (int) size of the dataset
    :param features: (List[dict]) list of features to generate in YAML format.
        Provide a list containing one dictionary for each feature;
        each dictionary must include a name and a type, and can include some
        generation parameters depending on the type
    :param output_path: (str) path where to save the output CSV file

    Example content for features:

    [
        {name: text_1, type: text, vocab_size: 20, max_len: 20},
        {name: text_2, type: text, vocab_size: 20, max_len: 20},
        {name: category_1, type: category, vocab_size: 10},
        {name: category_2, type: category, vocab_size: 15},
        {name: numerical_1, type: numerical},
        {name: numerical_2, type: numerical},
        {name: binary_1, type: binary},
        {name: binary_2, type: binary},
        {name: set_1, type: set, vocab_size: 20, max_len: 20},
        {name: set_2, type: set, vocab_size: 20, max_len: 20},
        {name: bag_1, type: bag, vocab_size: 20, max_len: 10},
        {name: bag_2, type: bag, vocab_size: 20, max_len: 10},
        {name: sequence_1, type: sequence, vocab_size: 20, max_len: 20},
        {name: sequence_2, type: sequence, vocab_size: 20, max_len: 20},
        {name: timeseries_1, type: timeseries, max_len: 20},
        {name: timeseries_2, type: timeseries, max_len: 20},
        {name: date_1, type: date},
        {name: date_2, type: date},
        {name: h3_1, type: h3},
        {name: h3_2, type: h3},
        {name: vector_1, type: vector},
        {name: vector_2, type: vector},
    ]
    """
    if dataset_size is None or features is None or output_path is None:
        raise ValueError(
            "Missing one or more required parameters: '--dataset_size', "
            "'--features' or '--output_path'")
    dataset = build_synthetic_dataset(dataset_size, features)
    save_csv(output_path, dataset)
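# A small usage sketch for cli_synthesize_dataset; the feature list and output
# path below are illustrative only.
features = [
    {'name': 'category_1', 'type': 'category', 'vocab_size': 10},
    {'name': 'numerical_1', 'type': 'numerical'},
    {'name': 'binary_1', 'type': 'binary'},
]
# Generates 100 synthetic rows and writes them to synthetic.csv.
cli_synthesize_dataset(100, features, 'synthetic.csv')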
parser.add_argument(
    '--dataset_size',
    help='size of the dataset',
    type=int,
    default=100
)
parser.add_argument(
    '-f',
    '--features',
    default='[\
        {name: text_1, type: text, vocab_size: 20, max_len: 20}, \
        {name: text_2, type: text, vocab_size: 20, max_len: 20}, \
        {name: category_1, type: category, vocab_size: 10}, \
        {name: category_2, type: category, vocab_size: 15}, \
        {name: numerical_1, type: numerical}, \
        {name: numerical_2, type: numerical}, \
        {name: binary_1, type: binary}, \
        {name: binary_2, type: binary}, \
        {name: set_1, type: set, vocab_size: 20, max_len: 20}, \
        {name: set_2, type: set, vocab_size: 20, max_len: 20}, \
        {name: bag_1, type: bag, vocab_size: 20, max_len: 10}, \
        {name: bag_2, type: bag, vocab_size: 20, max_len: 10}, \
        {name: sequence_1, type: sequence, vocab_size: 20, max_len: 20}, \
        {name: sequence_2, type: sequence, vocab_size: 20, max_len: 20}, \
        {name: timeseries_1, type: timeseries, max_len: 20}, \
        {name: timeseries_2, type: timeseries, max_len: 20}, \
    ]',
    type=yaml.safe_load,
    help='dataset features'
)
args = parser.parse_args()
dataset = build_synthetic_dataset(args.dataset_size, args.features)
# csv_file_path is assumed to be registered on this parser elsewhere
save_csv(args.csv_file_path, dataset)
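# Note on the --features default above: yaml.safe_load parses the bracketed
# flow-style string into a list of feature dicts, the same structure shown in
# the docstring. A quick standalone check of that behavior:
import yaml

parsed = yaml.safe_load('[{name: text_1, type: text, vocab_size: 20, max_len: 20}]')
print(parsed)  # [{'name': 'text_1', 'type': 'text', 'vocab_size': 20, 'max_len': 20}]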