Example #1
def save_prediction_outputs(postprocessed_output,
                            experiment_dir_name,
                            skip_output_types=None):
    # Writes one CSV per (output_field, output_type) pair, named
    # <field>_<type>.csv inside experiment_dir_name; output types listed
    # in skip_output_types are not written. save_csv is a CSV-writing
    # helper from the surrounding module.
    if skip_output_types is None:
        skip_output_types = set()
    csv_filename = os.path.join(experiment_dir_name, '{}_{}.csv')
    for output_field, outputs in postprocessed_output.items():
        for output_type, values in outputs.items():
            if output_type not in skip_output_types:
                save_csv(csv_filename.format(output_field, output_type),
                         values)
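
A minimal usage sketch for this variant; the field and type names below are hypothetical, not taken from the source. The function expects a nested dict mapping each output feature to its per-type value lists, and writes one CSV per (field, type) pair that is not skipped.

# Hypothetical inputs for illustration; this would write
# /tmp/experiment_run/class_predictions.csv and skip the probabilities.
postprocessed_output = {
    'class': {
        'predictions': ['a', 'b', 'a'],
        'probabilities': [0.9, 0.7, 0.8],
    },
}
save_prediction_outputs(postprocessed_output,
                        '/tmp/experiment_run',
                        skip_output_types={'probabilities'})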
Example #2
def save_prediction_outputs(
    postprocessed_output,
    output_features,
    output_directory,
    backend,
):
    postprocessed_output, column_shapes = flatten_df(postprocessed_output, backend)
    postprocessed_output.to_parquet(os.path.join(output_directory, PREDICTIONS_PARQUET_FILE_NAME))
    save_json(os.path.join(output_directory, PREDICTIONS_SHAPES_FILE_NAME), column_shapes)
    if not backend.df_engine.partitioned:
        # CSV can only be written out for an unpartitioned DataFrame (i.e., pandas)
        postprocessed_dict = convert_to_dict(postprocessed_output, output_features)
        csv_filename = os.path.join(output_directory, "{}_{}.csv")
        for output_field, outputs in postprocessed_dict.items():
            for output_type, values in outputs.items():
                save_csv(csv_filename.format(output_field, output_type), values)
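
A sketch of reading the two artifacts back, using only pandas and the standard library. The file-name values assigned below are placeholder assumptions; in the example above they are imported constants.

import json
import os

import pandas as pd

# Placeholder values; the example above imports these as constants.
PREDICTIONS_PARQUET_FILE_NAME = 'predictions.parquet'
PREDICTIONS_SHAPES_FILE_NAME = 'predictions.shapes.json'

output_directory = '/tmp/experiment_run'  # wherever the outputs were saved
predictions = pd.read_parquet(
    os.path.join(output_directory, PREDICTIONS_PARQUET_FILE_NAME))
with open(os.path.join(output_directory, PREDICTIONS_SHAPES_FILE_NAME)) as f:
    # column_shapes records the original shape of each flattened column
    column_shapes = json.load(f)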
Example #3
def cli_synthesize_dataset(dataset_size: int, features: List[dict],
                           output_path: str) -> None:
    """Symthesizes a dataset for testing purposes

    :param dataset_size: (int) size of the dataset
    :param features: (List[dict]) list of features to generate, in YAML format.
        Provide a list containing one dictionary per feature; each dictionary
        must include a name and a type, and may include generation parameters
        depending on the type
    :param output_path: (str) path where the output CSV file will be saved

    Example content for features:

    [
        {name: text_1, type: text, vocab_size: 20, max_len: 20},
        {name: text_2, type: text, vocab_size: 20, max_len: 20},
        {name: category_1, type: category, vocab_size: 10},
        {name: category_2, type: category, vocab_size: 15},
        {name: numerical_1, type: numerical},
        {name: numerical_2, type: numerical},
        {name: binary_1, type: binary},
        {name: binary_2, type: binary},
        {name: set_1, type: set, vocab_size: 20, max_len: 20},
        {name: set_2, type: set, vocab_size: 20, max_len: 20},
        {name: bag_1, type: bag, vocab_size: 20, max_len: 10},
        {name: bag_2, type: bag, vocab_size: 20, max_len: 10},
        {name: sequence_1, type: sequence, vocab_size: 20, max_len: 20},
        {name: sequence_2, type: sequence, vocab_size: 20, max_len: 20},
        {name: timeseries_1, type: timeseries, max_len: 20},
        {name: timeseries_2, type: timeseries, max_len: 20},
        {name: date_1, type: date},
        {name: date_2, type: date},
        {name: h3_1, type: h3},
        {name: h3_2, type: h3},
        {name: vector_1, type: vector},
        {name: vector_2, type: vector},
    ]
    """
    if dataset_size is None or features is None or output_path is None:
        raise ValueError(
            "Missing one or more required parameters: '--dataset_size', "
            "'--features' or '--output_path'")
    dataset = build_synthetic_dataset(dataset_size, features)
    save_csv(output_path, dataset)
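
A direct-call sketch using the docstring's own feature format; the two features and the output path below are illustrative choices, not taken from the source.

# Generates a 100-row CSV with one category and one numerical column.
features = [
    {'name': 'category_1', 'type': 'category', 'vocab_size': 10},
    {'name': 'numerical_1', 'type': 'numerical'},
]
cli_synthesize_dataset(100, features, '/tmp/synthetic.csv')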
Example #4
    # Reconstructed opening (the snippet begins mid-call); the parser setup
    # and the '-csv'/'-ds' flag names are assumptions, added so that
    # args.csv_file_path and args.dataset_size below resolve.
    parser = argparse.ArgumentParser(
        description='synthesize a dataset for testing purposes')
    parser.add_argument('-csv',
                        '--csv_file_path',
                        help='output csv file path',
                        type=str,
                        default='synthetic_dataset.csv')
    parser.add_argument('-ds',
                        '--dataset_size',
                        help='size of the dataset',
                        type=int,
                        default=100)
    parser.add_argument('-f',
                        '--features',
                        default='[\
          {name: text_1, type: text, vocab_size: 20, max_len: 20}, \
          {name: text_2, type: text, vocab_size: 20, max_len: 20}, \
          {name: category_1, type: category, vocab_size: 10}, \
          {name: category_2, type: category, vocab_size: 15}, \
          {name: numerical_1, type: numerical}, \
          {name: numerical_2, type: numerical}, \
          {name: binary_1, type: binary}, \
          {name: binary_2, type: binary}, \
          {name: set_1, type: set, vocab_size: 20, max_len: 20}, \
          {name: set_2, type: set, vocab_size: 20, max_len: 20}, \
          {name: bag_1, type: bag, vocab_size: 20, max_len: 10}, \
          {name: bag_2, type: bag, vocab_size: 20, max_len: 10}, \
          {name: sequence_1, type: sequence, vocab_size: 20, max_len: 20}, \
          {name: sequence_2, type: sequence, vocab_size: 20, max_len: 20}, \
          {name: timeseries_1, type: timeseries, max_len: 20}, \
          {name: timeseries_2, type: timeseries, max_len: 20}, \
          ]',
                        type=yaml.safe_load,
                        help='dataset features')
    args = parser.parse_args()

    dataset = build_synthetic_dataset(args.dataset_size, args.features)
    save_csv(args.csv_file_path, dataset)
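
Because --features is declared with type=yaml.safe_load, the default string above is parsed as YAML flow style into a list of dicts before it reaches build_synthetic_dataset. A minimal check of that parsing, using only PyYAML:

import yaml

raw = '[{name: text_1, type: text, vocab_size: 20, max_len: 20}]'
print(yaml.safe_load(raw))
# [{'name': 'text_1', 'type': 'text', 'vocab_size': 20, 'max_len': 20}]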