Example #1
 def test_write_text_local_2(self):
     # Write a single line to a temporary local file via a file:// URI,
     # read it back with open_text, then remove the temp directory.
     temp = tempfile.mkdtemp()
     path = os.path.join(temp, "test.txt")
     write_text("file://" + path, "abc\n")
     text = open_text("file://" + path)
     shutil.rmtree(temp)
     assert text == ['abc']
Example #2
 def test_write_text_s3(self):
     # Only runs when AWS credentials are available in the environment.
     access_key_id = os.getenv("AWS_ACCESS_KEY_ID")
     secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY")
     if access_key_id and secret_access_key:
         file_path = "s3://analytics-zoo-data/test.txt"
         text = 'abc\ndef\n'
         write_text(file_path, text)
         lines = open_text(file_path)
         assert lines == ['abc', 'def']
         # Clean up the test object from the bucket afterwards.
         import boto3
         s3_client = boto3.Session(
             aws_access_key_id=access_key_id,
             aws_secret_access_key=secret_access_key).client('s3',
                                                             verify=False)
         s3_client.delete_object(Bucket='analytics-zoo-data',
                                 Key='test.txt')
Example #3
    def write(path,
              generator,
              schema,
              block_size=1000,
              write_mode="overwrite",
              **kwargs):
        """
        Take each record in the generator and write it to a parquet file.

        **generator**
        Each record in the generator is a dict; each key is a string that becomes the
        column name of the saved parquet record, and the value is the data.

        **schema**
        schema defines the name, dtype and shape of each column, as well as the feature
        type of the column. The feature type defines how to encode and decode the column value.

        There are three kinds of feature types:
        1. Scalar, such as an int, a float or a string, which can be directly mapped
           to a parquet type
        2. NDarray, which takes an np.ndarray and saves it as serialized bytes. The
           corresponding parquet type is BYTE_ARRAY.
        3. Image, which takes the path of an image file on the local file system and
           saves the raw file content bytes. The corresponding parquet type is
           BYTE_ARRAY.

        :param path: the output path, e.g. file:///output/path, hdfs:///output/path
        :param generator: a generator yielding dicts, whose keys are strings and values are
                          one of (a scalar value, ndarray, image file path)
        :param schema: a dict, whose keys are strings and values are one of
                       (schema_field.Scalar, schema_field.NDarray, schema_field.Image)
        :param block_size: the number of records in each chunk; each chunk is written as a
                           separate parquet directory under path
        :param write_mode: the Spark save mode used when writing each chunk, default "overwrite"
        :param kwargs: other args
        """

        sc = init_nncontext()
        spark = SparkSession(sc)
        node_num, core_num = get_node_and_core_number()
        # Split the generator into blocks of block_size records and write each
        # block as a separate parquet chunk directory under path.
        for i, chunk in enumerate(chunks(generator, block_size)):
            chunk_path = os.path.join(path, f"chunk={i}")
            rows_rdd = sc.parallelize(chunk, core_num * node_num) \
                .map(lambda x: dict_to_row(schema, x))
            spark.createDataFrame(rows_rdd).write.mode(write_mode).parquet(
                chunk_path)
        # Persist the encoded schema next to the data as _orca_metadata.
        metadata_path = os.path.join(path, "_orca_metadata")

        write_text(metadata_path, encode_schema(schema))
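
For reference, a minimal usage sketch of write: the generator yields one dict per record, and the schema maps each column name to its feature type. Calling write as a plain function, the no-argument schema_field constructors and the image paths are assumptions for illustration only; check the actual schema_field definitions before relying on them.

    import numpy as np

    def record_generator():
        # Each record maps a column name to a scalar, an ndarray, or an image path.
        for i in range(10):
            yield {"id": i,
                   "embedding": np.random.rand(8).astype(np.float32),
                   "photo": "/tmp/images/img_%d.jpg" % i}  # hypothetical image files

    # schema_field.Scalar/NDarray/Image are named in the docstring above;
    # the no-argument constructors here are an assumption for illustration.
    schema = {"id": schema_field.Scalar(),
              "embedding": schema_field.NDarray(),
              "photo": schema_field.Image()}

    write("file:///tmp/parquet_out", record_generator(), schema, block_size=1000)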
Example #4
    paths = [os.path.join(args.input_folder, 'day_%d.parquet' % i) for i in args.day_range]
    tbl = FeatureTable.read_parquet(paths)

    # rename all columns from _cN to cN
    columns = dict([("_c{}".format(i), "c{}".format(i)) for i in range(40)])
    tbl = tbl.rename(columns)
    idx_list = tbl.gen_string_idx(CAT_COLS, freq_limit=args.frequency_limit)
    cat_sizes = [idx.size() for idx in idx_list]

    cross_sizes = args.cross_sizes

    # save metadata
    if not exists(os.path.join(args.output_folder, "meta")):
        makedirs(os.path.join(args.output_folder, "meta"))
    cat_sizes_text = "\n".join([str(s) for s in cat_sizes])
    write_text(os.path.join(args.output_folder, "meta/categorical_sizes.txt"), cat_sizes_text)

    cross_sizes_text = "\n".join([str(s) for s in cross_sizes])
    write_text(os.path.join(args.output_folder, "meta/cross_sizes.txt"), cross_sizes_text)

    if args.days == 24:  # Full Criteo dataset
        train_data = FeatureTable.read_parquet(paths[:-1])
        preprocess_and_save(train_data, idx_list, os.path.join(args.output_folder, "train_parquet"))

        test_data = FeatureTable.read_parquet(
            os.path.join(args.input_folder, "day_23_test.parquet"))
        preprocess_and_save(test_data, idx_list, os.path.join(args.output_folder, "test_parquet"))
    else:  # Sample data
        data = FeatureTable.read_parquet(paths)
        preprocess_and_save(data, idx_list, os.path.join(args.output_folder, "data_parquet"))