def test_write_text_local_2(self):
    temp = tempfile.mkdtemp()
    path = os.path.join(temp, "test.txt")
    write_text("file://" + path, "abc\n")
    text = open_text("file://" + path)
    shutil.rmtree(temp)
    assert text == ['abc']
def test_write_text_s3(self):
    access_key_id = os.getenv("AWS_ACCESS_KEY_ID")
    secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY")
    if access_key_id and secret_access_key:
        file_path = "s3://analytics-zoo-data/test.txt"
        text = 'abc\ndef\n'
        write_text(file_path, text)
        lines = open_text(file_path)
        assert lines == ['abc', 'def']
        import boto3
        s3_client = boto3.Session(
            aws_access_key_id=access_key_id,
            aws_secret_access_key=secret_access_key).client('s3', verify=False)
        s3_client.delete_object(Bucket='analytics-zoo-data', Key='test.txt')
def write(path, generator, schema, block_size=1000, write_mode="overwrite", **kwargs):
    """
    Take each record in the generator and write it to a parquet file.

    **generator**
    Each record in the generator is a dict, where the key is a string that will be the
    column name of the saved parquet record, and the value is the data.

    **schema**
    schema defines the name, dtype and shape of a column, as well as the feature type of
    a column. The feature type defines how to encode and decode the column value. There
    are three kinds of feature type:
    1. Scalar, such as an int or float number, or a string, which can be directly mapped
       to a parquet type.
    2. NDarray, which takes a np.ndarray and saves it as serialized bytes. The
       corresponding parquet type is BYTE_ARRAY.
    3. Image, which takes a string representing an image file in the local file system
       and saves the raw file content bytes. The corresponding parquet type is BYTE_ARRAY.

    :param path: the output path, e.g. file:///output/path, hdfs:///output/path
    :param generator: generate a dict, whose key is a string and value is one of
           (a scalar value, ndarray, image file path)
    :param schema: a dict, whose key is a string, value is one of
           (schema_field.Scalar, schema_field.NDarray, schema_field.Image)
    :param kwargs: other args
    """
    sc = init_nncontext()
    spark = SparkSession(sc)
    node_num, core_num = get_node_and_core_number()
    for i, chunk in enumerate(chunks(generator, block_size)):
        chunk_path = os.path.join(path, f"chunk={i}")
        rows_rdd = sc.parallelize(chunk, core_num * node_num) \
            .map(lambda x: dict_to_row(schema, x))
        spark.createDataFrame(rows_rdd).write.mode(write_mode).parquet(chunk_path)
    metadata_path = os.path.join(path, "_orca_metadata")
    write_text(metadata_path, encode_schema(schema))
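A minimal usage sketch of write(): the record generator yields dicts keyed by column name, and the schema dict maps those names to schema_field entries as described in the docstring. The schema_field constructor arguments shown here (dtype, and the no-argument Image) are assumptions for illustration, not the confirmed signatures.

# Illustrative sketch only; schema_field constructor arguments are assumed.
def record_generator():
    for i in range(10):
        yield {"id": i,                                   # scalar column
               "image": "/local/images/img_%d.jpg" % i}   # image file path column

schema = {"id": schema_field.Scalar(dtype="int"),   # hypothetical signature
          "image": schema_field.Image()}            # hypothetical signature

write("file:///output/path", record_generator(), schema, block_size=500)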
paths = [os.path.join(args.input_folder, 'day_%d.parquet' % i) for i in args.day_range]
tbl = FeatureTable.read_parquet(paths)
# rename all columns
columns = dict([("_c{}".format(i), "c{}".format(i)) for i in range(40)])
tbl = tbl.rename(columns)
idx_list = tbl.gen_string_idx(CAT_COLS, freq_limit=args.frequency_limit)
cat_sizes = [idx.size() for idx in idx_list]
cross_sizes = args.cross_sizes

# save meta
if not exists(os.path.join(args.output_folder, "meta")):
    makedirs(os.path.join(args.output_folder, "meta"))
cat_sizes_text = "\n".join([str(s) for s in cat_sizes])
write_text(os.path.join(args.output_folder, "meta/categorical_sizes.txt"),
           cat_sizes_text)
cross_sizes_text = "\n".join([str(s) for s in cross_sizes])
write_text(os.path.join(args.output_folder, "meta/cross_sizes.txt"),
           cross_sizes_text)

if args.days == 24:  # Full Criteo dataset
    train_data = FeatureTable.read_parquet(paths[:-1])
    preprocess_and_save(train_data, idx_list,
                        os.path.join(args.output_folder, "train_parquet"))
    test_data = FeatureTable.read_parquet(
        os.path.join(args.input_folder, "day_23_test.parquet"))
    preprocess_and_save(test_data, idx_list,
                        os.path.join(args.output_folder, "test_parquet"))
else:  # Sample data
    data = FeatureTable.read_parquet(paths)
    preprocess_and_save(data, idx_list,
                        os.path.join(args.output_folder, "data_parquet"))
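For reference, a small sketch of how the metadata files written above could be read back into integer lists, using the open_text helper exercised in the tests earlier (it returns the file content as a list of lines). The downstream consumer is not shown in this section, so this is illustrative only.

# Illustrative only: recover the saved sizes as integer lists.
cat_sizes = [int(s) for s in open_text(
    os.path.join(args.output_folder, "meta/categorical_sizes.txt"))]
cross_sizes = [int(s) for s in open_text(
    os.path.join(args.output_folder, "meta/cross_sizes.txt"))]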