def test_exists_s3(self):
    """Check ``exists`` on two S3 URIs: one expected present, one absent.

    The assertions only run when both ``AWS_ACCESS_KEY_ID`` and
    ``AWS_SECRET_ACCESS_KEY`` are set to non-empty values in the
    environment; otherwise the test body does nothing.
    """
    credentials = (
        os.getenv("AWS_ACCESS_KEY_ID"),
        os.getenv("AWS_SECRET_ACCESS_KEY"),
    )
    if not all(credentials):
        # No usable credentials: nothing to verify against S3.
        return

    present_uri = "s3://analytics-zoo-data/nyc_taxi.csv"
    assert exists(present_uri)

    missing_uri = "s3://analytics-zoo-data/abc.csv"
    assert not exists(missing_uri)
def test_mkdirs_local_2(self):
    """Create a single-level and a nested directory through ``file://`` URIs.

    A scratch directory is obtained from ``tempfile.mkdtemp`` and is always
    removed afterwards, including when ``makedirs`` raises or one of the
    assertions fails, so a failing run does not leave directories behind.
    """
    temp = tempfile.mkdtemp()
    try:
        # Single-level directory directly under the scratch root.
        path = os.path.join(temp, "dir1")
        makedirs("file://" + path)
        assert exists("file://" + path)

        # Nested directory: "dir2" has not been created at this point.
        path = os.path.join(temp, "dir2/dir3")
        makedirs("file://" + path)
        assert exists("file://" + path)
    finally:
        # Clean up the scratch root regardless of the test outcome.
        shutil.rmtree(temp)
def test_mkdirs_s3(self):
    """Create a directory key on S3 with ``makedirs`` and verify it exists.

    The test body only runs when both ``AWS_ACCESS_KEY_ID`` and
    ``AWS_SECRET_ACCESS_KEY`` are set to non-empty values in the
    environment. The created key is deleted through boto3 even when the
    existence assertion fails; otherwise a stale ``temp/abc/`` key from a
    failed run could make later runs pass without ``makedirs`` working.
    """
    access_key_id = os.getenv("AWS_ACCESS_KEY_ID")
    secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY")
    if not (access_key_id and secret_access_key):
        return

    # Imported lazily so boto3 is only required when credentials are present.
    import boto3

    file_path = "s3://analytics-zoo-data/temp/abc/"
    makedirs(file_path)
    try:
        assert exists(file_path)
    finally:
        # NOTE(review): verify=False turns off TLS certificate verification
        # for this cleanup client; kept as in the original -- confirm that
        # this is still required by the test environment.
        s3_client = boto3.Session(
            aws_access_key_id=access_key_id,
            aws_secret_access_key=secret_access_key).client('s3', verify=False)
        s3_client.delete_object(Bucket='analytics-zoo-data', Key='temp/abc/')
def test_exists_local(self):
    """Check ``exists`` on local ``file://`` URIs under ``self.resource_path``.

    ``orca/data/random.npy`` is expected to be reported as existing and
    ``orca/data/abc.npy`` as missing.
    """

    def local_uri(relative_path):
        # Resolve a resource-relative path into a file:// URI.
        return "file://" + os.path.join(self.resource_path, relative_path)

    assert exists(local_uri("orca/data/random.npy"))
    assert not exists(local_uri("orca/data/abc.npy"))
# Generate string indices for the categorical columns (limited by
# args.frequency_limit) and record the size reported by each index.
idx_list = tbl.gen_string_idx(CAT_COLS, freq_limit=args.frequency_limit)
cat_sizes = [string_index.size() for string_index in idx_list]
cross_sizes = args.cross_sizes

# Preprocessing pipeline, one named step per table operation:
# encode -> fillna -> normalize -> cross_columns.
encoded_tbl = tbl.encode_string(CAT_COLS, idx_list)
filled_tbl = encoded_tbl.fillna(0, INT_COLS + CAT_COLS)
normalized_tbl = filled_tbl.normalize(INT_COLS)
tbl_all_data = normalized_tbl.cross_columns(
    crossed_columns=[CAT_COLS[0:2], CAT_COLS[2:4]],
    bucket_sizes=cross_sizes,
)
tbl_all_data.compute()

time_end = time()
print("Train data loading and preprocessing time: ", time_end - time_start)

# save meta
meta_dir = os.path.join(args.output_folder, "meta")
if not exists(meta_dir):
    makedirs(meta_dir)

# One size per line, each line terminated by a newline.
cate_sizes_text = "".join(str(size) + '\n' for size in cat_sizes)
write_text(os.path.join(args.output_folder, "meta/categorical_sizes.txt"),
           cate_sizes_text)

cross_sizes_text = "".join(str(size) + '\n' for size in cross_sizes)
write_text(os.path.join(args.output_folder, "meta/cross_sizes.txt"),
           cross_sizes_text)

tbl_all_data.show(5)
print("Finished")