def test_hdfs_file_exists():
    """Saving an RDD to HDFS creates fn1; the unused random name fn2 must not exist."""
    random.seed()
    # BUG FIX: the original applied the ':d' format spec directly to a float
    # (random.random() * 999999.0), which raises ValueError at runtime.
    # Truncate to int before formatting.
    fn1 = f'{HDFS_TEST_PATH}/pysparkling_test_{int(random.random() * 999999.0):d}.txt'
    fn2 = f'{HDFS_TEST_PATH}/pysparkling_test_{int(random.random() * 999999.0):d}.txt'
    rdd = Context().parallelize(f'Hello World {x}' for x in range(10))
    rdd.saveAsTextFile(fn1)
    assert File(fn1).exists() and not File(fn2).exists()
def test_hdfs_file_exists():
    """Saving an RDD to HDFS creates fn1; the unused random name fn2 must not exist."""
    random.seed()
    # BUG FIX: '{:d}' rejects float arguments (ValueError); convert the random
    # suffix to int before formatting.
    fn1 = '{}/pysparkling_test_{:d}.txt'.format(HDFS_TEST_PATH, int(random.random() * 999999.0))
    fn2 = '{}/pysparkling_test_{:d}.txt'.format(HDFS_TEST_PATH, int(random.random() * 999999.0))
    rdd = Context().parallelize('Hello World {0}'.format(x) for x in range(10))
    rdd.saveAsTextFile(fn1)
    assert File(fn1).exists() and not File(fn2).exists()
def test_hdfs_file_exists():
    """Writing an RDD to a random HDFS path makes it exist; a second random path stays absent."""
    if not HDFS_TEST_PATH:
        raise SkipTest
    random.seed()
    suffix_a = int(random.random() * 999999.0)
    suffix_b = int(random.random() * 999999.0)
    fn1 = HDFS_TEST_PATH + '/pysparkling_test_{0}.txt'.format(suffix_a)
    fn2 = HDFS_TEST_PATH + '/pysparkling_test_{0}.txt'.format(suffix_b)
    lines = ('Hello World {0}'.format(x) for x in range(10))
    Context().parallelize(lines).saveAsTextFile(fn1)
    assert File(fn1).exists() and not File(fn2).exists()
def test_s3_1():
    """Wildcard resolution on the public common-crawl bucket includes the known .gz path."""
    pattern = ('s3n://aws-publicdatasets/common-crawl/'
               'crawl-data/CC-MAIN-2015-11/warc.paths.*')
    expected = ('s3n://aws-publicdatasets/common-crawl/'
                'crawl-data/CC-MAIN-2015-11/warc.paths.gz')
    filenames = File.resolve_filenames(pattern)
    print(filenames)
    assert expected in filenames
def test_s3_1():
    """Listing a public common-crawl glob yields the known warc.paths.gz entry."""
    base = 's3n://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2015-11/'
    filenames = File.resolve_filenames(base + 'warc.paths.*')
    print(filenames)
    assert base + 'warc.paths.gz' in filenames
def resolve_partitions(patterns):
    """Expand path patterns and extract Hive-style partition metadata.

    Resolves *patterns* into concrete file paths and, for any path whose
    folder components look like ``key=value`` pairs, derives the partition
    key/values for that file.

    :type patterns: list of str
    :param patterns: glob-style path patterns to expand.
    :rtype: Tuple[Dict[str, Optional[Row]], Optional[StructType]]
    :returns: a mapping from each resolved file path to a Row of its
        partition key/values (None for unpartitioned paths), and a schema
        guessed for the partition keys (None when no partitioning was
        detected).
    :raises AnalysisException: if no file matches the patterns, or if a
        partitioned dataset contains paths whose folders cannot be parsed.
    :raises Exception: if files disagree on their partitioning fields.
    """
    file_paths = File.get_content(patterns)
    if not file_paths:
        raise AnalysisException('Path does not exist: {0}'.format(patterns))
    partitions = {}
    for file_path in file_paths:
        if "=" in file_path:
            # Build a Row from every folder component of the exact form
            # "key=value" (a single '='); the final path component is the
            # file name and is skipped via [:-1].
            row = row_from_keyed_values(
                folder.split("=")
                for folder in file_path.split("/")[:-1]
                if folder.count("=") == 1
            )
            partitions[file_path] = row
        else:
            partitions[file_path] = None
    # Distinct partitioning-field signatures across all partitioned files.
    # NOTE(review): this assumes Row.__fields__ is hashable (e.g. a tuple);
    # a list value would raise TypeError here — confirm against the Row type.
    partitioning_field_sets = set(p.__fields__ for p in partitions.values() if p is not None)
    if len(partitioning_field_sets) > 1:
        # More than one signature means the directory layouts conflict.
        raise Exception(
            "Conflicting directory structures detected while reading {0}. "
            "All partitions must have the same partitioning fields, found fields {1}".format(
                ",".join(patterns),
                " and also ".join(
                    str(fields) for fields in partitioning_field_sets
                )
            )
        )
    if partitioning_field_sets:
        # A partitioned dataset must not mix in paths we failed to parse.
        if any(value is None for value in partitions.values()):
            raise AnalysisException(
                "Unable to parse those malformed folders: {1} of {0}".format(
                    file_paths,
                    [path for path, value in partitions.items() if value is None]
                )
            )
        partitioning_fields = partitioning_field_sets.pop()
        # Infer column types for the partition keys from their string values.
        partition_schema = guess_schema_from_strings(
            partitioning_fields, partitions.values(), options={}
        )
    else:
        partition_schema = None
    return partitions, partition_schema
def test_s3_1():
    """Skip without AWS credentials; otherwise resolve the public warc.paths glob."""
    if not os.getenv("AWS_ACCESS_KEY_ID"):
        raise SkipTest
    prefix = "s3n://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2015-11/"
    filenames = File.resolve_filenames(prefix + "warc.paths.*")
    print(filenames)
    assert prefix + "warc.paths.gz" in filenames
def test_s3_1():
    """Verify glob expansion against the public common-crawl bucket (needs AWS creds)."""
    if not os.getenv('AWS_ACCESS_KEY_ID'):
        return
    pattern = 's3n://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2015-11/warc.paths.*'
    filenames = File.resolve_filenames(pattern)
    print(filenames)
    expected = 's3n://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2015-11/warc.paths.gz'
    assert expected in filenames
def test_dumpToFile():
    """Dump a pickled dict to a random S3 key (skipped without credentials/config)."""
    if not AWS_ACCESS_KEY_ID or not S3_TEST_PATH:
        raise SkipTest
    random.seed()
    # BUG FIX: the original format string mixed automatic ('{}') and manual
    # ('{0}') field numbering, which raises ValueError. Use automatic
    # numbering for both fields.
    fn = '{}/pysparkling_test_{}.pickle'.format(
        S3_TEST_PATH, int(random.random() * 999999.0))
    File(fn).dump(pickle.dumps({'hello': 'world'}))
def test_s3_1():
    """Requires AWS credentials; checks wildcard expansion on the public bucket."""
    if not os.getenv('AWS_ACCESS_KEY_ID'):
        raise SkipTest
    found = File.resolve_filenames(
        's3n://aws-publicdatasets/common-crawl/'
        'crawl-data/CC-MAIN-2015-11/warc.paths.*')
    print(found)
    target = ('s3n://aws-publicdatasets/common-crawl/'
              'crawl-data/CC-MAIN-2015-11/warc.paths.gz')
    assert target in found
def test_local_2():
    """Resolving an exact local path returns just that path."""
    path = 'tests/test_resolve_filenames.py'
    resolved = File.resolve_filenames(path)
    assert len(resolved) == 1
    assert resolved[0] == path
def test_dumpToFile():
    """Dump a pickled dict to a random key under S3_TEST_PATH."""
    random.seed()
    # BUG FIX: '{:d}' rejects float arguments (ValueError); truncate the
    # random suffix to int before formatting.
    fn = '{}/pysparkling_test_{:d}.pickle'.format(
        S3_TEST_PATH, int(random.random() * 999999.0))
    File(fn).dump(pickle.dumps({'hello': 'world'}))
def test_local_2():
    """An exact existing path resolves to a singleton list containing itself."""
    result = File.resolve_filenames(CURRENT_FILE_LOCATION)
    assert len(result) == 1 and result[0] == CURRENT_FILE_LOCATION
def test_local_1():
    """A wildcard over this file's directory matches this file."""
    directory = os.path.dirname(CURRENT_FILE_LOCATION)
    matches = File.resolve_filenames('{}/*'.format(directory))
    assert CURRENT_FILE_LOCATION in matches
def test_dumpToFile():
    """Dump a pickled dict to a random key under S3_TEST_PATH."""
    random.seed()
    # BUG FIX: the original applied ':d' to a float inside the f-string,
    # which raises ValueError; convert to int first.
    fn = f'{S3_TEST_PATH}/pysparkling_test_{int(random.random() * 999999.0):d}.pickle'
    File(fn).dump(pickle.dumps({'hello': 'world'}))
def test_local_2():
    """A concrete filename resolves to exactly one entry: itself."""
    target = "tests/test_resolve_filenames.py"
    resolved = File.resolve_filenames(target)
    assert 1 == len(resolved)
    assert target == resolved[0]
def test_local_1():
    """The 'tests/*' glob should include this test module."""
    matched = File.resolve_filenames('tests/*')
    assert 'tests/test_resolve_filenames.py' in matched
def test_local_2():
    """A non-glob path resolves to a one-element list of itself."""
    name = 'tests/test_resolve_filenames.py'
    assert File.resolve_filenames(name) == [name]
def test_local_1():
    """Globbing over the tests directory finds this file."""
    expected = "tests/test_resolve_filenames.py"
    assert expected in File.resolve_filenames("tests/*")
def test_local_1():
    """A directory wildcard built from this file's location matches the file."""
    pattern = '{}/*'.format(os.path.dirname(CURRENT_FILE_LOCATION))
    assert CURRENT_FILE_LOCATION in File.resolve_filenames(pattern)
def test_local_2():
    """Resolving a plain path returns it unchanged as a single entry."""
    path = 'tests/test_resolve_filenames.py'
    result = File.resolve_filenames(path)
    assert result == [path]
def test_local_2():
    """An exact relative path resolves to exactly one match: itself."""
    wanted = 'tests/test_resolve_filenames.py'
    got = File.resolve_filenames(wanted)
    assert len(got) == 1
    assert got[0] == wanted
def test_local_1():
    """An os.path.sep-joined wildcard over this file's directory finds the file."""
    folder = os.path.dirname(CURRENT_FILE_LOCATION)
    pattern = f'{folder}{os.path.sep}*'
    assert CURRENT_FILE_LOCATION in File.resolve_filenames(pattern)
def test_local_1():
    """The tests/ wildcard includes this module's filename."""
    names = File.resolve_filenames('tests/*')
    assert 'tests/test_resolve_filenames.py' in names