Exemple #1
0
def test_hdfs_file_exists():
    """Check File.exists() for a saved path and for a path never written.

    Ten 'Hello World' strings are saved as text under HDFS_TEST_PATH; the
    saved path must exist and a second random path must not.
    """
    random.seed()

    # The 'd' presentation type only accepts integers; formatting the raw
    # float raised ValueError, so truncate to int before formatting.
    fn1 = f'{HDFS_TEST_PATH}/pysparkling_test_{int(random.random() * 999999.0):d}.txt'
    fn2 = f'{HDFS_TEST_PATH}/pysparkling_test_{int(random.random() * 999999.0):d}.txt'

    rdd = Context().parallelize(f'Hello World {x}' for x in range(10))
    rdd.saveAsTextFile(fn1)

    assert File(fn1).exists() and not File(fn2).exists()
Exemple #2
0
def test_hdfs_file_exists():
    """Check File.exists() for a saved path and for a path never written.

    Ten 'Hello World' strings are saved as text under HDFS_TEST_PATH; the
    saved path must exist and a second random path must not.
    """
    random.seed()

    # '{:d}' only accepts integers; passing the raw float raised ValueError,
    # so the random suffix is truncated to int first.
    fn1 = '{}/pysparkling_test_{:d}.txt'.format(
        HDFS_TEST_PATH, int(random.random() * 999999.0))
    fn2 = '{}/pysparkling_test_{:d}.txt'.format(
        HDFS_TEST_PATH, int(random.random() * 999999.0))

    rdd = Context().parallelize('Hello World {0}'.format(x) for x in range(10))
    rdd.saveAsTextFile(fn1)

    assert File(fn1).exists() and not File(fn2).exists()
Exemple #3
0
def test_hdfs_file_exists():
    """Check File.exists() for a saved path and for a path never written.

    Skipped when HDFS_TEST_PATH is falsy.
    """
    if not HDFS_TEST_PATH:
        raise SkipTest

    random.seed()

    def _random_txt_path():
        # Random integer suffix so separate runs use different file names.
        suffix = int(random.random() * 999999.0)
        return HDFS_TEST_PATH + '/pysparkling_test_{0}.txt'.format(suffix)

    written_path = _random_txt_path()
    untouched_path = _random_txt_path()

    Context().parallelize(
        'Hello World {0}'.format(i) for i in range(10)
    ).saveAsTextFile(written_path)

    # Only the first path was written to.
    assert File(written_path).exists()
    assert not File(untouched_path).exists()
def test_s3_1():
    """Resolve a wildcard s3n pattern and expect the warc.paths.gz entry in the result."""
    pattern = 's3n://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2015-11/warc.paths.*'
    resolved = File.resolve_filenames(pattern)
    print(resolved)

    expected = 's3n://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2015-11/warc.paths.gz'
    assert expected in resolved
def test_s3_1():
    """Resolve a wildcard s3n pattern and expect the warc.paths.gz entry in the result."""
    wildcard = 's3n://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2015-11/warc.paths.*'
    matches = File.resolve_filenames(wildcard)
    print(matches)

    wanted = 's3n://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2015-11/warc.paths.gz'
    assert wanted in matches
Exemple #6
0
def resolve_partitions(patterns):
    """
    Map every file matched by ``patterns`` to its partition row and derive
    the partition schema.

    The files come from ``File.get_content(patterns)``. For each path that
    contains an ``=``, the parent folders holding exactly one ``=`` are split
    into key/value pairs and handed to ``row_from_keyed_values``; any other
    path is mapped to ``None``.

    :type patterns: list of str
    :return: a 2-tuple ``(partitions, partition_schema)`` where ``partitions``
        is a dict from file path to its partition row (or ``None``) and
        ``partition_schema`` is the result of ``guess_schema_from_strings``,
        or ``None`` when no path carried partition information.
    :raises AnalysisException: when no file matches, or when only some of the
        paths carry partition information.
    :raises Exception: when the partitioned paths do not all share the same
        partitioning fields.
    """
    file_paths = File.get_content(patterns)
    if not file_paths:
        raise AnalysisException('Path does not exist: {0}'.format(patterns))

    partitions = {}
    for file_path in file_paths:
        # NOTE: the "=" test covers the whole path, file name included, while
        # only the parent folders contribute key/value pairs.
        if "=" not in file_path:
            partitions[file_path] = None
            continue
        parent_folders = file_path.split("/")[:-1]
        partitions[file_path] = row_from_keyed_values(
            folder.split("=") for folder in parent_folders if folder.count("=") == 1
        )

    # Distinct field layouts seen across the partitioned paths.
    partitioning_field_sets = {
        row.__fields__ for row in partitions.values() if row is not None
    }

    if len(partitioning_field_sets) > 1:
        sources = ",".join(patterns)
        layouts = " and also ".join(str(fields) for fields in partitioning_field_sets)
        raise Exception(
            "Conflicting directory structures detected while reading {0}. "
            "All partitions must have the same partitioning fields, found fields {1}".format(
                sources, layouts
            )
        )

    if not partitioning_field_sets:
        # Nothing was partitioned, so there is no schema to infer.
        return partitions, None

    # Partitioned and unpartitioned paths cannot be mixed.
    malformed = [path for path, row in partitions.items() if row is None]
    if malformed:
        raise AnalysisException(
            "Unable to parse those malformed folders: {1} of {0}".format(
                file_paths, malformed
            )
        )

    partition_schema = guess_schema_from_strings(
        partitioning_field_sets.pop(), partitions.values(), options={}
    )
    return partitions, partition_schema
def test_s3_1():
    """Resolve a wildcard s3n pattern and expect the warc.paths.gz entry.

    Skipped when the AWS_ACCESS_KEY_ID environment variable is unset or empty.
    """
    access_key = os.getenv("AWS_ACCESS_KEY_ID")
    if not access_key:
        raise SkipTest

    pattern = "s3n://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2015-11/warc.paths.*"
    resolved = File.resolve_filenames(pattern)
    print(resolved)

    expected = "s3n://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2015-11/warc.paths.gz"
    assert expected in resolved
def test_s3_1():
    """Resolve a wildcard s3n pattern and expect the warc.paths.gz entry.

    Skipped when the AWS_ACCESS_KEY_ID environment variable is unset or empty.
    """
    if not os.getenv('AWS_ACCESS_KEY_ID'):
        # A bare return would report the test as passed even though nothing
        # was checked; raising SkipTest makes the runner report it as skipped.
        from unittest import SkipTest
        raise SkipTest

    filenames = File.resolve_filenames(
        's3n://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2015-11/warc.paths.*'
    )
    print(filenames)
    assert 's3n://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2015-11/warc.paths.gz' in filenames
Exemple #9
0
def test_dumpToFile():
    """Dump a pickled dict to a randomly named file under S3_TEST_PATH.

    Skipped when AWS_ACCESS_KEY_ID or S3_TEST_PATH is falsy.
    """
    if not AWS_ACCESS_KEY_ID or not S3_TEST_PATH:
        raise SkipTest

    random.seed()

    # str.format cannot mix automatic '{}' and manual '{0}' field numbering
    # (it raises ValueError), so both fields are numbered explicitly.
    fn = '{0}/pysparkling_test_{1}.pickle'.format(
        S3_TEST_PATH, int(random.random() * 999999.0))
    File(fn).dump(pickle.dumps({'hello': 'world'}))
Exemple #10
0
def test_s3_1():
    """Resolve a wildcard s3n pattern and expect the warc.paths.gz entry.

    Skipped when the AWS_ACCESS_KEY_ID environment variable is unset or empty.
    """
    has_credentials = os.getenv('AWS_ACCESS_KEY_ID')
    if not has_credentials:
        raise SkipTest

    wildcard = 's3n://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2015-11/warc.paths.*'
    matches = File.resolve_filenames(wildcard)
    print(matches)

    wanted = 's3n://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2015-11/warc.paths.gz'
    assert wanted in matches
def test_local_2():
    """Resolving an exact relative path must yield exactly that one path."""
    resolved = File.resolve_filenames('tests/test_resolve_filenames.py')

    # Exactly one result, and it is the path that was asked for.
    assert len(resolved) == 1
    assert 'tests/test_resolve_filenames.py' == resolved[0]
Exemple #12
0
def test_dumpToFile():
    """Dump a pickled dict to a randomly named file under S3_TEST_PATH."""
    random.seed()

    # '{:d}' only accepts integers; passing the raw float raised ValueError,
    # so the random suffix is truncated to int first.
    fn = '{}/pysparkling_test_{:d}.pickle'.format(
        S3_TEST_PATH, int(random.random() * 999999.0))
    File(fn).dump(pickle.dumps({'hello': 'world'}))
def test_local_2():
    """Resolving this file's own location must return a list with only that location."""
    resolved = File.resolve_filenames(CURRENT_FILE_LOCATION)
    expected = [CURRENT_FILE_LOCATION]
    assert resolved == expected
def test_local_1():
    """A wildcard over this file's directory must resolve to results that include this file."""
    parent_dir = os.path.dirname(CURRENT_FILE_LOCATION)
    pattern = '{}/*'.format(parent_dir)
    resolved = File.resolve_filenames(pattern)
    assert CURRENT_FILE_LOCATION in resolved
Exemple #15
0
def test_dumpToFile():
    """Dump a pickled dict to a randomly named file under S3_TEST_PATH."""
    random.seed()

    # The 'd' presentation type only accepts integers; formatting the raw
    # float raised ValueError, so truncate to int before formatting.
    fn = f'{S3_TEST_PATH}/pysparkling_test_{int(random.random() * 999999.0):d}.pickle'
    File(fn).dump(pickle.dumps({'hello': 'world'}))
def test_local_2():
    """Resolving an exact relative path must yield exactly that one path."""
    matches = File.resolve_filenames("tests/test_resolve_filenames.py")

    # Check the count first, then the single entry itself.
    assert len(matches) == 1
    assert "tests/test_resolve_filenames.py" == matches[0]
def test_local_1():
    """A 'tests/*' wildcard must resolve to results that include this test module's path."""
    resolved = File.resolve_filenames('tests/*')
    expected_entry = 'tests/test_resolve_filenames.py'
    assert expected_entry in resolved
def test_local_2():
    """Resolving an exact relative path must return a list holding only that path."""
    resolved = File.resolve_filenames('tests/test_resolve_filenames.py')
    expected = ['tests/test_resolve_filenames.py']
    assert resolved == expected
def test_local_1():
    """A "tests/*" wildcard must resolve to results that include this test module's path."""
    matches = File.resolve_filenames("tests/*")
    wanted = "tests/test_resolve_filenames.py"
    assert wanted in matches
def test_local_2():
    """Resolving this file's own location must return a list with only that location."""
    matches = File.resolve_filenames(CURRENT_FILE_LOCATION)
    only_this_file = [CURRENT_FILE_LOCATION]
    assert matches == only_this_file
def test_local_1():
    """A wildcard over this file's directory must resolve to results that include this file."""
    folder = os.path.dirname(CURRENT_FILE_LOCATION)
    wildcard = '{}/*'.format(folder)
    matches = File.resolve_filenames(wildcard)
    assert CURRENT_FILE_LOCATION in matches
Exemple #22
0
def test_local_2():
    """Resolving an exact relative path must return a list holding only that path."""
    matches = File.resolve_filenames('tests/test_resolve_filenames.py')
    only_match = ['tests/test_resolve_filenames.py']
    assert matches == only_match
def test_local_2():
    """Resolving an exact relative path must yield exactly that one path."""
    found = File.resolve_filenames('tests/test_resolve_filenames.py')

    # One result only, equal to the requested path.
    assert len(found) == 1
    assert 'tests/test_resolve_filenames.py' == found[0]
def test_local_1():
    """A wildcard over this file's directory must resolve to results that include this file.

    The pattern is built with the platform path separator.
    """
    folder = os.path.dirname(CURRENT_FILE_LOCATION)
    separator = os.path.sep
    pattern = f'{folder}{separator}*'
    resolved = File.resolve_filenames(pattern)
    assert CURRENT_FILE_LOCATION in resolved
Exemple #25
0
def test_local_1():
    """A 'tests/*' wildcard must resolve to results that include this test module's path."""
    found = File.resolve_filenames('tests/*')
    target = 'tests/test_resolve_filenames.py'
    assert target in found