Python File Exemples, pysparkling.fileio.File Python Exemples

Exemple #1

0

Afficher le fichier

def test_hdfs_file_exists():
    """Save an RDD to one HDFS path and check ``File.exists`` on two paths.

    The first path is written with ``saveAsTextFile`` and must exist; the
    second path is never written and must not exist.
    """
    random.seed()

    # The ``d`` format code only accepts integers, so the random float is
    # truncated with ``int`` first (formatting a float with ``:d`` raises
    # ValueError).
    fn1 = f'{HDFS_TEST_PATH}/pysparkling_test_{int(random.random() * 999999.0):d}.txt'
    fn2 = f'{HDFS_TEST_PATH}/pysparkling_test_{int(random.random() * 999999.0):d}.txt'

    rdd = Context().parallelize(f'Hello World {x}' for x in range(10))
    rdd.saveAsTextFile(fn1)

    assert File(fn1).exists() and not File(fn2).exists()

Exemple #2

0

Afficher le fichier

Fichier : test_textFile.py Projet : szdbl/pysparkling

def test_hdfs_file_exists():
    """Save an RDD to one HDFS path and check ``File.exists`` on two paths.

    The first path is written with ``saveAsTextFile`` and must exist; the
    second path is never written and must not exist.
    """
    random.seed()

    # ``{:d}`` rejects floats with ValueError, so the random value is
    # truncated to an int before formatting.
    fn1 = '{}/pysparkling_test_{:d}.txt'.format(
        HDFS_TEST_PATH, int(random.random() * 999999.0))
    fn2 = '{}/pysparkling_test_{:d}.txt'.format(
        HDFS_TEST_PATH, int(random.random() * 999999.0))

    rdd = Context().parallelize('Hello World {0}'.format(x) for x in range(10))
    rdd.saveAsTextFile(fn1)

    assert File(fn1).exists() and not File(fn2).exists()

Exemple #3

0

Afficher le fichier

def test_hdfs_file_exists():
    """Check ``File.exists`` for a written and an unwritten HDFS path.

    Raises ``SkipTest`` when ``HDFS_TEST_PATH`` is falsy.
    """
    if not HDFS_TEST_PATH:
        raise SkipTest

    random.seed()

    def random_target():
        # Build a path under HDFS_TEST_PATH with a random integer suffix.
        suffix = int(random.random() * 999999.0)
        return HDFS_TEST_PATH + '/pysparkling_test_{0}.txt'.format(suffix)

    written_path = random_target()
    untouched_path = random_target()

    greetings = ('Hello World {0}'.format(i) for i in range(10))
    Context().parallelize(greetings).saveAsTextFile(written_path)

    assert File(written_path).exists() and not File(untouched_path).exists()

Exemple #4

0

Afficher le fichier

Fichier : test_resolve_filenames.py Projet : xor007/pysparkling

def test_s3_1():
    """Resolve a wildcard S3 pattern and check the ``.gz`` key is listed."""
    pattern = ('s3n://aws-publicdatasets/common-crawl/'
               'crawl-data/CC-MAIN-2015-11/warc.paths.*')
    expected = ('s3n://aws-publicdatasets/common-crawl/'
                'crawl-data/CC-MAIN-2015-11/warc.paths.gz')

    resolved = File.resolve_filenames(pattern)
    print(resolved)
    assert expected in resolved

Exemple #5

0

Afficher le fichier

Fichier : test_resolve_filenames.py Projet : svenkreiss/pysparkling

def test_s3_1():
    """Check that the wildcard S3 pattern resolves to include ``warc.paths.gz``."""
    prefix = 's3n://aws-publicdatasets/common-crawl/'
    matches = File.resolve_filenames(
        prefix + 'crawl-data/CC-MAIN-2015-11/warc.paths.*'
    )
    print(matches)
    wanted = prefix + 'crawl-data/CC-MAIN-2015-11/warc.paths.gz'
    assert wanted in matches

Exemple #6

0

Afficher le fichier

Fichier : utils.py Projet : bireports/pysparkling

def resolve_partitions(patterns):
    """
    Resolve ``patterns`` to files and extract ``key=value`` partition folders.

    Every path returned by ``File.get_content(patterns)`` becomes a key of the
    returned mapping. Its value is the row built by ``row_from_keyed_values``
    from the parent folders containing exactly one ``=``, or ``None`` when the
    path contains no ``=`` at all.

    :param patterns: patterns handed to ``File.get_content``; they are also
        joined with ``","`` in the conflict error message.
    :return: a pair ``(partitions, partition_schema)`` where ``partitions``
        maps each file path to its partition row (or ``None``) and
        ``partition_schema`` is the result of ``guess_schema_from_strings``,
        or ``None`` when no path carried partition fields.
    :raises AnalysisException: if no file matched, or if partition fields were
        found while some paths produced no partition row.
    :raises Exception: if the paths do not all share the same partition fields.
    """
    file_paths = File.get_content(patterns)
    if not file_paths:
        raise AnalysisException('Path does not exist: {0}'.format(patterns))

    partitions = {}
    for file_path in file_paths:
        if "=" not in file_path:
            partitions[file_path] = None
            continue
        # Only parent folders are inspected; the last path component is skipped.
        keyed_folders = (
            folder.split("=")
            for folder in file_path.split("/")[:-1]
            if folder.count("=") == 1
        )
        partitions[file_path] = row_from_keyed_values(keyed_folders)

    partitioning_field_sets = {
        row.__fields__ for row in partitions.values() if row is not None
    }

    if len(partitioning_field_sets) > 1:
        joined_patterns = ",".join(patterns)
        found_fields = " and also ".join(
            str(fields) for fields in partitioning_field_sets
        )
        raise Exception(
            "Conflicting directory structures detected while reading {0}. "
            "All partitions must have the same partitioning fields, found fields {1}".format(
                joined_patterns,
                found_fields
            )
        )

    # No partitioned path at all: nothing to infer a schema from.
    if not partitioning_field_sets:
        return partitions, None

    unparsed_paths = [path for path, value in partitions.items() if value is None]
    if unparsed_paths:
        raise AnalysisException(
            "Unable to parse those malformed folders: {1} of {0}".format(
                file_paths,
                unparsed_paths
            )
        )

    # Exactly one field set remains at this point.
    partitioning_fields = partitioning_field_sets.pop()
    partition_schema = guess_schema_from_strings(
        partitioning_fields, partitions.values(), options={}
    )
    return partitions, partition_schema

Exemple #7

0

Afficher le fichier

Fichier : test_resolve_filenames.py Projet : nicoheidtke/pysparkling

def test_s3_1():
    """Resolve a wildcard S3 pattern and check the ``.gz`` key is listed.

    Raises ``SkipTest`` when ``AWS_ACCESS_KEY_ID`` is unset or empty.
    """
    if not os.getenv("AWS_ACCESS_KEY_ID"):
        raise SkipTest

    base = "s3n://aws-publicdatasets/common-crawl/"
    resolved = File.resolve_filenames(base + "crawl-data/CC-MAIN-2015-11/warc.paths.*")
    print(resolved)

    expected = base + "crawl-data/CC-MAIN-2015-11/warc.paths.gz"
    assert expected in resolved

Exemple #8

0

Afficher le fichier

Fichier : test_resolve_filenames.py Projet : gitter-badger/pysparkling

def test_s3_1():
    """Resolve a wildcard S3 pattern and check the ``.gz`` key is listed.

    Returns silently, without asserting, when ``AWS_ACCESS_KEY_ID`` is unset
    or empty.
    """
    has_credentials = bool(os.getenv('AWS_ACCESS_KEY_ID'))
    if not has_credentials:
        return

    pattern = 's3n://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2015-11/warc.paths.*'
    expected = 's3n://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2015-11/warc.paths.gz'

    resolved = File.resolve_filenames(pattern)
    print(resolved)
    assert expected in resolved

Exemple #9

0

Afficher le fichier

def test_dumpToFile():
    """Dump a pickled dict to a randomly named file under ``S3_TEST_PATH``.

    Raises ``SkipTest`` when ``AWS_ACCESS_KEY_ID`` or ``S3_TEST_PATH`` is
    falsy.
    """
    if not AWS_ACCESS_KEY_ID or not S3_TEST_PATH:
        raise SkipTest

    random.seed()

    # Use explicit indices for both fields: mixing automatic ``{}`` with
    # manual ``{0}`` numbering makes ``str.format`` raise ValueError.
    fn = '{0}/pysparkling_test_{1}.pickle'.format(
        S3_TEST_PATH, int(random.random() * 999999.0))
    File(fn).dump(pickle.dumps({'hello': 'world'}))

Exemple #10

0

Afficher le fichier

def test_s3_1():
    """Check that the wildcard S3 pattern resolves to include ``warc.paths.gz``.

    Raises ``SkipTest`` when ``AWS_ACCESS_KEY_ID`` is unset or empty.
    """
    access_key = os.getenv('AWS_ACCESS_KEY_ID')
    if not access_key:
        raise SkipTest

    pattern = ('s3n://aws-publicdatasets/common-crawl/'
               'crawl-data/CC-MAIN-2015-11/warc.paths.*')
    listing = File.resolve_filenames(pattern)
    print(listing)

    gz_key = ('s3n://aws-publicdatasets/common-crawl/'
              'crawl-data/CC-MAIN-2015-11/warc.paths.gz')
    assert gz_key in listing

Exemple #11

0

Afficher le fichier

Fichier : test_resolve_filenames.py Projet : giserh/pysparkling

def test_local_2():
    """Resolving one literal path must return exactly that path."""
    target = 'tests/test_resolve_filenames.py'
    resolved = File.resolve_filenames(target)
    assert len(resolved) == 1 and target == resolved[0]

Exemple #12

0

Afficher le fichier

Fichier : test_textFile.py Projet : szdbl/pysparkling

def test_dumpToFile():
    """Dump a pickled dict to a randomly named file under ``S3_TEST_PATH``."""
    random.seed()

    # ``{:d}`` rejects floats with ValueError, so the random value is
    # truncated to an int before formatting.
    fn = '{}/pysparkling_test_{:d}.pickle'.format(
        S3_TEST_PATH, int(random.random() * 999999.0))
    File(fn).dump(pickle.dumps({'hello': 'world'}))

Exemple #13

0

Afficher le fichier

Fichier : test_resolve_filenames.py Projet : xor007/pysparkling

def test_local_2():
    """Resolving this file's own location must return a one-element list."""
    expected = [CURRENT_FILE_LOCATION]
    resolved = File.resolve_filenames(CURRENT_FILE_LOCATION)
    assert resolved == expected

Exemple #14

0

Afficher le fichier

Fichier : test_resolve_filenames.py Projet : xor007/pysparkling

def test_local_1():
    """A ``<dir>/*`` wildcard on this file's directory must list this file."""
    parent_dir = os.path.dirname(CURRENT_FILE_LOCATION)
    matches = File.resolve_filenames('{}/*'.format(parent_dir))
    assert CURRENT_FILE_LOCATION in matches

Exemple #15

0

Afficher le fichier

def test_dumpToFile():
    """Dump a pickled dict to a randomly named file under ``S3_TEST_PATH``."""
    random.seed()

    # The ``d`` format code only accepts integers, so the random float is
    # truncated with ``int`` first (formatting a float with ``:d`` raises
    # ValueError).
    fn = f'{S3_TEST_PATH}/pysparkling_test_{int(random.random() * 999999.0):d}.pickle'
    File(fn).dump(pickle.dumps({'hello': 'world'}))

Exemple #16

0

Afficher le fichier

Fichier : test_resolve_filenames.py Projet : nicoheidtke/pysparkling

def test_local_2():
    """Resolving one literal path must return exactly that path."""
    path = "tests/test_resolve_filenames.py"
    found = File.resolve_filenames(path)
    assert len(found) == 1 and path == found[0]

Exemple #17

0

Afficher le fichier

Fichier : test_resolve_filenames.py Projet : alexprengere/pysparkling

def test_local_1():
    """The ``tests/*`` wildcard must list ``tests/test_resolve_filenames.py``."""
    matches = File.resolve_filenames('tests/*')
    expected = 'tests/test_resolve_filenames.py'
    assert expected in matches

Exemple #18

0

Afficher le fichier

Fichier : test_resolve_filenames.py Projet : alexprengere/pysparkling

def test_local_2():
    """Resolving one literal path must return a list holding only that path."""
    resolved = File.resolve_filenames('tests/test_resolve_filenames.py')
    expected = ['tests/test_resolve_filenames.py']
    assert resolved == expected

Exemple #19

0

Afficher le fichier

Fichier : test_resolve_filenames.py Projet : nicoheidtke/pysparkling

def test_local_1():
    """The ``tests/*`` wildcard must list ``tests/test_resolve_filenames.py``."""
    wildcard = "tests/*"
    listing = File.resolve_filenames(wildcard)
    assert "tests/test_resolve_filenames.py" in listing

Exemple #20

0

Afficher le fichier

Fichier : test_resolve_filenames.py Projet : svenkreiss/pysparkling

def test_local_2():
    """Resolving this file's own location must yield exactly ``[location]``."""
    result = File.resolve_filenames(CURRENT_FILE_LOCATION)
    only_this_file = [CURRENT_FILE_LOCATION]
    assert result == only_this_file

Exemple #21

0

Afficher le fichier

Fichier : test_resolve_filenames.py Projet : svenkreiss/pysparkling

def test_local_1():
    """A ``<dir>/*`` wildcard on this file's directory must list this file."""
    directory = os.path.dirname(CURRENT_FILE_LOCATION)
    wildcard = '{}/*'.format(directory)
    resolved = File.resolve_filenames(wildcard)
    assert CURRENT_FILE_LOCATION in resolved

Exemple #22

0

Afficher le fichier

def test_local_2():
    """Resolving one literal path must return a list holding only that path."""
    single_path = 'tests/test_resolve_filenames.py'
    resolved = File.resolve_filenames(single_path)
    assert resolved == [single_path]

Exemple #23

0

Afficher le fichier

Fichier : test_resolve_filenames.py Projet : gitter-badger/pysparkling

def test_local_2():
    """Resolving one literal path must return exactly that path."""
    wanted = 'tests/test_resolve_filenames.py'
    names = File.resolve_filenames(wanted)
    assert len(names) == 1 and wanted == names[0]

Exemple #24

0

Afficher le fichier

Fichier : test_resolve_filenames.py Projet : svenkreiss/pysparkling

def test_local_1():
    """A wildcard on this file's directory must list this file.

    The pattern is built with ``os.path.sep`` as the separator.
    """
    parent_dir = os.path.dirname(CURRENT_FILE_LOCATION)
    wildcard = f'{parent_dir}{os.path.sep}*'
    resolved = File.resolve_filenames(wildcard)
    assert CURRENT_FILE_LOCATION in resolved

Exemple #25

0

Afficher le fichier

def test_local_1():
    """The ``tests/*`` wildcard must list ``tests/test_resolve_filenames.py``."""
    this_test_file = 'tests/test_resolve_filenames.py'
    resolved = File.resolve_filenames('tests/*')
    assert this_test_file in resolved