Example #1
def test_statistics(tempdir):
    s = pd.Series([b'a', b'b', b'c'] * 20)
    df = pd.DataFrame({
        'a': s,
        'b': s.astype('category'),
        'c': s.astype('category').cat.as_ordered()
    })
    fastparquet.write(tempdir, df, file_scheme='hive')
    pf = fastparquet.ParquetFile(tempdir)
    stat = pf.statistics
    assert stat['max']['a'] == [b'c']
    assert stat['min']['a'] == [b'a']
    assert stat['max']['b'] == [b'c']
    assert stat['min']['b'] == [b'a']
    assert stat['max']['c'] == [b'c']
    assert stat['min']['c'] == [b'a']
Example #2
def read_header(
        bucket: str,
        key: str,
        open_with: Callable[[str, str],
                            Any] = _minio_open_random) -> ParquetFile:
    """
    Ensure a ParquetFile exists, and return it with headers read.

    May raise FileNotFoundError or FastparquetCouldNotHandleFile.

    `retval.fn` gives the filename; `retval.columns` gives column names;
    `retval.dtypes` gives pandas dtypes, and `retval.to_pandas()` reads
    the entire file.
    """
    filelike = open_with(bucket, key)  # raises FileNotFoundError
    return fastparquet.ParquetFile(filelike)
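
A minimal usage sketch for the helper above (bucket and key names are placeholders); it relies only on the ParquetFile attributes listed in the docstring:

pf = read_header("my-bucket", "events/2020-01.parquet")  # hypothetical names
print(pf.columns)    # column names, read from the footer only
print(pf.dtypes)     # pandas dtypes inferred from the parquet schema
df = pf.to_pandas()  # materializes the whole file as a DataFrame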
Example #3
def test_read_footer_fail(tempdir, size):
    """Test reading the footer."""
    import struct
    fn = os.path.join(TEST_DATA, "nation.impala.parquet")
    fout = os.path.join(tempdir, "temp.parquet")
    with open(fn, 'rb') as f1:
        with open(fout, 'wb') as f2:
            f1.seek(-8, 2)
            head_size = struct.unpack('<i', f1.read(4))[0]
            f1.seek(-(head_size + 8), 2)
            block = f1.read(head_size)
            f2.write(b'0' * 25)  # padding
            f2.write(block[:-size])
            f2.write(f1.read())
    with pytest.raises(TypeError):
        p = fastparquet.ParquetFile(fout)
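
The seek arithmetic in this test relies on the Parquet trailer layout: a file ends with a 4-byte little-endian footer length followed by the magic bytes b'PAR1'. A small standalone sketch for inspecting that trailer (the filename is a placeholder):

import struct

with open("some.parquet", "rb") as f:
    f.seek(-8, 2)                                       # last 8 bytes of the file
    footer_length = struct.unpack('<i', f.read(4))[0]   # little-endian int32
    magic = f.read(4)                                    # expected to be b'PAR1'
print(footer_length, magic)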
Example #4
def test_ordering(tmpdir):
    check_fastparquet()
    tmp = str(tmpdir)
    df = pd.DataFrame({'a': [1, 2, 3],
                       'b': [10, 20, 30],
                       'c': [100, 200, 300]},
                      index=pd.Index([-1, -2, -3], name='myindex'),
                      columns=['c', 'a', 'b'])
    ddf = dd.from_pandas(df, npartitions=2)
    dd.to_parquet(ddf, tmp)

    pf = fastparquet.ParquetFile(tmp)
    assert pf.columns == ['myindex', 'c', 'a', 'b']

    ddf2 = dd.read_parquet(tmp, index='myindex')
    assert_eq(ddf, ddf2)
Example #5
def test_ordering(tmpdir, write_engine, read_engine):
    tmp = str(tmpdir)
    df = pd.DataFrame({'a': [1, 2, 3],
                       'b': [10, 20, 30],
                       'c': [100, 200, 300]},
                      index=pd.Index([-1, -2, -3], name='myindex'),
                      columns=['c', 'a', 'b'])
    ddf = dd.from_pandas(df, npartitions=2)
    dd.to_parquet(ddf, tmp, engine=write_engine)

    if read_engine == 'fastparquet':
        pf = fastparquet.ParquetFile(tmp)
        assert pf.columns == ['myindex', 'c', 'a', 'b']

    ddf2 = dd.read_parquet(tmp, index='myindex', engine=read_engine)
    assert_eq(ddf, ddf2, check_divisions=False)
Example #6
    def test_partition_cols_supported(self, fp, df_full):
        # GH #23283
        partition_cols = ["bool", "int"]
        df = df_full
        with tm.ensure_clean_dir() as path:
            df.to_parquet(
                path,
                engine="fastparquet",
                partition_cols=partition_cols,
                compression=None,
            )
            assert os.path.exists(path)
            import fastparquet

            actual_partition_cols = fastparquet.ParquetFile(path, False).cats
            assert len(actual_partition_cols) == 2
Example #7
def test_map_multipage(tempdir):
    pf = fastparquet.ParquetFile(os.path.join(TEST_DATA, "map-test.snappy.parquet"))
    assert pf.count() == 3551
    df = pf.to_pandas()
    first_row_keys = [u'FoxNews.com', u'News Network', u'mobile technology', u'broadcast', u'sustainability',
                      u'collective intelligence', u'radio', u'business law', u'LLC', u'telecommunications',
                      u'FOX News Network']
    last_row_keys = [u'protests', u'gas mask', u'Pot & Painting Party', u'Denver', u'New Year', u'Anderson Cooper',
                     u'gas mask bonk', u'digital media', u'marijuana leaf earrings', u'Screengrab', u'gas mask bongs',
                     u'Randi Kaye', u'Lee Rogers', u'Andy Cohen', u'CNN', u'Times Square', u'Colorado', u'opera',
                     u'slavery', u'Kathy Griffin', u'marijuana cigarette', u'executive producer']

    assert len(df) == 3551
    assert sorted(df["topics"].iloc[0].keys()) == sorted(first_row_keys)
    assert sorted(df["topics"].iloc[-1].keys()) == sorted(last_row_keys)
    assert df.isnull().sum().sum() == 0 # ensure every row got converted
Example #8
    def test_partition_cols_string(self, fp, df_full):
        # GH #27117
        partition_cols = "bool"
        df = df_full
        with tm.ensure_clean_dir() as path:
            df.to_parquet(
                path,
                engine="fastparquet",
                partition_cols=partition_cols,
                compression=None,
            )
            assert os.path.exists(path)
            import fastparquet  # noqa: F811

            actual_partition_cols = fastparquet.ParquetFile(path, False).cats
            assert len(actual_partition_cols) == 1
Example #9
def read_parquet(path, storage_options=None):
    """
    Construct a SpatialPointsFrame from a spatially partitioned parquet
    file

    If the input parquet file does not contain compatible spatial metadata,
    then the resulting SpatialPointsFrame will have a .spatial property of
    None, and the spatial_query operation will be unavailable.

    Parameters
    ----------
    path: str
        Path to a spatially partitioned parquet file that was created
        using datashader.spatial.points.to_parquet

    storage_options : dict or None (default None)
        Key/value pairs to be passed on to the file-system backend, if any.

    Returns
    -------
    SpatialPointsFrame
        A spatially sorted Dask dataframe reconstructed from disk
    """
    _validate_fastparquet()

    # Read parquet file
    frame = dd.read_parquet(path, storage_options=storage_options)

    # Open parquet file
    fs, _, paths = get_fs_token_paths(path,
                                      mode="rb",
                                      storage_options=storage_options)
    # Trim any protocol information from the path before forwarding
    path = fs._strip_protocol(path)
    pf = fp.ParquetFile(path, open_with=fs.open)

    # Check for spatial points metadata
    if 'SpatialPointsFrame' in pf.key_value_metadata:
        # Load metadata
        props = json.loads(pf.key_value_metadata['SpatialPointsFrame'])
    else:
        props = None

    # Call DataFrame constructor with the internals of frame
    return SpatialPointsFrame(frame.dask, frame._name, frame._meta,
                              frame.divisions, props)
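
A hypothetical invocation with a remote path; storage_options is passed through to the fsspec/s3fs backend (the path and options below are placeholders):

sframe = read_parquet("s3://my-bucket/points.parquet",
                      storage_options={"anon": True})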
Example #10
def pq_to_np(rows, source, destination, prefix):
    """A function to save the parquet files as npy files instead. The data can
    be split into many smaller npy files by using the rows argument.
    """

    dir_path = pathlib.Path(source)
    id_prefix_len = len('Train_')

    if dir_path.is_dir():
        for file_path in dir_path.iterdir():
            if file_path.suffix == '.parquet':
                if file_path.stem.startswith(prefix):

                    print(f"Loading {file_path.resolve().as_posix()}.")

                    parquet = pq.ParquetFile(file_path.resolve().as_posix())
                    df = parquet.to_pandas()
                    df['image_id'] = df['image_id'].map(
                        lambda x: x[id_prefix_len:])
                    df['image_id'] = pd.to_numeric(df['image_id'],
                                                   downcast='unsigned')
                    print(df.dtypes)
                    print(f"Exporting data as .npz files.")

                    df_samples = df.shape[0]
                    processed_samples = 0
                    file_idx = 0

                    while processed_samples < df_samples:
                        file_size = min(rows, df_samples - processed_samples)
                        output_file = f"{destination}/{file_path.stem}_" \
                                      f"{file_idx}_{file_size}rows.npz"
                        np_samples = df.iloc[
                            processed_samples:processed_samples + file_size,
                            1:].to_numpy()
                        np_ids = df.iloc[processed_samples:processed_samples +
                                         file_size, 0].to_numpy()
                        np.savez(output_file, ids=np_ids, images=np_samples)
                        print(np_ids.shape, np_samples.shape)
                        processed_samples += rows
                        file_idx += 1

                    print("Complete.")

    return
Example #11
def count_remaining_interactions():
    interactions_remained = 0
    for file in listdir(path_dataset):
        if file[-8:] == '.parquet':
            current_file = fastparquet.ParquetFile(path_dataset + file)
            current_df = current_file.to_pandas(['post_id', 'uid'])

            for row in current_df.values:
                current_oid = int(row[0].split('_')[0])
                current_uid = int(row[1])
                try:
                    temp = [oid2ind[current_oid], uid2ind[current_uid]]
                    interactions_remained += 1
                except KeyError:
                    pass

    print('Interactions remained:', interactions_remained)
    return interactions_remained
Example #12
def test_index(tempdir):
    s = pd.Series(['a', 'c', 'b'] * 20)
    df = pd.DataFrame({
        'a': s,
        'b': s.astype('category'),
        'c': range(60, 0, -1)
    })

    for column in df:
        d2 = df.set_index(column)
        fastparquet.write(tempdir, d2, file_scheme='hive', write_index=True)
        pf = fastparquet.ParquetFile(tempdir)
        out = pf.to_pandas(index=column, categories=['b'])
        pd.testing.assert_frame_equal(out,
                                      d2,
                                      check_categorical=False,
                                      check_index_type=False,
                                      check_dtype=False)
Example #13
def test_writing_parquet_with_compression(tmpdir, compression, engine):
    fn = str(tmpdir)

    if engine == 'fastparquet' and compression in ['snappy', 'default']:
        pytest.importorskip('snappy')

    df = pd.DataFrame({'x': ['a', 'b', 'c'] * 10, 'y': [1, 2, 3] * 10})
    ddf = dd.from_pandas(df, npartitions=3)

    ddf.to_parquet(fn, compression=compression, engine=engine)
    if engine == 'fastparquet' and compression == 'default':
        # ensure default compression for fastparquet is Snappy
        import fastparquet
        pf = fastparquet.ParquetFile(fn)
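        # CompressionCodec values in the Parquet format: 0=UNCOMPRESSED, 1=SNAPPY, 2=GZIP, ...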
        assert pf.row_groups[0].columns[0].meta_data.codec == 1

    out = dd.read_parquet(fn, engine=engine)
    assert_eq(out, df, check_index=(engine != 'fastparquet'))
Example #14
def save_embeddings_to_pickle_file():
    import pandas
    import datetime
    timestart = datetime.datetime.now()

    print("Embedding vector size =", vector_size)

    embedding_pickle_file = os.path.join(home_dir, "Models/w2vmodel_pubmed_vs_{}_ws_{}_mc_{}.pkl" \
        .format(vector_size, window_size, min_count))

    Word2Vec_Model = {}

    print("Reading the Parquet embedding files ....")
    files = os.listdir(embedding_full_path)
    for index, filename in enumerate(files):
        if "part" in filename:
            parquet_file_path = os.path.join(embedding_full_path, filename)
            print("reading {}".format(parquet_file_path))

            try:
                pfile = fastparquet.ParquetFile(parquet_file_path)
                # convert to pandas dataframe
                df = pfile.to_pandas()
                #             df = pandas.read_csv(tsv_full_path, sep='\t')

                #print(df.head())
                arr = list(df.values)
                for ind, vals in enumerate(arr):
                    word = vals[0]
                    word_vec = vals[-vector_size:]
                    word_vec = np.array(word_vec)
                    Word2Vec_Model[word] = word_vec.astype('float32')
            except:
                print("Skip {}".format(filename))

    #save the embedding matrix into a pickle file
    print("save the embedding matrix of {} entries into a pickle file".format(
        len(Word2Vec_Model)))
    pickle.dump(Word2Vec_Model, open(embedding_pickle_file, "wb"))

    timeend = datetime.datetime.now()
    timedelta = round((timeend - timestart).total_seconds() / 60, 2)
    print("Time taken to execute above cell: " + str(timedelta) + " mins")
    return (embedding_pickle_file)
Example #15
def save_embeddings_to_pickle_file(embedding_full_path, embedding_pickle_file,
                                   embed_vector_size):
    import pandas
    import datetime
    timestart = datetime.datetime.now()

    print("Embedding vector size =", embed_vector_size)
    Word2Vec_Model = {}

    print("Reading the Parquet embedding files .... {}".format(
        embedding_full_path))
    files = os.listdir(embedding_full_path)
    for index, filename in enumerate(files):
        if "part" in filename:
            parquet_file_path = os.path.join(embedding_full_path, filename)
            print("reading {}".format(parquet_file_path))

            try:
                pfile = fastparquet.ParquetFile(parquet_file_path)
                # convert to pandas dataframe
                df = pfile.to_pandas()

                #print(df.head())
                arr = list(df.values)
                for ind, vals in enumerate(arr):
                    word = vals[0]
                    word_vec = vals[1:embed_vector_size + 1]
                    word_vec = np.array(word_vec)
                    Word2Vec_Model[word] = word_vec.astype('float32')
            except:
                print("Skip {}".format(filename))

    #save the embedding matrix into a pickle file
    print("save the embedding matrix of {} entries into a pickle file".format(
        len(Word2Vec_Model)))
    pickle.dump(Word2Vec_Model, open(embedding_pickle_file, "wb"))

    timeend = datetime.datetime.now()
    timedelta = round((timeend - timestart).total_seconds() / 60, 2)
    print(
        "Time taken to execute the save_embeddings_to_pickle_file function: " +
        str(timedelta) + " mins")
    print("Done.")
Example #16
def test_v2():
    # from https://github.com/apache/parquet-testing/tree/master/data
    pf = fastparquet.ParquetFile(
        os.path.join(TEST_DATA, 'datapage_v2.snappy.parquet'))
    expected = {
        'a': {0: 'abc', 1: 'abc', 2: 'abc', 3: None, 4: 'abc'},
        'b': {0: 1, 1: 2, 2: 3, 3: 4, 4: 5},
        'c': {0: 2.0, 1: 3.0, 2: 4.0, 3: 5.0, 4: 2.0},
        'd': {0: True, 1: True, 2: True, 3: False, 4: True},
        'e': {0: [1, 2, 3], 1: None, 2: None, 3: [1, 2, 3], 4: [1, 2]}
    }
    out = pf.to_pandas()
    assert out.to_dict() == expected
Example #17
def test_with_cache():
    import tempfile
    d = tempfile.mkdtemp()
    old = intake.config.conf['cache_dir']
    expected = fastparquet.ParquetFile(os.path.join(here, 'split')).to_pandas()
    try:
        intake.config.conf['cache_dir'] = d
        cat = intake.open_catalog(os.path.join(here, 'cache_cat.yaml'))
        s = cat.split()
        assert isinstance(s.cache[0], intake.source.cache.DirCache)
        outfiles = s.cache[0].load(s._urlpath, output=False)
        assert outfiles
        assert outfiles[0].startswith(s.cache_dirs[0])
        loc = s.cache[0]._path(s._urlpath)
        assert glob.glob(loc + '/*/*/*.parquet')
        assert s.read().reset_index(drop=True).equals(expected)
    finally:
        shutil.rmtree(d)
        intake.config.conf['cache_dir'] = old
Example #18
def test_file_csv(parquet_file):
    """Test the various file times
    """
    p = fastparquet.ParquetFile(parquet_file)
    data = p.to_pandas()
    if 'comment_col' in data.columns:
        mapping = {
            'comment_col': "n_comment",
            'name': 'n_name',
            'nation_key': 'n_nationkey',
            'region_key': 'n_regionkey'
        }
        data.columns = [mapping[k] for k in data.columns]
    data.set_index('n_nationkey', inplace=True)

    for col in cols[1:]:
        if isinstance(data[col][0], bytes):
            data[col] = data[col].str.decode('utf8')
        assert (data[col] == expected[col]).all()
Example #19
    def value(self, erase_overlap=False):
        '''
        :rtype: Pandas dataframe of clustering
        '''
        ret = None
        if self.cluster is not None:
            ret = self.cluster
        else:
            self.logger.info("reading " + self.path)
            df = fastparquet.ParquetFile(self.path).to_pandas()
            self.cluster = df
            if self.cluster is not None:
                self.cluster = self.cluster.drop_duplicates()
            ret = self.cluster
        if not erase_overlap:
            return ret
        else:
            fn = lambda obj: obj.loc[np.random.choice(obj.index, 1, False), :]
            return ret.groupby('node', as_index=False).apply(fn)
Example #20
def test_null_plain_dictionary():
    """Test reading a file that contains null records for a plain dictionary
     column."""
    p = fastparquet.ParquetFile(
        os.path.join(TEST_DATA, "test-null-dictionary.parquet"))
    data = p.to_pandas()
    expected = pd.DataFrame(
        [{"foo": None}] + [{"foo": "bar"}, {"foo": "baz"}] * 3)
    for col in data:
        if isinstance(data[col][1], bytes):
            # Remove when re-implemented converted types
            data[col] = data[col].str.decode('utf8')
        assert (data[col] == expected[col])[~expected[col].isnull()].all()
        assert sum(data[col].isnull()) == sum(expected[col].isnull())
Example #21
def test_dir_partition():
    """Test creation of categories from directory structure"""
    x = np.arange(2000)
    df = pd.DataFrame({
        'num': x,
        'cat': pd.Series(np.array(['fred', 'freda'])[x % 2], dtype='category'),
        'catnum': pd.Series(np.array([1, 2, 3])[x % 3], dtype='category')
    })
    pf = fastparquet.ParquetFile(os.path.join(TEST_DATA, "split"))
    out = pf.to_pandas()
    for cat, catnum in product(['fred', 'freda'], [1, 2, 3]):
        assert (df.num[(df.cat == cat) & (df.catnum == catnum)].tolist()) ==\
                out.num[(out.cat == cat) & (out.catnum == catnum)].tolist()
    assert out.cat.dtype == 'category'
    assert out.catnum.dtype == 'category'
    assert out.catnum.cat.categories.dtype == 'int64'
Example #22
def test_cat_filters():
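    # 'cat' and 'catnum' are directory partitions of the 'split' test dataset,
    # so these filters prune whole partition directories and yield exact counts.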
    path = os.path.join(TEST_DATA, 'split')
    pf = fastparquet.ParquetFile(path)
    base_shape = len(pf.to_pandas())

    filters = [('cat', '==', 'freda')]
    assert len(pf.to_pandas(filters=filters)) == 1000

    filters = [('cat', '!=', 'freda')]
    assert len(pf.to_pandas(filters=filters)) == 1000

    filters = [('cat', 'in', ['fred', 'freda'])]
    assert 0 < len(pf.to_pandas(filters=filters)) == 2000

    filters = [('cat', 'not in', ['fred', 'frederick'])]
    assert 0 < len(pf.to_pandas(filters=filters)) == 1000

    filters = [('catnum', '==', 2000)]
    assert len(pf.to_pandas(filters=filters)) == 0

    filters = [('catnum', '>=', 2)]
    assert 0 < len(pf.to_pandas(filters=filters)) == 1333

    filters = [('catnum', '>=', 1)]
    assert len(pf.to_pandas(filters=filters)) == base_shape

    filters = [('catnum', 'in', [0, 1])]
    assert len(pf.to_pandas(filters=filters)) == 667

    filters = [('catnum', 'not in', [1, 2, 3])]
    assert len(pf.to_pandas(filters=filters)) == 0

    # AND
    filters = [[('cat', '==', 'freda'), ('catnum', '>=', 2.5)]]
    assert len(pf.to_pandas(filters=filters)) == 333

    # OR
    filters = [('cat', '==', 'freda'), ('catnum', '>=', 2.5)]
    assert len(pf.to_pandas(filters=filters)) == 1333

    # AND
    filters = [[('cat', '==', 'freda'), ('catnum', '!=', 2.5)]]
    assert len(pf.to_pandas(filters=filters)) == 1000
Example #23
def test_stat_filters():
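    # These filters are evaluated against row-group min/max statistics, so they
    # keep or drop whole row groups; the assertions check bounds rather than
    # exact row-level matches.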
    path = os.path.join(TEST_DATA, 'split')
    pf = fastparquet.ParquetFile(path)
    base_shape = len(pf.to_pandas())

    filters = [('num', '>', 0)]
    assert len(pf.to_pandas(filters=filters)) == base_shape

    filters = [('num', '<', 0)]
    assert len(pf.to_pandas(filters=filters)) == 0

    filters = [('num', '>', 500)]
    assert 0 < len(pf.to_pandas(filters=filters)) < base_shape

    filters = [('num', '>', 1500)]
    assert 0 < len(pf.to_pandas(filters=filters)) < base_shape

    filters = [('num', '>', 2000)]
    assert len(pf.to_pandas(filters=filters)) == 0

    filters = [('num', '>=', 1999)]
    assert 0 < len(pf.to_pandas(filters=filters)) < base_shape

    filters = [('num', '!=', 1000)]
    assert len(pf.to_pandas(filters=filters)) == base_shape

    filters = [('num', 'in', [-1, -2])]
    assert len(pf.to_pandas(filters=filters)) == 0

    filters = [('num', 'not in', [-1, -2])]
    assert len(pf.to_pandas(filters=filters)) == base_shape

    filters = [('num', 'in', [0])]
    l = len(pf.to_pandas(filters=filters))
    assert 0 < l < base_shape

    filters = [('num', 'in', [0, 1500])]
    assert l < len(pf.to_pandas(filters=filters)) < base_shape

    filters = [('num', 'in', [-1, 1999])]
    l = len(pf.to_pandas(filters=filters))
    assert 0 < l < base_shape
Example #24
def load_parquet_fp(path: str, **kwargs) -> pd.DataFrame:
    """ Helper function to load a parquet Dataset as a Pandas DataFrame using
        fastparquet

    First creates a [ParquetFile](https://fastparquet.readthedocs.io/en/latest/api.html#fastparquet.ParquetFile)
    and then converts the ParquetFile to a DataFrame using .to_pandas.
    Refer to the fastparquet documentation for accepted arguments

    Parameters
    -----------
    path : str
        The root directory of the Parquet Dataset stored locally or in S3

    Returns
    --------
    pd.DataFrame
    """
    import fastparquet as fp

    logger.info(
        f"Reading in Parquet dataset to ParquetFile. kwargs passed {kwargs!r}")

    fs = kwargs.pop("fs", None)

    # Pull out arguments that should be directed to to_pandas
    to_pandas_args = parse_args(fp, ["ParquetFile", "to_pandas"], **kwargs)
    # Remove these args from kwargs
    kwargs = {
        k: v
        for k, v in kwargs.items() if k in set(kwargs) - set(to_pandas_args)
    }

    if s3.is_s3path(path):
        fs = fs or s3fs.S3FileSystem()
        myopen = fs.open
    else:
        myopen = open

    pf = fp.ParquetFile(path, open_with=myopen, **kwargs)

    df = pf.to_pandas(**to_pandas_args)
    return df
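
A hypothetical call, assuming an S3 dataset path; parse_args should route 'columns' through to ParquetFile.to_pandas:

df = load_parquet_fp("s3://my-bucket/my-dataset", columns=["user_id", "ts"])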
Example #25
def test_empty_row_groups(tempdir, sql):
    fn = os.path.join(tempdir, 'output.parquet')
    d0 = pd.DataFrame({'name': ['alice'], 'age': [20]})
    df = sql.createDataFrame(d0)
    df.write.parquet(fn)
    import glob
    files = glob.glob(os.path.join(fn, '*.parquet'))
    sizes = [os.stat(p).st_size for p in files]
    msize = max(sizes)
    pf = fastparquet.ParquetFile(files)  # don't necessarily have metadata
    assert len(files) > 1  # more than one worker was writing
    d = pf.to_pandas(index=False)
    pd.util.testing.assert_frame_equal(d, d0)

    # destroy empty files
    [os.unlink(f) for (f, s) in zip(files, sizes) if s < msize]

    # loads anyway, since empty row-groups are not touched
    d = pf.to_pandas()
    pd.util.testing.assert_frame_equal(d, d0)
Example #26
def test_or_filtering(tempdir):
    path = os.path.join(TEST_DATA, 'split')
    pf = fastparquet.ParquetFile(path)
    # Defining 2 filters resulting in 2 disjointed row groups.
    up_filter = [('num', '>=', 1925)]
    down_filter = [('num', '<=', 18)]
    # Check disjointed groups.
    empty_df = pf.to_pandas(filters=[up_filter + down_filter])
    assert empty_df.empty
    # Reading row groups separately for reference.
    up_df = pf.to_pandas(filters=up_filter)
    down_df = pf.to_pandas(filters=down_filter)
    cols = list(up_df.columns)
    ref_df = pd.concat([up_df, down_df]).sort_values(cols)\
                                        .reset_index(drop=True)
    # Reading row groups using OR operation in `filters`.
    or_filter = [up_filter, down_filter]
    or_df = pf.to_pandas(filters=or_filter).sort_values(cols)\
                                            .reset_index(drop=True)
    assert (or_df.equals(ref_df))
Example #27
    def from_parquet(filename):
        """
        Construct a SpatialPointsFrame from a spatially partitioned parquet
        file

        Parameters
        ----------
        filename: str
            Path to a spatially partitioned parquet file that was created
            using SpatialPointsFrame.partition_and_write

        Returns
        -------
        SpatialPointsFrame
            A spatially sorted Dask dataframe reconstructed from disk
        """
        _validate_fastparquet()

        # Open parquet file
        pf = fp.ParquetFile(filename)

        # Check for required metadata
        if 'SpatialPointsFrame' not in pf.key_value_metadata:
            raise ValueError("""
The parquet file at '{filename}'
does not appear to be spatially partitioned.
Please construct a spatially partitioned parquet file using the
SpatialPointsFrame.partition_and_write static method.""".format(
                filename=filename))

        # Load metadata
        props = json.loads(pf.key_value_metadata['SpatialPointsFrame'])

        # Read parquet file
        frame = dd.read_parquet(filename)

        # Call DataFrame constructor with the internals of frame
        return SpatialPointsFrame(frame.dask, frame._name, frame._meta,
                                  frame.divisions, props)
Example #28
def test_map_array(sql):
    """
from pyspark.sql.types import *
df_schema = StructType([
    StructField('map_op_op', MapType(StringType(), StringType(), True), True),
    StructField('map_op_req', MapType(StringType(), StringType(), False), True),
    StructField('map_req_op', MapType(StringType(), StringType(), True), False),
    StructField('map_req_req', MapType(StringType(), StringType(), False), False),
    StructField('arr_op_op', ArrayType(StringType(), True), True),
    StructField('arr_op_req', ArrayType(StringType(), False), True),
    StructField('arr_req_op', ArrayType(StringType(), True), False),
    StructField('arr_req_req', ArrayType(StringType(), False), False)])
keys = ['fred', 'wilma', 'barney', 'betty']
vals = ['franky', 'benji', 'mighty']
out = []
for i in range(1000):
    part = []
    for field in [f.name for f in df_schema.fields]:
        sort, nullable, nullvalue = field.split('_')
        if nullable == 'op' and np.random.random() < 0.3:
            part.append(None)
            continue
        N = np.random.randint(5)
        ks = np.random.choice(keys, size=N).tolist()
        vs = np.random.choice(vals + [None] if nullvalue == 'op' else vals,
                              size=N).tolist()
        if sort == 'map':
            part.append({k: v for (k, v) in zip(ks, vs)})
        else:
            part.append(vs)
    out.append(part)
df = sql.createDataFrame(out, df_schema)
    """
    fn = os.path.join(TEST_DATA, 'map_array.parq')
    expected = sql.read.parquet(fn).toPandas()
    pf = fastparquet.ParquetFile(fn)
    data = pf.to_pandas()
    pd.util.testing.assert_frame_equal(data, expected)
Example #29
def read_parquet(path):
    """
    Construct a SpatialPointsFrame from a spatially partitioned parquet
    file

    If the input parquet file does not contain compatible spatial metadata,
    then the resulting SpatialPointsFrame will have a .spatial property of
    None, and the spatial_query operation will be unavailable.

    Parameters
    ----------
    path: str
        Path to a spatially partitioned parquet file that was created
        using datashader.spatial.points.to_parquet

    Returns
    -------
    SpatialPointsFrame
        A spatially sorted Dask dataframe reconstructed from disk
    """
    _validate_fastparquet()

    # Open parquet file
    pf = fp.ParquetFile(path)

    # Read parquet file
    frame = dd.read_parquet(path)

    # Check for spatial points metadata
    if 'SpatialPointsFrame' in pf.key_value_metadata:
        # Load metadata
        props = json.loads(pf.key_value_metadata['SpatialPointsFrame'])
    else:
        props = None

    # Call DataFrame constructor with the internals of frame
    return SpatialPointsFrame(frame.dask, frame._name, frame._meta,
                              frame.divisions, props)
Example #30
def read_header(path: Path) -> ParquetFile:
    """
    Ensure a ParquetFile exists, and return it with headers read.

    May raise OSError (e.g., FileNotFoundError) or
    FastparquetCouldNotHandleFile.

    `retval.fn` gives the filename; `retval.columns` gives column names;
    `retval.dtypes` gives pandas dtypes, and `retval.to_pandas()` reads
    the entire file.
    """
    try:
        return fastparquet.ParquetFile(path)
    except IndexError:
        # TODO nix this when fastparquet resolves
        # https://github.com/dask/fastparquet/issues/361
        #
        # The file has a zero-length column list, and fastparquet can't
        # handle that.
        #
        # Our cached DataFrame should be "empty". No columns means no
        # rows.
        raise FastparquetIssue361
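
A sketch of how a caller might consume this helper, treating FastparquetIssue361 as the empty-cached-table signal described in the comment (the path, pandas import, and fallback are assumptions, not part of the original):

import pandas as pd

try:
    pf = read_header(Path("cache/table.parquet"))  # hypothetical cache path
    df = pf.to_pandas()
except FastparquetIssue361:
    df = pd.DataFrame()  # zero columns and zero rows, per the comment above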