Example #1
def test_categorical(tmpdir, write_engine, read_engine):
    tmp = str(tmpdir)
    df = pd.DataFrame({'x': ['a', 'b', 'c'] * 100}, dtype='category')
    ddf = dd.from_pandas(df, npartitions=3)
    dd.to_parquet(ddf, tmp, engine=write_engine)

    ddf2 = dd.read_parquet(tmp, categories='x', engine=read_engine)
    assert ddf2.compute().x.cat.categories.tolist() == ['a', 'b', 'c']

    ddf2 = dd.read_parquet(tmp, categories=['x'], engine=read_engine)
    assert ddf2.compute().x.cat.categories.tolist() == ['a', 'b', 'c']

    # autocat
    if read_engine != 'pyarrow':
        ddf2 = dd.read_parquet(tmp, engine=read_engine)
        assert ddf2.compute().x.cat.categories.tolist() == ['a', 'b', 'c']

        ddf2.loc[:1000].compute()
        df.index.name = 'index'  # defaults to 'index' in this case
        assert assert_eq(df, ddf2)

    # dereference cats
    ddf2 = dd.read_parquet(tmp, categories=[], engine=read_engine)

    ddf2.loc[:1000].compute()
    assert (df.x == ddf2.x).all()
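
The test above exercises the three modes of dd.read_parquet's `categories` argument. A minimal sketch of the same round trip outside the test harness, using a hypothetical local path and the default engine:

import dask.dataframe as dd
import pandas as pd

df = pd.DataFrame({'x': ['a', 'b', 'c'] * 100}, dtype='category')
dd.to_parquet(dd.from_pandas(df, npartitions=3), 'cats.parquet')

ddf2 = dd.read_parquet('cats.parquet', categories=['x'])  # force 'x' to load as categorical
ddf2 = dd.read_parquet('cats.parquet')                    # some engines restore categories from metadata
ddf2 = dd.read_parquet('cats.parquet', categories=[])     # opt out: 'x' comes back un-categorized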
Example #2
def test_append_with_partition(tmpdir):
    check_fastparquet()
    tmp = str(tmpdir)
    df0 = pd.DataFrame({
        'lat': np.arange(0, 10),
        'lon': np.arange(10, 20),
        'value': np.arange(100, 110)
    })
    df0.index.name = 'index'
    df1 = pd.DataFrame({
        'lat': np.arange(10, 20),
        'lon': np.arange(10, 20),
        'value': np.arange(120, 130)
    })
    df1.index.name = 'index'
    dd_df0 = dd.from_pandas(df0, npartitions=1)
    dd_df1 = dd.from_pandas(df1, npartitions=1)
    dd.to_parquet(dd_df0, tmp, partition_on=['lon'])
    dd.to_parquet(dd_df1,
                  tmp,
                  partition_on=['lon'],
                  append=True,
                  ignore_divisions=True)

    out = dd.read_parquet(tmp).compute()
    out['lon'] = out.lon.astype('int64')  # just to pass assert
    # sort required since partitioning breaks index order
    assert_eq(out.sort_values('value'),
              pd.concat([df0, df1])[out.columns],
              check_index=False)
Example #3
def test_categorical(tmpdir):
    check_fastparquet()
    tmp = str(tmpdir)
    df = pd.DataFrame({'x': ['a', 'b', 'c'] * 100}, dtype='category')
    ddf = dd.from_pandas(df, npartitions=3)
    dd.to_parquet(ddf, tmp)

    ddf2 = dd.read_parquet(tmp, categories='x')
    assert ddf2.compute().x.cat.categories.tolist() == ['a', 'b', 'c']

    ddf2 = dd.read_parquet(tmp, categories=['x'])
    assert ddf2.compute().x.cat.categories.tolist() == ['a', 'b', 'c']

    # autocat
    ddf2 = dd.read_parquet(tmp)
    assert ddf2.compute().x.cat.categories.tolist() == ['a', 'b', 'c']

    ddf2.loc[:1000].compute()
    df.index.name = 'index'  # defaults to 'index' in this case
    assert assert_eq(df, ddf2)

    # dereference cats
    ddf2 = dd.read_parquet(tmp, categories=[])

    ddf2.loc[:1000].compute()
    assert (df.x == ddf2.x).all()
Example #4
def test_categorical(tmpdir):
    check_fastparquet()
    tmp = str(tmpdir)
    df = pd.DataFrame({'x': ['a', 'b', 'c'] * 100}, dtype='category')
    ddf = dd.from_pandas(df, npartitions=3)
    dd.to_parquet(ddf, tmp)

    ddf2 = dd.read_parquet(tmp, categories='x')
    assert ddf2.compute().x.cat.categories.tolist() == ['a', 'b', 'c']

    ddf2 = dd.read_parquet(tmp, categories=['x'])
    assert ddf2.compute().x.cat.categories.tolist() == ['a', 'b', 'c']

    # autocat
    ddf2 = dd.read_parquet(tmp)
    assert ddf2.compute().x.cat.categories.tolist() == ['a', 'b', 'c']

    ddf2.loc[:1000].compute()
    df.index.name = 'index'  # defaults to 'index' in this case
    assert assert_eq(df, ddf2)

    # dereference cats
    ddf2 = dd.read_parquet(tmp, categories=[])

    ddf2.loc[:1000].compute()
    assert (df.x == ddf2.x).all()
Example #5
    def load_season(self, season_obj):
        self.logger.info(
            f'Started extracting season "{season_obj.season_id}" from {season_obj.platform} to GCS'
        )
        platform_tables_config = self.tables_config[season_obj.platform]
        for table, table_config in platform_tables_config.items():
            method_name = table_config["method"]
            method = season_obj.__getattribute__(method_name)
            df = method(key_map=table_config.get("key_map"))
            df = df.replace("", np.nan)
            dask_df = dd.from_pandas(df, npartitions=1)
            gcs_path = f"gs://{self.gcs_bucket}/{season_obj.platform}_{table}/{self.league_name}/{season_obj.season_id}"
            schema = get_data_types(table_config.get("key_map"))
            dd.to_parquet(
                dask_df,
                path=gcs_path,
                write_index=False,
                overwrite=True,
                schema=schema,
                engine="pyarrow",
            )
            self.logger.info(f"Extracted to {gcs_path}")
        self.logger.info(
            f'Finished extracting season "{season_obj.season_id}" from {season_obj.platform} to GCS\n'
        )
Example #6
def test_categorical(tmpdir, write_engine, read_engine):
    tmp = str(tmpdir)
    df = pd.DataFrame({'x': ['a', 'b', 'c'] * 100}, dtype='category')
    ddf = dd.from_pandas(df, npartitions=3)
    dd.to_parquet(ddf, tmp, engine=write_engine)

    ddf2 = dd.read_parquet(tmp, categories='x', engine=read_engine)
    assert ddf2.compute().x.cat.categories.tolist() == ['a', 'b', 'c']

    ddf2 = dd.read_parquet(tmp, categories=['x'], engine=read_engine)
    assert ddf2.compute().x.cat.categories.tolist() == ['a', 'b', 'c']

    # autocat
    if read_engine != 'pyarrow':
        ddf2 = dd.read_parquet(tmp, engine=read_engine)
        assert ddf2.compute().x.cat.categories.tolist() == ['a', 'b', 'c']

        ddf2.loc[:1000].compute()
        df.index.name = 'index'  # defaults to 'index' in this case
        assert assert_eq(df, ddf2)

    # dereference cats
    ddf2 = dd.read_parquet(tmp, categories=[], engine=read_engine)

    ddf2.loc[:1000].compute()
    assert (df.x == ddf2.x).all()
Example #7
def test_roundtrip(tmpdir, df, write_kwargs, read_kwargs):
    check_fastparquet()
    tmp = str(tmpdir)
    if df.index.name is None:
        df.index.name = 'index'
    ddf = dd.from_pandas(df, npartitions=2)

    dd.to_parquet(ddf, tmp, **write_kwargs)
    ddf2 = dd.read_parquet(tmp, index=df.index.name, **read_kwargs)
    assert_eq(ddf, ddf2)
Example #8
def test_roundtrip(tmpdir, df, write_kwargs, read_kwargs):
    check_fastparquet()
    tmp = str(tmpdir)
    if df.index.name is None:
        df.index.name = 'index'
    ddf = dd.from_pandas(df, npartitions=2)

    dd.to_parquet(ddf, tmp, **write_kwargs)
    ddf2 = dd.read_parquet(tmp, index=df.index.name, **read_kwargs)
    assert_eq(ddf, ddf2)
Example #9
    def write(self,
              item,
              data,
              metadata={},
              npartitions=None,
              chunksize=None,
              overwrite=False,
              epochdate=False,
              reload_items=False,
              **kwargs):

        if utils.path_exists(self._item_path(item)) and not overwrite:
            raise ValueError("""
                Item already exists. To overwrite, use `overwrite=True`.
                Otherwise, use `<collection>.append()`""")

        if isinstance(data, Item):
            data = data.to_pandas()
        else:
            # work on copy
            data = data.copy()

        if epochdate or "datetime" in str(data.index.dtype):
            data = utils.datetime_to_int64(data)
            if 1 in data.index.nanosecond and "times" not in kwargs:
                kwargs["times"] = "int96"

        if data.index.name == "":
            data.index.name = "index"

        if npartitions is None and chunksize is None:
            memusage = data.memory_usage(deep=True).sum()
            if isinstance(data, dd.DataFrame):
                npartitions = int(1 +
                                  memusage.compute() // config.PARTITION_SIZE)
                data = data.repartition(npartitions=npartitions)  # repartition returns a new frame
            else:
                npartitions = int(1 + memusage // config.PARTITION_SIZE)
                data = dd.from_pandas(data, npartitions=npartitions)

        dd.to_parquet(data,
                      self._item_path(item, as_string=True),
                      compression="snappy",
                      engine=self.engine,
                      **kwargs)

        utils.write_metadata(
            utils.make_path(self.datastore, self.collection, item), metadata)

        # update items
        self.items.add(item)
        if reload_items:
            self._list_items_threaded()
Example #10
def test_passing_parquetfile(tmpdir):
    import shutil
    fp = pytest.importorskip('fastparquet')
    path = str(tmpdir)
    df = pd.DataFrame({"x": [1, 3, 2, 4]})
    ddf = dd.from_pandas(df, npartitions=1)

    dd.to_parquet(ddf, path)
    pf = fp.ParquetFile(path)
    shutil.rmtree(path)

    # should pass, because no need to re-read metadata
    dd.read_parquet(pf)
Example #11
    def _write_dataset(self):
        """
        Helper function to write the actual dataset files
        I've had better luck writing the files with pyarrow, reading with fastparquet
        """
        import pyarrow
        engine = 'pyarrow'
        print('Writing training and test sets: ')

        dd.to_parquet(self.train_ddf, self.train_data_path, engine=engine)
        #         dd.to_parquet(self.y_train, self.train_target_path, engine=engine)

        dd.to_parquet(self.test_ddf, self.test_data_path, engine=engine)
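
A minimal, self-contained sketch of the mixed-engine round trip described in the docstring above (write with pyarrow, read back with fastparquet); the path and data are hypothetical, and both engines are assumed to be installed:

import dask.dataframe as dd
import pandas as pd

ddf = dd.from_pandas(pd.DataFrame({'a': range(10), 'b': list('abcdefghij')}),
                     npartitions=2)

dd.to_parquet(ddf, 'train_data.parquet', engine='pyarrow')          # write with pyarrow
back = dd.read_parquet('train_data.parquet', engine='fastparquet')  # read with fastparquet
print(back.head())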
Example #12
def perform_transit_dask(task_type: str, years: List[str]) -> bool:
    task_type_map: Dict = task_map.task_type_map[task_type]
    in_bucket: str = task_type_map['in']
    out_bucket: str = task_type_map['out']

    client: Client = dask.create_dask_client(num_workers=8)
    s3_in_prefix: str = 's3://' + in_bucket + '/'
    try:
        s3_options: Dict = ps.fetch_s3_options()

        for year in years:

            s3_out_url: str = 's3://' + out_bucket + '/' + year + '/'
            s3_in_url: str = s3_in_prefix + year

            df = dd.read_parquet(path=s3_in_url,
                                 storage_options=s3_options,
                                 engine='fastparquet')

            df['delex'] = df['exits'].diff()
            df['delent'] = df['entries'].diff()
            df = df.drop(['exits', 'entries'], axis=1)
            df = df.dropna()

            delex_lo_q = df['delex'].quantile(.25)
            delent_lo_q = df['delent'].quantile(.25)
            delex_hi_q = df['delex'].quantile(.75)
            delent_hi_q = df['delent'].quantile(.75)
            delex_iqr = delex_hi_q - delex_lo_q
            delent_iqr = delent_hi_q - delent_lo_q
            discard = (df['delex'] < delex_lo_q - 1.5 * delex_iqr) | \
                      (df['delex'] > delex_hi_q + 1.5 * delex_iqr) | \
                      (df['delent'] < delent_lo_q - 1.5 * delent_iqr) | \
                      (df['delent'] > delent_hi_q + 1.5 * delent_iqr)
            df = df.loc[~discard]

            dd.to_parquet(df=df,
                          path=s3_out_url,
                          engine='fastparquet',
                          compute=True,
                          compression='lz4',
                          storage_options=s3_options)

    except Exception as err:
        print('error in perform_transit %s' % str(err))
        client.close()
        raise err

    client.close()

    return True
Example #13
def test_append_cat_fp(tmpdir):
    pytest.importorskip('fastparquet')
    path = str(tmpdir)
    # https://github.com/dask/dask/issues/4120
    df = pd.DataFrame({"x": ["a", "a", "b", "a", "b"]})
    df["x"] = df["x"].astype("category")
    ddf = dd.from_pandas(df, npartitions=1)

    dd.to_parquet(ddf, path)

    # this fails:
    dd.to_parquet(ddf, path, append=True, ignore_divisions=True)
    d = dd.read_parquet(path).compute()
    assert d['x'].tolist() == ["a", "a", "b", "a", "b"] * 2
Example #14
def perform_traffic_dask(task_type: str, years: List[str]) -> bool:
    task_type_map: Dict = task_map.task_type_map[task_type]
    in_bucket: str = task_type_map['in']
    out_bucket: str = task_type_map['out']

    client: Client = dask.create_dask_client(num_workers=8)
    s3_in_prefix: str = 's3://' + in_bucket + '/'
    try:
        s3_options: Dict = ps.fetch_s3_options()
        usecols = [1, 2, 4, 5]
        names = ['speed', 'traveltime', 'datetime', 'linkid']

        for year in years:

            s3_out_url: str = 's3://' + out_bucket + '/' + year + '/'
            s3_in_url: str = s3_in_prefix + '*' + year + '.csv'

            df = dd.read_csv(urlpath=s3_in_url,
                             storage_options=s3_options,
                             header=None,
                             usecols=usecols,
                             names=names,
                             parse_dates=['datetime'],
                             date_parser=row_ops.clean_traffic_date,
                             skipinitialspace=True,
                             skip_blank_lines=True,
                             converters={
                                 'speed': row_ops.clean_num,
                                 'traveltime': row_ops.clean_num,
                                 'linkid': row_ops.clean_num
                             },
                             encoding='utf-8')

            dd.to_parquet(df=df,
                          path=s3_out_url,
                          engine='fastparquet',
                          compute=True,
                          compression='GZIP',
                          storage_options=s3_options)

    except Exception as err:
        print('error in perform_traffic %s' % str(err))
        client.close()
        raise err

    client.close()

    return True
Example #15
    def write(self,
              item,
              data,
              metadata={},
              npartitions=None,
              chunksize=None,
              overwrite=False,
              epochdate=False,
              compression="snappy",
              **kwargs):

        if utils.path_exists(self._item_path(item)) and not overwrite:
            raise ValueError("""
                Item already exists. To overwrite, use `overwrite=True`.
                Otherwise, use `<collection>.append()`""")

        if isinstance(data, Item):
            data = data.to_pandas()
        else:
            # work on copy
            data = data.copy()

        if epochdate or "datetime" in str(data.index.dtype):
            data = utils.datetime_to_int64(data)

        if data.index.name == "":
            data.index.name = "index"

        if not isinstance(data, dd.DataFrame):
            if npartitions is None and chunksize is None:
                npartitions = None
                chunksize = int(1e6)

            data = dd.from_pandas(data,
                                  npartitions=npartitions,
                                  chunksize=chunksize)

        dd.to_parquet(data,
                      self._item_path(item, as_string=True),
                      compression=compression,
                      engine='fastparquet',
                      **kwargs)

        utils.write_metadata(
            utils.make_path(self.datastore, self.collection, item), metadata)

        # update items
        self.items = self.list_items()
Example #16
def preprocess_papers(metadata_filename) -> pd.DataFrame:
    # Get metadata of research papers
    metadata_cols = [
        "cord_uid",
        "title",
        "authors",
        "abstract",
        "publish_time",
        "url",
        "journal",
        "pdf_json_files",
    ]
    metadata_cols_dtypes = {col: str for col in metadata_cols}
    metadata_dd = dd.read_csv(metadata_filename,
                              dtype=metadata_cols_dtypes,
                              usecols=metadata_cols)
    print(
        f"Memory usage of metadata_df before clean: {metadata_dd.memory_usage(deep=True).sum()}"
    )
    # Perform operations in place to reduce memory usage
    metadata_dd = remove_papers_with_null_cols(metadata_dd, ["title"])
    metadata_dd = remove_papers_with_null_cols(metadata_dd,
                                               ["abstract", "url"])
    metadata_dd = remove_papers_with_null_cols(metadata_dd, ["authors"])
    metadata_dd = remove_papers_with_null_cols(metadata_dd, ["publish_time"])
    metadata_dd = fill_in_missing_data(metadata_dd)
    print(
        f"Memory usage of metadata_df after clean: {metadata_dd.memory_usage(deep=True).sum()}"
    )
    print(f"# partitions in metadata dd: {metadata_dd.npartitions}")

    # Get body of research papers and store in df
    # TODO: Rename gather_papers_data and put below embedding computation logic in it
    metadata_with_body_dd = gather_papers_data(metadata_dd)
    metadata_with_body_dd = metadata_with_body_dd.repartition(
        partition_size="100MB")
    print(
        f"# partitions in research papers' metadata dd: {metadata_with_body_dd.npartitions}"
    )
    dd.to_parquet(
        metadata_with_body_dd,
        "research_paper_bodies/",
        engine="fastparquet",
        compute_kwargs={"scheduler": "synchronous"},  # synchronous ~2m40s
    )

    # Get embeddings of each research paper's title and abstract (embeddings of body text would lose too much info due to current ineffective pooling techniques)
    """
Example #17
def test_ordering(tmpdir, write_engine, read_engine):
    tmp = str(tmpdir)
    df = pd.DataFrame({'a': [1, 2, 3],
                       'b': [10, 20, 30],
                       'c': [100, 200, 300]},
                      index=pd.Index([-1, -2, -3], name='myindex'),
                      columns=['c', 'a', 'b'])
    ddf = dd.from_pandas(df, npartitions=2)
    dd.to_parquet(ddf, tmp, engine=write_engine)

    if read_engine == 'fastparquet':
        pf = fastparquet.ParquetFile(tmp)
        assert pf.columns == ['myindex', 'c', 'a', 'b']

    ddf2 = dd.read_parquet(tmp, index='myindex', engine=read_engine)
    assert_eq(ddf, ddf2, check_divisions=False)
Example #18
def test_ordering(tmpdir):
    check_fastparquet()
    tmp = str(tmpdir)
    df = pd.DataFrame({'a': [1, 2, 3],
                       'b': [10, 20, 30],
                       'c': [100, 200, 300]},
                      index=pd.Index([-1, -2, -3], name='myindex'),
                      columns=['c', 'a', 'b'])
    ddf = dd.from_pandas(df, npartitions=2)
    dd.to_parquet(ddf, tmp)

    pf = fastparquet.ParquetFile(tmp)
    assert pf.columns == ['myindex', 'c', 'a', 'b']

    ddf2 = dd.read_parquet(tmp, index='myindex')
    assert_eq(ddf, ddf2)
Example #19
def test_ordering(tmpdir, write_engine, read_engine):
    tmp = str(tmpdir)
    df = pd.DataFrame({'a': [1, 2, 3],
                       'b': [10, 20, 30],
                       'c': [100, 200, 300]},
                      index=pd.Index([-1, -2, -3], name='myindex'),
                      columns=['c', 'a', 'b'])
    ddf = dd.from_pandas(df, npartitions=2)
    dd.to_parquet(ddf, tmp, engine=write_engine)

    if read_engine == 'fastparquet':
        pf = fastparquet.ParquetFile(tmp)
        assert pf.columns == ['myindex', 'c', 'a', 'b']

    ddf2 = dd.read_parquet(tmp, index='myindex', engine=read_engine)
    assert_eq(ddf, ddf2, check_divisions=False)
Example #20
def test_ordering(tmpdir):
    check_fastparquet()
    tmp = str(tmpdir)
    df = pd.DataFrame({'a': [1, 2, 3],
                       'b': [10, 20, 30],
                       'c': [100, 200, 300]},
                      index=pd.Index([-1, -2, -3], name='myindex'),
                      columns=['c', 'a', 'b'])
    ddf = dd.from_pandas(df, npartitions=2)
    dd.to_parquet(ddf, tmp)

    pf = fastparquet.ParquetFile(tmp)
    assert pf.columns == ['myindex', 'c', 'a', 'b']

    ddf2 = dd.read_parquet(tmp, index='myindex')
    assert_eq(ddf, ddf2)
Example #21
def clean_cabs_at_path(special: bool, s3_in_url: str, s3_out_url: str,
                       s3_options: Dict) -> bool:

    try:
        df = dd.read_parquet(path=s3_in_url,
                             storage_options=s3_options,
                             engine='fastparquet')

        # add cab zones
        if not special:
            print('In data clean tasks for cabs. Field dolocationid not found')
            # fetch cab zones
            taxi_zones_df: GeoDataFrame = fetch_cab_zones()
            df['dolocationid'] = df.map_partitions(
                partial(add_cab_zone,
                        taxi_zone_df=taxi_zones_df,
                        lon_var='dolongitude',
                        lat_var='dolatitude',
                        locid_var='dolocationid'),
                meta=('dolocationid', int64))
            df['pulocationid'] = df.map_partitions(
                partial(add_cab_zone,
                        taxi_zone_df=taxi_zones_df,
                        lon_var='pulongitude',
                        lat_var='pulatitude',
                        locid_var='pulocationid'),
                meta=('pulocationid', int64))

            del taxi_zones_df
        df = df[[
            'pudatetime', 'dodatetime', 'passengers', 'distance',
            'dolocationid', 'pulocationid'
        ]]
        dd.to_parquet(df=df,
                      path=s3_out_url,
                      engine='fastparquet',
                      compute=True,
                      compression='GZIP',
                      storage_options=s3_options)
        del df

    except Exception as err:
        print('error in clean_cabs_at_path %s' % str(err))
        raise err

    else:
        return True
Example #22
def prepare_dataset(inloc,
                    outloc,
                    date_class,
                    shapefile,
                    crs='epsg:32618',
                    seed=33):
    """
    Extract xarray dataset variables for locations in the shapefile
    
    Args:
        inloc (str): path of the location of datasets to process
        outloc (str): path of location to write resulting dataframe as parquet
        crs (str): crs corresponding to dataset coordinates, default 'epsg:32618'
        date_class ([(str, np.datetime64)]): list of tuples with the first
                        element indicating the column name with the phenology obs
                        data in the shapefile and the second the corresponding
                        date. e.g ('X20151222', np.datetime64('2015-12-22'))
        shapefile (str): path of location of the shapefile with class data
                        regions in shapefile must have a lot identifier,
                        coded as (IDLote)
        seed (int): seed for the generation of reproducible datasets, default 33
        
    """

    #Read shapefile
    fields_shp = gpd.read_file(shapefile).to_crs({'init': crs})

    #Indicate if used as train or test
    # based on dictionary for each dic[date][polygon_index]:test/train
    ## eg {np.datetime('2015-12-22'):'2WKSD7':'train'}
    class_dict = assign_polygons_to_class(date_class, fields_shp, seed)

    dss = os.listdir(inloc)
    #Open the datasets
    dss = list(map(lambda ds: xr.open_dataset(inloc + ds), dss))

    date_dataframes = []

    for date in date_class:
        date_dataframes.append(
            create_date_dataframe(dss, fields_shp, date[1], date[0],
                                  class_dict[date[1]]))

    data = dd.concat(date_dataframes, axis=0, interleave_partitions=True)

    #Write results with valid polygon class data outloc
    dd.to_parquet(data[data['vclass'] > 0], outloc)
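
A hypothetical invocation of prepare_dataset following its docstring; every path and column name below is invented for illustration:

import numpy as np

date_class = [
    ('X20151222', np.datetime64('2015-12-22')),
    ('X20160105', np.datetime64('2016-01-05')),
]

prepare_dataset(
    inloc='datasets/',            # directory of xarray datasets to process
    outloc='phenology.parquet',   # parquet output location
    date_class=date_class,
    shapefile='fields/lots.shp',  # polygons carrying an IDLote identifier
    crs='epsg:32618',
    seed=33,
)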
Example #23
    def run(self):

        conn_info = {'host': 'safsand',
                     'port': 5433,
                     'user': '******',
                     'password': '******',
                     'database': 'MBFS',
                     'read_timeout': 600,
                     'unicode_error': 'ignore',
                     'ssl': False}
        connection = vertica_python.connect(**conn_info)
        cur = connection.cursor()
        cur.execute("""select * from abc""")
        # .....
        connection.close()
        # ....
        ddf.to_parquet(self.output().path)
Example #24
def convert_to_parquet(path2files, cols):
    """
    Convert files to parquet format after selecting the columns of interest
    :param path2files: List of all monthly files
    :param cols: List of strings, columns to be selected in the final output
    :return: Nothing; writes parquet output under the data/ directory
    """
    r = re.compile(r"A2019[0-9]{2}")
    for f in path2files:
        print(f)
        df = dd.read_csv(f,
                         usecols=cols,
                         sep=";",
                         compression='gzip',
                         blocksize=None)
        df = df.repartition(npartitions=20)
        # Filter : legal part only
        df = df.loc[df["PRS_REM_TYP"].isin([0, 1]), :]
        dd.to_parquet(df, "data/" + r.findall(f)[0])
Example #25
    def append(self,
               item,
               data,
               npartitions=None,
               chunksize=1e6,
               epochdate=False,
               compression="snappy",
               **kwargs):
        if not utils.path_exists(self._item_path(item)):
            raise ValueError(
                """Item do not exists. Use `<collection>.write(...)`""")

        # work on copy
        data = data.copy()

        try:
            if epochdate or ("datetime" in data.index.dtype_str
                             and any(data.index.nanosecond) > 0):
                data = utils.datetime_to_int64(data)
            old_index = dd.read_parquet(self._item_path(item, as_string=True),
                                        columns='index',
                                        engine='fastparquet').index.compute()
            data = data[~data.index.isin(old_index)]
        except Exception:
            return

        if data.empty:
            # if len(data.index) == 0:
            return

        if data.index.name == "":
            data.index.name = "index"

        data = dd.from_pandas(data,
                              npartitions=npartitions,
                              chunksize=int(chunksize))

        dd.to_parquet(data,
                      self._item_path(item, as_string=True),
                      append=True,
                      compression=compression,
                      engine='fastparquet',
                      **kwargs)
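
The append method above skips rows whose index is already stored before appending. A standalone sketch of that pattern with a hypothetical local path, assuming the dataset already exists and was written with fastparquet:

import dask.dataframe as dd
import pandas as pd

path = 'items/quotes.parquet'  # hypothetical existing dataset

new_data = pd.DataFrame({'price': [101.5, 102.0]},
                        index=pd.Index([5, 6], name='index'))

# Read only the stored index and drop rows that are already present.
old_index = dd.read_parquet(path, columns=[], engine='fastparquet').index.compute()
new_data = new_data[~new_data.index.isin(old_index)]

if not new_data.empty:
    dd.to_parquet(dd.from_pandas(new_data, npartitions=1), path,
                  append=True, ignore_divisions=True, engine='fastparquet')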
Example #26
    def write_parquet(
        self,
        data: Union[pd.DataFrame, pd.Series, dd.DataFrame],
        overwrite: bool = True,
        present: str = "ignore",
        columns_to_string: bool = True,
        **kwargs,
    ) -> None:

        if not parquet_ok:
            raise TPImportError(errormessage_hdf5)

        if self.suffix != ".parquet":
            warnings.warn(
                f"path {self} does not have '.parquet' as suffix while using to_parquet. The path will be "
                f"changed to a path with '.parquet' as suffix")
            self.change_suffix(".parquet")

        compression = kwargs.get("compression", None)

        if compression is not None and compression != "snappy":
            warnings.warn(
                "TransparentPath can not write parquet files with a compression that is not snappy. You "
                f"specified '{compression}', it will be replaced by 'snappy'.")

        if not self.nocheck:
            self._check_multiplicity()

        if not overwrite and self.is_file() and present != "ignore":
            raise TPFileExistsError()

        if columns_to_string and not isinstance(data.columns[0], str):
            data.columns = data.columns.astype(str)

        if self.__class__.cli is None:
            self.__class__.cli = client.Client(processes=False)
        check_kwargs(dd.to_parquet, kwargs)
        dd.to_parquet(data,
                      self.with_suffix("").__fspath__(),
                      engine="pyarrow",
                      compression="snappy",
                      **kwargs)
Example #27
    def aggregation_df(self, parquet=False):
        '''Returns a df that contains all of the patient-level data.
        When parquet is True, the data is also copied to a parquet file format,
        which is faster for querying.'''

        df = dd.read_csv('./rolling_window_output/pt_level_*.csv')
        df = df.categorize(columns=[
            'Window', 'Category_Type', 'Category', 'Population_Type',
            'Diagnosis_Category', 'Diagnosis_Category', 'Age_Category',
            'Gender'
        ])

        df['AdjustedPriceAmt'] = df['AdjustedPriceAmt'].astype(float)
        df['Encounter'] = df['Encounter'].astype(int)
        df['ServiceCount'] = df['ServiceCount'].astype(int)
        df['VisitInpatientDays'] = df['VisitInpatientDays'].astype(int)
        if parquet:
            dd.to_parquet(df, 'rolling_window_parquet')
            df = dd.read_parquet('rolling_window_parquet/')
        return df
Example #28
def test_append_with_partition(tmpdir):
    check_fastparquet()
    tmp = str(tmpdir)
    df0 = pd.DataFrame({'lat': np.arange(0, 10), 'lon': np.arange(10, 20),
                        'value': np.arange(100, 110)})
    df0.index.name = 'index'
    df1 = pd.DataFrame({'lat': np.arange(10, 20), 'lon': np.arange(10, 20),
                        'value': np.arange(120, 130)})
    df1.index.name = 'index'
    dd_df0 = dd.from_pandas(df0, npartitions=1)
    dd_df1 = dd.from_pandas(df1, npartitions=1)
    dd.to_parquet(dd_df0, tmp, partition_on=['lon'])
    dd.to_parquet(dd_df1, tmp, partition_on=['lon'], append=True,
                  ignore_divisions=True)

    out = dd.read_parquet(tmp).compute()
    out['lon'] = out.lon.astype('int')  # just to pass assert
    # sort required since partitioning breaks index order
    assert_eq(out.sort_values('value'), pd.concat([df0, df1])[out.columns],
              check_index=False)
Example #29
def to_parquet(df: dd.DataFrame,
               out_bucket: str,
               folder: str,
               compute: bool = True) -> bool:
    try:
        s3_out_url: str = 's3://' + out_bucket + '/' + folder
        s3_options: Dict = ps.fetch_s3_options()
        dd.to_parquet(df=df,
                      path=s3_out_url,
                      engine='fastparquet',
                      compute=compute,
                      compression='lz4',
                      storage_options=s3_options)
    except Exception as err:
        print('error while saving to parquet to path %(path)s - %(error)s' % {
            'path': out_bucket + '/' + folder,
            'error': str(err)
        })
        raise err
    else:
        return True
Example #30
    def _write(self, collection, path, **kwargs):
        """ This method implements Parquet writing.
        This method uses the Dask to_parquet() function, which automatically creates parent
        directories if required.
        Args:
            collection: dask dataframe, to be written to disk
            path: str, full path of the target
            **kwargs: dictionary, named arguments to be passed to to_parquet.
        Returns:
            filenames read or list of list of tasks
        """
        return to_parquet(collection, path, **kwargs)
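
A minimal sketch of what _write delegates to; per the docstring above, dd.to_parquet creates the target directory (here a nested, not-yet-existing local path) before writing:

import dask.dataframe as dd
import pandas as pd

ddf = dd.from_pandas(pd.DataFrame({'x': range(4)}), npartitions=2)
dd.to_parquet(ddf, 'output/nested/table.parquet')  # parent directories are created as needed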
Example #31
def main(data_path, output_path):
    # Read data
    logging.info(f"Reading data from {data_path}")
    data = dd.read_parquet(data_path)
    data = data.repartition(npartitions=2)

    # Build label matrix
    logging.info("Applying LFs")
    lfs = [article_mentions_person, body_contains_fortune, person_in_db]
    applier = DaskLFApplier(lfs)
    L = applier.apply(data)

    # Train label model
    logging.info("Training label model")
    label_model = LabelModel(cardinality=2)
    label_model.fit(L)

    # Generate training labels
    logging.info("Generating probabilistic labels")
    y_prob = label_model.predict_proba(L)[:, 1]
    data = data.reset_index().set_index("index")
    data_labeled = data.assign(y_prob=dd.from_array(y_prob))
    dd.to_parquet(data_labeled, output_path)
    logging.info(f"Labels saved to {output_path}")
Example #32
def clean_write_parquet(dataframe, path, engine="fastparquet", compute=True):
    """Write a parquet file with common options standardized in the project
    """

    # Ensure we are writing dask dataframes, not accidentally pandas!
    if not isinstance(dataframe, dask.dataframe.DataFrame):
        raise ValueError(
            "Attempted to write dask dataframe, but got a {}".format(
                str(type(dataframe))))

    # Clear the dest directory to prevent partial mixing of files from an old
    # archive if the number of partitions has changed.
    try:
        shutil.rmtree(path)
    except FileNotFoundError:
        # No worries if the output doesn't exist yet.
        pass

    # Do the write with the given options.
    return dataframe.to_parquet(path,
                                compression="snappy",
                                engine=engine,
                                compute=compute)
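
A hypothetical use of clean_write_parquet; the path is invented, and passing a pandas frame instead of a dask frame raises ValueError by design:

import dask.dataframe as dd
import pandas as pd

pdf = pd.DataFrame({'x': [1, 2, 3, 4]})
ddf = dd.from_pandas(pdf, npartitions=2)

clean_write_parquet(ddf, 'archives/latest.parquet')    # ok: dask dataframe
# clean_write_parquet(pdf, 'archives/latest.parquet')  # would raise ValueError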
Example #33
def run_pipeline(task_type: str) -> bool:
    map: Dict = task_type_map[task_type]
    in_bucket: str = map['in']
    out_bucket: str = map['out']
    cols: Dict[str, str] = map['cols']
    converters: Dict[str, Callable] = map['converters']
    dtypes: Dict[str, str] = map['dtypes']
    index_col: str = map['index']['col']
    sorted: bool = map['index']['sorted']
    row_op: Callable = map['row_op']
    diff: Dict = map['diff']
    filter_by_key: str = resample_map['filter_by']['key']
    filter_by_val: int = resample_map['filter_by']['value']
    resample_freq: str = resample_map['freq']
    aggr_func: Callable = map['aggr_func']

    try:

        #client = Client(address='dscheduler:8786')

        s3_in_url: str = 's3://'+in_bucket+'/*.*'
        s3_options: Dict = ps.fetch_s3_options()
        #df = dd.read_table(path=s3_in_url, storage_options=s3_options)
        df = dd.read_table(urlpath='tmp/'+in_bucket+'/*.*',
                           header=0,
                           usecols=lambda x: x.upper() in list(cols.keys()),
                           skipinitialspace=True,
                           converters=converters
                           )

        # rename columns
        df = df.rename(columns=cols)
        df.compute()

        if sorted:
            df = df.map_partitions(lambda pdf: pdf.rename(columns=cols)
                                   .apply(func=row_op, axis=1),
                                   meta=dtypes).compute()
        else:
            df = df.map_partitions(lambda pdf: pdf.rename(columns=cols)
                                   .set_index(index_col).sort().reset_index()
                                   .apply(func=row_op, axis=1),
                                   meta=dtypes).compute()



        # map row-wise operations
        #df = df.map_partitions(lambda pdf: pdf.apply(func=row_op, axis=1), meta=dtypes)

        # diff
        if diff['compute']:
            df[diff['new_col']] = df[diff['col']].diff()

        # specific processing for transit
        if task_type == 'cl-transit':
            df = df.map_partitions(partial(remove_outliers, col='DELEXITS'), meta=dtypes)

        # drop na values
        df = df.dropna()

        # set index (assumes pre-sorted data)
        df = df.set_index(index_col, sorted=True)

        #df.compute()

        # filter
        if filter_by_key == 'weekday':
            df = df.loc[df[index_col].weekday() == filter_by_val]

        # resample using frequency and aggregate function specified
        df = compose(df.resample(resample_freq), aggr_func)

        # save in out bucket
        s3_out_url: str = 's3://' + out_bucket
        # dd.to_parquet(df=df, path=s3_out_url, storage_options=s3_options)
        dd.to_parquet(df=df, path='tmp/'+out_bucket+'/*.*')

    except Exception as err:
        print('error in run_pipeline %s' % str(err))
        raise err

    return True
Example #34
    f_adj_neg = 'adj_neg/' + uuid.uuid4().hex
    f_adj_pos = 'adj_pos/' + uuid.uuid4().hex
    f_adj_neu = 'adj_neu/' + uuid.uuid4().hex
    with open(f_adj_pos, 'wb') as file:
        pickle.dump(adj_pos, file, pickle.HIGHEST_PROTOCOL)
    with open(f_adj_neu, 'wb') as file:
        pickle.dump(adj_neu, file, pickle.HIGHEST_PROTOCOL)
    with open(f_adj_neg, 'wb') as file:
        pickle.dump(adj_neg, file, pickle.HIGHEST_PROTOCOL)

    out["text"] = texts
    out["sentiment"] = df["sentiment"]
    out["length"] = length
    return out


meta = pd.DataFrame(columns=["text", "sentiment", "length"])
meta["sentiment"] = meta["sentiment"].astype(np.int8)
meta["text"] = meta["text"].astype(str)
meta["length"] = meta["length"].astype(np.int64)

records = dd.read_parquet(path=srcpath)
t0 = time.time()
result = records.map_partitions(analyze_df, meta=meta)
parquet = dd.to_parquet(df=result[result["length"] > 0],
                        path=tgtpath,
                        compute=False)
parquet.compute(scheduler="processes")
t1 = time.time()
print(t1 - t0)
Example #35
def test_dask_parquet(storage):
    fs = AzureBlobFileSystem(account_name=storage.account_name,
                             connection_string=CONN_STR)
    fs.mkdir("test")
    STORAGE_OPTIONS = {
        "account_name": "devstoreaccount1",
        "connection_string": CONN_STR,
    }
    df = pd.DataFrame({
        "col1": [1, 2, 3, 4],
        "col2": [2, 4, 6, 8],
        "index_key": [1, 1, 2, 2],
        "partition_key": [1, 1, 2, 2],
    })

    dask_dataframe = dd.from_pandas(df, npartitions=1)
    for protocol in ["abfs", "az"]:
        dask_dataframe.to_parquet(
            "{}://test/test_group.parquet".format(protocol),
            storage_options=STORAGE_OPTIONS,
            engine="pyarrow",
        )

        fs = AzureBlobFileSystem(**STORAGE_OPTIONS)
        assert fs.ls("test/test_group.parquet") == [
            "test/test_group.parquet/_common_metadata",
            "test/test_group.parquet/_metadata",
            "test/test_group.parquet/part.0.parquet",
        ]
        fs.rm("test/test_group.parquet")

    df_test = dd.read_parquet(
        "abfs://test/test_group.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    ).compute()
    assert_frame_equal(df, df_test)

    A = np.random.randint(0, 100, size=(10000, 4))
    df2 = pd.DataFrame(data=A, columns=list("ABCD"))
    ddf2 = dd.from_pandas(df2, npartitions=4)
    dd.to_parquet(
        ddf2,
        "abfs://test/test_group2.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    )
    assert fs.ls("test/test_group2.parquet") == [
        "test/test_group2.parquet/_common_metadata",
        "test/test_group2.parquet/_metadata",
        "test/test_group2.parquet/part.0.parquet",
        "test/test_group2.parquet/part.1.parquet",
        "test/test_group2.parquet/part.2.parquet",
        "test/test_group2.parquet/part.3.parquet",
    ]
    df2_test = dd.read_parquet(
        "abfs://test/test_group2.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    ).compute()
    assert_frame_equal(df2, df2_test)

    a = np.full(shape=(10000, 1), fill_value=1)
    b = np.full(shape=(10000, 1), fill_value=2)
    c = np.full(shape=(10000, 1), fill_value=3)
    d = np.full(shape=(10000, 1), fill_value=4)
    B = np.concatenate((a, b, c, d), axis=1)
    df3 = pd.DataFrame(data=B, columns=list("ABCD"))
    ddf3 = dd.from_pandas(df3, npartitions=4)
    dd.to_parquet(
        ddf3,
        "abfs://test/test_group3.parquet",
        partition_on=["A", "B"],
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    )
    assert fs.glob("test/test_group3.parquet/*") == [
        "test/test_group3.parquet/A=1",
        "test/test_group3.parquet/_common_metadata",
        "test/test_group3.parquet/_metadata",
    ]
    df3_test = dd.read_parquet(
        "abfs://test/test_group3.parquet",
        filters=[("A", "=", 1)],
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    ).compute()
    df3_test = df3_test[["A", "B", "C", "D"]]
    df3_test = df3_test[["A", "B", "C", "D"]].astype(int)
    assert_frame_equal(df3, df3_test)

    A = np.random.randint(0, 100, size=(10000, 4))
    df4 = pd.DataFrame(data=A, columns=list("ABCD"))
    ddf4 = dd.from_pandas(df4, npartitions=4)
    dd.to_parquet(
        ddf4,
        "abfs://test/test_group4.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
        flavor="spark",
        write_statistics=False,
    )
    fs.rmdir("test/test_group4.parquet/_common_metadata", recursive=True)
    fs.rmdir("test/test_group4.parquet/_metadata", recursive=True)
    fs.rm("test/test_group4.parquet/_common_metadata")
    fs.rm("test/test_group4.parquet/_metadata")
    assert fs.ls("test/test_group4.parquet") == [
        "test/test_group4.parquet/part.0.parquet",
        "test/test_group4.parquet/part.1.parquet",
        "test/test_group4.parquet/part.2.parquet",
        "test/test_group4.parquet/part.3.parquet",
    ]
    df4_test = dd.read_parquet(
        "abfs://test/test_group4.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    ).compute()
    assert_frame_equal(df4, df4_test)

    A = np.random.randint(0, 100, size=(10000, 4))
    df5 = pd.DataFrame(data=A, columns=list("ABCD"))
    ddf5 = dd.from_pandas(df5, npartitions=4)
    dd.to_parquet(
        ddf5,
        "abfs://test/test group5.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    )
    assert fs.ls("test/test group5.parquet") == [
        "test/test group5.parquet/_common_metadata",
        "test/test group5.parquet/_metadata",
        "test/test group5.parquet/part.0.parquet",
        "test/test group5.parquet/part.1.parquet",
        "test/test group5.parquet/part.2.parquet",
        "test/test group5.parquet/part.3.parquet",
    ]
    df5_test = dd.read_parquet(
        "abfs://test/test group5.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    ).compute()
    assert_frame_equal(df5, df5_test)
Example #36
def to_parquet(df,
               path,
               x,
               y,
               p=10,
               npartitions=None,
               shuffle=None,
               compression='default'):
    """
    Perform spatial partitioning on an input dataframe and write the
    result to a parquet file.  The resulting parquet file will contain
    the same columns as the input dataframe, but the dataframe's original
    index will be dropped.

    The resulting parquet file will contain all of the rows from the
    input dataframe, but they will be spatially sorted and partitioned
    along a 2D Hilbert curve (https://en.wikipedia.org/wiki/Hilbert_curve).

    The parquet file will also contain custom metadata that is needed to
    reconstruct the Hilbert curve distances on load.  This parquet file
    may then be used to construct SpatialPointsFrame instances using
    datashader.spatial.points.read_parquet.

    Parameters
    ----------
    df: pd.DataFrame or dd.DataFrame
        The input dataframe to partition
    path: str
        The path where the resulting parquet file should be written.
        See dask.dataframe.to_parquet for description of supported path
        specifications.
    x, y
        The column labels in df of the x and y coordinates of each row
    p: int (default 10)
        The Hilbert curve order parameter that determines the resolution
        of the 2D grid that data points are rounded to before computing
        their Hilbert distance. Points will be discretized into 2 ** p
        bins in each the x and y dimensions.

        This parameter should be increased if the partitions of the
        resulting parquet files are significantly unbalanced.

    npartitions: int or None (default None)
        The number of partitions for the resulting parquet file.  If None
        (the default) this is chosen to be the greater of 8 and
        len(df) // 2**23.

        In general, increasing the number of partitions will improve
        performance when processing small subsets of the overall parquet
        data set.  But this comes at the cost of some additional overhead
        when processing the entire data set.

    shuffle: str or None (default None)
        The dask.dataframe.DataFrame.set_index shuffle method. If None,
        a default is chosen based on the current scheduler.

    compression: str or None (default)
        The dask.dataframe.to_parquet compression method.
    """

    _validate_fastparquet()

    # Validate filename
    if (not isinstance(path, basestring)
            or not (path.endswith('.parquet') or path.endswith('.parq'))):
        raise ValueError("""\
filename must be a string ending with a .parquet or .parq extension""")

    # Remove any existing directory
    if os.path.exists(path):
        shutil.rmtree(path)

    # Normalize to dask dataframe
    if isinstance(df, pd.DataFrame):
        ddf = dd.from_pandas(df, npartitions=4)
    elif isinstance(df, dd.DataFrame):
        ddf = df
    else:
        raise ValueError("""
df must be a pandas or dask DataFrame instance.
Received value of type {typ}""".format(typ=type(df)))

    # Get number of rows
    nrows = len(df)

    # Compute npartitions if needed
    if npartitions is None:
        # Make partitions of ~8 million rows with a minimum of 8
        # partitions
        npartitions = max(nrows // 2**23, 8)

    # Compute data extents
    extents = ddf.map_partitions(_compute_extents, x, y).compute()

    x_range = (float(extents['x_min'].min()), float(extents['x_max'].max()))

    y_range = (float(extents['y_min'].min()), float(extents['y_max'].max()))

    # Compute distance of points along the Hilbert-curve
    ddf = ddf.assign(distance=ddf.map_partitions(_compute_distance,
                                                 x=x,
                                                 y=y,
                                                 p=p,
                                                 x_range=x_range,
                                                 y_range=y_range,
                                                 as_series=True))

    # Set index to distance. This will trigger an expensive shuffle
    # sort operation
    ddf = ddf.set_index('distance', npartitions=npartitions, shuffle=shuffle)

    # Get list of the distance divisions computed by dask
    distance_divisions = [int(d) for d in ddf.divisions]

    # Save properties as custom metadata in the parquet file
    props = dict(version='1.0',
                 x=x,
                 y=y,
                 p=p,
                 distance_divisions=distance_divisions,
                 x_range=x_range,
                 y_range=y_range,
                 nrows=nrows)

    # Drop distance index to save storage space
    ddf = ddf.reset_index(drop=True)

    # Save ddf to parquet
    dd.to_parquet(ddf, path, engine='fastparquet', compression=compression)

    # Open resulting parquet file
    pf = fp.ParquetFile(path)

    # Add a new property to the file metadata
    new_fmd = copy.copy(pf.fmd)
    new_kv = fp.parquet_thrift.KeyValue()
    new_kv.key = 'SpatialPointsFrame'
    new_kv.value = json.dumps(props)
    new_fmd.key_value_metadata.append(new_kv)

    # Overwrite file metadata
    fn = os.path.join(path, '_metadata')
    fp.writer.write_common_metadata(fn, new_fmd, no_row_groups=False)

    fn = os.path.join(path, '_common_metadata')
    fp.writer.write_common_metadata(fn, new_fmd)
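
A hypothetical call to the spatial to_parquet defined above: random points are Hilbert-sorted and written to a .parquet directory (fastparquet is assumed to be installed):

import numpy as np
import pandas as pd

n = 100_000
points = pd.DataFrame({
    'x': np.random.uniform(0, 1, n),
    'y': np.random.uniform(0, 1, n),
    'value': np.random.randn(n),
})

to_parquet(points, 'points.parquet', x='x', y='y', p=10)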
Example #37
    def _write(cls, collection, path, **kwargs):
        return to_parquet(collection, path, **kwargs)
Example #38
def get_ld(rgeno,
           rbim,
           tgeno,
           tbim,
           kbwindow=1000,
           threads=1,
           max_memory=None,
           justd=False,
           extend=False):
    """
    Get the LD blocks from the SNP overlap between two populations

    :param rgeno: Genotype array of the reference population
    :param rbim: Mapping of variant info to genotype array positions for ref
    :param tgeno: Genotype array of the target population
    :param tbim: Mapping of variant info to genotype array positions for tar
    :param kbwindow: Size of the window in KB
    :param threads: Number of threads to use for computation
    :param max_memory: Memory limit
    :param justd: If True, return only the raw LD matrices rather than the
                  tagging/cotagging results
    :param extend: 'Circularize' the genome by extending both ends
    :return: A list of tuples (or a dataframe if not justd) with the LD per block
    """
    # # Set CPU limits
    # soft, hard = resource.getrlimit(resource.RLIMIT_NPROC)
    # resource.setrlimit(resource.RLIMIT_NPROC, (threads, hard))
    # soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
    # print('Soft limit changed to :', soft)

    # set Cache to protect memory spilling
    rp = 'r.pickle'
    if os.path.isfile(rp):
        with open(rp, 'rb') as pckl:
            r = pickle.load(pckl)
    else:
        if max_memory is not None:
            available_memory = max_memory
        else:
            available_memory = psutil.virtual_memory().available
        cache = Chest(available_memory=available_memory)
        if os.path.isfile('ld.matrix'):
            print('Loading precomputed LD matrix')
            r = dd.read_parquet('ld.matrix')
        else:
            print('Computing LD score per window')
            # Get the overlapping snps and their info
            shared = ['chrom', 'snp', 'pos']
            mbim = rbim.merge(tbim, on=shared, suffixes=['_ref', '_tar'])
            assert mbim.i_ref.values.shape == mbim.i_tar.values.shape
            # Get the number of bins or loci to be computed
            nbins = np.ceil(max(mbim.pos) / (kbwindow * 1000)).astype(int)
            # Get the limits of the loci
            bins = np.linspace(0,
                               max(mbim.pos) + 1,
                               num=nbins,
                               endpoint=True,
                               dtype=int)
            if bins.shape[0] == 1:
                # Fix the special case in which the window is much bigger than
                # the range
                bins = np.append(bins, kbwindow * 1000)
            # Get the proper intervals into the dataframe
            mbim['windows'] = pd.cut(mbim['pos'], bins, include_lowest=True)
            # Compute each locus in parallel
            dask_rgeno = dask.delayed(rgeno)
            dask_tgeno = dask.delayed(tgeno)
            delayed_results = [
                dask.delayed(single_window)(df, rg, tg, threads, max_memory,
                                            justd, extend)
                for rg, tg, ridx, tidx, df in window_yielder(
                    dask_rgeno, dask_tgeno, mbim)
            ]
            opts = dict(num_workers=threads,
                        cache=cache,
                        pool=ThreadPool(threads))
            with ProgressBar(), dask.config.set(**opts), open(rp, 'wb') as pck:
                r = tuple(dask.compute(*delayed_results))
                pickle.dump(r, pck)
    r = tuple(x for x in r if x is not None)
    if justd:
        return r
    r = pd.concat(r)
    dd.to_parquet(r, 'ld.matrix')
    return r