def test_categorical(tmpdir, write_engine, read_engine):
    """Round-trip a categorical column through parquet for an engine pair.

    Checks that the categories survive when ``categories`` is given as a
    name or as a list, that they are detected automatically (skipped when
    reading with pyarrow), and that ``categories=[]`` yields values equal to
    the original column.
    """
    tmp = str(tmpdir)
    expected_categories = ['a', 'b', 'c']
    df = pd.DataFrame({'x': ['a', 'b', 'c'] * 100}, dtype='category')
    ddf = dd.from_pandas(df, npartitions=3)
    dd.to_parquet(ddf, tmp, engine=write_engine)

    ddf2 = dd.read_parquet(tmp, categories='x', engine=read_engine)
    assert ddf2.compute().x.cat.categories.tolist() == expected_categories

    ddf2 = dd.read_parquet(tmp, categories=['x'], engine=read_engine)
    assert ddf2.compute().x.cat.categories.tolist() == expected_categories

    # autocat
    if read_engine != 'pyarrow':
        ddf2 = dd.read_parquet(tmp, engine=read_engine)
        assert ddf2.compute().x.cat.categories.tolist() == expected_categories

    ddf2.loc[:1000].compute()
    df.index.name = 'index'  # defaults to 'index' in this case
    assert assert_eq(df, ddf2)

    # dereference cats
    ddf2 = dd.read_parquet(tmp, categories=[], engine=read_engine)
    ddf2.loc[:1000].compute()
    # Materialize the dask column before comparing: asserting on a lazy dask
    # reduction never inspects the data, so the check could not fail.
    assert (df.x == ddf2.x.compute()).all()
def test_append_with_partition(tmpdir):
    """Appending to a dataset partitioned on ``lon`` keeps every row of both writes."""
    check_fastparquet()
    tmp = str(tmpdir)

    first = pd.DataFrame({
        'lat': np.arange(0, 10),
        'lon': np.arange(10, 20),
        'value': np.arange(100, 110),
    })
    first.index.name = 'index'
    second = pd.DataFrame({
        'lat': np.arange(10, 20),
        'lon': np.arange(10, 20),
        'value': np.arange(120, 130),
    })
    second.index.name = 'index'

    first_ddf = dd.from_pandas(first, npartitions=1)
    second_ddf = dd.from_pandas(second, npartitions=1)
    dd.to_parquet(first_ddf, tmp, partition_on=['lon'])
    dd.to_parquet(second_ddf, tmp, partition_on=['lon'], append=True,
                  ignore_divisions=True)

    out = dd.read_parquet(tmp).compute()
    out['lon'] = out.lon.astype('int64')  # just to pass assert

    expected = pd.concat([first, second])[out.columns]
    # sort required since partitioning breaks index order
    assert_eq(out.sort_values('value'), expected, check_index=False)
def test_categorical(tmpdir):
    """Round-trip a categorical column through parquet.

    Checks that the categories survive when ``categories`` is given as a
    name or as a list, that they are detected automatically, and that
    ``categories=[]`` yields values equal to the original column.
    """
    check_fastparquet()
    tmp = str(tmpdir)
    expected_categories = ['a', 'b', 'c']
    df = pd.DataFrame({'x': ['a', 'b', 'c'] * 100}, dtype='category')
    ddf = dd.from_pandas(df, npartitions=3)
    dd.to_parquet(ddf, tmp)

    ddf2 = dd.read_parquet(tmp, categories='x')
    assert ddf2.compute().x.cat.categories.tolist() == expected_categories

    ddf2 = dd.read_parquet(tmp, categories=['x'])
    assert ddf2.compute().x.cat.categories.tolist() == expected_categories

    # autocat
    ddf2 = dd.read_parquet(tmp)
    assert ddf2.compute().x.cat.categories.tolist() == expected_categories

    ddf2.loc[:1000].compute()
    df.index.name = 'index'  # defaults to 'index' in this case
    assert assert_eq(df, ddf2)

    # dereference cats
    ddf2 = dd.read_parquet(tmp, categories=[])
    ddf2.loc[:1000].compute()
    # Materialize the dask column before comparing: asserting on a lazy dask
    # reduction never inspects the data, so the check could not fail.
    assert (df.x == ddf2.x.compute()).all()
def test_categorical(tmpdir):
    """Round-trip a categorical column through parquet.

    Checks that the categories survive when ``categories`` is given as a
    name or as a list, that they are detected automatically, and that
    ``categories=[]`` yields values equal to the original column.
    """
    check_fastparquet()
    tmp = str(tmpdir)
    expected_categories = ['a', 'b', 'c']
    df = pd.DataFrame({'x': ['a', 'b', 'c'] * 100}, dtype='category')
    ddf = dd.from_pandas(df, npartitions=3)
    dd.to_parquet(ddf, tmp)

    ddf2 = dd.read_parquet(tmp, categories='x')
    assert ddf2.compute().x.cat.categories.tolist() == expected_categories

    ddf2 = dd.read_parquet(tmp, categories=['x'])
    assert ddf2.compute().x.cat.categories.tolist() == expected_categories

    # autocat
    ddf2 = dd.read_parquet(tmp)
    assert ddf2.compute().x.cat.categories.tolist() == expected_categories

    ddf2.loc[:1000].compute()
    df.index.name = 'index'  # defaults to 'index' in this case
    assert assert_eq(df, ddf2)

    # dereference cats
    ddf2 = dd.read_parquet(tmp, categories=[])
    ddf2.loc[:1000].compute()
    # Materialize the dask column before comparing: asserting on a lazy dask
    # reduction never inspects the data, so the check could not fail.
    assert (df.x == ddf2.x.compute()).all()
def load_season(self, season_obj):
    """Extract every configured table of one season and write it to GCS.

    For each table configured for ``season_obj.platform``, the extraction
    method named in the table config is looked up on ``season_obj`` and
    called with the table's ``key_map``. Empty strings in the result are
    replaced by NaN and the frame is written as a single-partition parquet
    dataset, overwriting any existing output at the target path.

    Args:
        season_obj: Object exposing ``season_id``, ``platform`` and the
            extraction methods named in the table configuration.
    """
    self.logger.info(
        f'Started extracting season "{season_obj.season_id}" from {season_obj.platform} to GCS'
    )
    platform_tables_config = self.tables_config[season_obj.platform]
    for table, table_config in platform_tables_config.items():
        key_map = table_config.get("key_map")
        # getattr() is the supported way to resolve an attribute by name;
        # calling the __getattribute__ dunder directly skips __getattr__.
        method = getattr(season_obj, table_config["method"])
        df = method(key_map=key_map)
        df = df.replace("", np.nan)
        dask_df = dd.from_pandas(df, npartitions=1)
        gcs_path = f"gs://{self.gcs_bucket}/{season_obj.platform}_{table}/{self.league_name}/{season_obj.season_id}"
        schema = get_data_types(key_map)
        dd.to_parquet(
            dask_df,
            path=gcs_path,
            write_index=False,
            overwrite=True,
            schema=schema,
            engine="pyarrow",
        )
        self.logger.info(f"Extracted to {gcs_path}")
    self.logger.info(
        f'Finished extracting season "{season_obj.season_id}" from {season_obj.platform} to GCS\n'
    )
def test_categorical(tmpdir, write_engine, read_engine):
    """Round-trip a categorical column through parquet for an engine pair.

    Checks that the categories survive when ``categories`` is given as a
    name or as a list, that they are detected automatically (skipped when
    reading with pyarrow), and that ``categories=[]`` yields values equal to
    the original column.
    """
    tmp = str(tmpdir)
    expected_categories = ['a', 'b', 'c']
    df = pd.DataFrame({'x': ['a', 'b', 'c'] * 100}, dtype='category')
    ddf = dd.from_pandas(df, npartitions=3)
    dd.to_parquet(ddf, tmp, engine=write_engine)

    ddf2 = dd.read_parquet(tmp, categories='x', engine=read_engine)
    assert ddf2.compute().x.cat.categories.tolist() == expected_categories

    ddf2 = dd.read_parquet(tmp, categories=['x'], engine=read_engine)
    assert ddf2.compute().x.cat.categories.tolist() == expected_categories

    # autocat
    if read_engine != 'pyarrow':
        ddf2 = dd.read_parquet(tmp, engine=read_engine)
        assert ddf2.compute().x.cat.categories.tolist() == expected_categories

    ddf2.loc[:1000].compute()
    df.index.name = 'index'  # defaults to 'index' in this case
    assert assert_eq(df, ddf2)

    # dereference cats
    ddf2 = dd.read_parquet(tmp, categories=[], engine=read_engine)
    ddf2.loc[:1000].compute()
    # Materialize the dask column before comparing: asserting on a lazy dask
    # reduction never inspects the data, so the check could not fail.
    assert (df.x == ddf2.x.compute()).all()
def test_roundtrip(tmpdir, df, write_kwargs, read_kwargs):
    """Write ``df`` to parquet and read it back; both collections must match."""
    check_fastparquet()
    target = str(tmpdir)

    # An unnamed index is given the name 'index' so it can be requested on read.
    if df.index.name is None:
        df.index.name = 'index'

    written = dd.from_pandas(df, npartitions=2)
    dd.to_parquet(written, target, **write_kwargs)

    restored = dd.read_parquet(target, index=df.index.name, **read_kwargs)
    assert_eq(written, restored)
def test_roundtrip(tmpdir, df, write_kwargs, read_kwargs):
    """Write ``df`` to parquet and read it back; both collections must match."""
    check_fastparquet()
    target = str(tmpdir)

    # An unnamed index is given the name 'index' so it can be requested on read.
    if df.index.name is None:
        df.index.name = 'index'

    written = dd.from_pandas(df, npartitions=2)
    dd.to_parquet(written, target, **write_kwargs)

    restored = dd.read_parquet(target, index=df.index.name, **read_kwargs)
    assert_eq(written, restored)
def write(self, item, data, metadata=None, npartitions=None, chunksize=None,
          overwrite=False, epochdate=False, reload_items=False, **kwargs):
    """Write ``data`` as a new item of this collection.

    Args:
        item: Name of the item to create.
        data: An ``Item``, a pandas DataFrame or a dask DataFrame.
        metadata: Mapping stored next to the item; defaults to an empty dict.
        npartitions: Number of dask partitions to use. When both this and
            ``chunksize`` are ``None`` it is derived from the memory usage
            of ``data`` and ``config.PARTITION_SIZE``.
        chunksize: Rows per partition, used for in-memory data when
            ``npartitions`` is not given.
        overwrite: Allow replacing an existing item.
        epochdate: Force conversion of the index via
            ``utils.datetime_to_int64``.
        reload_items: Re-list the collection's items after writing.
        **kwargs: Forwarded to ``dd.to_parquet``.

    Raises:
        ValueError: If the item already exists and ``overwrite`` is false.
    """
    # A literal ``{}`` default would be one dict shared by every call.
    if metadata is None:
        metadata = {}

    if utils.path_exists(self._item_path(item)) and not overwrite:
        raise ValueError("""
            Item already exists. To overwrite, use `overwrite=True`.
            Otherwise, use `<collection>.append()`""")

    if isinstance(data, Item):
        data = data.to_pandas()
    else:
        # work on copy
        data = data.copy()

    if epochdate or "datetime" in str(data.index.dtype):
        data = utils.datetime_to_int64(data)
        if 1 in data.index.nanosecond and "times" not in kwargs:
            kwargs["times"] = "int96"

    if data.index.name == "":
        data.index.name = "index"

    if npartitions is None and chunksize is None:
        memusage = data.memory_usage(deep=True).sum()
        if isinstance(data, dd.DataFrame):
            npartitions = int(
                1 + memusage.compute() // config.PARTITION_SIZE)
            # repartition() returns a new collection; the result must be
            # kept or the computed partition count is never applied.
            data = data.repartition(npartitions=npartitions)
        else:
            npartitions = int(1 + memusage // config.PARTITION_SIZE)
            data = dd.from_pandas(data, npartitions=npartitions)
    elif not isinstance(data, dd.DataFrame):
        # Explicit partitioning was requested for in-memory data: it still
        # has to become a dask collection before dd.to_parquet can write it.
        if npartitions is not None:
            data = dd.from_pandas(data, npartitions=npartitions)
        else:
            data = dd.from_pandas(data, chunksize=int(chunksize))

    dd.to_parquet(data, self._item_path(item, as_string=True),
                  compression="snappy", engine=self.engine, **kwargs)

    utils.write_metadata(
        utils.make_path(self.datastore, self.collection, item), metadata)

    # update items
    self.items.add(item)
    if reload_items:
        self._list_items_threaded()
def test_passing_parquetfile(tmpdir):
    """``read_parquet`` accepts an already-opened fastparquet ``ParquetFile``."""
    import shutil

    fp = pytest.importorskip('fastparquet')
    path = str(tmpdir)

    frame = pd.DataFrame({"x": [1, 3, 2, 4]})
    dd.to_parquet(dd.from_pandas(frame, npartitions=1), path)

    # Open the dataset, then delete it from disk.
    pf = fp.ParquetFile(path)
    shutil.rmtree(path)

    # should pass, because no need to re-read metadata
    dd.read_parquet(pf)
def _write_dataset(self):
    """
    Helper function to write the actual dataset files.

    The training and test frames are written with the pyarrow engine
    (the original author reported better luck writing with pyarrow and
    reading with fastparquet).
    """
    import pyarrow

    print('Writing trainings and test sets: ')
    parquet_engine = 'pyarrow'
    dd.to_parquet(self.train_ddf, self.train_data_path,
                  engine=parquet_engine)
    dd.to_parquet(self.test_ddf, self.test_data_path,
                  engine=parquet_engine)
def perform_transit_dask(task_type: str, years: List[str]) -> bool:
    """Remove outliers from yearly transit data and write the result to S3.

    For each year the input parquet dataset is read, the ``exits`` and
    ``entries`` columns are replaced by their row-to-row differences
    (``delex`` / ``delent``), rows with missing values are dropped, and rows
    outside 1.5 interquartile ranges of either difference are discarded.

    Args:
        task_type: Key into ``task_map.task_type_map`` giving the input and
            output buckets.
        years: Years to process; each is a sub-path of the input bucket.

    Returns:
        True when every year was processed.

    Raises:
        Exception: Any processing error is printed and re-raised.
    """
    task_type_map: Dict = task_map.task_type_map[task_type]
    in_bucket: str = task_type_map['in']
    out_bucket: str = task_type_map['out']
    client: Client = dask.create_dask_client(num_workers=8)
    s3_in_prefix: str = 's3://' + in_bucket + '/'
    try:
        s3_options: Dict = ps.fetch_s3_options()
        for year in years:
            s3_out_url: str = 's3://' + out_bucket + '/' + year + '/'
            s3_in_url: str = s3_in_prefix + year
            df = dd.read_parquet(path=s3_in_url,
                                 storage_options=s3_options,
                                 engine='fastparquet')
            df['delex'] = df['exits'].diff()
            df['delent'] = df['entries'].diff()
            df = df.drop(['exits', 'entries'], axis=1)
            df = df.dropna()

            delex_lo_q = df['delex'].quantile(.25)
            delent_lo_q = df['delent'].quantile(.25)
            delex_hi_q = df['delex'].quantile(.75)
            delent_hi_q = df['delent'].quantile(.75)
            delex_iqr = delex_hi_q - delex_lo_q
            delent_iqr = delent_hi_q - delent_lo_q

            # A row is an outlier when either difference falls outside
            # [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR].
            discard = (
                (df['delex'] < delex_lo_q - 1.5 * delex_iqr)
                | (df['delex'] > delex_hi_q + 1.5 * delex_iqr)
                | (df['delent'] < delent_lo_q - 1.5 * delent_iqr)
                | (df['delent'] > delent_hi_q + 1.5 * delent_iqr)
            )
            df = df.loc[~discard]

            dd.to_parquet(df=df,
                          path=s3_out_url,
                          engine='fastparquet',
                          compute=True,
                          compression='lz4',
                          storage_options=s3_options)
    except Exception as err:
        print('error in perform_transit %s' % str(err))
        raise
    finally:
        # Close the client on every exit path, including interrupts that
        # ``except Exception`` does not catch.
        client.close()
    return True
def test_append_cat_fp(tmpdir):
    """Appending a categorical column with fastparquet keeps all values.

    Regression test for https://github.com/dask/dask/issues/4120
    """
    pytest.importorskip('fastparquet')
    path = str(tmpdir)
    values = ["a", "a", "b", "a", "b"]

    frame = pd.DataFrame({"x": ["a", "a", "b", "a", "b"]})
    frame["x"] = frame["x"].astype("category")
    ddf = dd.from_pandas(frame, npartitions=1)

    dd.to_parquet(ddf, path)
    # The append below is the call that failed in the linked issue.
    dd.to_parquet(ddf, path, append=True, ignore_divisions=True)

    result = dd.read_parquet(path).compute()
    assert result['x'].tolist() == values * 2
def perform_traffic_dask(task_type: str, years: List[str]) -> bool:
    """Convert yearly traffic CSV files on S3 to gzip-compressed parquet.

    For each year, every ``*<year>.csv`` object in the input bucket is read
    (columns 1, 2, 4 and 5 as ``speed``, ``traveltime``, ``datetime`` and
    ``linkid``), cleaned by the ``row_ops`` converters and written to
    ``s3://<out_bucket>/<year>/``.

    Args:
        task_type: Key into ``task_map.task_type_map`` giving the input and
            output buckets.
        years: Years to process.

    Returns:
        True when every year was processed.

    Raises:
        Exception: Any processing error is printed and re-raised.
    """
    task_type_map: Dict = task_map.task_type_map[task_type]
    in_bucket: str = task_type_map['in']
    out_bucket: str = task_type_map['out']
    client: Client = dask.create_dask_client(num_workers=8)
    s3_in_prefix: str = 's3://' + in_bucket + '/'
    try:
        s3_options: Dict = ps.fetch_s3_options()
        usecols = [1, 2, 4, 5]
        names = ['speed', 'traveltime', 'datetime', 'linkid']
        for year in years:
            s3_out_url: str = 's3://' + out_bucket + '/' + year + '/'
            s3_in_url: str = s3_in_prefix + '*' + year + '.csv'
            df = dd.read_csv(urlpath=s3_in_url,
                             storage_options=s3_options,
                             header=None,
                             usecols=usecols,
                             names=names,
                             parse_dates=['datetime'],
                             date_parser=row_ops.clean_traffic_date,
                             skipinitialspace=True,
                             skip_blank_lines=True,
                             converters={
                                 'speed': row_ops.clean_num,
                                 'traveltime': row_ops.clean_num,
                                 'linkid': row_ops.clean_num
                             },
                             encoding='utf-8')
            dd.to_parquet(df=df,
                          path=s3_out_url,
                          engine='fastparquet',
                          compute=True,
                          compression='GZIP',
                          storage_options=s3_options)
    except Exception as err:
        # The message previously named perform_transit, pointing readers of
        # the log at the wrong function.
        print('error in perform_traffic %s' % str(err))
        raise
    finally:
        # Close the client on every exit path, including interrupts that
        # ``except Exception`` does not catch.
        client.close()
    return True
def write(self, item, data, metadata=None, npartitions=None, chunksize=None,
          overwrite=False, epochdate=False, compression="snappy", **kwargs):
    """Write ``data`` as a new item of this collection using fastparquet.

    Args:
        item: Name of the item to create.
        data: An ``Item``, a pandas DataFrame or a dask DataFrame.
        metadata: Mapping stored next to the item; defaults to an empty dict.
        npartitions: Number of partitions for in-memory data.
        chunksize: Rows per partition for in-memory data; one million when
            neither ``npartitions`` nor ``chunksize`` is given.
        overwrite: Allow replacing an existing item.
        epochdate: Force conversion of the index via
            ``utils.datetime_to_int64``.
        compression: Parquet compression codec.
        **kwargs: Forwarded to ``dd.to_parquet``.

    Raises:
        ValueError: If the item already exists and ``overwrite`` is false.
    """
    # A literal ``{}`` default would be one dict shared by every call.
    if metadata is None:
        metadata = {}

    if utils.path_exists(self._item_path(item)) and not overwrite:
        raise ValueError("""
            Item already exists. To overwrite, use `overwrite=True`.
            Otherwise, use `<collection>.append()`""")

    if isinstance(data, Item):
        data = data.to_pandas()
    else:
        # work on copy
        data = data.copy()

    if epochdate or "datetime" in str(data.index.dtype):
        data = utils.datetime_to_int64(data)

    if data.index.name == "":
        data.index.name = "index"

    if not isinstance(data, dd.DataFrame):
        if npartitions is None and chunksize is None:
            chunksize = int(1e6)
        data = dd.from_pandas(data, npartitions=npartitions,
                              chunksize=chunksize)

    dd.to_parquet(data, self._item_path(item, as_string=True),
                  compression=compression, engine='fastparquet', **kwargs)

    utils.write_metadata(
        utils.make_path(self.datastore, self.collection, item), metadata)

    # update items
    self.items = self.list_items()
def preprocess_papers(metadata_filename) -> pd.DataFrame: # Get metadata of research papers metadata_cols = [ "cord_uid", "title", "authors", "abstract", "publish_time", "url", "journal", "pdf_json_files", ] metadata_cols_dtypes = {col: str for col in metadata_cols} metadata_dd = dd.read_csv(metadata_filename, dtype=metadata_cols_dtypes, usecols=metadata_cols) print( f"Memory usage of metadata_df before clean: {metadata_dd.memory_usage(deep=True).sum()}" ) # Perform operations in place to reduce memory usage metadata_dd = remove_papers_with_null_cols(metadata_dd, ["title"]) metadata_dd = remove_papers_with_null_cols(metadata_dd, ["abstract", "url"]) metadata_dd = remove_papers_with_null_cols(metadata_dd, ["authors"]) metadata_dd = remove_papers_with_null_cols(metadata_dd, ["publish_time"]) metadata_dd = fill_in_missing_data(metadata_dd) print( f"Memory usage of metadata_df after clean: {metadata_dd.memory_usage(deep=True).sum()}" ) print(f"# partitions in metadata dd: {metadata_dd.npartitions}") # Get body of research papers and store in df # TODO: Rename gather_papers_data and put below embedding computation logic in it metadata_with_body_dd = gather_papers_data(metadata_dd) metadata_with_body_dd = metadata_with_body_dd.repartition( partition_size="100MB") print( f"# partitions in research papers' metadata dd: {metadata_with_body_dd.npartitions}" ) dd.to_parquet( metadata_with_body_dd, "research_paper_bodies/", engine="fastparquet", compute_kwargs={"scheduler": "synchronous"}, # synchronous ~2m40s ) # Get embeddings of each research paper's title and abstract (embeddings of body text would lose too much info due to current ineffective pooling techniques) """
def test_ordering(tmpdir, write_engine, read_engine):
    """Column order (index first, then 'c', 'a', 'b') survives a round trip."""
    tmp = str(tmpdir)
    column_order = ['c', 'a', 'b']
    frame = pd.DataFrame(
        {'a': [1, 2, 3], 'b': [10, 20, 30], 'c': [100, 200, 300]},
        index=pd.Index([-1, -2, -3], name='myindex'),
        columns=column_order,
    )
    written = dd.from_pandas(frame, npartitions=2)
    dd.to_parquet(written, tmp, engine=write_engine)

    # Only fastparquet exposes the stored column order for inspection here.
    if read_engine == 'fastparquet':
        stored = fastparquet.ParquetFile(tmp)
        assert stored.columns == ['myindex', 'c', 'a', 'b']

    restored = dd.read_parquet(tmp, index='myindex', engine=read_engine)
    assert_eq(written, restored, check_divisions=False)
def test_ordering(tmpdir):
    """Column order (index first, then 'c', 'a', 'b') survives a round trip."""
    check_fastparquet()
    tmp = str(tmpdir)
    column_order = ['c', 'a', 'b']
    frame = pd.DataFrame(
        {'a': [1, 2, 3], 'b': [10, 20, 30], 'c': [100, 200, 300]},
        index=pd.Index([-1, -2, -3], name='myindex'),
        columns=column_order,
    )
    written = dd.from_pandas(frame, npartitions=2)
    dd.to_parquet(written, tmp)

    stored = fastparquet.ParquetFile(tmp)
    assert stored.columns == ['myindex', 'c', 'a', 'b']

    restored = dd.read_parquet(tmp, index='myindex')
    assert_eq(written, restored)
def test_ordering(tmpdir, write_engine, read_engine):
    """Column order (index first, then 'c', 'a', 'b') survives a round trip."""
    tmp = str(tmpdir)
    column_order = ['c', 'a', 'b']
    frame = pd.DataFrame(
        {'a': [1, 2, 3], 'b': [10, 20, 30], 'c': [100, 200, 300]},
        index=pd.Index([-1, -2, -3], name='myindex'),
        columns=column_order,
    )
    written = dd.from_pandas(frame, npartitions=2)
    dd.to_parquet(written, tmp, engine=write_engine)

    # Only fastparquet exposes the stored column order for inspection here.
    if read_engine == 'fastparquet':
        stored = fastparquet.ParquetFile(tmp)
        assert stored.columns == ['myindex', 'c', 'a', 'b']

    restored = dd.read_parquet(tmp, index='myindex', engine=read_engine)
    assert_eq(written, restored, check_divisions=False)
def test_ordering(tmpdir):
    """Column order (index first, then 'c', 'a', 'b') survives a round trip."""
    check_fastparquet()
    tmp = str(tmpdir)
    column_order = ['c', 'a', 'b']
    frame = pd.DataFrame(
        {'a': [1, 2, 3], 'b': [10, 20, 30], 'c': [100, 200, 300]},
        index=pd.Index([-1, -2, -3], name='myindex'),
        columns=column_order,
    )
    written = dd.from_pandas(frame, npartitions=2)
    dd.to_parquet(written, tmp)

    stored = fastparquet.ParquetFile(tmp)
    assert stored.columns == ['myindex', 'c', 'a', 'b']

    restored = dd.read_parquet(tmp, index='myindex')
    assert_eq(written, restored)
def clean_cabs_at_path(special: bool, s3_in_url: str, s3_out_url: str,
                       s3_options: Dict) -> bool:
    """Clean one cab dataset and write the selected columns back to S3.

    Args:
        special: When false, drop-off and pick-up location ids are derived
            from the longitude/latitude columns via ``add_cab_zone``.
        s3_in_url: Location of the input parquet dataset.
        s3_out_url: Location the cleaned dataset is written to.
        s3_options: Storage options passed to the dask reader and writer.

    Returns:
        True on success.

    Raises:
        Exception: Any error is printed and re-raised.
    """
    try:
        df = dd.read_parquet(path=s3_in_url,
                             storage_options=s3_options,
                             engine='fastparquet')

        # add cab zones
        if not special:
            print('In data clean tasks for cabs. Field dolocationid not found')
            # fetch cab zones
            taxi_zones_df: GeoDataFrame = fetch_cab_zones()

            # Drop-off first, then pick-up: same mapping, different columns.
            for prefix in ('do', 'pu'):
                locid_col = prefix + 'locationid'
                zone_mapper = partial(add_cab_zone,
                                      taxi_zone_df=taxi_zones_df,
                                      lon_var=prefix + 'longitude',
                                      lat_var=prefix + 'latitude',
                                      locid_var=locid_col)
                df[locid_col] = df.map_partitions(zone_mapper,
                                                  meta=(locid_col, int64))
            del taxi_zones_df

        keep_columns = [
            'pudatetime', 'dodatetime', 'passengers', 'distance',
            'dolocationid', 'pulocationid'
        ]
        df = df[keep_columns]

        dd.to_parquet(df=df,
                      path=s3_out_url,
                      engine='fastparquet',
                      compute=True,
                      compression='GZIP',
                      storage_options=s3_options)
        del df
    except Exception as err:
        print('error in clean_cabs_at_path %s' % str(err))
        raise err
    else:
        return True
def prepare_dataset(inloc, outloc, date_class, shapefile, crs='epsg:32618', seed=33):
    """
    Extract xarray dataset variables for locations in the shapefile

    Args:
        inloc (str): path of the directory holding the datasets to process
            (with or without a trailing path separator)
        outloc (str): path of location to write resulting dataframe as parquet
        date_class ([(str, np.datetime64)]): list of tuples with the first
            element indicating the column name with the phenology obs data in
            the shapefile and the second the corresponding date.
            e.g ('X20151222', np.datetime64('2015-12-22'))
        shapefile (str): path of location of the shapefile with class data
            regions in shapefile must have a lot identifier, coded as (IDLote)
        crs (str): crs corresponding to dataset coordinates, default 'epsg:32618'
        seed (int): seed for the generation of reproducible datasets, default 33
    """
    # Read shapefile
    fields_shp = gpd.read_file(shapefile).to_crs({'init': crs})

    # Indicate if used as train or test, based on a dictionary of the form
    # dic[date][polygon_index] -> 'test' / 'train'
    class_dict = assign_polygons_to_class(date_class, fields_shp, seed)

    # Open the datasets. os.path.join keeps the path valid whether or not
    # ``inloc`` ends with a separator (plain concatenation required one).
    dss = [xr.open_dataset(os.path.join(inloc, name))
           for name in os.listdir(inloc)]

    # One dataframe per observation date; entry[0] is the shapefile column,
    # entry[1] the date.
    date_dataframes = [
        create_date_dataframe(dss, fields_shp, entry[1], entry[0],
                              class_dict[entry[1]])
        for entry in date_class
    ]
    data = dd.concat(date_dataframes, axis=0, interleave_partitions=True)

    # Write results with valid polygon class data to outloc
    dd.to_parquet(data[data['vclass'] > 0], outloc)
def run(self):
    """Query the Vertica database and write a dataframe to this task's output path.

    NOTE(review): parts of this method are elided in the visible source
    (the ``# ...`` markers); ``ddf`` is not defined in the code shown here.
    """
    # Connection settings for vertica_python; credentials are masked in the source.
    conn_info={'host':'safsand', 'port':5433, 'user':'******', 'password':'******', 'database':'MBFS', 'read_timeout':600, 'unicode_error':'ignore', 'ssl':False}
    connection=vertica_python.connect(**conn_info)
    cur=connection.cursor()
    cur.execute(""" select * from abc""")
    # .....
    # NOTE(review): the connection is not closed if an earlier statement raises.
    connection.close()
    # ....
    ddf.to_parquet(self.output().path)
def convert_to_parquet(path2files, cols):
    """
    Convert gzip-compressed, ';'-separated CSV files to parquet datasets.

    Only the requested columns are read and only rows whose ``PRS_REM_TYP``
    is 0 or 1 are kept. Each output is written under ``data/`` and named
    after the first ``A2019MM`` tag found in the input path.

    :param path2files: List of all monthly files
    :param cols: List of strings, Columns to be selected in final output
    :return: Nothing, export to parquet under ``data/``
    """
    month_tag = re.compile(r"A2019[0-9]{2}")
    for csv_path in path2files:
        print(csv_path)
        frame = dd.read_csv(csv_path,
                            usecols=cols,
                            sep=";",
                            compression='gzip',
                            blocksize=None)
        frame = frame.repartition(npartitions=20)

        # Filter : legal part only
        is_legal = frame["PRS_REM_TYP"].isin([0, 1])
        frame = frame.loc[is_legal, :]

        target = "data/" + month_tag.findall(csv_path)[0]
        dd.to_parquet(frame, target)
def append(self, item, data, npartitions=None, chunksize=1e6,
           epochdate=False, compression="snappy", **kwargs):
    """Append the rows of ``data`` whose index is not yet stored in ``item``.

    Args:
        item: Name of an existing item.
        data: pandas DataFrame with the rows to add.
        npartitions: Number of dask partitions for the new rows.
        chunksize: Rows per partition for the new rows.
        epochdate: Force conversion of the index via
            ``utils.datetime_to_int64``.
        compression: Parquet compression codec.
        **kwargs: Forwarded to ``dd.to_parquet``.

    Raises:
        ValueError: If the item does not exist.
    """
    if not utils.path_exists(self._item_path(item)):
        raise ValueError(
            """Item do not exists. Use `<collection>.write(...)`""")

    # work on copy
    data = data.copy()

    try:
        # ``Index.dtype_str`` was removed in pandas 1.0; with it the
        # AttributeError was swallowed below and nothing was ever appended.
        # ``str(index.dtype)`` is the equivalent that works on all versions.
        has_nanoseconds = ("datetime" in str(data.index.dtype)
                           and any(data.index.nanosecond))
        if epochdate or has_nanoseconds:
            data = utils.datetime_to_int64(data)
        old_index = dd.read_parquet(self._item_path(item, as_string=True),
                                    columns='index',
                                    engine='fastparquet').index.compute()
        data = data[~data.index.isin(old_index)]
    except Exception:
        # NOTE(review): best-effort behaviour kept from the original - any
        # failure while de-duplicating silently skips the append.
        return

    if data.empty:
        return

    if data.index.name == "":
        data.index.name = "index"

    data = dd.from_pandas(data, npartitions=npartitions,
                          chunksize=int(chunksize))
    dd.to_parquet(data, self._item_path(item, as_string=True), append=True,
                  compression=compression, engine='fastparquet', **kwargs)
def write_parquet(
    self,
    data: Union[pd.DataFrame, pd.Series, dd.DataFrame],
    overwrite: bool = True,
    present: str = "ignore",
    columns_to_string: bool = True,
    **kwargs,
) -> None:
    """Write ``data`` to this path as snappy-compressed parquet via dask/pyarrow.

    Args:
        data: The data to write.
        overwrite: When false, an existing file raises unless ``present`` is
            ``"ignore"``.
        present: Policy for an existing file; see ``overwrite``.
        columns_to_string: Cast column labels to ``str`` when the first label
            is not a string.
        **kwargs: Forwarded to ``dd.to_parquet``. A ``compression`` entry is
            accepted but always replaced by ``"snappy"``.

    Raises:
        TPImportError: If parquet support is not available.
        TPFileExistsError: If the file exists and must not be overwritten.
    """
    if not parquet_ok:
        raise TPImportError(errormessage_hdf5)

    if self.suffix != ".parquet":
        warnings.warn(
            f"path {self} does not have '.parquet' as suffix while using to_parquet. The path will be "
            f"changed to a path with '.parquet' as suffix")
        self.change_suffix(".parquet")

    # Remove the caller's compression from kwargs: it is passed explicitly
    # below, and leaving it in would make dd.to_parquet receive the keyword
    # twice and raise TypeError.
    compression = kwargs.pop("compression", None)
    if compression is not None and compression != "snappy":
        warnings.warn(
            "TransparentPath can not write parquet files with a compression that is not snappy. You "
            f"specified '{compression}', it will be replaced by 'snappy'.")

    if not self.nocheck:
        self._check_multiplicity()

    if not overwrite and self.is_file() and present != "ignore":
        raise TPFileExistsError()

    if columns_to_string and not isinstance(data.columns[0], str):
        data.columns = data.columns.astype(str)

    # A single dask client is created lazily and shared through the class.
    if self.__class__.cli is None:
        self.__class__.cli = client.Client(processes=False)

    check_kwargs(dd.to_parquet, kwargs)
    dd.to_parquet(data, self.with_suffix("").__fspath__(), engine="pyarrow",
                  compression="snappy", **kwargs)
def aggregation_df(self, parquet=False):
    '''Return a dask dataframe containing all of the patient level data.

    The ``pt_level_*.csv`` files under ``./rolling_window_output`` are read,
    the descriptive columns are converted to categoricals and the numeric
    columns are cast to their expected types.

    When ``parquet`` is true the data is first copied to the
    ``rolling_window_parquet`` dataset, which is faster for querying, and the
    returned dataframe reads from that dataset.
    '''
    df = dd.read_csv('./rolling_window_output/pt_level_*.csv')
    # 'Diagnosis_Category' was listed twice; each column is categorized once.
    df = df.categorize(columns=[
        'Window', 'Category_Type', 'Category', 'Population_Type',
        'Diagnosis_Category', 'Age_Category', 'Gender'
    ])

    df['AdjustedPriceAmt'] = df['AdjustedPriceAmt'].astype(float)
    df['Encounter'] = df['Encounter'].astype(int)
    df['ServiceCount'] = df['ServiceCount'].astype(int)
    df['VisitInpatientDays'] = df['VisitInpatientDays'].astype(int)

    if parquet:
        dd.to_parquet(df, 'rolling_window_parquet')
        df = dd.read_parquet('rolling_window_parquet/')
    return df
def test_append_with_partition(tmpdir):
    """Appending to a dataset partitioned on ``lon`` keeps every row of both writes."""
    check_fastparquet()
    tmp = str(tmpdir)

    first = pd.DataFrame({
        'lat': np.arange(0, 10),
        'lon': np.arange(10, 20),
        'value': np.arange(100, 110),
    })
    first.index.name = 'index'
    second = pd.DataFrame({
        'lat': np.arange(10, 20),
        'lon': np.arange(10, 20),
        'value': np.arange(120, 130),
    })
    second.index.name = 'index'

    first_ddf = dd.from_pandas(first, npartitions=1)
    second_ddf = dd.from_pandas(second, npartitions=1)
    dd.to_parquet(first_ddf, tmp, partition_on=['lon'])
    dd.to_parquet(second_ddf, tmp, partition_on=['lon'], append=True,
                  ignore_divisions=True)

    out = dd.read_parquet(tmp).compute()
    out['lon'] = out.lon.astype('int')  # just to pass assert

    expected = pd.concat([first, second])[out.columns]
    # sort required since partitioning breaks index order
    assert_eq(out.sort_values('value'), expected, check_index=False)
def to_parquet(df: dd.DataFrame,
               out_bucket: str,
               folder: str,
               compute: bool = True) -> bool:
    """Write ``df`` to ``s3://<out_bucket>/<folder>`` as lz4-compressed parquet.

    Args:
        df: Dask dataframe to store.
        out_bucket: Name of the destination S3 bucket.
        folder: Key prefix inside the bucket.
        compute: Forwarded to ``dd.to_parquet``.

    Returns:
        True on success.

    Raises:
        Exception: Any error is printed and re-raised.
    """
    try:
        destination: str = 's3://' + out_bucket + '/' + folder
        storage: Dict = ps.fetch_s3_options()
        dd.to_parquet(df=df,
                      path=destination,
                      engine='fastparquet',
                      compute=compute,
                      compression='lz4',
                      storage_options=storage)
    except Exception as err:
        details = {'path': out_bucket + '/' + folder, 'error': str(err)}
        print('error while saving to parquet to path %(path)s - %(error)s' % details)
        raise err
    else:
        return True
def _write(self, collection, path, **kwargs):
    """
    Write a Dask dataframe to disk in Parquet format.

    The work is delegated to ``to_parquet``, which automatically creates
    parent directories if required.

    Args:
        collection: dask dataframe, to be written to disk
        path: str, full path of the target
        **kwargs: dictionary, named arguments to be passed to to_parquet.

    Returns:
        Whatever ``to_parquet`` returns for the given arguments.
    """
    outcome = to_parquet(collection, path, **kwargs)
    return outcome
def main(data_path, output_path):
    """Label the articles at ``data_path`` and save them to ``output_path``.

    The labeling functions are applied with a ``DaskLFApplier``, a
    ``LabelModel`` is fitted on the resulting label matrix, and the
    probability in column 1 of its prediction is stored as ``y_prob``.
    """
    # Read data
    logging.info(f"Reading data from {data_path}")
    articles = dd.read_parquet(data_path)
    articles = articles.repartition(npartitions=2)

    # Build label matrix
    logging.info("Applying LFs")
    labeling_functions = [
        article_mentions_person, body_contains_fortune, person_in_db
    ]
    label_matrix = DaskLFApplier(labeling_functions).apply(articles)

    # Train label model
    logging.info("Training label model")
    label_model = LabelModel(cardinality=2)
    label_model.fit(label_matrix)

    # Generate training labels
    logging.info("Generating probabilistic labels")
    y_prob = label_model.predict_proba(label_matrix)[:, 1]
    articles = articles.reset_index().set_index("index")
    labeled = articles.assign(y_prob=dd.from_array(y_prob))

    dd.to_parquet(labeled, output_path)
    logging.info(f"Labels saved to {output_path}")
def clean_write_parquet(dataframe, path, engine="fastparquet", compute=True):
    """Write a dask dataframe to ``path`` with the project's standard options.

    The destination directory is removed first so files from an older write
    with a different number of partitions cannot mix with the new ones.

    Raises:
        ValueError: If ``dataframe`` is not a dask dataframe.
    """
    # Guard against accidentally handing over a pandas frame.
    is_dask_frame = isinstance(dataframe, dask.dataframe.DataFrame)
    if not is_dask_frame:
        message = "Attempted to write dask dataframe, but got a {}".format(
            str(type(dataframe)))
        raise ValueError(message)

    # Start from an empty destination; a missing directory is fine.
    try:
        shutil.rmtree(path)
    except FileNotFoundError:
        pass

    write_options = dict(compression="snappy", engine=engine, compute=compute)
    return dataframe.to_parquet(path, **write_options)
def run_pipeline(task_type: str) -> bool:
    """Run the read / clean / resample / write pipeline for one task type.

    The task configuration is looked up in ``task_type_map``; input is read
    from ``tmp/<in_bucket>/`` and the result is written as parquet under
    ``tmp/<out_bucket>/``.

    Returns:
        True on success. Any exception is printed and re-raised.

    NOTE(review): ``resample_map`` is not defined inside this function -
    presumably a module-level name; confirm.
    """
    # NOTE(review): ``map`` and ``sorted`` shadow the builtins of the same
    # name for the rest of this function.
    map: Dict = task_type_map[task_type]
    in_bucket: str = map['in']
    out_bucket: str = map['out']
    cols: Dict[str, str] = map['cols']
    converters: Dict[str, Callable] = map['converters']
    dtypes: Dict[str, str] = map['dtypes']
    index_col: str = map['index']['col']
    sorted: bool = map['index']['sorted']
    row_op: Callable = map['row_op']
    diff: Dict = map['diff']
    filter_by_key: str = resample_map['filter_by']['key']
    filter_by_val: int = resample_map['filter_by']['value']
    resample_freq: str = resample_map['freq']
    aggr_func: Callable = map['aggr_func']

    try:
        #client = Client(address='dscheduler:8786')
        # NOTE(review): s3_in_url and s3_options are only used by the
        # commented-out S3 calls; the active code reads from ``tmp/``.
        s3_in_url: str = 's3://'+in_bucket+'/*.*'
        s3_options: Dict = ps.fetch_s3_options()
        #df = dd.read_table(path=s3_in_url, storage_options=s3_options)
        df = dd.read_table(urlpath='tmp/'+in_bucket+'/*.*',
                           header=0,
                           usecols=lambda x: x.upper() in list(cols.keys()),
                           skipinitialspace=True,
                           converters=converters
                           )

        # rename columns
        df = df.rename(columns=cols)
        # NOTE(review): the result of this compute() is discarded.
        df.compute()

        # NOTE(review): both branches end in .compute(), so ``df`` is no
        # longer a lazy dask collection afterwards - confirm this is intended.
        if sorted:
            df = df.map_partitions(lambda pdf: pdf.rename(columns=cols)
                                   .apply(func=row_op, axis=1),
                                   meta=dtypes).compute()
        else:
            df = df.map_partitions(lambda pdf: pdf.rename(columns=cols)
                                   .set_index(index_col).sort().reset_index()
                                   .apply(func=row_op, axis=1),
                                   meta=dtypes).compute()

        # map row-wise operations
        #df = df.map_partitions(lambda pdf: pdf.apply(func=row_op, axis=1), meta=dtypes)

        # diff
        if diff['compute']:
            df[diff['new_col']] = df[diff['col']].diff()

        # specific processing for transit
        if task_type == 'cl-transit':
            df = df.map_partitions(partial(remove_outliers, col='DELEXITS'), meta=dtypes)

        # drop na values
        df = df.dropna()

        # set index (assumes pre-sorted data)
        df = df.set_index(index_col, sorted=True)
        #df.compute()

        # filter
        # NOTE(review): ``index_col`` was just made the index, yet it is
        # accessed here as a column - verify.
        if filter_by_key == 'weekday':
            df = df.loc[df[index_col].weekday() == filter_by_val]

        # resample using frequency and aggregate function specified
        df = compose(df.resample(resample_freq), aggr_func)

        # save in out bucket
        s3_out_url: str = 's3://' + out_bucket
        # dd.to_parquet(df=df, path=s3_out_url, storage_options=s3_options)
        dd.to_parquet(df=df, path='tmp/'+out_bucket+'/*.*')

    except Exception as err:
        print('error in run_pipeline %s' % str(err))
        raise err

    return True
f_adj_neg = 'adj_neg/' + uuid.uuid4().hex f_adj_pos = 'adj_pos/' + uuid.uuid4().hex f_adj_neu = 'adj_neu/' + uuid.uuid4().hex with open(f_adj_pos, 'wb') as file: pickle.dump(adj_pos, file, pickle.HIGHEST_PROTOCOL) with open(f_adj_neu, 'wb') as file: pickle.dump(adj_neu, file, pickle.HIGHEST_PROTOCOL) with open(f_adj_neg, 'wb') as file: pickle.dump(adj_neg, file, pickle.HIGHEST_PROTOCOL) out["text"] = texts out["sentiment"] = df["sentiment"] out["length"] = length return out meta = pd.DataFrame(columns=["text", "sentiment", "length"]) meta["sentiment"] = meta["sentiment"].astype(np.int8) meta["text"] = meta["text"].astype(str) meta["length"] = meta["length"].astype(np.int64) records = dd.read_parquet(path=srcpath) t0 = time.time() result = records.map_partitions(analyze_df, meta=meta) parquet = dd.to_parquet(df=result[result["length"] > 0], path=tgtpath, compute=False) parquet.compute(scheduler="processes") t1 = time.time() print(t1 - t0)
def test_dask_parquet(storage):
    """Write dask dataframes to Azure blob storage as parquet and read them back.

    Covers both the ``abfs`` and ``az`` protocols, multi-partition writes,
    ``partition_on`` with a filter on read, a write whose metadata files are
    removed afterwards, and a path containing a space.
    """
    fs = AzureBlobFileSystem(account_name=storage.account_name,
                             connection_string=CONN_STR)
    fs.mkdir("test")
    STORAGE_OPTIONS = {
        "account_name": "devstoreaccount1",
        "connection_string": CONN_STR,
    }
    df = pd.DataFrame({
        "col1": [1, 2, 3, 4],
        "col2": [2, 4, 6, 8],
        "index_key": [1, 1, 2, 2],
        "partition_key": [1, 1, 2, 2],
    })

    # Single-partition write, once per supported protocol prefix.
    dask_dataframe = dd.from_pandas(df, npartitions=1)
    for protocol in ["abfs", "az"]:
        dask_dataframe.to_parquet(
            "{}://test/test_group.parquet".format(protocol),
            storage_options=STORAGE_OPTIONS,
            engine="pyarrow",
        )

        fs = AzureBlobFileSystem(**STORAGE_OPTIONS)
        assert fs.ls("test/test_group.parquet") == [
            "test/test_group.parquet/_common_metadata",
            "test/test_group.parquet/_metadata",
            "test/test_group.parquet/part.0.parquet",
        ]
        fs.rm("test/test_group.parquet")

    df_test = dd.read_parquet(
        "abfs://test/test_group.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    ).compute()
    assert_frame_equal(df, df_test)

    # Four partitions: one part file per partition is expected.
    A = np.random.randint(0, 100, size=(10000, 4))
    df2 = pd.DataFrame(data=A, columns=list("ABCD"))
    ddf2 = dd.from_pandas(df2, npartitions=4)
    dd.to_parquet(
        ddf2,
        "abfs://test/test_group2.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    )
    assert fs.ls("test/test_group2.parquet") == [
        "test/test_group2.parquet/_common_metadata",
        "test/test_group2.parquet/_metadata",
        "test/test_group2.parquet/part.0.parquet",
        "test/test_group2.parquet/part.1.parquet",
        "test/test_group2.parquet/part.2.parquet",
        "test/test_group2.parquet/part.3.parquet",
    ]
    df2_test = dd.read_parquet(
        "abfs://test/test_group2.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    ).compute()
    assert_frame_equal(df2, df2_test)

    # Constant columns, so partitioning on A and B yields one A=1 directory.
    a = np.full(shape=(10000, 1), fill_value=1)
    b = np.full(shape=(10000, 1), fill_value=2)
    c = np.full(shape=(10000, 1), fill_value=3)
    d = np.full(shape=(10000, 1), fill_value=4)
    B = np.concatenate((a, b, c, d), axis=1)
    df3 = pd.DataFrame(data=B, columns=list("ABCD"))
    ddf3 = dd.from_pandas(df3, npartitions=4)
    dd.to_parquet(
        ddf3,
        "abfs://test/test_group3.parquet",
        partition_on=["A", "B"],
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    )
    assert fs.glob("test/test_group3.parquet/*") == [
        "test/test_group3.parquet/A=1",
        "test/test_group3.parquet/_common_metadata",
        "test/test_group3.parquet/_metadata",
    ]
    df3_test = dd.read_parquet(
        "abfs://test/test_group3.parquet",
        filters=[("A", "=", 1)],
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    ).compute()
    # Restore the original column order and integer dtype before comparing.
    df3_test = df3_test[["A", "B", "C", "D"]]
    df3_test = df3_test[["A", "B", "C", "D"]].astype(int)
    assert_frame_equal(df3, df3_test)

    # Spark-flavoured write; the metadata files are removed afterwards so
    # only the part files remain.
    A = np.random.randint(0, 100, size=(10000, 4))
    df4 = pd.DataFrame(data=A, columns=list("ABCD"))
    ddf4 = dd.from_pandas(df4, npartitions=4)
    dd.to_parquet(
        ddf4,
        "abfs://test/test_group4.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
        flavor="spark",
        write_statistics=False,
    )
    # NOTE(review): each metadata path is passed to both rmdir and rm -
    # confirm that both calls are needed.
    fs.rmdir("test/test_group4.parquet/_common_metadata", recursive=True)
    fs.rmdir("test/test_group4.parquet/_metadata", recursive=True)
    fs.rm("test/test_group4.parquet/_common_metadata")
    fs.rm("test/test_group4.parquet/_metadata")
    assert fs.ls("test/test_group4.parquet") == [
        "test/test_group4.parquet/part.0.parquet",
        "test/test_group4.parquet/part.1.parquet",
        "test/test_group4.parquet/part.2.parquet",
        "test/test_group4.parquet/part.3.parquet",
    ]
    df4_test = dd.read_parquet(
        "abfs://test/test_group4.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    ).compute()
    assert_frame_equal(df4, df4_test)

    # Path containing a space.
    A = np.random.randint(0, 100, size=(10000, 4))
    df5 = pd.DataFrame(data=A, columns=list("ABCD"))
    ddf5 = dd.from_pandas(df5, npartitions=4)
    dd.to_parquet(
        ddf5,
        "abfs://test/test group5.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    )
    assert fs.ls("test/test group5.parquet") == [
        "test/test group5.parquet/_common_metadata",
        "test/test group5.parquet/_metadata",
        "test/test group5.parquet/part.0.parquet",
        "test/test group5.parquet/part.1.parquet",
        "test/test group5.parquet/part.2.parquet",
        "test/test group5.parquet/part.3.parquet",
    ]
    df5_test = dd.read_parquet(
        "abfs://test/test group5.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    ).compute()
    assert_frame_equal(df5, df5_test)
def to_parquet(df, path, x, y, p=10, npartitions=None, shuffle=None,
               compression='default'):
    """
    Perform spatial partitioning on an input dataframe and write the result
    to a parquet file. The resulting parquet file will contain the same
    columns as the input dataframe, but the dataframe's original index will
    be dropped.

    The resulting parquet file will contain all of the rows from the input
    dataframe, but they will be spatially sorted and partitioned along a 2D
    Hilbert curve (https://en.wikipedia.org/wiki/Hilbert_curve).

    The parquet file will also contain custom metadata that is needed to
    reconstruct the Hilbert curve distances on load.  This parquet file may
    then be used to construct SpatialPointsFrame instances using
    datashader.spatial.points.read_parquet.

    Any existing file tree at ``path`` is removed, but only once the
    arguments have been validated and the partitioning has been set up, so
    a rejected call leaves previously written data untouched.

    Parameters
    ----------
    df: pd.DataFrame or dd.DataFrame
        The input dataframe to partition
    path: str
        The path where the resulting parquet file should be written. Must
        end with '.parquet' or '.parq'.
        See dask.dataframe.to_parquet for description of supported path
        specifications.
    x, y
        The column labels in df of the x and y coordinates of each row
    p: int (default 10)
        The Hilbert curve order parameter that determines the resolution
        of the 2D grid that data points are rounded to before computing
        their Hilbert distance. Points will be discretized into 2 ** p
        bins in each the x and y dimensions.

        This parameter should be increased if the partitions of the
        resulting parquet files are significantly unbalanced.
    npartitions: int or None (default None)
        The number of partitions for the resulting parquet file.  If None
        (the default) this is chosen to be the greater of 8 and
        len(df) // 2**23.

        In general, increasing the number of partitions will improve
        performance when processing small subsets of the overall parquet
        data set.  But this comes at the cost of some additional overhead
        when processing the entire data set.
    shuffle: str or None (default None)
        The dask.dataframe.DataFrame.set_index shuffle method. If None,
        a default is chosen based on the current scheduler.
    compression: str or None (default 'default')
        The dask.dataframe.to_parquet compression method.

    Raises
    ------
    ValueError
        If ``path`` is not a string with a .parquet/.parq extension, or if
        ``df`` is neither a pandas nor a dask DataFrame.
    """
    _validate_fastparquet()

    # Validate filename
    if (not isinstance(path, basestring) or
            not path.endswith(('.parquet', '.parq'))):
        raise ValueError(
            "filename must be a string ending with a .parquet or .parq "
            "extension")

    # Normalize to dask dataframe
    if isinstance(df, pd.DataFrame):
        ddf = dd.from_pandas(df, npartitions=4)
    elif isinstance(df, dd.DataFrame):
        ddf = df
    else:
        raise ValueError("""
df must be a pandas or dask DataFrame instance.
Received value of type {typ}""".format(typ=type(df)))

    # Get number of rows
    nrows = len(df)

    # Compute npartitions if needed
    if npartitions is None:
        # Make partitions of ~8 million rows with a minimum of 8
        # partitions
        npartitions = max(nrows // 2**23, 8)

    # Compute data extents
    extents = ddf.map_partitions(_compute_extents, x, y).compute()

    x_range = (float(extents['x_min'].min()),
               float(extents['x_max'].max()))

    y_range = (float(extents['y_min'].min()),
               float(extents['y_max'].max()))

    # Compute distance of points along the Hilbert-curve
    ddf = ddf.assign(distance=ddf.map_partitions(
        _compute_distance, x=x, y=y, p=p,
        x_range=x_range, y_range=y_range, as_series=True))

    # Set index to distance. This will trigger an expensive shuffle
    # sort operation
    ddf = ddf.set_index('distance',
                        npartitions=npartitions,
                        shuffle=shuffle)

    # Get list of the distance divisions computed by dask
    distance_divisions = [int(d) for d in ddf.divisions]

    # Save properties as custom metadata in the parquet file
    props = dict(
        version='1.0',
        x=x,
        y=y,
        p=p,
        distance_divisions=distance_divisions,
        x_range=x_range,
        y_range=y_range,
        nrows=nrows,
    )

    # Drop distance index to save storage space
    ddf = ddf.reset_index(drop=True)

    # Remove any existing directory. This is done as late as possible so
    # that a failure in the validation or extent computation above does not
    # destroy data that was already stored at this path.
    if os.path.exists(path):
        shutil.rmtree(path)

    # Save ddf to parquet
    dd.to_parquet(ddf, path, engine='fastparquet', compression=compression)

    # Open resulting parquet file
    pf = fp.ParquetFile(path)

    # Add a new property to the file metadata
    new_fmd = copy.copy(pf.fmd)
    new_kv = fp.parquet_thrift.KeyValue()
    new_kv.key = 'SpatialPointsFrame'
    new_kv.value = json.dumps(props)
    new_fmd.key_value_metadata.append(new_kv)

    # Overwrite file metadata
    fn = os.path.join(path, '_metadata')
    fp.writer.write_common_metadata(fn, new_fmd, no_row_groups=False)

    fn = os.path.join(path, '_common_metadata')
    fp.writer.write_common_metadata(fn, new_fmd)
def _write(cls, collection, path, **kwargs):
    """
    Write ``collection`` to ``path`` by delegating to ``to_parquet``.

    Every extra keyword argument is forwarded unchanged, and whatever
    ``to_parquet`` returns is handed back to the caller.
    """
    written = to_parquet(collection, path, **kwargs)
    return written
def get_ld(rgeno, rbim, tgeno, tbim, kbwindow=1000, threads=1, max_memory=None,
           justd=False, extend=False):
    """
    Get the LD blocks from snp overlap between two populations

    Two cache files in the current working directory are consulted before
    anything is computed: 'r.pickle' (the raw per-window results) and
    'ld.matrix' (the concatenated result stored as parquet). 'r.pickle' is
    only written after the computation has finished, so a failed run does
    not leave an unreadable cache behind.

    :param rgeno: Genotype array of the reference population
    :param rbim: Mapping variant info and the genotype array position for ref
    :param tgeno: Genotype array of the target population
    :param tbim: Mapping variant info and the genotype array position for tar
    :param kbwindow: Size of the window in KB
    :param threads: Number of threads to use for computation
    :param max_memory: Memory limit
    :param justd: Return only the raw LD matrices or the tagging/cotagging
    :param extend: 'Circularize' the genome by extending both ends
    :return: A tuple with the non-None per-window results if justd, otherwise
        their concatenation as a dataframe. When only 'ld.matrix' exists the
        dask dataframe read from it is returned as is.
    """
    rp = 'r.pickle'
    if os.path.isfile(rp):
        # NOTE: unpickling can execute arbitrary code; only reuse an
        # r.pickle that was produced by a previous run of this function.
        with open(rp, 'rb') as pckl:
            r = pickle.load(pckl)
    elif os.path.isfile('ld.matrix'):
        print('Loading precomputed LD matrix')
        # The stored matrix is already the concatenated result; return it
        # directly instead of sending it through the per-window tuple
        # filtering and pd.concat below, which expect a sequence of results.
        return dd.read_parquet('ld.matrix')
    else:
        # set Cache to protect memory spilling
        if max_memory is not None:
            available_memory = max_memory
        else:
            available_memory = psutil.virtual_memory().available
        cache = Chest(available_memory=available_memory)
        print('Computing LD score per window')
        # Get the overlapping snps and their info
        shared = ['chrom', 'snp', 'pos']
        mbim = rbim.merge(tbim, on=shared, suffixes=['_ref', '_tar'])
        assert mbim.i_ref.values.shape == mbim.i_tar.values.shape
        # Get the number of bins or loci to be computed
        max_pos = max(mbim.pos)
        nbins = np.ceil(max_pos / (kbwindow * 1000)).astype(int)
        # Get the limits of the loci
        bins = np.linspace(0, max_pos + 1, num=nbins, endpoint=True,
                           dtype=int)
        if bins.shape[0] == 1:
            # Fix the special case in which the window is much bigger than
            # the range
            bins = np.append(bins, kbwindow * 1000)
        # Get the proper intervals into the dataframe
        mbim['windows'] = pd.cut(mbim['pos'], bins, include_lowest=True)
        # Compute each locus in parallel
        dask_rgeno = dask.delayed(rgeno)
        dask_tgeno = dask.delayed(tgeno)
        delayed_results = [
            dask.delayed(single_window)(df, rg, tg, threads, max_memory,
                                        justd, extend)
            for rg, tg, _ridx, _tidx, df in window_yielder(
                dask_rgeno, dask_tgeno, mbim)
        ]
        pool = ThreadPool(threads)
        try:
            opts = dict(num_workers=threads, cache=cache, pool=pool)
            with ProgressBar(), dask.config.set(**opts):
                r = tuple(dask.compute(*delayed_results))
        finally:
            # Release the worker threads whether or not the compute worked
            pool.close()
            pool.join()
        # Write the cache only once the results exist: opening the file
        # before computing would leave an empty r.pickle behind on failure
        # and every later call would then fail while loading it.
        with open(rp, 'wb') as pck:
            pickle.dump(r, pck)
    # Windows that produced no result come back as None; drop them
    r = tuple(x for x in r if x is not None)
    if justd:
        return r
    r = pd.concat(r)
    # NOTE(review): dd.to_parquet is documented for dask dataframes while r
    # is the result of pd.concat here -- confirm this call works with the
    # installed dask version.
    dd.to_parquet(r, 'ld.matrix')
    return r