def get_missing_dates(product, name):
    # get start date from raw_target() and update
    # TODO: actually do bdate range or whatever, currently making hard assumption of continuity in dates
    missing_back_dates = []  # TODO: backfill check biz days etc
    filename = os.path.join(raw_target(), 'product={}/name={}'.format(product, name))
    first_new_date = pd.read_parquet(filename).date.max()
    filename = os.path.join(download_update_target(), 'product={}/name={}'.format(product, name))
    if len(glob.glob(os.path.join(filename, '*.parquet'))) > 0:
        first_new_date = max(first_new_date, pd.read_parquet(filename).date.max())
    first_new_date = first_new_date.date() + datetime.timedelta(days=1)
    return missing_back_dates, first_new_date
def download(state: str, year: int, month: int, cache: bool = True) -> object:
    """
    Download SIA records for state, year and month and return dataframes
    :param month: 1 to 12
    :param state: 2 letter state code
    :param year: 4 digit integer
    """
    state = state.upper()
    year2 = str(year)[-2:]
    month = str(month).zfill(2)
    if year < 1994:
        raise ValueError("SIA does not contain data before 1994")
    ftp = FTP('ftp.datasus.gov.br')
    ftp.login()
    if year < 2008:
        ftype = 'DBC'
        ftp.cwd('/dissemin/publicos/SIASUS/199407_200712/Dados')
        fname = 'PA{}{}{}.dbc'.format(state, year2, month)
        fname2 = None
    else:
        ftype = 'DBC'
        ftp.cwd('/dissemin/publicos/SIASUS/200801_/Dados')
        fname = 'PA{}{}{}.dbc'.format(state, year2, month)
        fname2 = 'BI{}{}{}.dbc'.format(state, year2, month)
    # Check in Cache
    cachefile = os.path.join(CACHEPATH, 'SIA_' + fname.split('.')[0] + '_.parquet')
    if os.path.exists(cachefile):
        df = pd.read_parquet(cachefile)
    else:
        df = _fetch_file(fname, ftp, ftype)
        if cache:
            df.to_parquet(cachefile)
    if fname2 is not None:
        cachefile2 = os.path.join(CACHEPATH, 'SIA_' + fname2.split('.')[0] + '_.parquet')
        if os.path.exists(cachefile2):
            # reads from cache
            df2 = pd.read_parquet(cachefile2)
        else:
            # fetches from DataSUS
            try:
                df2 = _fetch_file(fname2, ftp, ftype)
                if cache:
                    # saves to cache
                    df2.to_parquet(cachefile2)
            except Exception as e:
                df2 = None
                print(e)
    else:
        df2 = None
    return df, df2
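# Hedged usage sketch for the SIA download() above: it needs live FTP access to
# ftp.datasus.gov.br plus the CACHEPATH constant and _fetch_file helper, so it is
# shown commented out. State/year/month values are illustrative only.
# pa_df, bi_df = download('SP', 2015, 6)  # BI companion files only exist from 2008 on
# if bi_df is not None:
#     print(pa_df.shape, bi_df.shape)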
def get_sample_data():
    """ reload(do); globals().update(do.get_sample_data()) """
    df = pd.read_parquet('enriched/nrows=all/product=etfs/name=qqq')
    X_train, y_train, X_val, y_val = get_xy_data_plain(df)
    return locals()
def download(state: str, year: int, month: int, cache: bool = True) -> object:
    """
    Download SIH records for state, year and month and return a dataframe
    :param month: 1 to 12
    :param state: 2 letter state code
    :param year: 4 digit integer
    """
    state = state.upper()
    year2 = str(year)[-2:].zfill(2)
    month = str(month).zfill(2)
    if year < 1992:
        raise ValueError("SIH does not contain data before 1992")
    ftp = FTP('ftp.datasus.gov.br')
    ftp.login()
    if year < 2008:
        ftype = 'DBC'
        ftp.cwd('/dissemin/publicos/SIHSUS/199201_200712/Dados')
        fname = 'RD{}{}{}.dbc'.format(state, year2, month)
    else:
        ftype = 'DBC'
        ftp.cwd('/dissemin/publicos/SIHSUS/200801_/Dados')
        fname = 'RD{}{}{}.dbc'.format(state, year2, month)
    cachefile = os.path.join(CACHEPATH, 'SIH_' + fname.split('.')[0] + '_.parquet')
    if os.path.exists(cachefile):
        df = pd.read_parquet(cachefile)
        return df
    df = _fetch_file(fname, ftp, ftype)
    if cache:
        df.to_parquet(cachefile)
    return df
def test_local(self):
    with self.temp_dir() as tmp:
        data = pd.DataFrame({
            'i32': np.arange(1000, dtype=np.int32),
            'i64': np.arange(1000, dtype=np.int64),
            'f': np.arange(1000, dtype=np.float64),
            'bhello': np.random.choice(['hello', 'yo', 'people'], size=1000).astype("O")})
        data = data[['i32', 'i64', 'f', 'bhello']]
        self.spark.createDataFrame(data, 'i32 int, i64 long, f double, bhello string') \
            .coalesce(1).write.parquet(tmp, mode='overwrite')

        def check(columns, expected):
            if LooseVersion("0.21.1") <= LooseVersion(pd.__version__):
                expected = pd.read_parquet(tmp, columns=columns)
            actual = koalas.read_parquet(tmp, columns=columns)
            self.assertPandasEqual(expected, actual.toPandas())

        check(None, data)
        check(['i32', 'i64'], data[['i32', 'i64']])
        check(['i64', 'i32'], data[['i64', 'i32']])
        check(('i32', 'i64'), data[['i32', 'i64']])
        check(['a', 'b', 'i32', 'i64'], data[['i32', 'i64']])
        check([], pd.DataFrame([]))
        check(['a'], pd.DataFrame([]))
        check('i32', pd.DataFrame([]))
        check('float', data[['f']])

        # check with pyspark patch.
        if LooseVersion("0.21.1") <= LooseVersion(pd.__version__):
            expected = pd.read_parquet(tmp)
        else:
            expected = data
        actual = koalas.read_parquet(tmp)
        self.assertPandasEqual(expected, actual.toPandas())
def download(state, year, cache=True):
    """
    Downloads data directly from Datasus ftp server
    :param state: two-letter state identifier: MG == Minas Gerais
    :param year: 4 digit integer
    :return: pandas dataframe
    """
    state = state.upper()
    if year < 1994:
        raise ValueError("SINASC does not contain data before 1994")
    ftp = FTP('ftp.datasus.gov.br')
    ftp.login()
    if year >= 1996:
        ftp.cwd('/dissemin/publicos/SINASC/NOV/DNRES')
        fname = 'DN{}{}.DBC'.format(state, year)
    else:
        ftp.cwd('/dissemin/publicos/SINASC/ANT/DNRES')
        fname = 'DNR{}{}.DBC'.format(state, str(year)[-2:])
    cachefile = os.path.join(CACHEPATH, 'SINASC_' + fname.split('.')[0] + '_.parquet')
    if os.path.exists(cachefile):
        df = pd.read_parquet(cachefile)
        return df
    ftp.retrbinary('RETR {}'.format(fname), open(fname, 'wb').write)
    df = read_dbc(fname, encoding='iso-8859-1')
    if cache:
        df.to_parquet(cachefile)
    os.unlink(fname)
    return df
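# Hedged usage sketch for the SINASC download() above: it requires live FTP access
# to ftp.datasus.gov.br, a writable CACHEPATH and the read_dbc helper, so it is
# shown commented out. The state/year values are illustrative.
# births_mg_2015 = download('MG', 2015)
# print(births_mg_2015.shape)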
def plot_category_category(input_file, col1, col2, path):
    df = pd.read_parquet(input_file, columns=[col1, col2])
    if len(df[col1].unique()) < len(df[col2].unique()):
        col1, col2 = col2, col1
    file_name = os.path.join(path, f"{col1}-{col2}-bar-plot.png")
    bar_plot(df, col1, hue=col2, file_name=file_name)
    file_name = os.path.join(path, f"{col1}-{col2}-heatmap.png")
    heatmap(pd.crosstab(df[col1], df[col2]), file_name=file_name)
def plot_single_category(input_file, col, path):
    df = pd.read_parquet(input_file, columns=[col])
    value_counts = df[col].value_counts(dropna=False)
    # if there are more than 50 categories then this column should be ignored
    # TODO find a better way to visualize this
    if len(value_counts) > 50:
        ignore.add(col)
    else:
        file_name = os.path.join(path, col + "-bar-plot.png")
        bar_plot(df, col, file_name=file_name)
def check_round_trip(self, df, engine, expected=None, **kwargs):
    with tm.ensure_clean() as path:
        df.to_parquet(path, engine, **kwargs)
        result = read_parquet(path, engine)
        if expected is None:
            expected = df
        tm.assert_frame_equal(result, expected)

        # repeat
        to_parquet(df, path, engine, **kwargs)
        result = pd.read_parquet(path, engine)
        if expected is None:
            expected = df
        tm.assert_frame_equal(result, expected)
def load_entity_data(metadata, root):
    '''Load an entity's data from disk.'''
    if metadata['data_files']['filetype'] == 'pickle':
        data = pd_read_pickle(os.path.join(root, metadata['data_files']['data_filename']))
        df = data['df']
    elif metadata['data_files']['filetype'] == 'parquet':
        df = pd.read_parquet(os.path.join(root, metadata['data_files']['df_filename']),
                             engine=metadata['data_files']['engine'])
        df.index = df[metadata['index']]
        to_join = metadata['data_files'].get('to_join', None)
        if to_join is not None:
            for cname, to_join_names in to_join.items():
                df[cname] = df[to_join_names].apply(tuple, axis=1)
                df.drop(to_join_names, axis=1, inplace=True)
    else:
        raise ValueError("Unknown entityset data filetype: {}".format(metadata['data_files']['filetype']))
    return df
def check_round_trip(self, df, engine, expected=None, write_kwargs=None, read_kwargs=None):
    if write_kwargs is None:
        write_kwargs = {}
    if read_kwargs is None:
        read_kwargs = {}
    with tm.ensure_clean() as path:
        df.to_parquet(path, engine, **write_kwargs)
        result = read_parquet(path, engine, **read_kwargs)
        if expected is None:
            expected = df
        tm.assert_frame_equal(result, expected)

        # repeat
        to_parquet(df, path, engine, **write_kwargs)
        result = pd.read_parquet(path, engine, **read_kwargs)
        if expected is None:
            expected = df
        tm.assert_frame_equal(result, expected)
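# Hedged usage sketch for check_round_trip() above, as it might appear inside a
# pandas parquet test class; 'compression' is a real to_parquet keyword, the
# DataFrame and engine choice are illustrative.
# df = pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})
# self.check_round_trip(df, 'pyarrow', write_kwargs={'compression': 'snappy'})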
def download(state, year, cache=True):
    """
    Downloads data directly from Datasus ftp server
    :param state: two-letter state identifier: MG == Minas Gerais
    :param year: 4 digit integer
    :return: pandas dataframe
    """
    year2 = str(year)[-2:].zfill(2)
    state = state.upper()
    if year < 1979:
        raise ValueError("SIM does not contain data before 1979")
    ftp = FTP('ftp.datasus.gov.br')
    ftp.login()
    if year >= 1996:
        ftp.cwd('/dissemin/publicos/SIM/CID10/DORES')
        fname = 'DO{}{}.DBC'.format(state, year)
    else:
        ftp.cwd('/dissemin/publicos/SIM/CID9/DORES')
        fname = 'DOR{}{}.DBC'.format(state, year2)
    cachefile = os.path.join(CACHEPATH, 'SIM_' + fname.split('.')[0] + '_.parquet')
    if os.path.exists(cachefile):
        df = pd.read_parquet(cachefile)
        return df
    try:
        ftp.retrbinary('RETR {}'.format(fname), open(fname, 'wb').write)
    except Exception:
        try:
            ftp.retrbinary('RETR {}'.format(fname.upper()), open(fname, 'wb').write)
        except Exception:
            raise Exception("File {} not available".format(fname))
    df = read_dbc(fname, encoding='iso-8859-1')
    if cache:
        df.to_parquet(cachefile)
    os.unlink(fname)
    return df
def get_CID9_table(cache=True):
    """
    Fetch the CID9 table
    :param cache:
    :return:
    """
    ftp = FTP('ftp.datasus.gov.br')
    ftp.login()
    ftp.cwd('/dissemin/publicos/SIM/CID9/TABELAS')
    fname = 'CID9.DBF'
    cachefile = os.path.join(CACHEPATH, 'SIM_' + fname.split('.')[0] + '_.parquet')
    if os.path.exists(cachefile):
        df = pd.read_parquet(cachefile)
        return df
    try:
        ftp.retrbinary('RETR {}'.format(fname), open(fname, 'wb').write)
    except Exception:
        raise Exception('Could not download {}'.format(fname))
    dbf = DBF(fname, encoding='iso-8859-1')
    df = pd.DataFrame(list(dbf))
    if cache:
        df.to_parquet(cachefile)
    os.unlink(fname)
    return df
parser = argparse.ArgumentParser()
parser.add_argument('--snps', required=True,
                    help='Whitespace-delimited file with SNPs to extract. Must include columns A1,A2 and either (1) SNP or (2) both CHR and BP')
parser.add_argument('--out', required=True, help='Prefix of the name of the output file')
parser.add_argument('--allow-missing', default=False, action='store_true',
                    help='If specified, the script will not terminate if some SNPs are not found in the meta file')
parser.add_argument('--q', type=int, default=100,
                    help='The maximum ratio between the largest and smallest prior causal probabilities')
args = parser.parse_args()

#check package versions
check_package_versions()

#configure logger
configure_logger(args.out)

#read snps file
try:
    df_snps = pd.read_parquet(args.snps)
except ArrowIOError:
    df_snps = pd.read_table(args.snps, delim_whitespace=True)
if 'A1' not in df_snps.columns:
    raise ValueError('missing column A1')
if 'A2' not in df_snps.columns:
    raise ValueError('missing column A2')
if 'SNP' not in df_snps.columns:
    if 'CHR' not in df_snps.columns:
        raise ValueError('You must provide either a SNP or a CHR column')
    if 'BP' not in df_snps.columns:
        raise ValueError('You must provide either a SNP or a BP column')

#read df_meta
script_dir = os.path.dirname(os.path.realpath(__file__))
df_meta1 = pd.read_parquet(os.path.join(script_dir, 'snpvar_meta.chr1_7.parquet'))
def pd_df_parquet_load(filename, **kwargs):
    return pd.read_parquet(filename, **kwargs)
MODEL_TYPE = "rnn_updated" LOG_DIR = f"../EiT/tmp/logs/{MODEL_TYPE}" LOG_LEVEL = "ERROR" TARGET_PATH = "../EiT/final_datasets" model_24h_path_local = "../EiT/tmp/models/rnn_updated/rnn_updated_20210303_162009/model" model_48h_path_local = "../EiT/tmp/models/rnn_updated/rnn_updated_20210303_172934/model" model_24h = tf.keras.models.load_model(f"{model_24h_path_local}/saved_model") model_48h = tf.keras.models.load_model(f"{model_48h_path_local}/saved_model") # Load all .parquet files as dataframes dfs = {} for path in glob.glob(f"{TARGET_PATH}/**/*.parquet", recursive=True): df = pd.read_parquet(path) df_name = path.split(os.sep)[-1].split('.')[0] dfs[df_name] = df # Convert dataframe to numpy arrays train_df, val_df, test_df = dfs['train'], dfs['val'], dfs['test'] train_data = train_df.values.astype('float32') train_targets = train_df['nox'].values.astype('float32') val_data = val_df.values.astype('float32') val_targets = val_df['nox'].values.astype('float32') test_data = test_df.values.astype('float32') test_targets = test_df['nox'].values.astype('float32')
def sklearn_regressor(booster, params, num_round):
    reg = XGBRegressor(n_estimators=num_round, missing=-999, **_complete_params(params))
    reg._Booster = booster
    return reg


if __name__ == '__main__':
    import sys

    train_set_path = sys.argv[1]
    trials_db_path = sys.argv[2]
    output_path = sys.argv[3]

    train_set = pd.read_parquet(train_set_path)
    cv_splits = tscv.split(train_set['date_block_num'].values, n=1, window=16)
    dtrain = xgb.DMatrix(*df_to_X_y(train_set), missing=-999)
    del train_set

    trials_db = 'sqlite:///%s' % trials_db_path
    study = optuna.create_study(direction='minimize', load_if_exists=True,
                                study_name=output_path, storage=trials_db,
                                pruner=optuna.pruners.HyperbandPruner())
    n_trials = MAX_EVALS - len(study.trials)
    if n_trials > 0:
        objective = make_xgb_objective(make_xgb_loss(dtrain, cv_splits))
        try:
def query(parquetFilePath, columnList: list = None, continuousQueries: list = None,
          discreteQueries: list = None) -> pd.DataFrame:
    """
    Performs multiple queries on a parquet dataset. If no queries or columns are passed,
    it returns the entire dataset as a pandas dataframe. Otherwise, returns only the
    queried data over the requested columns as a Pandas dataframe.

    :type parquetFilePath: string
    :param parquetFilePath: filepath to a parquet file to be queried on
    :type columnList: list of strings
    :param columnList: list of column names that will be included in the data resulting from the queries
    :type continuousQueries: list of ContinuousQuery objects
    :param continuousQueries: list of objects representing queries on a column of continuous data
    :type discreteQueries: list of DiscreteQuery objects
    :param discreteQueries: list of objects representing queries on a column of discrete data
    :return: Requested columns with results of all queries
    :rtype: Pandas dataframe
    """
    # avoid mutable default arguments (the lists are appended to below)
    columnList = [] if columnList is None else columnList
    continuousQueries = [] if continuousQueries is None else continuousQueries
    discreteQueries = [] if discreteQueries is None else discreteQueries

    if len(columnList) == 0 and len(continuousQueries) == 0 and len(discreteQueries) == 0:
        df = pd.read_parquet(parquetFilePath)
        df.set_index("Sample", drop=True, inplace=True)
        return df

    #extract all necessary columns in order to read them into pandas
    for query in continuousQueries:
        if query.columnName not in columnList:
            columnList.append(query.columnName)
    for query in discreteQueries:
        if query.columnName not in columnList:
            columnList.append(query.columnName)
    columnList.insert(0, "Sample")
    df = pd.read_parquet(parquetFilePath, columns=columnList)
    df.set_index("Sample", drop=True, inplace=True)
    del columnList[0]

    #perform continuous queries, adjusting for which operator is to be used
    for query in continuousQueries:
        if query.operator == OperatorEnum.Equals:
            df = df.loc[df[query.columnName] == query.value, [col for col in columnList]]
        elif query.operator == OperatorEnum.GreaterThan:
            df = df.loc[df[query.columnName] > query.value, [col for col in columnList]]
        elif query.operator == OperatorEnum.GreaterThanOrEqualTo:
            df = df.loc[df[query.columnName] >= query.value, [col for col in columnList]]
        elif query.operator == OperatorEnum.LessThan:
            df = df.loc[df[query.columnName] < query.value, [col for col in columnList]]
        elif query.operator == OperatorEnum.LessThanOrEqualTo:
            df = df.loc[df[query.columnName] <= query.value, [col for col in columnList]]

    #perform discrete queries
    for query in discreteQueries:
        df = df.loc[df[query.columnName].isin(query.values), [col for col in columnList]]

    return df
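# Hedged usage sketch for query() above. The ContinuousQuery/DiscreteQuery
# constructors are assumed to accept the attributes the function reads
# (columnName, operator, value / values); file and column names are illustrative.
# age_query = ContinuousQuery(columnName="Age", operator=OperatorEnum.GreaterThan, value=40)
# sex_query = DiscreteQuery(columnName="Sex", values=["Female"])
# result = query("cohort.parquet", columnList=["Age", "Sex", "BMI"],
#                continuousQueries=[age_query], discreteQueries=[sex_query])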
import logging

import pandas as pd
from sklearn import preprocessing


def get_logger():
    FORMAT = '[%(levelname)s]%(asctime)s:%(name)s:%(message)s'
    logging.basicConfig(format=FORMAT)
    logger = logging.getLogger('main')
    logger.setLevel(logging.INFO)
    return logger


logger = get_logger()

# Load different data sources.
train_df = pd.read_parquet('input/processed/train_static_features.parquet.gzip')
test_df = pd.read_parquet('input/processed/test_static_features.parquet.gzip')

# OHE cols with small number of uniques (less than 20)
ohe_cols = ['trafficSource.campaign', 'channelGrouping',
            'trafficSource.adwordsClickInfo.page', 'trafficSource.medium',
            'geoNetwork.continent', 'trafficSource.keyword_groups',
            'device.deviceCategory', 'totals.bounces', 'totals.newVisits',
            'trafficSource.adwordsClickInfo.slot',
            'trafficSource.adwordsClickInfo.adNetworkType']

# Remove target col.
y_train = train_df['totals.transactionRevenue'].values
train_df = train_df.drop(['totals.transactionRevenue'], axis=1)

# Join datasets for row-wise feature engineering.
def plot_numeric_numeric(input_file, col1, col2, path):
    df = pd.read_parquet(input_file, columns=[col1, col2])
    file_name = os.path.join(path, f"{col1}-{col2}-scatter-plot.png")
    scatter_plot(df, col1, col2, file_name=file_name)
import os

import pandas as pd

_mydir = os.path.dirname(os.path.realpath(__file__))

# 8 sec
_filename = os.path.join(_mydir, 'raw/yahoo/2010-01-01_to_2018-11-29')

# only re-read the parquet file if df_orig is not already defined (interactive use)
try:
    df_orig
except NameError as e:
    df_orig = pd.read_parquet(_filename)
def read_dfr_from_parquet(self, region_id):
    columns = ['lon', 'lat', 'date', 'day_since'] + self.labels
    store_name = os.path.join(self.data_path, '{0}.parquet'.format(region_id))
    dfr = pd.read_parquet(store_name, columns=columns)
    return dfr
def test_linear():
    old_uri = tracking.get_tracking_uri()
    with TempDir(chdr=False, remove_on_exit=True) as tmp:
        try:
            diamonds = tmp.path("diamonds")
            root_tracking_dir = tmp.path("root_tracking_dir")
            os.mkdir(diamonds)
            os.mkdir(root_tracking_dir)
            tracking.set_tracking_uri(root_tracking_dir)
            # Download the diamonds dataset via mlflow run
            run(".", entry_point="download-example-data", version=None,
                parameters={"dest-dir": diamonds}, experiment_id=0, mode="local",
                cluster_spec=None, git_username=None, git_password=None,
                use_conda=True, use_temp_cwd=False, storage_dir=None)
            initial = os.path.join(root_tracking_dir, "0")
            dir_list = os.listdir(initial)
            # Run the main linear app via mlflow
            run("examples/linear-regression", entry_point="main", version=None,
                parameters={
                    "training-data-path": os.path.join(diamonds, "train_diamonds.parquet"),
                    "test-data-path": os.path.join(diamonds, "test_diamonds.parquet"),
                    "alpha": .001,
                    "l1-ratio": .5,
                    "label-col": "price"
                },
                experiment_id=0, mode="local", cluster_spec=None, git_username=None,
                git_password=None, use_conda=True, use_temp_cwd=False, storage_dir=None)
            # Identifying the new run's folder
            main = None
            for item in os.listdir(initial):
                if item not in dir_list:
                    main = item
            pyfunc = load_pyfunc(
                os.path.join(initial, main, "artifacts/model/model.pkl"))
            df = pandas.read_parquet(
                os.path.join(diamonds, "test_diamonds.parquet"))
            # Removing the price column from the DataFrame so we can use the features to predict
            df = df.drop(columns="price")
            # Predicting from the saved pyfunc
            predict = pyfunc.predict(df)
            # Make sure the data is of the right type
            assert isinstance(predict[0], numpy.float64)
        finally:
            tracking.set_tracking_uri(old_uri)
def test_from_parquet_partitioned_columns_with_columns(make_parquet_file):
    make_parquet_file(SMALL_ROW_SIZE, partitioned_columns=["col1"])

    pandas_df = pandas.read_parquet(TEST_PARQUET_FILENAME, columns=["col1"])
    modin_df = pd.read_parquet(TEST_PARQUET_FILENAME, columns=["col1"])
    df_equals(modin_df, pandas_df)
def test_from_parquet_partition(make_parquet_file):
    make_parquet_file(SMALL_ROW_SIZE, directory=True)

    pandas_df = pandas.read_parquet(TEST_PARQUET_FILENAME)
    modin_df = pd.read_parquet(TEST_PARQUET_FILENAME)
    df_equals(modin_df, pandas_df)
def test_from_parquet(make_parquet_file):
    make_parquet_file(SMALL_ROW_SIZE)

    pandas_df = pandas.read_parquet(TEST_PARQUET_FILENAME)
    modin_df = pd.read_parquet(TEST_PARQUET_FILENAME)
    df_equals(modin_df, pandas_df)
    print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(
        end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df


def group_mean_log_mae(y_true, y_pred, groups, floor=1e-9):
    maes = (y_true - y_pred).abs().groupby(groups).mean()
    return np.log(maes.map(lambda x: max(x, floor))).mean()


#####################
# READ INPUT FILES
#####################
logger.info('Reading input files....')
path = 'data/'
train_df = pd.read_parquet(f'{path}/FE005-train.parquet')
test_df = pd.read_parquet(f'{path}/FE005-test.parquet')
ss = pd.read_csv('input/sample_submission.csv')
# structures = pd.read_csv('input/structures.csv')
train_df = reduce_mem_usage(train_df)
test_df = reduce_mem_usage(test_df)

train_df['atom_0'] = train_df['atom_0'].astype('category')
train_df['atom_1'] = train_df['atom_1'].astype('category')
test_df['atom_0'] = test_df['atom_0'].astype('category')
test_df['atom_1'] = test_df['atom_1'].astype('category')
train_df['type_0'] = train_df['type_0'].astype('category')
test_df['type_0'] = test_df['type_0'].astype('category')

#####################
# FEATURE CREATION
def __init__(self):
    self._holiday_df = pd.read_parquet(constants.holidays_parquet_table)
def pd_read_s3_parquet(key, bucket, s3_client=None, **args):
    if s3_client is None:
        s3_client = boto3.client('s3')
    obj = s3_client.get_object(Bucket=bucket, Key=key)
    return pd.read_parquet(io.BytesIO(obj['Body'].read()), **args)
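# Hedged usage sketch for pd_read_s3_parquet() above: bucket and key are placeholders
# and the call needs AWS credentials, so it is shown commented out.
# client = boto3.client('s3')
# df = pd_read_s3_parquet('tables/trades.parquet', bucket='my-data-bucket', s3_client=client)
# print(df.head())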
def readParquetAndReturnDataFrame(bucket_name, key):
    dataS3 = S3Operations.readFile(bucket_name, key, True)
    return pd.read_parquet(dataS3)
def get_data():
    filename = os.path.expanduser('~/projects/data/extractors/pdr/yahoo_price_volume/product=etfs')
    df = pd.read_parquet(filename)
    return df
def get_dicarlo_su(proj_dir, fn='su_selectivity_dicarlo_hi_var.pqt'):
    return pd.read_parquet(os.path.join(proj_dir, 'data', fn))
def main(load_folder=f'../../data/xgb_split/', label='interventions', dir_name=DIR_NAME):
    # fixes:
    # - reduce n workers
    # - less features
    # - smaller dataset
    load_folder = Path(load_folder)
    overwrite = False
    if not load_folder.exists() or overwrite:
        print('Data path not found. Creating data.')
        n_docs = prepare_data()
        print('processing complete. Recommended to rerun the script.')
        quit()
        load_folder = (Path('/'.join(load_folder.name.split('_')[:-1] + [n_docs]))).mkdir(
            exist_ok=True, parents=True)  # todo: doesn't work
    n_string = f"{load_folder.name.split('_')[-1]}"
    dir_name = f'{random.randint(0, 999)}_{label}' if not dir_name else dir_name
    start_time = time.time()
    label = sys.argv[2] if len(sys.argv) > 2 else label

    # Load X and y
    print(
        f"Loading datasets {', '.join([str(p.name) for p in list(load_folder.glob('*.parquet'))])} from {load_folder}"
    )
    X_train = pd.read_parquet(load_folder / f'X_train.parquet')
    features = X_train.columns
    X_train = X_train.values
    y_train_df = pd.read_parquet(load_folder / f'y_train.parquet')
    X_test = pd.read_parquet(load_folder / f'X_test.parquet').values
    y_test_df = pd.read_parquet(load_folder / f'y_test.parquet')
    test_words = y_test_df.copy()['Word']
    train_index = pd.read_parquet(load_folder / f'X_train.parquet').index
    y_train_df['all'] = ~(y_train_df.sum(axis=1) == 0)
    y_test_df['all'] = ~(y_test_df.sum(axis=1) == 0)
    print('\ntrain docs:\n', train_index)
    print('\n Docs in validation set:\n', y_train_df.index.unique('doc'))

    n_est = 500
    params['n_estimators'] = n_est
    print(f"\nTraining a GBC model to predict label '{label}'.")
    print('input shapes: ', [df.shape for df in [X_train, X_test, y_train_df, y_test_df]])
    print('Starting SKLRF.')
    # PIO.insert(0, 'all')
    print(PIO)

    # Set the parameters by cross-validation
    # for n_est in n_estimators:
    # for i, label in enumerate(PIO):
    y_train = np.where(y_train_df[label].values, label, f'not_{label}')
    y_test = np.where(y_test_df[label].values, label, f'not_{label}')
    print(np.unique(y_train, return_counts=True))
    label_counts = np.unique(y_train, return_counts=True)[1]
    params['class_weight'] = {
        label: label_counts[1] / label_counts[0],
        f'not_{label}': 1.0
    }
    print(params['class_weight'])

    print(f'\nTesting n_estimators {n_est} for label {label}')
    clf = RandomForestClassifier(**params)
    clf.fit(X=X_train, y=y_train)
    y_pred = clf.predict(X_test)
    with open('dump1.pickle', 'wb') as f:
        pickle.dump(y_pred, f)
    print("\n******************************\n")
    report = pd.DataFrame(
        classification_report(y_test.flatten(), y_pred.flatten(), digits=3, output_dict=True))
    print(report.head())
    print("\n******************************\n")
    print(report)
    time.sleep(3)

    print(f"Saving data...")
    exp_folder = EXPERIMENTS / dir_name / label / str(n_est)
    print('exp folder is here:', exp_folder.resolve())
    exp_folder.mkdir(exist_ok=True, parents=True)
    probs = clf.predict_proba(X_test)
    preds = {
        'token': test_words,
        f'not_{label}': probs[:, 0],
        f'{label}': probs[:, 1],
        f'pred': y_pred,
        'true': y_test
    }
    pd.DataFrame(preds, index=test_words.index).to_csv(exp_folder / 'predict.csv')
    feat_imps = clf.feature_importances_
    pd.DataFrame(feat_imps, index=features).to_csv(exp_folder / 'feature_importances.csv')
    report.to_csv(exp_folder / 'class_report.csv')
    # pd.to_csv(evals_result, exp_folder / 'evals_result.csv')
    # pd.DataFrame({'pred': prediction*1, 'true': sets['val'][1]*1}, index=val_index).to_csv(exp_folder / 'prediction.csv')
    if hasattr(clf, 'clf.best_score'):
        print(
            f"No score improvement detected over {params['early_stopping_rounds']} steps, terminating."
        )
    write_dict((exp_folder / '.params'), params)
    print(f'\n Total computation time: {time.time() - start_time:.2f}s')
    print(f"Saved data for run with target '{label}' in '{dir_name}'")
lgb_params = {
    "boosting_type": "gbdt",
    "objective": "regression_l2",
    "learning_rate": LEARNING_RATE,
    "num_leaves": 255,
    "sub_feature": 0.50,
    "sub_row": 0.75,
    "bagging_freq": 1,
    "metric": EVAL_METRIC,
    'random_state': RANDOM_STATE
}

folds = GroupKFold(n_splits=N_FOLDS)

# Setup arrays for storing results
train_df = pd.read_parquet('data/FE008_train.parquet')  # only loading for skeleton not features
oof_df = train_df[['id', 'type', 'scalar_coupling_constant']].copy()
mol_group = train_df[['molecule_name', 'type']].copy()
del train_df
gc.collect()
oof_df['oof_preds'] = 0

test_df = pd.read_parquet('data/FE008_test.parquet')  # only loading for skeleton not features
prediction = np.zeros(len(test_df))
feature_importance = pd.DataFrame()
test_pred_df = test_df[['id', 'type', 'molecule_name']].copy()
del test_df
gc.collect()
test_pred_df['prediction'] = 0

bond_count = 1
def plot_single_numeric(input_file, col, path):
    df = pd.read_parquet(input_file, columns=[col])
    file_name = os.path.join(path, f"{col}-dist-plot.png")
    data = df[col].dropna()
    f, axes = plt.subplots(2, 1, sharex=True, figsize=(8, 6))
    histogram_violin_plots(data, axes, file_name=file_name)
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from plotnine import *
import gzip

data = pd.read_parquet("merged_data_pop2010.gzip")

'''
This code produces difference-in-difference plots for policy interventions in Florida,
Texas, and Washington. Each state is compared to the rest of the United States in each plot.

### Reading in data
df = pd.read_parquet('merged_data.gzip')
df.head()

## Processing data. No longer needed with what we are using now.
#df['Deaths'] = df['Deaths'].astype('float')
#df['POPESTIMATE2010'] = df['POPESTIMATE2010'].astype('float')
#df.drop(['COUNTY', 'FIPS'], axis = 1)

### Calculating per capita Drugs
df['DrugsPerCapita'] = df['MME_Calculated']/df['population']

### Start by subsetting data by years. The analysis will be completed using all available data for that specific area.
### The following function subsets data by years.
print('Load random Tweets:')

start_time = time.time()
paths_to_random = list(
    np.array_split(glob(os.path.join(path_to_data, '*.parquet')),
                   SLURM_ARRAY_TASK_COUNT)[SLURM_ARRAY_TASK_ID])
print('#files:', len(paths_to_random))
tweets_random = pd.DataFrame()
for file in paths_to_random:
    print(file)
    tweets_random = pd.concat(
        [tweets_random, pd.read_parquet(file)[['tweet_id', 'text']]])
    print(tweets_random.shape)
print('load random sample:', str(time.time() - start_time), 'seconds')
print(tweets_random.shape)

print('dropping duplicates:')
# random contains 7.3G of data!!
start_time = time.time()
tweets_random = tweets_random.drop_duplicates('text')
print('drop duplicates:', str(time.time() - start_time), 'seconds')
print(tweets_random.shape)

start_time = time.time()
print('converting to list')
examples = tweets_random.text.values.tolist()
def load_test_features(col):
    return pd.read_parquet(
        "filtered_test_features/filtered_test_feature{}.gzip".format(col))
def load_parquet(self):
    f = io.BytesIO()
    f.write(self.parquet_file.value)
    f.seek(0)
    self.df = pd.read_parquet(f)
log_file = '../log.txt'
logging.basicConfig(filename=log_file, level=logging.INFO)
logger = logging.getLogger()
handler = logging.StreamHandler()
logger.addHandler(handler)
logging.info('\n<============================================================================>')
logging.info(f'\nCurrent working directory: {os.getcwd()}')
logging.info(f'\nApplication started ... ({time.ctime()})\n')

# Load calibration data
# y = (pd.read_csv('RevenewML/flatfiles/Final Preprocessed For MichaelSampled Latest.txt', sep='\t')
#      .loc[:, ['ProjectID', 'Report_Group_Flag', 'Y_Has_Claim', 'Partition']]
#      )
# y.to_parquet('RevenewML/flatfiles/Y_Calibration.parquet')
y = pd.read_parquet('RevenewML/flatfiles/Y_Calibration.parquet')

# count_profiles = (dt.fread('datasets/Count_Profiles_Calibration.csv', show_progress=True)
count_profiles = (pd.read_parquet('RevenewML/flatfiles/Count_Profiles_Calibration.parquet')
                  # .to_pandas()
                  .merge(y, on=['ProjectID', 'Report_Group_Flag'])
                  .drop(columns=['Y_Has_Claim', 'Partition'])
                  )
# count_profiles.to_parquet('RevenewML/flatfiles/Count_Profiles_Calibration.parquet')

# duplicate_reports = (dt.fread('datasets/Duplicate_Reports_Calibration.csv', show_progress=True)
duplicate_reports = (pd.read_parquet('RevenewML/flatfiles/Duplicate_Reports_Calibration.parquet')
                     # .to_pandas()
                     .merge(y, on=['ProjectID', 'Report_Group_Flag'])
                     .drop(columns=['Y_Has_Claim', 'Partition'])
                     # ).sample(100000)
def plot_category_numeric(input_file, category_col, numeric_col, path):
    df = pd.read_parquet(input_file, columns=[category_col, numeric_col])
    f, axes = plt.subplots(2, 2, sharex="col", sharey="row", figsize=(8, 6))
    axes = list(chain.from_iterable(axes))
    file_name = os.path.join(path, f"{category_col}-{numeric_col}-plot.png")
    bar_box_violin_dot_plots(df, category_col, numeric_col, axes, file_name=file_name)
def read_parquet(file_widget):
    f = io.BytesIO()
    f.write(file_widget.value)
    f.seek(0)
    return pd.read_parquet(f)
# Methods
if __name__ == "__main__":

    # Get subdirectories of gamedays
    subdirs = utils.get_gameday_subdirs(path=CONFIG.get('inpath'),
                                        window_start=CONFIG.get('windowStartDate'),
                                        window_end=CONFIG.get('windowEndDate'),
                                        left_incl=True, right_incl=True)

    # Append Files
    df_innings = pd.concat(
        objs=[pd.read_parquet(CONFIG.get('inpath') + subdir + '/innings.parquet')
              for subdir in subdirs
              if "innings.parquet" in os.listdir(CONFIG.get('inpath') + subdir)],
        axis=0
    )

    # Get atbat level subset
    atbat_vars = [x for x in df_innings if x[:6] == 'atbat_']
    atbat_vars.append('game_id')
    df_atbats = df_innings.loc[:, atbat_vars]
    df_atbats.drop_duplicates(inplace=True)

    # Sort
    df_atbats.sort_values(
        by=['game_id', 'atbat_batter', 'atbat_num'],
# sort by date
kb.sort_values(by='date', inplace=True)
kb.reset_index(drop=True, inplace=True)

# give unique alliance IDs
kb['alliance_id'] = kb.apply(lambda x: uuid.uuid4(), axis=1)
kb['alliance_id'] = kb['alliance_id'].astype('str')

# save as parquet file
kb.to_parquet('/Users/Jakob/Documents/financial_news_data/kb.parquet.gzip')

## News articles

# read news articles
reuters = pd.read_parquet(
    '/Users/Jakob/Documents/financial_news_data/reuters.parquet.gzip')
bloomberg = pd.read_parquet(
    '/Users/Jakob/Documents/financial_news_data/bloomberg.parquet.gzip')

news = reuters.append(bloomberg)
news = news[['Date', 'Link', 'Article', 'Headline']]
news.columns = [x.lower() for x in news.columns]  # lowercase all column names

# sort out dates
news['date'] = pd.to_datetime(news['date'], utc=True, infer_datetime_format=True)
news.sort_values(by='date', inplace=True)
news['date'] = news['date'].dt.date  # keep only date not time

# remove city and source tag (only Reuters articles have them)
def read_transform_write(infile, outfile):
    print('{} -> {}'.format(infile, outfile))
    df = pd.read_parquet(infile)
    enrich_pandas_single(df, inplace=True)
    table = pa.Table.from_pandas(df, preserve_index=False)
    pq.write_to_dataset(table, root_path=outfile, preserve_index=False)
parser.add_argument("--inference_folder", type=str) parser.add_argument("--new_iteration_folder", type=str) args = parser.parse_args() return args if __name__ == '__main__': args = get_args_from_command_line() data_path = '/scratch/mt4493/twitter_labor/twitter-labor-data/data' already_labelled_ids_path = os.path.join(data_path, 'active_learning', 'sampling_top_lift', args.country_code, args.inference_folder, 'already_labelled_ids.parquet') already_labelled_ids_df = pd.read_parquet(already_labelled_ids_path) already_labelled_ids_df['tweet_id'] = already_labelled_ids_df[ 'tweet_id'].astype(str) already_labelled_labels_path = os.path.join(data_path, 'qualtrics', args.country_code, 'old_iters', 'labeling') already_labelled_labels_df = pd.concat([ pd.read_parquet(path) for path in Path(already_labelled_labels_path).glob('*.parquet') ]) already_labelled_labels_df['tweet_id'] = already_labelled_labels_df[ 'tweet_id'].astype(str) labels_df = already_labelled_ids_df.merge(already_labelled_labels_df, on=['tweet_id' ]).reset_index(drop=True)
def _load(self) -> pd.DataFrame:
    load_path = PurePosixPath(self._get_load_path())
    with self._s3.open(str(load_path), mode="rb") as s3_file:
        return pd.read_parquet(s3_file, **self._load_args)
def read_parquet(parq_file: pathlib.Path) -> pd.DataFrame:
    return pd.read_parquet(parq_file)
import boto3
import pandas
import time

FILE_NAME = 'some-file.parquet'

pandas.set_option('display.expand_frame_repr', False)

session = boto3.Session(profile_name='sbx')
s3 = session.client('s3')
file_object = s3.download_file(
    Bucket='test-bucket',
    Key='location/to/folder/partition=' + time.strftime('%Y%m%d') + '/' + FILE_NAME,
    Filename=FILE_NAME)

df = pandas.read_parquet(FILE_NAME)
print(df.count())
print("=======================")
print(df.head(20))
def pd_series_parquet_load(filename, **kwargs):
    # .ix was removed from pandas; use .iloc to take the first (only) column
    series = pd.read_parquet(filename, **kwargs).iloc[:, 0]
    if series.name == '_series':
        series.name = None
    return series
def check(columns, expected):
    if LooseVersion("0.21.1") <= LooseVersion(pd.__version__):
        expected = pd.read_parquet(tmp, columns=columns)
    actual = koalas.read_parquet(tmp, columns=columns)
    self.assertPandasEqual(expected, actual.toPandas())
from bigdl.optim.optimizer import *
from bigdl.dataset.transformer import *
#from matplotlib.pyplot import imshow
#import matplotlib.pyplot as plt

# create sparkcontext with bigdl configuration
sc = SparkContext.getOrCreate(conf=create_spark_conf().setMaster("local[*]"))
init_engine()  # prepare the bigdl environment
bigdl.version.__version__  # Get the current BigDL version

# using SQLContext to read parquet file
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

pdf = pd.read_parquet("/home/matt/Bengalia/train_image_data_0.parquet").iloc[0:50, :]

pdf_sml = pdf.iloc[0:25, :]
sdf = sqlContext.createDataFrame(pdf_sml)
rdd_train_images = sdf.drop('image_id').rdd
rdd_train_labels = sc.parallelize(
    pd.read_csv("/home/matt/Bengalia/train.csv")["grapheme_root"].iloc[0:25])
rdd_train_sample = rdd_train_images.zip(rdd_train_labels).map(
    lambda features_label: common.Sample.from_ndarray(
        np.asarray([x / 255 for x in features_label[0]]), features_label[1] + 1))

pdf_sml = pdf.iloc[26:50, :]
sdf = sqlContext.createDataFrame(pdf_sml)
rdd_test_images = sdf.drop('image_id').rdd
rdd_test_labels = sc.parallelize(
    pd.read_csv("/home/matt/Bengalia/train.csv")["grapheme_root"].iloc[26:50])