def _transform(self, dfs):
    # align each event table with its ICU stay and keep only events after admission
    icustay_index = dfs['icustay_detail'].set_index('icustay_id')
    for df_name in ['chartevents', 'labevents', 'ioevents']:
        df = dfs[df_name]
        df['icustay_intime'] = icustay_index.loc[df['icustay_id'], 'icustay_intime'].values
        df = df[df['charttime'] > df['icustay_intime']]
        dfs[df_name] = df

    # run the per-table transforms
    demo_history = DemographicTransform().transform(dfs['icustay_detail'])
    chart_history = ChartTransform().transform(dfs['chartevents'])
    lab_history = LabTransform().transform(dfs['labevents'])
    urine_history = UrineTransform().transform(dfs['ioevents'])

    all_history = from_chunks([chart_history, lab_history, demo_history, urine_history])

    # convert charttime to a sortable numeric representation, sort, then convert back
    all_history['charttime'] = big_dt_to_num(all_history['charttime'])
    all_history = all_history.sort_values(by=['subject_id', 'charttime']).reset_index(drop=True)
    all_history['charttime'] = num_to_big_dt(all_history['charttime'])

    all_history.drop_duplicates(inplace=True)
    all_history = all_history[['subject_id', 'charttime', 'category', 'valuenum']]
    return all_history
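# The sort above round-trips charttime through big_dt_to_num / num_to_big_dt before
# ordering. A standalone sketch of the same convert-sort-convert-back idea, assuming
# the datetimes are plain Python date objects that may not fit pandas' Timestamp
# range (the to_num / to_dt helpers here are illustrative, not the library's):
import datetime

import pandas as pd

def to_num(series):
    # map each date to its proleptic Gregorian ordinal, which sorts correctly
    return series.map(lambda dt: dt.toordinal())

def to_dt(series):
    return series.map(datetime.date.fromordinal)

events = pd.DataFrame({
    'subject_id': [2, 1, 1],
    'charttime': pd.Series([datetime.date(3012, 5, 1),
                            datetime.date(3050, 1, 2),
                            datetime.date(3049, 12, 31)], dtype=object),
})

events['charttime'] = to_num(events['charttime'])
events = events.sort_values(by=['subject_id', 'charttime']).reset_index(drop=True)
events['charttime'] = to_dt(events['charttime'])
print(events)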
def _transform(self, df):
    df['days'] = df['end_date'] - df['start_date'] + pd.Timedelta(days=1)

    # remove rows with invalid (negative) date ranges
    invalid_range_mask = df['days'] < pd.Timedelta(days=1)
    df = df[~invalid_range_mask]

    max_days = ceil(df['days'].max().days)

    df['date'] = df['start_date']
    del df['start_date'], df['end_date']

    expanded = [df]
    expand_per_row = False
    for i in range(2, max_days):
        prev = expanded[-1]
        rows_as_long_as = prev[prev['days'] >= pd.Timedelta(days=i)]
        rows_as_long_as.loc[:, 'date'] += pd.Timedelta(days=1)

        # if there are only relatively few rows remaining with long date ranges,
        # it will be more efficient to expand them per-row rather than per-date-range
        if len(rows_as_long_as) < max_days / len(rows_as_long_as.columns)**2:
            expand_per_row = True
            break
        expanded.append(rows_as_long_as)

    # expand remaining rows per-row rather than per-date-range
    if expand_per_row:
        rala = rows_as_long_as

        # represent any categorical columns as ints for fast expansion
        rala_cat = rala.select_dtypes(include=['category'])
        rala_categories = {}
        for col in rala_cat.columns:
            rala_categories[col] = rala[col].cat.categories
            rala[col] = rala[col].cat.codes

        # for each row, generate expanded rows for the remaining days in the date range
        for j, row in rala.iterrows():
            date = row['date']
            n_days = row['days'].days - i

            expanded_row = pd.DataFrame(index=np.arange(n_days + 1), columns=row.index)
            for col in expanded_row.columns:
                expanded_row[col] = row[col]
            expanded_row['date'] = pd.date_range(date, date + pd.Timedelta(days=n_days))

            # convert integer codes back to categoricals
            for col, categories in rala_categories.items():
                expanded_row[col] = pd.Categorical.from_codes(
                    codes=expanded_row[col], categories=categories)

            expanded.append(expanded_row)

    # merge all expanded rows
    df = from_chunks(expanded)
    return df
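# Illustrative usage of the per-day expansion above on a tiny made-up frame
# (the column names and values here are hypothetical, not from the library):
# each (start_date, end_date) row becomes one row per calendar day in the range.
# A simple, slower equivalent using pandas' explode:
import pandas as pd

ranges = pd.DataFrame({
    'drug': ['aspirin', 'heparin'],
    'start_date': pd.to_datetime(['2015-01-01', '2015-01-03']),
    'end_date': pd.to_datetime(['2015-01-03', '2015-01-03']),
})

ranges['date'] = [list(pd.date_range(s, e))
                  for s, e in zip(ranges['start_date'], ranges['end_date'])]
per_day = (ranges.drop(columns=['start_date', 'end_date'])
                 .explode('date')
                 .reset_index(drop=True))
print(per_day)
# expected: three rows for aspirin (Jan 1-3) and one row for heparin (Jan 3)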
def _transform(self, data):
    start = time.time()
    print('transforming data of size', data.memory_usage(index=True).sum(), 'bytes')

    store_chunks_jobs = []
    transform_jobs = []
    hdf_stores = []
    try:
        print('splitting data into large groups')
        group_iter = self._group_iter(data, len(data) // self.n_jobs or self.chunksize)
        for group_data in group_iter:
            if group_data.empty:
                continue
            f = temp_file.make_temporary_file()
            hdf_store = HdfDataStore(self.input_schema(), f)
            hdf_stores.append(hdf_store)
            hdf_store.store(group_data)
            store_chunks_jobs.append(joblib.delayed(self.store_chunks_job)(hdf_store))
            # transform_jobs.append(joblib.delayed(self.transform_job)(hdf_store))

        print('breaking data into chunks in parallel')
        joblib.Parallel(n_jobs=self.n_jobs)(store_chunks_jobs)

        chunk_stores = chain.from_iterable(store.chunk_stores() for store in hdf_stores)
        transform_jobs = [joblib.delayed(self.transform_job)(chunk_store)
                          for chunk_store in chunk_stores]

        print('running transforms in', len(transform_jobs), 'parallel jobs')
        result_hdf_stores = joblib.Parallel(n_jobs=self.n_jobs)(transform_jobs)

        print('loading and merging the results')
        results = from_chunks(r_hdf_store.load() for r_hdf_store in result_hdf_stores)
        print('finished merge')
    finally:
        for hdf_store in hdf_stores:
            hdf_store.delete_chunks()
            hdf_store.delete()

    end = time.time()
    print('took', end - start, 'seconds to transform all data in parallel')
    return results
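# A stripped-down sketch of the same split / transform-in-parallel / merge pattern,
# using in-memory chunks instead of HDF-backed stores (illustrative only; the
# clean_chunk transform and the chunk count are made up for the example):
import numpy as np
import pandas as pd
from joblib import Parallel, delayed

def clean_chunk(chunk):
    # stand-in for an arbitrary per-chunk transform
    return chunk.dropna()

data = pd.DataFrame({'value': [1.0, None, 2.0, 3.0] * 250})
chunks = np.array_split(data, 4)
results = Parallel(n_jobs=4)(delayed(clean_chunk)(chunk) for chunk in chunks)
merged = pd.concat(results, ignore_index=True)
print(len(merged), 'rows after cleaning')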
def _transform(self, tables):
    # filtering carevue ioevents
    io_carevue = VasoIOEventsCarevue().transform(tables)

    # filtering metavision ioevents
    io_metavision = VasoIOEventsMetavision().transform(tables)

    # concatenating ioevents
    ioevents = from_chunks([io_carevue, io_metavision])

    # getting vaso day counts
    vaso_days = VasoDayCounts().transform(ioevents)

    return vaso_days
def load(self):
    seq = odo.odo(self.odo_target, odo.chunks(pandas.DataFrame),
                  chunksize=CHUNK_SIZE, dshape=schema_to_dshape(self.schema))

    def conv_chunks(chunks):
        for chunk in chunks:
            print('typechecking a chunk')
            self.schema.conform_df(chunk, skip_sort=True)
            yield chunk

    print('concatenating df chunks')
    df = from_chunks(conv_chunks(seq))
    return df
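# from_chunks is used throughout to merge an iterable of DataFrame chunks into a
# single frame; a minimal helper in that spirit could look like this (a sketch,
# not the library's actual implementation):
import pandas as pd

def concat_chunks(chunks):
    # materialize the possibly-lazy iterable and concatenate in one pass
    return pd.concat(list(chunks), ignore_index=True)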
from chatto_transform.config import config
from chatto_transform.schema.ss.ss_sql_raw_schema import appointments
from chatto_transform.datastores.hdf_datastore import HdfDataStore
from chatto_transform.lib.chunks import from_chunks

import time

ds = HdfDataStore(appointments, config.data_dir + 'test.hdf')

chunks = ds.load_chunks()

start = time.time()
df = from_chunks(chunks)
end = time.time()

print('took', end - start, 'seconds to load and concatenate all data')