def map_genes_to_genomic_coordinates(gene_path, gene_annotation_path,
                                     gencode_path):
    """
    Reads in the gene and gene_annotation tables along with a gencode
    annotation file and uses the gencode annotations to assign genomic
    coordinates to genes in gene_annotation before writing the updated
    table back to disk.

    @param gene_path [`string`] Path to the gene table .csv
    @param gene_annotation_path [`string`] Path to the gene_annotation table .csv
    @param gencode_path [`string`] Path to the gencode annotation table .csv
    @return [None] Modifies the gene_annotation table and writes it to disk
    """
    # -- Load in the required data
    gene = fread(gene_path)
    gene_annot = fread(gene_annotation_path)
    gencode = fread(gencode_path)

    vsub = np.vectorize(re.sub)
    gencode[:, update(
        gene_id=vsub('[.][0-9]*$', '', gencode['gene_id'].to_numpy()))]

    # -- Add gene name back to gene_annotation
    gene.key = 'id'  # join columns need the same name
    gene_annot[:, update(id=f.gene_id)]
    gene_annot.key = 'id'
    gene_a = gene_annot[:, :, dt.join(gene)]
    gene_a = gene_a[:, [
        name not in ('symbol', 'strand') for name in gene_a.names
    ]]

    # -- Preprocess the genomic coordinates
    gencode.names = {'gene_id': 'name', 'gene_name': 'symbol'}
    gencode = gencode[:, ['name', 'start', 'end', 'strand', 'seqnames',
                          'symbol']]
    gencode.key = 'name'
    gene_a.key = 'name'

    # -- Map coordinates to gene_annotations, check that nothing went wrong
    gene_annotation = gene_a[:, :, dt.join(gencode)].copy()
    # sanity check the mappings didn't get messed up
    if not np.all(
            gene_annotation['name'].to_numpy() == gene['name'].to_numpy()):
        raise ValueError(
            'The gene_annotation table got mangled while trying to map '
            'genomic coordinates!')

    # -- Clean up the table and write to disk
    gene_annotation[:, update(gene_seq_start=f.start, gene_seq_end=f.end,
                              chr=f.seqnames)]
    del gene_annotation[:, ['name', 'id', 'start', 'end', 'seqnames']]
    gene_annotation.to_jay(gene_annotation_path)
def test_update_misplaced():
    DT = dt.Frame(A=range(5))
    with pytest.raises(TypeError, match="Column selector must be an integer "
                                        "or a string"):
        DT[update(B=0)]
    with pytest.raises(TypeError, match="Invalid item at position 2 in "
                                        r"DT\[i, j, \.\.\.\] call"):
        DT[:, :, update(B=0)]
def create_data(X: dt.Frame = None) -> Union[
    str, List[str],
    dt.Frame, List[dt.Frame],
    np.ndarray, List[np.ndarray],
    pd.DataFrame, List[pd.DataFrame],
    Dict[str, str],  # {data set names : paths}
    Dict[str, dt.Frame],  # {data set names : dt frames}
    Dict[str, np.ndarray],  # {data set names : np arrays}
    Dict[str, pd.DataFrame],  # {data set names : pd frames}
]:
    # define date column and forecast horizon
    date_col = 'date'
    forecast_len = 7

    # get COVID19 data from NYTimes github
    us_total = dt.fread("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us.csv")

    # produce lag of 1 unit and add as new feature for each column in the list
    series_cols = ["cases", "deaths"]
    aggs = {f"{col}_yesterday": shift(f[col]) for col in series_cols}
    us_total[:, update(**aggs), sort(date_col)]

    # update NA lags to 0
    aggs = {f"{col}_yesterday": 0 for col in series_cols}
    us_total[isna(f[f"{series_cols[0]}_yesterday"]), update(**aggs)]

    # compute daily values by differencing
    aggs = {f"{col}_daily": f[col] - f[f"{col}_yesterday"] for col in series_cols}
    us_total[:, update(**aggs), sort(date_col)]

    # delete columns with yesterday (shift) values
    series_cols_to_delete = [f"{col}_yesterday" for col in series_cols]
    del us_total[:, series_cols_to_delete]

    # set negative daily values to 0
    us_total[f.cases_daily < 0, [f.cases_daily]] = 0
    us_total[f.deaths_daily < 0, [f.deaths_daily]] = 0

    # determine threshold to split train and test based on forecast horizon
    dates = dt.unique(us_total[:, date_col])
    split_date = dates[-(forecast_len + 1):, :, dt.sort(date_col)][0, 0]
    test_date = dates[-1, :, dt.sort(date_col)][0, 0]

    # split data to honor forecast horizon in test set
    df = us_total[date_col].to_pandas()
    train = us_total[df[date_col] <= split_date, :]
    test = us_total[df[date_col] > split_date, :]

    # return [train, test] and rename dataset names as needed
    return {f"covid19_daily_{split_date}_us_train": train,
            f"covid19_daily_{test_date}_us_test": test}
def map_foreign_key_to_table(primary_df: dt.Frame, fk_df: dt.Frame,
                             join_column_dict: dict) -> dt.Frame:
    """
    Performs a left join of `primary_df` to `fk_df` by reference, updating
    the column indicated in `join_column_dict`.

    :primary_df: A `datatable.Frame`. This should be the larger table and
        will ideally be loaded from a .jay file with a `memory_limit`
        specified in `datatable.fread`.
    :fk_df: A `datatable.Frame`. This should be a smaller table which will
        be joined to `primary_df`.
    :join_column_dict: A dictionary with keys 'primary_df' and 'fk_df'
        specifying the columns to join the tables on.
    """
    # Check for correct keys in dict
    key_strings = list(join_column_dict.keys())
    if ('primary_df' not in key_strings or 'fk_df' not in key_strings):
        raise ValueError("The join_column_dict item must have keys "
                         "'primary_df' and 'fk_df'!")

    # Rename columns
    primary_col = join_column_dict['primary_df']
    fk_col = join_column_dict['fk_df']
    fk_df.names = {fk_col: primary_col}
    fk_df.key = primary_col
    update_expr = {primary_col: g.id}

    # Join, then update by reference
    primary_df[:, update(**update_expr), join(fk_df)]
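# A minimal usage sketch of map_foreign_key_to_table() above. The 'tissue'
# lookup and 'experiment' frames are hypothetical toy data, not from the
# source tables.
import datatable as dt
from datatable import f, g, join, update

tissue = dt.Frame(id=[1, 2], name=["lung", "breast"])
experiment = dt.Frame(tissue_name=["breast", "lung", "lung"],
                      value=[0.1, 0.2, 0.3])

# Replace the name column with the integer foreign key, by reference
map_foreign_key_to_table(
    primary_df=experiment,
    fk_df=tissue,
    join_column_dict={'primary_df': 'tissue_name', 'fk_df': 'name'})
print(experiment)  # tissue_name now holds ids from the tissue table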
def test_assign_different_types():
    DT = dt.Frame(A=range(5), B=list("ABCDE"))
    DT = DT[:, ["A", "B"]]  # for py35
    assert DT.stypes == (dt.int32, dt.str32)
    DT[:, update(A=dt.float32, B=dt.str64)]
    assert_equals(DT, dt.Frame(A=range(5), B=list("ABCDE"),
                               stypes=dict(A=dt.float32, B=dt.str64)))
def test_update_multiple_columns():
    DT = dt.Frame(A=range(5))
    DT[:, update(I8=dt.int8(f.A), I16=dt.int16(f.A), I64=dt.int64(f.A))]
    assert_equals(
        DT,
        dt.Frame([[0, 1, 2, 3, 4]] * 4,
                 names=["A", "I8", "I16", "I64"],
                 stypes=[dt.int32, dt.int8, dt.int16, dt.int64]))
def test_update_with_groupby():
    DT = dt.Frame(A=range(5), B=[1, 1, 2, 2, 2])
    DT[:, update(C=7, D=dt.mean(f.A), E=f.A + 1), by(f.B)]
    assert_equals(
        DT,
        dt.Frame(A=range(5), B=[1, 1, 2, 2, 2], C=[7] * 5,
                 D=[0.5, 0.5, 3.0, 3.0, 3.0], E=range(1, 6)))
def write_table(df, name, output_dir, add_index=True):
    """
    Add a primary key to df ("id" column) and write it to output_dir as a
    .jay file.

    @param df: [`datatable.Frame`] A PharmacoDB table
    @param name: [`string`] The name of the table
    @param output_dir: [`string`] The directory to write the table to
    @param add_index: [`bool`] Whether to add an "id" primary key column
        before writing
    @return: [`datatable.Frame`] The indexed PharmacoDB table
    """
    logger.info(f"Writing {name} table to {output_dir}...")
    if add_index:
        # Index the datatable
        df[:, update(id=np.arange(df.nrows) + 1)]
    df.to_jay(os.path.join(output_dir, f"{name}.jay"))
    return df
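# A self-contained sketch of the indexing pattern used in write_table() above,
# on a hypothetical toy frame (the column values are illustrative).
import numpy as np
import datatable as dt
from datatable import update

gene = dt.Frame(name=["TP53", "BRCA1", "EGFR"])
# Assign a 1-based primary key column by reference
gene[:, update(id=np.arange(gene.nrows) + 1)]
print(gene)  # columns: name, id with values 1..3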
def create_data(
        X: dt.Frame = None
) -> Union[str, List[str],
           dt.Frame, List[dt.Frame],
           np.ndarray, List[np.ndarray],
           pd.DataFrame, List[pd.DataFrame],
           Dict[str, str],  # {data set names : paths}
           Dict[str, dt.Frame],  # {data set names : dt frames}
           Dict[str, np.ndarray],  # {data set names : np arrays}
           Dict[str, pd.DataFrame],  # {data set names : pd frames}
           ]:
    # define date column and forecast horizon
    date_col = 'date'
    forecast_len = 7

    # get COVID19 data from NYTimes github
    us_states = dt.fread(
        "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv"
    )

    # get states population
    us_states_pop = dt.fread(
        "http://www2.census.gov/programs-surveys/popest/datasets/2010-2019/national/totals/nst-est2019-alldata.csv"
    )
    us_states_pop.names = {'NAME': 'state', 'POPESTIMATE2019': 'pop'}
    us_states_pop.key = "state"

    # augment data with state population figures and create adjusted case and death counts
    us_states[:, dt.update(pop=dt.g.pop,
                           pop100k=dt.g.pop / 100000,
                           cases100k=dt.f.cases / (dt.g.pop / 100000),
                           deaths100k=dt.f.deaths / (dt.g.pop / 100000)),
              dt.join(us_states_pop)]

    # determine threshold to split train and test based on forecast horizon
    dates = dt.unique(us_states[:, date_col])
    split_date = dates[-forecast_len:, :, dt.sort(date_col)][0, 0]

    # split data to honor forecast horizon in test set
    df = us_states[date_col].to_pandas()
    train = us_states[df[date_col] < split_date, :]
    test = us_states[df[date_col] >= split_date, :]

    # return [train, test] and rename dataset names as needed
    return {
        "covid19_daily_by_states_train": train,
        "covid19_daily_by_states_test": test
    }
def test_update_with_delete():
    DT = dt.Frame(A=range(5))
    with pytest.raises(ValueError, match=r"update\(\) clause cannot be used "
                                         r"with a delete expression"):
        del DT[:, update(B=0)]
# Visualization
alt.Chart(weather_dt.to_pandas()).transform_fold(
    ['temp_avg', 'temp_min', 'temp_max'],
    as_=['temp_type', 'temp_val']
).mark_boxplot().encode(
    alt.Y('temp_type:O'),
    alt.X('temp_val:Q')
).properties(title='Weather temp distributions')

# Adding a new column temp_diff
weather_dt[:, update(temp_diff=f.temp_max - f.temp_min)]

# Visualization
alt.Chart(weather_dt[:, f.temp_diff].to_pandas()).mark_bar().encode(
    alt.X('temp_diff', bin=True),
    alt.Y('count()')
).properties(
    title='Distribution of temperature differences'
)

# Downloading weather data and selecting specific columns related to weather conditions
weather_temp = dt.fread(
    'https://assets.datacamp.com/production/repositories/1497/datasets/02f3fb2d4416d3f6626e1117688e0386784e8e55/weather.csv',
    na_strings=[""]
)[:, [f[1], f[7:]]]
weather_temp
def test_update_mixed_2():
    DT = dt.Frame(A=range(5))
    DT[:, update(B=3, C=f.A)]
    assert_equals(DT, dt.Frame(A=range(5), B=[3] * 5, C=range(5)))
def test_update_mixed_dimensions():
    DT = dt.Frame(A=range(5))
    DT[:, update(B=f.A * 2, C=10)]
    assert_equals(DT, dt.Frame(A=range(5), B=range(0, 10, 2), C=[10] * 5))
gcd_missing_columns = np.setdiff1d(gcd_table_columns, np.asarray(gcd_dt.names))
for col in gcd_missing_columns:
    gcd_dt[col] = None
gcd_dt1 = gcd_dt[:, list(gcd_table_columns)]

# Sanity check the columns are there
if not np.all(gcd_table_columns == np.asarray(gcd_dt1.names)):
    raise ValueError('The build_gene_compound_dataset table '
                     'has missing columns!')

# -- Map to existing FK ids
# gene id
gcd_dt1.names = {'gene_id': 'gene_name'}
gene_dt.names = {'id': 'gene_id', 'name': 'gene_name'}
gene_dt.key = 'gene_name'
# NOTE: the g object references the joined table's namespace
gcd_dt1[:, update(gene_id=g.gene_id), join(gene_dt)]

## TODO:: rewrite as a helper function
# regex match failed ids, then assign to the table
failed_genes = np.unique(
    gcd_dt1[dt.isna(f.gene_id), 'gene_name'].to_numpy().flatten())
if len(failed_genes) > 0:
    gene_queries = [re.compile(f'{gene}.*') for gene in failed_genes]
    gene_name_series = gene_dt['gene_name'].to_pandas().gene_name
    # needs to be float64 because NumPy has no NaN for int types
    # Pad with NaNs for failed matches
    gene_ids = gene_dt[match_idx, 'gene_id'].to_pandas().gene_id
    if (len(failed_match_idx) > 1):
        gene_ids = pd.Series(np.insert(gene_ids, failed_match_idx, None),
                             dtype='int32')
    gcd_dt1[dt.isna(f.gene_id), update(gene_id=gene_ids)]
def build_gene_compound_dataset_df(gene_compound_dataset_file, gene_file,
                                   compound_file, dataset_file, output_dir,
                                   compound_names):
    """
    Build the gene_compound_dataset table.

    @param gene_compound_dataset_file: [`str`] Path to the gene signature .csv file.
    @param gene_file: [`str`] Path to the gene table .csv file.
    @param compound_file: [`str`] Path to the compound table .csv file.
    @param dataset_file: [`str`] Path to the dataset table .csv file.
    @param output_dir: [`str`] Path to write the output file to.
    @param compound_names: [`str`] Path to an optional .csv file mapping updated
        compound names to the dataset. This ensures that corrected compound
        annotations still make it into the database without the need to rerun
        all the gene signatures.

    @return [`datatable.Frame`] Writes the 'gene_compound_dataset.jay' file to
        output_dir, then returns the table.
    """
    # -- Check the input files exist
    for fl in [gene_compound_dataset_file, gene_file, compound_file,
               dataset_file]:
        if not os.path.exists(fl):
            raise FileNotFoundError(f'Could not find the {fl}')

    # -- Read in mapping tables
    gene_dt = fread(gene_file)
    compound_dt = fread(compound_file)
    dataset_dt = fread(dataset_file)

    # -- Read in gene_compound_dataset table
    gcd_dt = fread(gene_compound_dataset_file)

    # -- Fix names and assign missing columns
    gcd_dt.names = {
        'gene': 'gene_id',
        'compound': 'compound_id',
        'dataset': 'dataset_id',
        'lower': 'lower_analytic',
        'upper': 'upper_analytic',
        'pvalue': 'pvalue_analytic',
        'fdr': 'fdr_analytic'
    }
    del gcd_dt[:, ['significant', 'tissue']]

    # Determine missing columns and assign them, so we don't have to change
    # code when new columns are added
    gcd_table_columns = np.asarray(
        ('id', 'gene_id', 'compound_id', 'dataset_id', 'estimate',
         'lower_analytic', 'upper_analytic', 'lower_permutation',
         'upper_permutation', 'n', 'pvalue_analytic', 'pvalue_permutation',
         'df', 'fdr_analytic', 'fdr_permutation', 'significant_permutation',
         'permutation_done', 'sens_stat', 'mDataType'))
    gcd_missing_columns = np.setdiff1d(gcd_table_columns,
                                       np.asarray(gcd_dt.names))
    for col in gcd_missing_columns:
        gcd_dt[col] = None
    gcd_dt1 = gcd_dt[:, list(gcd_table_columns)]

    # Sanity check the columns are there
    if not np.all(gcd_table_columns == np.asarray(gcd_dt1.names)):
        raise ValueError('The build_gene_compound_dataset table '
                         'has missing columns!')
    gcd_dt1[:, update(sens_stat='AAC', permutation_done=0)]

    # -- Map to existing FK ids
    # gene id
    gcd_dt1.names = {'gene_id': 'gene_name'}
    gene_dt.names = {'id': 'gene_id', 'name': 'gene_name'}
    gene_dt.key = 'gene_name'
    # NOTE: the g object references the joined table's namespace
    gcd_dt1[:, update(gene_id=g.gene_id), join(gene_dt)]

    # make sure all genes mapped
    failed_genes = np.unique(
        gcd_dt1[dt.isna(f.gene_id), 'gene_name'].to_numpy().flatten())
    if len(failed_genes) > 0:
        warnings.warn(f'The genes: {failed_genes} did not map!')
    if np.any(gcd_dt1[:, dt.isna(f.gene_id)].to_numpy()):
        warnings.warn('Some gene_ids in gene_compound_dataset are still NA! '
                      'Dropping the missing rows...')
        gcd_dt1 = gcd_dt1[~dt.isna(f.gene_id), :]

    # fix compound names
    ## FIXME:: Remove this when gene signatures are regenerated
    ## START patch
    fix_names_df = dt.fread(compound_names)
    fix_names_df[f.dataset == "GDSC_2020(v1-8.2)", update(dataset="GDSC_v1")]
    fix_names_df[f.dataset == "GDSC_2020(v2-8.2)", update(dataset="GDSC_v2")]
    fix_names_df.names = {
        "drugid": "compound_name",
        "unique.drugid": "compound_id",
        "dataset": "dataset_id"
    }
    fix_names_df.key = ["compound_name", "dataset_id"]
    gcd_dt1.names = {'compound_id': 'compound_name'}
    gcd_dt1[~dt.isna(g.compound_id),
            update(compound_name=g.compound_id),
            join(fix_names_df)]
    ## END patch

    # compound id
    compound_dt.names = {'id': 'compound_id', 'name': 'compound_name'}
    del compound_dt[:, 'compound_uid']
    compound_dt.key = 'compound_name'
    gcd_dt1[:, update(compound_id=g.compound_id), join(compound_dt)]
    if np.any(gcd_dt1[:, dt.isna(f.compound_id)].to_numpy()):
        warnings.warn("Some compound_ids in gene_compound_dataset are still "
                      "NA! Dropping the missing rows...")
        gcd_dt1 = gcd_dt1[~dt.isna(f.compound_id), :]

    # dataset id
    gcd_dt1.names = {'dataset_id': 'dataset_name'}
    dataset_dt.names = {'id': 'dataset_id', 'name': 'dataset_name'}
    dataset_dt.key = 'dataset_name'
    gcd_dt1[:, update(dataset_id=g.dataset_id), join(dataset_dt)]

    # -- Sort then assign the primary key column
    gcd_dt2 = gcd_dt1[:, list(gcd_table_columns),
                      sort('gene_id', 'compound_id', 'dataset_id',
                           'mDataType')]
    gcd_dt2[:, update(id=range(1, gcd_dt2.nrows + 1))]

    # Sanity check we didn't lose any rows
    if not gcd_dt.nrows == gcd_dt2.nrows:
        warnings.warn('The gene_compound_dataset table has lost some rows!')

    gcd_dt2.to_jay(os.path.join(output_dir, 'gene_compound_dataset.jay'))
# X: datatable - primary data set
# Parameters:
#   time_col: date/time/int - time column to order rows before the shift op
#   group_by_cols: list of column names - group columns
#   shift_cols: list of column names - columns to shift
# Output:
#   dataset augmented with shifted columns

from datatable import f, by, sort, update, shift, isna

time_col = "date"
group_by_cols = ["state"]
shift_cols = ["cases", "deaths"]
new_dataset_name = "new_dataset_name_with_shift"

# produce lag of 1 unit and add as new feature for each shift column
aggs = {f"{col}_yesterday": shift(f[col]) for col in shift_cols}
X[:, update(**aggs), sort(time_col), by(*group_by_cols)]

# update NA lags
aggs = {f"{col}_yesterday": 0 for col in shift_cols}
X[isna(f[f"{shift_cols[0]}_yesterday"]), update(**aggs)]

aggs = {f"{col}_daily": f[col] - f[f"{col}_yesterday"] for col in shift_cols}
X[:, update(**aggs), sort(time_col), by(group_by_cols)]

for col in shift_cols:
    del X[:, f[f"{col}_yesterday"]]

return {new_dataset_name: X}
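# A self-contained sketch of the grouped lag/difference pattern above, using a
# hypothetical toy frame (state/date/cases values are illustrative).
import datatable as dt
from datatable import f, by, sort, update, shift, isna

X = dt.Frame(state=["NY", "NY", "NY", "CA", "CA"],
             date=["2020-03-01", "2020-03-02", "2020-03-03",
                   "2020-03-01", "2020-03-02"],
             cases=[1, 3, 6, 2, 5])

# Lag cases by one row within each state, ordered by date
X[:, update(cases_yesterday=shift(f.cases)), sort("date"), by("state")]
# Each group's first row gets an NA lag; replace it with 0
X[isna(f.cases_yesterday), update(cases_yesterday=0)]
# Daily new cases = cumulative count minus yesterday's cumulative count
X[:, update(cases_daily=f.cases - f.cases_yesterday)]
print(X)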
def test_update_simple():
    DT = dt.Frame(A=range(5))
    DT[:, update(B=10)]
    assert_equals(DT, dt.Frame(A=range(5), B=[10] * 5))
"""Concatenate columns""" # Consolidate multiple columns into single text column by concatenating # them and adding column name as a prefix. # # Specification: # Inputs: # X: datatable - primary dataset # Parameters: # col_names - list of text column names to consolidate # txt_col_name - column name containing consolidated text # Output: # dataset containing original and consolidated columns from datatable import f, FExpr, update import functools col_names = [ "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate" ] txt_col_name = "toxic_consolidated" new_dataset_name = "new_dataset_with_concat_txt_column" concat_cols_fexpr = functools.reduce(FExpr.__add__, (col + ": " + f[col] + " " for col in col_names)) X[:, update(**{txt_col_name: concat_cols_fexpr})] return {new_dataset_name: X}
).properties(
    title='Top directors title counts and imdb ratings'
)

alt.Chart(directors_views_rating_v1.to_pandas()).mark_bar().encode(
    alt.Y('directed_by', sort='-x'),
    alt.X('count'),
    alt.Color('us_views_millions')
).properties(
    title='Top directors title counts and US million views'
)

# are the directors and writers the same for a title?
amigos_info_dt[:, dt.update(temp=f.directed_by == f.written_by)]
amigos_info_dt[f.temp == True, :]

# remove the temp col
del amigos_info_dt["temp"]

# split the writers column
writers_list = [
    elemento.split('&')
    for elemento in amigos_info_dt[:, f.written_by].to_list()[0]
]

# create a new DT with the number of writers
writers_dt = dt.Frame({'no_of_writers': [len(elem) for elem in writers_list]})

# column-bind the two DTs
amigos_info_dt_v1 = dt.cbind(amigos_info_dt, writers_dt)
# remove black listed columns or column groups smaller than the minimal size
col_groups = {
    key: val
    for key, val in all_col_groups.items()
    if key not in black_listed_columns and len(val) >= min_col_group_size
}

# list of column prefixes
columns = list(col_groups.keys())
# list of column ranges
ranges = [(min(idx), max(idx)) for idx in col_groups.values()]
# produce tuples of column slices
col_slices = [((col + "%d") % (desde), (col + "%d") % (hasta))
              for (col, (desde, hasta)) in zip(columns, ranges)]

for c, r, s in zip(columns, ranges, col_slices):
    update_map = {
        c + "_sum": rowsum(f[s[0]:s[1]]),
        c + "_mean": rowmean(f[s[0]:s[1]]),
        c + "_sd": rowsd(f[s[0]:s[1]]),
        c + "_max": rowmax(f[s[0]:s[1]]),
        c + "_min": rowmin(f[s[0]:s[1]]),
        c + "_range": rowmax(f[s[0]:s[1]]) - rowmin(f[s[0]:s[1]]),
        c + "_first": rowfirst(f[s[0]:s[1]]),
        c + "_last": rowlast(f[s[0]:s[1]]),
        c + "_missing": (r[1] - r[0] + 1) - rowcount(f[s[0]:s[1]])
    }
    X[:, update(**update_map)]

return {"CreditCard-train-aug.csv": X}
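# A minimal sketch of the row-wise aggregation over a column slice used above,
# on a hypothetical PAY_AMT1..PAY_AMT3 column group (values are illustrative).
import datatable as dt
from datatable import f, update, rowsum, rowmean, rowcount

X = dt.Frame(PAY_AMT1=[100, 200], PAY_AMT2=[150, None], PAY_AMT3=[50, 300])
X[:, update(PAY_AMT_sum=rowsum(f["PAY_AMT1":"PAY_AMT3"]),
            PAY_AMT_mean=rowmean(f["PAY_AMT1":"PAY_AMT3"]),
            PAY_AMT_missing=3 - rowcount(f["PAY_AMT1":"PAY_AMT3"]))]
print(X)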
def test_update_with_groupby():
    DT = dt.Frame(A=range(5), B=[1, 2, 2, 2, 1])
    assert DT.stype == dt.int32
    DT[:, update(A=f.A * 100 / dt.sum(f.A)), by(f.B)]
    assert_equals(
        DT, dt.Frame(A=[0, 100 / 6, 100 / 3, 50, 100], B=[1, 2, 2, 2, 1]))
# In[8]:

# DT[order(sepal_length - sepal_width), head(.SD, 2), by = species]
DT[:2, :, by('species'), sort(f.sepal_length - f.sepal_width)]

# Just like in R's [data.table](https://github.com/Rdatatable/data.table),
# boolean expressions can be passed to the
# [sort](https://datatable.readthedocs.io/en/latest/api/dt/sort.html#) function.

# #### Filter observations above the mean of `sepal_length` by species

# In[9]:

# DT[, .SD[sepal_length > mean(sepal_length)], by = species]
DT[:, update(temp=f.sepal_length > dt.mean(f.sepal_length)), by('species')]
DT[f.temp == 1, :-1]

# Unlike in R's [data.table](https://github.com/Rdatatable/data.table), boolean
# expressions cannot be applied within the `i` section in the presence of `by`.
# The next best thing is to break it down into two steps - create a temporary
# column to hold the boolean value, and then filter on that column.

# #### Filter rows with group size greater than 10

# In[10]:

# DT[, .SD[.N > 10], keyby = .(species, petal_width)]
DT[:, update(temp=dt.count() > 10), by('species', 'petal_width')]
DT[f.temp == 1, :-1]
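# A self-contained sketch of the two-step "flag then filter" workaround
# described above, on a hypothetical toy frame.
import datatable as dt
from datatable import f, by, update

DT = dt.Frame(species=["a", "a", "b", "b"],
              sepal_length=[5.0, 7.0, 6.0, 6.5])

# Step 1: flag rows above their group mean
DT[:, update(temp=f.sepal_length > dt.mean(f.sepal_length)), by("species")]
# Step 2: filter on the flag and drop the helper column
result = DT[f.temp == 1, :-1]
print(result)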
def create_data(X: dt.Frame = None) -> Union[
    str, List[str],
    dt.Frame, List[dt.Frame],
    np.ndarray, List[np.ndarray],
    pd.DataFrame, List[pd.DataFrame],
    Dict[str, str],  # {data set names : paths}
    Dict[str, dt.Frame],  # {data set names : dt frames}
    Dict[str, np.ndarray],  # {data set names : np arrays}
    Dict[str, pd.DataFrame],  # {data set names : pd frames}
]:
    # define date column and forecast horizon
    date_col = 'date'
    group_by_cols = ["state"]
    forecast_len = 7

    # get COVID19 data from NYTimes github
    us_states = dt.fread("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv")

    # get states population
    us_states_pop = dt.fread(
        "http://www2.census.gov/programs-surveys/popest/datasets/2010-2019/national/totals/nst-est2019-alldata.csv")
    us_states_pop.names = {'NAME': 'state', 'POPESTIMATE2019': 'pop'}
    us_states_pop.key = "state"

    # augment data with state population figures and create adjusted case and death counts
    series_cols = ["cases", "deaths"]
    aggs = {f"{col}100k": dt.f[col] / (dt.g.pop / 100000) for col in series_cols}
    us_states[:, dt.update(pop=g.pop, pop100k=g.pop / 100000, **aggs),
              join(us_states_pop)]

    # remove rows without state defined (resulted in unmatched rows after left outer join)
    del us_states[isna(f.pop), :]

    # produce lag of 1 unit and add as new feature for each column in the list
    series_cols.extend([col + "100k" for col in series_cols])
    aggs = {f"{col}_yesterday": shift(f[col]) for col in series_cols}
    us_states[:, update(**aggs), sort(date_col), by(group_by_cols)]

    # update NA lags to 0
    aggs = {f"{col}_yesterday": 0 for col in series_cols}
    us_states[isna(f[f"{series_cols[0]}_yesterday"]), update(**aggs)]

    # compute daily values by differencing
    aggs = {f"{col}_daily": f[col] - f[f"{col}_yesterday"] for col in series_cols}
    us_states[:, update(**aggs), sort(date_col), by(group_by_cols)]

    # delete columns with yesterday (shift) values
    series_cols_to_delete = [f"{col}_yesterday" for col in series_cols]
    del us_states[:, series_cols_to_delete]

    # set negative daily values to 0
    us_states[f.cases_daily < 0, [f.cases_daily, f.cases100k_daily]] = 0
    us_states[f.deaths_daily < 0, [f.deaths_daily, f.deaths100k_daily]] = 0

    # determine threshold to split train and test based on forecast horizon
    dates = dt.unique(us_states[:, date_col])
    split_date = dates[-(forecast_len + 1):, :, dt.sort(date_col)][0, 0]
    test_date = dates[-1, :, dt.sort(date_col)][0, 0]

    # split data to honor forecast horizon in test set
    df = us_states[date_col].to_pandas()
    train = us_states[df[date_col] <= split_date, :]
    test = us_states[df[date_col] > split_date, :]

    # return [train, test] and rename dataset names as needed
    return {f"covid19_daily_{split_date}_by_states_train": train,
            f"covid19_daily_{test_date}_by_states_test": test}
def test_update_with_assign():
    DT = dt.Frame(A=range(5))
    with pytest.raises(ValueError, match=r"update\(\) clause cannot be used "
                                         r"with an assignment expression"):
        DT[:, update(B=0)] = None
def build_gene_compound_tissue_df(gene_compound_tissue_file, gene_file,
                                  compound_file, tissue_file, output_dir):
    """
    Build the gene_compound_tissue table.

    @param gene_compound_tissue_file: [`str`] Path to the gene signature .csv file.
    @param gene_file: [`str`] Path to the gene table .csv file.
    @param compound_file: [`str`] Path to the compound table .csv file.
    @param tissue_file: [`str`] Path to the tissue table .csv file.
    @param output_dir: [`str`] Path to write the output file to.

    @return [`datatable.Frame`] Writes the 'gene_compound_tissue.jay' file to
        output_dir, then returns the table.
    """
    # -- Check the input files exist
    for fl in [gene_compound_tissue_file, gene_file, compound_file,
               tissue_file]:
        if not os.path.exists(fl):
            raise FileNotFoundError(f'Could not find the {fl}')

    # -- Read in mapping tables
    gene_dt = fread(gene_file)
    compound_dt = fread(compound_file)
    tissue_dt = fread(tissue_file)

    # -- Read in gene_compound_tissue table
    gct_dt = fread(gene_compound_tissue_file)

    # -- Fix names and assign missing columns
    if np.all(np.isin(np.asarray(('Gene', 'Tissue', 'Drug', 'FWER_genes')),
                      np.asarray(gct_dt.names))):
        gct_dt.names = {
            'Gene': 'gene_id',
            'Tissue': 'tissue_id',
            'Drug': 'compound_id',
            'FWER_genes': 'FWER_gene'
        }

    # Determine missing columns and assign them, so we don't have to change
    # code when new columns are added
    gct_table_columns = np.asarray(
        ('id', 'gene_id', 'compound_id', 'tissue_id', 'estimate', 'lower',
         'upper', 'n', 'tstat', 'fstat', 'pvalue', 'df', 'fdr', 'FWER_gene',
         'FWER_compound', 'FWER_all', 'BF_p_all', 'sens_stat', 'mDataType',
         'tested_in_human_trials', 'in_clinical_trials'))
    gct_missing_columns = np.setdiff1d(gct_table_columns,
                                       np.asarray(gct_dt.names))
    for col in gct_missing_columns:
        gct_dt[col] = None
    gct_dt1 = gct_dt[:, [
        *gct_table_columns, 'gene_name', 'compound_name', 'tissue_name'
    ]]

    # Sanity check the columns are there
    if not np.all(np.isin(gct_table_columns, np.asarray(gct_dt1.names))):
        raise ValueError('The gene_compound_tissue table '
                         'has missing columns!')

    # -- Map to existing FK ids
    # gene id
    gene_dt.names = {'id': 'gene_id', 'name': 'gene_name'}
    gene_dt.key = 'gene_name'
    # NOTE: the g object references the joined table's namespace
    gct_dt1[:, update(gene_id=g.gene_id), join(gene_dt)]

    # check for failed genes
    failed_genes = gct_dt1[dt.isna(f.gene_id), 'gene_name'] \
        .to_numpy().flatten()
    if len(failed_genes) > 0:
        raise ValueError(f'Genes {failed_genes} failed to map!')
    if np.any(gct_dt1[:, dt.isna(f.gene_id)].to_numpy()):
        warnings.warn('Some gene_ids in gene_compound_tissue are still NA! '
                      'Dropping the missing rows...')
        gct_dt1 = gct_dt1[~dt.isna(f.gene_id), :]
    del gct_dt1[:, 'gene_name']

    # compound id
    compound_dt.names = {'id': 'compound_id', 'name': 'compound_name'}
    del compound_dt[:, 'compound_uid']
    compound_dt.key = 'compound_name'
    gct_dt1[:, update(compound_id=g.compound_id), join(compound_dt)]

    # tissue id
    tissue_dt.names = {'id': 'tissue_id', 'name': 'tissue_name'}
    tissue_dt.key = 'tissue_name'
    gct_dt1[:, update(tissue_id=g.tissue_id), join(tissue_dt)]
    ## TODO: Handle failed tissue mappings?

    # -- Sort then assign the primary key column
    ## TODO:: Is there a way to sort by reference?
    gct_dt2 = gct_dt1[:, list(gct_table_columns),
                      sort('gene_id', 'compound_id', 'tissue_id', 'mDataType')]
    gct_dt2[:, update(id=range(1, gct_dt2.nrows + 1))]

    # Sanity check we didn't lose any rows
    if not gct_dt.nrows == gct_dt2.nrows:
        warnings.warn('The gene_compound_tissue table has lost some rows!')

    gct_dt2.to_jay(os.path.join(output_dir, 'gene_compound_tissue.jay'))
def test_update_existing_column():
    DT = dt.Frame(A=range(5))
    DT[:, update(A=f.A * 2)]
    assert_equals(DT, dt.Frame(A=range(0, 10, 2)))
# Compute per-column expressions (signed distance from the mean in this example)
# for all numeric (int, float) columns, with stats computed by groups and a
# new column added for each original numeric feature.
# see: https://stackoverflow.com/questions/62974899/updating-or-adding-multiple-columns-with-pydatatable-in-style-of-r-datables-sd
#
# Specification:
# Inputs:
#   X: datatable - primary data set
# Parameters:
#   group_by_cols: list of column names - group columns to compute stats by
# Output:
#   dataset augmented with computed statistics

from datatable import f, by, sort, update, shift, isna, mean

group_by_cols = ["user_id"]
new_dataset_name = "new_dataset_name_with_stats"

aggs = {
    f"{col}_dist_from_mean": mean(dt.f[col]) - f[col]
    for col in X[:, f[int].extend(f[float])].names
}
X[:, update(**aggs), by(*group_by_cols)]

return {new_dataset_name: X}
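# A self-contained sketch of the grouped distance-from-mean computation above,
# using a hypothetical toy frame (user_id/amount/clicks are illustrative).
import datatable as dt
from datatable import f, by, update, mean

X = dt.Frame(user_id=[1, 1, 2, 2],
             amount=[10.0, 20.0, 5.0, 15.0],
             clicks=[1, 3, 2, 6])

# One new column per numeric column: group mean minus the observed value
numeric_cols = X[:, f[int].extend(f[float])].names
aggs = {f"{col}_dist_from_mean": mean(f[col]) - f[col] for col in numeric_cols}
X[:, update(**aggs), by("user_id")]
print(X)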
def create_data(
        X: dt.Frame = None
) -> Union[str, List[str],
           dt.Frame, List[dt.Frame],
           np.ndarray, List[np.ndarray],
           pd.DataFrame, List[pd.DataFrame],
           Dict[str, str],  # {data set names : paths}
           Dict[str, dt.Frame],  # {data set names : dt frames}
           Dict[str, np.ndarray],  # {data set names : np arrays}
           Dict[str, pd.DataFrame],  # {data set names : pd frames}
           ]:
    if X is None:
        return []

    columns = None  # columns = ["PAY_AMT", "BILL_AMT", "PAY_"]
    ranges = None  # [(1, 6), (1, 6), (2, 6)]
    black_listed_columns = []
    min_col_group_size = 2

    # parse column names for time series column groups
    if columns is None or columns == [] or \
            ranges is None or ranges == []:
        # match any column names that consist of an alpha name (prefix) followed by an integer index (suffix)
        p = re.compile(r"^([a-zA-Z_]+)(\d+)$")
        matches = [p.match(c) for c in X.names]
        all_col_groups = defaultdict(list)
        for m in matches:
            if m is not None:
                key = m.group(1)
                val = int(m.group(2))
                all_col_groups[key].append(val)

        # remove black listed columns or column groups smaller than the minimal size
        col_groups = {
            key: val
            for key, val in all_col_groups.items()
            if key not in black_listed_columns and len(val) >= min_col_group_size
        }

        # list of column prefixes
        columns = list(col_groups.keys())
        # list of column ranges
        ranges = [(min(idx), max(idx)) for idx in col_groups.values()]

    col_slices = [((col + "%d") % (desde), (col + "%d") % (hasta))
                  for (col, (desde, hasta)) in zip(columns, ranges)]

    for c, r, s in zip(columns, ranges, col_slices):
        update_map = {
            c + "_sum": rowsum(f[s[0]:s[1]]),
            c + "_mean": rowmean(f[s[0]:s[1]]),
            c + "_sd": rowsd(f[s[0]:s[1]]),
            c + "_max": rowmax(f[s[0]:s[1]]),
            c + "_min": rowmin(f[s[0]:s[1]]),
            c + "_range": rowmax(f[s[0]:s[1]]) - rowmin(f[s[0]:s[1]]),
            c + "_first": rowfirst(f[s[0]:s[1]]),
            c + "_last": rowlast(f[s[0]:s[1]]),
            c + "_missing": (r[1] - r[0] + 1) - rowcount(f[s[0]:s[1]])
        }
        X[:, update(**update_map)]

    return X
def test_update_multiple_dependents():
    DT = dt.Frame(A=range(5))
    DT[:, update(B=f.A + 1, A=f.A + 2, D=f.A + 3)]
    assert_equals(DT, dt.Frame(A=range(2, 7), B=range(1, 6), D=range(3, 8)))
def create_data(
        X: dt.Frame = None
) -> Union[str, List[str],
           dt.Frame, List[dt.Frame],
           np.ndarray, List[np.ndarray],
           pd.DataFrame, List[pd.DataFrame],
           Dict[str, str],  # {data set names : paths}
           Dict[str, dt.Frame],  # {data set names : dt frames}
           Dict[str, np.ndarray],  # {data set names : np arrays}
           Dict[str, pd.DataFrame],  # {data set names : pd frames}
           ]:
    # define date column and forecast horizon
    date_col = 'date'
    group_by_cols = ["state"]
    forecast_len = 7

    # state codes lookup table
    us_state_codes = dt.Frame(
        code=[
            'AL', 'AK', 'AS', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL',
            'GA', 'GU', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME',
            'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ',
            'NM', 'NY', 'NC', 'ND', 'MP', 'OH', 'OK', 'OR', 'PA', 'PR', 'RI',
            'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VI', 'VA', 'WA', 'WV', 'WI',
            'WY'
        ],
        state=[
            'Alabama', 'Alaska', 'American Samoa', 'Arizona', 'Arkansas',
            'California', 'Colorado', 'Connecticut', 'Delaware',
            'District of Columbia', 'Florida', 'Georgia', 'Guam', 'Hawaii',
            'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky',
            'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan',
            'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska',
            'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York',
            'North Carolina', 'North Dakota', 'Northern Mariana Islands',
            'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Puerto Rico',
            'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee',
            'Texas', 'Utah', 'Vermont', 'Virgin Islands', 'Virginia',
            'Washington', 'West Virginia', 'Wisconsin', 'Wyoming'
        ])
    us_state_codes.key = "state"

    # get states population lookup table
    us_states_pop = dt.fread(
        "http://www2.census.gov/programs-surveys/popest/datasets/2010-2019/national/totals/nst-est2019-alldata.csv"
    )
    us_states_pop.names = {'NAME': 'state', 'POPESTIMATE2019': 'pop'}
    us_states_pop = us_states_pop[dt.f.STATE > 0, :]
    us_states_pop.key = "state"

    # join state codes and population into single lookup table
    us_states_pop[:, dt.update(code=dt.g.code), dt.join(us_state_codes)]
    us_states_pop.key = "code"

    # US Covid Tracking API: https://covidtracking.com/data/api
    us_states = dt.fread("https://covidtracking.com/api/v1/states/daily.csv")

    # remove deprecated fields
    deprecated = [
        'checkTimeEt', 'commercialScore', 'dateChecked', 'dateModified',
        'grade', 'hash', 'hospitalized', 'negativeIncrease',
        'negativeRegularScore', 'negativeScore', 'posNeg', 'positiveScore',
        'score', 'total'
    ]
    us_states = us_states[:, list(set(us_states.names) - set(deprecated))]
    us_states.names = {'state': 'code'}

    series_cols = [
        "positive", "negative", "hospitalizedCumulative", "inIcuCumulative",
        "onVentilatorCumulative", "recovered", "death"
    ]
    aggs = {f"{col}100k": f[col] / (g.pop / 100000) for col in series_cols}
    us_states[:, dt.update(state=g.state, pop=g.pop, pop100k=g.pop / 100000,
                           **aggs), join(us_states_pop)]
    us_states = us_states[~dt.isna(dt.f.state), :]

    # produce lag of 1 unit and add as new feature for each shift column
    series_cols.extend([col + "100k" for col in series_cols])
    aggs = {f"{col}_yesterday": shift(f[col]) for col in series_cols}
    us_states[:, update(**aggs), sort(date_col), by(group_by_cols)]

    # update NA lags
    aggs = {f"{col}_yesterday": 0 for col in series_cols}
    us_states[isna(f[f"{series_cols[0]}_yesterday"]), update(**aggs)]

    aggs = {
        f"{col}_daily": f[col] - f[f"{col}_yesterday"]
        for col in series_cols
    }
    us_states[:, update(**aggs), sort(date_col), by(group_by_cols)]

    for col in series_cols:
        del us_states[:, f[f"{col}_yesterday"]]
    # validate dataset
    if us_states[:, count(), by(dt.f.state, f.date)][f.count > 1, :].shape[0] > 1:
        raise ValueError(
            "Found duplicate elements for the same date and state.")

    # determine threshold to split train and test based on forecast horizon
    dates = dt.unique(us_states[:, date_col])
    split_date = dates[-(forecast_len + 1):, :, dt.sort(date_col)][0, 0]
    test_date = dates[-1, :, dt.sort(date_col)][0, 0]

    # split data to honor forecast horizon in test set
    df = us_states[date_col].to_pandas()
    train = us_states[df[date_col] <= split_date, :]
    test = us_states[df[date_col] > split_date, :]

    return {
        f"covidtracking_daily_{split_date}_by_us_states_train": train,
        f"covidtracking_daily_{test_date}_by_us_states_test": test
    }