Example #1
def map_genes_to_genomic_coordinates(gene_path, gene_annotation_path,
                                     gencode_path):
    """
    Reads in the gene and gene_annotation tables along with a gencode annotation file,
    uses the gencode annotations to assign genomic coordinates to the genes in the
    gene_annotation table, and writes the updated table back to disk.

    @param gene_path [`string`] Path to the gene table .csv
    @param gene_annotation_path [`string`] Path to the gene_annotation table .csv
    @param gencode_path [`string`] Path to the gencode annotation table .csv

    @return [None] Modifies the gene_annotation table and writes it back to disk
    """

    # -- Load in the required data
    gene = fread(gene_path)
    gene_annot = fread(gene_annotation_path)
    gencode = fread(gencode_path)

    vsub = np.vectorize(re.sub)
    gencode[:,
            update(
                gene_id=vsub('[.][0-9]*$', '', gencode['gene_id'].to_numpy()))]

    # -- Add gene name back to gene_annotation
    gene.key = 'id'
    # datatable requires the join columns to have the same name
    gene_annot[:, update(id=f.gene_id)]
    gene_annot.key = 'id'
    gene_a = gene_annot[:, :, dt.join(gene)]
    gene_a = gene_a[:, [
        name not in ('symbol', 'strand') for name in gene_a.names
    ]]

    # -- Preprocess the genomic coordinates
    gencode.names = {'gene_id': 'name', 'gene_name': 'symbol'}
    gencode = gencode[:,
                      ['name', 'start', 'end', 'strand', 'seqnames', 'symbol']]
    gencode.key = 'name'
    gene_a.key = 'name'

    # -- Map coordinates to gene_annotations, check that nothing went wrong
    gene_annotation = gene_a[:, :, dt.join(gencode)].copy()
    # sanity check the mappings didn't get messed up
    if not np.all(
            gene_annotation['name'].to_numpy() == gene['name'].to_numpy()):
        raise ValueError(
            'The gene_annotation table got mangled while trying to map '
            'genomic coordinates!')

    # -- Clean up the table and write to disk
    gene_annotation[:,
                    update(gene_seq_start=f.start,
                           gene_seq_end=f.end,
                           chr=f.seqnames)]
    del gene_annotation[:, ['name', 'id', 'start', 'end', 'seqnames']]

    gene_annotation.to_jay(gene_annotation_path)
Example #2
def test_update_misplaced():
    DT = dt.Frame(A=range(5))
    with pytest.raises(TypeError,
                       match="Column selector must be an integer "
                       "or a string"):
        DT[update(B=0)]
    with pytest.raises(TypeError,
                       match="Invalid item at position 2 in "
                       r"DT\[i, j, \.\.\.\] call"):
        DT[:, :, update(B=0)]
    def create_data(X: dt.Frame = None) -> Union[
        str, List[str],
        dt.Frame, List[dt.Frame],
        np.ndarray, List[np.ndarray],
        pd.DataFrame, List[pd.DataFrame],
        Dict[str, str],  # {data set names : paths}
        Dict[str, dt.Frame],  # {data set names : dt frames}
        Dict[str, np.ndarray],  # {data set names : np arrays}
        Dict[str, pd.DataFrame],  # {data set names : pd frames}
    ]:
        # define date column and forecast horizon
        date_col = 'date'
        forecast_len = 7

        # get COVID19 data from NYTimes github
        us_total = dt.fread("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us.csv")

        # produce lag of 1 unit and add as new feature for each column in the list
        series_cols = ["cases", "deaths"]
        aggs = {f"{col}_yesterday": shift(f[col]) for col in series_cols}
        us_total[:, update(**aggs), sort(date_col)]

        # update NA lags to 0
        aggs = {f"{col}_yesterday": 0 for col in series_cols}
        us_total[isna(f[f"{series_cols[0]}_yesterday"]), update(**aggs)]

        # compute daily values by differentiating
        aggs = {f"{col}_daily": f[col] - f[f"{col}_yesterday"] for col in series_cols}
        us_total[:, update(**aggs), sort(date_col)]

        # delete columns with yesterday (shift) values
        series_cols_to_delete = [f"{col}_yesterday" for col in series_cols]
        del us_total[:, series_cols_to_delete]

        # set negative daily values to 0
        us_total[f.cases_daily < 0, [f.cases_daily]] = 0
        us_total[f.deaths_daily < 0, [f.deaths_daily]] = 0

        # determine threshold to split train and test based on forecast horizon
        dates = dt.unique(us_total[:, date_col])
        split_date = dates[-(forecast_len + 1):, :, dt.sort(date_col)][0, 0]
        test_date = dates[-1, :, dt.sort(date_col)][0, 0]

        # split data to honor forecast horizon in test set
        df = us_total[date_col].to_pandas()
        train = us_total[df[date_col] <= split_date, :]
        test = us_total[df[date_col] > split_date, :]

        # return [train, test] and rename dataset names as needed
        return {f"covid19_daily_{split_date}_us_train": train,
                f"covid19_daily_{test_date}_us_test": test}
Example #4
def map_foreign_key_to_table(primary_df: dt.Frame, fk_df: dt.Frame,
                             join_column_dict: dict) -> dt.Frame:
    """
    Performs a left join of `primary_df` to `fk_df` by reference, updating
    the column indicated in `join_column_dict`.

    :primary_df: A `datatable.Frame`. This should be the larger table
        and will ideally be loaded from a .jay file with a `memory_limit`
        specified in `datatable.fread`.
    :fk_df: A `datatable.Frame`. This should be a smaller table
        which will be joined to `primary_df`.
    :join_column_dict: A dictionary with keys 'primary_df' and 'fk_df'
        specifying the columns to join the tables on.
    """
    # Check for correct keys in dict
    key_strings = list(join_column_dict.keys())
    if ('primary_df' not in key_strings or 'fk_df' not in key_strings):
        raise ValueError("The join_column_dict item must have keys"
                         "'primary_df' and 'fk_df'!")
    # Rename columns
    primary_col = join_column_dict['primary_df']
    fk_col = join_column_dict['fk_df']
    fk_df.names = {fk_col: primary_col}
    fk_df.key = primary_col
    update_expr = {primary_col: g.id}
    # Join, update by reference then coerce to the correct type
    primary_df[:, update(**update_expr), join(fk_df)]
    return primary_df
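
# A minimal usage sketch with hypothetical frames. Note that `fk_df` must carry an
# 'id' column, since the update expression above pulls `g.id` from the joined table.
drug = dt.Frame(id=[1, 2], name=['erlotinib', 'paclitaxel'])
experiment = dt.Frame(drug_name=['erlotinib', 'paclitaxel', 'erlotinib'])
map_foreign_key_to_table(experiment, drug,
                         {'primary_df': 'drug_name', 'fk_df': 'name'})
# experiment['drug_name'] now holds the integer ids 1, 2, 1
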
def test_assign_different_types():
    DT = dt.Frame(A=range(5), B=list("ABCDE"))
    DT = DT[:, ["A", "B"]]  # for py35
    assert DT.stypes == (dt.int32, dt.str32)
    DT[:, update(A=dt.float32, B=dt.str64)]
    assert_equals(DT, dt.Frame(A=range(5), B=list("ABCDE"),
                               stypes=dict(A=dt.float32, B=dt.str64)))
Example #6
def test_update_multiple_columns():
    DT = dt.Frame(A=range(5))
    DT[:, update(I8=dt.int8(f.A), I16=dt.int16(f.A), I64=dt.int64(f.A))]
    assert_equals(
        DT,
        dt.Frame([[0, 1, 2, 3, 4]] * 4,
                 names=["A", "I8", "I16", "I64"],
                 stypes=[dt.int32, dt.int8, dt.int16, dt.int64]))
Example #7
def test_update_with_groupby():
    DT = dt.Frame(A=range(5), B=[1, 1, 2, 2, 2])
    DT[:, update(C=7, D=dt.mean(f.A), E=f.A + 1), by(f.B)]
    assert_equals(
        DT,
        dt.Frame(A=range(5),
                 B=[1, 1, 2, 2, 2],
                 C=[7] * 5,
                 D=[0.5, 0.5, 3.0, 3.0, 3.0],
                 E=range(1, 6)))
Example #8
def write_table(df, name, output_dir, add_index=True):
    """
    Add a primary key to df ("id" column) and write it to output_dir
    as a .jay file.

    @param df: [`datatable.Frame`] A PharmacoDB table
    @param name: [`string`] The name of the table
    @param output_dir: [`string`] The directory to write the table to
    @return: [`datatable.Frame`] The indexed PharmacoDB table
    """
    logger.info(f"Writing {name} table to {output_dir}...")
    if add_index:
        # Index datatable
        df[:, update(id=np.arange(df.nrows) + 1)]
    df.to_jay(os.path.join(output_dir, f"{name}.jay"))
    return df
    def create_data(
        X: dt.Frame = None
    ) -> Union[str, List[str], dt.Frame, List[dt.Frame], np.ndarray,
               List[np.ndarray], pd.DataFrame, List[pd.DataFrame], Dict[
                   str, str],  # {data set names : paths}
               Dict[str, dt.Frame],  # {data set names : dt frames}
               Dict[str, np.ndarray],  # {data set names : np arrays}
               Dict[str, pd.DataFrame],  # {data set names : pd frames}
               ]:
        # define date column and forecast horizon
        date_col = 'date'
        forecast_len = 7

        # get COVID19 data from NYTimes github
        us_states = dt.fread(
            "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv"
        )

        # get states population
        us_states_pop = dt.fread(
            "http://www2.census.gov/programs-surveys/popest/datasets/2010-2019/national/totals/nst-est2019-alldata.csv"
        )
        us_states_pop.names = {'NAME': 'state', 'POPESTIMATE2019': 'pop'}
        us_states_pop.key = "state"

        # augment data with state population figures and create adjusted case and death counts
        us_states[:,
                  dt.update(pop=dt.g.pop,
                            pop100k=dt.g.pop / 100000,
                            cases100k=dt.f.cases / (dt.g.pop / 100000),
                            deaths100k=dt.f.deaths / (dt.g.pop / 100000)),
                  dt.join(us_states_pop)]

        # determine threshold to split train and test based on forecast horizon
        dates = dt.unique(us_states[:, date_col])
        split_date = dates[-forecast_len:, :, dt.sort(date_col)][0, 0]

        # split data to honor forecast horizon in test set
        df = us_states[date_col].to_pandas()
        train = us_states[df[date_col] < split_date, :]
        test = us_states[df[date_col] >= split_date, :]

        # return [train, test] and rename dataset names as needed
        return {
            "covid19_daily_by_states_train": train,
            "covid10_daily_by_states_test": test
        }
Example #10
def test_update_with_delete():
    DT = dt.Frame(A=range(5))
    with pytest.raises(ValueError,
                       match=r"update\(\) clause cannot be used "
                       r"with a delete expression"):
        del DT[:, update(B=0)]
Example #11
# Visualization
alt.Chart(weather_dt.to_pandas()).transform_fold(
    
    ['temp_avg','temp_min','temp_max'],
    as_=['temp_type','temp_val']
    
).mark_boxplot().encode(
    
    alt.Y('temp_type:O'),
    alt.X('temp_val:Q')
    
).properties(title='Weather temp distributions')

# Adding a new column temp_diff
weather_dt[:,update(temp_diff=f.temp_max-f.temp_min)]

# Visualization
alt.Chart(weather_dt[:,f.temp_diff].to_pandas()).mark_bar().encode(
    alt.X('temp_diff',bin=True),
    alt.Y('count()')
).properties(

    title='Distribution of temperature differences'
)

# Downloading weather data and selecting specific columns related to weather conditions
weather_temp = dt.fread('https://assets.datacamp.com/production/repositories/1497/datasets/02f3fb2d4416d3f6626e1117688e0386784e8e55/weather.csv',na_strings=[""]
                       )[:,[f[1],f[7:]]]

weather_temp
Example #12
def test_update_mixed_2():
    DT = dt.Frame(A=range(5))
    DT[:, update(B=3, C=f.A)]
    assert_equals(DT, dt.Frame(A=range(5), B=[3] * 5, C=range(5)))
Example #13
def test_update_mixed_dimensions():
    DT = dt.Frame(A=range(5))
    DT[:, update(B=f.A * 2, C=10)]
    assert_equals(DT, dt.Frame(A=range(5), B=range(0, 10, 2), C=[10] * 5))
gcd_missing_columns = np.setdiff1d(gcd_table_columns, np.asarray(gcd_dt.names))
for col in gcd_missing_columns:
    gcd_dt[col] = None
gcd_dt1 = gcd_dt[:, list(gcd_table_columns)]
# Sanity check the columns are there
if not np.all(gcd_table_columns == np.asarray(gcd_dt1.names)):
    raise ValueError('The build_gene_compound_dataset table '
                     'has missing columns!')

# -- Map to existing FK ids
# gene id
gcd_dt1.names = {'gene_id': 'gene_name'}
gene_dt.names = {'id': 'gene_id', 'name': 'gene_name'}
gene_dt.key = 'gene_name'
# NOTE: the g object references the joined tables namespace
gcd_dt1[:, update(gene_id=g.gene_id), join(gene_dt)]
## TODO:: rewrite as a helper function
# regex match failed ids, then assign to the table
failed_genes = np.unique(gcd_dt1[dt.isna(f.gene_id),
                                 'gene_name'].to_numpy().flatten())
if len(failed_genes) > 0:
    gene_queries = [re.compile(f'{gene}.*') for gene in failed_genes]
    gene_name_series = gene_dt['gene_name'].to_pandas().gene_name

    # NOTE: the original excerpt does not define match_idx / failed_match_idx;
    # the loop below is an assumed reconstruction that matches each failed
    # gene's regex against gene_name_series.
    match_idx, failed_match_idx = [], []
    for i, query in enumerate(gene_queries):
        matched = np.where(gene_name_series.str.match(query))[0]
        if len(matched) > 0:
            match_idx.append(int(matched[0]))
        else:
            failed_match_idx.append(i)

    # NumPy integer dtypes cannot represent NaN, so pad failed matches with None
    gene_ids = gene_dt[match_idx, 'gene_id'].to_pandas().gene_id
    if (len(failed_match_idx) > 1):
        gene_ids = pd.Series(np.insert(gene_ids, failed_match_idx, None),
                             dtype='int32')
    gcd_dt1[dt.isna(f.gene_id), update(gene_id=gene_ids)]
Example #15
def build_gene_compound_dataset_df(gene_compound_dataset_file, gene_file,
                                   compound_file, dataset_file, output_dir,
                                   compound_names):
    """
    Build gene_compound_dataset table (description?)

    @param gene_compound_dataset_file: [`str`] Path to the gene signature .csv file.
    @param gene_file: [`str`] Path to the gene table .csv file.
    @param compound_file: [`str`] Path to the compound table .csv file.
    @param dataset_file: [`str`] Path to the dataset table .csv file.
    @param output_dir: [`str`] Path to write the output file to.
    @param compound_names: [`str`] Path to an optional .csv file mapping
        updated compound names to the dataset. This ensures that corrected
        compound annotations still make it into the database without the need
        to rerun all the gene signatures.

    @return [`datatable.Frame`] Writes the 'gene_compound_dataset.jay' file to
        output_dir, then returns the table.
    """
    # -- Check the input files exist
    for fl in [
            gene_compound_dataset_file, gene_file, compound_file, dataset_file
    ]:
        if not os.path.exists(fl):
            raise FileNotFoundError(f'Could not find the {fl}')

    # -- Read in mapping tables
    gene_dt = fread(gene_file)
    compound_dt = fread(compound_file)
    dataset_dt = fread(dataset_file)

    # -- Read in gene_compound_tissue table
    gcd_dt = fread(gene_compound_dataset_file)

    # -- Fix names and assign missing columns
    gcd_dt.names = {
        'gene': 'gene_id',
        'compound': 'compound_id',
        'dataset': 'dataset_id',
        'lower': 'lower_analytic',
        'upper': 'upper_analytic',
        'pvalue': 'pvalue_analytic',
        'fdr': 'fdr_analytic'
    }
    del gcd_dt[:, ['significant', 'tissue']]

    # Determine missing columns and assign them, so we don't have to change code
    # when new columns are added
    gcd_table_columns = np.asarray(
        ('id', 'gene_id', 'compound_id', 'dataset_id', 'estimate',
         'lower_analytic', 'upper_analytic', 'lower_permutation',
         'upper_permutation', 'n', 'pvalue_analytic', 'pvalue_permutation',
         'df', 'fdr_analytic', 'fdr_permutation', 'significant_permutation',
         'permutation_done', 'sens_stat', 'mDataType'))
    gcd_missing_columns = np.setdiff1d(gcd_table_columns,
                                       np.asarray(gcd_dt.names))
    for col in gcd_missing_columns:
        gcd_dt[col] = None
    gcd_dt1 = gcd_dt[:, list(gcd_table_columns)]
    # Sanity check the columns are there
    if not np.all(gcd_table_columns == np.asarray(gcd_dt1.names)):
        raise ValueError('The build_gene_compound_dataset table '
                         'has missing columns!')

    gcd_dt1[:, update(sens_stat='AAC', permutation_done=0)]

    # -- Map to existing FK ids
    # gene id
    gcd_dt1.names = {'gene_id': 'gene_name'}
    gene_dt.names = {'id': 'gene_id', 'name': 'gene_name'}
    gene_dt.key = 'gene_name'
    # NOTE: the g object references the joined tables namespace
    gcd_dt1[:, update(gene_id=g.gene_id), join(gene_dt)]

    # make sure all genes mapped
    failed_genes = np.unique(gcd_dt1[dt.isna(f.gene_id), 'gene_name'] \
        .to_numpy().flatten())
    if len(failed_genes) > 0:
        warnings.warn(f'The genes: {failed_genes} did not map!')

    if (np.any(gcd_dt1[:, dt.isna(f.gene_id)].to_numpy())):
        warnings.warn('Some gene_ids in gene_compound_dataset are still NA! '
                      'Dropping the missing rows...')
        gcd_dt1 = gcd_dt1[~dt.isna(f.gene_id), :]

    # fix compound names
    ## FIXME:: Remove this when gene signatures are regenerated
    ## START patch
    fix_names_df = dt.fread(compound_names)
    fix_names_df[f.dataset == "GDSC_2020(v1-8.2)", update(dataset="GDSC_v1")]
    fix_names_df[f.dataset == "GDSC_2020(v2-8.2)", update(dataset="GDSC_v2")]
    fix_names_df.names = {
        "drugid": "compound_name",
        "unique.drugid": "compound_id",
        "dataset": "dataset_id"
    }
    fix_names_df.key = ["compound_name", "dataset_id"]
    gcd_dt1.names = {'compound_id': 'compound_name'}
    gcd_dt1[~dt.isna(g.compound_id),
            update(compound_name=g.compound_id),
            join(fix_names_df)]
    ## END patch

    # compound id
    compound_dt.names = {'id': 'compound_id', 'name': 'compound_name'}
    del compound_dt[:, 'compound_uid']
    compound_dt.key = 'compound_name'
    gcd_dt1[:, update(compound_id=g.compound_id), join(compound_dt)]

    if np.any(gcd_dt1[:, dt.isna(f.compound_id)].to_numpy()):
        warnings.warn("Some compound_ids in gene_compound_dataset are stll "
                      "NA! Dropping the missing rows...")
        gcd_dt1 = gcd_dt1[~dt.isna(f.compound_id), :]

    # dataset id
    gcd_dt1.names = {'dataset_id': 'dataset_name'}
    dataset_dt.names = {'id': 'dataset_id', 'name': 'dataset_name'}
    dataset_dt.key = 'dataset_name'
    gcd_dt1[:, update(dataset_id=g.dataset_id), join(dataset_dt)]

    # -- Sort then assign the primary key column
    gcd_dt2 = gcd_dt1[:,
                      list(gcd_table_columns),
                      sort('gene_id', 'compound_id', 'dataset_id', 'mDataType'
                           )]
    gcd_dt2[:, update(id=range(1, gcd_dt2.nrows + 1))]

    # Sanity check we didn't lose any rows
    if not gcd_dt.nrows == gcd_dt2.nrows:
        warnings.warn('The gene_compound_dataset table has lost some rows!')

    gcd_dt2.to_jay(os.path.join(output_dir, 'gene_compound_dataset.jay'))
    return gcd_dt2
Example #16
#   X: datatable - primary data set
# Parameters:
#   time_col: date/time/int - time column to order rows before the shift op
#   group_by_cols: list of column names - group columns
#   shift_cols: list of column names - columns to shift
# Output:
#   dataset augmented with shifted columns

from datatable import f, by, sort, update, shift, isna

time_col = "date"
group_by_cols = ["state"]
shift_cols = ["cases", "deaths"]

new_dataset_name = "new_dataset_name_with_shift"

# produce lag of 1 unit and add as new feature for each shift column
aggs = {f"{col}_yesterday": shift(f[col]) for col in shift_cols}
X[:, update(**aggs), sort(time_col), by(*group_by_cols)]

# update NA lags
aggs = {f"{col}_yesterday": 0 for col in shift_cols}
X[isna(f[f"{shift_cols[0]}_yesterday"]), update(**aggs)]

aggs = {f"{col}_daily": f[col] - f[f"{col}_yesterday"] for col in shift_cols}
X[:, update(**aggs), sort(time_col), by(group_by_cols)]

for col in shift_cols:
    del X[:, f[f"{col}_yesterday"]]

return {new_dataset_name: X}
Example #17
def test_update_simple():
    DT = dt.Frame(A=range(5))
    DT[:, update(B=10)]
    assert_equals(DT, dt.Frame(A=range(5), B=[10] * 5))
Example #18
"""Concatenate columns"""

# Consolidate multiple columns into single text column by concatenating
# them and adding column name as a prefix.
#
# Specification:
# Inputs:
#   X: datatable - primary dataset
# Parameters:
#   col_names - list of text column names to consolidate
#   txt_col_name - column name containing consolidated text
# Output:
#   dataset containing original and consolidated columns
from datatable import f, FExpr, update
import functools

col_names = [
    "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
]
txt_col_name = "toxic_consolidated"

new_dataset_name = "new_dataset_with_concat_txt_column"

concat_cols_fexpr = functools.reduce(FExpr.__add__, (col + ": " + f[col] + " "
                                                     for col in col_names))
X[:, update(**{txt_col_name: concat_cols_fexpr})]

return {new_dataset_name: X}
).properties(
    
    title='Top directors title counts and imdb ratings'
)

alt.Chart(directors_views_rating_v1.to_pandas()).mark_bar().encode(
    alt.Y('directed_by',sort='-x'),
    alt.X('count'),
    alt.Color('us_views_millions')
).properties(
    
    title='Top directors title counts and US million views'
)

# are the directors and writers same for a title ?
amigos_info_dt[:,dt.update(temp=f.directed_by==f.written_by)]

# are the directors and writers same for a title ?
amigos_info_dt[f.temp==True,:]

# remove the temp col
del amigos_info_dt["temp"]

# split writers column
writers_list = [ elemento.split('&') for elemento in amigos_info_dt[:,f.written_by].to_list()[0] ]

# create a new DT with writers
writers_dt = dt.Frame({'no_of_writers':[len(elem) for elem in writers_list]})

# Joining two DTs
amigos_info_dt_v1 = dt.cbind(amigos_info_dt,writers_dt)
Example #20
    # remove blacklisted columns or column groups smaller than the minimal size
    col_groups = {
        key: val
        for key, val in all_col_groups.items()
        if key not in black_listed_columns and len(val) >= min_col_group_size
    }

    # list of column prefixes
    columns = list(col_groups.keys())
    # list of column ranges
    ranges = [(min(idx), max(idx)) for idx in col_groups.values()]

    # produce tuple for column slices
    col_slices = [((col + "%d") % (desde), (col + "%d") % (hasta))
                  for (col, (desde, hasta)) in zip(columns, ranges)]

    for c, r, s in zip(columns, ranges, col_slices):
        update_map = {
            c + "_sum": rowsum(f[s[0]:s[1]]),
            c + "_mean": rowmean(f[s[0]:s[1]]),
            c + "_sd": rowsd(f[s[0]:s[1]]),
            c + "_max": rowmax(f[s[0]:s[1]]),
            c + "_min": rowmin(f[s[0]:s[1]]),
            c + "_range": rowmax(f[s[0]:s[1]]) - rowmin(f[s[0]:s[1]]),
            c + "_first": rowfirst(f[s[0]:s[1]]),
            c + "_last": rowlast(f[s[0]:s[1]]),
            c + "_missing": (r[1] - r[0] + 1) - rowcount(f[s[0]:s[1]])
        }
        X[:, update(**update_map)]

    return {"CreditCard-train-aug.csv": X}
Example #21
def test_update_with_groupby():
    DT = dt.Frame(A=range(5), B=[1, 2, 2, 2, 1])
    assert DT.stype == dt.int32
    DT[:, update(A=f.A * 100 / dt.sum(f.A)), by(f.B)]
    assert_equals(
        DT, dt.Frame(A=[0, 100 / 6, 100 / 3, 50, 100], B=[1, 2, 2, 2, 1]))
# In[8]:

# DT[order(sepal_length - sepal_width), head(.SD, 2), by = species]

DT[:2, :, by('species'), sort(f.sepal_length - f.sepal_width)]

# Just like in R's [data.table](https://github.com/Rdatatable/data.table), boolean expressions can be passed to the [sort](https://datatable.readthedocs.io/en/latest/api/dt/sort.html#) function.
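
# For instance, a minimal sketch: sorting on a boolean expression places the rows
# where the condition is False (0) before those where it is True (1) within each group.
DT[:2, :, by('species'), sort(f.sepal_width > 3)]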

# #### Filter observations above the mean of `sepal_length` by species

# In[9]:

# DT[, .SD[sepal_length > mean(sepal_length)], by = species]

DT[:, update(temp=f.sepal_length > dt.mean(f.sepal_length)), by('species')]

DT[f.temp == 1, :-1]

# Unlike in R's [data.table](https://github.com/Rdatatable/data.table), boolean expressions cannot be applied within the `i` section in the presence of `by`. The next best thing is to break it into two steps: create a temporary column to hold the boolean value, then filter on that column.

# #### Filter rows with group size greater than 10

# In[10]:

# DT[, .SD[.N > 10], keyby = .(species, petal_width)]

DT[:, update(temp=dt.count() > 10), by('species', 'petal_width')]

DT[f.temp == 1, :-1]
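
# The temporary flag column is no longer needed once the rows have been filtered;
# it can be dropped again (a minimal sketch, using the same column-deletion pattern
# as elsewhere in these examples):
del DT['temp']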
Example #23
    def create_data(X: dt.Frame = None) -> Union[
        str, List[str],
        dt.Frame, List[dt.Frame],
        np.ndarray, List[np.ndarray],
        pd.DataFrame, List[pd.DataFrame],
        Dict[str, str],  # {data set names : paths}
        Dict[str, dt.Frame],  # {data set names : dt frames}
        Dict[str, np.ndarray],  # {data set names : np arrays}
        Dict[str, pd.DataFrame],  # {data set names : pd frames}
    ]:
        # define date column and forecast horizon
        date_col = 'date'
        group_by_cols = ["state"]
        forecast_len = 7

        # get COVID19 data from NYTimes github
        us_states = dt.fread("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv")

        # get states population
        us_states_pop = dt.fread(
            "http://www2.census.gov/programs-surveys/popest/datasets/2010-2019/national/totals/nst-est2019-alldata.csv")
        us_states_pop.names = {'NAME': 'state', 'POPESTIMATE2019': 'pop'}
        us_states_pop.key = "state"

        # augment data with state population figures and create adjusted case and death counts
        series_cols = ["cases", "deaths"]
        aggs = {f"{col}100k": dt.f[col] / (dt.g.pop / 100000) for col in series_cols}
        us_states[:, dt.update(pop=g.pop, pop100k=g.pop / 100000, **aggs), join(us_states_pop)]

        # remove rows without state defined (resulted in unmatched rows after left outer join)
        del us_states[isna(f.pop), :]

        # produce lag of 1 unit and add as new feature for each column in the list
        series_cols.extend([col + "100k" for col in series_cols])
        aggs = {f"{col}_yesterday": shift(f[col]) for col in series_cols}
        us_states[:, update(**aggs), sort(date_col), by(group_by_cols)]

        # update NA lags to 0
        aggs = {f"{col}_yesterday": 0 for col in series_cols}
        us_states[isna(f[f"{series_cols[0]}_yesterday"]), update(**aggs)]

        # compute daily values by differentiating
        aggs = {f"{col}_daily": f[col] - f[f"{col}_yesterday"] for col in series_cols}
        us_states[:, update(**aggs), sort(date_col), by(group_by_cols)]

        # delete columns with yesterday (shift) values
        series_cols_to_delete = [f"{col}_yesterday" for col in series_cols]
        del us_states[:, series_cols_to_delete]

        # set negative daily values to 0
        us_states[f.cases_daily < 0, [f.cases_daily, f.cases100k_daily]] = 0
        us_states[f.deaths_daily < 0, [f.deaths_daily, f.deaths100k_daily]] = 0

        # determine threshold to split train and test based on forecast horizon
        dates = dt.unique(us_states[:, date_col])
        split_date = dates[-(forecast_len + 1):, :, dt.sort(date_col)][0, 0]
        test_date = dates[-1, :, dt.sort(date_col)][0, 0]

        # split data to honor forecast horizon in test set
        df = us_states[date_col].to_pandas()
        train = us_states[df[date_col] <= split_date, :]
        test = us_states[df[date_col] > split_date, :]

        # return [train, test] and rename dataset names as needed
        return {f"covid19_daily_{split_date}_by_states_train": train,
                f"covid19_daily_{test_date}_by_states_test": test}
Example #24
def test_update_with_assign():
    DT = dt.Frame(A=range(5))
    with pytest.raises(ValueError,
                       match=r"update\(\) clause cannot be used "
                       r"with an assignment expression"):
        DT[:, update(B=0)] = None
Example #25
def build_gene_compound_tissue_df(gene_compound_tissue_file, gene_file,
                                  compound_file, tissue_file, output_dir):
    """
    Build gene_compound_tissue table (description?)

    @param gene_compound_tissue_file: [`str`] Path to the gene signature .csv file.
    @param gene_file: [`str`] Path to the gene table .csv file.
    @param compound_file: [`str`] Path to the compound table .csv file.
    @param tissue_file: [`str`] Path to the tissue table .csv file.
    @param output_dir: [`str`] Path to write the output file to.

    @return [`datatable.Frame`] Writes the 'gene_compound_tissue.jay' file to
    output_dir, then returns the table.
    """
    # -- Check the input files exist
    for fl in [
            gene_compound_tissue_file, gene_file, compound_file, tissue_file
    ]:
        if not os.path.exists(fl):
            raise FileNotFoundError(f'Could not find the {fl}')

    # -- Read in mapping tables
    gene_dt = fread(gene_file)
    compound_dt = fread(compound_file)
    tissue_dt = fread(tissue_file)

    # -- Read in gene_compound_tissue table
    gct_dt = fread(gene_compound_tissue_file)

    # -- Fix names and assign missing columns
    if np.all(
            np.isin(np.asarray(('Gene', 'Tissue', 'Drug', 'FWER_genes')),
                    np.asarray(gct_dt.names))):
        gct_dt.names = {
            'Gene': 'gene_id',
            'Tissue': 'tissue_id',
            'Drug': 'compound_id',
            'FWER_genes': 'FWER_gene'
        }
    # Determine missing columns and assign them, so we don't have to change code
    # when new columns are added
    gct_table_columns = np.asarray(
        ('id', 'gene_id', 'compound_id', 'tissue_id', 'estimate', 'lower',
         'upper', 'n', 'tstat', 'fstat', 'pvalue', 'df', 'fdr', 'FWER_gene',
         'FWER_compound', 'FWER_all', 'BF_p_all', 'sens_stat', 'mDataType',
         'tested_in_human_trials', 'in_clinical_trials'))
    gct_missing_columns = np.setdiff1d(gct_table_columns,
                                       np.asarray(gct_dt.names))
    for col in gct_missing_columns:
        gct_dt[col] = None
    gct_dt1 = gct_dt[:, [
        *gct_table_columns, 'gene_name', 'compound_name', 'tissue_name'
    ]]
    # Sanity check the columns are there
    if not np.all(np.isin(gct_table_columns, np.asarray(gct_dt1.names))):
        raise ValueError('The gene_compound_tissue table '
                         'has missing columns!')

    # -- Map to existing FK ids
    # gene id
    gene_dt.names = {'id': 'gene_id', 'name': 'gene_name'}
    gene_dt.key = 'gene_name'
    # NOTE: the g object references the joined tables namespace
    gct_dt1[:, update(gene_id=g.gene_id), join(gene_dt)]

    # check for failed genes
    failed_genes = gct_dt1[dt.isna(f.gene_id),
                           'gene_name'].to_numpy().flatten()
    if len(failed_genes) > 0:
        raise ValueError(f'Genes {failed_genes} failed to map!')

    if (np.any(gct_dt1[:, dt.isna(f.gene_id)].to_numpy())):
        warnings.warn(
            'Some gene_ids in gene_compound_tissue are still NA! Dropping '
            'the missing rows...')
        gct_dt1 = gct_dt1[~dt.isna(f.gene_id), :]
    del gct_dt1[:, 'gene_name']

    # compound id
    compound_dt.names = {'id': 'compound_id', 'name': 'compound_name'}
    del compound_dt[:, 'compound_uid']
    compound_dt.key = 'compound_name'
    gct_dt1[:, update(compound_id=g.compound_id), join(compound_dt)]

    # tissue id
    tissue_dt.names = {'id': 'tissue_id', 'name': 'tissue_name'}
    tissue_dt.key = 'tissue_name'
    gct_dt1[:, update(tissue_id=g.tissue_id), join(tissue_dt)]

    ## TODO: Handle failed tissue mappings?

    # -- Sort then assign the primary key column
    ## TODO:: Is there a way to sort by reference?
    gct_dt2 = gct_dt1[:,
                      list(gct_table_columns),
                      sort('gene_id', 'compound_id', 'tissue_id', 'mDataType')]
    gct_dt2[:, update(id=range(1, gct_dt2.nrows + 1))]

    # Sanity check we didn't lose any rows
    if not gct_dt.nrows == gct_dt2.nrows:
        warnings.warn('The gene_compound_tissue table has lost some rows!')

    gct_dt2.to_jay(os.path.join(output_dir, 'gene_compound_tissue.jay'))
    return gct_dt2
Example #26
def test_update_existing_column():
    DT = dt.Frame(A=range(5))
    DT[:, update(A=f.A * 2)]
    assert_equals(DT, dt.Frame(A=range(0, 10, 2)))
# Compute per-column expressions (signed distance from the mean in this example)
# for all numeric (int, float) columns with stats computed by groups and
# new column added for each original numeric feature.
# see: https://stackoverflow.com/questions/62974899/updating-or-adding-multiple-columns-with-pydatatable-in-style-of-r-datables-sd
#
# Specification:
# Inputs:
#   X: datatable - primary data set
# Parameters:
#   group_by_cols: list of column names - group columns to compute stats by
# Output:
#   dataset augmented with computed statistics

from datatable import f, by, sort, update, shift, isna, mean

group_by_cols = ["user_id"]

new_dataset_name = "new_dataset_name_with_stats"

aggs = {
    f"{col}_dist_from_mean": mean(dt.f[col]) - f[col]
    for col in X[:, f[int].extend(f[float])].names
}

X[:, update(**aggs), by(*group_by_cols)]

return {new_dataset_name: X}
    def create_data(
        X: dt.Frame = None
    ) -> Union[str, List[str], dt.Frame, List[dt.Frame], np.ndarray,
               List[np.ndarray], pd.DataFrame, List[pd.DataFrame], Dict[
                   str, str],  # {data set names : paths}
               Dict[str, dt.Frame],  # {data set names : dt frames}
               Dict[str, np.ndarray],  # {data set names : np arrays}
               Dict[str, pd.DataFrame],  # {data set names : pd frames}
               ]:
        if X is None:
            return []

        columns = None  # columns = ["PAY_AMT", "BILL_AMT", "PAY_"]
        ranges = None  # [(1, 6), (1, 6), (2, 6)]
        black_listed_columns = []
        min_col_group_size = 2

        # parse column names for time series column groups
        if columns is None or columns == [] or \
                ranges is None or ranges == []:
            # match any column names that consist of alpha name (prefix) followed by integer index (suffix)
            p = re.compile(r"^([a-zA-Z_]+)(\d+)$")
            matches = [p.match(c) for c in X.names]
            all_col_groups = defaultdict(list)
            for m in matches:
                if m is not None:
                    key = m.group(1)
                    val = int(m.group(2))
                    all_col_groups[key].append(val)

            # remove blacklisted columns or column groups smaller than the minimal size
            col_groups = {
                key: val
                for key, val in all_col_groups.items()
                if key not in black_listed_columns
                and len(val) >= min_col_group_size
            }

            # list of column prefixes
            columns = list(col_groups.keys())
            # list of column ranges
            ranges = [(min(idx), max(idx)) for idx in col_groups.values()]

        col_slices = [((col + "%d") % (desde), (col + "%d") % (hasta))
                      for (col, (desde, hasta)) in zip(columns, ranges)]

        for c, r, s in zip(columns, ranges, col_slices):
            update_map = {
                c + "_sum": rowsum(f[s[0]:s[1]]),
                c + "_mean": rowmean(f[s[0]:s[1]]),
                c + "_sd": rowsd(f[s[0]:s[1]]),
                c + "_max": rowmax(f[s[0]:s[1]]),
                c + "_min": rowmin(f[s[0]:s[1]]),
                c + "_range": rowmax(f[s[0]:s[1]]) - rowmin(f[s[0]:s[1]]),
                c + "_first": rowfirst(f[s[0]:s[1]]),
                c + "_last": rowlast(f[s[0]:s[1]]),
                c + "_missing": (r[1] - r[0] + 1) - rowcount(f[s[0]:s[1]])
            }
            X[:, update(**update_map)]

        return X
Example #29
def test_update_multiple_dependents():
    DT = dt.Frame(A=range(5))
    DT[:, update(B=f.A + 1, A=f.A + 2, D=f.A + 3)]
    assert_equals(DT, dt.Frame(A=range(2, 7), B=range(1, 6), D=range(3, 8)))
    def create_data(
        X: dt.Frame = None
    ) -> Union[str, List[str], dt.Frame, List[dt.Frame], np.ndarray,
               List[np.ndarray], pd.DataFrame, List[pd.DataFrame], Dict[
                   str, str],  # {data set names : paths}
               Dict[str, dt.Frame],  # {data set names : dt frames}
               Dict[str, np.ndarray],  # {data set names : np arrays}
               Dict[str, pd.DataFrame],  # {data set names : pd frames}
               ]:
        # define date column and forecast horizon
        date_col = 'date'
        group_by_cols = ["state"]
        forecast_len = 7

        # state codes lookup table
        us_state_codes = dt.Frame(
            code=[
                'AL', 'AK', 'AS', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC',
                'FL', 'GA', 'GU', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY',
                'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE',
                'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'MP', 'OH', 'OK',
                'OR', 'PA', 'PR', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT',
                'VI', 'VA', 'WA', 'WV', 'WI', 'WY'
            ],
            state=[
                'Alabama', 'Alaska', 'American Samoa', 'Arizona', 'Arkansas',
                'California', 'Colorado', 'Connecticut', 'Delaware',
                'District of Columbia', 'Florida', 'Georgia', 'Guam', 'Hawaii',
                'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky',
                'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan',
                'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska',
                'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico',
                'New York', 'North Carolina', 'North Dakota',
                'Northern Mariana Islands', 'Ohio', 'Oklahoma', 'Oregon',
                'Pennsylvania', 'Puerto Rico', 'Rhode Island',
                'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah',
                'Vermont', 'Virgin Islands', 'Virginia', 'Washington',
                'West Virginia', 'Wisconsin', 'Wyoming'
            ])
        us_state_codes.key = "state"

        # get states population lookup table
        us_states_pop = dt.fread(
            "http://www2.census.gov/programs-surveys/popest/datasets/2010-2019/national/totals/nst-est2019-alldata.csv"
        )
        us_states_pop.names = {'NAME': 'state', 'POPESTIMATE2019': 'pop'}
        us_states_pop = us_states_pop[dt.f.STATE > 0, :]
        us_states_pop.key = "state"

        # join state codes and population into single lookup table
        us_states_pop[:, dt.update(code=dt.g.code), dt.join(us_state_codes)]
        us_states_pop.key = "code"

        # US Covid Tracking API: https://covidtracking.com/data/api
        us_states = dt.fread(
            "https://covidtracking.com/api/v1/states/daily.csv")
        # remove deprecated fields
        deprecated = [
            'checkTimeEt', 'commercialScore', 'dateChecked', 'dateModified',
            'grade', 'hash', 'hospitalized', 'negativeIncrease',
            'negativeRegularScore', 'negativeScore', 'posNeg', 'positiveScore',
            'score', 'total'
        ]
        us_states = us_states[:, list(set(us_states.names) - set(deprecated))]
        us_states.names = {'state': 'code'}

        series_cols = [
            "positive", "negative", "hospitalizedCumulative",
            "inIcuCumulative", "onVentilatorCumulative", "recovered", "death"
        ]
        aggs = {f"{col}100k": f[col] / (g.pop / 100000) for col in series_cols}
        us_states[:,
                  dt.update(
                      state=g.state, pop=g.pop, pop100k=g.pop / 100000, **aggs),
                  join(us_states_pop)]
        us_states = us_states[~dt.isna(dt.f.state), :]

        # produce lag of 1 unit and add as new feature for each shift column
        series_cols.extend([col + "100k" for col in series_cols])
        aggs = {f"{col}_yesterday": shift(f[col]) for col in series_cols}
        us_states[:, update(**aggs), sort(date_col), by(group_by_cols)]

        # update NA lags
        aggs = {f"{col}_yesterday": 0 for col in series_cols}
        us_states[isna(f[f"{series_cols[0]}_yesterday"]), update(**aggs)]

        aggs = {
            f"{col}_daily": f[col] - f[f"{col}_yesterday"]
            for col in series_cols
        }
        us_states[:, update(**aggs), sort(date_col), by(group_by_cols)]

        for col in series_cols:
            del us_states[:, f[f"{col}_yesterday"]]

        # validate dataset
        if us_states[:, count(),
                     by(dt.f.state, f.date)][f.count > 1, :].shape[0] > 1:
            raise ValueError(
                "Found duplicate elements for the same date and state.")

        # determine threshold to split train and test based on forecast horizon
        dates = dt.unique(us_states[:, date_col])
        split_date = dates[-(forecast_len + 1):, :, dt.sort(date_col)][0, 0]
        test_date = dates[-1, :, dt.sort(date_col)][0, 0]

        # split data to honor forecast horizon in test set
        df = us_states[date_col].to_pandas()
        train = us_states[df[date_col] <= split_date, :]
        test = us_states[df[date_col] > split_date, :]

        return {
            f"covidtracking_daily_{split_date}_by_us_states_train": train,
            f"covidtracking_daily_{test_date}_by_us_states_test": test
        }