Example No. 1
def map_variant_to_mean_full_relative_gamma(datadir,
                                            *,
                                            dose='sober',
                                            filtered=True):
    child_gammas = get_child_gammas(datadir)
    parent_gammas = get_parent_gammas(datadir)
    varcon = mapping_lib.get_mapping('variant', 'control', datadir)
    vargamma = mapping_lib.get_mapping('variant', 'gamma', datadir, dose=dose)
    conmask = vargamma.index.intersection(varcon.loc[varcon.control].index)
    congamma = vargamma.loc[conmask]
    # Scale the significance threshold by the spread of control-guide gammas.
    sigma = congamma.std().gamma
    z = -sigma  # negative, so the threshold reads as "z control-sigmas below zero"
    unfiltered = (child_gammas / parent_gammas) - 1
    if filtered:
        # Only keep relative gammas where the parent effect is strong enough
        # (parent gamma more than _Z_THRESHOLD control-sigmas below zero).
        geodelt_gammas = unfiltered.where(parent_gammas < (_Z_THRESHOLD * z))
    else:
        geodelt_gammas = unfiltered
    relgammas = select_dose(geodelt_gammas, dose, datadir)
    relgammas = relgammas.stack(level='sid', dropna=False)[['03']].unstack()
    if filtered:
        colname = 'relgamma'
    else:
        colname = 'unfiltered_relgamma'
    relgammas = pd.DataFrame(relgammas.mean(axis=1), columns=[colname])
    relgammas.reset_index(inplace=True)
    mapping_lib.make_mapping(relgammas, 'variant', colname, datadir, dose=dose)
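The core transform here is `(child_gammas / parent_gammas) - 1`: a variant's fitness effect expressed relative to its parent guide, so 0 means "as strong as the parent" and -1 means "no effect". A toy sketch with made-up values:

import pandas as pd

child = pd.Series({'varA': -0.8, 'varB': -0.2})
parent = pd.Series({'varA': -1.0, 'varB': -1.0})
relgamma = (child / parent) - 1  # varA: -0.2, varB: -0.8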
Example No. 2
def fetch_training_data(datadir):
    data = mapping_lib.get_mapping('variant',
                                   'relgamma',
                                   datadir,
                                   dose='sober')
    data = training_lib.filter_for_training(data, datadir)
    data = data.dropna()
    data.reset_index(inplace=True)
    var_orig = mapping_lib.get_mapping('variant', 'original', datadir).original
    var_pam = mapping_lib.get_mapping('variant', 'pam', datadir).pam
    data['original'] = data.variant.map(var_orig)
    data['pam'] = data.variant.map(var_pam)
    data.set_index('variant', inplace=True)
    return data
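A minimal usage sketch (the path is hypothetical): the returned frame is indexed by `variant` and carries the `relgamma` labels plus the `original` and `pam` sequences used downstream for featurization.

import pathlib

datadir = pathlib.Path('/path/to/ungd')  # hypothetical data directory
training = fetch_training_data(datadir)
# expected columns: relgamma, original, pam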
Example No. 3
def map_variant_to_bin(datadir, dose='sober'):
    varrg = mapping_lib.get_mapping('variant', 'relgamma', datadir, dose=dose)
    bins = relgamma_bins()
    rgbin = bin_relgammas(varrg.relgamma.values, bins)
    rgbin = pd.DataFrame(rgbin.T, index=varrg.index,
                         columns=['rgbin']).reset_index()
    mapping_lib.make_mapping(rgbin, 'variant', 'rgbin', datadir, dose=dose)
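`relgamma_bins` and `bin_relgammas` are not shown in this listing; a plausible minimal stand-in (an assumption, not the project's actual implementation) digitizes each value against fixed bin edges:

import numpy as np

def relgamma_bins():
    # Hypothetical edges covering the usual relgamma plotting range.
    return np.linspace(-1.2, 0.2, 8)

def bin_relgammas(relgammas, bins):
    # np.digitize returns a bin index per value; NaNs sort past the last
    # edge, so real code would need to handle them explicitly.
    return np.digitize(relgammas, bins)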
Example No. 4
def compute_rough_gammas(datadir):
    data = mapping_lib.get_countdata(datadir)
    varcon = mapping_lib.get_mapping('variant', 'control', datadir)
    data = data.merge(varcon.reset_index(), how='left', on='variant')
    data = data.merge(mapping_lib.get_sample_tags(datadir),
                      how='left',
                      on='sample')
    data.drop('sample', axis=1, inplace=True)

    def normalize(counts):
        return counts * _NORMAL_SIZE / counts.sum()

    data['norm'] = data.groupby(['sid', 'tp']).raw.transform(normalize)
    data['log'] = np.log2(data.norm.clip(lower=_PSEUDO))
    data.set_index(['variant', 'sid', 'tp'], inplace=True)
    grouper = data.groupby(['sid'], group_keys=False)
    relevant = list()
    for i in range(1, 4):
        namespan = _namespan_func(i)
        diff = grouper.apply(_diff_by_tp, 'log', k=i, raw_threshold=_THRESHOLD)
        # Center each (sid, span) column on the median across control guides.
        diffcenters = diff.loc[data.control].unstack(level=[-2, -1]).median()
        dg = diff.unstack(level=[-2, -1]).subtract(diffcenters, axis='columns')
        dg.columns = dg.columns.map(namespan)
        relevant.append(dg)
    X = pd.concat(relevant, axis=1)
    X.to_csv(datadir / _ROUGH_GAMMA_FILE, sep='\t')
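The `normalize` helper rescales every (sid, tp) sample to a common total before the log2 step, so samples with different sequencing depth become comparable. A standalone sketch (the `_NORMAL_SIZE` and `_PSEUDO` values are assumptions):

import numpy as np
import pandas as pd

_NORMAL_SIZE = 20_000_000  # assumed target library size
_PSEUDO = 1                # assumed floor applied before log2

counts = pd.Series([10, 0, 90])
norm = counts * _NORMAL_SIZE / counts.sum()  # rescaled to sum to _NORMAL_SIZE
log = np.log2(norm.clip(lower=_PSEUDO))      # zero counts log to 0.0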
Example No. 5
def derive_child_parent_gammas(datadir):
    var_orig = mapping_lib.get_mapping('variant', 'original', datadir)
    var_orig.reset_index(inplace=True)
    allgammas = get_normed_gammas(datadir)
    parent_gammas = allgammas.loc[var_orig.original]
    child_gammas = allgammas.loc[var_orig.variant]
    parent_gammas.index = child_gammas.index
    child_gammas.to_csv(datadir / _CHILD_GAMMA_FILE_NAME, sep='\t')
    parent_gammas.to_csv(datadir / _PARENT_GAMMA_FILE_NAME, sep='\t')
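The positional alignment relies on `.loc` with repeated labels: each parent row is emitted once per child, in child order, so overwriting the index pairs every child with its parent. A two-row demonstration:

import pandas as pd

g = pd.DataFrame({'gamma': [-1.0, -0.3]}, index=['parent', 'child'])
g.loc[['parent', 'parent']]  # repeated labels repeat rows, preserving order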
Example No. 6
def downsample_families(data, ratio, datadir):
    var_orig = mapping_lib.get_mapping('variant', 'original', datadir)
    oanno = pd.merge(data, var_orig, left_on='variant', right_index=True)
    families = sorted(set(oanno.original))
    samplesize = int(len(families) * ratio)
    # random.sample requires a sequence (sets are rejected in Python 3.11+);
    # sorting also keeps the draw reproducible under a fixed seed.
    sample = random.sample(families, samplesize)
    samplevariants = oanno.loc[oanno.original.isin(sample)].index
    littledata = data.loc[data.index.intersection(samplevariants)]
    return littledata
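Usage sketch (the ratio is illustrative): sampling at the family level keeps all siblings of a chosen parent together, so near-duplicate guides never straddle a train/test split.

littledata = downsample_families(data, 0.5, datadir)  # keep ~half the families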
Example No. 7
def one_hot_pair_encoder(datadir):
    var_orig = mapping_lib.get_mapping('variant', 'original', datadir)
    var_pam = mapping_lib.get_mapping('variant', 'pam', datadir)
    bases = ['A', 'C', 'G', 'T']
    # NOTE: scikit-learn >= 1.2 deprecates `sparse=` in favor of `sparse_output=`.
    enc = skpreproc.OneHotEncoder(categories=[bases], sparse=False)

    def encoder(seq):
        orig = var_orig.loc[seq].original
        pam = var_pam.loc[seq].pam
        varplus = seq + pam[0]
        origplus = orig + pam[0]
        V = np.array(list(varplus))
        V = V.reshape(len(varplus), 1)
        O = np.array(list(origplus))
        O = O.reshape(len(origplus), 1)
        onehot = np.stack(
            [enc.fit_transform(V), enc.fit_transform(O)], axis=-1)
        return ((varplus, origplus), onehot)

    return encoder
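A usage sketch (the 20-nt variant key is made up): the encoder returns the variant and parent sequences, each extended by the first PAM base, plus a one-hot tensor of shape (len(seq) + 1, 4, 2) whose last axis stacks the (variant, original) pair.

encode = one_hot_pair_encoder(datadir)
(varplus, origplus), onehot = encode('ACGTACGTACGTACGTACGT')  # hypothetical key
onehot.shape  # (21, 4, 2) for a 20-nt guide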
Example No. 8
# variant -> original
# TODO(jsh): fix this (see above)
mapping_lib.adapt_orig_map(ORIG_MAP, UNGD)

# locus_tag -> gene
# TODO(jsh): fix this (see above)
mapping_lib.adapt_gene_map(GENE_MAP, UNGD)

# count grid
mapping_lib.read_countfiles(STATICDIR, COUNT_GLOB, UNGD)
mapping_lib.make_sample_tags(UNGD)

# locus_tag -> locus_len
mapping_lib.map_locus_tag_to_len(GENOME_FILE, UNGD)

# locus_tag -> bmk_ess
# locus_tag -> bmk_sick
mapping_lib.process_bmk_spreadsheet(JMPBMK_ANNOS, UNGD)

# variant -> is_oneoff
orig_map_frame = mapping_lib.get_mapping('variant', 'original', UNGD)
orig_map_frame.reset_index(inplace=True)
mapping_lib.map_variant_to_oneoff(orig_map_frame, UNGD)

# adapt OD data
mapping_lib.adapt_od_data(OD_DATA, UNGD)

# variant -> control
mapping_lib.make_variant_controltag_map(UNGD)
Example No. 9
    dfra_copies = 4
    muraa_copies = 4
    fola_copies = 4
    mura_copies = 4

    con_adaptid = 0
    bsu_exploit_adaptid = 1
    bsu_explore_adaptid = 2
    eco_exploit_adaptid = 3
    eco_explore_adaptid = 4
    dfra_adaptid = 5
    muraa_adaptid = 6
    fola_adaptid = 7
    mura_adaptid = 8

    cmap = mapping_lib.get_mapping('variant', 'control', UNGD)
    controls = list(cmap.loc[cmap.control].index)
    colis = build_oligos(controls, con_adaptid, ncopies=con_copies)

    with open(OLIGOFILE, 'w') as outfile:
        allolis = list()
        allolis.extend(colis)
        allolis.extend(
            oligos_from(BSU_EXPLOIT, bsu_exploit_adaptid, bsu_exploit_copies))
        allolis.extend(
            oligos_from(BSU_EXPLORE, bsu_explore_adaptid, bsu_explore_copies))
        allolis.extend(
            oligos_from(ECO_EXPLOIT, eco_exploit_adaptid, eco_exploit_copies))
        allolis.extend(
            oligos_from(ECO_EXPLORE, eco_explore_adaptid, eco_explore_copies))
        allolis.extend(oligos_from(DFRA_FILE, dfra_adaptid, dfra_copies))
Example No. 10
N_LOCI = 300
N_FAMILIES = 10
EXPLOIT_GUIDES_PER_LOCUS = 9
EXPLORE_GUIDES_PER_FAMILY = 9
EXPLOITFILE = (UNGD / _CODEFILE).with_suffix('.exploit.tsv')
EXPLOREFILE = (UNGD / _CODEFILE).with_suffix('.explore.tsv')

if __name__ == '__main__':
    logging.info(f'Reading preds from {BSU_PREDFILE}...')
    preds = pd.read_csv(BSU_PREDFILE, sep='\t')
    logging.info(f'Reading comps from {COMPS}...')
    comps = pd.read_csv(COMPS, sep='\t')
    logging.info(f'Reading targets from {BSU_TARGETS}...')
    all_targets = pd.read_csv(BSU_TARGETS, sep='\t')
    var_rg = mapping_lib.get_mapping('variant',
                                     'unfiltered_relgamma',
                                     UNGD,
                                     dose='sober')
    var_rg.rename(columns={'unfiltered_relgamma': 'relgamma'}, inplace=True)
    comps['relgamma'] = comps.variant.map(var_rg.relgamma)
    important = set(pd.read_csv(BSU_LOCI, sep='\t', header=None)[0])
    essmap = mapping_lib.get_mapping('locus_tag', 'bmk_ess', UNGD)
    kinda = important - set(essmap.loc[essmap.bmk_ess].index)
    veryimp = important - kinda
    fill_size = N_LOCI - len(veryimp)
    # grab mean parent gamma for all such loci
    var_orig = mapping_lib.get_mapping('variant', 'original', UNGD)
    var_loc = mapping_lib.get_mapping('variant', 'locus_tag', UNGD)
    locsub = var_loc.loc[var_loc.locus_tag.isin(kinda)]
    var_orig.reset_index(inplace=True)
    origs = var_orig.loc[var_orig.variant == var_orig.original]
    bothsub = set(origs.loc[origs.variant.isin(locsub.index)].variant)
Example No. 11
UNGD = pathlib.Path('/home/jsh/ungd/proj/vecref')
_DIR_PREFIX = pathlib.Path(__file__).parents[1]
_CODEFILE = pathlib.Path(__file__).name
PLOTDIR = (UNGD / _CODEFILE).with_suffix('.plots')

_DATA_FRACTION = 1
_K_FOLD_SPLITS = 3
_REL_PLOT_MIN = -1.2
_REL_PLOT_MAX = 1
_EPOCHS = 30
_BATCH_SIZE = 32

######################
# Read Relgamma Data #
######################
data = mapping_lib.get_mapping('variant', 'relgamma', UNGD, dose='sober')

###############
# Filter Data #
###############
# Remove non-oneoff guides (parents, off-strand, controls, etc.)
data = training_lib.filter_for_training(data, UNGD)
data = data.dropna()

###################
# Downsample Data #
###################
data = training_lib.downsample_families(data, _DATA_FRACTION, UNGD)

###################
# Preprocess Data #
###################
Example No. 12
def filter_for_training(variantframe, datadir):
    var_oneoff = mapping_lib.get_mapping('variant', 'is_oneoff', datadir)
    maskset = var_oneoff.loc[var_oneoff.is_oneoff].index
    oneoffs = variantframe.loc[variantframe.index.intersection(maskset)]
    return oneoffs
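Usage mirrors the pipeline scripts elsewhere in this listing: load a variant-indexed frame, then intersect it with the one-off mask before training.

data = mapping_lib.get_mapping('variant', 'relgamma', UNGD, dose='sober')
data = filter_for_training(data, UNGD)  # keep only single-mismatch variants
data = data.dropna()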
Example No. 13
_DIR_PREFIX = pathlib.Path(__file__).parents[1]
_CODEFILE = pathlib.Path(__file__).name
PLOTDIR = (UNGD / _CODEFILE).with_suffix('.plots')

_PLOT_MIN = -1.2
_PLOT_MAX = 0.2

_FIGDPI = 300

shutil.rmtree(PLOTDIR, ignore_errors=True)
PLOTDIR.mkdir(parents=True, exist_ok=True)

############################
# Re-load/process raw data #
############################
sober = mapping_lib.get_mapping('variant', 'gamma', UNGD, dose='sober')
sober.columns = ['sober']
low = mapping_lib.get_mapping('variant', 'gamma', UNGD, dose='low')
low.columns = ['low']
high = mapping_lib.get_mapping('variant', 'gamma', UNGD, dose='high')
high.columns = ['high']
familymap = mapping_lib.get_mapping('variant', 'original', UNGD)

locusmap = mapping_lib.get_mapping('variant', 'locus_tag', UNGD)
genemap = mapping_lib.get_mapping('locus_tag', 'gene_name', UNGD)
geneids = genemap.loc[locusmap.locus_tag]
geneids.index = locusmap.index

data = pd.concat([familymap, sober, low, high, geneids], axis=1, sort=True)

for gene, group in data.groupby('gene_name'):
Example No. 14
UNGD = pathlib.Path('/home/jsh/ungd/proj/vecref')
_DIR_PREFIX = pathlib.Path(__file__).parents[1]
_CODEFILE = pathlib.Path(__file__).name
PLOTDIR = (UNGD / _CODEFILE).with_suffix('.plots')

_DATA_FRACTION = 1
_K_FOLD_SPLITS = 3
_REL_PLOT_MIN = -1.2
_REL_PLOT_MAX = 1
_BATCH_SIZE = 32
_EPOCHS = 10

######################
# Read Relgamma Data #
######################
data = mapping_lib.get_mapping('variant', 'relgamma', UNGD)

###############
# Filter Data #
###############
# Remove non-oneoff guides (parents, off-strand, controls, etc.)
data = training_lib.filter_for_training(data, UNGD)
data = data.dropna()

###############
# Weight Data #
###############
# binmap = mapping_lib.get_mapping('variant', 'rgbin', UNGD).loc[data.index]
# binweights = gamma_lib.weight_bins(binmap.rgbin)
# weightmap = binmap.rgbin.map(binweights)
# weightmap.name = 'binweight'
Example No. 15
#!/usr/bin/env python
# Author: John Hawkins (jsh) [[email protected]]

import logging
import pathlib

import mapping_lib
import gamma_lib

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')

UNGD = pathlib.Path('/home/jsh/ungd/proj/vecref')
DIR_PREFIX = pathlib.Path(__file__).parents[1]
_HORIA_FILE = 'horia.gammas.tsv'

pg = gamma_lib.get_parent_gammas(UNGD)
cg = gamma_lib.get_child_gammas(UNGD)
rg = gamma_lib.unfiltered_mean_relgammas(UNGD)
oo = mapping_lib.get_mapping('variant', 'is_oneoff', UNGD)
oi = oo.loc[oo.is_oneoff].index
mpg = pg.stack(level=0, dropna=False)['03'].unstack().mean(axis=1)
mcg = cg.stack(level=0, dropna=False)['03'].unstack().mean(axis=1)
rg['parent_gamma'] = mpg
rg['child_gamma'] = mcg
rg.loc[oi].to_csv(UNGD / _HORIA_FILE, sep='\t')
Example No. 16
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')

UNGD = pathlib.Path('/home/jsh/ungd/proj/vecref')
_DIR_PREFIX = pathlib.Path(__file__).parents[1]
_CODEFILE = pathlib.Path(__file__).name
PLOTDIR = (UNGD / _CODEFILE).with_suffix('.plots')

_REL_PLOT_MIN = -1.2
_REL_PLOT_MAX = 1

_FIGDPI = 100

############################
# Re-load/process raw data #
############################
data = mapping_lib.get_mapping('variant', 'relgamma', UNGD)
data = training_lib.filter_for_training(data, UNGD)
data = data.dropna()
familymap = mapping_lib.get_mapping('variant', 'original', UNGD)
familymap = familymap.loc[data.index]
encoder = training_lib.one_hot_pair_encoder(UNGD)
encodings = [encoder(x)[1] for x in data.index]
X = np.stack(encodings, axis=0)
y = np.array(data[['relgamma']], dtype=float)
cross_predictions = np.full_like(y, np.nan)

X_scaler = dict()
for i in range(X.shape[1]):
    for j in range(X.shape[3]):
        X_scaler[(i, j)] = skpreproc.StandardScaler()
        X[:, i, :, j] = X_scaler[(i, j)].fit_transform(X[:, i, :, j])
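Each (position, channel) slice gets its own StandardScaler, so at prediction time the same fitted scalers must be reused with `transform` rather than refit; a sketch (`X_new` is a hypothetical batch of new encodings):

for i in range(X_new.shape[1]):
    for j in range(X_new.shape[3]):
        X_new[:, i, :, j] = X_scaler[(i, j)].transform(X_new[:, i, :, j])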
Example No. 17
UNGD = pathlib.Path('/home/jsh/ungd/proj/vecref')
_DIR_PREFIX = pathlib.Path(__file__).parents[1]
_CODEFILE = pathlib.Path(__file__).name
PLOTDIR = (UNGD / _CODEFILE).with_suffix('.plots')

_REL_PLOT_MIN = -1.2
_REL_PLOT_MAX = 1

_FIGDPI = 300

############################
# Re-load/process raw data #
############################
data = eval_lib.fetch_training_data(UNGD)
familymap = mapping_lib.get_mapping('variant', 'original', UNGD)
familymap = familymap.loc[data.index]
encoder = training_lib.get_linear_encoder()
X, y = eval_lib.featurize_training_data(encoder, data, UNGD)
cross_predictions = np.full_like(y, np.nan)
y_orig = y
X_scaler, y_scaler, X, y = eval_lib.scale_training_data_linear(X, y)
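Since y is scaled here, model outputs must be mapped back through the fitted scaler before comparing against y_orig; a sketch (assuming `y_scaler` is an sklearn-style scaler and `model` is a trained regressor):

y_pred = y_scaler.inverse_transform(model.predict(X))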

########################
# Read Prediction Data #
########################
modeldir = training_lib.LINEAR_MODELDIR
model_template = 'model.{i}.h5'
coverage_template = 'model.{i}.coverage.pickle'

# Loop over models