Esempio n. 1
0
def read_V1_data(data_file, learn_options):
    """Load the V1 guide data and the matching gene-position annotations.

    The first two sheets of the Excel workbook are read (the code refers to
    them as human and mouse data) and merged with ``combine_organisms``. The
    result is inner-joined on its index with a tab-separated annotation file
    so that only guides present in both survive; ``Xdf`` and ``Y`` are then
    re-indexed to that joined index.

    Args:
        data_file: Path to the Excel workbook. ``None`` selects the default
            ``"../data/07-23 Data for Jennifer.xlsx"`` (relative to the
            current working directory).
        learn_options: ``None`` or a dict containing the key
            ``"flipV1target"``. When that value is truthy,
            ``Y['average threshold']`` is overwritten with
            ``Y['average rank'] < 0.2`` and an interactive ``ipdb``
            breakpoint is entered (debug mode).

    Returns:
        Tuple ``(annotations, gene_position, target_genes, Xdf, Y)`` where
        ``gene_position`` holds the columns 'Amino Acid Cut position',
        'Nucleotide cut position' and 'Percent Peptide', and
        ``target_genes`` is the array of unique values of
        ``Y['Target gene']``.

    Raises:
        AssertionError: If the indices of ``Xdf`` and ``Y`` differ.
        KeyError: If ``learn_options`` is a dict without "flipV1target".
    """
    if data_file is None:
        data_file = "../data/07-23 Data for Jennifer.xlsx"
    human_data = pandas.read_excel(data_file, sheetname=0, index_col=[0, 1])
    mouse_data = pandas.read_excel(data_file, sheetname=1, index_col=[0, 1])
    Xdf, Y = combine_organisms(human_data, mouse_data)

    # get position within each gene, then join and re-order
    # note that 11 missing guides we were told to ignore
    # Forward slashes (as in the default workbook path above) work on both
    # POSIX and Windows; the previous backslash path only resolved on Windows.
    annotations = pandas.read_csv("../data/AML_EL4_PercentRank_0725.txt", delimiter='\t', index_col=[0, 4])
    annotations.index.names = Xdf.index.names
    gene_position = pandas.merge(Xdf, annotations, how="inner", left_index=True, right_index=True)
    gene_position = util.impute_gene_position(gene_position)
    gene_position = gene_position[['Amino Acid Cut position', 'Nucleotide cut position', 'Percent Peptide']]

    # Restrict (and re-order) both frames to the guides that survived the join.
    Y = Y.loc[gene_position.index]
    Xdf = Xdf.loc[gene_position.index]

    Y['test'] = 1  # for bookkeeping to keep consistent with V2 which uses this for "extra pairs"

    target_genes = Y['Target gene'].unique()

    Y.index.names = ['Sequence', 'Target gene']

    assert Xdf.index.equals(Y.index), "The index of Xdf is different from the index of Y (this can cause inconsistencies/random performance later on)"

    if learn_options is not None and learn_options["flipV1target"]:
        # Single-argument print(...) behaves identically under Python 2 and 3.
        print("************************************************************************")
        print("*****************MATCHING DOENCH CODE (DEBUG MODE)**********************")
        print("************************************************************************")
        # normally it is: Y['average threshold'] = Y['average rank'] > 0.8, where 1s are good guides, 0s are not
        Y['average threshold'] = Y['average rank'] < 0.2  # 1s are bad guides
        print("press c to continue")
        # Deliberate interactive stop for this debug mode; imported lazily so
        # ipdb is only required when the flag is set.
        import ipdb
        ipdb.set_trace()

    return annotations, gene_position, target_genes, Xdf, Y
Esempio n. 2
0
def read_V2_data(data_file, learn_options=None, verbose=True):
    """Load the V2 drug-screen data and build per-guide score/rank targets.

    Reads the "ResultsFiltered" sheet of the workbook, stacks the rows of
    every (drug, gene) pair listed in ``drugs_to_genes`` into one frame
    indexed additionally by 'drug', and derives rank-based target columns
    via ``util.get_ranks`` -- once per (gene, drug) combination and once per
    drug irrespective of gene.

    Args:
        data_file: Path to the Excel workbook. ``None`` selects the default
            ``"../data/11-15-2014 DeepXPR results_processed.xlsx"`` (relative
            to the current working directory).
        learn_options: ``None`` or a dict. When a dict is given, the keys
            'extra pairs', 'all pairs' and 'weighted' are all read:
            'extra pairs' / 'all pairs' (mutually exclusive) add further
            genes to each drug beyond the known pairs, and
            ``'weighted' == "variance"`` adds a 'variance' column to ``Y``
            computed across the replicate columns of the "Normalized" sheet.
        verbose: When truthy, print a progress line per loaded gene.

    Returns:
        Tuple ``(Xdf, drugs_to_genes, target_genes, Y, gene_position)``.
        ``Y`` contains the columns 'score', 'Target gene' and 'test' (1.0
        for known drug/gene pairs, 0.0 for added ones) plus the columns
        produced by ``util.get_ranks``; ``gene_position`` holds
        'Percent Peptide' and 'Amino Acid Cut position' after
        ``util.impute_gene_position``.

    Raises:
        AssertionError: If both 'extra pairs' and 'all pairs' are set, or if
            the final indices of ``Xdf`` and ``Y`` differ.
        KeyError: If ``learn_options`` is a dict missing one of the keys
            listed above.
    """
    if data_file is None:
        data_file = "../data/11-15-2014 DeepXPR results_processed.xlsx"

    data = pandas.read_excel(data_file, sheetname="ResultsFiltered", skiprows=range(0, 6+1), index_col=[0, 4])

    # grab data relevant to each of three drugs, which excludes some genes
    # note gene MED12 has two drugs, all others have at most one

    # This comes from the "Pairs" tab in their excel sheet,
    # note HPRT/HPRT1 are same thing, and also PLX_2uM/PLcX_2uM
    known_pairs = {'AZD_200nM':  ['CCDC101', 'MED12', 'TADA2B', 'TADA1'],
                   '6TG_2ug/mL': ['HPRT1'],
                   'PLX_2uM':    ['CUL3', 'NF1', 'NF2', 'MED12']}

    # Deliberately a second literal rather than a reference to known_pairs:
    # the lists below are extended in place, while known_pairs must stay
    # intact so that added pairs can be flagged with test == 0.
    drugs_to_genes = {'AZD_200nM':  ['CCDC101', 'MED12', 'TADA2B', 'TADA1'],
                      '6TG_2ug/mL': ['HPRT1'],
                      'PLX_2uM':    ['CUL3', 'NF1', 'NF2', 'MED12']}

    if learn_options is not None:
        assert not (learn_options['extra pairs'] and learn_options['all pairs']), "extra pairs and all pairs options (in learn_options) can't be active simultaneously."

        if learn_options['extra pairs']:
            drugs_to_genes['AZD_200nM'].extend(['CUL3', 'NF1', 'NF2'])
        elif learn_options['all pairs']:
            drugs_to_genes['AZD_200nM'].extend(['HPRT1', 'CUL3', 'NF1', 'NF2'])
            drugs_to_genes['PLX_2uM'].extend(['HPRT1', 'CCDC101', 'TADA2B', 'TADA1'])
            drugs_to_genes['6TG_2ug/mL'].extend(['CCDC101', 'MED12', 'TADA2B', 'TADA1', 'CUL3', 'NF1', 'NF2'])

    # Stack the rows of every (drug, gene) pair. The frame is grown
    # incrementally on purpose so row/column order matches earlier results.
    Xdf = pandas.DataFrame()
    count = 0
    for drug in drugs_to_genes:
        for g in drugs_to_genes[drug]:
            # Slice first, then copy only the slice (previously the whole
            # sheet was copied for every gene before slicing).
            Xtmp = data.xs(g, level='Target gene', drop_level=False).copy()
            Xtmp['drug'] = drug
            Xtmp['score'] = Xtmp[drug].copy()  # grab the drug results that are relevant for this gene
            Xtmp['test'] = 1. if g in known_pairs[drug] else 0.

            count += Xtmp.shape[0]
            Xdf = pandas.concat([Xdf, Xtmp], axis=0)
            if verbose:
                # Single-argument print(...) behaves identically under Python 2 and 3.
                print("Loaded %d samples for gene %s \ttotal number of samples: %d" % (Xtmp.shape[0], g, count))

    # create new index that includes the drug
    Xdf = Xdf.set_index('drug', append=True)

    # Move the target-related columns out of the feature frame into Y.
    Y = pandas.DataFrame(Xdf.pop("score"))
    Y.columns.names = ["score"]

    test_gene = pandas.DataFrame(Xdf.pop('test'))
    target = pandas.DataFrame(Xdf.index.get_level_values('Target gene').values, index=Y.index, columns=["Target gene"])
    Y = pandas.concat((Y, target, test_gene), axis=1)
    target_genes = Y['Target gene'].unique()
    gene_position = Xdf[["Percent Peptide", "Amino Acid Cut position"]].copy()

    # convert to ranks for each (gene, drug combo)
    per_gene_drug = [((gene, drug), ["Target gene", "drug"])
                     for drug in drugs_to_genes
                     for gene in drugs_to_genes[drug]]
    yall = _rank_scores_by_group(Y, per_gene_drug, "score_drug_gene")
    Y = pandas.merge(Y, yall, how='inner', left_index=True, right_index=True)

    # convert also by drug only, irrespective of gene
    per_drug = [(drug, "drug") for drug in drugs_to_genes]
    yall = _rank_scores_by_group(Y, per_drug, "score_drug")
    Y = pandas.merge(Y, yall, how='inner', left_index=True, right_index=True)

    # To better understand the targets, try plotting columns such as
    # "score", "score_drug_gene_rank", "score_drug_rank",
    # "score_drug_gene_threshold", "score_drug_threshold"
    # (presumably the names util.get_ranks derives from the prefixes -- confirm).

    gene_position = util.impute_gene_position(gene_position)

    if learn_options is not None and learn_options["weighted"] == "variance":
        print("computing weights from replicate variance...")
        # compute the variance across replicates so can use it as a weight
        data = pandas.read_excel(data_file, sheetname="Normalized", skiprows=range(0, 6+1), index_col=[0, 4])
        data.index.names = ["Sequence", "Target gene"]

        # Replicate columns per drug.
        # NOTE(review): the trailing space in 'Deep 29 ' presumably mirrors
        # the column header in the workbook -- confirm before "fixing" it.
        experiments = {}
        experiments['AZD_200nM'] = ['Deep 25', 'Deep 27', 'Deep 29 ', 'Deep 31']
        experiments['6TG_2ug/mL'] = ['Deep 33', 'Deep 35', 'Deep 37', 'Deep 39']
        experiments['PLX_2uM'] = ['Deep 49', 'Deep 51', 'Deep 53', 'Deep 55']

        variance = None
        for drug in drugs_to_genes:
            in_drug = data.index.get_level_values('Target gene').isin(drugs_to_genes[drug])
            # Explicit copy: a column is added below, which must not be
            # written into a chained slice of `data`.
            data_tmp = data.iloc[in_drug][experiments[drug]].copy()
            data_tmp["drug"] = drug
            # After this, 'drug' lives in the index, so .values below holds
            # only the replicate columns.
            data_tmp = data_tmp.set_index('drug', append=True)
            data_tmp["variance"] = np.var(data_tmp.values, axis=1)
            if variance is None:
                variance = data_tmp["variance"].copy()
            else:
                variance = pandas.concat((variance, data_tmp["variance"]), axis=0)

        orig_index = Y.index.copy()
        Y = pandas.merge(Y, pandas.DataFrame(variance), how="inner", left_index=True, right_index=True)
        # Restore the pre-merge row order; .loc replaces the deprecated .ix
        # and matches the label-based indexing used in read_V1_data.
        Y = Y.loc[orig_index]
        print("done.")

    # Make sure to keep this check last in this function
    assert Xdf.index.equals(Y.index), "The index of Xdf is different from the index of Y (this can cause inconsistencies/random performance later on)"

    return Xdf, drugs_to_genes, target_genes, Y, gene_position


def _rank_scores_by_group(Y, selections, prefix):
    """Run ``util.get_ranks`` on the 'score' of each selection and combine the results.

    Args:
        Y: Frame with a 'score' column and the index levels named in
            ``selections``.
        selections: Iterable of ``(key, level)`` pairs, each passed to
            ``Y.xs(key, level=level, drop_level=False)``.
        prefix: Forwarded to ``util.get_ranks`` as ``prefix``.

    Returns:
        Frame with the rank, threshold and quantized outputs of
        ``util.get_ranks`` side by side (column-wise concat), rows
        accumulated in the order of ``selections``.
    """
    y_rank = pandas.DataFrame()
    y_threshold = pandas.DataFrame()
    y_quant = pandas.DataFrame()
    for key, level in selections:
        ytmp = pandas.DataFrame(Y.xs(key, level=level, drop_level=False)['score'])
        # The second return value (raw ranks) is not used.
        y_ranktmp, _, y_thresholdtmp, y_quanttmp = util.get_ranks(ytmp, thresh=0.8, prefix=prefix, flip=False)
        y_rank = pandas.concat((y_rank, y_ranktmp), axis=0)
        y_threshold = pandas.concat((y_threshold, y_thresholdtmp), axis=0)
        y_quant = pandas.concat((y_quant, y_quanttmp), axis=0)

    return pandas.concat((y_rank, y_threshold, y_quant), axis=1)