def read_V1_data(data_file, learn_options=None):
    """Load the V1 guide data set together with its gene-position annotations.

    The first two sheets of the workbook are read (into ``human_data`` and
    ``mouse_data``), merged by ``combine_organisms``, and then restricted to
    the guides that also appear in the annotation file.

    Args:
        data_file: Path to the Excel workbook. ``None`` selects the default
            ``../data/07-23 Data for Jennifer.xlsx`` (relative to the current
            working directory).
        learn_options: Optional dict of options. When it is not ``None`` it
            must contain the key ``"flipV1target"``; a truthy value switches
            on a debug mode that overwrites ``Y['average threshold']`` with
            ``Y['average rank'] < 0.2`` and then stops in an ``ipdb``
            breakpoint.

    Returns:
        Tuple ``(annotations, gene_position, target_genes, Xdf, Y)`` where
        ``Xdf`` and ``Y`` share the same index (asserted before returning).

    Raises:
        AssertionError: If the indices of ``Xdf`` and ``Y`` differ.
        KeyError: If ``learn_options`` is given without ``"flipV1target"``.
    """
    if data_file is None:
        data_file = "../data/07-23 Data for Jennifer.xlsx"

    human_data = pandas.read_excel(data_file, sheetname=0, index_col=[0, 1])
    mouse_data = pandas.read_excel(data_file, sheetname=1, index_col=[0, 1])
    Xdf, Y = combine_organisms(human_data, mouse_data)

    # get position within each gene, then join and re-order
    # note that 11 missing guides we were told to ignore
    # Forward slashes (like the workbook path above) so the file is found on
    # POSIX systems as well as on Windows.
    annotations = pandas.read_csv("../data/AML_EL4_PercentRank_0725.txt", delimiter='\t', index_col=[0, 4])
    annotations.index.names = Xdf.index.names
    gene_position = pandas.merge(Xdf, annotations, how="inner", left_index=True, right_index=True)
    gene_position = util.impute_gene_position(gene_position)
    gene_position = gene_position[['Amino Acid Cut position', 'Nucleotide cut position', 'Percent Peptide']]

    # The inner merge drops guides without annotations; align Y and Xdf to
    # whatever survived it.
    Y = Y.loc[gene_position.index]
    Xdf = Xdf.loc[gene_position.index]

    Y['test'] = 1  # for bookkeeping, to keep consistent with V2 which uses this for "extra pairs"
    target_genes = Y['Target gene'].unique()

    Y.index.names = ['Sequence', 'Target gene']

    assert Xdf.index.equals(Y.index), "The index of Xdf is different from the index of Y (this can cause inconsistencies/random performance later on)"

    if learn_options is not None and learn_options["flipV1target"]:
        print("************************************************************************")
        print("*****************MATCHING DOENCH CODE (DEBUG MODE)**********************")
        print("************************************************************************")
        # normally it is: Y['average threshold'] = Y['average rank'] > 0.8, where 1s are good guides, 0s are not
        Y['average threshold'] = Y['average rank'] < 0.2  # 1s are bad guides
        print("press c to continue")
        # Deliberate interactive stop so nobody runs in this mode by accident.
        import ipdb
        ipdb.set_trace()

    return annotations, gene_position, target_genes, Xdf, Y
def _rank_score_frames(score_frames, prefix):
    """Rank every score frame with ``util.get_ranks`` and combine the results.

    Args:
        score_frames: Iterable of single-column ``'score'`` DataFrames, one
            per group that should be ranked independently.
        prefix: Passed through to ``util.get_ranks`` as ``prefix``.

    Returns:
        The rank, threshold and quantile outputs of ``util.get_ranks``, each
        stacked row-wise over all groups and then joined column-wise.
        (Assumes ``util.get_ranks`` returns DataFrames -- TODO confirm.)
    """
    ranks, thresholds, quantiles = [], [], []
    for scores in score_frames:
        # The raw rank (second return value) is not used.
        rank, _rank_raw, threshold, quantile = util.get_ranks(scores, thresh=0.8, prefix=prefix, flip=False)
        ranks.append(rank)
        thresholds.append(threshold)
        quantiles.append(quantile)

    # One concat per output instead of growing a frame inside the loop.
    stacked = (pandas.concat(ranks, axis=0),
               pandas.concat(thresholds, axis=0),
               pandas.concat(quantiles, axis=0))
    return pandas.concat(stacked, axis=1)


def read_V2_data(data_file, learn_options=None, verbose=True):
    """Load the V2 guide data set, one row per (sequence, target gene, drug).

    Args:
        data_file: Path to the Excel workbook; the ``"ResultsFiltered"`` sheet
            is always read and the ``"Normalized"`` sheet is read when variance
            weights are requested. ``None`` selects the default
            ``../data/11-15-2014 DeepXPR results_processed.xlsx``.
        learn_options: Optional dict. When not ``None`` it must contain
            ``'extra pairs'``, ``'all pairs'`` (mutually exclusive flags that
            add further genes to each drug) and ``"weighted"`` (the value
            ``"variance"`` adds a per-row ``variance`` column to ``Y``).
        verbose: When true, print a progress line for every gene loaded.

    Returns:
        Tuple ``(Xdf, drugs_to_genes, target_genes, Y, gene_position)``.
        ``Xdf`` and ``Y`` share the same index (asserted before returning);
        ``Y`` holds ``score``, ``Target gene``, ``test`` and the ranking
        columns produced by ``util.get_ranks``.

    Raises:
        AssertionError: If both ``'extra pairs'`` and ``'all pairs'`` are set,
            or if the indices of ``Xdf`` and ``Y`` differ at the end.
        KeyError: If ``learn_options`` is given but lacks a required key.
    """
    if data_file is None:
        data_file = "../data/11-15-2014 DeepXPR results_processed.xlsx"

    # to compare
    # import predict as pr; a1, g1, t1, X1, Y1 = pr.data_setup()
    # a1.index.names

    data = pandas.read_excel(data_file, sheetname="ResultsFiltered", skiprows=range(0, 6+1), index_col=[0, 4])

    # grab data relevant to each of three drugs, which excludes some genes
    # note gene MED12 has two drugs, all others have at most one

    # This comes from the "Pairs" tab in their excel sheet,
    # note HPRT/HPRT1 are same thing, and also PLX_2uM/PLcX_2uM
    known_pairs = {'AZD_200nM': ['CCDC101', 'MED12', 'TADA2B', 'TADA1'],
                   '6TG_2ug/mL': ['HPRT1'],
                   'PLX_2uM': ['CUL3', 'NF1', 'NF2', 'MED12']}

    # Starts out identical to known_pairs but is extended below; it is a
    # separate literal so that known_pairs stays untouched.
    drugs_to_genes = {'AZD_200nM': ['CCDC101', 'MED12', 'TADA2B', 'TADA1'],
                      '6TG_2ug/mL': ['HPRT1'],
                      'PLX_2uM': ['CUL3', 'NF1', 'NF2', 'MED12']}

    if learn_options is not None:
        assert not (learn_options['extra pairs'] and learn_options['all pairs']), "extra pairs and all pairs options (in learn_options) can't be active simultaneously."

        if learn_options['extra pairs']:
            drugs_to_genes['AZD_200nM'].extend(['CUL3', 'NF1', 'NF2'])
        elif learn_options['all pairs']:
            drugs_to_genes['AZD_200nM'].extend(['HPRT1', 'CUL3', 'NF1', 'NF2'])
            drugs_to_genes['PLX_2uM'].extend(['HPRT1', 'CCDC101', 'TADA2B', 'TADA1'])
            drugs_to_genes['6TG_2ug/mL'].extend(['CCDC101', 'MED12', 'TADA2B', 'TADA1', 'CUL3', 'NF1', 'NF2'])

    count = 0
    gene_frames = []
    for drug, genes in drugs_to_genes.items():
        for g in genes:
            # Copy only this gene's rows (not the whole sheet) before adding columns.
            Xtmp = data.xs(g, level='Target gene', drop_level=False).copy()
            Xtmp['drug'] = drug
            Xtmp['score'] = Xtmp[drug].copy()  # grab the drug results that are relevant for this gene

            # 'test' marks the pairs listed in known_pairs; genes added via
            # 'extra pairs' / 'all pairs' get 0.
            Xtmp['test'] = 1. if g in known_pairs[drug] else 0.

            count += Xtmp.shape[0]
            gene_frames.append(Xtmp)
            if verbose:
                print("Loaded %d samples for gene %s \ttotal number of samples: %d" % (Xtmp.shape[0], g, count))
    Xdf = pandas.concat(gene_frames, axis=0)

    # create new index that includes the drug
    Xdf = Xdf.set_index('drug', append=True)

    Y = pandas.DataFrame(Xdf.pop("score"))
    Y.columns.names = ["score"]

    test_gene = pandas.DataFrame(Xdf.pop('test'))
    target = pandas.DataFrame(Xdf.index.get_level_values('Target gene').values, index=Y.index, columns=["Target gene"])
    Y = pandas.concat((Y, target, test_gene), axis=1)
    target_genes = Y['Target gene'].unique()
    gene_position = Xdf[["Percent Peptide", "Amino Acid Cut position"]].copy()

    # convert to ranks for each (gene, drug combo)
    per_gene_scores = []
    for drug, genes in drugs_to_genes.items():
        for gene in genes:
            per_gene_scores.append(
                pandas.DataFrame(Y.xs((gene, drug), level=["Target gene", "drug"], drop_level=False)['score']))
    yall = _rank_score_frames(per_gene_scores, "score_drug_gene")
    Y = pandas.merge(Y, yall, how='inner', left_index=True, right_index=True)

    # convert also by drug only, irrespective of gene
    per_drug_scores = []
    for drug in drugs_to_genes:
        per_drug_scores.append(pandas.DataFrame(Y.xs(drug, level="drug", drop_level=False)['score']))
    yall = _rank_score_frames(per_drug_scores, "score_drug")
    Y = pandas.merge(Y, yall, how='inner', left_index=True, right_index=True)

    # to better understand, try plotting something like:
    # "score", "score_drug_gene_rank", "score_drug_rank",
    # "score_drug_gene_threshold", "score_drug_threshold"

    gene_position = util.impute_gene_position(gene_position)

    if learn_options is not None and learn_options["weighted"] == "variance":
        print("computing weights from replicate variance...")
        # compute the variance across replicates so can use it as a weight
        data = pandas.read_excel(data_file, sheetname="Normalized", skiprows=range(0, 6+1), index_col=[0, 4])
        data.index.names = ["Sequence", "Target gene"]

        # NOTE(review): 'Deep 29 ' carries a trailing space in the original;
        # presumably it matches the spreadsheet header -- verify before changing.
        experiments = {
            'AZD_200nM': ['Deep 25', 'Deep 27', 'Deep 29 ', 'Deep 31'],
            '6TG_2ug/mL': ['Deep 33', 'Deep 35', 'Deep 37', 'Deep 39'],
            'PLX_2uM': ['Deep 49', 'Deep 51', 'Deep 53', 'Deep 55'],
        }

        variances = []
        for drug, genes in drugs_to_genes.items():
            in_drug = data.index.get_level_values('Target gene').isin(genes)
            # Explicit copy so the column assignments below never target a
            # slice of ``data``.
            data_tmp = data.iloc[in_drug][experiments[drug]].copy()
            data_tmp["drug"] = drug
            data_tmp = data_tmp.set_index('drug', append=True)
            # After set_index only the replicate columns remain in .values.
            data_tmp["variance"] = np.var(data_tmp.values, axis=1)
            variances.append(data_tmp["variance"])
        variance = pandas.concat(variances, axis=0)

        # The merge may reorder rows; restore the order Xdf is in.
        orig_index = Y.index.copy()
        Y = pandas.merge(Y, pandas.DataFrame(variance), how="inner", left_index=True, right_index=True)
        Y = Y.loc[orig_index]
        print("done.")

    # Make sure to keep this check last in this function
    assert Xdf.index.equals(Y.index), "The index of Xdf is different from the index of Y (this can cause inconsistencies/random performance later on)"

    return Xdf, drugs_to_genes, target_genes, Y, gene_position