def set_clique_n69(self):
    '''
    use the 69 PCL cliques curated by Rajiv as the class labels

    Stores the loaded group dictionary on self.pclDict, its values on
    self.all_group_cps and its keys on self.test_groups.

    Parameters
    ----------
    None
    '''
    # load in data for individual groups
    loader = ldc.label_loader()
    clique_dict = loader.load_clique_set_n69()
    self.pclDict = clique_dict
    # NOTE(review): the values are stored per group exactly as returned by
    # the loader (not flattened into a single list) - confirm callers
    # expect that shape
    self.all_group_cps = clique_dict.values()
    self.test_groups = clique_dict.keys()
def test_classes_incrementally(self, rnkpt_med_file, n_test_max=False):
    '''
    -start from the most internally consistent PCL - and move down the list
    -incrementally increase the number of groups added to the classifier
    -after each run, self.model_accuracy_across_cells is stored in
     self.n_group_accuracy, keyed by the number of groups modeled

    Parameters
    ----------
    rnkpt_med_file : str
        path to a tab-delimited file containing the median summly rankpoint
        values for each group (output from the pcla tool); the columns
        'PCL_group' and 'median_rankpt' are read
    n_test_max : int
        -max number of PCL groups to incorporate into the classifier
        -if set to False, all groups are tested

    Raises
    ------
    KeyError
        if a group listed in the rankpoint file is missing from the
        loaded TTD dictionary
    '''
    # load in data for individual groups
    llo = ldc.label_loader()
    self.pclDict = llo.load_TTD()

    # load pcl rankpoint file, highest median rankpoint first
    groupMedians = pd.read_csv(rnkpt_med_file, sep='\t')
    if hasattr(groupMedians, 'sort_values'):
        groupMedians = groupMedians.sort_values('median_rankpt', ascending=False)
    else:
        # pandas releases that predate DataFrame.sort_values
        groupMedians = groupMedians.sort('median_rankpt', ascending=False)

    # make sure compounds are not counted more than once: a compound stays
    # only in the best-ranked group that contains it. Each group is rebuilt
    # as a new list rather than calling remove() on the list being iterated,
    # which skipped the element following every removal and let some shared
    # compounds through.
    seenCompounds = set()
    reducedPCLDict = {}
    for key in groupMedians['PCL_group']:
        kept = [brd for brd in self.pclDict[key] if brd not in seenCompounds]
        reducedPCLDict[key] = kept
        # updated only after the whole group is filtered, so repeats inside
        # a single group are left untouched (as before)
        seenCompounds.update(kept)
    self.pclDict = reducedPCLDict

    # set increment of groups
    if n_test_max:
        max_groups = n_test_max
    else:
        max_groups = groupMedians.shape[0]

    n_group_accuracy = {}
    # at least two classes are needed, so start from the top 2 groups
    for n_groups in np.arange(2, max_groups + 1):
        # single parenthesised argument: prints the same text under
        # Python 2 and Python 3
        print("testing " + str(n_groups) + " number of classes")
        testGroups = groupMedians['PCL_group'][:n_groups].values
        self.test_groups = testGroups
        self.classification_across_cell(groups_to_model=testGroups,
                                        loo_type='by_cp',
                                        max_signatures_per_cp=3)
        n_group_accuracy[n_groups] = self.model_accuracy_across_cells
    self.n_group_accuracy = n_group_accuracy
def test_classes_incrementally(self, rnkpt_med_file, n_test_max=False):
    '''
    -start from the most internally consistent PCL - and move down the list
    -incrementally increase the number of groups added to the classifier
    -after each run, self.model_accuracy_across_cells is stored in
     self.n_group_accuracy, keyed by the number of groups modeled

    Parameters
    ----------
    rnkpt_med_file : str
        path to a tab-delimited file containing the median summly rankpoint
        values for each group (output from the pcla tool); the columns
        'PCL_group' and 'median_rankpt' are read
    n_test_max : int
        -max number of PCL groups to incorporate into the classifier
        -if set to False, all groups are tested

    Raises
    ------
    KeyError
        if a group listed in the rankpoint file is missing from the
        loaded TTD dictionary
    '''
    # load in data for individual groups
    llo = ldc.label_loader()
    self.pclDict = llo.load_TTD()

    # load pcl rankpoint file, highest median rankpoint first
    groupMedians = pd.read_csv(rnkpt_med_file, sep='\t')
    if hasattr(groupMedians, 'sort_values'):
        groupMedians = groupMedians.sort_values('median_rankpt', ascending=False)
    else:
        # pandas releases that predate DataFrame.sort_values
        groupMedians = groupMedians.sort('median_rankpt', ascending=False)

    # make sure compounds are not counted more than once: a compound stays
    # only in the best-ranked group that contains it. Each group is rebuilt
    # as a new list rather than calling remove() on the list being iterated,
    # which skipped the element following every removal and let some shared
    # compounds through.
    seenCompounds = set()
    reducedPCLDict = {}
    for key in groupMedians['PCL_group']:
        kept = [brd for brd in self.pclDict[key] if brd not in seenCompounds]
        reducedPCLDict[key] = kept
        # updated only after the whole group is filtered, so repeats inside
        # a single group are left untouched (as before)
        seenCompounds.update(kept)
    self.pclDict = reducedPCLDict

    # set increment of groups
    if n_test_max:
        max_groups = n_test_max
    else:
        max_groups = groupMedians.shape[0]

    n_group_accuracy = {}
    # at least two classes are needed, so start from the top 2 groups
    for n_groups in np.arange(2, max_groups + 1):
        # single parenthesised argument: prints the same text under
        # Python 2 and Python 3
        print("testing " + str(n_groups) + " number of classes")
        testGroups = groupMedians['PCL_group'][:n_groups].values
        self.test_groups = testGroups
        self.classification_across_cell(groups_to_model=testGroups,
                                        loo_type='by_cp',
                                        max_signatures_per_cp=3)
        n_group_accuracy[n_groups] = self.model_accuracy_across_cells
    self.n_group_accuracy = n_group_accuracy
def set_classes(self):
    '''
    specify source of class labels: five hand-picked TTD groups

    Stores the full TTD dictionary on self.pclDict, the selected group
    names on self.test_groups and the members of those groups, joined
    into one flat list, on self.all_group_cps.

    Parameters
    ----------
    None
    '''
    # load in data for individual groups
    loader = ldc.label_loader()
    self.pclDict = loader.load_TTD()

    # the 5 groups picked as the best inter-connectors
    selected = [
        'Histone_deacetylase_1-Inhibitor',
        'Glucocorticoid_receptor-Agonist',
        'Proto-oncogene_tyrosine-protein_kinase_ABL1-Inhibitor',
        'Phosphatidylinositol-4,5-bisphosphate_3-kinase_catalytic_subunit,_delta_isoform-Inhibitor',
        '3-hydroxy-3-methylglutaryl-coenzyme_A_reductase-Inhibitor',
    ]

    # members of every selected group, in selection order
    members = [brd for name in selected for brd in self.pclDict[name]]
    self.all_group_cps = members
    self.test_groups = selected
def set_classes(self):
    '''
    specify source of class labels: five hand-picked TTD groups

    Stores the full TTD dictionary on self.pclDict, the selected group
    names on self.test_groups and the members of those groups, joined
    into one flat list, on self.all_group_cps.

    Parameters
    ----------
    None
    '''
    # load in data for individual groups
    loader = ldc.label_loader()
    self.pclDict = loader.load_TTD()

    # the 5 groups picked as the best inter-connectors
    selected = [
        'Histone_deacetylase_1-Inhibitor',
        'Glucocorticoid_receptor-Agonist',
        'Proto-oncogene_tyrosine-protein_kinase_ABL1-Inhibitor',
        'Phosphatidylinositol-4,5-bisphosphate_3-kinase_catalytic_subunit,_delta_isoform-Inhibitor',
        '3-hydroxy-3-methylglutaryl-coenzyme_A_reductase-Inhibitor',
    ]

    # members of every selected group, in selection order
    members = [brd for name in selected for brd in self.pclDict[name]]
    self.all_group_cps = members
    self.test_groups = selected
##
# gmo = gm.GeneMod()
# gmo.load_data_from_gctx(src=cmap.score_path,symbols='TRIB1')
# gmo.z_score_filter(4)
# gmo.signature_strength_filter(8)
# gmo.cell_type_filter('HEPG2')
# # gmo.sc_plot(out=wkdir+'sc_hits.png')
# gmo.scatter()
# gmo.expression_histogram()
# gmo.ssr_plot(out=wkdir+'ssr_hits.png')
# cid = [gmo.cid[x] for x in gmo.reg_ind]

# Load the TTD groups and the drugbank gene -> compounds annotations.
# The label-loader module is reloaded first so that edits made to it
# during an interactive session are picked up.
reload(ldc)
llo = ldc.label_loader()
pclDict = llo.load_TTD()
dbDict = llo.load_drugbank_by_gene(group_by_action=False)
geneTargets = dbDict.keys()

# Flatten the drug-gene relationships into (compound, gene) pairs,
# one pair per compound listed under each gene.
tupList = []
for gene, cps in dbDict.items():
    for cp in cps:
        tupList.append((cp, gene))

# Tabulate the pairs, then index the gene column by compound so it can be
# turned into a compound -> gene lookup.
dbFrm = pd.DataFrame(tupList, columns=['brd', 'target_gene'])
dbSer = pd.Series(dbFrm['target_gene'])
dbSer.index = dbFrm['brd']
# NOTE(review): a dict holds one value per key, so a compound listed under
# several genes ends up mapped to just one of them - confirm that is
# acceptable for the downstream use.
cpToGenDict = dbSer.to_dict()
## pick 5 groups - best inter-connectors
# testGroups = ['Histone_deacetylase_1-Inhibitor',
#               'Glucocorticoid_receptor-Agonist',
#               'Proto-oncogene_tyrosine-protein_kinase_ABL1-Inhibitor',
#               'Phosphatidylinositol-4,5-bisphosphate_3-kinase_catalytic_subunit,_delta_isoform-Inhibitor',
#               '3-hydroxy-3-methylglutaryl-coenzyme_A_reductase-Inhibitor']

# load in top groups
wkdir = '/xchip/cogs/hogstrom/analysis/scratch'
if not os.path.exists(wkdir):
    os.mkdir(wkdir)

# make pso object; it is aliased to `self` so that statements copied from
# the class methods can be run unchanged at script level
pso = psc.svm_pcla(out=wkdir)
self = pso
llo = ldc.label_loader()
self.pclDict = llo.load_TTD()

# load pcl rankpoint file, highest median rankpoint first
rnkpt_med_file = '/xchip/cogs/projects/pharm_class/TTd_Oct29/PCL_group_rankpt_medians.txt'
groupMedians = pd.read_csv(rnkpt_med_file, sep='\t')
if hasattr(groupMedians, 'sort_values'):
    groupMedians = groupMedians.sort_values('median_rankpt', ascending=False)
else:
    # pandas releases that predate DataFrame.sort_values
    groupMedians = groupMedians.sort('median_rankpt', ascending=False)

# make sure compounds are not counted more than once: a compound stays only
# in the best-ranked group that contains it. The previous loop called
# remove() on the list it was iterating, which skipped the element following
# every removal and let some shared compounds through.
extendedCompoundList = []
seenCompounds = set()
reducedPCLDict = {}
for key in groupMedians['PCL_group']:
    value = self.pclDict[key]
    # slice assignment filters the list in place, so self.pclDict keeps
    # seeing the reduced groups just as it did with remove()
    value[:] = [brd for brd in value if brd not in seenCompounds]
    reducedPCLDict[key] = value
    extendedCompoundList.extend(value)
    seenCompounds.update(value)