def load_pcls():
    """Load the pre-curated pharmacological class (PCL) groups.

    Reads the PCL GMT file, normalizes every group id (the characters
    ``/ - & ? ( ) '`` and spaces become underscores, then the id is
    lower-cased), and keeps only the groups named in the curated keepers
    file that are also present in the GMT.

    Returns:
        pandas.Series indexed by normalized group name whose values are the
        ``sig`` entries of the GMT records (presumably lists of BRD ids --
        confirm against the GMT file).
    """
    classGMT = '/xchip/cogs/projects/pharm_class/pcl_shared_target_pid.gmt'
    gmtDict = gmt.read(classGMT)
    drugLabels = pd.DataFrame(gmtDict)
    # One character class handles every substitution; inside the class
    # "?", "(" and ")" are literals, whereas passing them on their own as
    # regex patterns is rejected as malformed.
    drugLabels['id'] = (
        drugLabels['id']
        .str.replace(r"[/\- &?()']", "_", regex=True)
        .str.lower()
    )
    # load curated list of groups
    curatedFile = '/xchip/cogs/hogstrom/analysis/scratch/pcl_keepers_mod_currated.txt'
    curFrm = pd.read_csv(curatedFile, header=None)
    curFrm.columns = ['curated_groups']
    dlSer = pd.Series(data=drugLabels['sig'])
    dlSer.index = drugLabels['id']
    # keep only the curated groups that actually occur in the GMT
    isCur = curFrm.curated_groups.isin(dlSer.index)
    # .loc replaces the removed .ix indexer (boolean row mask + column label)
    curGroups = dlSer.reindex(curFrm.loc[isCur, 'curated_groups'])
    return curGroups
Ejemplo n.º 2
0
    def load_clique_set_n69(self):
        """Return the drug label set curated by Rajiv as a dict of sets.

        Reads the n69 clique GMT file and maps each record's ``id`` to the
        set of entries in that record's ``sig`` field.
        """
        # clique annotations live in a single GMT file
        gmt_path = '/xchip/cogs/sig_tools/sig_cliquescore_tool/sample/cp_clique_n69/clique.gmt'
        clique_frame = pd.DataFrame(gmt.read(gmt_path))
        # one entry per row: clique id -> set of its members
        return {
            row['id']: set(row['sig'])
            for _, row in clique_frame.iterrows()
        }
Ejemplo n.º 3
0
    def load_clique_set_n69(self):
        """Load the drug label set curated by Rajiv.

        Returns a dict keyed by the ``id`` of each record in the n69 clique
        GMT file; each value is the set built from that record's ``sig``.
        """
        source_gmt = '/xchip/cogs/sig_tools/sig_cliquescore_tool/sample/cp_clique_n69/clique.gmt'
        records = pd.DataFrame(gmt.read(source_gmt))
        # walk the frame row by row and collect the grouping structure
        return {
            record['id']: set(record['sig'])
            for _, record in records.iterrows()
        }
### add lines for gct headers
# NOTE(review): line_pre_adder is defined outside this excerpt; presumably it
# prepends a line to outFile, so the "#1.2" tag written last ends up first -- confirm.
line_pre_adder(outFile, '\t'.join([str(mtrx.shape[0]), str(mtrx.shape[1] - 1)]))
line_pre_adder(outFile, "#1.2")

### make gmts of gene shRNAs
# one record per pert_id: id and desc are both the pert_id, sig lists the
# sig_ids that fall in that group
geneGrped = annt.groupby('pert_id')
gmtList = [
    {'id': pertId, 'desc': pertId, 'sig': list(grpFrm.sig_id.values)}
    for pertId, grpFrm in geneGrped
]
# gmtOut = wkdir + '/gene_shRNA_sig_id.gmt'
gmtOut = wkdir + '/gene_oe_sig_id.gmt'
gmt.write(gmtList, gmtOut)

### load core drivers - save sig_ids to new gmt
gFile = wkdir + '/core_lung_drivers.gmt'
coreGMT = gmt.read(gFile)
coreOE = coreGMT['sig']
# annotation rows whose pert_id is one of the core drivers
coreFrm = annt[annt.pert_id.isin(coreOE)]
sig_ids = list(coreFrm.sig_id.values)
gmtDict = {
    'id': 'core_lung_drivers',
    'desc': 'core_lung_drivers',
    'sig': sig_ids,
}
gmtOut = wkdir + '/core_lung_drivers_sig_id.gmt'
gmt.write([gmtDict], gmtOut)

Ejemplo n.º 5
0
import copy
from matplotlib import cm
from statsmodels.distributions import ECDF

from cmap.analytics.statsig import ConnectivitySignificance
from cmap.io import gct
from cmap.io import gmt
import cmap.util.progress as update

wkdir = '/xchip/cogs/projects/connectivity/null/clique_analysis/clique_vs_dmso_null'
# create the working directory on first use
if not os.path.exists(wkdir):
    os.mkdir(wkdir)

# load cliques
classGMT = '/xchip/cogs/projects/pharm_class/cp_cliques_current.gmt'
gmtDict = gmt.read(classGMT)
cliqueLabels = pd.DataFrame(gmtDict)
# pool the members of every clique into one flat list, then de-duplicate
cList = []
for cliqueMembers in cliqueLabels['sig']:
    cList.extend(cliqueMembers)
cSet = set(cList)

# load observed score data
# thresholded
# rFile = '/xchip/cogs/projects/connectivity/null/clique_analysis/dmso_q_thresholded_asym_lass_matrix/jan28/my_analysis.sig_cliqueselect_tool.2014012814320559/summly/self_rankpt_n379x379.gctx'
# non-thresholded asym
rFile = '/xchip/cogs/projects/connectivity/null/clique_analysis/baseline_lass_asym_matrix/jan28/my_analysis.sig_cliqueselect_tool.2014012814364180/summly/self_rankpt_n379x379.gctx'
gt1 = gct.GCT()
gt1.read(rFile)
sFrm = gt1.frame
# relabel the columns with the 'pert_id' column metadata
sFrm.columns = gt1.get_column_meta('pert_id')
#check that all clique members are in the observed matrix
Ejemplo n.º 6
0
    sigInfoFrm['labels'] = np.nan
    sigInfoFrm['pcl_name'] = 'null'
    for igroup,group in enumerate(test_groups):
        grpMembers = pclDict[group]
        iMatch = sigInfoFrm['pert_id'].isin(grpMembers)
        sigInfoFrm['labels'][iMatch] = igroup
        sigInfoFrm['pcl_name'][iMatch] = group
    return sigInfoFrm

wkdir = '/xchip/cogs/projects/NMF/clique_n69_all_cell_lines'
# create the working directory on first use
if not os.path.exists(wkdir):
    os.mkdir(wkdir)

# load in clique annotations and matrix
cFile = '/xchip/cogs/sig_tools/sig_cliquescore_tool/sample/cp_clique_n69/clique.gmt'
cliqueGMT = gmt.read(cFile)
cliqFrm = pd.DataFrame(cliqueGMT)

# set grouping structures: clique id -> set of that clique's sig entries
pclDict = {row['id']: set(row['sig']) for _, row in cliqFrm.iterrows()}

# pool the members of every group, add 'DMSO', then de-duplicate
brdAllGroups = [brd for group in pclDict for brd in pclDict[group]]
brdAllGroups.append('DMSO')
brdAllGroups = list(set(brdAllGroups))
testGroups = cliqFrm['id'].values

# extract signatures and expression data for every group member
# write the matrix as tab-separated text, then add the header lines
mtrx.to_csv(outFile, sep='\t')
### add lines for gct headers
# NOTE(review): line_pre_adder is defined outside this excerpt; presumably it
# prepends a line to outFile, so the "#1.2" tag written last ends up first -- confirm.
line_pre_adder(outFile, '\t'.join([str(mtrx.shape[0]), str(mtrx.shape[1] - 1)]))
line_pre_adder(outFile, "#1.2")

### make gmts of gene shRNAs
# one record per pert_id: id and desc are both the pert_id, sig lists the
# sig_ids that fall in that group
geneGrped = annt.groupby('pert_id')
gmtList = [
    {'id': pertId, 'desc': pertId, 'sig': list(grpFrm.sig_id.values)}
    for pertId, grpFrm in geneGrped
]
# gmtOut = wkdir + '/gene_shRNA_sig_id.gmt'
gmtOut = wkdir + '/gene_oe_sig_id.gmt'
gmt.write(gmtList, gmtOut)

### load core drivers - save sig_ids to new gmt
gFile = wkdir + '/core_lung_drivers.gmt'
coreGMT = gmt.read(gFile)
coreOE = coreGMT['sig']
# annotation rows whose pert_id is one of the core drivers
coreFrm = annt[annt.pert_id.isin(coreOE)]
sig_ids = list(coreFrm.sig_id.values)
gmtDict = {
    'id': 'core_lung_drivers',
    'desc': 'core_lung_drivers',
    'sig': sig_ids,
}
gmtOut = wkdir + '/core_lung_drivers_sig_id.gmt'
gmt.write([gmtDict], gmtOut)
Ejemplo n.º 8
0
    'A375', 'A549', 'HA1E', 'HCC515', 'HEPG2', 'HT29', 'MCF7', 'PC3', 'VCAP'
]  # cmap 'core' cell lines
basedir = '/xchip/cogs/projects/NMF/NMF_drug_shRNA'

# drug_gene_list = '/xchip/cogs/projects/target_id/drug_gene_connections_20Mar2014/expected_drug_gene_connection_ranks.txt'
# dg = pd.read_csv(drug_gene_list,sep='\t')
# dg_connected = dg[dg.connection_rank <= 10]

cpd_targets_n368_file = '/xchip/cogs/sig_tools/sig_cliqueselect_tool/sample/cpd_targets_n368/summly/self_connectivity.txt'
n368 = pd.read_csv(cpd_targets_n368_file, sep='\t')
median_rnkpt_thresh = 73
# groups whose median_rankpt reaches the threshold
cp_connected = n368.loc[n368['median_rankpt'] >= median_rnkpt_thresh]

# load in clique annotations and matrix
cFile = '/xchip/cogs/projects/pharm_class/rnwork/cliques/cpd_targets_n368.gmt'
cliqueGMT = gmt.read(cFile)
cliqFrm = pd.DataFrame(cliqueGMT)
# limit only to drug-gene groups that have coherence
cliqFrm = cliqFrm.loc[cliqFrm['id'].isin(cp_connected['group_id'])]

# write a new, shorter gmt file holding only the records that survived the filter
keptDescs = cliqFrm['desc'].values
gmtUpdate = [record for record in cliqueGMT if record['desc'] in keptDescs]
outF = basedir + '/n69_drug_targets.gmt'
gmt.write(gmtUpdate, outF)

### set parameters
probeSpace = 'lm_epsilon'  # lm_epsilon or bing
nDMSO = 50
nKeep = 2  # number of signatures per drug
for cell in cellList:
    print(cell)
Ejemplo n.º 9
0


# run through MSIG_DB

### EMT questions
# 1) enrichment for any a priori genes
# 2) stability across multiple signatures or TFs
# 3) any drug class enrichment? 

# run up/dn signatures through MSIGDB - do they capture other emt signatures? 


# load kegg pathways
file_kegg = '/xchip/cogs/hogstrom/bathe/gordonov/c2.cp.kegg.v4.0.symbols.gmt'
gt = gmt.read(file_kegg)
keggFrm = pd.DataFrame(gt)
# take the sig entry of the first row whose id is the actin-cytoskeleton pathway
isActin = keggFrm['id'] == 'KEGG_REGULATION_OF_ACTIN_CYTOSKELETON'
GeneList = list(keggFrm.loc[isActin, 'sig'].values[0])

###
aprioriList = ['RAC1',
'CDC42', 
'RHOA',
'ROCK1',
'RICS',
'RHOA',
'PRKCA',
'PIK3CA',
'ARPC1A',
'MAPK',
### shRNA
# anntFrm = pd.read_csv(aFile,sep='\t',index_col=1) #,header=True)
# headers= ['sig2','pert_id']
# NOTE(review): within this excerpt anntFrm and headers are only assigned in
# the commented-out lines above -- confirm they are defined earlier in the file.
anntFrm.columns = headers
anntFrm.index.name = 'sig1'
# drop extra rows
anntFrm = anntFrm[anntFrm.index.isin(Hmtrx.index)]  # leave out annotations not in matrix
### read in mutual information matrices
# skiprows drops the first two lines of each .gct file; the first column becomes the index
mFile = sourceDir + '/' + prefix + '/' + modzPrefix + '.MI.input_space.gct'
mi = pd.read_csv(mFile, sep='\t', skiprows=[0, 1], index_col=0)  # ,header=True
# axis is passed by keyword: current pandas rejects the positional form
mi = mi.drop('Description', axis=1)
cFile = sourceDir + '/' + prefix + '/' + modzPrefix + '.MI.k' + str(nComponents) + '.gct'
cmi = pd.read_csv(cFile, sep='\t', skiprows=[0, 1], index_col=0)  # ,header=True
cmi = cmi.drop('Description', axis=1)
### load in clique annotations and matrix
cliqueGMT = gmt.read(gmtFile)
# NOTE(review): the gmt.read result is wrapped in a list before building the
# frame -- confirm this matches the shape group_component_maps expects.
cliqFrm = pd.DataFrame([cliqueGMT])
#########################################
### graph individual group components ###
#########################################
group_component_maps(Hmtrx, cliqFrm, graphDir)
# # ##############################
# # ### top component analysis ###
# # ##############################
# # take the mean of the top 3 components for each group member
# topMeanFrm = combine_group_top_components(Hmtrx,cliqFrm,metric='mean')
# # ##############################
# # ### build null distribution ##
# # ##############################
# # repeat metric - but shuffle signatures from groups of equal size
# nullMean = build_combine_null(Hmtrx,cliqFrm,topMeanFrm,nTop=3,nPerm=4000)
Ejemplo n.º 11
0


# run through MSIG_DB

### EMT questions
# 1) enrichment for any a priori genes
# 2) stability across multiple signatures or TFs
# 3) any drug class enrichment? 

# run up/dn signatures through MSIGDB - do they capture other emt signatures? 


# load kegg pathways
file_kegg = '/xchip/cogs/hogstrom/bathe/gordonov/c2.cp.kegg.v4.0.symbols.gmt'
gt = gmt.read(file_kegg)
keggFrm = pd.DataFrame(gt)
# take the sig entry of the first row whose id is the actin-cytoskeleton pathway
isActin = keggFrm['id'] == 'KEGG_REGULATION_OF_ACTIN_CYTOSKELETON'
GeneList = list(keggFrm.loc[isActin, 'sig'].values[0])

###
aprioriList = ['RAC1',
'CDC42', 
'RHOA',
'ROCK1',
'RICS',
'RHOA',
'PRKCA',
'PIK3CA',
'ARPC1A',
'MAPK',
import copy
from matplotlib import cm
from statsmodels.distributions import ECDF

from cmap.analytics.statsig import ConnectivitySignificance
from cmap.io import gct
from cmap.io import gmt
import cmap.util.progress as update

wkdir = '/xchip/cogs/projects/connectivity/null/clique_analysis/clique_vs_dmso_null'
# create the working directory on first use
if not os.path.exists(wkdir):
    os.mkdir(wkdir)

# load cliques
classGMT = '/xchip/cogs/projects/pharm_class/cp_cliques_current.gmt'
gmtDict = gmt.read(classGMT)
cliqueLabels = pd.DataFrame(gmtDict)
# pool the members of every clique into one flat list, then de-duplicate
cList = []
for cliqueMembers in cliqueLabels['sig']:
    cList.extend(cliqueMembers)
cSet = set(cList)

# load observed score data
# thresholded
# rFile = '/xchip/cogs/projects/connectivity/null/clique_analysis/dmso_q_thresholded_asym_lass_matrix/jan28/my_analysis.sig_cliqueselect_tool.2014012814320559/summly/self_rankpt_n379x379.gctx'
# non-thresholded asym
rFile = '/xchip/cogs/projects/connectivity/null/clique_analysis/baseline_lass_asym_matrix/jan28/my_analysis.sig_cliqueselect_tool.2014012814364180/summly/self_rankpt_n379x379.gctx'
gt1 = gct.GCT()
gt1.read(rFile)
sFrm = gt1.frame
# relabel the columns with the 'pert_id' column metadata
sFrm.columns = gt1.get_column_meta('pert_id')
#check that all clique members are in the observed matrix
Ejemplo n.º 13
0
import matplotlib
from cmap.analytics.pert_explorer import PertExplorer
from cmap.analytics.cluster import HClust
import cmap.io.gmt as gmt
from matplotlib import cm
from cmap.analytics.queryer import Queryer
import cmap.analytics.gppa as gppa
from cmap.io import queryresult

wkdir = '/xchip/cogs/projects/target_id/pathway_clustering/lfcg_gppa_KD_spearman'
# create the working directory (and any missing parents) on first use
if not os.path.exists(wkdir):
    os.makedirs(wkdir)

# pathway annotations from reactome
pathGMT = '/xchip/cogs/projects/target_id/KD_pathway_clustering/ReactomePathways.gmt'
gmtDict = gmt.read(pathGMT)
# pathway id -> that record's sig entry
pathwayDict = {entry['id']: entry['sig'] for entry in gmtDict}

# aprioriPathways = ['Cholesterol biosynthesis',
#   'p53-Dependent G1 DNA Damage Response',
#   'p53-Dependent G1/S DNA damage checkpoint',
#   'Antigen processing: Ubiquitination & Proteasome degradation',
#   'Regulation of activated PAK-2p34 by proteasome mediated degradation',
#   'Signaling by TGF-beta Receptor Complex',
#   'mTOR signalling']

# Lessons from the cancer genome pathways:
lfcgPathways = [
    'p38MAPK events', 'ERK/MAPK targets',
Ejemplo n.º 14
0
# build one combined label per entry: "<pert_iname>.<pert_type>"
# (pInameType is created before this excerpt)
for i,x in enumerate(pInames):
    pInameType.append(pInames[i]+ '.' +pType[i])
# annotation table indexed by the combined labels built above
anntFrm = pd.DataFrame({'pert_id':pIDs,'pert_type':pType,'pert_iname':pInames},index=pInameType)
# Series holding summFrm's column labels, indexed by summFrm's row labels
sigSer = pd.Series(index=summFrm.index, data=summFrm.columns)
outGRP = wkdir + '/summly_matched_ids.grp'
# write the values only, without the index column
sigSer.to_csv(outGRP,index=False)

####################
### load cliques ###
####################

# groupGMT = '/xchip/cogs/projects/pharm_class/rnwork/cliques/cpd_groups_n147.gmt'
groupGMT = '/xchip/cogs/sig_tools/sig_cliqueselect_tool/sample/pcl_20140213/cliques.gmt'
# groupGMT = '/xchip/cogs/sig_tools/sig_cliqueselect_tool/sample/pcl_20140221/cliques.gmt'

cliqueGMT = gmt.read(groupGMT)
cliqFrm = pd.DataFrame(cliqueGMT)
# number of entries in each group's sig field
cliqFrm['group_size'] = cliqFrm.sig.apply(len)
cliqFrm.index = cliqFrm['desc']
# str.replace returns a new Series, so the result must be stored for the
# "/" -> "-" substitution to take effect (the statement also has to sit at
# column 0 to parse).
# NOTE(review): no 'Name' column is created in the visible code -- confirm it
# is supplied by the GMT records.
cliqFrm['Name'] = cliqFrm['Name'].str.replace("/", "-")

# which compounds are clique members vs. non-members
# flatten every group's sig entries into one list, then de-duplicate
cliqMemberLong = []
for members in cliqFrm['sig'].values:
    cliqMemberLong.extend(members)
cliqMemb = list(set(cliqMemberLong))
isMemb = anntFrm['pert_id'].isin(cliqMemb)
isCp = anntFrm['pert_type'] == 'trt_cp'
# rows typed 'trt_cp' whose pert_id is in no clique
nonMembFrm = anntFrm[isCp & ~isMemb]
nonMemb = nonMembFrm.index.values
nonMid = nonMembFrm.pert_id.values

#########################################
### load sig_cliquescore_tool results ###
import matplotlib
from cmap.analytics.pert_explorer import PertExplorer
from cmap.analytics.cluster import HClust
import cmap.io.gmt as gmt
from matplotlib import cm
from cmap.analytics.queryer import Queryer
import cmap.analytics.gppa as gppa
from cmap.io import queryresult

wkdir = '/xchip/cogs/projects/target_id/pathway_clustering/lfcg_gppa_KD_spearman'
# create the working directory (and any missing parents) on first use
if not os.path.exists(wkdir):
    os.makedirs(wkdir)

# pathway annotations from reactome
pathGMT = '/xchip/cogs/projects/target_id/KD_pathway_clustering/ReactomePathways.gmt'
gmtDict = gmt.read(pathGMT)
# pathway id -> that record's sig entry
pathwayDict = {record['id']: record['sig'] for record in gmtDict}

# aprioriPathways = ['Cholesterol biosynthesis', 
#   'p53-Dependent G1 DNA Damage Response', 
#   'p53-Dependent G1/S DNA damage checkpoint', 
#   'Antigen processing: Ubiquitination & Proteasome degradation', 
#   'Regulation of activated PAK-2p34 by proteasome mediated degradation', 
#   'Signaling by TGF-beta Receptor Complex',
#   'mTOR signalling']

# Lessons from the cancer genome pathways:
lfcgPathways = ['p38MAPK events',
  'ERK/MAPK targets',