def run_msigdbr(outdir):
    """Download msigdb gene sets with the R package msigdbr and write them to a csv.

    Builds an R script that, for each (cell type, gene set name) pair yielded by
    msigdb_sets('UP'), pulls the human gene symbols in that gene set, tags them with
    the cell type, and appends them to one data frame with columns "gene" and "type".
    The combined data frame is written to <outdir>/msigdb-markers.csv.

    NOTE the resulting csv still had to be cleaned up by hand: remove the first column
    (which just has line numbers), then sort|uniq|sort -t, -k2

    outdir: directory for the output csv (created if it doesn't exist)
    """
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    rcmds = [
        loadcmd('msigdbr'),
        'all_gene_sets <- msigdbr(species="Homo sapiens", category="C7")',  # species has to be spelled out in full, otherwise msigdbr can't match it
        'alldf <- data.frame()',
    ]
    for ctype, gsname in msigdb_sets('UP'):  # TODO should probably use the 'DN' ones in some way?
        print(' %8s %s' % (ctype, gsname))
        rcmds += [
            'glist <- all_gene_sets[all_gene_sets$gs_name=="%s",]$human_gene_symbol' % gsname,  # gives list of gene names
            'df <- data.frame(glist, tag="%s")' % ctype,
            'names(df)[names(df) == "glist"] <- "gene"',
            'names(df)[names(df) == "tag"] <- "type"',
            'alldf <- rbind(alldf, df)',
        ]
    rcmds += [
        'write.csv(alldf, "%s/msigdb-markers.csv")' % outdir,
    ]
    utils.run_r(rcmds, 'auto', dryrun=False)
def install():
    """Install BiocManager from CRAN, then use it to install a fixed list of Bioconductor packages (with dependencies)."""
    rcmds = [
        'install.packages("BiocManager", repos="http://cran.rstudio.com/")',  # parentheses have to balance, otherwise R stops with a syntax error before installing anything
        'BiocManager::install(c("scRNAseq", "scater", "scran", "uwot", "DropletUtils", "GSEABase", "AUCell", "celldex", "SingleR"), dependencies=TRUE)',
    ]  # "TENxPBMCData"
    utils.run_r(rcmds, 'auto')
def run_gex(feature_matrix_fname, mname, outdir, make_plots=True):
    """Build and run an R script that processes a 10x feature matrix.

    The script reads the counts, does per-cell quality control and normalization, sets up
    the gene list chosen by <mname>, appends the commands from dimredcmds(), labels cells
    with SingleR using the celldex reference, and (except for 'hvg') appends the commands
    from ctype_ann_cmds().

    feature_matrix_fname: path handed to read10xCounts()
    mname: which gene list to use: 'hvg', 'fabio', 'waick', or 'msigdb' (anything else fails an assertion)
    outdir: output directory (created if it doesn't exist); R output is logged to <outdir>/out
    make_plots: not used in this function
    """
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    libraries = ['DropletUtils', 'scater', 'scran', 'pheatmap', 'celldex', 'SingleR', 'GSEABase', 'AUCell']
    rcmds = [loadcmd(lib) for lib in libraries]

    rcmds.extend([
        'options(width=1000)',
        'sce <- read10xCounts("%s")' % feature_matrix_fname,
        'rownames(sce) <- uniquifyFeatureNames(rowData(sce)$ID, rowData(sce)$Symbol)',
        # quality control
        'is.mito <- grepl("^MT-", rownames(sce))',  # figure out which genes are mitochondrial
        'qcstats <- perCellQCMetrics(sce, subsets=list(Mito=is.mito))',
        'filtered <- quickPerCellQC(qcstats, percent_subsets="subsets_Mito_percent")',  # identifies + removes outliers (in several qc metrics)
        'sce <- sce[, !filtered$discard]',
        'capture.output(colData(sce)$Barcode, file="%s/%s")' % (outdir, barcodefname),
        # normalization
        'sce <- logNormCounts(sce)',
        # # get reference labels from celldex (so we can remove HSCs) NOTE turning this off since it doesn't really change anything
        # 'ref <- celldex::BlueprintEncodeData()',  # get reference labels from cache or download
        # 'pred <- SingleR(test=sce, ref=ref, labels=ref$label.main)',  # assign labels to our cells (more SingleR detail here: https://ltla.github.io/SingleRBook)
        # 'table(pred$labels)',
        # 'sce <- sce[, pred$labels=="B-cells"]',  # discard non-b-cells
        # 'pred <- pred[pred$labels=="B-cells", ]',
    ])

    # commands that define the gene list for this <mname>
    gene_cmds = []
    if mname == 'hvg':
        gene_cmds = [
            'dec <- modelGeneVar(sce)',
            'hvg <- getTopHVGs(dec, prop=0.1)',  # 0.1 gives ~700 most variable genes (if you combine these with the fabio/waick, these totally dominate everything, presumably because there's so many)
        ]
    elif mname == 'fabio':
        # genes from fabio (200 most up- or down-regulated in plasmablast as compared to naive B cells)
        gene_cmds = ['fabio.markers <- read.csv("%s", sep="\t", header=T)' % fabio_fname]  # $name
    elif mname == 'waick':
        # 10 most up'd genes for naive, memory, pb, and prepb (40 total). Not sure if it's with respect to each other, or other cells, or what
        gene_cmds = ['waick.markers <- read.csv("%s", header=T)' % waickfname]
    elif mname == 'msigdb':
        # see <msdsets> above -- I just searched through the G7 sets for ones with plasma{blast,cell} and took the nearby ones
        gene_cmds = ['msigdb.markers <- read.csv("%s", header=T)' % msigdbfname]
    else:
        assert False
    rcmds.extend(gene_cmds)
    # 'all_genes <- c(fabio.markers$gene, waick.markers$gene, hvg)',  # don't do this, the hvgs overwhelm everything

    # R expression for the genes: the hvg vector itself, or the gene column of the markers data frame
    mname_markers = mname if mname == 'hvg' else mname + '.markers$gene'
    rcmds.extend(dimredcmds(outdir, mname_markers))

    # reference labels from celldex
    rcmds.extend([
        'ref <- celldex::BlueprintEncodeData()',  # get reference labels from cache or download
        'pred <- SingleR(test=sce, ref=ref, labels=ref$label.main)',  # assign labels to our cells (more SingleR detail here: https://ltla.github.io/SingleRBook)
        'table(pred$labels)',
    ])
    rcmds.extend(rplotcmds(outdir, 'celldex-label-heatmap', 'plotScoreHeatmap(pred)'))

    # only if we have clusters:
    # table (and then heatmap) comparing these new labels to our existing clusters
    rcmds.append('tab <- table(Assigned=pred$pruned.labels, Cluster=colLabels(sce))')
    cluster_heatmap_cmd = 'pheatmap(log2(tab+10), color=colorRampPalette(c("white", "blue"))(101))'
    rcmds.extend(rplotcmds(outdir, 'celldex-label-vs-cluster-heatmap', cluster_heatmap_cmd))  # this will crash if you've filtered to only B cells

    if mname != 'hvg':
        rcmds.extend(ctype_ann_cmds(outdir, mname_markers.replace('$gene', '')))

    utils.run_r(rcmds, 'auto', logfname='%s/out' % outdir, dryrun=False)
def run_bios2mds(n_components, n_clusters, seqfos, base_workdir, seed, aligned=False, reco_info=None, region=None, max_runs=100, max_iterations=1000, method='euclidean', plotdir=None, plotname='mds', queries_to_include=None, color_scale_vals=None, labels=None, title=None, debug=False): workdir = base_workdir + '/mds' msafname = workdir + '/msa.fa' mdsfname = workdir + '/components.txt' clusterfname = workdir + '/clusters.txt' if not os.path.exists(workdir): os.makedirs(workdir) if len(set([sfo['seq'] for sfo in seqfos])) < len(seqfos): # it'll just crash when it's running mds later, but this is faster raise Exception('duplicate sequences in seqfos') if aligned: # NOTE unlike the sklearn version below, this doesn't modify <seqfos> with open(msafname, 'w') as fastafile: for sfo in seqfos: fastafile.write('>%s\n%s\n' % (sfo['name'], sfo['seq'])) else: utils.align_many_seqs(seqfos, outfname=msafname) # build the R cmd file cmdlines = [ 'options(rgl.useNULL=TRUE)', 'require(bios2mds, quietly=TRUE)', 'set.seed(%d)' % seed, 'human <- import.fasta("%s")' % msafname, 'active <- mat.dif(human, human)', # mat.dif or mat.dis? ] if n_components is not None: cmdlines += ['mmds_active <- mmds(active, pc=%d)' % n_components] cmdlines += ['capture.output(mmds_active$coord, file="%s")' % mdsfname] else: raise Exception('need to implement') if n_clusters is not None: cmdlines += [ 'kmeans.run1 <- kmeans.run(mmds_active$coord, nb.clus=%d, nb.run=%d, iter.max=%d, method="%s")' % (n_clusters, max_runs, max_iterations, method), # 'kmeans.run1$clusters', # 'kmeans.run1$elements', 'options(width=10000)', 'capture.output(kmeans.run1$clusters, file="%s")' % clusterfname, # sil.score(mat, nb.clus = c(2:13), nb.run = 100, iter.max = 1000, # run for every possible number of clusters (?) # method = "euclidean") # random.msa # builds a random [...] 
] rstart = time.time() try: utils.run_r(cmdlines, workdir) #, print_time='kmeans') except subprocess.CalledProcessError as e: # complex eigenvalues print e print ' mds failed on cluster' # NOTE will still crash in read_kmeans_clusterfile(), but I'm not using that a.t.m. title = (title if title is not None else '') + ' mds failed' pcvals = read_component_file(mdsfname, n_components, seqfos) partition = read_kmeans_clusterfile(clusterfname, seqfos) if n_clusters is not None else None rstop = time.time() os.remove(msafname) os.rmdir(workdir) plotstart = time.time() if plotdir is not None: # utils.prep_dir(plotdir, wildlings=['*.svg']) plot_mds(n_components, pcvals, plotdir, plotname, partition=partition if n_clusters is not None else None, queries_to_include=queries_to_include, color_scale_vals=color_scale_vals, labels=labels, title=title) if reco_info is not None: labels = {uid : reco_info[uid][region + '_gene'] for uid in pcvals} plot_mds(n_components, pcvals, plotdir, 'true-genes', labels=labels, queries_to_include=queries_to_include, color_scale_vals=color_scale_vals, title=title) print ' %5.1f %5.1f' % (rstop - rstart, time.time() - plotstart), return partition
def install():
    """Install the R package tigger from CRAN, running R in a fresh work dir under /tmp that's removed afterwards."""
    install_cmd = 'install.packages("tigger", repos="http://cran.rstudio.com/")'

    # work dir is /tmp/<user>/<random integer>
    user = os.getenv('USER')
    rand_id = random.randint(0, 999999)
    workdir = '/tmp/%s/%d' % (user, rand_id)

    os.makedirs(workdir)
    utils.run_r([install_cmd], workdir)
    os.rmdir(workdir)
def run_treesim(self, seed, outfname, workdir): if self.args.debug or utils.getsuffix(outfname) == '.nwk': print ' generating %d tree%s,' % ( self.args.n_trees, utils.plural(self.args.n_trees)), if self.args.constant_number_of_leaves: print 'all with %s leaves' % str(self.args.n_leaves) else: print 'n-leaves from %s' % ( 'hist in parameter dir' if self.final_nldist == 'hist' else '%s distribution with parameter %s' % (self.final_nldist, str(self.args.n_leaves))) if self.args.debug: print ' mean branch lengths from %s' % ( self.parameter_dir if self.parameter_dir is not None else 'scratch') for mtype in [ 'all', ] + utils.regions: print ' %4s %7.3f (ratio %7.3f)' % ( mtype, self.branch_lengths[mtype]['mean'], self.branch_lengths[mtype]['mean'] / self.branch_lengths['all']['mean']) ages, treestrs = [], [] cmd_lines = [] pkgname = 'TreeSim' # TreeSimGM when root_mrca_weibull_parameter is set, otherwise TreeSim if self.args.root_mrca_weibull_parameter is not None: pkgname += 'GM' cmd_lines += ['require(%s, quietly=TRUE)' % pkgname] cmd_lines += ['set.seed(' + str(seed) + ')'] for itree in range(self.args.n_trees): n_leaves = self.choose_n_leaves() age = self.choose_full_sequence_branch_length() ages.append(age) if n_leaves == 1: # add singleton trees by hand treestrs.append('t1:%f;' % age) continue treestrs.append(None) # NOTE these simulation functions seem to assume that we want all the extant leaves to have the same height. Which is kind of weird. Maybe makes more sense at some point to change this. params = {'n': n_leaves, 'numbsim': self.n_trees_each_run} if self.args.root_mrca_weibull_parameter is None: fcn = 'sim.bd.taxa.age' params['lambda'] = 1 # speciation_rate params['mu'] = 0.5 # extinction_rate params['age'] = age else: fcn = 'sim.taxa' params['distributionspname'] = '"rweibull"' params[ 'distributionspparameters'] = 'c(%f, 1)' % self.args.root_mrca_weibull_parameter params[ 'labellivingsp'] = '"t"' # TreeSim doesn't let you do this, but a.t.m. 
this is their default cmd_lines += [ 'trees <- %s(%s)' % (fcn, ', '.join( ['%s=%s' % (k, str(v)) for k, v in params.items()])) ] cmd_lines += [ 'write.tree(trees[[1]], \"' + outfname + '\", append=TRUE)' ] if None not in treestrs: # if every tree has one leaf, we don't need to run R open(outfname, 'w').close() else: if os.path.exists(outfname): os.remove(outfname) utils.run_r( cmd_lines, workdir, print_time='tree generation' if self.args.debug else None) with open(outfname) as treefile: for itree, tstr in enumerate(treestrs): if tstr is None: treestrs[itree] = treefile.readline().strip() if None in treestrs: raise Exception( 'didn\'t read enough trees from %s: still %d empty places in treestrs' % (outfname, treestrs.count(None))) # rescale branch lengths (TreeSim lets you specify the number of leaves and the height at the same time, but TreeSimGM doesn't, and TreeSim's numbers are usually a little off anyway... so we rescale everybody) for itree in range(len(ages)): treestrs[itree] = '(%s):0.0;' % treestrs[itree].rstrip( ';' ) # the trees it spits out have non-zero branch length above root (or at least that's what the newick strings turn into when dendropy reads them), which is f****d up and annoying, so here we add a new/real root at the top of the original root's branch treestrs[itree] = treeutils.rescale_tree(ages[itree], treestr=treestrs[itree]) return ages, treestrs