def plotMetrics(infile, outfile):
    '''Intermediate target to plot metrics.'''
    IOTools.touch_file(outfile)


def checkMkfastqInputs(infile, outfile):
    '''Check mkfastq input .sample files.'''
    sample_information()
    IOTools.touch_file(outfile)


def metrics(infiles, outfile):
    '''Intermediate target to run metrics tasks.'''
    IOTools.touch_file(outfile)


def create_view(dbhandle, tables, tablename, outfile,
                view_type="TABLE",
                ignore_duplicates=True):
    '''create a database view for a list of tables.

    This method performs a join across multiple tables and stores the
    result either as a view or a table in the database.

    Arguments
    ---------
    dbhandle :
        A database handle.
    tables : list of tuples
        Tables to merge. Each tuple contains the name of a table and
        the field to join with the first table. For example::

            tables = (
                ("reads_summary", "track"),
                ("bam_stats", "track"),
                ("context_stats", "track"),
                ("picard_stats_alignment_summary_metrics", "track"))

    tablename : string
        Name of the view or table to be created.
    outfile : string
        Output filename for status information.
    view_type : string
        Type of view, either ``VIEW`` or ``TABLE``. If a view is to be
        created across multiple databases, use ``TABLE``.
    ignore_duplicates : bool
        If set to False, duplicate column names will be added with the
        tablename as prefix. The default is to ignore.
    '''

    database.executewait(
        dbhandle,
        "DROP %(view_type)s IF EXISTS %(tablename)s" % locals())

    tracks, columns = [], []
    tablenames = [x[0] for x in tables]
    for table, track in tables:
        d = database.executewait(
            dbhandle,
            "SELECT COUNT(DISTINCT %s) FROM %s" % (track, table))
        tracks.append(d.fetchone()[0])
        columns.append([
            x.lower() for x in database.getColumnNames(dbhandle, table)
            if x != track])

    E.info("creating %s from the following tables: %s" %
           (tablename, str(list(zip(tablenames, tracks)))))

    if min(tracks) != max(tracks):
        raise ValueError(
            "number of rows not identical - will not create view")

    from_statement = " , ".join(
        ["%s as t%i" % (y[0], x) for x, y in enumerate(tables)])
    f = tables[0][1]
    where_statement = " AND ".join([
        "t0.%s = t%i.%s" % (f, x + 1, y[1])
        for x, y in enumerate(tables[1:])])

    all_columns, taken = [], set()
    for x, c in enumerate(columns):
        i = set(taken).intersection(set(c))
        if i:
            E.warn("duplicate column names: %s " % i)
            if not ignore_duplicates:
                table = tables[x][0]
                all_columns.extend(
                    ["t%i.%s AS %s_%s" % (x, y, table, y) for y in i])
                c = [y for y in c if y not in i]
        all_columns.extend(["t%i.%s" % (x, y) for y in c])
        taken.update(set(c))

    all_columns = ",".join(all_columns)

    statement = '''
    CREATE %(view_type)s %(tablename)s AS SELECT t0.track, %(all_columns)s
    FROM %(from_statement)s
    WHERE %(where_statement)s
    ''' % locals()

    database.executewait(dbhandle, statement)

    nrows = database.executewait(
        dbhandle,
        "SELECT COUNT(*) FROM %(tablename)s" % locals()).fetchone()[0]

    if nrows == 0:
        raise ValueError(
            "empty view %s, check statement = %s" % (tablename, statement))

    if nrows != min(tracks):
        E.warn("view creates duplicate rows, got %i, expected %i" %
               (nrows, min(tracks)))

    E.info("created %s with %i rows" % (tablename, nrows))

    touch_file(outfile)


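# A minimal usage sketch for create_view. The database file and table
# names below are hypothetical; they only illustrate the expected shape of
# the ``tables`` argument (pairs of table name and join column):
#
#   dbhandle = sqlite3.connect("csvdb")
#   create_view(dbhandle,
#               tables=(("bam_stats", "track"),
#                       ("context_stats", "track")),
#               tablename="view_mapping",
#               outfile="view_mapping.log",
#               view_type="TABLE")

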
def runMAST(infiles, outfile):
    '''run mast on all intervals and motifs.

    Collect all results for an E-value up to 10000 so that all
    sequences are output and MAST curves can be computed.

    10000 is a heuristic.
    '''
    # job_options = "-l mem_free=8000M"

    controlfile, dbfile, motiffiles = infiles

    if iotools.is_empty(dbfile) or len(motiffiles) == 0:
        iotools.touch_file(outfile)
        return

    if not os.path.exists(controlfile):
        raise ValueError(
            "control file %s for %s does not exist" % (controlfile, dbfile))

    # remove previous results
    if os.path.exists(outfile):
        os.remove(outfile)

    tmpdir = P.get_temp_dir(".")
    tmpfile = P.get_temp_filename(".")

    for motiffile in motiffiles:
        if iotools.is_empty(motiffile):
            L.info("skipping empty motif file %s" % motiffile)
            continue

        of = iotools.open_file(tmpfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s - foreground ::\n" % motif)
        of.close()

        # mast bails if the number of nucleotides gets larger than
        # 2186800982?
        # To avoid this, run db and control file separately.
        statement = '''
        cat %(dbfile)s
        | mast %(motiffile)s - -nohtml -oc %(tmpdir)s
               -ev %(mast_evalue)f %(mast_options)s
          >> %(outfile)s.log 2>&1;
        cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
        '''
        P.run(statement)

        of = iotools.open_file(tmpfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s - background ::\n" % motif)
        of.close()

        statement = '''
        cat %(controlfile)s
        | mast %(motiffile)s - -nohtml -oc %(tmpdir)s
               -ev %(mast_evalue)f %(mast_options)s
          >> %(outfile)s.log 2>&1;
        cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
        '''
        P.run(statement)

    P.run("gzip < %(tmpfile)s > %(outfile)s")

    shutil.rmtree(tmpdir)
    os.unlink(tmpfile)


def subsetAndDownsample(infiles, outfile):
    '''
    Generate datasets that include subsets of the 10x samples.

    Optionally downsample UMI counts to normalise between samples.
    '''

    outdir = os.path.dirname(outfile)
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    agg_matrix_dir = os.path.join(os.path.dirname(infiles[0]),
                                  "agg.processed.dir")

    sample_table = pd.read_csv(infiles[1], sep="\t")

    subsets = [k.split("_", 1)[1] for k in PARAMS.keys()
               if k.startswith("datasets_")]

    # Titles of fields encoded in filenames
    name_field_titles = PARAMS["name_field_titles"]

    if PARAMS["downsampling_enabled"]:
        downsampling_function = PARAMS['downsampling_function']
    else:
        downsampling_function = "no"

    downsampling_apply = PARAMS["downsampling_apply"]

    job_memory = PARAMS["postprocess_memory"]

    statements = []

    for subset in subsets:

        if subset == "all":
            if not PARAMS["datasets_all"]:
                continue
            sample_ids = set(sample_table["sample_id"].values)
            sample_ids_str = ",".join(sample_ids)
        else:
            sample_ids = PARAMS["datasets" + "_" + subset]
            sample_ids_str = ",".join(
                [x.strip() for x in sample_ids.split(",")])

        out_dir = os.path.join(os.path.dirname(outfile), subset)

        tenx_dir = PARAMS["tenx_dir"]

        log_file = outfile.replace(".sentinel", "." + subset + ".log")

        statement = '''Rscript %(tenx_dir)s/R/cellranger_subsetAndDownsample.R
                       --tenxdir=%(agg_matrix_dir)s
                       --sampleids=%(sample_ids_str)s
                       --downsample=%(downsampling_function)s
                       --apply=%(downsampling_apply)s
                       --samplenamefields=%(name_field_titles)s
                       --outdir=%(out_dir)s
                       &> %(log_file)s
                    ''' % locals()

        statements.append(statement)

    P.run(statements)

    IOTools.touch_file(outfile)


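# Hedged sketch of the pipeline.yml keys consulted above. The key names
# follow the PARAMS lookups in the code (cgat-core flattens nested keys
# with underscores, e.g. datasets: all -> PARAMS["datasets_all"]); the
# values and the sample/path names are illustrative assumptions only:
#
#   name_field_titles: sample_id,condition,replicate
#   tenx_dir: /path/to/tenx            # hypothetical checkout location
#   datasets:
#     all: 1
#     stimulated: sampleA,sampleB      # -> PARAMS["datasets_stimulated"]
#   downsampling:
#     enabled: 1
#     function: median                 # assumed value
#     apply: subset
#   postprocess:
#     memory: 10G

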
def buildSpikeResults(infile, outfile):
    '''build matrices with results from spike-in and upload
    into database.

    The method will output several files:

    .spiked.gz: Number of intervals that have been spiked-in
        for each bin of expression and fold-change.

    .power.gz: Global power analysis - aggregates over all
        ranges of fold-change and expression and outputs the
        power, the proportion of intervals overall that
        could be detected as differentially methylated.

        This is a table with the following columns:

        fdr - fdr threshold
        power - power level, number of intervals detectable
        intervals - number of intervals in observed data at given
            level of fdr and power.
        intervals_percent - percentage of intervals in observed data
            at given level of fdr and power

    The method will also upload the results into the database.

    Arguments
    ---------
    infile : string
        Input filename in :term:`tsv` format. Usually the output of
        :mod:`scripts/runExpression`.
    outfile : string
        Output filename in :term:`tsv` format.
    '''

    expression_nbins = 10
    fold_nbins = 10

    spikefile = P.snip(infile, '.tsv.gz') + '.spike.gz'

    if not os.path.exists(spikefile):
        E.warn('no spike data: %s' % spikefile)
        iotools.touch_file(outfile)
        return

    ########################################
    # output and load spiked results
    tmpfile_name = P.get_temp_filename(shared=True)

    statement = '''zcat %(spikefile)s
    | grep -e "^spike" -e "^test_id"
    > %(tmpfile_name)s
    '''
    P.run(statement)

    E.debug("outputting spiked counts")
    (spiked, spiked_d2hist_counts, xedges, yedges,
     spiked_l10average, spiked_l2fold) = \
        outputSpikeCounts(
            outfile=P.snip(outfile, ".power.gz") + ".spiked.gz",
            infile_name=tmpfile_name,
            expression_nbins=expression_nbins,
            fold_nbins=fold_nbins)

    ########################################
    # output and load unspiked results
    statement = '''zcat %(infile)s
    | grep -v -e "^spike"
    > %(tmpfile_name)s
    '''
    P.run(statement)

    E.debug("outputting unspiked counts")
    (unspiked, unspiked_d2hist_counts, unspiked_xedges,
     unspiked_yedges, unspiked_l10average, unspiked_l2fold) = \
        outputSpikeCounts(
            outfile=P.snip(outfile, ".power.gz") + ".unspiked.gz",
            infile_name=tmpfile_name,
            expression_bins=xedges,
            fold_bins=yedges)

    E.debug("computing power")

    assert xedges.all() == unspiked_xedges.all()

    tmpfile = iotools.open_file(tmpfile_name, "w")
    tmpfile.write("\t".join(
        ("expression", "fold", "fdr", "counts", "percent")) + "\n")

    fdr_thresholds = [0.01, 0.05] + list(numpy.arange(0.1, 1.0, 0.1))
    power_thresholds = numpy.arange(0.1, 1.1, 0.1)

    spiked_total = float(spiked_d2hist_counts.sum().sum())
    unspiked_total = float(unspiked_d2hist_counts.sum().sum())

    outf = iotools.open_file(outfile, "w")
    outf.write("fdr\tpower\tintervals\tintervals_percent\n")

    # significant results
    for fdr in fdr_thresholds:
        take = spiked['qvalue'] < fdr

        # compute 2D histogram in spiked data below fdr threshold
        spiked_d2hist_fdr, xedges, yedges = \
            numpy.histogram2d(spiked_l10average[take],
                              spiked_l2fold[take],
                              bins=(xedges, yedges))

        # convert to percentage of spike-ins per bin
        spiked_d2hist_fdr_normed = spiked_d2hist_fdr / spiked_d2hist_counts
        spiked_d2hist_fdr_normed = numpy.nan_to_num(spiked_d2hist_fdr_normed)

        # set values without data to -1
        spiked_d2hist_fdr_normed[spiked_d2hist_counts == 0] = -1.0

        # output to table for database upload
        for x, y in itertools.product(list(range(len(xedges) - 1)),
                                      list(range(len(yedges) - 1))):
            tmpfile.write("\t".join(map(
                str, (xedges[x], yedges[y],
                      fdr,
                      spiked_d2hist_fdr[x, y],
                      100.0 * spiked_d2hist_fdr_normed[x, y]))) + "\n")

        # take elements in spiked_hist_fdr above a certain threshold
        for power in power_thresholds:
            # select 2D bins at a given power level
            power_take = spiked_d2hist_fdr_normed >= power

            # select the counts in the unspiked data according
            # to this level
            power_counts = unspiked_d2hist_counts[power_take]

            outf.write("\t".join(map(
                str, (fdr, power,
                      power_counts.sum().sum(),
                      100.0 * power_counts.sum().sum() /
                      unspiked_total))) + "\n")

    tmpfile.close()
    outf.close()

    # upload into table
    method = P.snip(os.path.dirname(outfile), ".dir")
    tablename = P.to_table(
        P.snip(outfile, "power.gz") + method + ".spike.load")

    P.load(tmpfile_name,
           outfile + ".log",
           tablename=tablename,
           options="--add-index=fdr")

    os.unlink(tmpfile_name)


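# Worked example of the power calculation above (illustrative numbers only):
# if an (expression, fold-change) bin contains 20 spike-ins and 15 of them
# reach q < fdr, its normalised power is 15/20 = 0.75. At power = 0.7 that
# bin is selected and the number of observed (unspiked) intervals falling
# into it is added to the "intervals" column; at power = 0.8 it is skipped.
# "intervals_percent" expresses the same count as a percentage of
# unspiked_total.

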
def postprocessAggrMatrix(infiles, outfile):
    '''
    Post-process the cellranger aggr count matrix.

    Batch, sample_name and aggregation ID metadata are added.

    Optionally, cells with barcodes shared (within sequencing batch)
    between samples can be removed (known index hopping on Illumina 4000).
    '''

    outdir = os.path.dirname(outfile)
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    infile = infiles[0]
    sample_table = infiles[1]

    agg_dir = os.path.dirname(infile)
    out_dir = os.path.dirname(outfile)

    # Clean barcode hopping
    if PARAMS["postprocess_barcodes"]:
        hopping = "--hopping"
    else:
        hopping = ""

    # Additional options
    options = PARAMS["postprocess_options"]

    mexdir = PARAMS["postprocess_mexdir"]
    if mexdir is None:
        raise ValueError('"postprocess_mexdir" parameter not set'
                         ' in file "pipeline.yml"')

    tenxdir = os.path.join(agg_dir, mexdir)
    if not os.path.exists(tenxdir):
        raise ValueError('The specified "postprocess_mexdir"'
                         ' directory does not exist in directory ' + agg_dir)

    job_memory = PARAMS["postprocess_memory"]

    blacklist = PARAMS["postprocess_blacklist"]

    log_file = outfile.replace(".sentinel", ".log")

    statement = '''Rscript %(tenx_dir)s/R/cellranger_postprocessAggrMatrix.R
                   --tenxdir=%(tenxdir)s
                   --sampletable=%(sample_table)s
                   --samplenamefields=%(name_field_titles)s
                   --downsample=no
                   %(hopping)s
                   --blacklist=%(blacklist)s
                   %(options)s
                   --outdir=%(out_dir)s
                   &> %(log_file)s
                '''

    P.run(statement)

    IOTools.touch_file(outfile)


def cellrangerCount(infile, outfile):
    '''
    Execute the cellranger count pipeline for all samples.
    '''

    # set key parameters
    transcriptome = PARAMS["cellranger_transcriptome"]

    if transcriptome is None:
        raise ValueError('"cellranger_transcriptome" parameter not set'
                         ' in file "pipeline.yml"')

    if not os.path.exists(transcriptome):
        raise ValueError('The specified "cellranger_transcriptome"'
                         ' file does not exist')

    memory = PARAMS["cellranger_memory"]
    job_threads = PARAMS["cellranger_threads"]
    mem_per_core = int(float(memory) / job_threads)  # round down
    job_memory = str(mem_per_core) + "M"

    # cellranger expects memory in GB
    cellranger_memory = str(int((mem_per_core * job_threads) / 1000) - 2)

    # parse the sample name and expected cell number
    library_id, cellnumber, batch, trash = os.path.basename(infile).split(".")

    # build lists of the sample files
    seq_folders = []
    sample_ids = []

    # Parse the list of sequencing runs (i.e., paths) for the sample
    with open(infile, "r") as sample_list:
        for line in sample_list:
            seq_folder_path = line.strip()
            if seq_folder_path != "":
                seq_folders.append(seq_folder_path)
                sample_ids.append(os.path.basename(seq_folder_path))

    input_fastqs = ",".join(seq_folders)
    input_samples = ",".join(sample_ids)

    id_tag = library_id + "-count"

    log_file = id_tag + ".log"

    statement = (
        '''cellranger count
                   --id %(id_tag)s
                   --fastqs %(input_fastqs)s
                   --sample %(input_samples)s
                   --transcriptome %(transcriptome)s
                   --expect-cells %(cellnumber)s
                   --chemistry %(cellranger_chemistry)s
                   --jobmode=local
                   --localcores %(job_threads)s
                   --localmem %(cellranger_memory)s
                   --nopreflight
           &> %(log_file)s
        ''')

    P.run(statement)

    IOTools.touch_file(outfile)


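# Illustrative example of the memory arithmetic above (assumed parameter
# values; the code treats cellranger_memory from PARAMS as a total in MB):
# with cellranger_memory = 16000 and cellranger_threads = 4,
# mem_per_core = 4000, job_memory = "4000M" per slot, and --localmem
# receives int((4000 * 4) / 1000) - 2 = 14 (GB), leaving headroom below
# the scheduler allocation.

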
def full(outfile):
    touch_file(outfile)


def aggregateAdaptors(infile, outfile):
    iotools.touch_file(outfile)


def BedFileVenn(infiles, outfile):
    '''merge :term:`bed` formatted *infiles* by intersection
    and write to *outfile*.

    Only intervals that overlap in all files are retained.
    Interval coordinates are given by the first file in *infiles*.

    Bed files are normalized (overlapping intervals within a file are
    merged) before intersection.

    Intervals are renumbered starting from 1.
    '''
    liver, testes = infiles
    liver_name = P.snip(os.path.basename(liver), ".replicated.bed")
    testes_name = P.snip(os.path.basename(testes), ".replicated.bed")
    to_cluster = True

    statement = '''cat %(liver)s %(testes)s | mergeBed -i stdin | awk 'OFS="\\t" {print $1,$2,$3,"CAPseq"NR}' > replicated_intervals/liver.testes.merge.bed;
                   echo "Total merged intervals" > %(outfile)s; cat replicated_intervals/liver.testes.merge.bed | wc -l >> %(outfile)s;
                   echo "Liver & testes" >> %(outfile)s; intersectBed -a replicated_intervals/liver.testes.merge.bed -b %(liver)s -u | intersectBed -a stdin -b %(testes)s -u > replicated_intervals/liver.testes.shared.bed; cat replicated_intervals/liver.testes.shared.bed | wc -l >> %(outfile)s;
                   echo "Testes only" >> %(outfile)s; intersectBed -a replicated_intervals/liver.testes.merge.bed -b %(liver)s -v > replicated_intervals/%(testes_name)s.liver.testes.unique.bed; cat replicated_intervals/%(testes_name)s.liver.testes.unique.bed | wc -l >> %(outfile)s;
                   echo "Liver only" >> %(outfile)s; intersectBed -a replicated_intervals/liver.testes.merge.bed -b %(testes)s -v > replicated_intervals/%(liver_name)s.liver.testes.unique.bed; cat replicated_intervals/%(liver_name)s.liver.testes.unique.bed | wc -l >> %(outfile)s;
                   sed -i '{N;s/\\n/\\t/g}' %(outfile)s; '''

    if len(infiles) == 1:
        shutil.copyfile(infiles[0], outfile)

    elif len(infiles) == 2:
        if iotools.is_empty(infiles[0]) or iotools.is_empty(infiles[1]):
            iotools.touch_file(outfile)
        else:
            statement = '''
            intersectBed -u -a %s -b %s
            | cut -f 1,2,3,4,5
            | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
            > %%(outfile)s ''' % (infiles[0], infiles[1])
            P.run(statement)

    else:
        tmpfile = P.get_temp_filename(".")

        # need to merge incrementally
        fn = infiles[0]
        if iotools.is_empty(fn):
            iotools.touch_file(outfile)
            return

        statement = '''mergeBed -i %(fn)s > %(tmpfile)s'''
        P.run(statement)

        for fn in infiles[1:]:
            if iotools.is_empty(fn):
                iotools.touch_file(outfile)
                os.unlink(tmpfile)
                return

            statement = '''mergeBed -i %(fn)s
            | intersectBed -u -a %(tmpfile)s -b stdin > %(tmpfile)s.tmp;
            mv %(tmpfile)s.tmp %(tmpfile)s'''
            P.run(statement)

        statement = '''cat %(tmpfile)s
        | cut -f 1,2,3,4,5
        | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
        > %(outfile)s '''
        P.run(statement)

        os.unlink(tmpfile)
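

# Illustrative example of the intersect-and-renumber behaviour described in
# the docstring (made-up coordinates): if a.bed holds "chr1 100 200" and
# "chr1 500 600" while b.bed holds "chr1 150 400", only the first interval
# overlaps both files, so the output keeps a.bed's coordinates and the awk
# step writes the running counter into column 4: "chr1 100 200 1".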