def runMutectReverse(infiles, outfile):
    '''Use control as tumour and vice versa to estimate the false
    positive rate'''
    infile, normal_panel = infiles
    infile_tumour = infile.replace(
        PARAMS["sample_control"], PARAMS["sample_tumour"])

    basename = P.snip(outfile, ".mutect.reverse.snp.vcf")
    call_stats_out = basename + "_call_stats.reverse.out"
    coverage_wig_out = basename + "_coverage.reverse.wig"
    mutect_log = basename + ".reverse.log"

    (cosmic, dbsnp, quality, max_alt_qual, max_alt,
     max_fraction, tumor_LOD) = (
         PARAMS["mutect_cosmic"], PARAMS["gatk_dbsnp"],
         PARAMS["mutect_quality"], PARAMS["mutect_max_alt_qual"],
         PARAMS["mutect_max_alt"], PARAMS["mutect_max_fraction"],
         PARAMS["mutect_LOD"])

    genome = "%s/%s.fa" % (PARAMS["bwa_index_dir"],
                           PARAMS["genome"])

    PipelineExome.mutectSNPCaller(infile, outfile, mutect_log, genome,
                                  cosmic, dbsnp, call_stats_out,
                                  PARAMS['mutect_memory'],
                                  PARAMS['mutect_threads'],
                                  quality, max_alt_qual,
                                  max_alt, max_fraction, tumor_LOD,
                                  normal_panel, infile_tumour)

def loadPicardDuplicateStats(infiles, outfile):
    '''Merge Picard duplicate stats into single table and load into
    SQLite.'''
    tablename = P.toTable(outfile)

    outf = open('dupstats.txt', 'w')
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".dedup.bam")
        statfile = P.snip(f, ".bam") + ".dupstats"
        if not os.path.exists(statfile):
            E.warn("File %s missing" % statfile)
            continue
        lines = [x for x in open(statfile, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        outf.write("%s\t%s" % (track, lines[1]))
    outf.close()

    tmpfilename = outf.name
    statement = '''cat %(tmpfilename)s
                   | cgat csv2db
                       --add-index=track
                       --table=%(tablename)s
                   > %(outfile)s'''
    P.run()

def flashReads(infiles, outfile, overlap):
    '''Flashes read pairs'''
    job_memory = "2G"
    # Use one thread to maintain read order
    job_threads = 1

    # Retrieve the input file from [generateReadSamplesProduct,
    # relocateReads]
    read1 = infiles[0]
    read2 = P.snip(read1, ".fastq.1.gz") + ".fastq.2.gz"
    outfilebase = P.snip(outfile, ".extendedFrags.fastq.gz")

    # touch in case there are no flashed reads
    # but the process still succeeds with no errors
    statement = '''flash -m %(overlap)s --interleaved-output
                   %(read1)s %(read2)s -o %(outfilebase)s -z;
                   checkpoint;
                   touch %(outfile)s'''
    P.run()

def createMAFAlignment(infiles, outfile):
    """
    Takes all .axt files in the input directory, filters them to remove
    files based on supplied regular expressions, converts to a single maf
    file using axtToMaf, filters maf alignments under a specified length.
    """
    outfile = P.snip(outfile, ".gz")
    axt_dir = PARAMS["phyloCSF_location_axt"]
    to_ignore = re.compile(PARAMS["phyloCSF_ignore"])

    axt_files = []
    for axt_file in os.listdir(axt_dir):
        if axt_file.endswith("net.axt.gz") and not to_ignore.search(axt_file):
            axt_files.append(os.path.join(axt_dir, axt_file))
    axt_files = (" ").join(sorted(axt_files))

    E.info("axt files from which MAF alignment will be created: %s" %
           axt_files)

    target_genome = PARAMS["phyloCSF_target_genome"]
    target_contigs = os.path.join(PARAMS["annotations_dir"],
                                  PARAMS["annotations_interface_contigs"])
    query_genome = PARAMS["phyloCSF_query_genome"]
    query_contigs = os.path.join(PARAMS["phyloCSF_query_assembly"],
                                 PARAMS["annotations_interface_contigs"])

    tmpf1 = P.getTempFilename("./phyloCSF")
    tmpf2 = P.getTempFilename("./phyloCSF")
    to_cluster = False
    # concatenate axt files, then remove headers
    statement = ("zcat %(axt_files)s"
                 " > %(tmpf1)s;"
                 " axtToMaf"
                 "  -tPrefix=%(target_genome)s."
                 "  -qPrefix=%(query_genome)s."
                 "  %(tmpf1)s"
                 "  %(target_contigs)s"
                 "  %(query_contigs)s"
                 "  %(tmpf2)s")
    P.run()

    E.info("Temporary axt file created %s" % os.path.abspath(tmpf1))
    E.info("Temporary maf file created %s" % os.path.abspath(tmpf2))

    removed = P.snip(outfile, ".maf") + "_removed.maf"
    to_cluster = False
    filtered = PipelineLncRNA.filterMAF(tmpf2,
                                        outfile,
                                        removed,
                                        PARAMS["phyloCSF_filter_alignments"])
    E.info("%s blocks were ignored in MAF alignment"
           " because length of target alignment was too short" % filtered[0])
    E.info("%s blocks were output to filtered MAF alignment" % filtered[1])

    os.unlink(tmpf1)
    os.unlink(tmpf2)

    to_cluster = False
    statement = ("gzip %(outfile)s;"
                 " gzip %(removed)s")
    P.run()

def splitMultiAndSingleExonLincRna(infile, outfiles):
    '''pulls out the multi-exonic and the single exonic lincRNA
    transcripts from the lincrna.gtf.gz'''
    inf = gzip.open(infile)
    multi = gzip.open(P.snip(infile, ".gtf.gz") + ".multi_exon.gtf.gz", "w")
    single = gzip.open(P.snip(infile, ".gtf.gz") + ".single_exon.gtf.gz", "w")

    for entry in GTF.transcript_iterator(GTF.iterator(inf)):
        # multi-exonic transcripts have more than one exon record
        outf = multi if len(entry) > 1 else single
        for exon in entry:
            outf.write("\t".join(map(str, [exon.contig, exon.source,
                                           exon.feature, exon.start,
                                           exon.end, ".", exon.strand,
                                           "."])) +
                       "\t" + exon.attributes + "\n")
    multi.close()
    single.close()

    for outfile in outfiles:
        outf = P.snip(outfile, ".gz")
        if not os.path.exists(outfile):
            statement = '''gzip %(outf)s'''
            P.run()

def filterByCoverage(infiles, outfile):
    fcoverage = PARAMS["coverage_filter"]
    contig_file = infiles[0]
    dbh = sqlite3.connect(
        os.path.join(PARAMS["results_resultsdir"], PARAMS["database"]))
    cc = dbh.cursor()
    contigs = set()
    for infile in infiles[1:]:
        dirsplit = infile.split("/")
        infile = os.path.join(
            PARAMS["results_resultsdir"],
            dirsplit[-2].split(".dir")[0] + "-" + dirsplit[-1])
        tablename = P.toTable(os.path.basename(infile))
        if P.snip(contig_file, ".fa") == P.snip(
                os.path.basename(infile), ".coverage.load"):
            statement = """SELECT contig_id, ave
                           FROM (SELECT contig_id, AVG(coverage) AS ave
                                 FROM %s GROUP BY contig_id)
                           WHERE ave > %i""" % (tablename, fcoverage)
            for data in cc.execute(statement).fetchall():
                contigs.add(data[0])
    outf = open(outfile, "w")
    E.info("%i contigs passed the coverage filter" % len(contigs))
    for fasta in FastaIterator.iterate(IOTools.openFile(contig_file)):
        identifier = fasta.title.split(" ")[0]
        if identifier in contigs:
            outf.write(">%s\n%s\n" % (identifier, fasta.sequence))
    outf.close()

def loadAlignmentStats(infiles, outfile):
    '''merge alignment stats into single tables.'''
    tablename = P.toTable(outfile)
    outf = P.getTempFile()

    first = True
    for f in infiles:
        track = P.snip(f, ".bam.stats")
        fn = f + ".alignment_summary_metrics"
        if not os.path.exists(fn):
            E.warn("file %s missing" % fn)
            continue
        lines = [x for x in open(fn, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        outf.write("%s\t%s" % (track, lines[1]))
    outf.close()

    tmpfilename = outf.name
    statement = '''cat %(tmpfilename)s
                   | python %(scriptsdir)s/csv2db.py
                       --add-index=track
                       --table=%(tablename)s
                   > %(outfile)s'''
    P.run()

    for suffix, column in (("quality_by_cycle_metrics", "cycle"),
                           ("quality_distribution_metrics", "quality")):
        # some files might be missing - bugs in Picard
        xfiles = [x for x in infiles
                  if os.path.exists("%s.%s" % (x, suffix))]

        header = ",".join([P.snip(x, ".bam.stats") for x in xfiles])
        filenames = " ".join(["%s.%s" % (x, suffix) for x in xfiles])

        tname = "%s_%s" % (tablename, suffix)

        statement = """python %(scriptsdir)s/combine_tables.py
                           --missing-value=0
                           %(filenames)s
                       | python %(scriptsdir)s/csv2db.py
                           --header-names=%(column)s,%(header)s
                           --replace-header
                           --add-index=track
                           --table=%(tname)s
                       >> %(outfile)s"""
        P.run()

    os.unlink(tmpfilename)

def reMergeBamfiles(infiles, sentinel):
    infiles = [P.snip(x, ".sentinel") + ".bam" for x in infiles]
    outfile = P.snip(sentinel, ".sentinel") + ".bam"
    bad_samples = PARAMS["options_to_remove"].split(",")

    to_merge = IDR.filterBadLibraries(infiles, bad_samples)

    IDR.mergeBams(to_merge, outfile)
    P.touch(sentinel)

def poolSampleBamfiles(infiles, sentinel):
    """
    Merge filtered sample files for each tissue
    """
    infiles = [P.snip(x, ".sentinel") + ".bam" for x in infiles]
    outfile = P.snip(sentinel, ".sentinel") + ".bam"

    IDR.mergeBams(infiles, outfile)
    P.touch(sentinel)

def findNPeaksForPooledPseudoreplicates(infiles, outfile):
    idr_thresh = PARAMS["idr_options_pooled_consistency_threshold"]

    # P.submit needs the module path without its .py/.pyc suffix
    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")

    P.submit(module,
             "findNPeaks",
             params=[str(idr_thresh), ],
             infiles=infiles,
             outfiles=outfile)

def findNPeaksForIndividualReplicates(infiles, outfile):
    idr_thresh = PARAMS["idr_options_inter_replicate_threshold"]

    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")

    P.submit(module,
             "findNPeaks",
             params=[str(idr_thresh), ],
             infiles=infiles,
             outfiles=outfile)

def mergeEffects(infiles, outfile):
    '''load transcript effects into single table.'''
    tablename = P.toTable(outfile)
    outf = open('effects.txt', 'w')
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".effects.gz")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [x for x in gzip.open(f, "r").readlines()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))
    outf.close()

    P.load("effects.txt", outfile,
           options="--add-index=transcript_id")

    for suffix in ("cds", "intron", "splicing", "translation", "genes"):

        outf = open('effects.' + suffix + '.txt', 'w')
        first = True
        for f in infiles:
            track = P.snip(os.path.basename(f), ".effects.gz")
            statfile = f + "." + suffix + ".gz"
            if not os.path.exists(statfile):
                E.warn("File %s missing" % statfile)
                continue
            lines = [x for x in gzip.open(statfile, "r").readlines()]
            if first:
                outf.write("%s\t%s" % ("track", lines[0]))
                first = False
            for i in range(1, len(lines)):
                outf.write("%s\t%s" % (track, lines[i]))
        outf.close()

        P.load(outf.name, outfile,
               tablename=tablename + "_" + suffix,
               options="--add-index=transcript_id "
               "--allow-empty-file "
               "--ignore-column=seq_na "
               "--ignore-column=seq_aa")

def linkBamToWorkingDirs(infiles, outfile):
    '''
    symlink the bam file and index to the working directories for
    execution of the transcript building pipeline
    '''
    bamfile = P.snip(infiles[0], ".bai")
    indexfile = infiles[0]
    directories = [P.snip(logfile, ".log") for logfile in infiles[1]]

    for directory in directories:
        os.symlink(os.path.abspath(bamfile),
                   os.path.join(directory, bamfile))
        os.symlink(os.path.abspath(indexfile),
                   os.path.join(directory, indexfile))
    updateFile(outfile)

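`updateFile` is referenced above but not defined in this excerpt. A minimal sketch of what it is assumed to do (touch the output flag file so ruffus sees the task as complete); an illustrative stand-in, not the pipeline's actual helper:

import os


def updateFile(filename):
    # create the flag file if missing, otherwise bump its
    # modification time so downstream tasks see it as up to date
    with open(filename, "a"):
        os.utime(filename, None)
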
def runMutectOnDownsampled(infiles, outfile):
    '''call somatic SNPs using MuTect on downsampled bams'''
    infile, normal_panel = infiles
    infile_tumour = infile.replace(
        PARAMS["sample_control"], PARAMS["sample_tumour"])
    basename = P.snip(outfile, "_normal_mutect.vcf")

    call_stats_out = basename + "_call_stats.out"
    mutect_log = basename + ".log"

    (cosmic, dbsnp, quality, max_alt_qual, max_alt,
     max_fraction, tumor_LOD) = (
         PARAMS["mutect_cosmic"], PARAMS["gatk_dbsnp"],
         PARAMS["mutect_quality"], PARAMS["mutect_max_alt_qual"],
         PARAMS["mutect_max_alt"], PARAMS["mutect_max_fraction"],
         PARAMS["mutect_LOD"])

    genome = "%s/%s.fa" % (PARAMS["bwa_index_dir"],
                           PARAMS["genome"])

    PipelineExome.mutectSNPCaller(infile_tumour, outfile, mutect_log,
                                  genome, cosmic, dbsnp, call_stats_out,
                                  PARAMS['mutect_memory'],
                                  PARAMS['mutect_threads'],
                                  quality, max_alt_qual,
                                  max_alt, max_fraction, tumor_LOD,
                                  normal_panel, infile)

def mergeAndLoad(infiles, outfile, suffix):
    """load categorical tables (two columns) into a database.

    The tables are merged and entered row-wise.
    """
    header = ",".join([P.tablequote(P.snip(x, suffix)) for x in infiles])
    if suffix.endswith(".gz"):
        filenames = " ".join(
            ["<( zcat %s | cut -f 1,2 )" % x for x in infiles])
    else:
        filenames = " ".join(
            ["<( cat %s | cut -f 1,2 )" % x for x in infiles])

    tablename = P.toTable(outfile)

    statement = """python %(scriptsdir)s/combine_tables.py
                       --header-names=%(header)s
                       --missing-value=0
                       --ignore-empty
                       %(filenames)s
                   | perl -p -e "s/bin/track/"
                   | python %(scriptsdir)s/table2table.py --transpose
                   | python %(scriptsdir)s/csv2db.py
                       --add-index=track
                       --table=%(tablename)s
                   > %(outfile)s"""
    P.run()

def runPCA(infile, outfile, rownames=1):
    '''run principal components analysis on normalised matrix'''
    # ncol = len(open(infile).readline().strip("\n").split("\t"))
    # read in and format data
    R('''dat <- read.csv("%s", header=T,
                         stringsAsFactors=F, sep="\t",
                         row.names=%i)''' % (infile, rownames))
    # run PCA
    R('''pc.dat <- prcomp(as.matrix(t(dat)))''')
    # get scores
    R('''pc.dat.scores <- data.frame(pc.dat$x)''')
    R('''pc.dat.scores$sample <- rownames(pc.dat.scores)''')
    # put the sample column first, followed by the PC columns
    R('''pc.dat.scores <- pc.dat.scores[, c("sample",
         colnames(pc.dat.scores)[1:(ncol(pc.dat.scores)-1)])]''')
    R('''write.table(pc.dat.scores, file="%s", sep="\t",
                     quote=F, row.names=F)''' % outfile)
    # get the variance explained
    outf_ve = P.snip(outfile, ".tsv") + ".ve.tsv"
    R('''ve <- data.frame(summary(pc.dat)$importance)''')
    R('''ve <- ve[2,]''')
    R('''write.table(ve, file="%s", sep="\t",
                     quote=F, row.names=F)''' % outf_ve)

def buildPicardAlignStats(infile, outfile):
    '''Gather BAM file alignment statistics using Picard'''
    to_cluster = USECLUSTER
    track = P.snip(os.path.basename(infile), ".bam")
    statement = '''CollectAlignmentSummaryMetrics
                       INPUT=%(infile)s
                       REFERENCE_SEQUENCE=%%(samtools_genome)s
                       ASSUME_SORTED=true
                       OUTPUT=%(outfile)s
                       VALIDATION_STRINGENCY=SILENT''' % locals()
    P.run()

def plotMDS(infile, outfile):
    '''perform multidimensional scaling of normalised counts'''
    outname_matrix = P.snip(outfile, ".pdf") + ".tsv"
    R('''library(gtools)''')
    R('''library(ggplot2)''')
    R('''dat <- read.csv("%s", header = T,
                         stringsAsFactors = F, sep = "\t")''' % infile)
    R('''rownames(dat) <- dat$taxa
         dat <- dat[, 1:(ncol(dat)-1)]
         dat <- dat[, mixedsort(colnames(dat))]
         conds <- unlist(strsplit(colnames(dat),
                  ".R[0-9].*"))[seq(1, ncol(dat)*2, 2)]
         conds <- unlist(strsplit(conds, ".",
                  fixed = T))[seq(2, length(conds)*2, 2)]
         dat <- as.matrix(t(dat))
         dist <- dist(dat)
         ord1 <- cmdscale(dist)
         ord2 <- as.data.frame(ord1)
         ord2$cond <- conds
         plot1 <- ggplot(ord2, aes(x = V1, y = V2, colour = cond))
         plot2 <- plot1 + geom_point(size = 3)
         cols <- rainbow(length(unique(conds)))
         plot3 <- plot2 + scale_colour_manual(values = c(cols))
         ggsave("%s")''' % outfile)

def generateCoordinates(infile, outfile):
    fragments, probes = infile
    lookup = P.snip(outfile, ".bed.gz") + ".lookup.tsv"
    pipelineCaptCPerl.getProbeFragments(probes, fragments,
                                        outfile, lookup)

def buildCheckSums(infile, outfile):
    '''build checksums for files in the build directory.

    Files are uncompressed before computing the checksum
    as gzip stores meta information such as the time stamp.
    '''
    track = P.snip(infile, ".log")

    suffixes = P.asList(PARAMS.get(
        '%s_suffixes' % track, PARAMS["suffixes"]))
    if len(suffixes) == 0:
        raise ValueError('no file types defined for test')

    regex_pattern = r".*\(%s\)" % r"\|".join(suffixes)
    regex_pattern = pipes.quote(regex_pattern)

    # ignore log files as time stamps will
    # be different
    statement = '''find %(track)s.dir
                   -type f
                   -not -regex ".*.log"
                   -regex %(regex_pattern)s
                   -exec %(pipeline_scriptsdir)s/cgat_file_apply.sh {} md5sum \;
                   | perl -p -e "s/ +/\\t/g"
                   | sort -k1,1
                   > %(outfile)s'''
    P.run()

def collectFastQCSections(infiles, section, datadir):
    '''iterate over all fastqc files and extract a particular section.

    Arguments
    ---------
    infiles : list
        List of filenames with fastqc output (logging information). The
        track name is derived from that.
    section : string
        Section name to extract
    datadir : string
        Location of actual Fastqc output to be parsed.

    Returns
    -------
    results : list
        List of tuples, one tuple per input file. Each tuple contains
        track, status, header and data of `section`.
    '''
    results = []
    for infile in infiles:
        track = P.snip(os.path.basename(infile), ".fastqc")
        filename = os.path.join(datadir, track + "*_fastqc",
                                "fastqc_data.txt")
        for fn in glob.glob(filename):
            for name, status, header, data in FastqcSectionIterator(
                    IOTools.openFile(fn)):
                if name == section:
                    results.append((track, status, header, data))
    return results

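`FastqcSectionIterator` is used above but not shown in this excerpt. A minimal sketch of such an iterator, assuming the standard fastqc_data.txt layout (modules delimited by ">>Name<tab>status" and ">>END_MODULE", with a "#"-prefixed column-header row); the details here are assumptions, not the pipeline's actual implementation:

def FastqcSectionIterator(infile):
    '''yield (name, status, header, data) tuples for each section of a
    fastqc_data.txt stream - illustrative sketch only.'''
    name, status, header, data = None, None, None, []
    for line in infile:
        line = line.rstrip("\n")
        if line.startswith(">>END_MODULE"):
            yield name, status, header, "\n".join(data)
            name, status, header, data = None, None, None, []
        elif line.startswith(">>"):
            # section start, e.g. ">>Per base sequence quality\tpass"
            name, status = line[2:].split("\t")[:2]
        elif line.startswith("#"):
            header = line[1:]
        elif name is not None:
            data.append(line)
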
def buildLineCounts(infile, outfile):
    '''compute line counts.

    Files are uncompressed before computing the number of lines.
    '''
    track = P.snip(infile, ".log")

    suffixes = P.asList(PARAMS.get(
        '%s_suffixes' % track, PARAMS["suffixes"]))
    if len(suffixes) == 0:
        raise ValueError('no file types defined for test')

    regex_pattern = r".*\(%s\)" % r"\|".join(suffixes)
    regex_pattern = pipes.quote(regex_pattern)

    # ignore log files as time stamps will
    # be different
    statement = '''find %(track)s.dir
                   -type f
                   -not -regex ".*.log"
                   -regex %(regex_pattern)s
                   -exec %(pipeline_scriptsdir)s/cgat_file_apply.sh {} wc -l \;
                   | sort -k1,1
                   > %(outfile)s'''
    P.run()

def GATKBaseRecal(infile, outfile, genome, dbsnp, solid_options=""): '''Recalibrates base quality scores using GATK''' track = P.snip(os.path.basename(infile), ".bam") tmpdir_gatk = P.getTempDir('.') job_options = getGATKOptions() job_threads = 3 statement = '''GenomeAnalysisTK -T BaseRecalibrator --out %(tmpdir_gatk)s/%(track)s.recal.grp -R %(genome)s -I %(infile)s --knownSites %(dbsnp)s %(solid_options)s ; checkpoint ;''' % locals() statement += '''GenomeAnalysisTK -T PrintReads -o %(outfile)s -BQSR %(tmpdir_gatk)s/%(track)s.recal.grp -R %(genome)s -I %(infile)s ; checkpoint ;''' % locals() statement += '''rm -rf %(tmpdir_gatk)s ;''' % locals() P.run()
def runTest(infile, outfile):
    '''run a test.

    Multiple targets are run iteratively.
    '''
    track = P.snip(outfile, ".log")

    pipeline_name = PARAMS.get(
        "%s_pipeline" % track,
        "pipeline_" + track[len("test_"):])

    pipeline_targets = P.asList(
        PARAMS.get("%s_target" % track, "full"))

    # do not run on cluster, mirror
    # that a pipeline is started from
    # the head node
    to_cluster = False

    template_statement = '''
        (cd %%(track)s.dir;
         python %%(pipelinedir)s/%%(pipeline_name)s.py
             %%(pipeline_options)s make %s)
        >& %%(outfile)s'''

    if len(pipeline_targets) == 1:
        statement = template_statement % pipeline_targets[0]
        P.run(ignore_errors=True)
    else:
        statements = []
        for pipeline_target in pipeline_targets:
            statements.append(template_statement % pipeline_target)
        P.run(ignore_errors=True)

def mergeMeanTables(infiles, outfile):
    '''
    Collate and merge all separate tables into a single large table for
    all MZ and DZ twins
    '''
    job_memory = "300G"

    panel = outfile.split("/")[-1].split("-")[1]
    cell_type = outfile.split("/")[-1].split("mean_")[-1]
    cell_type = P.snip(cell_type, ".tsv")
    table_name = "_".join([cell_type, "mean"])
    out_dir = "/".join(outfile.split("/")[:-1])
    twin_id = "twin.id"

    statement = '''
        python /ifs/devel/projects/proj052/flow_pipeline/scripts/flow2twins.py
            --task=merge_flow
            --twin-id-column=%(twin_id)s
            --demographics-file=%(twins_demographics)s
            --demo-id-column=%(twins_demo_header)s
            --database=%(database)s
            --tablename=%(table_name)s
            --filter-gates="(F|S)SC-(A|H)"
            --filter-zero-arrays
            --log=%(outfile)s.log
            --output-directory=%(out_dir)s
            --output-file-pattern=%(table_name)s'''
    P.run()
    P.touch(outfile)

def loadPicardAlignStats(infiles, outfile):
    '''Merge Picard alignment stats into single table and load into
    SQLite.'''
    tablename = P.toTable(outfile)

    outf = P.getTempFile()
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".dedup.alignstats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [x for x in open(f, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))
    outf.close()

    tmpfilename = outf.name
    statement = '''cat %(tmpfilename)s
                   | cgat csv2db
                       --add-index=track
                       --table=%(tablename)s
                   > %(outfile)s'''
    P.run()

    os.unlink(tmpfilename)

def renameTranscriptsInPreviousSets(infile, outfile):
    '''
    transcripts need to be renamed because they may use the same
    cufflinks identifiers as we use in the analysis - don't do if they
    have an ensembl id - sort by transcript
    '''
    inf = IOTools.openFile(infile)
    for gtf in GTF.iterator(inf):
        if gtf.gene_id.find("ENSG") != -1:
            statement = '''zcat %(infile)s
                           | grep -v "#"
                           | python %(scriptsdir)s/gtf2gtf.py
                               --method=sort --sort-order=gene
                               --log=%(outfile)s.log
                           | gzip > %(outfile)s'''
        else:
            gene_pattern = "GEN" + P.snip(outfile, ".gtf.gz")
            transcript_pattern = gene_pattern.replace("GEN", "TRAN")
            statement = '''zcat %(infile)s
                           | python %(scriptsdir)s/gtf2gtf.py
                               --method=renumber-genes
                               --pattern-identifier=%(gene_pattern)s%%i
                           | python %(scriptsdir)s/gtf2gtf.py
                               --method=renumber-transcripts
                               --pattern-identifier=%(transcript_pattern)s%%i
                           | python %(scriptsdir)s/gtf2gtf.py
                               --method=sort --sort-order=gene
                               --log=%(outfile)s.log
                           | gzip > %(outfile)s'''
        # the first record is sufficient to decide whether the set
        # carries Ensembl identifiers
        break
    P.run()

def GATKReadGroups(infile, outfile, genome,
                   library="unknown", platform="Illumina",
                   platform_unit="1", track="unknown"):
    '''Reorders BAM according to reference fasta and adds read groups'''
    if track == 'unknown':
        track = P.snip(os.path.basename(infile), ".bam")
    tmpdir_gatk = P.getTempDir('.')
    job_options = getGATKOptions()
    job_threads = 3

    statement = '''ReorderSam
                       INPUT=%(infile)s
                       OUTPUT=%(tmpdir_gatk)s/%(track)s.reordered.bam
                       REFERENCE=%(genome)s
                       ALLOW_INCOMPLETE_DICT_CONCORDANCE=true
                       VALIDATION_STRINGENCY=SILENT;
                   checkpoint;''' % locals()
    statement += '''samtools index %(tmpdir_gatk)s/%(track)s.reordered.bam;
                    checkpoint;''' % locals()
    statement += '''AddOrReplaceReadGroups
                        INPUT=%(tmpdir_gatk)s/%(track)s.reordered.bam
                        OUTPUT=%(outfile)s
                        RGLB=%(library)s
                        RGPL=%(platform)s
                        RGPU=%(platform_unit)s
                        RGSM=%(track)s
                        VALIDATION_STRINGENCY=SILENT;
                    checkpoint;''' % locals()
    statement += '''samtools index %(outfile)s;
                    checkpoint;''' % locals()
    statement += '''rm -rf %(tmpdir_gatk)s;''' % locals()
    P.run()

def calculateFalsePositiveRate(infiles, outfile):
    '''taxonomy false positives and negatives etc'''
    # connect to database
    dbh = sqlite3.connect(PARAMS["database"])
    cc = dbh.cursor()

    levels = ["phylum", "class", "order", "family", "genus", "species"]
    tablename_true = P.toTable(infiles[0])

    # get corresponding estimate file
    tablename_estimate = P.toTable(os.path.basename(
        [inf for inf in infiles[1:]
         if os.path.basename(inf)[len("metaphlan_"):]
         == os.path.basename(infiles[0])][0]))

    outf = open(outfile, "w")
    track = P.snip(os.path.basename(infiles[0]), ".taxonomy.relab.load")
    for level in levels:
        for cutoff in [0, 1]:
            true_set = set()
            estimate_set = set()
            for taxa in cc.execute("""SELECT taxa FROM %s
                                      WHERE level == '%s'
                                      AND relab > %f""" %
                                   (tablename_true, level,
                                    float(cutoff) / 100)):
                true_set.add(taxa[0])
            for taxa in cc.execute("""SELECT taxon FROM %s
                                      WHERE taxon_level == '%s'
                                      AND rel_abundance > %f""" %
                                   (tablename_estimate, level,
                                    float(cutoff))):
                estimate_set.add(taxa[0])
            total_true = len(true_set)
            total_estimate = len(estimate_set)
            tp = true_set.intersection(estimate_set)
            fp = estimate_set.difference(true_set)

            fp_rate = float(len(fp)) / total_estimate
            tp_rate = float(len(tp)) / total_true
            outf.write("%s\t%f\t%f\t%s\t%s\n" %
                       (level, fp_rate, tp_rate, track, str(cutoff)))
    outf.close()

def mapReadsWithBowtie(infiles, outfile, mismatches):
    '''Aligns the digested reads to the genome'''
    # If mapping analysis is not defined, use user specified parameters
    if PARAMS["addtests_mapping"] == 0:
        align_command = PARAMS["bowtie_options"]
    else:
        align_command = ("-v %s -p 1 -m 2 --best --strata "
                         "--chunkmbs 256" % mismatches)

    # Obtain the reads
    reads = infiles[0]
    genome = PARAMS["environment_bowtiegenome"]
    job_memory = "4G"
    log_file = P.snip(outfile, ".sam") + ".log"

    statement = '''bowtie %(align_command)s --sam
                   %(genome)s %(reads)s %(outfile)s
                   2> %(log_file)s'''
    P.run()

def loadBigWigStats(infiles, outfile):
    '''merge and load bigwig summary for all wiggle files.

    Summarise and merge bigwig files for all samples and load into a
    table called bigwig_stats

    Parameters
    ----------
    infiles : list
        Input filenames in :term:`bigwig` format
    outfile : string
        Output filename, the table name is derived from `outfile`.
    '''
    data = " ".join(
        ['<( bigWigInfo %s | perl -p -e "s/:/\\t/; s/ //g; s/,//g")' %
         x for x in infiles])
    headers = ",".join([P.snip(os.path.basename(x), ".bw")
                        for x in infiles])

    load_statement = P.build_load_statement(
        P.toTable(outfile),
        options="--add-index=track")

    statement = '''cgat combine_tables
                       --header-names=%(headers)s
                       --skip-titles
                       --missing-value=0
                       --ignore-empty
                       %(data)s
                   | perl -p -e "s/bin/track/"
                   | cgat table2table --transpose
                   | %(load_statement)s
                   > %(outfile)s'''
    P.run()

def exportSequencesFromBedFile(infile, outfile, masker=None, mode="intervals"): '''export sequences for intervals in :term:`bed`-formatted *infile* to :term:`fasta` formatted *outfile* ''' track = P.snip(infile, ".bed.gz") fasta = IndexedFasta.IndexedFasta( os.path.join(PARAMS["genome_dir"], PARAMS["genome"])) outs = IOTools.openFile(outfile, "w") ids, seqs = [], [] for bed in Bed.setName(Bed.iterator(IOTools.openFile(infile))): lcontig = fasta.getLength(bed.contig) if mode == "intervals": seqs.append(fasta.getSequence(bed.contig, "+", bed.start, bed.end)) ids.append("%s_%s %s:%i..%i" % (track, bed.name, bed.contig, bed.start, bed.end)) elif mode == "leftright": l = bed.end - bed.start start, end = max(0, bed.start - l), bed.end - l ids.append("%s_%s_l %s:%i..%i" % (track, bed.name, bed.contig, start, end)) seqs.append(fasta.getSequence(bed.contig, "+", start, end)) start, end = bed.start + l, min(lcontig, bed.end + l) ids.append("%s_%s_r %s:%i..%i" % (track, bed.name, bed.contig, start, end)) seqs.append(fasta.getSequence(bed.contig, "+", start, end)) masked = maskSequences(seqs, masker) outs.write("\n".join([">%s\n%s" % (x, y) for x, y in zip(ids, masked)])) outs.close()
def mergeSummarizedContextStats(infiles, outfile, samples_in_columns=False):
    """combine output from :func:`summarizeTagsWithinContext`.

    Arguments
    ---------
    infiles : list
        List of filenames in :term:`tsv` format
    outfile : string
        Output filename in :term:`tsv` format.
    samples_in_columns : bool
        If True, put samples in columns. The default is to put them
        in rows.
    """
    header = ",".join([P.snip(os.path.basename(x), ".contextstats.tsv.gz")
                       for x in infiles])
    filenames = " ".join(infiles)

    if not samples_in_columns:
        transpose_cmd = \
            """| python %(scriptsdir)s/table2table.py
                   --transpose""" % P.getParams()
    else:
        transpose_cmd = ""

    statement = """python %(scriptsdir)s/combine_tables.py
                       --header-names=%(header)s
                       --missing-value=0
                       --skip-titles
                       %(filenames)s
                   | perl -p -e "s/bin/track/; s/\?/Q/g"
                   %(transpose_cmd)s
                   | gzip
                   > %(outfile)s"""
    P.run()

def downloadSequinsNeatData(outfile):
    '''Download the neat Sequins data from NCBI'''
    address_base = 'ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByExp/sra/SRX/SRX189'
    outfile2srr = {'neat-A.fastq.1.gz': 'SRR3743147',
                   'neat-B.fastq.1.gz': 'SRR3743148'}
    srr2srx = {'SRR3743147': 'SRX1897294',
               'SRR3743148': 'SRX1897295'}

    outfile_base = os.path.basename(outfile)
    srr = outfile2srr[outfile_base]
    srx = srr2srx[srr]
    outfile_name = P.snip(outfile_base, '.fastq.1.gz')

    statement = '''wget %(address_base)s/%(srx)s/%(srr)s/%(srr)s.sra
                   -O %(outfile_name)s.sra'''
    P.run()

    outdir = os.path.dirname(outfile)
    statement = Sra.extract(outfile_name + '.sra', outdir)
    P.run()

    statement = '''mv %(outdir)s/%(outfile_name)s_1.fastq.gz
                      %(outdir)s/%(outfile_name)s.fastq.1.gz;
                   checkpoint;
                   mv %(outdir)s/%(outfile_name)s_2.fastq.gz
                      %(outdir)s/%(outfile_name)s.fastq.2.gz'''
    P.run()

    os.unlink(outfile_name + '.sra')

def GATKReadGroups(infile, outfile, genome,
                   library="unknown", platform="Illumina",
                   platform_unit="1", track="unknown"):
    '''Reorders BAM according to reference fasta and adds read groups.

    To see the read group information for a BAM file, use:
    samtools view -H sample.bam | grep '@RG'
    - no RG currently available
    '''
    if track == 'unknown':
        track = P.snip(os.path.basename(infile), ".dedup.bam")
    tmpdir_gatk = P.getTempDir('.')
    job_options = getGATKOptions()
    job_threads = 3

    statement = '''picard ReorderSam
                       INPUT=%(infile)s
                       OUTPUT=%(tmpdir_gatk)s/%(track)s.reordered.bam
                       REFERENCE=%(genome)s
                       ALLOW_INCOMPLETE_DICT_CONCORDANCE=true
                       VALIDATION_STRINGENCY=SILENT;
                   checkpoint;''' % locals()
    statement += '''samtools index %(tmpdir_gatk)s/%(track)s.reordered.bam;
                    checkpoint;''' % locals()
    statement += '''picard AddOrReplaceReadGroups
                        INPUT=%(tmpdir_gatk)s/%(track)s.reordered.bam
                        OUTPUT=%(outfile)s
                        RGLB=%(library)s
                        RGPL=%(platform)s
                        RGPU=%(platform_unit)s
                        RGSM=%(track)s
                        VALIDATION_STRINGENCY=SILENT;
                    checkpoint;''' % locals()
    statement += '''samtools index %(outfile)s;
                    checkpoint;''' % locals()
    statement += '''rm -rf %(tmpdir_gatk)s;''' % locals()
    P.run()

def variantRecalibrator(infile, outfile, genome, mode, dbsnp=None,
                        kgenomes=None, hapmap=None, omni=None,
                        mills=None, track=None):
    '''Create variant recalibration file'''
    job_options = getGATKOptions()
    job_threads = 3
    track = P.snip(outfile, ".recal")

    if mode == 'SNP':
        statement = '''GenomeAnalysisTK
                           -T VariantRecalibrator
                           -R %(genome)s
                           -input %(infile)s
                           -resource:hapmap,known=false,training=true,truth=true,prior=15.0 %(hapmap)s
                           -resource:omni,known=false,training=true,truth=true,prior=12.0 %(omni)s
                           -resource:dbsnp,known=true,training=false,truth=false,prior=2.0 %(dbsnp)s
                           -resource:1000G,known=false,training=true,truth=false,prior=10.0 %(kgenomes)s
                           -an QD -an SOR -an MQRankSum
                           -an ReadPosRankSum -an FS -an MQ
                           --maxGaussians 4
                           -mode %(mode)s
                           -recalFile %(outfile)s
                           -tranchesFile %(track)s.tranches
                           -rscriptFile %(track)s.plots.R''' % locals()
        P.run()
    elif mode == 'INDEL':
        statement = '''GenomeAnalysisTK
                           -T VariantRecalibrator
                           -R %(genome)s
                           -input %(infile)s
                           -resource:mills,known=true,training=true,truth=true,prior=12.0 %(mills)s
                           -an QD -an MQRankSum
                           -an ReadPosRankSum -an FS -an MQ
                           --maxGaussians 4
                           --minNumBadVariants 5000
                           -mode %(mode)s
                           -recalFile %(outfile)s
                           -tranchesFile %(track)s.tranches
                           -rscriptFile %(track)s.plots.R''' % locals()
        P.run()

def buildLineCounts(infile, outfile):
    '''compute line counts.

    Files are uncompressed before computing the number of lines.
    '''
    track = P.snip(infile, ".log")

    suffixes = P.asList(PARAMS.get('%s_suffixes' % track,
                                   PARAMS["suffixes"]))
    if len(suffixes) == 0:
        raise ValueError('no file types defined for test')

    regex_pattern = r".*\(%s\)" % r"\|".join(suffixes)
    regex_pattern = pipes.quote(regex_pattern)

    statement = '''find %(track)s.dir
                   -type f
                   -regex %(regex_pattern)s
                   -exec %(pipeline_scriptsdir)s/cgat_file_apply.sh {} wc -l \;
                   | sort -k1,1
                   > %(outfile)s'''
    P.run()

def loadMissedReadCounts(infiles, outfile):
    '''load summary table of numbers of missed reads.'''

    def _getlines(inf):
        return len(IOTools.openFile(inf).readlines()) - 1

    tmpfile = P.getTempFile()

    infiles = sorted(infiles)

    tmpfile.write(
        "track\tmapped_genome\tmissed_junctions\tmissed_transcriptome\n")
    for x in range(0, len(infiles), 2):
        junctions, transcriptome = infiles[x], infiles[x + 1]
        track = P.snip(junctions, ".missed_junctions.gz")
        mapped_genome = _getlines(track + ".mapped_reads.gz")
        tmpfile.write("%s\t%i\t%i\t%i\n" %
                      (track, mapped_genome,
                       _getlines(junctions), _getlines(transcriptome)))
    tmpfile.close()
    P.load(tmpfile.name, outfile)
    os.unlink(tmpfile.name)

def collateTaxonBioProjectNumbers(infiles, outfile):
    '''Parse the XML files, and output the project IDs'''
    outf = IOTools.openFile(outfile, 'w')
    outf.write('bioproject\ttaxon_id\ttaxon_name\n')
    outf_failed = IOTools.openFile(
        P.snip(outfile, '.tsv') + '_discarded.tsv', 'w')

    for infile in infiles:
        inf = os.path.basename(infile)[:-len('.xml')]
        tax_id = inf.split('_').pop()

        inf = IOTools.openFile(infile).readlines()
        if len(inf) == 0:
            L.warn('No genbank accession for taxon: %s' % infile)
            outf_failed.write(infile + '\n')
            continue

        name = None
        project = None
        for line in inf:
            line = line.lstrip().strip()
            if line.startswith('DBLINK'):
                assert line.split()[1] == 'BioProject:'
                project = line.split()[2]
            if line.startswith('ORGANISM'):
                name = ' '.join(line.split()[1:])

        if not project:
            L.warn('No BioProject ID for taxon: %s' % infile)
            outf_failed.write(infile + '\n')
            continue

        outf.write('\t'.join([project, tax_id, name]) + '\n')

    outf.close()
    outf_failed.close()

def loadPicardAlignStats(infiles, outfile):
    '''Merge Picard alignment stats into single table and load into
    SQLite.'''
    # Join data for all tracks into single file
    outf = P.getTempFile()
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".alignstats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [x for x in open(f, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))
    outf.close()

    # Load into database
    P.load(outf.name, outfile, options="--add-index=track")
    os.unlink(outf.name)

def buildContigBed(infile, outfile):
    '''
    Gets the contig sizes and co-ordinates from an indexed genome
    :term:`fasta` file and outputs them to :term:`BED` format

    Parameters
    ----------
    infile : str
        infile is constructed from `PARAMS` variable to retrieve
        the `genome` :term:`fasta` file

    Returns
    -------
    outfile : str
        :term:`BED` format file containing contig name, value (0) and
        contig size in nucleotides. The output file name is defined in
        `PARAMS: interface_contigs_bed`
    '''
    prefix = P.snip(infile, ".fasta")
    fasta = IndexedFasta.IndexedFasta(prefix)
    outs = IOTools.openFile(outfile, "w")

    for contig, size in fasta.getContigSizes(with_synonyms=False).items():
        outs.write("%s\t%i\t%i\n" % (contig, 0, size))

    outs.close()

def loadSummarizedContextStats(infiles,
                               outfile,
                               suffix=".contextstats.tsv.gz"):
    """merge output from :func:`summarizeTagsWithinContext` and load
    into database.

    Arguments
    ---------
    infiles : list
        List of filenames in :term:`tsv` format. The files should end
        in suffix.
    outfile : string
        Output filename, the table name is derived from `outfile`.
    suffix : string
        Suffix to remove from filename for track name.
    """
    header = ",".join([P.snip(os.path.basename(x), suffix)
                       for x in infiles])
    filenames = " ".join(infiles)

    load_statement = P.build_load_statement(
        P.toTable(outfile),
        options="--add-index=track")

    statement = """cgat combine_tables
                       --header-names=%(header)s
                       --missing-value=0
                       --skip-titles
                       %(filenames)s
                   | perl -p -e "s/bin/track/; s/\?/Q/g"
                   | cgat table2table --transpose
                   | %(load_statement)s
                   > %(outfile)s"""
    P.run()

def STARmap(infile, outfile):
    '''maps non-repetitive elements to genome'''
    outprefix = P.snip(outfile, ".bam")
    job_threads = PARAMS["STARmap_threads"]

    statement = '''STAR
                       --runMode alignReads
                       --runThreadN %(job_threads)i
                       --genomeDir %(STARmap_genome)s
                       --readFilesIn %(infile)s
                       --outSAMunmapped Within
                       --outFilterMultimapNmax 1
                       --outFilterMultimapScoreRange 1
                       --outFileNamePrefix %(outprefix)s
                       --outSAMattributes All
                       --outStd BAM_Unsorted
                       --outSAMtype BAM Unsorted
                       --outFilterType BySJout
                       --outReadsUnmapped Fastx
                       --outFilterScoreMin 10
                       --outSAMattrRGline ID:foo
                       --alignEndsType Local
                   > %(outfile)s'''
    P.run()

def loadPicardHistogram(infiles, outfile, suffix, column,
                        pipeline_suffix=".picard_stats", tablename=False):
    '''extract a histogram from a picard output file and load
    it into database.

    Arguments
    ---------
    infiles : string
        Filenames of files with picard metric information. Each file
        corresponds to a different track.
    outfile : string
        Logfile.
    suffix : string
        Suffix to append to table name.
    column : string
        Column name to take from the histogram.
    pipeline_suffix : string
        Suffix to remove from track name.
    tablename : string
        Tablename to use. If unset, the table name will be derived
        from `outfile` and suffix as ``toTable(outfile) + "_" +
        suffix``.
    '''
    if not tablename:
        tablename = "%s_%s" % (P.toTable(outfile), suffix)
        tablename = tablename.replace("_metrics", "_histogram")

    # some files might be missing
    xfiles = [x for x in infiles if os.path.exists("%s.%s" % (x, suffix))]

    if len(xfiles) == 0:
        E.warn("no files for %s" % tablename)
        return

    header = ",".join([P.snip(os.path.basename(x), pipeline_suffix)
                       for x in xfiles])
    filenames = " ".join(["%s.%s" % (x, suffix) for x in xfiles])

    # there might be a variable number of columns in the tables
    # only take the first ignoring the rest
    load_statement = P.build_load_statement(
        tablename,
        options="--add-index=track "
        " --header-names=%s,%s"
        " --allow-empty-file"
        " --replace-header" % (column, header))

    statement = """cgat combine_tables
                       --regex-start="## HISTOGRAM"
                       --missing-value=0
                       --take=2
                       %(filenames)s
                   | %(load_statement)s
                   >> %(outfile)s"""
    P.run()

def buildCodingPotential(infile, outfile):
    '''run CPC analysis as in the cpc script.

    This module runs framefinder and blastx on both strands. It seems
    to work, but I have not thoroughly tested it. I expect that the
    false positive rate increases (i.e., predicting non-coding as
    coding) in cases where the best framefinder match and the best
    blast match are on opposite strands. In the original CPC, these
    would be separated.
    '''
    try:
        cpc_dir = os.environ["CPC_HOME"]
    except KeyError:
        raise ValueError("CPC_HOME environment variable is not set. ")

    tmpdir = P.getTempDir(".")
    track = P.snip(outfile, ".coding.gz")

    # extract features for frame finder
    # replaces extract_framefinder_feats.pl to parse both strands
    with open(os.path.join(tmpdir, "ff.feat"), "w") as outf:
        outf.write("\t".join(("QueryID", "CDSLength", "Score",
                              "Used", "Strict")) + "\n")
        for line in IOTools.openFile("%s.frame.gz" % track):
            if line.startswith(">"):
                try:
                    (id, start, end, score, used, mode, tpe) = \
                        re.match(
                            r">(\S+).*framefinder \((\d+),(\d+)\) "
                            r"score=(\S+) used=(\S+)% \{(\S+),(\w+)\}",
                            line).groups()
                except AttributeError:
                    raise ValueError("parsing error in line %s" % line)
                length = int(end) - int(start) + 1
                strict = int(tpe == "strict")
                outf.write("\t".join((id, str(length), score, used,
                                      str(strict))) + "\n")

    to_cluster = USECLUSTER

    # extract features and prepare svm data
    s = []

    s.append('''zcat %(infile)s
                | perl %(cpc_dir)s/libs/blast2table.pl
                | tee %(tmpdir)s/blastx.table
                | perl %(cpc_dir)s/bin/extract_blastx_features.pl
                > %(tmpdir)s/blastx.feat1;''')

    s.append('''cat %(track)s_norepeats.fasta
                | perl %(cpc_dir)s/bin/add_missing_entries.pl
                    %(tmpdir)s/blastx.feat1
                > %(tmpdir)s/blastx.feat;''')

    # step 2 - prepare data
    s.append('''perl %(cpc_dir)s/bin/feat2libsvm.pl -c 2,4,6 NA NA
                    %(tmpdir)s/blastx.feat
                > %(tmpdir)s/blastx.lsv;''')
    s.append('''perl %(cpc_dir)s/bin/feat2libsvm.pl -c 2,3,4,5 NA NA
                    %(tmpdir)s/ff.feat
                > %(tmpdir)s/ff.lsv;''')
    s.append('''perl -w %(cpc_dir)s/bin/lsv_cbind.pl
                    %(tmpdir)s/blastx.lsv %(tmpdir)s/ff.lsv
                > %(tmpdir)s/test.lsv;''')
    s.append('''%(cpc_dir)s/libs/libsvm/libsvm-2.81/svm-scale
                    -r %(cpc_dir)s/data/libsvm.range
                    %(tmpdir)s/test.lsv
                > %(tmpdir)s/test.lsv.scaled;''')

    # step 3: prediction
    m_libsvm_model0 = os.path.join(cpc_dir, "data/libsvm.model0")  # standard
    m_libsvm_model = os.path.join(cpc_dir, "data/libsvm.model")  # Prob
    m_libsvm_model2 = os.path.join(
        cpc_dir, "data/libsvm.model2")  # Prob + weighted version
    m_libsvm_range = os.path.join(cpc_dir, "data/libsvm.range")

    s.append('''%(cpc_dir)s/libs/libsvm/libsvm-2.81/svm-predict2
                    %(tmpdir)s/test.lsv.scaled
                    %(m_libsvm_model0)s
                    %(tmpdir)s/test.svm0.predict
                > %(tmpdir)s/test.svm0.stdout
                2> %(tmpdir)s/test.svm0.stderr;''')

    s.append('''printf "gene_id\\tlength\\tresult\\tvalue\\n"
                | gzip > %(outfile)s;
                cat %(tmpdir)s/test.svm0.predict
                | perl -w %(cpc_dir)s/bin/predict.pl
                    %(track)s_norepeats.fasta
                | gzip >> %(outfile)s;''')

    # generate reports
    s.append('''cat %(tmpdir)s/blastx.feat
                | perl -w %(cpc_dir)s/bin/generate_plot_features.pl
                    %(tmpdir)s/blastx.table
                    <( zcat %(track)s.frame.gz)
                | perl -w %(cpc_dir)s/bin/split_plot_features_by_type.pl
                    %(outfile)s.homology %(outfile)s.orf;
                gzip %(outfile)s.orf %(outfile)s.homology;''')

    # now run it all
    statement = " checkpoint; ".join(s)
    P.run()

    # clean up
    shutil.rmtree(tmpdir)

def loadBAMStats(infiles, outfile):
    '''load output of :func:`buildBAMStats` into database.

    Arguments
    ---------
    infiles : string
        Input files, output from :func:`buildBAMStats`.
    outfile : string
        Logfile. The table name will be derived from `outfile`.
    '''
    header = ",".join([P.snip(os.path.basename(x), ".readstats")
                       for x in infiles])
    filenames = " ".join(["<( cut -f 1,2 < %s)" % x for x in infiles])
    tablename = P.toTable(outfile)

    load_statement = P.build_load_statement(
        tablename,
        options="--add-index=track "
        " --allow-empty-file")

    E.info("loading bam stats - summary")
    statement = """cgat combine_tables
                       --header-names=%(header)s
                       --missing-value=0
                       --ignore-empty
                       %(filenames)s
                   | perl -p -e "s/bin/track/"
                   | cgat table2table --transpose
                   | %(load_statement)s
                   > %(outfile)s"""
    P.run()

    for suffix in ("nm", "nh"):
        E.info("loading bam stats - %s" % suffix)
        filenames = " ".join(["%s.%s" % (x, suffix) for x in infiles])

        load_statement = P.build_load_statement(
            "%s_%s" % (tablename, suffix),
            options="--allow-empty-file")

        statement = """cgat combine_tables
                           --header-names=%(header)s
                           --skip-titles
                           --missing-value=0
                           --ignore-empty
                           %(filenames)s
                       | perl -p -e "s/bin/%(suffix)s/"
                       | %(load_statement)s
                       >> %(outfile)s"""
        P.run()

    # load mapping qualities, there are two columns per row
    # 'all_reads' and 'filtered_reads'
    # Here, only filtered_reads are used (--take=3)
    for suffix in ("mapq",):
        E.info("loading bam stats - %s" % suffix)
        filenames = " ".join(["%s.%s" % (x, suffix) for x in infiles])

        load_statement = P.build_load_statement(
            "%s_%s" % (tablename, suffix),
            options=" --allow-empty-file")

        statement = """cgat combine_tables
                           --header-names=%(header)s
                           --skip-titles
                           --missing-value=0
                           --ignore-empty
                           --take=3
                           %(filenames)s
                       | perl -p -e "s/bin/%(suffix)s/"
                       | %(load_statement)s
                       >> %(outfile)s"""
        P.run()

# load options from the config file
P.get_parameters(["pipeline.yml"])

PARAMS = P.PARAMS

###################################################
###################################################
###################################################

INFILES = glob.glob("*.fastq.1.gz")
OUTFILES = ["filtered.dir/" + x for x in INFILES]

# check the existence of paired files
if PARAMS["paired"] == 1:
    for infile in INFILES:
        second_in_pair = P.snip(infile, ".fastq.1.gz") + ".fastq.2.gz"
        assert os.path.exists(second_in_pair), \
            "no second read for file %s: should be %s" % (
                infile, second_in_pair)
    trunclen = PARAMS["trim_trunclen"].split(",")
    maxee = PARAMS["trim_maxee"].split(",")
    assert len(trunclen) == 2, \
        "specify 2 values only for paired data (truncLen)"
    assert len(maxee) == 2, \
        "specify 2 values only for paired data (maxee)"

###################################################
###################################################
###################################################


@follows(mkdir("filtered.dir"))
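For orientation, these are the configuration keys the block above reads, shown as the dictionary PARAMS might contain after P.get_parameters; the key names come from the lookups above, the values are illustrative assumptions only:

# example PARAMS content for this block (illustrative values)
EXAMPLE_PARAMS = {
    "paired": 1,                 # 1 = paired-end data expected
    "trim_trunclen": "240,160",  # truncation lengths for reads 1 and 2
    "trim_maxee": "2,2",         # maximum expected errors per read
}
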
def buildPicardAlignStats(infile, outfile):
    '''Gather BAM file alignment statistics using Picard'''
    track = P.snip(os.path.basename(infile), ".bam")
    statement = '''CollectAlignmentSummaryMetrics
                       INPUT=%(infile)s
                       REFERENCE_SEQUENCE=%%(samtools_genome)s
                       ASSUME_SORTED=true
                       OUTPUT=%(outfile)s
                       VALIDATION_STRINGENCY=SILENT''' % locals()
    P.run()

def loadPicardMetrics(infiles, outfile, suffix,
                      pipeline_suffix=".picard_stats",
                      tablename=None):
    '''load picard metrics.

    Arguments
    ---------
    infiles : string
        Filenames of files with picard metric information. Each file
        corresponds to a different track.
    outfile : string
        Logfile.
    suffix : string
        Suffix to append to table name.
    pipeline_suffix : string
        Suffix to remove from track name.
    tablename : string
        Tablename to use. If unset, the table name will be derived
        from `outfile` and suffix as ``toTable(outfile) + "_" +
        suffix``.
    '''
    if not tablename:
        tablename = "%s_%s" % (P.toTable(outfile), suffix)

    outf = P.getTempFile(".")

    filenames = ["%s.%s" % (x, suffix) for x in infiles]

    first = True
    for filename in filenames:
        track = P.snip(os.path.basename(filename),
                       "%s.%s" % (pipeline_suffix, suffix))

        if not os.path.exists(filename):
            E.warn("File %s missing" % filename)
            continue

        lines = IOTools.openFile(filename, "r").readlines()

        # extract metrics part
        rx_start = re.compile("## METRICS CLASS")
        for n, line in enumerate(lines):
            if rx_start.search(line):
                lines = lines[n + 1:]
                break

        for n, line in enumerate(lines):
            if not line.strip():
                lines = lines[:n]
                break

        if len(lines) == 0:
            E.warn("no lines in %s: %s" % (track, filename))
            continue

        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            fields = lines[0][:-1].split("\t")
        else:
            f = lines[0][:-1].split("\t")
            if f != fields:
                raise ValueError(
                    "file %s has different fields: expected %s, got %s" %
                    (filename, fields, f))

        first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))

    outf.close()

    P.load(outf.name,
           outfile,
           tablename=tablename,
           options="--add-index=track --allow-empty-file")

    os.unlink(outf.name)

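loadPicardMetrics and loadPicardHistogram are typically called together, once per Picard metric type: the summary rows go into one table, the per-bin histogram into another. A hypothetical wrapper showing the pattern (the insert-size names are illustrative, not taken from this excerpt):

def loadPicardInsertSizeStats(infiles, outfile):
    '''hypothetical example: load both the metrics section and the
    histogram section of Picard insert-size output.'''
    loadPicardMetrics(infiles, outfile, "insert_size_metrics")
    loadPicardHistogram(infiles, outfile,
                        "insert_size_metrics",
                        "insert_size")
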
def sortByName(infile, outfile):
    '''Sort BAM file by read name'''
    to_cluster = USECLUSTER
    track = P.snip(outfile, ".bam")
    statement = '''samtools sort -n %(infile)s %(track)s;'''
    P.run()

def compareCheckSums(infiles, outfile):
    '''compare checksum files against existing reference data.'''
    outf = IOTools.openFile(outfile, "w")
    outf.write("\t".join((
        "track", "status",
        "nfiles", "nref",
        "missing", "extra",
        "different",
        "different_md5",
        "different_lines",
        "files_missing",
        "files_extra",
        "files_different_md5",
        "files_different_lines")) + "\n")

    for infile in infiles:
        track = P.snip(infile, ".stats")
        reffile = track + ".ref"

        # regular expression of files to test only for existence
        regex_exist = PARAMS.get('%s_regex_exist' % track, None)
        if regex_exist:
            regex_exist = re.compile(regex_exist)

        regex_linecount = PARAMS.get('%s_regex_linecount' % track, None)
        if regex_linecount:
            regex_linecount = re.compile(regex_linecount)

        if not os.path.exists(reffile):
            raise ValueError('no reference data defined for %s' % track)

        cmp_data = pandas.read_csv(IOTools.openFile(infile),
                                   sep="\t", index_col=0)
        ref_data = pandas.read_csv(IOTools.openFile(reffile),
                                   sep="\t", index_col=0)

        shared_files = set(cmp_data.index).intersection(ref_data.index)

        missing = set(ref_data.index).difference(cmp_data.index)
        extra = set(cmp_data.index).difference(ref_data.index)

        different = set(shared_files)

        # remove those for which only check for existence
        if regex_exist:
            different = set([x for x in different
                             if not regex_exist.search(x)])

        # select those for which only check for number of lines
        if regex_linecount:
            check_lines = [x for x in different
                           if regex_linecount.search(x)]
            dd = (cmp_data['nlines'][check_lines] !=
                  ref_data['nlines'][check_lines])
            different_lines = set(dd.index[dd])
            different = different.difference(check_lines)
        else:
            different_lines = set()

        # remainder - check md5
        dd = cmp_data['md5'][different] != ref_data['md5'][different]
        different_md5 = set(dd.index[dd])

        if len(missing) + len(extra) + \
           len(different_md5) + len(different_lines) == 0:
            status = "OK"
        else:
            status = "FAIL"

        outf.write("\t".join(map(str, (
            track,
            status,
            len(cmp_data),
            len(ref_data),
            len(missing),
            len(extra),
            len(different_md5) + len(different_lines),
            len(different_md5),
            len(different_lines),
            ",".join(missing),
            ",".join(extra),
            ",".join(different_md5),
            ",".join(different_lines),
        ))) + "\n")

    outf.close()

def intersectionHeatmap(infiles, outfile):
    '''calculate the intersection between the infiles and plot'''
    pandas2ri.activate()
    name2genes = {}
    df = pd.DataFrame(columns=["id_1", "id_2", "intersection", "perc"])

    ix = 0
    for inf in infiles:

        name = P.snip(os.path.basename(inf)).split(".")[0]
        name = name.replace(".", "_")

        with IOTools.openFile(inf, "r") as f:
            genes = set()

            for line in f:
                if line[0] == "#":
                    continue

                values = line.strip().split("\t")
                info = values[7].split(";")

                # initialise so that lines without a SNPEFF_GENE_NAME
                # entry are skipped rather than reusing a stale value
                gene_name = None
                for x in info:
                    if x.split("=")[0] == "SNPEFF_GENE_NAME":
                        gene_name = x.split("=")[1]
                        break

                # if no gene name found, line is skipped
                if gene_name:
                    genes.update((gene_name, ))

            name2genes[name] = genes
            df.loc[ix] = [name, name, len(genes), 1.0]
            ix += 1

    for pair in itertools.permutations(list(name2genes.keys()), 2):
        id_1, id_2 = pair

        intersection = len(name2genes[id_1].intersection(name2genes[id_2]))
        not_intersecting = len(
            name2genes[id_1].symmetric_difference(name2genes[id_2]))
        intersection_perc = float(intersection) / (intersection +
                                                   not_intersecting)

        df.loc[ix] = [id_1, id_2, intersection, intersection_perc]
        ix += 1

    variant = os.path.basename(outfile).replace(
        "overlap_", "").replace("_heatmap.png", "")

    plotIntersectionHeatmap = R('''
    function(df){
        library(ggplot2)
        m_txt = element_text(size=15)
        m_txt_90 = element_text(size=15, angle=90, vjust=0.5, hjust=1)
        l_txt = element_text(size=20)

        p = ggplot(df, aes(id_1, id_2, fill=100*perc)) +
            geom_tile() +
            geom_text(aes(label=intersection), size=3) +
            scale_fill_gradient(name="Intersection (%%)", limits=c(0,100),
                                low="yellow", high="dodgerblue4") +
            theme(axis.text.x = m_txt_90, axis.text.y = m_txt,
                  legend.text = m_txt, legend.title = m_txt,
                  aspect.ratio=1) +
            xlab("") + ylab("") +
            ggtitle("%(variant)s")

        ggsave("%(outfile)s", width=10, height=10)
    }''' % locals())

    plotIntersectionHeatmap(df)

def splitLine(line):
    '''
    generic split by newline and tab for reading tsv files
    '''
    return line[:-1].split("\t")


#########################################################################
#########################################################################
#########################################################################
@follows(mkdir("gtfs"))
@merge([PARAMS["genesets_abinitio_coding"], PARAMS["genesets_reference"]],
       os.path.join(
           "gtfs",
           P.snip(PARAMS["genesets_abinitio_coding"], ".gtf.gz") +
           "_coding.gtf.gz"))
def buildCodingGeneSet(infiles, outfile):
    '''
    takes the output from cuffcompare of a transcript assembly and
    filters for annotated protein coding genes.

    NB "pruned" refers to nomenclature in the transcript building
    pipeline - transcripts that appear in at least two samples.

    Because an abinitio assembly will often contain fragments of known
    transcripts and describe them as novel, the default behaviour is to
    produce a set that is composed of 'complete' or 'contained'
    transcripts i.e. nothing novel. This may underestimate the number
def getID(infile):
    return P.snip(os.path.basename(infile),
                  ".mutect.snp.annotated.filtered.vcf")

                --output-filename-pattern=%%DIR%%/
                --deseq-fit-type=%(deseq_fit_type)s
                --deseq-dispersion-method=%(deseq_dispersion_method)s
                --log=%(outfile)s.log
                --fdr=%(edger_fdr)f"
            | grep -v "warnings"
            | gzip > %(outfile)s
    '''
    P.run()


@follows(aggregateTiledReadCounts,
         mkdir(os.path.join(PARAMS["exportdir"], "diff_methylation")))
@files([((data, design),
         "diff_methylation/%s_%s.deseq.gz" %
         (P.snip(os.path.basename(data), ".counts.tsv.gz"),
          P.snip(os.path.basename(design), ".tsv")))
        for data, design in itertools.product(
            glob.glob("diff_methylation/*.counts.tsv.gz"),
            P.asList(PARAMS["deseq_designs"]))])
def runDESeq(infiles, outfile):
    '''estimate differential expression using DESeq.

    The final output is a table. It is slightly edited such that it
    contains a similar output and similar fdr compared to cuffdiff.
    '''
    runDE(infiles, outfile, "deseq")

#########################################################################
#########################################################################
def loadIntervals(infile, outfile):
    '''load intervals from :term:`bed` formatted files into
    the database.

    If a :term:`bam` file is associated with a :term:`bed` file,
    re-evaluate the intervals by counting reads within the interval.
    In contrast to the initial pipeline, the genome is not binned.

    nprobes: number of reads in interval
    peakcenter: position with maximum number of reads in interval
    avgval: average coverage within interval
    '''
    tmpfile = P.getTempFile(".")

    headers = ("avgval", "disttostart",
               "genelist", "length",
               "peakcenter", "peakval",
               "position", "interval_id",
               "npeaks", "nprobes",
               "contig", "start", "end", "score", "strand")

    tmpfile.write("\t".join(headers) + "\n")

    (avgval, contig, disttostart, end, genelist,
     length, peakcenter, peakval, position,
     start, interval_id, npeaks, nprobes) = \
        0, "", 0, 0, "", 0, 0, 0, 0, 0, 0, 0, 0

    track = Sample(filename=P.snip(infile, ".bed.gz"))

    bamfiles, offsets = getAssociatedBAMFiles(track)

    if bamfiles:
        E.info("%s: associated bamfiles = %s" % (track, bamfiles))
    else:
        E.info("%s: no bamfiles associated" % (track))

    # open all bamfiles
    samfiles = [pysam.Samfile(fn, "rb") for fn in bamfiles]

    c = E.Counter()

    # count tags
    for bed in Bed.iterator(IOTools.openFile(infile, "r")):

        c.input += 1

        if "name" not in bed:
            bed.name = c.input

        try:
            strand = bed["strand"]
        except IndexError:
            strand = "."

        # The fifth field of a bed file can be used to supply a
        # score. Our iterator returns the optional fields as a "fields
        # array". The first of these is the interval name, and the
        # second the score. The score may be more is better or less is
        # better.
        if len(bed.fields) > 1:
            value = bed.fields[1]
            if value != "":
                score = value
            else:
                score = 1
        else:
            score = 1

        if samfiles:
            npeaks, peakcenter, length, avgval, peakval, nprobes = \
                PipelinePeakcalling.countPeaks(
                    bed.contig, bed.start, bed.end, samfiles, offsets)
            if nprobes == 0:
                c.skipped_reads += 1
        else:
            # deal with bed12
            bed_intervals = bed.toIntervals()
            length = sum([e - s for s, e in bed_intervals])
            mid_point = length / 2
            for s, e in bed_intervals:
                peakcenter = s + mid_point
                if peakcenter >= e:
                    mid_point = peakcenter - e
                else:
                    break

            npeaks, avgval, peakval, nprobes = \
                (1, 1, 1, 1)

        c.output += 1
        tmpfile.write("\t".join(map(
            str,
            (avgval, disttostart, genelist, length,
             peakcenter, peakval, position, bed.name,
             npeaks, nprobes,
             bed.contig, bed.start, bed.end, score, strand))) + "\n")

    if c.output == 0:
        E.warn("%s - no aggregate intervals" % track)

    tmpfile.close()

    P.load(tmpfile.name,
           outfile,
           tablename=os.path.basename("%s_intervals" % track.asTable()),
           options="--allow-empty-file "
           "--add-index=interval_id")
    os.unlink(tmpfile.name)

    E.info("%s\n" % str(c))

def exportMemeCHiPIntervalSequences(infile, outfile):
    track = os.path.basename(P.snip(infile, "_intervals.load"))
    exportIntervalSequences(infile, outfile, track, "memechip")

def exportDremeIntervalSequences(infile, outfile):
    track = os.path.basename(P.snip(infile, "_intervals.load"))
    exportIntervalSequences(infile, outfile, track, "dreme")