def process_remote(infile):
    repository, acc = iotools.open_file(infile).readlines()[0].strip().split()

    if repository == "ENCODE":
        location, filetype = get_encode_file(acc)
    elif repository == "URL":
        location = acc
        if acc.endswith("gz"):
            # keep the compound suffix, e.g. "bed.gz"
            filetype = ".".join(acc.split(".")[-2:])
        else:
            filetype = acc.split(".")[-1]
    else:
        raise ValueError("repository %s not yet supported" % repository)

    tmpfile = P.get_temp_filename(shared=False, suffix="." + filetype)

    preamble = "wget %(location)s -O %(tmpfile)s --quiet &&"
    postamble = "&& rm %(tmpfile)s"

    if filetype == "bam":
        preamble += " samtools index %(tmpfile)s && "
        postamble += " && rm %(tmpfile)s.bai "
    elif filetype == "bed.gz":
        tmp2 = P.get_temp_filename(shared=False)
        preamble += '''
        zcat %(tmpfile)s
        | sort -k1,1 -k2,2n
        | bgzip > %(tmp2)s &&
        mv %(tmp2)s %(tmpfile)s &&
        tabix -p bed %(tmpfile)s && '''
        postamble += " && rm %(tmpfile)s.tbi"

    return preamble % locals(), postamble % locals(), tmpfile, filetype
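# A minimal, hypothetical usage sketch (not part of the pipeline): the returned
# preamble/postamble already carry their "&&" separators and interpolated paths,
# so a caller only needs to splice its own command between them. The task name
# and command below are illustrative.
def flagstat_remote(infile, outfile):
    preamble, postamble, tmpfile, filetype = process_remote(infile)
    statement = " ".join([preamble,
                          "samtools flagstat {} > {}".format(tmpfile, outfile),
                          postamble])
    P.run(statement)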
def removeDuplicates(fastq1, outfile):
    '''Filter exact duplicates, if specified in config file'''
    if IS_PAIRED:
        fastq2 = P.snip(fastq1, FASTQ1_SUFFIX) + FASTQ2_SUFFIX
        outfile1 = P.snip(outfile, '.gz')
        outfile2 = P.snip(outfile1, '.fastq.1') + '.fastq.2'
        logfile = P.snip(outfile1, '.fastq.1') + '.log'
        cluster_file = P.snip(outfile1, '1') + '*.clstr'

        to_filter = PARAMS['cdhit_dedup']
        if to_filter:
            tmpf1 = P.get_temp_filename('.')
            tmpf2 = P.get_temp_filename('.')
            statement = ("zcat %(fastq1)s > %(tmpf1)s &&"
                         " zcat %(fastq2)s > %(tmpf2)s &&"
                         " cd-hit-dup"
                         " -i %(tmpf1)s"
                         " -i2 %(tmpf2)s"
                         " -o %(outfile1)s"
                         " -o2 %(outfile2)s"
                         " %(cdhit_options)s"
                         " &> %(logfile)s &&"
                         " gzip %(outfile1)s &&"
                         " gzip %(outfile2)s &&"
                         " gzip %(logfile)s &&"
                         " rm -f %(tmpf1)s &&"
                         " rm -f %(tmpf2)s &&"
                         " rm -f %(cluster_file)s")
            P.run(statement, job_options=PARAMS['cdhit_run_options'])
        else:
            E.warn('Deduplication step is being skipped for: %s' % fastq1)
            symlnk(fastq1, outfile)
            symlnk(fastq2, outfile2 + '.gz')

    else:
        outfile1 = P.snip(outfile, '.gz')
        logfile = P.snip(outfile1, '.fastq.1') + '.log'
        cluster_file = P.snip(outfile1, '1') + '*.clstr'

        to_filter = PARAMS['preprocess_dedup']
        if to_filter:
            tmpf1 = P.get_temp_filename('.')
            statement = ("zcat %(fastq1)s > %(tmpf1)s &&"
                         " cd-hit-dup"
                         " -i %(tmpf1)s"
                         " -o %(outfile1)s"
                         " %(cdhit_options)s"
                         " &> %(logfile)s &&"
                         " gzip %(outfile1)s &&"
                         " gzip %(logfile)s &&"
                         " rm -f %(tmpf1)s &&"
                         " rm -f %(cluster_file)s")
            P.run(statement, job_options=PARAMS['cdhit_run_options'])
        else:
            E.warn('Deduplication step is being skipped for: %s' % fastq1)
            symlnk(fastq1, outfile)
def assembleWithStringTie(infiles, outfile):
    infile, reference = infiles
    basefile = os.path.basename(infile)
    job_threads = PARAMS["stringtie_threads"]
    job_memory = PARAMS["stringtie_memory"]

    tmpfile = P.get_temp_filename()
    if os.path.exists(tmpfile):
        os.unlink(tmpfile)

    statement = '''
    portcullis full
    -t 1
    -o portcullis/%(basefile)s/
    -r %(portcullis_bedref)s
    -b %(portcullis_fastaref)s
    %(infile)s &&
    mv portcullis/%(basefile)s/portcullis.filtered.bam %(tmpfile)s &&
    rm -r portcullis/%(basefile)s/ &&
    stringtie %(tmpfile)s
    -p %(stringtie_threads)s
    -G <(zcat %(reference)s)
    %(stringtie_options)s
    2> %(outfile)s.log
    | gzip > %(outfile)s &&
    rm %(tmpfile)s'''

    if infile.endswith(".remote"):
        token = glob.glob("gdc-user-token*")
        tmpfilename = P.get_temp_filename()
        if os.path.exists(tmpfilename):
            os.unlink(tmpfilename)
        if len(token) > 0:
            token = token[0]
        else:
            token = None

        s, infile = Sra.process_remote_BAM(
            infile, token, tmpfilename,
            filter_bed=os.path.join(
                PARAMS["annotations_dir"],
                PARAMS["annotations_interface_contigs_bed"]))

        infile = " ".join(infile)
        statement = "; ".join([
            "mkdir -p %(tmpfilename)s",
            s,
            statement,
            "rm -r %(tmpfilename)s"])

    P.run(statement, job_condaenv="portcullis")
def merge_bw(infiles, outfile):
    """Merge bigWigs using bigWigMerge."""
    infiles = " ".join(infiles)
    tmpfile = P.get_temp_filename()
    tmpfile2 = P.get_temp_filename()

    statement = '''bigWigMerge %(infiles)s %(tmpfile)s &&
                   LC_COLLATE=C sort -k1,1 -k2,2n -o %(tmpfile2)s %(tmpfile)s &&
                   bedGraphToBigWig %(tmpfile2)s %(contig_file)s %(outfile)s'''

    P.run(statement)
def remove_reads(infiles, outfile):
    """Remove all of the reads mapping at least once to the genome."""
    infile, pre_trna_genome = infiles
    temp_file = P.get_temp_filename(".")
    temp_file1 = P.get_temp_filename(".")

    statement = """samtools view -h %(infile)s > %(temp_file)s &&
                   perl %(cribbslab)s/perl/removeGenomeMapper.pl %(pre_trna_genome)s %(temp_file)s %(temp_file1)s &&
                   samtools view -b %(temp_file1)s > %(outfile)s"""

    job_memory = "50G"
    P.run(statement)
    os.unlink(temp_file)
    os.unlink(temp_file1)
def buildGff(infile, outfile):
    '''Creates a gff for DEXSeq.

    This takes the gtf and flattens it to an exon-based input
    required by DEXSeq. The required python script is provided by
    DEXSeq and uses HTSeqCounts.

    Parameters
    ----------
    infile : string
        Input filename in :term:`gtf` format
    outfile : string
        A :term:`gff` file for use in DEXSeq
    annotations_interface_geneset_all_gtf : string
        :term:`PARAMS`. Filename of :term:`gtf` file containing
        all ensembl annotations
    '''
    tmpgff = P.get_temp_filename(".")
    statement = "gunzip -c %(infile)s > %(tmpgff)s"
    P.run(statement)

    ps = PYTHONSCRIPTSDIR
    statement = '''python %(ps)s/dexseq_prepare_annotation.py
                   %(tmpgff)s %(outfile)s'''
    P.run(statement, job_condaenv="splicing")

    os.unlink(tmpgff)
def runBioProspector(infiles, outfile, dbhandle):
    '''run bioprospector for motif discovery.

    Bioprospector is run on only the top 10% of peaks.
    '''
    # bioprospector currently not working on the nodes
    to_cluster = False

    # only use new nodes, as /bin/csh is not installed
    # on the old ones.
    # job_options = "-l mem_free=8000M"

    tmpfasta = P.get_temp_filename(".")
    track = outfile[:-len(".bioprospector")]
    nseq = writeSequencesForIntervals(
        track,
        tmpfasta,
        dbhandle,
        full=True,
        masker="dust",
        proportion=P.get_params()["bioprospector_proportion"])

    if nseq == 0:
        E.warn("%s: no sequences - bioprospector skipped" % track)
        iotools.touch_file(outfile)
    else:
        statement = '''
        BioProspector -i %(tmpfasta)s %(bioprospector_options)s -o %(outfile)s > %(outfile)s.log
        '''
        P.run(statement)

    os.unlink(tmpfasta)
def run(self, infiles, outfile, params):
    files = " ".join(infiles)
    job_threads = params.job_threads

    # todo:
    # 1. add header.
    # 2. do batch+merge sort in order to avoid hitting temporary space limits.
    # 3. remove unnecessary info fields while sorting, add them later.
    tmpdir = P.get_temp_filename()
    retval = P.run(
        "mkdir {tmpdir}; "
        "bcftools view -h {infiles[0]} "
        "| cut -f 1-10 "
        "| bgzip > {outfile}; "
        "zcat {files} "
        "| awk -v OFS='\\t' "
        # keep variant records only and blank out QUAL (col 6), FILTER (col 7),
        # INFO (col 8) and the sample column (col 10), setting FORMAT (col 9)
        # to a bare GT
        "'!/^#/ && $5 != \"<NON_REF>\" "
        "{{$6=\".\";$7=\".\";$8=\".\";$9=\"GT\";$10=\".\"; print}}' "
        "2> {outfile}.filter.log "
        "| sort -k1,1V -k2,2n "
        "--parallel {job_threads} "
        "-T {tmpdir} "
        "2> {outfile}.sort.log "
        "| uniq "
        "| bgzip "
        ">> {outfile}; "
        "tabix -p vcf {outfile}; "
        "rm -rf {tmpdir} ".format(**locals()))
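# Illustration (hypothetical record, not pipeline output): the awk step above
# turns a per-sample VCF line such as
#   chr1  12345  .  A  G  50  PASS  DP=20  GT:DP  0/1:20
# into a site-only line with placeholder QUAL/FILTER/INFO and genotype columns
# (fields are tab-separated in the real output):
#   chr1  12345  .  A  G  .  .  .  GT  .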
def map_with_bowtie(infiles, outfile):
    """
    Map reads with bowtie to get a general alignment so that features
    can be counted over RNA gene_biotypes.
    """
    fastq, genome = infiles
    tmp_fastq = P.get_temp_filename(".")
    temp_file = P.get_temp_filename(".")
    genome = genome.replace(".fa", "")

    statement = """gzip -dc %(fastq)s > %(tmp_fastq)s &&
                   bowtie -k 10 -v 2 --best --strata --sam %(genome)s %(tmp_fastq)s 2> %(outfile)s_bowtie.log |
                   samtools view -bS |
                   samtools sort -T %(temp_file)s -o %(outfile)s &&
                   samtools index %(outfile)s
                """

    job_memory = "15G"
    P.run(statement)
def run(self, infile, outfile, params):
    if params.reference_bed is None:
        raise ValueError("{} requires reference_bed to be set".format(
            self.name))

    # requires a consistent sort order, so sort both files.
    # It also requires the chromosome content to be identical,
    # so restrict output to common sets.
    tmpf = P.get_temp_filename(clear=True)
    tmpf_test, tmpf_truth = tmpf + "_a.bed.gz", tmpf + "_b.bed.gz"
    stmnt = standardise_bed_files(tmpf_test, tmpf_truth,
                                  infile, params.reference_bed)

    statements = [stmnt]
    statements.append("{params.path} intersect "
                      "-a {tmpf_test} "
                      "-b {tmpf_truth} "
                      "-wa "
                      "| bgzip "
                      "> {outfile}.shared.bed.gz")
    statements.append("{params.path} intersect "
                      "-a {tmpf_test} "
                      "-b {tmpf_truth} "
                      "-wa -v "
                      "| bgzip "
                      "> {outfile}.unique_test.bed.gz")
    statements.append("{params.path} intersect "
                      "-b {tmpf_test} "
                      "-a {tmpf_truth} "
                      "-wa -v "
                      "| bgzip "
                      "> {outfile}.unique_truth.bed.gz")
    statements.append("rm -f {tmpf_test} {tmpf_truth}")

    for section in self.sections:
        statements.append(
            "tabix -p bed {outfile}.{section}.bed.gz".format(**locals()))

    statement = "; ".join(statements)
    retval = P.run(statement.format(**locals()))

    # these are small files, so doing it here. Implement tabix.count()
    # method
    counts = dict()
    for section in self.sections:
        # with pysam.Tabixfile(outfile + "." + section + ".bed.gz") as inf:
        inf = pysam.Tabixfile(outfile + "." + section + ".bed.gz")
        counts[section] = len(list(inf.fetch()))
        inf.close()

    with IOTools.open_file(outfile, "w") as outf:
        outf.write("section\tcounts\n")
        outf.write("\n".join(
            ["\t".join(map(str, x)) for x in list(counts.items())]) + "\n")

    return retval
def buildBedGraph(infile, outfile):
    '''build wiggle files from bam files.

    Generate :term:`bigWig` format file from :term:`bam` alignment file

    Parameters
    ----------
    infile : str
        Input filename in :term:`bam` format
    outfile : str
        Output filename in :term:`bigwig` format
    annotations_interface_contigs : str
        :term:`PARAMS`
        Input filename in :term:`bed` format
    '''
    inf = infile[0]
    inf_name = inf.replace(".bam", "")
    idxstats = infile[1]

    # scale by million reads mapped
    reads_mapped = Bamtools.getNumberOfAlignments(inf)

    for idx in idxstats:
        file_name = idx.replace(".idxstats", "")
        if file_name == inf_name:
            # pass to a function that extracts the number of reads aligned to
            # spike in and human genome
            regex = PARAMS['quant_regex'] + "*"
            scale = ModuleQuantchip.getSpikeInReads(idx, str(regex))
            contig_sizes = ModuleQuantchip.getContigSizes(idx)
        else:
            continue

    tmpfile = P.get_temp_filename()
    tmpfile2 = P.get_temp_filename()
    job_memory = "30G"

    statement = '''bedtools genomecov
                   -ibam %(inf)s
                   -g %(contig_sizes)s
                   -bg
                   -scale %(scale)f
                   > %(tmpfile)s &&
                   sort -k1,1 -k2,2n -o %(tmpfile2)s %(tmpfile)s &&
                   cat %(tmpfile2)s | grep chr > %(outfile)s &&
                   rm -f %(tmpfile)s %(tmpfile2)s '''

    P.run(statement)
def downsample(infile, outfile):
    '''Downsample fastq files using the seqtk tool.'''
    tmp_file = P.get_temp_filename(".")

    statement = '''zcat %(infile)s > %(tmp_file)s &&
                   seqtk sample -2 -s100 %(tmp_file)s %(downsample_read)s |
                   gzip > %(outfile)s'''

    job_memory = "30G"
    P.run(statement)
    os.unlink(tmp_file)
def map_tran_gene(outfile):
    '''Add an identifier to the transcript IDs.'''
    tmp_cdna = P.get_temp_filename(".")

    statement = '''zcat < %(cdna_fasta)s |
                   awk '/^>/ {print $0}' |
                   tr "_" " " |
                   awk '{print $3}' > %(tmp_cdna)s &&
                   cat %(tmp_cdna)s |
                   awk '{print $0"."NR}' > %(outfile)s'''

    P.run(statement)
def map_tr2gene(infile, outfile):
    '''Map the transcripts to genes.'''
    tmp_cdna = P.get_temp_filename(".")

    statement = '''zcat < %(cdna_fasta)s |
                   awk '/^>/ {print $0}' |
                   tr "_" " " |
                   awk '{print $3}' > %(tmp_cdna)s &&
                   awk 'NR==FNR{a[$1]=$2; b[$1]=$3;next} {$2=a[$1];$3=b[$1]} 1' %(infile)s %(tmp_cdna)s > %(outfile)s'''

    P.run(statement)
def remove_reads(infiles, outfile):
    """Remove all of the reads mapping at least once to the genome."""
    infile, pre_trna_genome = infiles
    temp_file = P.get_temp_filename(".")
    temp_file1 = P.get_temp_filename(".")
    PY_SRC_PATH = os.path.abspath(os.path.dirname(__file__))

    statement = """samtools view -h %(infile)s > %(temp_file)s &&
                   perl %(PY_SRC_PATH)s/perl/removeGenomeMapper.pl %(pre_trna_genome)s %(temp_file)s %(temp_file1)s &&
                   samtools view -b %(temp_file1)s > %(outfile)s""" % locals()

    job_memory = "50G"
    P.run(statement)
    os.unlink(temp_file)
    os.unlink(temp_file1)
def download(self, genes=None, fields=None, scope=None, species=None):
    '''
    Download an up-to-date ontology file, parse the xml data into a
    Python "ElementTree" and delete the ontology file.
    '''
    ontologyfile = P.get_temp_filename(".")
    os.system("wget -O %s %s" % (ontologyfile, self.datasource))
    tree = ET.parse(ontologyfile)
    os.remove(ontologyfile)
    self.dataset = tree
def bustools_sort(infile, outfile):
    """Generate a sorted bus file."""
    tmp = P.get_temp_filename(".")

    statement = """bustools sort -T %(tmp)s
                   -t %(kallisto_threads)s
                   -o %(outfile)s
                   %(infile)s/output.bus"""

    P.run(statement)
def capture_list(outfile):
    '''Get the transcripts-to-capture list and transcripts-to-genes
    mapping for cDNA.'''
    tmp_cdna = P.get_temp_filename(".")

    statement = '''zcat < %(cdna_fasta)s |
                   awk '/^>/ {print $0}' |
                   tr "_" " " |
                   awk '{print $3}' > %(tmp_cdna)s &&
                   cat %(tmp_cdna)s |
                   tr "." " " |
                   awk '{print $1}' > %(outfile)s '''

    P.run(statement)
    os.unlink(tmp_cdna)
def intron_bed2fa(outfile):
    '''Convert the introns bed file to an introns fasta file.'''
    tmp_bed = P.get_temp_filename(".")

    statement = '''zcat < %(intron_bed)s > %(tmp_bed)s &&
                   bedtools getfasta -name -fo %(outfile)s
                   -fi %(genome_file)s -bed %(tmp_bed)s'''

    P.run(statement)
    os.unlink(tmp_bed)
def find_intron_fa_header(infile, outfile):
    '''Fix the INTRONS FASTA header.'''
    tmp_cdna = P.get_temp_filename(".")

    statement = '''zcat %(cdna)s > %(tmp_cdna)s &&
                   awk '{print ">"$1"."NR" gene_id:"$2" gene_name:"$3}' %(infile)s > geneset.dir/cDNA_fasta_header.txt &&
                   awk -v var=1 'FNR==NR{a[NR]=$0;next}{ if ($0~/^>/) {print a[var], var++} else {print $0}}' geneset.dir/cDNA_fasta_header.txt %(tmp_cdna)s > %(outfile)s'''

    P.run(statement)
    os.unlink(tmp_cdna)
def tss_gene_parse(infile, outfile):
    """Filter a gtf using gene lists and then output them as a gtf."""
    bedfile = PARAMS['geneexpression_tss']
    tmpfile = P.get_temp_filename()

    statement = """zcat %(bedfile)s | grep -f %(infile)s > %(tmpfile)s &&
                   cat %(tmpfile)s |
                   awk '{$4 = "TSS"; print}' OFS='\\t' |
                   gzip > %(outfile)s"""

    P.run(statement)
def buildRefFlat(infile, outfile):
    '''Build a flat geneset for Picard RnaSeqMetrics.'''
    tmpflat = P.get_temp_filename(".")

    statement = '''
    gtfToGenePred -genePredExt -geneNameAsName2 %(infile)s %(tmpflat)s &&
    paste <(cut -f 12 %(tmpflat)s) <(cut -f 1-10 %(tmpflat)s) > %(outfile)s
    '''

    P.run(statement, job_memory=PARAMS["job_memory"])
    os.unlink(tmpflat)
def run(self, outfile, params):
    if "--threads" in params.options or "-t " in params.options:
        job_threads = int(re.search(
            r"(-t|--threads)\s*(\d+)", params.options).groups()[1])

    fastq = resolve_argument(params.fastq, ",").split(",")
    if len(fastq) == 1:
        fastq = '-U "{}"'.format(fastq[0])
    else:
        fastq = '-1 "{}" -2 "{}"'.format(*fastq)

    tmpdir = P.get_temp_filename(clear=True)

    if "index" in params._fields:
        index = params.index
    else:
        index = params.reference_fasta

    if params.set_readgroup or params.readgroup_id_regex is not None:
        readgroup_string, readgroup_id, readgroup_sample = \
            build_readgroup_string(outfile, params)
        # pipes.quote needs to be shlex.quote in py3
        readgroup_option = "--rg-id {}".format(readgroup_id)
        # add additional level of quoting and remove "ID:{}"
        readgroup_string = re.sub(r"@RG\tID:\S+\t", "", readgroup_string)
        readgroup_string = " ".join(
            ["--rg {}".format(x) for x in readgroup_string.split("\t")])
    else:
        readgroup_option = ""
        readgroup_string = ""

    return P.run(
        "mkdir {tmpdir}; "
        "{self.path} "
        "{readgroup_option} "
        "{readgroup_string} "
        "{params.options} "
        "-x {index} "
        "{fastq} "
        "2> {outfile}.log "
        "| samtools view -b /dev/stdin "
        "2> {outfile}.view.log "
        "| samtools sort -T {tmpdir} -O bam /dev/stdin "
        "2> {outfile}.sort.log "
        "> {outfile}; "
        "samtools index {outfile}; "
        "rm -rf {tmpdir}".format(**locals()),
        **params._asdict())
def buildReferenceKallisto(infiles, outfile):
    '''
    Builds a reference transcriptome and decoy sequences for alevin
    and kallisto.

    Parameters
    ----------
    infile: str
        path to the GTF file containing transcript and gene level annotations
    genome_dir: str
        :term:`PARAMS` the directory of the reference genome
    genome: str
        :term:`PARAMS` the filename of the reference genome (without .fa)
    outfile: str
        path to output file
    '''
    prim_trans1, prim_trans2 = infiles
    genome_file1 = PARAMS['genome1']

    if PARAMS['mixed_species']:
        genome_file2 = PARAMS['genome2']
        tmp1 = P.get_temp_filename('.')
        tmp2 = P.get_temp_filename('.')

        statement = '''
        grep "^>" <(gunzip -c %(genome_file1)s %(genome_file2)s) | cut -d " " -f 1 > decoys.txt &&
        sed -i.bak -e 's/>//g' decoys.txt &&
        cat %(prim_trans1)s %(prim_trans2)s %(genome_file1)s %(genome_file2)s > %(outfile)s
        '''
    else:
        statement = '''
        grep "^>" <(gunzip -c %(genome_file1)s) | cut -d " " -f 1 > decoys.txt &&
        sed -i.bak -e 's/>//g' decoys.txt &&
        cat %(prim_trans1)s %(genome_file1)s > %(outfile)s
        '''

    P.run(statement)

    if PARAMS['mixed_species']:
        os.unlink(tmp1)
        os.unlink(tmp2)
def busText(infile, outfile):
    '''
    Sort the bus file produced by kallisto and then convert it to a text file.
    '''
    tmp_bus = P.get_temp_filename(".")

    statement = '''
    sleep 10 &&
    bustools sort -o %(tmp_bus)s %(infile)s ;
    bustools text -o %(outfile)s %(tmp_bus)s
    '''

    P.run(statement)
def loadManualAnnotations(infile, outfile):
    tmp = P.get_temp_filename(".")
    annotation = P.snip(infile, "_annotations.tsv")

    with iotools.open_file(tmp, "w") as outf:
        outf.write("%s\tgene_id\n" % annotation)
        with iotools.open_file(infile, "r") as inf:
            for line in inf:
                outf.write("%s\t%s" % (annotation, line))

    P.load(tmp, outfile, options="--add-index=gene_id")
    os.unlink(tmp)
def aggregateAdaptors(infiles, outfile):
    '''
    Collate fasta files into a single contaminants file for adapter removal.
    '''
    tempfile = P.get_temp_filename()
    infiles = " ".join(infiles)

    statement = """
    cat %(infiles)s | fastx_reverse_complement > %(tempfile)s &&
    cat %(tempfile)s %(infiles)s | fastx_collapser > %(outfile)s &&
    rm -f %(tempfile)s
    """

    P.run(statement)
def create_fragment_bed(infile, outfile):
    """Take the clusterInfo and create a bed file containing all of the
    fragments of tRNAs."""
    cluster_info = infile.replace("_cluster.fa", "_clusterInfo.fa")
    tmp_file = P.get_temp_filename(".")
    PY_SRC_PATH = os.path.abspath(
        os.path.join(os.path.dirname(__file__), "python"))

    statement = """python %(PY_SRC_PATH)s/trna_fragment_bed.py -I %(cluster_info)s -S %(tmp_file)s &&
                   sort %(tmp_file)s | uniq > %(outfile)s"""

    P.run(statement)
    os.unlink(tmp_file)
def run_rmats_pre(infiles, outfile, track):
    infile, gtffile = infiles
    od = os.path.abspath("rmats.dir")

    statement = '''rmats.py
    --task prep
    --tmp rmats.dir/%(track)s.dir
    --gtf <(zcat %(gtffile)s)
    --readLength %(rmats_readLength)s
    -t %(rmats_paired)s
    --od rmats.dir
    --b1 <(echo %(infile)s)
    &> rmats.dir/%(track)s.prep.log '''

    if infile.endswith(".remote"):
        token = glob("gdc-user-token*")
        tmpfilename = P.get_temp_filename()
        if os.path.exists(tmpfilename):
            os.unlink(tmpfilename)
        if len(token) > 0:
            token = token[0]
        else:
            token = None

        s, infile = Sra.process_remote_BAM(
            infile, token, tmpfilename,
            filter_bed=PARAMS["contigs_bed"])

        s = re.sub(";\n", " &&\n", s)
        infile = ",".join(infile)
        statement = " && ".join([
            "mkdir -p %(tmpfilename)s",
            s,
            statement,
            "rm -r %(tmpfilename)s"])

    P.run(statement,
          job_condaenv=PARAMS["rmats_env"],
          job_memory=PARAMS["rmats_prep_memory"])

    # copy the .rmats files produced by the prep step, giving each a
    # unique suffix ("" for the first, then 1, 2, ...)
    rmats_counter = ""
    for f_path in glob("rmats.dir/%(track)s.dir/*.rmats" % locals()):
        shutil.copy(f_path,
                    P.snip(outfile, ".rmats") + str(rmats_counter) + ".rmats")
        if rmats_counter == "":
            rmats_counter = 1
        else:
            rmats_counter += 1
def fix_intron_fasta(infiles, outfile):
    '''Fix all of the headers for the introns FASTA file so that they
    contain the transcript ID, an identifier specifying that the
    transcript is an "intronic" transcript, and a unique number to
    avoid duplicates.'''
    introns_t2g, introns = infiles
    tmp_fasta = P.get_temp_filename(".")

    statement = '''awk '{print ">"$1"."NR"-I"" gene_id:"$2" gene_name:"$3}' %(introns_t2g)s > %(tmp_fasta)s &&
                   awk -v var=1 'FNR==NR{a[NR]=$0;next}{ if ($0~/^>/) {print a[var], var++} else {print $0}}' %(tmp_fasta)s %(introns)s > %(outfile)s
                '''

    P.run(statement)
    os.unlink(tmp_fasta)
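# Illustration (hypothetical identifiers): for a t2g line
# "ENST0001  ENSG0001  GeneA", the first awk writes the header
# ">ENST0001.1-I gene_id:ENSG0001 gene_name:GeneA"; the second awk then
# replaces the N-th ">" header in the introns FASTA with the N-th rewritten
# header plus a running index, leaving sequence lines untouched.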