def _iter_remote_entries(path):
    '''Yield ``(repository, accession)`` pairs from a ``.remote`` file.

    Each line is stripped and split on tabs; only the first two fields
    are used.  The file is closed once iteration finishes.
    '''
    with IOTools.open_file(path) as handle:
        for line in handle:
            repo, acc = line.strip().split("\t")[:2]
            yield repo, acc


def downloadFiles(infiles, outfile):
    '''Download the data referenced by a ``.remote`` file.

    Inputs that do not end in ``.remote`` are ignored.

    For every line of the ``.remote`` file:

    * ``SRA`` entries: the statements returned by ``Sra.prefetch`` and
      ``Sra.extract`` are run, unless ``outfile + ".1.gz"`` already
      exists.
    * ``GDC`` entries: the statement returned by
      ``Sra.process_remote_BAM`` is run and the resulting files are
      copied to ``bam.dir/<basename of infile without extension>.bam``,
      unless that file already exists.  The first file matching
      ``gdc-user-token*`` in the working directory is passed as token
      (``None`` if there is no such file).

    Arguments
    ---------
    infiles : string
        Path of the input file (a single filename despite the plural).
    outfile : string
        Output filename; its directory receives the SRA extraction.

    Raises
    ------
    ValueError
        If a line names a repository other than ``SRA`` or ``GDC``.
    '''
    infile = infiles
    # ``infile`` is rebound to the downloaded BAM names in the GDC branch
    # because the shell statement uses %(infile)s; keep the original path
    # so that every line is processed against the .remote file itself.
    remote_file = infile
    basefile = os.path.basename(infile)
    filename = "temp_bams/%s" % basefile
    outdir = os.path.dirname(outfile)

    if not infile.endswith(".remote"):
        return

    # NOTE(review): the %(name)s placeholders are presumably filled in by
    # P.run from this function's local variables, so the names
    # ``filename``, ``infile`` and ``outfile`` must not be changed -
    # confirm against P.run.
    for repo, acc in _iter_remote_entries(remote_file):
        if repo == "SRA":
            if not os.path.isfile(outfile + ".1.gz"):
                statement = "; ".join(
                    [Sra.prefetch(acc), Sra.extract(acc, outdir)])
                P.run(statement)

        elif repo == "GDC":
            base = os.path.splitext(basefile)
            outfile = "bam.dir/" + base[0] + ".bam"

            token = glob.glob("gdc-user-token*")
            if len(token) > 0:
                token = token[0]
            else:
                token = None

            s, infile = Sra.process_remote_BAM(
                remote_file, token, filename,
                filter_bed=os.path.join(
                    PARAMS["annotations_dir"],
                    PARAMS["annotations_interface_contigs_bed"]))
            infile = " ".join(infile)

            if not os.path.isfile(outfile):
                statement = "; ".join([
                    "mkdir -p %(filename)s",
                    s,
                    '''cp %(infile)s %(outfile)s; rm -r %(filename)s'''
                ])
                P.run(statement)

        else:
            raise ValueError("Unknown repository: %s" % repo)
def assembleWithStringTie(infiles, outfile):
    '''Run stringtie on a BAM file (or a ``.remote`` file) with a
    reference geneset.

    stderr of stringtie goes to ``<outfile>.log`` and its output is
    gzip-compressed into ``outfile``.

    If ``infile`` ends in ``.remote``, the statement returned by
    ``Sra.process_remote_BAM`` is run first with a temporary directory
    as download location; the directory is removed afterwards.  The
    first file matching ``gdc-user-token*`` is passed as token
    (``None`` if there is no such file).

    Arguments
    ---------
    infiles : tuple
        ``(infile, reference)`` - the alignments and the reference
        geneset, which is read through ``zcat``.
    outfile : string
        Output filename.
    '''
    infile, reference = infiles

    # NOTE(review): job_threads / job_memory are not referenced below;
    # presumably P.run picks them up from the caller's local variables,
    # as it does for the %(name)s placeholders - confirm.
    job_threads = PARAMS["stringtie_threads"]
    job_memory = PARAMS["stringtie_memory"]

    statement = (
        "stringtie %(infile)s -p %(stringtie_threads)s"
        " -G <(zcat %(reference)s) %(stringtie_options)s"
        " 2> %(outfile)s.log | gzip > %(outfile)s ")

    if infile.endswith(".remote"):
        token = glob.glob("gdc-user-token*")

        tmpfilename = P.getTempFilename()
        # The name is used as a directory: ``mkdir`` fails if a file of
        # that name is already present, so remove any placeholder file
        # left behind by the temp-name helper.
        if os.path.exists(tmpfilename):
            os.unlink(tmpfilename)

        if len(token) > 0:
            token = token[0]
        else:
            token = None

        s, infile = Sra.process_remote_BAM(
            infile, token, tmpfilename,
            filter_bed=os.path.join(
                PARAMS["annotations_dir"],
                PARAMS["annotations_interface_contigs_bed"]))

        # %(infile)s in the stringtie call now refers to the downloaded
        # files
        infile = " ".join(infile)

        statement = "; checkpoint ;".join(
            ["mkdir %(tmpfilename)s",
             s,
             statement,
             "rm -r %(tmpfilename)s"])

    P.run()
def quantifyWithSalmon(infiles, outfile):
    '''Quantify existing samples against genesets.

    The input is name-sorted with samtools, converted to paired fastq
    files and quantified with ``salmon quant`` against the index
    ``salmon_index/<gtf basename without .gz>.salmon.index``.
    ``quant.sf`` is moved to ``<outfile>.sf`` and the intermediate
    fastq / sorted BAM files are removed.

    If ``infile`` ends in ``.remote``, the statement returned by
    ``Sra.process_remote_BAM`` is run first, using
    ``temp_bams/<basename of infile>`` as download directory, which is
    removed at the end.

    Arguments
    ---------
    infiles : tuple
        ``(infile, gtffile)``.
    outfile : string
        Output name; a trailing ``_agg-agg-agg`` is snipped off to build
        the fastq filenames.
    '''
    # NOTE(review): local names are significant - the %(name)s
    # placeholders (and presumably job_threads / job_memory) are resolved
    # by P.run from this function's local variables; confirm before
    # renaming anything.
    job_threads = 2
    job_memory = "16G"

    infile, gtffile = infiles
    basefile = os.path.basename(infile)

    # everything up to the first extension separator names the sample
    sample = basefile.split(os.extsep, 1)[0]
    sorted_bam = "sorted_bams/" + sample + "_sorted.bam"

    index_name = P.snip(os.path.basename(gtffile), ".gz")
    salmonIndex = "salmon_index/" + index_name + ".salmon.index"

    fastq1 = P.snip(outfile, "_agg-agg-agg") + ".1.fastq"
    fastq2 = P.snip(outfile, "_agg-agg-agg") + ".2.fastq"
    salmon_options = PARAMS["salmon_quantoptions"]

    statement = (
        " samtools sort -n %(infile)s -o %(sorted_bam)s;"
        " samtools fastq -1 %(fastq1)s -2 %(fastq2)s"
        " -0 /dev/null -s /dev/null -n -F 0x900 %(sorted_bam)s;"
        " salmon quant -i %(salmonIndex)s --libType IU"
        " -1 %(fastq1)s -2 %(fastq2)s -o %(outfile)s %(salmon_options)s;"
        " mv %(outfile)s/quant.sf %(outfile)s.sf;"
        " rm %(fastq1)s;"
        " rm %(fastq2)s;"
        " rm %(sorted_bam)s ")

    if infile.endswith(".remote"):
        token_files = glob.glob("gdc-user-token*")
        filename = "temp_bams/%s" % basefile

        # a temporary name is requested and any file of that name is
        # removed again straight away
        tmpfilename = P.get_temp_filename()
        if os.path.exists(tmpfilename):
            os.unlink(tmpfilename)

        token = token_files[0] if len(token_files) > 0 else None

        contigs_bed = os.path.join(
            PARAMS["annotations_dir"],
            PARAMS["annotations_interface_contigs_bed"])
        s, infile = Sra.process_remote_BAM(
            infile, token, filename, filter_bed=contigs_bed)

        # %(infile)s now refers to the downloaded files
        infile = " ".join(infile)

        steps = ["mkdir %(filename)s", s, statement, "rm -r %(filename)s"]
        statement = "; ".join(steps)

    P.run(statement)
def assembleWithStringTie(infiles, outfile):
    '''Filter alignments with portcullis, then assemble with stringtie.

    ``portcullis full`` writes to ``portcullis/<basename of infile>/``;
    its ``portcullis.filtered.bam`` is moved to a temporary file, the
    portcullis directory is removed and stringtie is run on the
    temporary file.  stderr of stringtie goes to ``<outfile>.log``, its
    output is gzip-compressed into ``outfile`` and the temporary file is
    removed.

    If ``infile`` ends in ``.remote``, the statement returned by
    ``Sra.process_remote_BAM`` is run first with a temporary directory
    as download location; the directory is removed at the end.

    Arguments
    ---------
    infiles : tuple
        ``(infile, reference)`` - the alignments and the reference
        geneset, which is read through ``zcat``.
    outfile : string
        Output filename.
    '''
    infile, reference = infiles
    basefile = os.path.basename(infile)

    # NOTE(review): local names are significant - the %(name)s
    # placeholders (and presumably job_threads / job_memory) are resolved
    # by P.run from this function's local variables; confirm before
    # renaming anything.
    job_threads = PARAMS["stringtie_threads"]
    job_memory = PARAMS["stringtie_memory"]

    # only the name is needed: drop the file if one exists under it
    tmpfile = P.get_temp_filename()
    if os.path.exists(tmpfile):
        os.unlink(tmpfile)

    statement = (
        " portcullis full -t 1 -o portcullis/%(basefile)s/"
        " -r %(portcullis_bedref)s -b %(portcullis_fastaref)s %(infile)s &&"
        " mv portcullis/%(basefile)s/portcullis.filtered.bam %(tmpfile)s &&"
        " rm -r portcullis/%(basefile)s/ &&"
        " stringtie %(tmpfile)s -p %(stringtie_threads)s"
        " -G <(zcat %(reference)s) %(stringtie_options)s"
        " 2> %(outfile)s.log | gzip > %(outfile)s &&"
        " rm %(tmpfile)s")

    if infile.endswith(".remote"):
        token_files = glob.glob("gdc-user-token*")

        # the temporary name is used as a directory below
        tmpfilename = P.get_temp_filename()
        if os.path.exists(tmpfilename):
            os.unlink(tmpfilename)

        token = token_files[0] if len(token_files) > 0 else None

        contigs_bed = os.path.join(
            PARAMS["annotations_dir"],
            PARAMS["annotations_interface_contigs_bed"])
        s, infile = Sra.process_remote_BAM(
            infile, token, tmpfilename, filter_bed=contigs_bed)

        # %(infile)s now refers to the downloaded files
        infile = " ".join(infile)

        steps = [
            "mkdir -p %(tmpfilename)s",
            s,
            statement,
            "rm -r %(tmpfilename)s",
        ]
        statement = "; ".join(steps)

    P.run(statement)
def quantifyWithSalmon(infiles, outfile):
    '''Quantify existing samples against genesets.

    The input is converted to paired fastq files with ``samtools
    fastq`` and quantified with ``salmon quant`` against the index
    ``salmon_index/<gtf basename without .gz>.salmon.index``.
    ``quant.sf`` is moved to ``<outfile>.sf`` and the fastq files are
    removed.

    If ``infile`` ends in ``.remote``, the statement returned by
    ``Sra.process_remote_BAM`` is run first, using
    ``temp_bams/<basename of infile>`` as download directory, which is
    removed at the end.  The first file matching ``gdc-user-token*`` is
    passed as token (``None`` if there is no such file).

    Arguments
    ---------
    infiles : tuple
        ``(infile, gtffile)``.
    outfile : string
        Output name; a trailing ``_agg-agg-agg`` is snipped off to build
        the fastq filenames.
    '''
    # NOTE(review): P.run() is called without arguments, so ``statement``
    # and the values for its %(name)s placeholders (including
    # job_threads) are presumably taken from this function's local
    # variables - confirm before renaming anything.
    job_threads = 2
    job_memory = "8G"

    infile, gtffile = infiles
    basefile = os.path.basename(infile)
    gtfbase = P.snip(os.path.basename(gtffile), ".gz")
    salmonIndex = "salmon_index/" + gtfbase + ".salmon.index"
    fastq1 = P.snip(outfile, "_agg-agg-agg") + ".1.fastq"
    fastq2 = P.snip(outfile, "_agg-agg-agg") + ".2.fastq"
    salmon_options = PARAMS["salmon_quantoptions"]

    statement = (
        " samtools fastq -1 %(fastq1)s -2 %(fastq2)s %(infile)s;"
        " salmon quant -i %(salmonIndex)s --libType A"
        " -1 %(fastq1)s -2 %(fastq2)s -o %(outfile)s"
        " --threads %(job_threads)s %(salmon_options)s;"
        " checkpoint;"
        " mv %(outfile)s/quant.sf %(outfile)s.sf;"
        " rm %(fastq1)s;"
        " rm %(fastq2)s ")

    if infile.endswith(".remote"):
        token = glob.glob("gdc-user-token*")
        filename = "temp_bams/%s" % basefile

        # No temporary filename is requested here: the download goes to
        # ``filename`` and an unused temporary file would only be left
        # behind.

        if len(token) > 0:
            token = token[0]
        else:
            token = None

        s, infile = Sra.process_remote_BAM(
            infile, token, filename,
            filter_bed=os.path.join(
                PARAMS["annotations_dir"],
                PARAMS["annotations_interface_contigs_bed"]))

        # %(infile)s now refers to the downloaded files
        infile = " ".join(infile)

        statement = "; checkpoint; ".join(
            ["mkdir %(filename)s",
             s,
             statement,
             "rm -r %(filename)s"])

    P.run()
def makeAdaptorFasta(infile, outfile, track, dbh, contaminants_file):
    '''Generate a .fasta file of adaptor sequences that are
    overrepresented in the reads from a sample.

    Requires cutadapt >= 1.7.

    NOTE(review): the body visible here only collects the
    overrepresented sequences from the database; nothing is written to
    `outfile` and `contaminants_file` is not read.  The function looks
    truncated - confirm against the complete source.

    Arguments
    ---------
    infile : string
        Input filename that has been QC'ed. The filename is used to
        check if the input was a :term:`sra` file and guess the number
        of tracks to check.
    outfile : string
        Output filename in :term:`fasta` format.
    track : string
        Track name, used to access FastQC results in database.
    dbh : object
        Database handle.
    contaminants_file : string
        Path of file containing contaminants used for screening by
        Fastqc.
    '''
    tracks = [track]

    if infile.endswith(".sra"):
        # patch for SRA files, look at multiple tracks
        f, fastq_format = Sra.peek(infile)
        if len(f) == 2:
            tracks = [track + "_fastq_1",
                      track + "_fastq_2"]

    found_contaminants = []

    for t in tracks:
        table = PipelineTracks.AutoSample(os.path.basename(t)).asTable()

        # the table name cannot be passed as a query parameter, hence
        # the string interpolation
        query = '''SELECT Possible_Source, Sequence FROM %s_fastqc_Overrepresented_sequences;''' % table

        cc = dbh.cursor()

        try:
            found_contaminants.extend(cc.execute(query).fetchall())
        except sqlite3.OperationalError as msg:
            # ``except ... as`` and ``print(...)`` are valid on both
            # Python 2.6+ and Python 3
            print(msg)
            # empty table
            continue
def runKalistoOnRemoteBAM(infiles, outfile):
    '''Run kallisto on a .bam or .remote file.

    The input is converted to two temporary fastq files with ``samtools
    fastq``, ``kallisto quant`` writes into the directory of `outfile`,
    the temporary files are removed and `outfile` (without its ``.gz``
    suffix) is gzipped.  For ``.remote`` input the statement returned by
    ``Sra.process_remote_BAM`` is run first.

    Arguments
    ---------
    infiles : tuple
        ``(infile, kallisto_index)``.
    outfile : string
        Output filename ending in ``.gz``.
    '''
    # NOTE(review): P.run() is called without arguments, so ``statement``
    # and the values for its %(name)s placeholders (and presumably
    # job_memory) are taken from this function's local variables -
    # confirm before renaming anything.
    job_memory = "6G"

    infile, kallisto_index = infiles
    outdir = os.path.dirname(outfile)
    outfile = P.snip(outfile, ".gz")

    tempfastq1 = P.getTempFilename()
    tempfastq2 = P.getTempFilename()
    cleanup = [tempfastq1, tempfastq2]

    steps = []
    if infile.endswith(".remote"):
        tempbam = P.getTempFilename()
        s, remote_bams = Sra.process_remote_BAM(infile, outdir=tempbam)
        # %(infile)s now refers to the downloaded files
        infile = " ".join(remote_bams)
        steps.append(s)
        cleanup.append(tempbam)

    # %(rm_files)s expects a space-separated string
    rm_files = " ".join(cleanup)

    steps.extend([
        "samtools fastq %(infile)s -1 %(tempfastq1)s -2 %(tempfastq2)s",
        "kallisto quant -i %(kallisto_index)s -o %(outdir)s"
        " %(tempfastq1)s %(tempfastq2)s",
        "rm -R %(rm_files)s",
        "gzip %(outfile)s",
    ])

    statement = "; \n checkpoint;\n".join(steps)

    P.run()
def downloadSequinsNeatData(outfile):
    '''Download the neat Sequins data from NCBI.

    The basename of `outfile` selects the run: ``neat-A.fastq.1.gz`` or
    ``neat-B.fastq.1.gz``; any other name raises ``KeyError``.  The
    ``.sra`` file is fetched with wget into the working directory,
    extracted with the statement from ``Sra.extract`` into the directory
    of `outfile`, the two extracted files are renamed to
    ``<name>.fastq.1.gz`` / ``<name>.fastq.2.gz`` and the ``.sra`` file
    is deleted.
    '''
    # NOTE(review): P.run() is called without arguments, so ``statement``
    # and the values for its %(name)s placeholders are presumably taken
    # from this function's local variables - confirm before renaming
    # anything.
    address_base = 'ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByExp/sra/SRX/SRX189'

    # output basename -> (run accession, experiment accession)
    accessions = {
        'neat-A.fastq.1.gz': ('SRR3743147', 'SRX1897294'),
        'neat-B.fastq.1.gz': ('SRR3743148', 'SRX1897295'),
    }

    outfile_base = os.path.basename(outfile)
    srr, srx = accessions[outfile_base]

    outfile_name = P.snip(outfile_base, '.fastq.1.gz')
    sra_file = outfile_name + '.sra'

    # step 1: fetch the archive
    statement = (
        " wget %(address_base)s/%(srx)s/%(srr)s/%(srr)s.sra"
        " -O %(outfile_name)s.sra ")
    P.run()

    # step 2: unpack it next to the output file
    outdir = os.path.dirname(outfile)
    statement = Sra.extract(sra_file, outdir)
    P.run()

    # step 3: rename the extracted files
    statement = (
        " mv %(outdir)s/%(outfile_name)s_1.fastq.gz"
        " %(outdir)s/%(outfile_name)s.fastq.1.gz;"
        " checkpoint;"
        " mv %(outdir)s/%(outfile_name)s_2.fastq.gz"
        " %(outdir)s/%(outfile_name)s.fastq.2.gz")
    P.run()

    os.unlink(sra_file)
def extractGSE65525(infile, outfile):
    '''Extract the fastq files of `infile` into ``GSE65525/fastqs.dir``.

    Runs the statement returned by ``SRA.extract``; `outfile` is not
    used in the body.
    '''
    fastq_dir = "GSE65525/fastqs.dir"

    # NOTE(review): P.run() takes no arguments here and presumably picks
    # up ``statement`` from the local variables - keep that name.
    statement = SRA.extract(infile, fastq_dir)
    P.run()
def extractGGSE53638(infile, outfile):
    '''Extract the fastq files of the SRA file `infile` into
    ``GSE53638/fastqs.dir``.

    Runs the statement returned by ``SRA.extract``; `outfile` is not
    used in the body.
    '''
    fastq_dir = "GSE53638/fastqs.dir"

    # NOTE(review): P.run() takes no arguments here and presumably picks
    # up ``statement`` from the local variables - keep that name.
    statement = SRA.extract(infile, fastq_dir)
    P.run()
def makeAdaptorFasta(infile, outfile, track, dbh, contaminants_file):
    '''Generate a .fasta file of adaptor sequences that are
    overrepresented in the reads from a sample.

    Requires cutadapt >= 1.7.

    Overrepresented sequences are read from the
    ``<table>_fastqc_Overrepresented_sequences`` tables.  Only sources
    that are listed in `contaminants_file` are reported, with the
    sequence given in that file.  If no overrepresented sequences are
    found at all, `outfile` is merely touched.  Records are written in
    sorted order of the source name so that the output is reproducible.

    Arguments
    ---------
    infile : string
        Input filename that has been QC'ed. The filename is used to
        check if the input was a :term:`sra` file and guess the number
        of tracks to check.
    outfile : string
        Output filename in :term:`fasta` format.
    track : string
        Track name, used to access FastQC results in database.
    dbh : object
        Database handle.
    contaminants_file : string
        Path of file containing contaminants used for screening by
        Fastqc.
    '''
    tracks = [track]

    if infile.endswith(".sra"):
        # patch for SRA files, look at multiple tracks
        f, fastq_format, datatype = Sra.peek(infile)
        if len(f) == 2:
            tracks = [track + "_fastq_1",
                      track + "_fastq_2"]
    elif infile.endswith(".fastq.1.gz"):
        tracks = [track + "_fastq_1",
                  track + "_fastq_2"]
    elif infile.endswith(".fastq.gz"):
        tracks = [track]

    found_contaminants = []

    for t in tracks:
        table = PipelineTracks.AutoSample(os.path.basename(t)).asTable()

        # if sample name starts with a number, sql table will have
        # prepended "_"
        if re.match(r"^\d+.*", table):
            table = "_" + table

        # the table name cannot be passed as a query parameter, hence
        # the string interpolation
        query = '''SELECT Possible_Source, Sequence FROM %s_fastqc_Overrepresented_sequences;''' % table

        cc = dbh.cursor()

        # if there is no contamination table for even a single sample
        # it will prevent the whole pipeline progressing
        try:
            found_contaminants.extend(cc.execute(query).fetchall())
        except sqlite3.OperationalError:
            E.warn("No table found for {}".format(t))

    if len(found_contaminants) == 0:
        P.touch(outfile)
        return

    # read contaminants from existing file: every non-empty, non-comment
    # line is split on whitespace; the last field is the sequence, the
    # fields before it (joined by a single space) are the name
    with IOTools.openFile(contaminants_file, "r") as inf:
        known_contaminants = [l.split() for l in inf
                              if not l.startswith("#") and l.strip()]
        known_contaminants = {" ".join(x[:-1]): x[-1]
                              for x in known_contaminants}

    # output the full sequence of the contaminant if found
    # in the list of known contaminants, otherwise don't report!
    matched_contaminants = set()
    for found_source, found_seq in found_contaminants:
        # drop the parenthesised annotation after the source name
        possible_source = found_source.split(" (")[0]
        if possible_source in known_contaminants:
            matched_contaminants.add(possible_source)

    # the output file is created even if nothing matched
    with IOTools.openFile(outfile, "w") as outf:
        for match in sorted(matched_contaminants):
            outf.write(">%s\n%s\n" %
                       (match.replace(" ,", ""),
                        known_contaminants[match]))