def _reallocate_reads(self, in_fn):
  '''
  Run the reallocateReads script, overwriting the input file with a
  new bam file in which non-unique reads are reallocated according to
  the distribution of unique reads. A pretty horrible hack,
  statistically speaking.
  '''
  # Re-distribute non-unique reads.
  tmpfile = os.path.join(self.conf.tmpdir,
                         "%s_cs_processAlignmentBwa.tmp" % in_fn)
  LOGGER.info("Re-allocating non-unique reads to a temporary file %s.",
              tmpfile)
  cmd = (self.conf.read_reallocator, in_fn, tmpfile)
  LOGGER.debug(cmd)
  call_subprocess(cmd, path=self.conf.hostpath)

  # Re-sort output bam.
  LOGGER.info("Sorting temporary file %s back to %s.", tmpfile, in_fn)
  cmd = (self.conf.read_sorter, 'sort',
         '-m', self.conf.meminbytes,
         '-o', in_fn, tmpfile)
  LOGGER.debug(cmd)
  call_subprocess(cmd, path=self.conf.hostpath)

  # Remove tmp file.
  LOGGER.info("Removing temporary file %s.", tmpfile)
  os.unlink(tmpfile)
def update_library_bam_readgroups(libcode, update=False):
  '''
  Add or replace the read groups for all the bam files attached to a
  given library. If update is False (the default), picard
  AddOrReplaceReadGroups is used; if update is True then just the bam
  file header is rewritten using an internal pipeline function.
  '''
  lib = Library.objects.get(code=libcode)
  bams = Alnfile.objects.filter(alignment__lane__library__code=libcode,
                                filetype__code='bam')
  common_args = ('VALIDATION_STRINGENCY=SILENT',
                 'TMP_DIR=%s' % CONFIG.tmpdir)

  for bam in bams:
    LOGGER.info("Updating bam file: %s", bam.filename)
    checksum = checksum_file(bam.repository_file_path, unzip=False)
    if checksum != bam.checksum:
      raise ValueError(
        "Stored bam checksum does not agree with that in the repository.")
    if update:
      LOGGER.debug("Rewriting bam file header: %s", bam.filename)
      update_bam_readgroups(bam)
    else:
      tmpfile = "%s.update_rg" % (bam.filename,)
      cmd = ('picard', 'AddOrReplaceReadGroups',
             'INPUT=%s' % bam.repository_file_path,
             'OUTPUT=%s' % tmpfile,
             'RGLB=%s' % lib.code,
             'RGSM=%s' % sanitize_samplename(lib.sample.name),
             'RGCN=%s' % bam.alignment.lane.facility.code,
             'RGPU=%d' % int(bam.alignment.lane.lanenum),
             'RGPL=illumina') + common_args
      LOGGER.debug("Running command: %s", " ".join(cmd))
      call_subprocess(cmd, path=os.environ['PATH'])
      update_repo_bamfile(bam, tmpfile)
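# For reference, the picard branch above should leave the bam header
# with a read-group line of roughly the following shape (tag names per
# the SAM spec; the values shown here are hypothetical, and the real
# header fields are tab-separated):
#
#   @RG  ID:1  LB:do1234  SM:sample_x  CN:CRI  PU:5  PL:illumina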
def make_bed_graph(self, aln):
  '''
  Code wrapper for makeWiggle.
  '''
  aln = Alignment.objects.get(id=aln.id)  # Reload passed object within transaction.
  bed = aln.alnfile_set.filter(filetype=self.bedtype)\
                       .exclude(filename__contains='chr21')[0]

  # Note makeWiggle can read gzipped bed files directly; we use that
  # fact here.
  lib = aln.lane.library
  bedFN = bed.repository_file_path

  # Write to local directory first.
  bgrBASE = os.path.splitext(bed.filename)[0]
  bgrFN = bgrBASE + self.bgrtype.suffix
  cmd = BED2BGR % (quote(bedFN), quote(bgrBASE))
  LOGGER.debug(cmd)
  if not self.testMode:
    call_subprocess(cmd, shell=True, path=self.conf.hostpath)
    if not os.path.exists(bgrFN):
      LOGGER.error("Failed to create bgr file '%s'" % (bgrFN,))
    else:
      chksum = checksum_file(bgrFN)
      bgr = Alnfile(filename=os.path.basename(bgrFN), checksum=chksum,
                    filetype=self.bgrtype, alignment=aln)
      bgrFN = rezip_file(bgrFN)
      move(bgrFN, bgr.repository_file_path)
      set_file_permissions(self.conf.group, bgr.repository_file_path)
      bgr.save()
def get_genome_size_file(self, genome):
  '''
  Retrieve the file containing chromosome lengths for a given genome.
  Returns two values: the filename, and a flag indicating whether that
  file should be treated as temporary, i.e. deleted once the caller is
  done with it. Such deletion is the responsibility of the calling
  code.
  '''
  fnchrlen = os.path.join(self.conf.genomesizedir, genome + ".fa.length")
  if not os.path.exists(fnchrlen):
    tmpfile = NamedTemporaryFile(delete=False, dir=self.conf.tmpdir)
    cmd = "%s %s > %s" % ('fetchChromSizes', genome, tmpfile.name)
    # Note - assumes we're running on our primary host. FIXME?
    call_subprocess(cmd, shell=True, path=self.conf.hostpath)
    tmpfile.close()
    try:
      LOGGER.info("Storing new chromosome sizes file as %s", fnchrlen)
      move(tmpfile.name, fnchrlen)
      set_file_permissions(self.conf.group, fnchrlen)
    except Exception, err:
      LOGGER.warning("Attempt to store chromosome sizes file"
                     + " as %s failed: %s", fnchrlen, err)
      # Fall back on the temporary file; the caller must delete it.
      return (tmpfile.name, True)
  return (fnchrlen, False)
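def _example_genome_size_usage(proc, genome):
  '''
  Hypothetical caller sketch (not pipeline code): illustrates the
  (filename, is_temp) contract of get_genome_size_file above, where
  deleting the temporary file is the caller's responsibility. The
  `proc` argument stands in for whichever object carries that method.
  '''
  (sizefile, is_temp) = proc.get_genome_size_file(genome)
  try:
    with open(sizefile) as sizefh:
      sizes = dict(line.split() for line in sizefh)  # chrom -> length
  finally:
    if is_temp:
      os.unlink(sizefile)  # Only ever unlink the temporary copy.
  return sizes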
def postprocess_results(self, fns):
  '''
  Checks for output and adds it to self.output_files. Note that we
  want the compressed archive to be gzipped (*.tar.gz), not zipped. We
  also want the fastqc_report.txt file stored separately and
  uncompressed.
  '''
  for fpath in fns:
    fname = os.path.split(fpath)[1]
    fname = re.sub(r'\.gz$', '', fname)

    # FastQC strips '.fastq' but not '.fq', so we only remove the
    # former here.
    fname = re.sub(r'\.fastq$', '', fname)
    base = "%s_fastqc" % fname
    bpath = os.path.join(self.workdir, base)
    if not os.path.exists(bpath):
      raise StandardError("Expected output directory not found: %s" % bpath)

    # Sort out the tar-gzipped archive.
    gzarch = "%s.tar" % bpath
    tar = tarfile.open(gzarch, mode='w')

    # A little jimmying around so we only get the directory we want.
    pwd = os.getcwd()
    os.chdir(self.workdir)
    tar.add(base)
    os.chdir(pwd)
    tar.close()
    self.output_files.append(gzarch)
    self.output_md5s.append(checksum_file(gzarch))

    # The text file containing summary results. Useful for analyses.
    resfile = "%s.txt" % bpath
    copy(os.path.join(bpath, 'fastqc_data.txt'), resfile)
    self.output_files.append(resfile)
    self.output_md5s.append(checksum_file(resfile))

    # Generating a PDF for our end-users.
    html = os.path.join(bpath, 'fastqc_report.html')
    pdf = "%s.pdf" % bpath

    # FIXME resource_filename is a little brittle, would
    # resource_string be better?
    cmd = ['wkhtmltopdf-amd64', '--user-style-sheet',
           resource_filename(Requirement.parse('osqpipe'),
                             'osqpipe/pipeline/fastqc_pdf_styles.css'),
           html, pdf]
    call_subprocess(cmd, path=self.path)
    self.output_files.append(pdf)
    self.output_md5s.append(checksum_file(pdf))
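# Design note: the os.chdir() jimmying in postprocess_results above
# could alternatively be avoided with tarfile's arcname argument,
# which is standard-library behaviour, e.g. (a sketch only; the code
# above is what actually runs):
#
#   tar = tarfile.open(gzarch, mode='w')
#   tar.add(bpath, arcname=base)  # Store the directory under its bare name.
#   tar.close()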
def generate_bigwig_files(self, bedgraphs, chrom_sizes):
  '''
  Uses bedGraphToBigWig to make bigWig file(s).
  '''
  LOGGER.info("Creating bigWig files...")
  bigwigs = []
  for bgr_fn in bedgraphs:
    bwfile = splitext(bgr_fn)[0] + '.bw'
    cmd = ('bedGraphToBigWig', bgr_fn, chrom_sizes, bwfile)
    LOGGER.debug(cmd)
    try:  # This can fail, e.g. for very small input files.
      call_subprocess(cmd, path=self.conf.hostpath)
      bigwigs.append(bwfile)
    except CalledProcessError, err:
      LOGGER.warn("Unable to create bigWig file: %s", err)
  return bigwigs
def demultiplex(self, codes, fname):
  '''
  Actually run the demultiplexing, using demuxIllumina.
  '''
  # Look up adapters from the database and write a sampleSheet file.
  LOGGER.debug("Making sample sheet.")
  sheet = self.make_sample_sheet(codes, fname)
  LOGGER.info("Sample sheet created.")

  # Invoke the demultiplexer.
  cmd = [self.demux_prog, '-d', sheet, fname]  # demuxIllumina v2.0 and above
  LOGGER.debug("Command for demultiplexing: %s", " ".join(cmd))
  pout = call_subprocess(cmd, path=self.conf.hostpath)

  fnpat = re.compile(r"tag\s+(\w+):\s+([^\s]+)\s*$")
  fnset = set()
  lostpat = re.compile(r"lost\s+([\/\d]+)\s+reads")
  for line in pout:
    matchobj = fnpat.match(line)
    if matchobj:
      fnset.add(matchobj.group(2))
    else:
      matchobj2 = lostpat.match(line)
      if matchobj2:
        LOGGER.info("lost %s reads", matchobj2.group(1))

  for fname in fnset:
    set_file_permissions(self.conf.group, fname)

  # Delete the sample sheet.
  os.unlink(sheet)
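def _example_demux_output_parsing():
  '''
  Hypothetical illustration of the demuxIllumina output-line shapes
  which demultiplex() above parses. The example lines are inferred
  from the regexes themselves, not from demuxIllumina documentation.
  '''
  fnpat = re.compile(r"tag\s+(\w+):\s+([^\s]+)\s*$")
  lostpat = re.compile(r"lost\s+([\/\d]+)\s+reads")
  assert (fnpat.match("tag CGATGT: do1234_CGATGT.fq").group(2)
          == "do1234_CGATGT.fq")
  assert lostpat.match("lost 1234/567890 reads").group(1) == "1234/567890"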
def fetch_mga(flowcell, flowlane, destination, nameprefix, lims_fc=None):
  """Fetches the MGA report from the Genologics LIMS. Returns the
  generated PDF report file(s)."""
  mgafiles = []
  flowlane = int(flowlane)

  # Start logging.
  if TEST_MODE:
    LOGGER.setLevel(DEBUG)
  else:
    LOGGER.setLevel(INFO)

  # Set up the LIMS connection.
  if lims_fc is None:
    lims = Lims()
    if not lims.running():
      LOGGER.error("Remote LIMS access broken... cannot continue.")
      sys.exit("LIMS not running.")

    # Search the LIMS for a lane on the flowcell.
    lims_fc = lims.load_flowcell(flowcell)
  if TEST_MODE:
    lims_fc.dump()
  lane = lims_fc.get_lane(flowlane)

  # Retrieve Lane MGA file. See upstream_lims module for supported
  # file type strings (e.g. 'LANE_MGA').
  files = lane.lims_files('LANE_MGA')
  if len(files) == 0:
    LOGGER.info("No files to retrieve for %s_%d", flowcell, lane.lane)
  for lfile in files:
    if lfile.uri.lower()[-4:] != 'html':
      continue
    LOGGER.debug("MGA HTML URI: %s", lfile.uri)
    if destination:
      local_pdf = os.path.join(destination, "%s.pdf" % nameprefix)
    else:
      local_pdf = nameprefix + ".pdf"

    # Convert the HTML page direct to PDF for storage in the repository.
    cmd = ['wkhtmltopdf-amd64', lfile.uri, local_pdf]
    try:
      call_subprocess(cmd, path=CONFIG.hostpath)
      mgafiles.append(local_pdf)
    except CalledProcessError, err:
      LOGGER.warning("Unable to download and/or convert MGA report"
                     + " to PDF: %s", err)

  return mgafiles
def run_fastqc(self, fns, threads=2):
  """Executes fastqc report generation."""
  assert len(fns) > 0
  if len(fns) < threads:
    threads = len(fns)
  cmd = [self.program_name, '-q', '-t', threads, '-o', self.workdir]
  if len(self.program_params) > 0:
    cmd.extend(self.program_params.split())
  cmd.extend(fns)
  cmd = [str(x) for x in cmd]
  LOGGER.info("Running FastQC command: %s", " ".join(cmd))
  call_subprocess(cmd, path=self.path)
def run_analysis(self, fns):
  """Executes cross-correlation report generation."""
  assert len(fns) == 1
  basefn = os.path.splitext(fns[0])[0]
  pdf = "%s_xcor.pdf" % basefn
  out = "%s_xcor.txt" % basefn
  with NamedTemporaryFile(prefix=basefn, suffix='.bam',
                          dir=self.workdir) as tempbam:
    with NamedTemporaryFile(suffix='.txt', dir=self.workdir) as tempout:
      mdcmd = ['picard', 'MarkDuplicates',
               'I=%s' % fns[0],
               'O=%s' % tempbam.name,
               'M=%s' % tempout.name,
               'REMOVE_DUPLICATES=true',
               'VALIDATION_STRINGENCY=SILENT']
      LOGGER.info('Removing bam file duplicate reads prior to'
                  + ' cross-correlation analysis')
      call_subprocess(mdcmd, path=self.path)
    # tempout closes and is deleted.

    cmd = [self.program_name,
           '-c=%s' % tempbam.name,
           '-savp=%s' % pdf,
           '-out=%s' % out]
    if len(self.program_params) > 0:
      cmd.extend(self.program_params.split())
    cmd = [str(x) for x in cmd]
    LOGGER.debug("Constructed cross-correlation command: %s", " ".join(cmd))
    LOGGER.info("Running Cross-correlation analysis")
    call_subprocess(cmd, path=self.path)
  # tempbam closes and is deleted.

  self.output_files.extend([out, pdf])
def generate_wig_files(self, beds):
  '''
  Uses makeWiggle to generate wiggle files.
  '''
  wigs = []
  for bed_fn in beds:
    cmd = ('makeWiggle', '-t', '3', bed_fn, splitext(bed_fn)[0])
    LOGGER.debug(" ".join(cmd))
    pout = call_subprocess(cmd, path=self.conf.hostpath)
    for line in pout:
      wigs.append(line.strip())
    pout.close()
  return wigs
def rename_files(files, fromFun):
  '''
  Move files on the server from their old (fromFun-derived) paths to
  their current repository paths, creating directories as needed.
  '''
  for fobj in files:
    old = fromFun(fobj)
    old = "/".join((SERVERDIR, old))
    new = fobj.repository_file_path
    new = "/".join((SERVERDIR, new))
    if old != new:
      d = os.path.dirname(new)
      # print "Creating directory %s" % (d,)
      cmd = ssh_command(('mkdir', '-p', quote(d)))
      call_subprocess(cmd, shell=True, path=CONFIG.hostpath)
      # print "Moving %s to %s" % (old, new)
      old = bash_quote(old)
      new = bash_quote(new)
      cmd = ssh_command(('mv', quote(old), quote(new)))
      try:
        call_subprocess(cmd, shell=True, path=CONFIG.hostpath)
      except CalledProcessError, err:
        print "Warning: move failed for file %s: %s" % (old, err)
def samtools_merge_bams(self, bams, output_fn):
  '''
  Use samtools to merge a set of bams locally.
  '''
  if len(bams) == 0:
    raise ValueError("Zero input bam files for merging.")
  if len(bams) == 1:
    LOGGER.warning("Only one bam file supplied; copying, rather than merging.")
    copy(bams[0], output_fn)
    return

  # We assume our input bam files are appropriately sorted (which,
  # coming from the repository, they should be).
  cmd = ['samtools', 'merge', output_fn] + bams
  LOGGER.info("Using samtools to merge bam files: %s",
              ", ".join([os.path.basename(bam) for bam in bams]))
  call_subprocess(cmd, path=CONFIG.hostpath)
  if not os.path.exists(output_fn):
    raise StandardError("Merged output file does not exist: %s" % output_fn)
def run_picard(libcode, facility, lanenum=None, genome=None):
  '''
  Run picard CleanSam, AddOrReplaceReadGroups and FixMateInformation
  over the repository bam files for a library, then swap the cleaned
  files into the repository.
  '''
  bams = Alnfile.objects.filter(alignment__lane__library__code=libcode,
                                alignment__lane__facility__code=facility,
                                filetype__code='bam')
  if lanenum is not None:
    bams = bams.filter(alignment__lane__lanenum=lanenum)
  if genome is not None:
    bams = bams.filter(alignment__genome__code=genome)
  if len(bams) == 0:
    raise StandardError("Unable to find matching bam file in the repository.")

  for bam in bams:
    LOGGER.info("Confirming file checksum: %s", bam.filename)
    oldsum = checksum_file(bam.repository_file_path, unzip=False)
    if oldsum != bam.checksum:
      raise ValueError(("MD5 checksum of bam file on disk (%s) does not agree"
                        + " with stored repository value (%s): %s")
                       % (oldsum, bam.checksum, bam.filename))
    newbam = bam.repository_file_path + '.cleaned'
    postproc = BamPostProcessor(input_fn=bam.repository_file_path,
                                output_fn=newbam)

    # Run CleanSam.
    LOGGER.info("Running CleanSam...")
    call_subprocess(postproc.clean_sam(), path=CONFIG.hostpath)

    # Run AddOrReplaceReadGroups.
    LOGGER.info("Running AddOrReplaceReadGroups...")
    call_subprocess(postproc.add_or_replace_read_groups(),
                    path=CONFIG.hostpath)
    os.unlink(postproc.cleaned_fn)

    # Run FixMateInformation.
    LOGGER.info("Running FixMateInformation...")
    call_subprocess(postproc.fix_mate_information(), path=CONFIG.hostpath)
    os.unlink(postproc.rgadded_fn)

    # Quick sanity check on the output.
    newcount = count_bam_reads(newbam)

    # FIXME total_reads should be total reads in bam, not in fastq.
    oldcount = bam.alignment.total_reads
    if bam.alignment.lane.paired:
      oldcount = oldcount * 2
    if newcount != oldcount:
      raise ValueError(("Read count in cleaned bam file (%d) does not agree"
                        + " with total_reads in repository (%d): %s")
                       % (newcount, oldcount, newbam))

    # Clean up and replace the old bam file with the new one.
    LOGGER.info("Replacing old bam file with new: %s",
                bam.repository_file_path)
    replace_repo_file(bam, newbam)
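# For orientation, a skeletal BamPostProcessor consistent with its use
# in run_picard above (a hypothetical reconstruction; the real class
# lives elsewhere in the pipeline and will differ in detail). Each
# method returns a command suitable for call_subprocess(), and the
# cleaned_fn/rgadded_fn attributes name the intermediate files which
# run_picard deletes between steps.
class _ExampleBamPostProcessor(object):
  def __init__(self, input_fn, output_fn, tmpdir='/tmp'):
    self.input_fn = input_fn
    self.output_fn = output_fn
    base = os.path.join(tmpdir, os.path.basename(input_fn))
    self.cleaned_fn = base + '.cleaned'
    self.rgadded_fn = base + '.rg'
  def clean_sam(self):
    return ('picard', 'CleanSam',
            'INPUT=%s' % self.input_fn,
            'OUTPUT=%s' % self.cleaned_fn)
  def add_or_replace_read_groups(self):
    # Required RG* arguments omitted for brevity.
    return ('picard', 'AddOrReplaceReadGroups',
            'INPUT=%s' % self.cleaned_fn,
            'OUTPUT=%s' % self.rgadded_fn)
  def fix_mate_information(self):
    return ('picard', 'FixMateInformation',
            'INPUT=%s' % self.rgadded_fn,
            'OUTPUT=%s' % self.output_fn)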
def generate_bedgraph_files(self, beds):
  '''
  Uses makeWiggle to generate bedgraph files.
  '''
  # Make bedgraph file(s).
  LOGGER.info("Creating bedgraph files...")
  bedgraphs = []
  for bed_fn in beds:
    cmd = ('makeWiggle', '-t', '3', '-B', '-1', bed_fn,
           splitext(bed_fn)[0])
    LOGGER.debug(cmd)
    try:  # Occasionally fails for poorer-quality genomes.
      pout = call_subprocess(cmd, path=self.conf.hostpath)
      for line in pout:
        bedgraphs.append(line.strip())
      pout.close()
    except CalledProcessError, err:
      LOGGER.warn("Unable to create bedGraph file: %s", err)
  return bedgraphs
def ed_get_files_by_fastqfile(self, file1, file1_md5, laneid,
                              file2=None, file2_md5=None):
  '''Downloads all files for the userid.'''
  # For each fastq file in Edinburgh Genomics one could also download
  # the *_R1_fastqc.html file, but these are of little use to us and
  # the code for this below has been commented out.
  # file1_fastqc = file1.rstrip('.fastq.gz') + '_fastqc.html'
  # rfiles = [file1, file1_fastqc]
  # rmd5s = [file1_md5, None]
  rfiles = [file1]
  rmd5s = [file1_md5]
  fail_command = ('cs_edinburgh_download.py -a --file1 %s --file1_md5 %s'
                  + ' -p %s -l %d') % (file1, file1_md5,
                                       self.project, laneid)
  if file2 is not None:
    # file2_fastqc = file2.rstrip('.fastq.gz') + '_fastqc.html'
    # rfiles = rfiles + [file2, file2_fastqc]
    # rmd5s = rmd5s + [file2_md5, None]
    rfiles = rfiles + [file2]
    rmd5s = rmd5s + [file2_md5]
    fail_command += " --file2 %s --file2_md5 %s" % (file2, file2_md5)

  failed = False

  # Download the files in rfiles.
  for (rfpath, md5sum) in zip(rfiles, rmd5s):
    attempts = 0
    while attempts < self.maxattempts:
      # If the file to be downloaded is .html, skip downloading the
      # .md5 as there won't be one:
      # if rfpath.endswith('.html'):
      #   res = self.ed_get_file_with_md5(rfpath, md5sum, download_md5=False)
      # else:
      #   res = self.ed_get_file_with_md5(rfpath, md5sum, download_md5=True)
      res = self.ed_get_file_with_md5(rfpath, md5sum, download_md5=True)
      if res == 0:
        break
      else:
        attempts += 1
        if attempts == self.maxattempts:
          LOGGER.error("Giving up (after %d attempts) on trying to"
                       + " download %s" % (attempts, rfpath))
          # Set download failed.
          cmd = ('communicateStatus.py --laneid %d'
                 + ' --status downloading_failed') % laneid
          # Record the failed command (in case it's of any use).
          cmd += ' && echo %s >> %s' % (fail_command, self.failedcommands)
          run_command(cmd, shell=True)
          failed = True
        else:
          LOGGER.error("Trying to download %s again." % rfpath)

  # Register the files as downloaded.
  if not failed:
    # Get flowcell and machine info from read header of fastq file:
    # (machine, flowcell, flowlane) = parse_read_header(fqfile1)
    # 5. Set flowcell status 'downloaded' or 'failed download':
    # cmd = ('communicateStatus.py --status complete --flowcell %s'
    #        + ' --flowlane 0 --library %s --facility EDG')
    #       % (flowcell, userid)
    cmd = 'communicateStatus.py --laneid %d --status downloaded' % laneid
    call_subprocess(cmd, shell=True)
class LastzAligner(ClusterJobManager):
  '''
  Class to handle all the steps required for generating an axt-format
  net alignment file using lastz as the aligner.
  '''
  # local_tempdir will need to be able to handle around 75GB when
  # aligning two typical mammalian genomes.
  def __init__(self, from_genome, to_genome, hsp_thresh=3000,
               length_limit=None, linear_gap='loose',
               local_tempdir=None, resume=False, *args, **kwargs):
    super(LastzAligner, self).__init__(*args, **kwargs)
    self.from_genome = from_genome
    self.to_genome = to_genome
    self.hsp_thresh = hsp_thresh
    self.length_limit = length_limit
    self.linear_gap = linear_gap

    # Flag used to tell the object to fill in missing lav files by
    # resubmitting to the cluster, rather than just working with
    # what's available.
    self.resume = resume

    systempdir = gettempdir() if local_tempdir is None else local_tempdir
    self.local_tempdir = os.path.join(systempdir, str(os.getpid()))
    os.mkdir(self.local_tempdir)  # Fails on pre-existing directory.
    self._tempfiles = []
    self._chr_sizes = {}

  def split_chrs(self, fasta, dryrun=False):
    '''
    Split a designated fasta file by chromosome. Returns a list of the
    generated fasta files. Any chromosome whose sequence exceeds
    self.length_limit will be split appropriately. Calling with
    dryrun=True returns a list of the files which would have been
    created; this may be useful when deciding on an appropriate
    length_limit parameter.
    '''
    LOGGER.info("Splitting fasta by chromosome: %s", fasta)

    # N.B. the trailing '/' is important here:
    wdir = os.path.join(self.local_tempdir,
                        '%s_chr_split/' % filebasename(fasta))
    if not dryrun:
      os.mkdir(wdir)  # Fails on pre-existing directory.
      self._tempfiles.append(wdir)

    outfiles = []
    handle = open(fasta, 'rU')
    for chromosome in SeqIO.parse(handle, 'fasta'):

      # Check whether we need to split the chromosome.
      seqlen = len(chromosome.seq)
      if self.length_limit and seqlen > self.length_limit:

        # Figure out how many chunks we need.
        denom = 2
        while (float(seqlen) / denom) > self.length_limit:
          denom += 1

        # Output the sequences.
        for segnum in range(denom):
          start = (segnum * (seqlen / denom)) + 1
          end = min(seqlen, (segnum + 1) * (seqlen / denom))

          # This filename format will be parsed later, in
          # process_lavs_to_psl. The filename coordinate needs to be
          # added to the output psl coords.
          new_id = "%s_+%d" % (chromosome.id, start - 1)
          chrfile = os.path.join(wdir, "%s.fa" % new_id)
          chrseg = chromosome[start - 1:end]
          chrseg.id = new_id
          if not dryrun:
            with open(chrfile, 'w') as chrfh:
              SeqIO.write([chrseg], chrfh, 'fasta')
          outfiles.append(chrfile)
      else:
        # If the chromosome is small enough, just dump it out in a
        # single file.
        chrfile = os.path.join(wdir, "%s.fa" % chromosome.id)
        if not dryrun:
          with open(chrfile, 'w') as chrfh:
            SeqIO.write([chromosome], chrfh, 'fasta')
        outfiles.append(chrfile)

    return outfiles

  def mask_tandem_repeats(self, fasta):
    '''
    Runs trfBig over the designated fasta file. Returns the
    newly-generated masked fasta file name. Runs quite slowly, so we
    keep the outputs following 2bit conversion.
    '''
    LOGGER.info("Masking tandem repeats for fasta: %s", fasta)
    curdir = os.getcwd()

    # trfBig writes to current working directory a lot.
    os.chdir(self.local_tempdir)
    maskfn = os.path.splitext(fasta)[0] + MASKTAG + '.fa'
    cmd = ['trfBig', fasta, maskfn]
    call_subprocess(cmd, tmpdir=self.local_tempdir,
                    path=os.environ['PATH'])
    os.chdir(curdir)
    return maskfn

  def convert_to_2bit(self, fasta, workdir=None):
    '''
    Runs faToTwoBit on the designated fasta file; returns the name of
    the output 2bit file.
    '''
    LOGGER.info("Converting fasta to 2bit: %s", fasta)
    if workdir is None:
      workdir = self.local_workdir
    twobitfn = os.path.join(workdir, filebasename(fasta) + '.2bit')
    cmd = ['faToTwoBit', '-noMask', fasta, twobitfn]
    call_subprocess(cmd, tmpdir=self.local_tempdir,
                    path=os.environ['PATH'])
    return twobitfn

  def make_cluster_filename(self, localfile):
    '''
    Generate a unique filename to be used on the cluster, without
    unnecessarily divulging local file paths.
    '''
    pathbits = os.path.split(localfile)
    hasher = md5()
    hasher.update(pathbits[0])
    clusterfile = "%d_%s_%s" % (os.getpid(), hasher.hexdigest(),
                                pathbits[1])
    return clusterfile

  def align(self, from_list, to_list, omit_list=None):
    '''
    Actually run the alignment. Requires lastz on the cluster, and of
    course bsub/LSF et al. Note that we will generate one lastz
    process for each chr-chr combination, so this should lend itself
    well to a clustered solution.
    '''
    # Just in case, for convenience.
    if type(from_list) in (str, unicode):
      from_list = [from_list]
    if type(to_list) in (str, unicode):
      to_list = [to_list]
    if omit_list is None:
      omit_list = []

    # Make sure the filenames on the cluster won't easily collide.
    cluster_from = [self.make_cluster_filename(x) for x in from_list]
    cluster_to = [self.make_cluster_filename(x) for x in to_list]

    # Note that we copy all the files to the cluster even if we only
    # want to repeat a handful of alignments; managing the files is
    # simpler that way.
    LOGGER.info("Copying files to cluster server.")
    self.submitter.remote_copy_files(filenames=from_list + to_list,
                                     destnames=cluster_from + cluster_to)

    job_ids = []
    lavfiles = []
    for from_num in range(len(from_list)):
      for to_num in range(len(to_list)):

        # Files on localhost.
        from_file = from_list[from_num]
        to_file = to_list[to_num]

        # Files on the cluster.
        from_clust = cluster_from[from_num]
        to_clust = cluster_to[to_num]

        outfile = "%s_%s.lav" % (filebasename(from_file),
                                 filebasename(to_file))
        if outfile in omit_list:
          LOGGER.warning("Skipping pre-existing lav file %s...", outfile)
          lavfiles.append(outfile)
          continue

        ## FIXME consider the --inner option here (ensembl-compara
        ## appears to use --inner=2200).
        LOGGER.info("Launching alignment (%s : %s).", from_file, to_file)
        clusterout = "%d_%s" % (os.getpid(), outfile)

        ## We use this file to monitor lastz completion, to
        ## disambiguate lastz failure from scp failure. FIXME if this
        ## turns out to be scp failure we can add a final re-try to
        ## the monitor job.
        clusterdone = clusterout + '.done'

        ## Note that using --chain here appears to be undesirable
        ## since the lastz chaining implementation is rather too
        ## simplistic for our purposes (see lastz docs).
        cmd = ['lastz', to_clust, from_clust,  # This is the correct order.
               '--format=lav',
               '--hspthresh=%d' % self.hsp_thresh,
               '--output=%s' % clusterout]

        sshcmd = self.return_file_to_localhost(clusterout, outfile,
                                               execute=False)
        LOGGER.debug(sshcmd)
        cmd = " ".join(cmd) + (' && touch %s && %s && rm %s %s'
                               % (clusterdone, sshcmd,
                                  clusterout, clusterdone))

        # 4GB is the default max mem for lastz. Setting mem=4000 means
        # some larger alignments fail silently; using 5000 seems much
        # more robust on our cluster.
        job_ids.append(self.submitter.submit_command(
            cmd=cmd,
            mem=self.memsize * 1024,
            auto_requeue=False,
            time_limit=self.time_limit))
        lavfiles.append(outfile)

        # Reduce the rate of cluster job submission, if desired.
        sleep(self.throttle)

    # Caller code tends to assume these paths are absolute.
    lavfiles = [os.path.join(self.local_workdir, x) for x in lavfiles]

    return (job_ids, lavfiles, cluster_from + cluster_to)

  def convert_to_psl(self, lav):
    '''
    Converts an input lav file to a temporary psl file.
    '''
    pslfn = os.path.join(self.local_tempdir, filebasename(lav) + '.psl')
    cmd = ['lavToPsl', lav, pslfn]
    call_subprocess(cmd, tmpdir=self.local_tempdir,
                    path=os.environ['PATH'])
    return pslfn  # Delete this file in the caller code.

  def process_lavs_to_psl(self, lavs):
    '''
    Convert .lav files to .psl, swap query and target, and split on
    target.
    '''
    # Convert lav files to psl, concatenate.
    LOGGER.info("Reorganising lav files into psl files.")
    psls = [self.convert_to_psl(x) for x in lavs]
    allpsl = os.path.join(self.local_tempdir, 'all.psl')

    # Concatenate the files. We take this opportunity to strip out the
    # junk we've added to the chromosome names.
    def repl(match):
      ''' Regex replace function. '''
      return "\t%s\t" % match.group(1)

    from_sizes = self.get_chr_sizes_dict(self.from_genome)
    to_sizes = self.get_chr_sizes_dict(self.to_genome)

    # We allow for any pid prefix so we can restart in a new process
    # if needed. Also allow for genome/chrN_trfBig_masked to support
    # fill-in files generated locally.
    genstr = (r'(?:%s|%s)' % (filebasename(self.from_genome),
                              filebasename(self.to_genome)))
    strip_re = re.compile(r'\t(?:\d+_%s_)?([^\t]*)%s\t' % (genstr, MASKTAG))

    # Keep this regex in sync with the file naming scheme used in
    # split_chrs.
    subchr_re = re.compile(r'^(.*)_\+(\d+)$')
    with open(allpsl, 'wb') as allfh:
      for inp in psls:
        with open(inp, 'rb') as pfh:
          for line in pfh:

            # We need to rewrite the chrnames here. Also remove the
            # trailing newline so it doesn't confuse the processing
            # below.
            newline = strip_re.sub(repl, line).rstrip('\n')

            # Parse out sub-chromosome coordinates from filenames and
            # fix coords appropriately. This is heavily dependent on
            # the PSL file following the specification.
            fields = newline.split("\t")
            if len(fields) > 1:

              # Sort out the query positions.
              chrA_match = subchr_re.match(fields[9])
              if chrA_match:
                fields[9] = chrA_match.group(1)
                basecoord = int(chrA_match.group(2))
                for fnum in (11, 12):
                  fields[fnum] = str(int(fields[fnum]) + basecoord)
                fields[19] = ','.join([str(int(x) + basecoord)
                                       for x in fields[19].split(',')
                                       if x != '']) + ','
                fields[10] = from_sizes[fields[9]]

              # Sort out the target positions.
              chrB_match = subchr_re.match(fields[13])
              if chrB_match:
                fields[13] = chrB_match.group(1)
                basecoord = int(chrB_match.group(2))
                for fnum in (15, 16):
                  fields[fnum] = str(int(fields[fnum]) + basecoord)
                fields[20] = ','.join([str(int(x) + basecoord)
                                       for x in fields[20].split(',')
                                       if x != '']) + ','
                fields[14] = to_sizes[fields[13]]

              # Quick check on our output. This is essentially cribbed
              # from the pslToBed code.
              if (int(fields[11]) >= int(fields[12])
                  or int(fields[12]) > int(fields[10])
                  or int(fields[15]) >= int(fields[16])
                  or int(fields[16]) > int(fields[14])):
                raise StandardError(
                  ("Mangled PSL format output. Offending input line"
                   + " was in file %s:\n\n%s\n\nMunged to:\n%s\n\n")
                  % (inp, line, "\t".join(fields)))

            newline = "\t".join(fields) + "\n"
            allfh.write(newline)

        os.unlink(inp)  # Attempt to save some temp space.

    # Swap target and source annotation, such that splitting on the
    # target actually splits on the query.
    swppsl = os.path.join(self.local_tempdir, 'all-swap.psl')
    cmd = ['pslSwap', allpsl, swppsl]
    call_subprocess(cmd, tmpdir=self.local_tempdir,
                    path=os.environ['PATH'])
    os.unlink(allpsl)

    # Split psl files by target chromosome.
    psldir = os.path.join(self.local_tempdir, 'psl/')
    os.mkdir(psldir)

    # Consider -lump option for scaffolds FIXME
    cmd = ['pslSplitOnTarget', swppsl, psldir]
    call_subprocess(cmd, tmpdir=self.local_tempdir,
                    path=os.environ['PATH'])
    target_psls = [os.path.join(self.local_tempdir, psldir, x)
                   for x in os.listdir(psldir)]
    self._tempfiles.extend(target_psls + [psldir])
    os.unlink(swppsl)

    return target_psls

  def get_chr_sizes(self, fasta):
    '''
    Runs faSize on a fasta file to generate chr size data.
    '''
    # We keep a cache because we'll be using this more than once.
    if fasta in self._chr_sizes:
      return self._chr_sizes[fasta]
    LOGGER.info("Calculating chr sizes for %s", fasta)
    sizefn = os.path.join(self.local_tempdir,
                          filebasename(fasta) + '.sizes')
    cmd = 'faSize %s -detailed > %s' % (bash_quote(fasta),
                                        bash_quote(sizefn))
    call_subprocess(cmd, tmpdir=self.local_tempdir,
                    path=os.environ['PATH'], shell=True)
    self._tempfiles.append(sizefn)
    self._chr_sizes[fasta] = sizefn
    return sizefn

  def get_chr_sizes_dict(self, fasta):
    '''
    As for get_chr_sizes, but also parses the file and returns a dict
    for convenience.
    '''
    sizefn = self.get_chr_sizes(fasta)
    sizes = dict()
    with open(sizefn, 'r') as sizefh:
      for row in sizefh:
        (chrom, size) = [x.strip() for x in row.split()]
        sizes[chrom] = size
    return sizes

  def chain(self, lavs):
    '''
    Chains the lastz output .lav files together.
    '''
    # We keep the filtered chain file.
    gen_from = filebasename(self.from_genome)
    gen_to = filebasename(self.to_genome)
    prechain = os.path.join(self.local_workdir,
                            '%s_vs_%s.pre.chain' % (gen_from, gen_to))
    if os.path.exists(prechain):
      LOGGER.warning("Prechain file already exists."
                     + " Assuming we can start from this point: %s",
                     prechain)
      return prechain

    # Convert lavs to appropriately-organised psl files.
    psls = self.process_lavs_to_psl(lavs)
    # FIXME at some point we need to add these psls to self._tempfiles

    # Run the initial chaining.
    LOGGER.info("Running the initial chaining.")
    chaindir = os.path.join(self.local_tempdir, 'chain/')
    os.mkdir(chaindir)
    chains = []
    for psl in psls:
      chfn = os.path.join(chaindir, filebasename(psl) + '.chain')
      cmd = ['axtChain', '-psl',
             '-linearGap=%s' % self.linear_gap,
             psl,
             '-faQ', self.from_genome,
             '-faT', self.to_genome,
             chfn]
      call_subprocess(cmd, tmpdir=self.local_tempdir,
                      path=os.environ['PATH'])
      chains.append(chfn)
      self._tempfiles.append(chfn)
    self._tempfiles.append(chaindir)

    # Filter the chained alignments before returning.
    allchain = os.path.join(self.local_tempdir, 'all.chain')
    cmd = ('chainMergeSort -tempDir=%s %s > %s'
           % (bash_quote(self.local_tempdir),
              " ".join([bash_quote(x) for x in chains]),
              bash_quote(allchain)))
    call_subprocess(cmd, tmpdir=self.local_tempdir,
                    path=os.environ['PATH'], shell=True)
    self._tempfiles.append(allchain)

    from_sizes = self.get_chr_sizes(self.from_genome)
    to_sizes = self.get_chr_sizes(self.to_genome)

    # Actually create the prechain file.
    cmd = ['chainPreNet', allchain, from_sizes, to_sizes, prechain]
    call_subprocess(cmd, tmpdir=self.local_tempdir,
                    path=os.environ['PATH'])
    return prechain

  def get_2bit(self, fasta):
    '''
    Simply generate a temporary 2bit file from the specified fasta
    file. Note the differences between this and convert_to_2bit.
    FIXME refactor so there's only one of these functions.
    '''
    outfn = os.path.join(self.local_tempdir,
                         filebasename(fasta) + '.2bit')
    if os.path.exists(outfn):
      return outfn
    LOGGER.info("Generating 2bit file for %s", fasta)
    cmd = ['faToTwoBit', fasta, outfn]
    call_subprocess(cmd, tmpdir=self.local_tempdir,
                    path=os.environ['PATH'])
    self._tempfiles.append(outfn)
    return outfn

  def net(self, prechain):
    '''
    Create nets from the chained alignments and convert them to axt
    format. Also generate a liftOver file.
    '''
    from_sizes = self.get_chr_sizes(self.from_genome)
    to_sizes = self.get_chr_sizes(self.to_genome)
    net = os.path.join(self.local_workdir, prechain + '.net')
    cmd = (('chainNet %s -minSpace=1 %s %s stdout /dev/null'
            + ' | netSyntenic stdin %s')
           % (bash_quote(prechain), bash_quote(from_sizes),
              bash_quote(to_sizes), bash_quote(net)))

    # This may fail for spurious reasons (e.g. absence of
    # /proc/self/stat on non-linux machines).
    try:
      LOGGER.info("Running chainNet and netSyntenic on prechain file.")
      call_subprocess(cmd, tmpdir=self.local_tempdir,
                      path=os.environ['PATH'], shell=True)
    except CalledProcessError, err:
      LOGGER.warning("chainNet or netSyntenic raised exception: %s", err)
    if not os.path.exists(net):
      raise StandardError(
        "chainNet/netSyntenic failed to create output net file %s" % net)

    axt = os.path.join(self.local_workdir,
                       "%s.%s.net.axt" % (filebasename(self.from_genome),
                                          filebasename(self.to_genome)))
    from_2bit = self.get_2bit(self.from_genome)
    to_2bit = self.get_2bit(self.to_genome)
    LOGGER.info('Converting to axt format.')
    cmd = ('netToAxt %s %s %s %s stdout | axtSort stdin %s'
           % (bash_quote(net), bash_quote(prechain),
              bash_quote(from_2bit), bash_quote(to_2bit),
              bash_quote(axt)))
    call_subprocess(cmd, tmpdir=self.local_tempdir,
                    path=os.environ['PATH'], shell=True)

    # These are cheap to generate and store, but potentially very
    # useful later.
    LOGGER.info('Creating liftOver file.')
    liftover = os.path.join(self.local_workdir, prechain + '.liftOver')
    cmd = ('netChainSubset', net, prechain, liftover)
    call_subprocess(cmd, tmpdir=self.local_tempdir,
                    path=os.environ['PATH'])

    return axt
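def _example_lastz_pipeline(from_fasta, to_fasta):
  '''
  Hypothetical end-to-end driver (a sketch, not pipeline code),
  showing the intended ordering of the LastzAligner steps above.
  Constructor arguments required by ClusterJobManager, and the wait
  for cluster job completion, are elided.
  '''
  aligner = LastzAligner(from_genome=from_fasta, to_genome=to_fasta)

  # Split each genome by chromosome and mask tandem repeats.
  from_chrs = [aligner.mask_tandem_repeats(fa)
               for fa in aligner.split_chrs(aligner.from_genome)]
  to_chrs = [aligner.mask_tandem_repeats(fa)
             for fa in aligner.split_chrs(aligner.to_genome)]

  # One lastz cluster job per chr-chr combination.
  (job_ids, lavs, remote_files) = aligner.align(from_chrs, to_chrs)
  # ... wait here for the cluster jobs to return the lav files ...

  # Chain, net and convert to axt (also emits a liftOver file).
  prechain = aligner.chain(lavs)
  return aligner.net(prechain)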