def _reallocate_reads(self, in_fn):
        '''
    Run the reallocateReads script, overwriting the input file with a
    new bam file in which non-unique reads are reallocated according
    to the distribution of unique reads. A pretty horrible hack,
    statistically speaking.
    '''
        # Re-distribute non-unique reads.
        tmpfile = os.path.join(self.conf.tmpdir,
                               "%s_cs_processAlignmentBwa.tmp" % in_fn)
        LOGGER.info("Re-allocating non-unique reads to a temporary file %s.",
                    tmpfile)
        cmd = (self.conf.read_reallocator, in_fn, tmpfile)
        LOGGER.debug(cmd)
        call_subprocess(cmd, path=self.conf.hostpath)

        # Re-sort output bam
        LOGGER.info("Sorting temporary file %s back to %s.", tmpfile, in_fn)
        cmd = (self.conf.read_sorter, 'sort', '-m', self.conf.meminbytes, '-o',
               in_fn, tmpfile)
        LOGGER.debug(cmd)
        call_subprocess(cmd, path=self.conf.hostpath)

        # Remove tmp file
        LOGGER.info("Removing temporary file %s.", tmpfile)
        os.unlink(tmpfile)
def update_library_bam_readgroups(libcode, update=False):
    '''
  Add or Replace the read groups for all the bam files attached to a
  given library. If update is False (the default), picard
  AddOrReplaceReadGroups is used; if update is True then just the bam
  file header is rewritten using an internal pipeline function.
  '''
    lib = Library.objects.get(code=libcode)
    bams = Alnfile.objects.filter(alignment__lane__library__code=libcode,
                                  filetype__code='bam')
    common_args = ('VALIDATION_STRINGENCY=SILENT',
                   'TMP_DIR=%s' % CONFIG.tmpdir)

    for bam in bams:
        LOGGER.info("Updating bam file: %s", bam.filename)
        checksum = checksum_file(bam.repository_file_path, unzip=False)
        if checksum != bam.checksum:
            raise ValueError(
                "Stored bam checksum does not agree with that in the repository."
            )
        if update:
            LOGGER.debug("Rewriting bam file header: %s", bam.filename)
            update_bam_readgroups(bam)
        else:
            tmpfile = "%s.update_rg" % (bam.filename, )
            cmd = ('picard', 'AddOrReplaceReadGroups', 'INPUT=%s' %
                   bam.repository_file_path, 'OUTPUT=%s' % tmpfile, 'RGLB=%s' %
                   lib.code, 'RGSM=%s' % sanitize_samplename(lib.sample.name),
                   'RGCN=%s' % bam.alignment.lane.facility.code,
                   'RGPU=%d' % int(bam.alignment.lane.lanenum),
                   'RGPL=illumina') + common_args

            LOGGER.debug("Running command: %s", " ".join(cmd))
            call_subprocess(cmd, path=os.environ['PATH'])
            update_repo_bamfile(bam, tmpfile)
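
# Minimal usage sketch (hypothetical library code 'do1234'; assumes a configured
# Django environment with the osqpipe models importable). With update=True only
# the bam header is rewritten in place; the default re-runs picard
# AddOrReplaceReadGroups and replaces the repository file.
if __name__ == '__main__':
    update_library_bam_readgroups('do1234', update=True)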
    def make_bed_graph(self, aln):
        '''
    Code wrapper for makeWiggle.
    '''
        aln = Alignment.objects.get(
            id=aln.id)  # Reload passed object within transaction.
        bed = aln.alnfile_set.filter(filetype=self.bedtype).exclude(
            filename__contains='chr21')[0]

        # Note makeWiggle can read gzipped bed files directly; we use that fact here.
        lib = aln.lane.library
        bedFN = bed.repository_file_path

        # Write to local directory first.
        bgrBASE = os.path.splitext(bed.filename)[0]
        bgrFN = bgrBASE + self.bgrtype.suffix
        cmd = BED2BGR % (quote(bedFN), quote(bgrBASE))
        LOGGER.debug(cmd)
        if not self.testMode:
            call_subprocess(cmd, shell=True, path=self.conf.hostpath)
            if not os.path.exists(bgrFN):
                LOGGER.error("Failed to create bgr file '%s'" % (bgrFN, ))
            else:
                chksum = checksum_file(bgrFN)
                bgr = Alnfile(filename=os.path.basename(bgrFN),
                              checksum=chksum,
                              filetype=self.bgrtype,
                              alignment=aln)
                bgrFN = rezip_file(bgrFN)
                move(bgrFN, bgr.repository_file_path)
                set_file_permissions(self.conf.group, bgr.repository_file_path)
                bgr.save()
    def get_genome_size_file(self, genome):
        '''
    Retrieve the filename containing chromosome lengths for a given
    genome. Returns two values: the name of the file, and whether
    that file should be treated as a temporary file, i.e. deleted
    once we are done with it. Such deletion is the responsibility of
    the calling code.
    '''
        fnchrlen = os.path.join(self.conf.genomesizedir, genome + ".fa.length")
        if not os.path.exists(fnchrlen):
            tmpfile = NamedTemporaryFile(delete=False, dir=self.conf.tmpdir)
            cmd = "%s %s > %s" % ('fetchChromSizes', genome, tmpfile.name)

            # Note - assumes we're running on our primary host. FIXME?
            call_subprocess(cmd, shell=True, path=self.conf.hostpath)
            tmpfile.close()
            try:
                LOGGER.info("Storing new chromosome sizes file as %s",
                            fnchrlen)
                move(tmpfile.name, fnchrlen)
                set_file_permissions(self.conf.group, fnchrlen)
            except Exception, err:
                LOGGER.warning(
                    "Attempt to store chromosome sizes file" +
                    " as %s failed: %s", fnchrlen, err)
                return (tmpfile.name, True)

        return (fnchrlen, False)
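
    # Caller-side sketch (illustrative only): the second return value tells the
    # caller whether it owns a temporary file that must be deleted when done.
    #
    #   sizes_fn, is_tmp = self.get_genome_size_file(genome)
    #   try:
    #       ...  # use sizes_fn
    #   finally:
    #       if is_tmp:
    #           os.unlink(sizes_fn)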
    def postprocess_results(self, fns):
        '''Checks for output and adds files to self.output_files. Note that we want
    the compressed archive to be gzipped (*.tar.gz), not zipped. We
    also want the fastqc_report.txt file stored separately and
    uncompressed.'''

        for fpath in fns:

            fname = os.path.split(fpath)[1]
            fname = re.sub(r'\.gz$', '', fname)

            # FastQC strips '.fastq' but not '.fq', so we only remove the former here.
            fname = re.sub(r'\.fastq$', '', fname)

            base = "%s_fastqc" % fname
            bpath = os.path.join(self.workdir, base)

            if not os.path.exists(bpath):
                raise StandardError("Expected output directory not found: %s" %
                                    bpath)

            # Sort out the tar-gzipped archive.
            gzarch = "%s.tar" % bpath
            tar = tarfile.open(gzarch, mode='w')

            # A little jimmying around so we only get the directory we want.
            pwd = os.getcwd()
            os.chdir(self.workdir)
            tar.add(base)
            os.chdir(pwd)

            tar.close()
            self.output_files.append(gzarch)
            self.output_md5s.append(checksum_file(gzarch))

            # The text file containing summary results. Useful for analyses.
            resfile = "%s.txt" % bpath
            copy(os.path.join(bpath, 'fastqc_data.txt'), resfile)
            self.output_files.append(resfile)
            self.output_md5s.append(checksum_file(resfile))

            # Generating a PDF for our end-users.
            html = os.path.join(bpath, 'fastqc_report.html')
            pdf = "%s.pdf" % bpath

            # FIXME resource_filename is a little brittle, would
            # resource_string be better?
            cmd = [
                'wkhtmltopdf-amd64', '--user-style-sheet',
                resource_filename(Requirement.parse('osqpipe'),
                                  'osqpipe/pipeline/fastqc_pdf_styles.css'),
                html, pdf
            ]
            call_subprocess(cmd, path=self.path)
            self.output_files.append(pdf)
            self.output_md5s.append(checksum_file(pdf))
 def convert_to_psl(self, lav):
     '''
 Converts an input lav file to a temporary psl file.
 '''
     pslfn = os.path.join(self.local_tempdir, filebasename(lav) + '.psl')
     cmd = ['lavToPsl', lav, pslfn]
     call_subprocess(cmd,
                     tmpdir=self.local_tempdir,
                     path=os.environ['PATH'])
     return pslfn  # Delete this file in the caller code.
 def generate_bigwig_files(self, bedgraphs, chrom_sizes):
     '''
 Uses bedGraphToBigWig to make bigWig file(s).
 '''
     LOGGER.info("Creating bigWig files...")
     bigwigs = []
     for bgr_fn in bedgraphs:
         bwfile = splitext(bgr_fn)[0] + '.bw'
         cmd = ('bedGraphToBigWig', bgr_fn, chrom_sizes, bwfile)
         LOGGER.debug(cmd)
         try:  # This can fail, e.g. for very small input files.
             call_subprocess(cmd, path=self.conf.hostpath)
             bigwigs.append(bwfile)
         except CalledProcessError, err:
             LOGGER.warn("Unable to create bigWig file: %s", err)
    def convert_to_2bit(self, fasta, workdir=None):
        '''
    Runs faToTwoBit on the designated fasta file; returns the name of
    the output 2bit file.
    '''
        LOGGER.info("Converting fasta to 2bit: %s", fasta)
        if workdir is None:
            workdir = self.local_workdir

        twobitfn = os.path.join(workdir, filebasename(fasta) + '.2bit')
        cmd = ['faToTwoBit', '-noMask', fasta, twobitfn]
        call_subprocess(cmd,
                        tmpdir=self.local_tempdir,
                        path=os.environ['PATH'])
        return twobitfn
 def get_2bit(self, fasta):
     '''
 Simply generate a temporary 2bit file from the specified fasta
 file. Note the differences between this and convert_to_2bit. FIXME
 refactor so there's only one of these functions.
 '''
     outfn = os.path.join(self.local_tempdir, filebasename(fasta) + '.2bit')
     if os.path.exists(outfn):
         return outfn
     LOGGER.info("Generating 2bit file for %s", fasta)
     cmd = ['faToTwoBit', fasta, outfn]
     call_subprocess(cmd,
                     tmpdir=self.local_tempdir,
                     path=os.environ['PATH'])
     self._tempfiles.append(outfn)
     return outfn
    def demultiplex(self, codes, fname):
        '''Actually run the demultiplexing, using demuxIllumina.'''

        # look up adapters from database,
        # write sampleSheet file
        LOGGER.debug("Making sample sheet.")
        sheet = self.make_sample_sheet(codes, fname)
        LOGGER.info("Sample sheet created.")
        # invoke demultiplexer
        cmd = [self.demux_prog, '-d', sheet,
               fname]  # demuxIllumina v2.0 and above
        LOGGER.debug("Command for demultiplexing: %s", " ".join(cmd))
        pout = call_subprocess(cmd, path=self.conf.hostpath)
        fnpat = re.compile(r"tag\s+(\w+):\s+([^\s]+)\s*$")
        fnset = set()
        lostpat = re.compile(r"lost\s+([\/\d]+)\s+reads")
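        # The two patterns above assume demuxIllumina reports lines broadly of
        # the form (illustrative, not verbatim output):
        #   tag ACGTGT: /path/to/lane1_ACGTGT.fq
        #   lost 1234/5678901 reads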
        for line in pout:
            matchobj = fnpat.match(line)
            if matchobj:
                fnset.add(matchobj.group(2))
            else:
                matchobj2 = lostpat.match(line)
                if matchobj2:
                    LOGGER.info("lost %s reads", matchobj2.group(1))

        for fname in fnset:
            set_file_permissions(self.conf.group, fname)

        # Delete the sample sheet.
        os.unlink(sheet)
def fetch_mga(flowcell, flowlane, destination, nameprefix, lims_fc=None):
    """Fetches MGA report from Genologics LIMS. Returns PDF report."""
    mgafiles = []
    flowlane = int(flowlane)

    # start logging
    if TEST_MODE:
        LOGGER.setLevel(DEBUG)
    else:
        LOGGER.setLevel(INFO)

    # install lims
    if lims_fc is None:
        lims = Lims()
        if not lims.running():
            LOGGER.error("Remote LIMS access broken... cannot continue.")
            sys.exit("LIMS not running.")

        # search lims for a lane on flowcell
        lims_fc = lims.load_flowcell(flowcell)

    if TEST_MODE:
        lims_fc.dump()
    lane = lims_fc.get_lane(flowlane)

    # Retrieve Lane MGA file. See upstream_lims module for supported
    # file type strings (e.g. 'LANE_MGA').
    files = lane.lims_files('LANE_MGA')
    if len(files) == 0:
        LOGGER.info("No files to retrieve for %s_%d", flowcell, lane.lane)
    for lfile in files:
        if lfile.uri.lower()[-4:] != 'html':
            continue
        LOGGER.debug("MGA HTML URI: %s", lfile.uri)
        if destination:
            local_pdf = os.path.join(destination, ("%s.pdf" % nameprefix))
        else:
            local_pdf = nameprefix + ".pdf"

        # Convert the HTML page direct to PDF for storage in the repository.
        cmd = ['wkhtmltopdf-amd64', lfile.uri, local_pdf]
        try:
            call_subprocess(cmd, path=CONFIG.hostpath)
            mgafiles.append(local_pdf)
        except CalledProcessError, err:
            LOGGER.warning(
                "Unable to download and/or convert MGA report to PDF: %s", err)

    return mgafiles
    def run_fastqc(self, fns, threads=2):
        """Executes fastqc report generation."""

        assert (len(fns) > 0)

        if len(fns) < threads:
            threads = len(fns)
        cmd = [self.program_name, '-q', '-t', threads, '-o', self.workdir]

        if len(self.program_params) > 0:
            cmd.extend(self.program_params.split())

        cmd.extend(fns)

        cmd = [str(x) for x in cmd]
        LOGGER.info("Running FastQC command: %s", " ".join(cmd))
        call_subprocess(cmd, path=self.path)
    def mask_tandem_repeats(self, fasta):
        '''
    Runs trfBig over the designated fasta file. Should return the
    newly-generated masked fasta file name. Runs quite slowly, so we
    keep the outputs following 2bit conversion.
    '''
        LOGGER.info("Masking tandem repeats for fasta: %s", fasta)
        curdir = os.getcwd()

        # trfBig writes to current working directory a lot.
        os.chdir(self.local_tempdir)
        maskfn = os.path.splitext(fasta)[0] + MASKTAG + '.fa'
        cmd = ['trfBig', fasta, maskfn]
        call_subprocess(cmd,
                        tmpdir=self.local_tempdir,
                        path=os.environ['PATH'])
        os.chdir(curdir)
        return maskfn
    def get_chr_sizes(self, fasta):
        '''
    Runs faSize on a fasta file to generate chr size data.
    '''

        # We keep a cache because we'll be using this more than once.
        if fasta in self._chr_sizes:
            return self._chr_sizes[fasta]
        LOGGER.info("Calculating chr sizes for %s", fasta)
        sizefn = os.path.join(self.local_tempdir,
                              filebasename(fasta) + '.sizes')
        cmd = 'faSize %s -detailed > %s' % (bash_quote(fasta),
                                            bash_quote(sizefn))
        call_subprocess(cmd,
                        tmpdir=self.local_tempdir,
                        path=os.environ['PATH'],
                        shell=True)
        self._tempfiles.append(sizefn)
        self._chr_sizes[fasta] = sizefn
        return sizefn
  def run_analysis(self, fns):

    """Executes cross-correlation report generation."""

    assert(len(fns) == 1)

    basefn = os.path.splitext(fns[0])[0]
    pdf    = "%s_xcor.pdf" % basefn
    out    = "%s_xcor.txt" % basefn

    with NamedTemporaryFile(prefix=basefn, suffix='.bam', dir=self.workdir) as tempbam:
      with NamedTemporaryFile(suffix='.txt', dir=self.workdir) as tempout:

        mdcmd = [ 'picard',
                  'MarkDuplicates',
                  'I=%s' % fns[0],
                  'O=%s' % tempbam.name,
                  'M=%s' % tempout.name,
                  'REMOVE_DUPLICATES=true',
                  'VALIDATION_STRINGENCY=SILENT' ]

        LOGGER.info('Removing bam file duplicate reads prior to cross-correlation analysis')

        call_subprocess(mdcmd, path=self.path)

      # tempout closes and is deleted.
      cmd = [ self.program_name,
              '-c=%s' % tempbam.name,
              '-savp=%s' % pdf,
              '-out=%s' % out ]

      if len(self.program_params) > 0:
        cmd.extend( self.program_params.split() )

      cmd = [ str(x) for x in cmd ]
      LOGGER.debug("Constructed cross-correlation command: %s", " ".join(cmd))
      LOGGER.info("Running Cross-correlation analysis")
      call_subprocess(cmd, path=self.path)

    # tempbam closes and is deleted.
    self.output_files.extend([out, pdf])
 def generate_wig_files(self, beds):
     '''
 Uses makeWiggle to generate wiggle files.
 '''
     wigs = []
     for bed_fn in beds:
         cmd = ('makeWiggle', '-t', '3', bed_fn, splitext(bed_fn)[0])
         LOGGER.debug(" ".join(cmd))
         pout = call_subprocess(cmd, path=self.conf.hostpath)
         for line in pout:
             wigs.append(line.strip())
         pout.close()
     return wigs
    def chain(self, lavs):
        '''
    Chains the lastz output .lav files together.
    '''
        # We keep the filtered chain file.
        gen_from = filebasename(self.from_genome)
        gen_to = filebasename(self.to_genome)
        prechain = os.path.join(self.local_workdir,
                                '%s_vs_%s.pre.chain' % (gen_from, gen_to))
        if os.path.exists(prechain):
            LOGGER.warning(
                "Prechain file already exists." +
                " Assuming we can start from this point: %s", prechain)
            return prechain

        # Convert lavs to appropriately-organised psl files.
        psls = self.process_lavs_to_psl(lavs)

        # FIXME at some point we need to add these psls to self._tempfiles

        # Run the initial chaining.
        LOGGER.info("Running the initial chaining.")
        chaindir = os.path.join(self.local_tempdir, 'chain/')
        os.mkdir(chaindir)
        chains = []
        for psl in psls:
            chfn = os.path.join(chaindir, filebasename(psl) + '.chain')
            cmd = [
                'axtChain', '-psl',
                '-linearGap=%s' % self.linear_gap, psl, '-faQ',
                self.from_genome, '-faT', self.to_genome, chfn
            ]
            call_subprocess(cmd,
                            tmpdir=self.local_tempdir,
                            path=os.environ['PATH'])
            chains.append(chfn)
            self._tempfiles.append(chfn)
        self._tempfiles.append(chaindir)

        # Filter the chained alignments before returning.
        allchain = os.path.join(self.local_tempdir, 'all.chain')
        cmd = ('chainMergeSort -tempDir=%s %s > %s' %
               (bash_quote(self.local_tempdir), " ".join(
                   [bash_quote(x) for x in chains]), bash_quote(allchain)))
        call_subprocess(cmd,
                        tmpdir=self.local_tempdir,
                        path=os.environ['PATH'],
                        shell=True)
        self._tempfiles.append(allchain)

        from_sizes = self.get_chr_sizes(self.from_genome)
        to_sizes = self.get_chr_sizes(self.to_genome)

        # Actually create the prechain file.
        cmd = ['chainPreNet', allchain, from_sizes, to_sizes, prechain]
        call_subprocess(cmd,
                        tmpdir=self.local_tempdir,
                        path=os.environ['PATH'])
        return prechain
def rename_files(files, fromFun):
    for fobj in files:

        old = fromFun(fobj)
        old = "/".join((SERVERDIR, old))
        new = fobj.repository_file_path
        new = "/".join((SERVERDIR, new))

        if old != new:
            d = os.path.dirname(new)

            #      print "Creating directory %s" % (d,)
            cmd = ssh_command(('mkdir', '-p', quote(d)))
            call_subprocess(cmd, shell=True, path=CONFIG.hostpath)

            #      print "Moving %s to %s" % (old, new)
            old = bash_quote(old)
            new = bash_quote(new)
            cmd = ssh_command(('mv', quote(old), quote(new)))
            try:
                call_subprocess(cmd, shell=True, path=CONFIG.hostpath)
            except CalledProcessError, err:
                print "Warning: move failed for file %s: %s" % (old, err)
    def net(self, prechain):
        '''
    Create nets from the chained alignments and convert them to axt
    format. Also generate a liftOver file.
    '''
        from_sizes = self.get_chr_sizes(self.from_genome)
        to_sizes = self.get_chr_sizes(self.to_genome)

        net = os.path.join(self.local_workdir, prechain + '.net')
        cmd = (('chainNet %s -minSpace=1 %s %s stdout /dev/null' +
                ' | netSyntenic stdin %s') %
               (bash_quote(prechain), bash_quote(from_sizes),
                bash_quote(to_sizes), bash_quote(net)))
        # This may fail for spurious reasons (e.g. absence of
        # /proc/self/stat on non-linux machines).
        try:
            LOGGER.info("Running chainNet and netSyntenic on prechain file.")
            call_subprocess(cmd,
                            tmpdir=self.local_tempdir,
                            path=os.environ['PATH'],
                            shell=True)
        except CalledProcessError, err:
            LOGGER.warning("chainNet or netSyntenic raised exception: %s", err)
    def samtools_merge_bams(self, bams, output_fn):
        '''
    Use samtools to merge a set of bams locally.
    '''
        if len(bams) == 0:
            raise ValueError("Zero input bam files for merging.")
        if len(bams) == 1:
            LOGGER.warning(
                "Only one bam file supplied; copying, rather than merging.")
            copy(bams[0], output_fn)
            return

        # We assume our input bam files are appropriately sorted (which,
        # coming from the repository, they should be).
        cmd = ['samtools', 'merge', output_fn] + bams

        LOGGER.info("Using samtools to merge bam files: %s",
                    ", ".join([os.path.basename(bam) for bam in bams]))

        call_subprocess(cmd, path=CONFIG.hostpath)

        if not os.path.exists(output_fn):
            raise StandardError("Merged output file does not exist: %s",
                                output_fn)
def run_picard(libcode, facility, lanenum=None, genome=None):

  bams = Alnfile.objects.filter(alignment__lane__library__code=libcode,
                                alignment__lane__facility__code=facility,
                                filetype__code='bam')
  if lanenum is not None:
    bams = bams.filter(alignment__lane__lanenum=lanenum)
  if genome is not None:
    bams = bams.filter(alignment__genome__code=genome)

  if len(bams) == 0:
    raise StandardError("Unable to find matching bam file in the repository.")

  for bam in bams:
    LOGGER.info("Confirming file checksum: %s", bam.filename)
    oldsum   = checksum_file(bam.repository_file_path, unzip=False)
    if oldsum != bam.checksum:
      raise ValueError(("MD5 checksum of bam file on disk (%s) does not agree"
                        + " with stored repository value (%s): %s")
                       % (oldsum, bam.checksum, bam.filename))
    newbam   = bam.repository_file_path + '.cleaned'
    postproc = BamPostProcessor(input_fn=bam.repository_file_path,
                                output_fn=newbam)

    # Run CleanSam
    LOGGER.info("Running CleanSam...")
    call_subprocess(postproc.clean_sam(), path=CONFIG.hostpath)

    # Run AddOrReplaceReadGroups
    LOGGER.info("Running AddOrReplaceReadGroups...")
    call_subprocess(postproc.add_or_replace_read_groups(), path=CONFIG.hostpath)
    os.unlink(postproc.cleaned_fn)

    # Run FixMateInformation
    LOGGER.info("Running FixMateInformation...")
    call_subprocess(postproc.fix_mate_information(), path=CONFIG.hostpath)
    os.unlink(postproc.rgadded_fn)

    # Quick sanity check on the output
    newcount = count_bam_reads(newbam)

    # FIXME total_reads should be total reads in bam, not in fastq.
    oldcount = bam.alignment.total_reads
    if bam.alignment.lane.paired:
      oldcount = oldcount * 2
    if newcount != oldcount:
      raise ValueError(("Read count in cleaned bam file (%d) does not agree"
                        + " with total_reads in repository (%d): %s")
                       % (newcount, oldcount, newbam))

    # Clean up and replace the old bam file with the new one.
    LOGGER.info("Replacing old bam file with new: %s", bam.repository_file_path)
    replace_repo_file(bam, newbam)
 def generate_bedgraph_files(self, beds):
     '''
 Uses makeWiggle to generate bedgraph files.
 '''
     # make bedgraph file(s)
     LOGGER.info("Creating bedgraph files...")
     bedgraphs = []
     for bed_fn in beds:
         cmd = ('makeWiggle', '-t', '3', '-B', '-1', bed_fn,
                splitext(bed_fn)[0])
         LOGGER.debug(cmd)
         try:  # Occasionally fails for poorer-quality genomes.
             pout = call_subprocess(cmd, path=self.conf.hostpath)
             for line in pout:
                 bedgraphs.append(line.strip())
             pout.close()
         except CalledProcessError, err:
             LOGGER.warn("Unable to create bedGraph file: %s", err)
    def process_lavs_to_psl(self, lavs):
        '''
    Convert .lav files to .psl, swap query and target, and split on
    target.
    '''
        # Convert lav files to psl, concatenate.
        LOGGER.info("Reorganising lav files into psl files.")
        psls = [self.convert_to_psl(x) for x in lavs]
        allpsl = os.path.join(self.local_tempdir, 'all.psl')

        # Concatenate the files. We take this opportunity to strip out the
        # junk we've added to the chromosome names.
        def repl(match):
            '''
      Regex replace function.
      '''
            return "\t%s\t" % match.group(1)

        from_sizes = self.get_chr_sizes_dict(self.from_genome)
        to_sizes = self.get_chr_sizes_dict(self.to_genome)

        # We allow for any pid prefix so we can restart in a new process
        # if needed. Also allow for genome/chrN_trfBig_masked to support
        # fill-in files generated locally.
        genstr = (
            r'(?:%s|%s)' %
            (filebasename(self.from_genome), filebasename(self.to_genome)))
        strip_re = re.compile(r'\t(?:\d+_%s_)?([^\t]*)%s\t' %
                              (genstr, MASKTAG))

        # Keep this regex in sync with the file naming scheme used in split_chrs.
        subchr_re = re.compile(r'^(.*)_\+(\d+)$')
        with open(allpsl, 'wb') as allfh:
            for inp in psls:
                with open(inp, 'rb') as pfh:
                    for line in pfh:

                        # We need to rewrite the chrnames here. Also remove the
                        # trailing newline so it doesn't confuse the processing below.
                        newline = strip_re.sub(repl, line).rstrip('\n')

                        # Parse out sub-chromosome coordinates from filenames and
                        # fix coords appropriately. This is heavily dependent on
                        # the PSL file following the specification.
                        fields = newline.split("\t")
                        if len(fields) > 1:

                            # Sort out the query positions.
                            chrA_match = subchr_re.match(fields[9])
                            if chrA_match:
                                fields[9] = chrA_match.group(1)
                                basecoord = int(chrA_match.group(2))
                                for fnum in (11, 12):
                                    fields[fnum] = str(
                                        int(fields[fnum]) + basecoord)
                                fields[19] = ','.join([
                                    str(int(x) + basecoord)
                                    for x in fields[19].split(',') if x != ''
                                ]) + ','
                                fields[10] = from_sizes[fields[9]]

                            # Sort out the target positions.
                            chrB_match = subchr_re.match(fields[13])
                            if chrB_match:
                                fields[13] = chrB_match.group(1)
                                basecoord = int(chrB_match.group(2))
                                for fnum in (15, 16):
                                    fields[fnum] = str(
                                        int(fields[fnum]) + basecoord)
                                fields[20] = ','.join([
                                    str(int(x) + basecoord)
                                    for x in fields[20].split(',') if x != ''
                                ]) + ','
                                fields[14] = to_sizes[fields[13]]

                            # Quick check on our output. This is essentially cribbed
                            # from the pslToBed code.
                            if (int(fields[11]) >= int(fields[12])
                                    or int(fields[12]) > int(fields[10])
                                    or int(fields[15]) >= int(fields[16])
                                    or int(fields[16]) > int(fields[14])):
                                raise StandardError((
                                    "Mangled PSL format output. Offending input line was in file %s:"
                                    + "\n\n%s\n\nMunged to:\n%s\n\n") %
                                                    (inp, line,
                                                     "\t".join(fields)))

                        newline = "\t".join(fields) + "\n"

                        allfh.write(newline)
                os.unlink(inp)  # Attempt to save some temp space

        # Swap target and source annotation, such that splitting on the
        # target actually splits on the query.
        swppsl = os.path.join(self.local_tempdir, 'all-swap.psl')
        cmd = ['pslSwap', allpsl, swppsl]
        call_subprocess(cmd,
                        tmpdir=self.local_tempdir,
                        path=os.environ['PATH'])
        os.unlink(allpsl)

        # Split psl files by target chromosome.
        psldir = os.path.join(self.local_tempdir, 'psl/')
        os.mkdir(psldir)

        # Consider -lump option for scaffolds FIXME
        cmd = ['pslSplitOnTarget', swppsl, psldir]
        call_subprocess(cmd,
                        tmpdir=self.local_tempdir,
                        path=os.environ['PATH'])
        target_psls = [
            os.path.join(self.local_tempdir, psldir, x)
            for x in os.listdir(psldir)
        ]
        self._tempfiles.extend(target_psls + [psldir])
        os.unlink(swppsl)

        return target_psls
    def ed_get_files_by_fastqfile(self,
                                  file1,
                                  file1_md5,
                                  laneid,
                                  file2=None,
                                  file2_md5=None):
        '''Downloads the given fastq file(s) for a lane, verifying md5 checksums.'''
        # For each fastq file in Edinburgh Genomics, one could also download the
        # *_R1_fastqc.html file, but these are of little use to us and the code
        # for this has been commented out below.

        # file1_fastqc = file1.rstrip('.fastq.gz') + '_fastqc.html'
        # rfiles = [file1, file1_fastqc]
        # rmd5s = [file1_md5, None]
        rfiles = [file1]
        rmd5s = [file1_md5]

        fail_command = 'cs_edinburgh_download.py -a --file1 %s --file1_md5 %s -p %s -l %d' % (
            file1, file1_md5, self.project, laneid)

        if file2 is not None:
            # file2_fastqc = file2.rstrip('.fastq.gz') + '_fastqc.html'
            # rfiles = rfiles + [file2, file2_fastqc]
            # rmd5s = rmd5s + [file2_md5, None]
            rfiles = rfiles + [file2]
            rmd5s = rmd5s + [file2_md5]

            fail_command += " --file2 %s --file2_md5 %s" % (file2, file2_md5)

        failed = False

        # Download files in rfiles
        for rfpath, md5sum in zip(rfiles, rmd5s):
            attempts = 0
            while attempts < self.maxattempts:
                ## If the file to be downloaded is .html, skip downloading the .md5 as there won't be one.
                #if rfpath.endswith('.html'):
                #    res = self.ed_get_file_with_md5(rfpath, md5sum, download_md5=False)
                #else:
                #    res = self.ed_get_file_with_md5(rfpath, md5sum, download_md5=True)
                res = self.ed_get_file_with_md5(rfpath,
                                                md5sum,
                                                download_md5=True)
                if res == 0:
                    break
                else:
                    attempts += 1
                    if attempts == self.maxattempts:
                        LOGGER.error(
                            "Giving up (after %d attempts) on trying to download %s"
                            % (attempts, rfpath))

                        # Set download failed
                        cmd = 'communicateStatus.py --laneid %d --status downloading_failed' % laneid
                        # Record failed command (in case it's of any use)
                        cmd += ' && echo %s >> %s' % (fail_command,
                                                      self.failedcommands)
                        run_command(cmd, shell=True)
                        failed = True
                    else:
                        LOGGER.error("Trying to download %s again." % rfpath)

        # Register files as downloaded
        if not failed:
            # Get flowcell and machine info from read header of fastq file
            # (machine, flowcell, flowlane) = parse_read_header(fqfile1)
            # 5. Set flowcell status 'downloaded' or 'failed download'
            # cmd = 'communicateStatus.py --status complete --flowcell %s --flowlane 0 --library %s --facility EDG' % (flowcell, userid)
            cmd = 'communicateStatus.py --laneid %d --status downloaded' % laneid
            call_subprocess(cmd, shell=True)
class LastzAligner(ClusterJobManager):
    '''
  Class to handle all the steps required for generating an axt-format
  net alignment file using lastz as the aligner.
  '''

    # local_tempdir will need to be able to handle around 75GB when aligning
    # two typical mammalian genomes.

    def __init__(self,
                 from_genome,
                 to_genome,
                 hsp_thresh=3000,
                 length_limit=None,
                 linear_gap='loose',
                 local_tempdir=None,
                 resume=False,
                 *args,
                 **kwargs):

        super(LastzAligner, self).__init__(*args, **kwargs)

        self.from_genome = from_genome
        self.to_genome = to_genome
        self.hsp_thresh = hsp_thresh
        self.length_limit = length_limit
        self.linear_gap = linear_gap

        # Flag used to tell the object to fill in missing lav files by
        # resubmitting to the cluster, rather than just working with
        # what's available.
        self.resume = resume

        systempdir = gettempdir() if local_tempdir is None else local_tempdir
        self.local_tempdir = os.path.join(systempdir, str(os.getpid()))
        os.mkdir(self.local_tempdir)  # Fails on pre-existing directory.

        self._tempfiles = []
        self._chr_sizes = {}

    def split_chrs(self, fasta, dryrun=False):
        '''
    Split a designated fasta file by chromosome. Returns a list of the
    generated fasta files. Any chromosome whose sequence exceeds
    self.length_limit will be split appropriately. Calling with
    dryrun=True returns a list of files which would have been created;
    this may be useful when deciding on an appropriate length_limit
    parameter.
    '''
        LOGGER.info("Splitting fasta by chromosome: %s", fasta)
        # N.B. the trailing '/' is important here:
        wdir = os.path.join(self.local_tempdir,
                            '%s_chr_split/' % filebasename(fasta))
        if not dryrun:
            os.mkdir(wdir)  # Fails on pre-existing directory.
        self._tempfiles.append(wdir)

        outfiles = []
        handle = open(fasta, 'rU')
        for chromosome in SeqIO.parse(handle, 'fasta'):

            # Check whether we need to split the chromosome.
            seqlen = len(chromosome.seq)
            if self.length_limit and seqlen > self.length_limit:

                # Figure out how many chunks we need.
                denom = 2
                while (float(seqlen) / denom) > self.length_limit:
                    denom += 1

                # Output the sequences
                for segnum in range(denom):
                    start = (segnum * (seqlen / denom)) + 1
                    end = min(seqlen, (segnum + 1) * (seqlen / denom))

                    # This filename format will be parsed later, in
                    # process_lavs_to_psl. The filename coordinate needs to be
                    # added to the output psl coords.
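                    # For example (illustrative values only): a chr1 segment
                    # starting at base 50,000,001 gets the id "chr1_+50000000",
                    # and process_lavs_to_psl later strips the "_+50000000"
                    # suffix and adds that offset back onto the psl coordinates.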
                    new_id = "%s_+%d" % (chromosome.id, start - 1)
                    chrfile = os.path.join(wdir, "%s.fa" % new_id)
                    chrseg = chromosome[start - 1:end]
                    chrseg.id = new_id
                    if not dryrun:
                        with open(chrfile, 'w') as chrfh:
                            SeqIO.write([chrseg], chrfh, 'fasta')
                    outfiles.append(chrfile)

            else:

                # If chromosome is small enough, just dump it out in a single file.
                chrfile = os.path.join(wdir, "%s.fa" % chromosome.id)
                if not dryrun:
                    with open(chrfile, 'w') as chrfh:
                        SeqIO.write([chromosome], chrfh, 'fasta')
                outfiles.append(chrfile)

        return outfiles

    def mask_tandem_repeats(self, fasta):
        '''
    Runs trfBig over the designated fasta file. Should return the
    newly-generated masked fasta file name. Runs quite slowly, so we
    keep the outputs following 2bit conversion.
    '''
        LOGGER.info("Masking tandem repeats for fasta: %s", fasta)
        curdir = os.getcwd()

        # trfBig writes to current working directory a lot.
        os.chdir(self.local_tempdir)
        maskfn = os.path.splitext(fasta)[0] + MASKTAG + '.fa'
        cmd = ['trfBig', fasta, maskfn]
        call_subprocess(cmd,
                        tmpdir=self.local_tempdir,
                        path=os.environ['PATH'])
        os.chdir(curdir)
        return maskfn

    def convert_to_2bit(self, fasta, workdir=None):
        '''
    Runs faToTwoBit on the designated fasta file; returns the name of
    the output 2bit file.
    '''
        LOGGER.info("Converting fasta to 2bit: %s", fasta)
        if workdir is None:
            workdir = self.local_workdir

        twobitfn = os.path.join(workdir, filebasename(fasta) + '.2bit')
        cmd = ['faToTwoBit', '-noMask', fasta, twobitfn]
        call_subprocess(cmd,
                        tmpdir=self.local_tempdir,
                        path=os.environ['PATH'])
        return twobitfn

    def make_cluster_filename(self, localfile):
        '''
    Generate a unique filename to be used on the cluster, without
    unnecessarily divulging local file paths.
    '''
        pathbits = os.path.split(localfile)
        hasher = md5()
        hasher.update(pathbits[0])
        clusterfile = "%d_%s_%s" % (os.getpid(), hasher.hexdigest(),
                                    pathbits[1])
        return clusterfile

    def align(self, from_list, to_list, omit_list=None):
        '''
    Actually run the alignment. Requires lastz on the cluster, and of
    course bsub/LSF et al. Note that we will generate one lastz
    process for each chr-chr combination, so this should lend itself
    well to a clustered solution.
    '''
        # Just in case, for convenience.
        if type(from_list) in (str, unicode):
            from_list = [from_list]
        if type(to_list) in (str, unicode):
            to_list = [to_list]
        if omit_list is None:
            omit_list = []

        # Make sure the filenames on the cluster won't easily collide.
        cluster_from = [self.make_cluster_filename(x) for x in from_list]
        cluster_to = [self.make_cluster_filename(x) for x in to_list]

        # Note that we copy all the files to the cluster even if we only
        # want to repeat a handful of alignments; managing the files is
        # simpler that way.
        LOGGER.info("Copying files to cluster server.")
        self.submitter.remote_copy_files(filenames=from_list + to_list,
                                         destnames=cluster_from + cluster_to)

        job_ids = []
        lavfiles = []
        for from_num in range(len(from_list)):
            for to_num in range(len(to_list)):

                # Files on localhost
                from_file = from_list[from_num]
                to_file = to_list[to_num]

                # Files on the cluster
                from_clust = cluster_from[from_num]
                to_clust = cluster_to[to_num]

                outfile = "%s_%s.lav" % (filebasename(from_file),
                                         filebasename(to_file))

                if outfile in omit_list:
                    LOGGER.warning("Skipping pre-existing lav file %s...",
                                   outfile)
                    lavfiles.append(outfile)
                    continue

                ## FIXME consider the --inner option here (ensembl-compara
                ## appears to use --inner=2200).
                LOGGER.info("Launching alignment (%s : %s).", from_file,
                            to_file)
                clusterout = "%d_%s" % (os.getpid(), outfile)

                ## We use this file to monitor lastz completion, to
                ## disambiguate lastz failure from scp failure. FIXME if this
                ## turns out to be scp failure we can add a final re-try to
                ## the monitor job.
                clusterdone = clusterout + '.done'

                ## Note that using --chain here appears to be undesirable
                ## since the lastz chaining implementation is rather too
                ## simplistic for our purposes (see lastz docs).
                cmd = [
                    'lastz',
                    to_clust,
                    from_clust,  # This is the correct order.
                    '--format=lav',
                    '--hspthresh=%d' % self.hsp_thresh,
                    '--output=%s' % clusterout
                ]

                sshcmd = self.return_file_to_localhost(clusterout,
                                                       outfile,
                                                       execute=False)
                LOGGER.debug(sshcmd)
                cmd = " ".join(cmd) + (
                    ' && touch %s && %s && rm %s %s' %
                    (clusterdone, sshcmd, clusterout, clusterdone))

                # 4GB is the default max mem for lastz. Setting mem=4000 means
                # some larger alignments fail silently; using 5000 seems much
                # more robust on our cluster.
                job_ids.append(
                    self.submitter.submit_command(cmd=cmd,
                                                  mem=self.memsize * 1024,
                                                  auto_requeue=False,
                                                  time_limit=self.time_limit))
                lavfiles.append(outfile)

                # Reduce the rate of cluster job submission, if desired.
                sleep(self.throttle)

        # Caller code tends to assume these paths are absolute.
        lavfiles = [os.path.join(self.local_workdir, x) for x in lavfiles]

        return (job_ids, lavfiles, cluster_from + cluster_to)

    def convert_to_psl(self, lav):
        '''
    Converts an input lav file to a temporary psl file.
    '''
        pslfn = os.path.join(self.local_tempdir, filebasename(lav) + '.psl')
        cmd = ['lavToPsl', lav, pslfn]
        call_subprocess(cmd,
                        tmpdir=self.local_tempdir,
                        path=os.environ['PATH'])
        return pslfn  # Delete this file in the caller code.

    def process_lavs_to_psl(self, lavs):
        '''
    Convert .lav files to .psl, swap query and target, and split on
    target.
    '''
        # Convert lav files to psl, concatenate.
        LOGGER.info("Reorganising lav files into psl files.")
        psls = [self.convert_to_psl(x) for x in lavs]
        allpsl = os.path.join(self.local_tempdir, 'all.psl')

        # Concatenate the files. We take this opportunity to strip out the
        # junk we've added to the chromosome names.
        def repl(match):
            '''
      Regex replace function.
      '''
            return "\t%s\t" % match.group(1)

        from_sizes = self.get_chr_sizes_dict(self.from_genome)
        to_sizes = self.get_chr_sizes_dict(self.to_genome)

        # We allow for any pid prefix so we can restart in a new process
        # if needed. Also allow for genome/chrN_trfBig_masked to support
        # fill-in files generated locally.
        genstr = (
            r'(?:%s|%s)' %
            (filebasename(self.from_genome), filebasename(self.to_genome)))
        strip_re = re.compile(r'\t(?:\d+_%s_)?([^\t]*)%s\t' %
                              (genstr, MASKTAG))

        # Keep this regex in sync with the file naming scheme used in split_chrs.
        subchr_re = re.compile(r'^(.*)_\+(\d+)$')
        with open(allpsl, 'wb') as allfh:
            for inp in psls:
                with open(inp, 'rb') as pfh:
                    for line in pfh:

                        # We need to rewrite the chrnames here. Also remove the
                        # trailing newline so it doesn't confuse the processing below.
                        newline = strip_re.sub(repl, line).rstrip('\n')

                        # Parse out sub-chromosome coordinates from filenames and
                        # fix coords appropriately. This is heavily dependent on
                        # the PSL file following the specification.
                        fields = newline.split("\t")
                        if len(fields) > 1:

                            # Sort out the query positions.
                            chrA_match = subchr_re.match(fields[9])
                            if chrA_match:
                                fields[9] = chrA_match.group(1)
                                basecoord = int(chrA_match.group(2))
                                for fnum in (11, 12):
                                    fields[fnum] = str(
                                        int(fields[fnum]) + basecoord)
                                fields[19] = ','.join([
                                    str(int(x) + basecoord)
                                    for x in fields[19].split(',') if x != ''
                                ]) + ','
                                fields[10] = from_sizes[fields[9]]

                            # Sort out the target positions.
                            chrB_match = subchr_re.match(fields[13])
                            if chrB_match:
                                fields[13] = chrB_match.group(1)
                                basecoord = int(chrB_match.group(2))
                                for fnum in (15, 16):
                                    fields[fnum] = str(
                                        int(fields[fnum]) + basecoord)
                                fields[20] = ','.join([
                                    str(int(x) + basecoord)
                                    for x in fields[20].split(',') if x != ''
                                ]) + ','
                                fields[14] = to_sizes[fields[13]]

                            # Quick check on our output. This is essentially cribbed
                            # from the pslToBed code.
                            if (int(fields[11]) >= int(fields[12])
                                    or int(fields[12]) > int(fields[10])
                                    or int(fields[15]) >= int(fields[16])
                                    or int(fields[16]) > int(fields[14])):
                                raise StandardError((
                                    "Mangled PSL format output. Offending input line was in file %s:"
                                    + "\n\n%s\n\nMunged to:\n%s\n\n") %
                                                    (inp, line,
                                                     "\t".join(fields)))

                        newline = "\t".join(fields) + "\n"

                        allfh.write(newline)
                os.unlink(inp)  # Attempt to save some temp space

        # Swap target and source annotation, such that splitting on the
        # target actually splits on the query.
        swppsl = os.path.join(self.local_tempdir, 'all-swap.psl')
        cmd = ['pslSwap', allpsl, swppsl]
        call_subprocess(cmd,
                        tmpdir=self.local_tempdir,
                        path=os.environ['PATH'])
        os.unlink(allpsl)

        # Split psl files by target chromosome.
        psldir = os.path.join(self.local_tempdir, 'psl/')
        os.mkdir(psldir)

        # Consider -lump option for scaffolds FIXME
        cmd = ['pslSplitOnTarget', swppsl, psldir]
        call_subprocess(cmd,
                        tmpdir=self.local_tempdir,
                        path=os.environ['PATH'])
        target_psls = [
            os.path.join(self.local_tempdir, psldir, x)
            for x in os.listdir(psldir)
        ]
        self._tempfiles.extend(target_psls + [psldir])
        os.unlink(swppsl)

        return target_psls

    def get_chr_sizes(self, fasta):
        '''
    Runs faSize on a fasta file to generate chr size data.
    '''

        # We keep a cache because we'll be using this more than once.
        if fasta in self._chr_sizes:
            return self._chr_sizes[fasta]
        LOGGER.info("Calculating chr sizes for %s", fasta)
        sizefn = os.path.join(self.local_tempdir,
                              filebasename(fasta) + '.sizes')
        cmd = 'faSize %s -detailed > %s' % (bash_quote(fasta),
                                            bash_quote(sizefn))
        call_subprocess(cmd,
                        tmpdir=self.local_tempdir,
                        path=os.environ['PATH'],
                        shell=True)
        self._tempfiles.append(sizefn)
        self._chr_sizes[fasta] = sizefn
        return sizefn

    def get_chr_sizes_dict(self, fasta):
        '''
    As for get_chr_sizes, but also parses the file and returns a dict
    for convenience.
    '''
        sizefn = self.get_chr_sizes(fasta)
        sizes = dict()
        with open(sizefn, 'r') as sizefh:
            for row in sizefh:
                (chrom, size) = [x.strip() for x in row.split()]
                sizes[chrom] = size
        return sizes

    def chain(self, lavs):
        '''
    Chains the lastz output .lav files together.
    '''
        # We keep the filtered chain file.
        gen_from = filebasename(self.from_genome)
        gen_to = filebasename(self.to_genome)
        prechain = os.path.join(self.local_workdir,
                                '%s_vs_%s.pre.chain' % (gen_from, gen_to))
        if os.path.exists(prechain):
            LOGGER.warning(
                "Prechain file already exists." +
                " Assuming we can start from this point: %s", prechain)
            return prechain

        # Convert lavs to appropriately-organised psl files.
        psls = self.process_lavs_to_psl(lavs)

        # FIXME at some point we need to add these psls to self._tempfiles

        # Run the initial chaining.
        LOGGER.info("Running the initial chaining.")
        chaindir = os.path.join(self.local_tempdir, 'chain/')
        os.mkdir(chaindir)
        chains = []
        for psl in psls:
            chfn = os.path.join(chaindir, filebasename(psl) + '.chain')
            cmd = [
                'axtChain', '-psl',
                '-linearGap=%s' % self.linear_gap, psl, '-faQ',
                self.from_genome, '-faT', self.to_genome, chfn
            ]
            call_subprocess(cmd,
                            tmpdir=self.local_tempdir,
                            path=os.environ['PATH'])
            chains.append(chfn)
            self._tempfiles.append(chfn)
        self._tempfiles.append(chaindir)

        # Filter the chained alignments before returning.
        allchain = os.path.join(self.local_tempdir, 'all.chain')
        cmd = ('chainMergeSort -tempDir=%s %s > %s' %
               (bash_quote(self.local_tempdir), " ".join(
                   [bash_quote(x) for x in chains]), bash_quote(allchain)))
        call_subprocess(cmd,
                        tmpdir=self.local_tempdir,
                        path=os.environ['PATH'],
                        shell=True)
        self._tempfiles.append(allchain)

        from_sizes = self.get_chr_sizes(self.from_genome)
        to_sizes = self.get_chr_sizes(self.to_genome)

        # Actually create the prechain file.
        cmd = ['chainPreNet', allchain, from_sizes, to_sizes, prechain]
        call_subprocess(cmd,
                        tmpdir=self.local_tempdir,
                        path=os.environ['PATH'])
        return prechain

    def get_2bit(self, fasta):
        '''
    Simply generate a temporary 2bit file from the specified fasta
    file. Note the differences between this and convert_to_2bit. FIXME
    refactor so there's only one of these functions.
    '''
        outfn = os.path.join(self.local_tempdir, filebasename(fasta) + '.2bit')
        if os.path.exists(outfn):
            return outfn
        LOGGER.info("Generating 2bit file for %s", fasta)
        cmd = ['faToTwoBit', fasta, outfn]
        call_subprocess(cmd,
                        tmpdir=self.local_tempdir,
                        path=os.environ['PATH'])
        self._tempfiles.append(outfn)
        return outfn

    def net(self, prechain):
        '''
    Create nets from the chained alignments and convert them to axt
    format. Also generate a liftOver file.
    '''
        from_sizes = self.get_chr_sizes(self.from_genome)
        to_sizes = self.get_chr_sizes(self.to_genome)

        net = os.path.join(self.local_workdir, prechain + '.net')
        cmd = (('chainNet %s -minSpace=1 %s %s stdout /dev/null' +
                ' | netSyntenic stdin %s') %
               (bash_quote(prechain), bash_quote(from_sizes),
                bash_quote(to_sizes), bash_quote(net)))
        # This may fail for spurious reasons (e.g. absence of
        # /proc/self/stat on non-linux machines).
        try:
            LOGGER.info("Running chainNet and netSyntenic on prechain file.")
            call_subprocess(cmd,
                            tmpdir=self.local_tempdir,
                            path=os.environ['PATH'],
                            shell=True)
        except CalledProcessError, err:
            LOGGER.warning("chainNet or netSyntenic raised exception: %s", err)
        if not os.path.exists(net):
            raise StandardError(
                "chainNet/netSyntenic failed to create output net file %s" %
                net)

        axt = os.path.join(
            self.local_workdir, "%s.%s.net.axt" %
            (filebasename(self.from_genome), filebasename(self.to_genome)))
        from_2bit = self.get_2bit(self.from_genome)
        to_2bit = self.get_2bit(self.to_genome)
        LOGGER.info('Converting to axt format.')
        cmd = ('netToAxt %s %s %s %s stdout | axtSort stdin %s' %
               (bash_quote(net), bash_quote(prechain), bash_quote(from_2bit),
                bash_quote(to_2bit), bash_quote(axt)))
        call_subprocess(cmd,
                        tmpdir=self.local_tempdir,
                        path=os.environ['PATH'],
                        shell=True)

        # These are cheap to generate and store, but potentially very useful later.
        LOGGER.info('Creating liftOver file.')
        liftover = os.path.join(self.local_workdir, prechain + '.liftOver')
        cmd = ('netChainSubset', net, prechain, liftover)
        call_subprocess(cmd,
                        tmpdir=self.local_tempdir,
                        path=os.environ['PATH'])

        return axt
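
# A rough end-to-end sketch of how LastzAligner might be driven (hypothetical
# orchestration; constructor keyword arguments such as local_workdir, memsize,
# time_limit and throttle belong to the ClusterJobManager base class and are
# assumptions here). Each genome fasta is split and repeat-masked, the chunks
# are aligned on the cluster with lastz, and the resulting lav files are
# chained and netted into an axt alignment:
#
#   aligner = LastzAligner(from_genome='mm10.fa', to_genome='hg38.fa',
#                          length_limit=300000000, local_workdir='/data/lastz')
#   from_fas = [aligner.mask_tandem_repeats(fa)
#               for fa in aligner.split_chrs('mm10.fa')]
#   to_fas   = [aligner.mask_tandem_repeats(fa)
#               for fa in aligner.split_chrs('hg38.fa')]
#   (job_ids, lavs, _) = aligner.align(from_fas, to_fas)
#   # ... wait for the cluster jobs in job_ids to complete ...
#   prechain = aligner.chain(lavs)
#   axt_file = aligner.net(prechain)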