def _create_archive_dir_on_host(fobj):
    '''
    Create a folder on a foreign host over ssh.
    '''
    arch = fobj.archive
    if arch is None:
        raise ValueError(
            "Attempting to transfer file to null archive location.")
    folder = bash_quote(os.path.join(arch.host_path, fobj.libcode))

    # Popen is invoked without shell=True, so only the remote server's
    # shell interprets the command; folder therefore carries a single
    # level of quoting (applied above).
    cmd = ['ssh']
    if arch.host_port is not None:
        cmd += ['-p', str(arch.host_port)]
    if arch.host_user is not None:
        cmd += ['%s@%s' % (arch.host_user, arch.host)]
    else:
        cmd += [arch.host]
    cmd += ['mkdir', folder, '&&', 'chmod', '750', folder]

    subproc = Popen(cmd, stdout=PIPE, stderr=PIPE)
    (stdout, stderr) = subproc.communicate()
    retcode = subproc.wait()
    if retcode != 0:
        if 'File exists' in stderr:
            LOGGER.info("Directory %s@%s:%s already exists.", arch.host_user,
                        arch.host, folder)
        else:
            raise StandardError(
                "ERROR. Failed to create directory in archive"
                + (" (cmd=\"%s\").\nSTDOUT: %s\nSTDERR: %s\n"
                   % (" ".join(cmd), stdout, stderr)))
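
# NOTE: the following method presumably belongs to the ClusterJobManager base
# class (not shown in full here); LastzAligner.align below calls it via self.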
    def return_file_to_localhost(self,
                                 clusterout,
                                 outfile,
                                 execute=True,
                                 donefile=False):
        '''
        If execute is False, returns a command string that can be used to
        transfer a cluster output file back to our local working
        directory. If execute is True, the command will also be run on the
        cluster.
        '''
        myhost = getfqdn()
        myuser = getuser()
        sshcmd = "scp"

        # Transferring the files back to localhost requires an appropriate
        # passwordless ssh key to be given access on our localhost. The
        # alternative is some horrendous pexpect hack which is only a
        # little more secure (see: sshSangerTunnel.py).
        if self.ssh_key is not None:
            sshcmd += " -i %s" % self.ssh_key

        # Note that we need quoting of e.g. file paths containing
        # spaces. Also, the initial './' allows filenames to contain
        # colons.
        if not os.path.isabs(clusterout):
            clusterout = './%s' % (clusterout, )
        sshcmd += (
            r' %s %s@%s:\"' % (bash_quote(clusterout), myuser, myhost) +
            bash_quote(bash_quote(self.local_workdir + r'/%s' % outfile)) +
            r'\"')

        if donefile:
            sshcmd += " && ssh"
            if self.ssh_key is not None:
                sshcmd += " -i %s" % self.ssh_key
            sshcmd += (r' %s@%s touch ' % (myuser, myhost) + bash_quote(
                bash_quote(self.local_workdir + r'/%s.done' % outfile)))

        if execute is True:
            # This *should* die on failure.
            self.runner.run_command(sshcmd)

        return sshcmd
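
    # Hypothetical usage sketch: build (but do not run) the command with
    # which a cluster job can push its output back to the submitting host.
    #
    #   sshcmd = self.return_file_to_localhost('12345_chr1_chr2.lav',
    #                                          'chr1_chr2.lav', execute=False)
    #   # -> scp -i <key> ./12345_chr1_chr2.lav user@fqdn:\"<workdir>/chr1_chr2.lav\"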
def rename_files(files, fromFun):
    for fobj in files:

        old = fromFun(fobj)
        old = "/".join((SERVERDIR, old))
        new = fobj.repository_file_path
        new = "/".join((SERVERDIR, new))

        if old != new:
            d = os.path.dirname(new)

            # print "Creating directory %s" % (d,)
            cmd = ssh_command(('mkdir', '-p', quote(d)))
            call_subprocess(cmd, shell=True, path=CONFIG.hostpath)

            # print "Moving %s to %s" % (old, new)
            old = bash_quote(old)
            new = bash_quote(new)
            cmd = ssh_command(('mv', quote(old), quote(new)))
            try:
                call_subprocess(cmd, shell=True, path=CONFIG.hostpath)
            except CalledProcessError, err:
                print "Warning: move failed for file %s: %s" % (old, err)
def _archive_file_via_scp(fobj, attempts=1, sleeptime=2):
    '''
    A wrapper for scp allowing multiple attempts at the transfer in case
    of recoverable error.
    '''
    unrecoverable = [
        'No such file or directory',
        'Failed to add the host to the list of known hosts',
        'Operation not permitted'
    ]

    arch = fobj.archive
    if arch is None:
        raise ValueError(
            "Attempting to transfer file to null archive location.")

    # NOTE: We may still need to double-quote spaces in the destination
    # passed to scp. Double-quoting brackets ([]) does not work, though.
    host_archdir = os.path.join(arch.host_path, fobj.libcode)
    dest = os.path.join(host_archdir,
                        os.path.basename(fobj.repository_file_path))

    cmd = 'scp -p -o StrictHostKeyChecking=no'
    if arch.host_port is not None:
        cmd += ' -P %s' % str(arch.host_port)

    # Assume we're copying from the main repository to the archive.
    # Note that we need quoting of e.g. file paths containing
    # spaces.
    cmd += ' %s' % bash_quote(fobj.original_repository_file_path)

    # Double-quote the destination, as it has to get past (a) our local
    # bash, and (b) the bash on the destination machine.
    if arch.host_user is not None:
        cmd += ' %s@%s:%s' % (arch.host_user, arch.host,
                              bash_quote(bash_quote(dest)))
    else:
        cmd += ' %s:%s' % (arch.host, bash_quote(bash_quote(dest)))

    start_time = time.time()

    while attempts > 0:
        subproc = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
        (stdout, stderr) = subproc.communicate()
        retcode = subproc.wait()
        if stdout is not None:
            sys.stdout.write(stdout)
        if stderr is not None:
            sys.stderr.write(stderr)
        if retcode != 0:
            for mesg in unrecoverable:
                if mesg in stderr:
                    LOGGER.error(stderr)
                    attempts = 0
                    break
            attempts -= 1
            if attempts <= 0:
                break
            LOGGER.warning(
                'Transfer failed with error: %s\nTrying again (max %d times)',
                stderr, attempts)
            time.sleep(sleeptime)
        else:
            break

    if retcode != 0:
        # Note that cmd is already a string here.
        raise StandardError(
            "ERROR. Failed to transfer file. Command was:\n   %s\n" % cmd)

    time_diff = time.time() - start_time
    LOGGER.info("Copying to archive (scp) completed in %d seconds.", time_diff)
class LastzAligner(ClusterJobManager):
    '''
    Class to handle all the steps required for generating an axt-format
    net alignment file using lastz as the aligner.
    '''

    # local_tempdir will need to be able to handle around 75GB when aligning
    # two typical mammalian genomes.

    def __init__(self,
                 from_genome,
                 to_genome,
                 hsp_thresh=3000,
                 length_limit=None,
                 linear_gap='loose',
                 local_tempdir=None,
                 resume=False,
                 *args,
                 **kwargs):

        super(LastzAligner, self).__init__(*args, **kwargs)

        self.from_genome = from_genome
        self.to_genome = to_genome
        self.hsp_thresh = hsp_thresh
        self.length_limit = length_limit
        self.linear_gap = linear_gap

        # Flag used to tell the object to fill in missing lav files by
        # resubmitting to the cluster, rather than just working with
        # what's available.
        self.resume = resume

        systempdir = gettempdir() if local_tempdir is None else local_tempdir
        self.local_tempdir = os.path.join(systempdir, str(os.getpid()))
        os.mkdir(self.local_tempdir)  # Fails on pre-existing directory.

        self._tempfiles = []
        self._chr_sizes = {}

    def split_chrs(self, fasta, dryrun=False):
        '''
        Split a designated fasta file by chromosome. Returns a list of the
        generated fasta files. Any chromosome whose sequence exceeds
        self.length_limit will be split appropriately. Calling with
        dryrun=True returns a list of files which would have been created;
        this may be useful when deciding on an appropriate length_limit
        parameter.
        '''
        LOGGER.info("Splitting fasta by chromosome: %s", fasta)
        # N.B. the trailing '/' is important here:
        wdir = os.path.join(self.local_tempdir,
                            '%s_chr_split/' % filebasename(fasta))
        if not dryrun:
            os.mkdir(wdir)  # Fails on pre-existing directory.
        self._tempfiles.append(wdir)

        outfiles = []
        handle = open(fasta, 'rU')
        for chromosome in SeqIO.parse(handle, 'fasta'):

            # Check whether we need to split the chromosome.
            seqlen = len(chromosome.seq)
            if self.length_limit and seqlen > self.length_limit:

                # Figure out how many chunks we need.
                denom = 2
                while (float(seqlen) / denom) > self.length_limit:
                    denom += 1

                # Output the sequences. The final segment runs to the end of
                # the chromosome, since integer division would otherwise
                # silently drop the trailing bases.
                for segnum in range(denom):
                    start = (segnum * (seqlen / denom)) + 1
                    if segnum == denom - 1:
                        end = seqlen
                    else:
                        end = (segnum + 1) * (seqlen / denom)

                    # This filename format will be parsed later, in
                    # process_lavs_to_psl. The filename coordinate needs to be
                    # added to the output psl coords.
                    new_id = "%s_+%d" % (chromosome.id, start - 1)
                    chrfile = os.path.join(wdir, "%s.fa" % new_id)
                    chrseg = chromosome[start - 1:end]
                    chrseg.id = new_id
                    if not dryrun:
                        with open(chrfile, 'w') as chrfh:
                            SeqIO.write([chrseg], chrfh, 'fasta')
                    outfiles.append(chrfile)

            else:

                # If the chromosome is small enough, just dump it out in a single file.
                chrfile = os.path.join(wdir, "%s.fa" % chromosome.id)
                if not dryrun:
                    with open(chrfile, 'w') as chrfh:
                        SeqIO.write([chromosome], chrfh, 'fasta')
                outfiles.append(chrfile)

        return outfiles
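
    # Worked example of the chunking arithmetic above (hypothetical numbers):
    # with seqlen=250000000 and length_limit=100000000, denom steps from 2 to
    # 3, since 250000000/2 exceeds the limit but 250000000/3 does not. Integer
    # division gives a chunk size of 83333333, so chr1 would be emitted as
    # chr1_+0.fa, chr1_+83333333.fa and chr1_+166666666.fa; the '_+<offset>'
    # suffix is parsed back out in process_lavs_to_psl to rebase PSL coords.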

    def mask_tandem_repeats(self, fasta):
        '''
        Runs trfBig over the designated fasta file and returns the
        newly-generated masked fasta file name. Runs quite slowly, so we
        keep the outputs following 2bit conversion.
        '''
        LOGGER.info("Masking tandem repeats for fasta: %s", fasta)
        curdir = os.getcwd()

        # trfBig writes to current working directory a lot.
        os.chdir(self.local_tempdir)
        maskfn = os.path.splitext(fasta)[0] + MASKTAG + '.fa'
        cmd = ['trfBig', fasta, maskfn]
        call_subprocess(cmd,
                        tmpdir=self.local_tempdir,
                        path=os.environ['PATH'])
        os.chdir(curdir)
        return maskfn

    def convert_to_2bit(self, fasta, workdir=None):
        '''
        Runs faToTwoBit on the designated fasta file; returns the name of
        the output 2bit file.
        '''
        LOGGER.info("Converting fasta to 2bit: %s", fasta)
        if workdir is None:
            workdir = self.local_workdir

        twobitfn = os.path.join(workdir, filebasename(fasta) + '.2bit')
        cmd = ['faToTwoBit', '-noMask', fasta, twobitfn]
        call_subprocess(cmd,
                        tmpdir=self.local_tempdir,
                        path=os.environ['PATH'])
        return twobitfn

    def make_cluster_filename(self, localfile):
        '''
        Generate a unique filename to be used on the cluster, without
        unnecessarily divulging local file paths.
        '''
        pathbits = os.path.split(localfile)
        hasher = md5()
        hasher.update(pathbits[0])
        clusterfile = "%d_%s_%s" % (os.getpid(), hasher.hexdigest(),
                                    pathbits[1])
        return clusterfile
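
    # Example (hypothetical values): for localfile '/home/user/genomes/mm10.fa'
    # in process 4242, this returns '4242_<md5-of-dirname>_mm10.fa'. Hashing
    # the directory component keeps distinct local paths from colliding on the
    # cluster without exposing the paths themselves.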

    def align(self, from_list, to_list, omit_list=None):
        '''
        Actually run the alignment. Requires lastz on the cluster, and of
        course bsub/LSF et al. Note that we will generate one lastz
        process for each chr-chr combination, so this should lend itself
        well to a clustered solution.
        '''
        # Just in case, for convenience.
        if isinstance(from_list, basestring):
            from_list = [from_list]
        if isinstance(to_list, basestring):
            to_list = [to_list]
        if omit_list is None:
            omit_list = []

        # Make sure the filenames on the cluster won't easily collide.
        cluster_from = [self.make_cluster_filename(x) for x in from_list]
        cluster_to = [self.make_cluster_filename(x) for x in to_list]

        # Note that we copy all the files to the cluster even if we only
        # want to repeat a handful of alignments; managing the files is
        # simpler that way.
        LOGGER.info("Copying files to cluster server.")
        self.submitter.remote_copy_files(filenames=from_list + to_list,
                                         destnames=cluster_from + cluster_to)

        job_ids = []
        lavfiles = []
        for from_num in range(len(from_list)):
            for to_num in range(len(to_list)):

                # Files on localhost
                from_file = from_list[from_num]
                to_file = to_list[to_num]

                # Files on the cluster
                from_clust = cluster_from[from_num]
                to_clust = cluster_to[to_num]

                outfile = "%s_%s.lav" % (filebasename(from_file),
                                         filebasename(to_file))

                if outfile in omit_list:
                    LOGGER.warning("Skipping pre-existing lav file %s...",
                                   outfile)
                    lavfiles.append(outfile)
                    continue

                ## FIXME consider the --inner option here (ensembl-compara
                ## appears to use --inner=2200).
                LOGGER.info("Launching alignment (%s : %s).", from_file,
                            to_file)
                clusterout = "%d_%s" % (os.getpid(), outfile)

                ## We use this file to monitor lastz completion, to
                ## disambiguate lastz failure from scp failure. FIXME if this
                ## turns out to be scp failure we can add a final re-try to
                ## the monitor job.
                clusterdone = clusterout + '.done'

                ## Note that using --chain here appears to be undesirable
                ## since the lastz chaining implementation is rather too
                ## simplistic for our purposes (see lastz docs).
                cmd = [
                    'lastz',
                    to_clust,
                    from_clust,  # This is the correct order.
                    '--format=lav',
                    '--hspthresh=%d' % self.hsp_thresh,
                    '--output=%s' % clusterout
                ]

                sshcmd = self.return_file_to_localhost(clusterout,
                                                       outfile,
                                                       execute=False)
                LOGGER.debug(sshcmd)
                cmd = " ".join(cmd) + (
                    ' && touch %s && %s && rm %s %s' %
                    (clusterdone, sshcmd, clusterout, clusterdone))

                # 4GB is the default max mem for lastz. Setting mem=4000 means
                # some larger alignments fail silently; using 5000 seems much
                # more robust on our cluster.
                job_ids.append(
                    self.submitter.submit_command(cmd=cmd,
                                                  mem=self.memsize * 1024,
                                                  auto_requeue=False,
                                                  time_limit=self.time_limit))
                lavfiles.append(outfile)

                # Reduce the rate of cluster job submission, if desired.
                sleep(self.throttle)

        # Caller code tends to assume these paths are absolute.
        lavfiles = [os.path.join(self.local_workdir, x) for x in lavfiles]

        return (job_ids, lavfiles, cluster_from + cluster_to)
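
    # The composed cluster command therefore looks like this (paths
    # hypothetical):
    #
    #   lastz <to.fa> <from.fa> --format=lav --hspthresh=3000 \
    #       --output=<pid>_<pair>.lav && touch <pid>_<pair>.lav.done \
    #       && scp ... && rm <pid>_<pair>.lav <pid>_<pair>.lav.done
    #
    # The .done file thus distinguishes a lastz failure (no .done created)
    # from a failed copy back to localhost (.done present, output missing
    # locally).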

    def convert_to_psl(self, lav):
        '''
        Converts an input lav file to a temporary psl file.
        '''
        pslfn = os.path.join(self.local_tempdir, filebasename(lav) + '.psl')
        cmd = ['lavToPsl', lav, pslfn]
        call_subprocess(cmd,
                        tmpdir=self.local_tempdir,
                        path=os.environ['PATH'])
        return pslfn  # Delete this file in the caller code.

    def process_lavs_to_psl(self, lavs):
        '''
        Convert .lav files to .psl, swap query and target, and split on
        target.
        '''
        # Convert lav files to psl, concatenate.
        LOGGER.info("Reorganising lav files into psl files.")
        psls = [self.convert_to_psl(x) for x in lavs]
        allpsl = os.path.join(self.local_tempdir, 'all.psl')

        # Concatenate the files. We take this opportunity to strip out the
        # junk we've added to the chromosome names.
        def repl(match):
            '''
            Regex replace function.
            '''
            return "\t%s\t" % match.group(1)

        from_sizes = self.get_chr_sizes_dict(self.from_genome)
        to_sizes = self.get_chr_sizes_dict(self.to_genome)

        # We allow for any pid prefix so we can restart in a new process
        # if needed. Also allow for genome/chrN_trfBig_masked to support
        # fill-in files generated locally.
        genstr = (
            r'(?:%s|%s)' %
            (filebasename(self.from_genome), filebasename(self.to_genome)))
        strip_re = re.compile(r'\t(?:\d+_%s_)?([^\t]*)%s\t' %
                              (genstr, MASKTAG))

        # Keep this regex in sync with the file naming scheme used in split_chrs.
        subchr_re = re.compile(r'^(.*)_\+(\d+)$')
        with open(allpsl, 'wb') as allfh:
            for inp in psls:
                with open(inp, 'rb') as pfh:
                    for line in pfh:

                        # We need to rewrite the chrnames here. Also remove the
                        # trailing newline so it doesn't confuse the processing below.
                        newline = strip_re.sub(repl, line).rstrip('\n')

                        # Parse out sub-chromosome coordinates from filenames and
                        # fix coords appropriately. This is heavily dependent on
                        # the PSL file following the specification.
                        fields = newline.split("\t")
                        if len(fields) > 1:

                            # Sort out the query positions.
                            chrA_match = subchr_re.match(fields[9])
                            if chrA_match:
                                fields[9] = chrA_match.group(1)
                                basecoord = int(chrA_match.group(2))
                                for fnum in (11, 12):
                                    fields[fnum] = str(
                                        int(fields[fnum]) + basecoord)
                                fields[19] = ','.join([
                                    str(int(x) + basecoord)
                                    for x in fields[19].split(',') if x != ''
                                ]) + ','
                                fields[10] = from_sizes[fields[9]]

                            # Sort out the target positions.
                            chrB_match = subchr_re.match(fields[13])
                            if chrB_match:
                                fields[13] = chrB_match.group(1)
                                basecoord = int(chrB_match.group(2))
                                for fnum in (15, 16):
                                    fields[fnum] = str(
                                        int(fields[fnum]) + basecoord)
                                fields[20] = ','.join([
                                    str(int(x) + basecoord)
                                    for x in fields[20].split(',') if x != ''
                                ]) + ','
                                fields[14] = to_sizes[fields[13]]

                            # Quick check on our output. This is essentially cribbed
                            # from the pslToBed code.
                            if (int(fields[11]) >= int(fields[12])
                                    or int(fields[12]) > int(fields[10])
                                    or int(fields[15]) >= int(fields[16])
                                    or int(fields[16]) > int(fields[14])):
                                raise StandardError(
                                    "Mangled PSL format output. Offending input"
                                    " line was in file %s:\n\n%s\n\nMunged to:"
                                    "\n%s\n\n" % (inp, line, "\t".join(fields)))

                        newline = "\t".join(fields) + "\n"

                        allfh.write(newline)
                os.unlink(inp)  # Attempt to save some temp space

        # Swap target and source annotation, such that splitting on the
        # target actually splits on the query.
        swppsl = os.path.join(self.local_tempdir, 'all-swap.psl')
        cmd = ['pslSwap', allpsl, swppsl]
        call_subprocess(cmd,
                        tmpdir=self.local_tempdir,
                        path=os.environ['PATH'])
        os.unlink(allpsl)

        # Split psl files by target chromosome.
        psldir = os.path.join(self.local_tempdir, 'psl/')
        os.mkdir(psldir)

        # Consider -lump option for scaffolds FIXME
        cmd = ['pslSplitOnTarget', swppsl, psldir]
        call_subprocess(cmd,
                        tmpdir=self.local_tempdir,
                        path=os.environ['PATH'])
        target_psls = [
            os.path.join(self.local_tempdir, psldir, x)
            for x in os.listdir(psldir)
        ]
        self._tempfiles.extend(target_psls + [psldir])
        os.unlink(swppsl)

        return target_psls
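
    # Illustrative example (hypothetical values): a PSL line whose qName
    # (field 9) is 'chr1_+83333333' is rewritten to qName 'chr1', with
    # qStart/qEnd (fields 11, 12) and each qStarts entry (field 19) shifted
    # up by 83333333 and qSize (field 10) reset to the full chromosome length
    # from the .sizes file; fields 13-16 and 20 get the same treatment on the
    # target side.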

    def get_chr_sizes(self, fasta):
        '''
        Runs faSize on a fasta file to generate chr size data.
        '''

        # We keep a cache because we'll be using this more than once.
        if fasta in self._chr_sizes:
            return self._chr_sizes[fasta]
        LOGGER.info("Calculating chr sizes for %s", fasta)
        sizefn = os.path.join(self.local_tempdir,
                              filebasename(fasta) + '.sizes')
        cmd = 'faSize %s -detailed > %s' % (bash_quote(fasta),
                                            bash_quote(sizefn))
        call_subprocess(cmd,
                        tmpdir=self.local_tempdir,
                        path=os.environ['PATH'],
                        shell=True)
        self._tempfiles.append(sizefn)
        self._chr_sizes[fasta] = sizefn
        return sizefn

    def get_chr_sizes_dict(self, fasta):
        '''
        As for get_chr_sizes, but also parses the file and returns a dict
        for convenience.
        '''
        sizefn = self.get_chr_sizes(fasta)
        sizes = dict()
        with open(sizefn, 'r') as sizefh:
            for row in sizefh:
                (chrom, size) = [x.strip() for x in row.split()]
                sizes[chrom] = size
        return sizes
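
    # Note that the sizes are stored as strings, which suits the PSL
    # rewriting in process_lavs_to_psl, e.g. (hypothetical values):
    #
    #   sizes = self.get_chr_sizes_dict('mm10.fa')
    #   sizes['chr1']  # -> '195471971'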

    def chain(self, lavs):
        '''
        Chains the lastz output .lav files together.
        '''
        # We keep the filtered chain file.
        gen_from = filebasename(self.from_genome)
        gen_to = filebasename(self.to_genome)
        prechain = os.path.join(self.local_workdir,
                                '%s_vs_%s.pre.chain' % (gen_from, gen_to))
        if os.path.exists(prechain):
            LOGGER.warning(
                "Prechain file already exists."
                " Assuming we can start from this point: %s", prechain)
            return prechain

        # Convert lavs to appropriately-organised psl files.
        psls = self.process_lavs_to_psl(lavs)

        # FIXME at some point we need to add these psls to self._tempfiles

        # Run the initial chaining.
        LOGGER.info("Running the initial chaining.")
        chaindir = os.path.join(self.local_tempdir, 'chain/')
        os.mkdir(chaindir)
        chains = []
        for psl in psls:
            chfn = os.path.join(chaindir, filebasename(psl) + '.chain')
            cmd = [
                'axtChain', '-psl',
                '-linearGap=%s' % self.linear_gap, psl, '-faQ',
                self.from_genome, '-faT', self.to_genome, chfn
            ]
            call_subprocess(cmd,
                            tmpdir=self.local_tempdir,
                            path=os.environ['PATH'])
            chains.append(chfn)
            self._tempfiles.append(chfn)
        self._tempfiles.append(chaindir)

        # Filter the chained alignments before returning.
        allchain = os.path.join(self.local_tempdir, 'all.chain')
        cmd = ('chainMergeSort -tempDir=%s %s > %s' %
               (bash_quote(self.local_tempdir), " ".join(
                   [bash_quote(x) for x in chains]), bash_quote(allchain)))
        call_subprocess(cmd,
                        tmpdir=self.local_tempdir,
                        path=os.environ['PATH'],
                        shell=True)
        self._tempfiles.append(allchain)

        from_sizes = self.get_chr_sizes(self.from_genome)
        to_sizes = self.get_chr_sizes(self.to_genome)

        # Actually create the prechain file.
        cmd = ['chainPreNet', allchain, from_sizes, to_sizes, prechain]
        call_subprocess(cmd,
                        tmpdir=self.local_tempdir,
                        path=os.environ['PATH'])
        return prechain

    def get_2bit(self, fasta):
        '''
        Simply generate a temporary 2bit file from the specified fasta
        file. Note the differences between this and convert_to_2bit. FIXME
        refactor so there's only one of these functions.
        '''
        outfn = os.path.join(self.local_tempdir, filebasename(fasta) + '.2bit')
        if os.path.exists(outfn):
            return outfn
        LOGGER.info("Generating 2bit file for %s", fasta)
        cmd = ['faToTwoBit', fasta, outfn]
        call_subprocess(cmd,
                        tmpdir=self.local_tempdir,
                        path=os.environ['PATH'])
        self._tempfiles.append(outfn)
        return outfn

    def net(self, prechain):
        '''
        Create nets from the chained alignments and convert them to axt
        format. Also generate a liftOver file.
        '''
        from_sizes = self.get_chr_sizes(self.from_genome)
        to_sizes = self.get_chr_sizes(self.to_genome)

        net = os.path.join(self.local_workdir, prechain + '.net')
        cmd = (('chainNet %s -minSpace=1 %s %s stdout /dev/null' +
                ' | netSyntenic stdin %s') %
               (bash_quote(prechain), bash_quote(from_sizes),
                bash_quote(to_sizes), bash_quote(net)))
        # This may fail for spurious reasons (e.g. absence of
        # /proc/self/stat on non-linux machines).
        try:
            LOGGER.info("Running chainNet and netSyntenic on prechain file.")
            call_subprocess(cmd,
                            tmpdir=self.local_tempdir,
                            path=os.environ['PATH'],
                            shell=True)
        except CalledProcessError, err:
            LOGGER.warning("chainNet or netSyntenic raised exception: %s", err)
        if not os.path.exists(net):
            raise StandardError(
                "chainNet/netSyntenic failed to create output net file %s" %
                net)

        axt = os.path.join(
            self.local_workdir, "%s.%s.net.axt" %
            (filebasename(self.from_genome), filebasename(self.to_genome)))
        from_2bit = self.get_2bit(self.from_genome)
        to_2bit = self.get_2bit(self.to_genome)
        LOGGER.info('Converting to axt format.')
        cmd = ('netToAxt %s %s %s %s stdout | axtSort stdin %s' %
               (bash_quote(net), bash_quote(prechain), bash_quote(from_2bit),
                bash_quote(to_2bit), bash_quote(axt)))
        call_subprocess(cmd,
                        tmpdir=self.local_tempdir,
                        path=os.environ['PATH'],
                        shell=True)

        # These are cheap to generate and store, but potentially very useful later.
        LOGGER.info('Creating liftOver file.')
        liftover = os.path.join(self.local_workdir, prechain + '.liftOver')
        cmd = ['netChainSubset', net, prechain, liftover]
        call_subprocess(cmd,
                        tmpdir=self.local_tempdir,
                        path=os.environ['PATH'])

        return axt
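

# A minimal end-to-end driver sketch (argument values hypothetical; waiting
# for the cluster jobs depends on the ClusterJobManager implementation, which
# is not shown here):
#
#   aligner = LastzAligner(from_genome='mm10.fa', to_genome='hg38.fa',
#                          length_limit=int(1e8))
#   from_chrs = aligner.split_chrs(aligner.from_genome)
#   to_chrs = aligner.split_chrs(aligner.to_genome)
#   (jobs, lavs, remotes) = aligner.align(from_chrs, to_chrs)
#   # ... wait for the cluster jobs to complete, then:
#   prechain = aligner.chain(lavs)
#   axt = aligner.net(prechain)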