Code Example #1
    def run(self, fileStore):
        seqFile1 = fileStore.readGlobalFile(self.seqFileID1)
        seqFile2 = fileStore.readGlobalFile(self.seqFileID2)
        if self.blastOptions.compressFiles:
            seqFile1 = decompressFastaFile(seqFile1,
                                           fileStore.getLocalTempFile())
            seqFile2 = decompressFastaFile(seqFile2,
                                           fileStore.getLocalTempFile())
        blastResultsFile = fileStore.getLocalTempFile()
        runLastz(seqFile1,
                 seqFile2,
                 blastResultsFile,
                 lastzArguments=self.blastOptions.lastzArguments,
                 gpuLastz=self.blastOptions.gpuLastz)
        if self.blastOptions.realign:
            realignResultsFile = fileStore.getLocalTempFile()
            runCactusRealign(
                seqFile1,
                seqFile2,
                inputAlignmentsFile=blastResultsFile,
                outputAlignmentsFile=realignResultsFile,
                realignArguments=self.blastOptions.realignArguments)
            blastResultsFile = realignResultsFile

        resultsFile = fileStore.getLocalTempFile()
        cactus_call(parameters=[
            "cactus_blast_convertCoordinates", blastResultsFile, resultsFile,
            str(self.blastOptions.roundsOfCoordinateConversion)
        ])
        logger.info("Ran the blast okay")
        return fileStore.writeGlobalFile(resultsFile)
Code Example #2
def maskJobOverride(job, config_node, mask_file_path, mask_file_id,
                    min_length):
    """ return a hijacked config file that does just one preprocessing job: mask each fasta sequence with 
    the given bed file.  if paf_length is specified, the file is treated as a PAF file, and a BED is extracted
    from it using coverage gaps of at least the given length. 
    """
    # this was unzipped upstream
    if mask_file_path.endswith('.gz'):
        mask_file_path = mask_file_path[:-3]

    if mask_file_path.endswith('.paf'):
        # convert the PAF to BED
        paf_file = job.fileStore.readGlobalFile(mask_file_id)
        bed_file = job.fileStore.getLocalTempFile()

        if not min_length:
            min_length = 1

        cactus_call(
            parameters=['pafcoverage', paf_file, '-g', '-m',
                        str(min_length)],
            outfile=bed_file)

        mask_file_id = job.fileStore.writeGlobalFile(bed_file)

    # rewrite the config
    for node in config_node.findall("preprocessor"):
        config_node.remove(node)

    mask_node = ET.SubElement(config_node, 'preprocessor')
    mask_node.attrib['preprocessJob'] = 'maskFile'
    mask_node.attrib['inputBedID'] = mask_file_id

    return config_node
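For context, a minimal sketch of how the hijacked config node might be serialized for a downstream job, using only the standard-library xml.etree.ElementTree API (these round-trip helpers are assumptions for illustration, not part of the project code):

import xml.etree.ElementTree as ET

def config_node_to_string(config_node):
    # ET.tostring returns bytes by default; decode for text-mode transport
    return ET.tostring(config_node).decode()

def config_node_from_string(xml_text):
    # inverse of the above; re-parse the config for the next job
    return ET.fromstring(xml_text)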
Code Example #3
    def run(self, fileStore):
        #This runs Bob's covered intervals program, which combines the lastz alignment info into intervals of the query.
        alignments = fileStore.readGlobalFile(self.alignmentsID)
        query = fileStore.readGlobalFile(self.queryID)
        maskInfo = fileStore.getLocalTempFile()
        cactus_call(infile=alignments,
                    outfile=maskInfo,
                    parameters=[
                        "cactus_covered_intervals", "--queryoffsets",
                        "--origin=one",
                        "M=%s" % (int(self.repeatMaskOptions.period * 2))
                    ])

        # the previous lastz command outputs a file of intervals (denoted with indices) to softmask.
        # we finish by applying these intervals to the input file, to produce the final, softmasked output.
        args = ["--origin=one"]
        if self.repeatMaskOptions.unmaskOutput:
            args.append("--unmask")
        args.append(maskInfo)
        maskedQuery = fileStore.getLocalTempFile()
        cactus_call(infile=query,
                    outfile=maskedQuery,
                    parameters=["cactus_fasta_softmask_intervals.py"] + args)
        return fileStore.writeGlobalFile(maskedQuery)
Code Example #4
 def alignFastaFragments(self, fileStore, targetFiles, fragments):
     """
     Align each query fragment against all the target chunks, stopping
     early to avoid exponential blowup if too many alignments are found.
     """
     target = fileStore.getLocalTempFile()
     catFiles(targetFiles, target)
     lastZSequenceHandling = [
         '%s[multiple][nameparse=darkspace]' % os.path.basename(target),
         '%s[nameparse=darkspace]' % os.path.basename(fragments)
     ]
     if self.repeatMaskOptions.unmaskInput:
         lastZSequenceHandling = [
             '%s[multiple,unmask][nameparse=darkspace]' %
             os.path.basename(target),
             '%s[unmask][nameparse=darkspace]' % os.path.basename(fragments)
         ]
     alignment = fileStore.getLocalTempFile()
      # Each time a fragment aligns to a base in the sequence, that
      # base's match count is incremented.  The plus three for the
      # period parameter is a fudge to ensure sufficient alignments
      # are found.
     cactus_call(
         outfile=alignment,
         parameters=["cPecanLastz"] + lastZSequenceHandling +
         self.repeatMaskOptions.lastzOpts.split() + [
             "--querydepth=keep,nowarn:%i" %
             (self.repeatMaskOptions.period + 3),
             "--format=general:name1,zstart1,end1,name2,zstart2+,end2+",
             "--markend"
         ])
     return alignment
Code Example #5
 def run(self, fileStore):
      # Align each fragment against a chunk of the input sequence.  Each time a fragment aligns to a base
      # in the sequence, that base's match count is incremented.
      # The plus three for the period parameter is a fudge to ensure sufficient alignments are found.
     fragments = fileStore.readGlobalFile(self.fragmentsID)
     targetFiles = [
         fileStore.readGlobalFile(fileID) for fileID in self.targetIDs
     ]
     target = fileStore.getLocalTempFile()
     catFiles(targetFiles, target)
     lastZSequenceHandling = [
         '%s[multiple][nameparse=darkspace]' % os.path.basename(target),
         '%s[nameparse=darkspace]' % os.path.basename(fragments)
     ]
     if self.repeatMaskOptions.unmaskInput:
         lastZSequenceHandling = [
             '%s[multiple,unmask][nameparse=darkspace]' %
             os.path.basename(target),
             '%s[unmask][nameparse=darkspace]' % os.path.basename(fragments)
         ]
     alignment = fileStore.getLocalTempFile()
     cactus_call(
         outfile=alignment,
         parameters=["cPecanLastz"] + lastZSequenceHandling +
         self.repeatMaskOptions.lastzOpts.split() + [
             "--querydepth=keep,nowarn:%i" %
             (self.repeatMaskOptions.period + 3),
             "--format=general:name1,zstart1,end1,name2,zstart2+,end2+",
             "--markend"
         ])
     return fileStore.writeGlobalFile(alignment)
Code Example #6
 def testInvariants(self):
     (seqs, _) = getCactusInputs_encode(random.uniform(0, 2))
     # Chimp encode input has duplicate header names.
     seqs = [i for i in seqs if 'chimp' not in i]
     seqs = random.sample(seqs, 2)
     cigarPath = getTempFile()
     cactus_call(parameters=[
         "cPecanLastz", "--format=cigar",
         "%s[multiple]" % seqs[0],
         "%s[multiple]" % seqs[1]
     ],
                 outfile=cigarPath)
     bed = cactus_call(parameters=["cactus_coverage", seqs[1], cigarPath],
                       check_output=True)
     prevChrom = None
     prevStart = None
     prevEnd = None
     # Check that everything is sorted and there are no overlaps
     for line in bed.split("\n"):
          line = line.strip()
          if line == "":
              continue
          fields = line.split()
          chrom = fields[0]
          start = int(fields[1])
          end = int(fields[2])
          self.assertTrue(end - start >= 1)
          if chrom == prevChrom:
              self.assertTrue(start > prevStart)
              self.assertTrue(start >= prevEnd)
          # remember this interval so the next one can be compared against it
          prevChrom, prevStart, prevEnd = chrom, start, end
     os.remove(cigarPath)
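The sorted/non-overlapping invariant exercised by this test can be factored into a standalone helper; a minimal sketch (the helper name and its reuse elsewhere are assumptions):

def bed_is_sorted_nonoverlapping(bed_text):
    """Check that BED intervals are sorted and non-overlapping within each chromosome."""
    prev_chrom = prev_start = prev_end = None
    for line in bed_text.split("\n"):
        line = line.strip()
        if not line:
            continue
        fields = line.split()
        chrom, start, end = fields[0], int(fields[1]), int(fields[2])
        if end - start < 1:
            return False
        if chrom == prev_chrom and (start <= prev_start or start < prev_end):
            return False
        prev_chrom, prev_start, prev_end = chrom, start, end
    return True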
Code Example #7
 def run(self, fileStore):
     blastResultsFile = fileStore.getLocalTempFile()
     seqFile = fileStore.readGlobalFile(self.seqFileID)
     runSelfLastz(seqFile,
                  blastResultsFile,
                  lastzArguments=self.blastOptions.lastzArguments,
                  gpuLastz=self.blastOptions.gpuLastz)
     if self.blastOptions.realign:
         realignResultsFile = fileStore.getLocalTempFile()
         runCactusSelfRealign(
             seqFile,
             inputAlignmentsFile=blastResultsFile,
             outputAlignmentsFile=realignResultsFile,
             realignArguments=self.blastOptions.realignArguments)
         blastResultsFile = realignResultsFile
     resultsFile = fileStore.getLocalTempFile()
     cactus_call(parameters=[
         "cactus_blast_convertCoordinates", blastResultsFile, resultsFile,
         str(self.blastOptions.roundsOfCoordinateConversion)
     ])
     if self.blastOptions.compressFiles:
         #TODO: This throws away the compressed file
         seqFile = compressFastaFile(seqFile)
     logger.info("Ran the self blast okay")
     return fileStore.writeGlobalFile(resultsFile)
Code Example #8
def paf_to_lastz(job, paf_file, sort_secondaries=True, mask_bed_id=None):
    """
    Makes lastz output using paf2lastz. Also splits the input paf_file into two files
    in the output, one for the primary and the other for secondary.

    sort_secondaries bool, if true, will cause fxn to return two files instead of one.
    
    """

    work_dir = job.fileStore.getLocalTempDir()
    paf_path = os.path.join(work_dir, "alignments.paf")
    lastz_path = os.path.join(work_dir, "alignments.cigar")
    secondary_lastz_path = os.path.join(work_dir, "secondary_alignments.cigar")

    job.fileStore.readGlobalFile(paf_file, paf_path)

    cmd = ['paf2lastz', paf_path, '-q']
    if sort_secondaries:
        cmd += ['-s', secondary_lastz_path]

    if mask_bed_id:
        mask_bed_path = os.path.join(work_dir, "mask.bed")
        job.fileStore.readGlobalFile(mask_bed_id, mask_bed_path)
        cmd[1] = '-'
        cmd = [['pafmask', paf_path, mask_bed_path], cmd]

    cactus_call(parameters=cmd, outfile=lastz_path)

    lastz_id = job.fileStore.writeGlobalFile(lastz_path)

    if sort_secondaries:
        secondary_id = job.fileStore.writeGlobalFile(secondary_lastz_path)
        return [lastz_id, secondary_id]
    else:
        return lastz_id
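A sketch of how paf_to_lastz might be driven as a standalone Toil workflow; the job-store path and input PAF are placeholders, and wiring it this way is an assumption rather than how cactus invokes it:

import os
from toil.common import Toil
from toil.job import Job

if __name__ == "__main__":
    options = Job.Runner.getDefaultOptions("./jobstore")  # placeholder job store
    with Toil(options) as toil:
        # import a local PAF (placeholder path) into the file store
        paf_id = toil.importFile("file://" + os.path.abspath("alignments.paf"))
        root = Job.wrapJobFn(paf_to_lastz, paf_id, sort_secondaries=True)
        primary_id, secondary_id = toil.start(root)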
Code Example #9
    def run(self, fileStore):
        outChunkID = None
        if self.prepOptions.preprocessJob == "checkUniqueHeaders":
            inChunk = fileStore.readGlobalFile(self.inChunkID)
            seqPaths = [
                fileStore.readGlobalFile(fileID) for fileID in self.seqIDs
            ]
            seqString = " ".join(seqPaths)
            args = [inChunk]
            if self.prepOptions.checkAssemblyHub:
                args += ["--checkAssemblyHub"]
            cactus_call(stdin_string=seqString,
                        parameters=["cactus_checkUniqueHeaders.py"] + args)
            outChunkID = self.inChunkID
        elif self.prepOptions.preprocessJob == "lastzRepeatMask":
            repeatMaskOptions = RepeatMaskOptions(
                proportionSampled=self.prepOptions.proportionToSample,
                minPeriod=self.prepOptions.minPeriod)
            outChunkID = self.addChild(
                LastzRepeatMaskJob(repeatMaskOptions=repeatMaskOptions,
                                   queryID=self.inChunkID,
                                   targetIDs=self.seqIDs)).rv()
        elif self.prepOptions.preprocessJob == "none":
            outChunkID = self.inChunkID

        return outChunkID
Code Example #10
def toil_call_blast(job, options, seq_file, project, event, cigar_name, dep_names, *dep_fa_ids):

    work_dir = job.fileStore.getLocalTempDir()

    # serialize the seqfile so cactus-blast can use it 
    seq_file_path = os.path.join(work_dir, 'seqfile.txt')
    with open(seq_file_path, 'w') as sf:
        sf.write(str(seq_file))

    # read the fasta files
    assert len(dep_names) == len(dep_fa_ids)
    fa_paths = [os.path.join(work_dir, "{}.pp.fa".format(name)) for name in dep_names]
    for fa_path, fa_id in zip(fa_paths, dep_fa_ids):
        job.fileStore.readGlobalFile(fa_id, fa_path)
            
    cactus_call(parameters=['cactus-blast', os.path.join(work_dir, 'js'), seq_file_path, os.path.join(work_dir, os.path.basename(cigar_name)),
                 '--root', event, '--pathOverrides'] + fa_paths + ['--pathOverrideNames'] + dep_names +
                ['--workDir', work_dir, '--maxCores', str(int(job.cores)), '--maxDisk', bytes2humanN(job.disk), '--maxMemory', bytes2humanN(job.memory)] + options.cactusOptions.strip().split(' '))

    # scrape the output files out of the workdir
    out_nameids = []
    for out_file in [f for f in os.listdir(work_dir) if os.path.isfile(os.path.join(work_dir, f))]:
        if out_file.startswith(os.path.basename(cigar_name)):
            out_nameids.append((os.path.basename(out_file), job.fileStore.writeGlobalFile(os.path.join(work_dir, out_file))))
            
    return out_nameids
Code Example #11
def get_mask_bed_from_fasta(job, event, fa_id, fa_path, min_length):
    """ make a bed file from one fasta"""
    work_dir = job.fileStore.getLocalTempDir()
    bed_path = os.path.join(work_dir, os.path.basename(fa_path) + '.mask.bed')
    fa_path = os.path.join(work_dir, os.path.basename(fa_path))
    is_gz = fa_path.endswith(".gz")
    job.fileStore.readGlobalFile(fa_id, fa_path, mutable=is_gz)
    if is_gz:
        cactus_call(parameters=['gzip', '-fd', fa_path])
        fa_path = fa_path[:-3]
    with open(bed_path, 'w') as bed_file, open(fa_path, 'r') as fa_file:
        for seq_record in SeqIO.parse(fa_file, 'fasta'):
            first_mask = None
            for i, c in enumerate(seq_record.seq):
                is_mask = c.islower() or c in ('n', 'N')
                if is_mask and first_mask is None:
                    # we're starting a new interval: remember the start position
                    first_mask = i
                elif not is_mask and first_mask is not None:
                    # we're one past the end of an interval: write it if long enough
                    if i - first_mask >= min_length:
                        bed_file.write('{}\t{}\t{}\n'.format(
                            'id={}|{}'.format(event, seq_record.id),
                            first_mask, i))
                    first_mask = None
            # flush a masked interval that runs to the end of the sequence
            if first_mask is not None and len(seq_record.seq) - first_mask >= min_length:
                bed_file.write('{}\t{}\t{}\n'.format(
                    'id={}|{}'.format(event, seq_record.id),
                    first_mask, len(seq_record.seq)))
    return job.fileStore.writeGlobalFile(bed_path)
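The interval scan reduces to a small state machine over the sequence; a self-contained sketch with a usage check (the helper name is illustrative):

def masked_intervals(seq, min_length=1):
    """Yield (start, end) pairs covering runs of lowercase or N bases."""
    first = None
    for i, c in enumerate(seq):
        masked = c.islower() or c in ('n', 'N')
        if masked and first is None:
            first = i
        elif not masked and first is not None:
            if i - first >= min_length:
                yield (first, i)
            first = None
    if first is not None and len(seq) - first >= min_length:
        yield (first, len(seq))

assert list(masked_intervals('ACgtNNa', min_length=2)) == [(2, 7)]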
Code Example #12
def get_mask_bed_from_fasta(job,
                            event,
                            fa_id,
                            fa_path,
                            min_length,
                            work_dir=None):
    """ make a bed file from one fasta"""
    return_id = False  # hack in a toggle (work_dir) that lets this be called as a job or a function
    if not work_dir:
        work_dir = job.fileStore.getLocalTempDir()
        return_id = True
    bed_path = os.path.join(work_dir, os.path.basename(fa_path) + '.mask.bed')
    fa_path = os.path.join(work_dir, os.path.basename(fa_path))
    is_gz = fa_path.endswith(".gz")
    if return_id:
        job.fileStore.readGlobalFile(fa_id, fa_path, mutable=is_gz)
    if is_gz:
        cactus_call(parameters=['gzip', '-fd', fa_path])
        fa_path = fa_path[:-3]
    cactus_call(parameters=[
        'cactus_softmask2hardmask', fa_path, '-b', '-m',
        str(min_length)
    ],
                outfile=bed_path)
    if return_id:
        return job.fileStore.writeGlobalFile(bed_path)
    else:
        return bed_path
Code Example #13
def clip_vg(job, options, config, vg_path, vg_id):
    """ run clip-vg 
    """
    work_dir = job.fileStore.getLocalTempDir()
    is_decoy = vg_path == options.decoyGraph
    vg_path = os.path.join(work_dir, os.path.basename(vg_path))
    job.fileStore.readGlobalFile(vg_id, vg_path)
    out_path = vg_path + '.clip'

    cmd = ['clip-vg', vg_path, '-f']
    if options.clipLength is not None and not is_decoy:
        cmd += ['-u', str(options.clipLength)]
    for rs in options.rename:
        cmd += ['-r', rs]
    if options.reference:
        cmd += ['-e', options.reference]

    # sort while we're at it
    cmd = [cmd, ['vg', 'ids', '-s', '-']]

    cactus_call(parameters=cmd, outfile=out_path)

    # validate the output graph; the check is worth the extra time
    cactus_call(parameters=['vg', 'validate', out_path])

    return job.fileStore.writeGlobalFile(out_path)
Code Example #14
    def run(self, fileStore):
        assert len(self.targetIDs) >= 1
        assert self.repeatMaskOptions.fragment > 1
        queryFile = fileStore.readGlobalFile(self.queryID)

        # chop up the input fasta file into fragments of the specified size.  fragments overlap by
        # half their length.
        fragOutput = fileStore.getLocalTempFile()
        cactus_call(
            infile=queryFile,
            outfile=fragOutput,
            parameters=[
                "cactus_fasta_fragments.py",
                "--fragment=%s" % str(self.repeatMaskOptions.fragment),
                "--step=%s" % (str(self.repeatMaskOptions.fragment / 2)),
                "--origin=zero"
            ])
        fragmentsID = fileStore.writeGlobalFile(fragOutput)

        alignmentJob = self.addChild(
            AlignFastaFragments(repeatMaskOptions=self.repeatMaskOptions,
                                fragmentsID=fragmentsID,
                                targetIDs=self.targetIDs))

        maskCoveredIntervalsJob = self.addChild(
            MaskCoveredIntervals(repeatMaskOptions=self.repeatMaskOptions,
                                 alignmentsID=alignmentJob.rv(),
                                 queryID=self.queryID))
        alignmentJob.addFollowOn(maskCoveredIntervalsJob)

        return maskCoveredIntervalsJob.rv()
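The addChild/addFollowOn calls above encode "fragment, then align, then mask" as a small job graph; a stripped-down sketch of the ordering pattern (the job classes here are placeholders):

from toil.job import Job

class Align(Job):
    def run(self, fileStore):
        return "alignments"   # placeholder result

class Mask(Job):
    def run(self, fileStore):
        return "masked"       # placeholder result

class Parent(Job):
    def run(self, fileStore):
        align = self.addChild(Align())
        mask = self.addChild(Mask())
        align.addFollowOn(mask)  # mask runs only after align (and its children) finish
        return mask.rv()         # a promise, resolved once Mask has run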
Code Example #15
def merge_gafs_into_paf(job, config, gaf_file_ids):
    """ Merge GAF alignments into a single PAF, applying some filters """

    work_dir = job.fileStore.getLocalTempDir()
    paf_path = os.path.join(work_dir, "mz_alignments.paf")
    gaf_paths = []
    for i, gaf_id in enumerate(gaf_file_ids):
        gaf_paths.append("mz_alignment_{}.gaf".format(i))
        job.fileStore.readGlobalFile(gaf_id,
                                     os.path.join(work_dir, gaf_paths[-1]))

    xml_node = findRequiredNode(config.xmlRoot, "refgraph")
    mzgaf2paf_opts = []
    mz_filter = getOptionalAttrib(xml_node, "universalMZFilter", float)
    if mz_filter:
        mzgaf2paf_opts += ['-u', str(mz_filter)]
    min_mz = getOptionalAttrib(xml_node, "minMZBlockLength", int)
    if min_mz:
        mzgaf2paf_opts += ['-m', str(min_mz)]
    mapq = getOptionalAttrib(xml_node, "minMAPQ", int)
    if mapq:
        mzgaf2paf_opts += ['-q', str(mapq)]
    gaf_block = getOptionalAttrib(xml_node, "minGAFBlockLength", int)
    if gaf_block:
        mzgaf2paf_opts += ['-b', str(gaf_block)]

    cactus_call(work_dir=work_dir,
                outfile=paf_path,
                parameters=["mzgaf2paf"] + gaf_paths + mzgaf2paf_opts)

    # these are big, get rid of them as soon as we can (which is now)
    for gaf_id in gaf_file_ids:
        job.fileStore.deleteGlobalFile(gaf_id)

    return job.fileStore.writeGlobalFile(paf_path)
Code Example #16
def map_a_to_b(job, a, b, dipcall_filter):
    """Maps fasta a to fasta b.

    Args:
        a (global file): fasta file a. In map_all_to_ref, a is an assembly fasta.
        b (global file): fasta file b. In map_all_to_ref, b is the reference.

    Returns:
        The file-store ID of the PAF file produced by mapping a to b.
    """

    print("in map a to b. a:", a, "b:", b)
    # map_to_ref_paf = job.fileStore.writeGlobalFile(job.fileStore.getLocalTempFile())
    tmp = job.fileStore.getLocalTempFile()
    map_to_ref_paf = job.fileStore.writeGlobalFile(tmp)

    if dipcall_filter:
        # note: in dipcall, they include argument "--paf-no-hit".
        # I don't see why they would include these "mappings", only to be filtered out
        # later. I have not included the argument.
        cactus_call(parameters=[
            "minimap2", "-c", "-xasm5", "--cs", "-r2k", "-o",
            job.fileStore.readGlobalFile(map_to_ref_paf),
            job.fileStore.readGlobalFile(b),
            job.fileStore.readGlobalFile(a)
        ])
    else:
        cactus_call(parameters=[
            "minimap2", "-cx", "asm5", "-o",
            job.fileStore.readGlobalFile(map_to_ref_paf),
            job.fileStore.readGlobalFile(b),
            job.fileStore.readGlobalFile(a)
        ])

    return map_to_ref_paf
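Note that writing through a path returned by readGlobalFile is fragile in Toil: the local copy is not synced back to the file store after the command runs. A safer shape, sketched here with the same placeholder inputs, is to write to a local temp file first and publish it afterwards:

def map_a_to_b_sketch(job, a, b):
    # localize the inputs
    a_path = job.fileStore.readGlobalFile(a)
    b_path = job.fileStore.readGlobalFile(b)
    # write minimap2 output to a local temp file...
    paf_path = job.fileStore.getLocalTempFile()
    cactus_call(parameters=[
        "minimap2", "-cx", "asm5", "-o", paf_path, b_path, a_path])
    # ...and only then export it to the file store
    return job.fileStore.writeGlobalFile(paf_path)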
Code Example #17
File: dnabrnnMasking.py Project: xiangyupan/cactus
    def run(self, fileStore):
        """
        mask alpha satellites with dna-brnn
        """
        fastaFile = fileStore.readGlobalFile(self.fastaID)

        cmd = ['dna-brnn', fastaFile] + self.dnabrnnOpts.split()
        if '-i' not in self.dnabrnnOpts:
            # pull up the model
            # todo: is there a more robust way?
            cmd += ['-i', os.path.join(cactusRootPath(), 'attcc-alpha.knm')]

        if self.cores:
            cmd += ['-t', str(self.cores)]

        bedFile = fileStore.getLocalTempFile()

        # run dna-brnn to make a bed file
        cactus_call(outfile=bedFile, parameters=cmd)

        maskedFile = fileStore.getLocalTempFile()

        mask_cmd = [
            'cactus_fasta_softmask_intervals.py', '--origin=zero',
            '--minLength={}'.format(self.minLength), bedFile
        ]

        # do the softmasking
        cactus_call(infile=fastaFile, outfile=maskedFile, parameters=mask_cmd)

        return fileStore.writeGlobalFile(maskedFile)
Code Example #18
    def maskCoveredIntervals(self, fileStore, queryFile, alignment):
        """
        Mask the query fasta using the alignments to the target. Anything with more alignments than the period gets masked.
        """
        #This runs Bob's covered intervals program, which combines the lastz alignment info into intervals of the query.
        maskInfo = fileStore.getLocalTempFile()
        cactus_call(
            infile=alignment,
            outfile=maskInfo,
            parameters=[
                "cactus_covered_intervals",
                "--queryoffsets",
                "--origin=one",
                # * 2 takes into account the effect of the overlap
                "M=%s" % (int(self.repeatMaskOptions.period * 2))
            ])

        # the previous lastz command outputs a file of intervals (denoted with indices) to softmask.
        # we finish by applying these intervals to the input file, to produce the final, softmasked output.
        args = ["--origin=one"]
        if self.repeatMaskOptions.unmaskOutput:
            args.append("--unmask")
        args.append(maskInfo)
        maskedQuery = fileStore.getLocalTempFile()
        cactus_call(infile=queryFile,
                    outfile=maskedQuery,
                    parameters=["cactus_fasta_softmask_intervals.py"] + args)
        return maskedQuery
Code Example #19
def clip_vg(job, options, config, vg_path, vg_id):
    """ run clip-vg 
    """
    work_dir = job.fileStore.getLocalTempDir()
    is_decoy = vg_path == options.decoyGraph
    vg_path = os.path.join(work_dir, os.path.basename(vg_path))
    job.fileStore.readGlobalFile(vg_id, vg_path)
    out_path = vg_path + '.clip'

    cmd = ['clip-vg', vg_path, '-f']
    if options.clipLength is not None and not is_decoy:
        cmd += ['-u', str(options.clipLength)]
    for rs in options.rename:
        cmd += ['-r', rs]
    if options.reference:
        cmd += ['-e', options.reference]
    
    if getOptionalAttrib(findRequiredNode(config.xmlRoot, "hal2vg"), "includeMinigraph", typeFn=bool, default=False):
        # our vg file has minigraph sequences -- we'll filter them out, along with any nodes
        # that don't appear in a non-minigraph path
        graph_event = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "assemblyName", default="_MINIGRAPH_")
        cmd += ['-d', graph_event]
        
    # sort while we're at it
    cmd = [cmd, ['vg', 'ids', '-s', '-']]
        
    cactus_call(parameters=cmd, outfile=out_path)

    # validate the output graph; the check is worth the extra time
    cactus_call(parameters=['vg', 'validate', out_path])

    return job.fileStore.writeGlobalFile(out_path)
Code Example #20
def computePAFCoverage(job, config_node, paf_id):
    """ compute the gaps in PAF coverage, store them as a bed file, and add the bed file's filestore id
    into the config's dna-brnn xml element """
    paf_file = job.fileStore.readGlobalFile(paf_id)
    bed_file = job.fileStore.getLocalTempFile()

    dnabrnn_node = None
    for node in config_node.findall("preprocessor"):
        if getOptionalAttrib(node, "preprocessJob") == 'dna-brnn':
            dnabrnn_node = node
            break

    assert dnabrnn_node is not None

    min_length = max(
        1, getOptionalAttrib(dnabrnn_node, 'minLength', typeFn=int, default=0))

    cactus_call(
        parameters=['pafcoverage', paf_file, '-g', '-m',
                    str(min_length)],
        outfile=bed_file)

    dnabrnn_node.attrib["inputBedID"] = job.fileStore.writeGlobalFile(bed_file)

    return config_node
Code Example #21
def toil_call_hal_append_subtrees(job, options, project, root_name, root_hal_id, event_names, *event_ids):

    work_dir = job.fileStore.getLocalTempDir()

    # download the root hal file
    root_file = os.path.join(work_dir, '{}.hal'.format(root_name))
    job.fileStore.readGlobalFile(root_hal_id, root_file, mutable=True)
    
    # download the hal files from the file store
    hal_files = []
    for event_name, event_id in zip(event_names, event_ids):
        hal_files.append(os.path.join(work_dir, '{}.hal'.format(event_name)))
        job.fileStore.readGlobalFile(event_id, hal_files[-1])

        # append to the root
        cactus_call(parameters=['halAppendSubtree', root_file, hal_files[-1], event_name, event_name, '--merge'] +
                    options.halOptions.strip().split(' '))

    # bypassing toil.exportFile for now as it only works on promises returned by the
    # start job, which isn't how this is set up. also in practice it's often more convenient
    # to output to s3
    # todo: can we just use job.fileStore?
    if options.outHal.startswith('s3://'):
        # write it directly to s3
        write_s3(root_file, options.outHal, region=get_aws_region(options.jobStore))
    else:
        # write the output to disk
        shutil.copy2(root_file,  options.outHal)

    return job.fileStore.writeGlobalFile(root_file)
Code Example #22
def stopKtserver(dbElem):
    """Attempt to send the terminate signal to a ktserver."""
    try:
        cactus_call(parameters=['ktremotemgr', 'set'] +
                    getRemoteParams(dbElem) + ['TERMINATE', '1'])
    except:
        # The server is likely already down.
        pass
Code Example #23
 def testMirrorAndOrientAlignments(self):
     cactus_call(parameters=["cactus_mirrorAndOrientAlignments", 
                              self.logLevelString, 
                              self.simpleInputCigarPath, 
                              self.simpleOutputCigarPath])
     with open(self.simpleOutputCigarPath, 'r') as fh:
         outputCigars = [ cigar[:-1] for cigar in fh.readlines() ] # Remove new lines
            
     # For each input alignment check that we have the two, oriented alignments
     for inputCigar in self.inputCigars: 
         name1, start1, end1, strand1 = inputCigar.split()[5:9]
         start1, end1 = int(start1), int(end1)
         coordinates1 = name1, start1, end1, strand1
         
         name2, start2, end2, strand2 = inputCigar.split()[1:5]
         start2, end2 = int(start2), int(end2)
         coordinates2 = name2, start2, end2, strand2
         
         score = inputCigar.split()[9]
         ops = inputCigar.split()[10:]
         
         def invertStrand(coordinates):
             # cigar: simpleSeqB1 0 9 + simpleSeqA1 10 0 - 0 M 8 D 1 M 1
             # cigar: simpleSeqB1 9 0 + simpleSeqA1 0 10 - 0 M 1 D 1 M 8
             name, start, end, strand = coordinates
             assert strand in ("+", "-")
             if strand == "+":
                 return name, end, start, "-" 
             return name, end, start, "+" 
         
         def reverseOps(ops):
             l = ops[:]
             l.reverse()
             l2 = []
             for i, j in zip(l[1::2], l[::2]):
                 l2 += [ i, j ]
             return l2
         
         def invertOpStrands(ops):
             l = [ "I" if op == "D" else ("D" if op == "I" else op) for op in ops[::2] ]
             l2 = []
             for op, length in zip(l, ops[1::2]):
                 l2 += [ op, length ]
             return l2
         
         if strand1 == "+":
             self.assertTrue(self.makeCigar(coordinates1, coordinates2, score, ops) in outputCigars)
         else:
             # Invert the strands
             self.assertTrue(self.makeCigar(invertStrand(coordinates1), 
                                       invertStrand(coordinates2), score, reverseOps(ops)) in outputCigars)
             
         if strand2 == "+":
             self.assertTrue(self.makeCigar(coordinates2, coordinates1, score,
                                       invertOpStrands(ops)) in outputCigars)
         else:
             self.assertTrue(self.makeCigar(invertStrand(coordinates2), invertStrand(coordinates1), 
                                         score, invertOpStrands(reverseOps(ops))) in outputCigars)
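For clarity, standalone equivalents of the two nested helpers, with the traced results for the op list from the cigar in the comments above (M 8 D 1 M 1); these rewrites are a sketch, not project code:

def reverse_ops(ops):
    # reverse the order of (op, length) pairs without flipping each pair
    pairs = list(zip(ops[::2], ops[1::2]))
    return [x for pair in reversed(pairs) for x in pair]

def invert_op_strands(ops):
    # swap insertions and deletions, keeping lengths in place
    swap = {'I': 'D', 'D': 'I'}
    return [swap.get(x, x) if i % 2 == 0 else x for i, x in enumerate(ops)]

assert reverse_ops(['M', '8', 'D', '1', 'M', '1']) == ['M', '1', 'D', '1', 'M', '8']
assert invert_op_strands(['M', '8', 'D', '1', 'M', '1']) == ['M', '8', 'I', '1', 'M', '1']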
Code Example #24
File: ktserverControl.py Project: ricky-lim/cactus
    def tryRun(self,
               dbElem,
               logPath,
               fileStore,
               existingSnapshotID=None,
               snapshotExportID=None):
        snapshotDir = os.path.join(fileStore.getLocalTempDir(), 'snapshot')
        os.mkdir(snapshotDir)
        snapshotPath = os.path.join(snapshotDir, KTSERVER_SNAPSHOT_NAME)
        if existingSnapshotID is not None:
            # Extract the existing snapshot to the snapshot
            # directory so it will be automatically loaded
            fileStore.readGlobalFile(existingSnapshotID, userPath=snapshotPath)
        process = cactus_call(server=True,
                              shell=False,
                              parameters=getKtserverCommand(
                                  dbElem, logPath, snapshotDir),
                              port=dbElem.getDbPort())

        blockUntilKtserverIsRunning(logPath)
        if existingSnapshotID is not None:
            # Clear the termination flag from the snapshot
            cactus_call(parameters=["ktremotemgr", "remove"] +
                        getRemoteParams(dbElem) + ["TERMINATE"])

        while True:
            # Check for the termination signal
            try:
                cactus_call(parameters=["ktremotemgr", "get"] +
                            getRemoteParams(dbElem) + ["TERMINATE"])
            except:
                # No terminate signal sent yet
                pass
            else:
                # Terminate signal received
                break
            # Check that the DB is still alive
            if process.poll() is not None or isKtServerFailed(logPath):
                with open(logPath) as f:
                    raise RuntimeError("KTServer failed. Log: %s" % f.read())
            sleep(60)
        process.send_signal(signal.SIGINT)
        process.wait()
        blockUntilKtserverIsFinished(logPath)
        if snapshotExportID is not None:
            if not os.path.exists(snapshotPath):
                raise RuntimeError(
                    "KTServer did not leave a snapshot on termination,"
                    " but a snapshot was requested.")
            if len(glob(os.path.join(snapshotDir, "*.ktss"))) != 1:
                # More than one snapshot file. It's not clear what
                # conditions trigger this--if any--but we
                # don't support it right now.
                raise RuntimeError("KTServer left more than one snapshot.")

            # Export the snapshot file to the file store
            fileStore.jobStore.updateFile(snapshotExportID, snapshotPath)
Code Example #25
    def gpuRepeatMask(self, fileStore, targetFile):
        """
        This is the gpu version of the above.  It's much simpler in that there's no chunking or fragmenting.
        """

        alignment_dir = fileStore.getLocalTempDir()

        # don't think gpu lastz can handle this
        assert not self.repeatMaskOptions.unmaskInput

        # filter out some default lastz options in the config that aren't supported
        lastz_opts = self.repeatMaskOptions.lastzOpts.split()
        gpu_opts = []
        for i in range(len(lastz_opts)):
            if lastz_opts[i] == "--ungapped" or lastz_opts[i] == "--nogapped":
                pass
            elif lastz_opts[i] is None or lastz_opts[i].startswith(
                    "--queryhsplimit="):
                pass
            elif lastz_opts[i] == "--queryhsplimit":
                lastz_opts[i + 1] = None
            else:
                gpu_opts += [lastz_opts[i]]

        cmd = [
            "segalign_repeat_masker",
            targetFile,
            "--lastz_interval={}".format(
                self.repeatMaskOptions.gpuLastzInterval),
            "--markend",
            "--neighbor_proportion",
            str(self.repeatMaskOptions.proportionSampled),
            # note: segalign now includes cactus_covered_intervals, so we pass the threshold here
            # and skip running it below
            "--M",
            str(self.repeatMaskOptions.period)
        ] + gpu_opts

        cactus_call(parameters=cmd, work_dir=alignment_dir)

        # scrape the segalign output into one big file, making an effort to read in numeric order
        merged_path = fileStore.getLocalTempFile()
        with open(merged_path, "a") as merged_file:
            for work_file in sorted(
                    os.listdir(alignment_dir),
                    key=lambda x: int(re.sub("[^0-9]", "", x))):
                # segalign_repeat_masker makes files that look like "tmp10.block0.intervals"
                # (not that there should be anything else in this directory)
                if work_file.startswith("tmp") and work_file.endswith(
                        "intervals"):
                    # append it to the merged file and delete it right away to keep disk usage lower
                    with open(os.path.join(alignment_dir, work_file),
                              "r") as frag_file:
                        shutil.copyfileobj(frag_file, merged_file)
                    os.remove(os.path.join(alignment_dir, work_file))

        return merged_path
Code Example #26
File: blast.py Project: benedictpaten/cactus
def subtractBed(bed1, bed2, destBed):
    """Subtract two non-bed12 beds"""
    # tmp. don't really want to use bedtools
    if os.path.getsize(bed1) == 0 or os.path.getsize(bed2) == 0:
        # bedtools will complain on zero-size beds
        os.rename(bed1, destBed)
    else:
        cactus_call(outfile=destBed,
                    parameters=["subtract", "-a", bed1, "-b", bed2])
Code Example #27
    def run(self, fileStore):
        chunkList = [readGlobalFileWithoutCache(fileStore, fileID) for fileID in self.chunkIDList]

        #Docker expects paths relative to the work dir
        chunkList = [os.path.basename(chunk) for chunk in chunkList]
        outSequencePath = fileStore.getLocalTempFile()
        cactus_call(outfile=outSequencePath, stdin_string=" ".join(chunkList),
                    parameters=["cactus_batch_mergeChunks"])
        return fileStore.writeGlobalFile(outSequencePath)
Code Example #28
File: blast.py Project: benedictpaten/cactus
def calculateCoverage(sequenceFile, cigarFile, outputFile, fromGenome=None, depthById=False, work_dir=None):
    logger.info("Calculating coverage of cigar file %s on %s, writing to %s" % (
        cigarFile, sequenceFile, outputFile))
    args = [sequenceFile, cigarFile]
    if fromGenome is not None:
        args += ["--from", fromGenome]
    if depthById:
        args += ["--depthById"]
    cactus_call(outfile=outputFile, work_dir=work_dir,
                parameters=["cactus_coverage"] + args)
Code Example #29
def compress_gaf(job, gaf_file_id):
    gaf_path = job.fileStore.readGlobalFile(gaf_file_id)
    zip_path = job.fileStore.getLocalTempFile()
    cactus_call(parameters=[
        'gzip',
        gaf_path,
        '-c',
    ], outfile=zip_path)
    job.fileStore.deleteGlobalFile(gaf_file_id)
    return job.fileStore.writeGlobalFile(zip_path)
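The same compression can be done in-process with the standard library instead of shelling out; a minimal sketch under the same job/fileStore assumptions:

import gzip
import shutil

def compress_gaf_inprocess(job, gaf_file_id):
    gaf_path = job.fileStore.readGlobalFile(gaf_file_id)
    zip_path = job.fileStore.getLocalTempFile()
    # stream-compress without an external gzip process
    with open(gaf_path, 'rb') as fin, gzip.open(zip_path, 'wb') as fout:
        shutil.copyfileobj(fin, fout)
    job.fileStore.deleteGlobalFile(gaf_file_id)
    return job.fileStore.writeGlobalFile(zip_path)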
Code Example #30
def mappingQualityRescoring(job, inputAlignmentFileID, minimumMapQValue,
                            maxAlignmentsPerSite, alpha, logLevel):
    """
    Function to rescore and filter alignments by calculating the mapping quality of sub-alignments

    Returns primary alignments and secondary alignments in two separate files.
    """
    inputAlignmentFile = job.fileStore.readGlobalFile(inputAlignmentFileID)

    job.fileStore.logToMaster("Input cigar file has %s lines" %
                              countLines(inputAlignmentFile))

    # Get temporary file
    assert maxAlignmentsPerSite >= 1
    tempAlignmentFiles = [
        job.fileStore.getLocalTempFile() for i in range(maxAlignmentsPerSite)
    ]

    # Mirror and orient alignments, sort, split overlaps and calculate mapping qualities
    cactus_call(parameters=[
        ["cat", inputAlignmentFile],
        ["cactus_mirrorAndOrientAlignments", logLevel],
        [
            "sort", "-T{}".format(job.fileStore.getLocalTempDir()), "-k6,6",
            "-k7,7n", "-k8,8n"
        ],  # This sorts by coordinate
        [
            "uniq"
        ],  # This eliminates any annoying duplicates if lastz reports the alignment in both orientations
        ["cactus_splitAlignmentOverlaps", logLevel],
        [
            "cactus_calculateMappingQualities", logLevel,
            str(maxAlignmentsPerSite),
            str(minimumMapQValue),
            str(alpha)
        ] + tempAlignmentFiles
    ])

    # Merge together the output files in order
    secondaryTempAlignmentFile = job.fileStore.getLocalTempFile()
    if len(tempAlignmentFiles) > 1:
        cactus_call(parameters=[["cat"] + tempAlignmentFiles[1:]],
                    outfile=secondaryTempAlignmentFile)

    job.fileStore.logToMaster(
        "Filtered, non-overlapping primary cigar file has %s lines" %
        countLines(tempAlignmentFiles[0]))
    job.fileStore.logToMaster(
        "Filtered, non-overlapping secondary cigar file has %s lines" %
        countLines(secondaryTempAlignmentFile))

    # Now write back alignments results file and return
    return job.fileStore.writeGlobalFile(
        tempAlignmentFiles[0]), job.fileStore.writeGlobalFile(
            secondaryTempAlignmentFile)
Code Example #31
def minimap_index(job, ref_name, ref_id):
    """ make a minimap2 index of a reference genome """

    work_dir = job.fileStore.getLocalTempDir()
    fa_path = os.path.join(work_dir, os.path.basename(ref_name))
    idx_path = fa_path + ".idx"
    job.fileStore.readGlobalFile(ref_id, fa_path)

    cactus_call(parameters=['minimap2', fa_path, '-d', idx_path, '-x', 'asm5'])

    return job.fileStore.writeGlobalFile(idx_path)
Code Example #32
    def tryRun(self, dbElem, logPath, fileStore, existingSnapshotID=None, snapshotExportID=None):
        snapshotDir = os.path.join(fileStore.getLocalTempDir(), 'snapshot')
        os.mkdir(snapshotDir)
        snapshotPath = os.path.join(snapshotDir, KTSERVER_SNAPSHOT_NAME)
        if existingSnapshotID is not None:
            # Extract the existing snapshot to the snapshot
            # directory so it will be automatically loaded
            fileStore.readGlobalFile(existingSnapshotID, userPath=snapshotPath)
        process = cactus_call(server=True, shell=False,
                              parameters=getKtserverCommand(dbElem, logPath, snapshotDir),
                              port=dbElem.getDbPort())

        blockUntilKtserverIsRunning(logPath)
        if existingSnapshotID is not None:
            # Clear the termination flag from the snapshot
            cactus_call(parameters=["ktremotemgr", "remove"] + getRemoteParams(dbElem) + ["TERMINATE"])

        while True:
            # Check for the termination signal
            try:
                cactus_call(parameters=["ktremotemgr", "get"] + getRemoteParams(dbElem) + ["TERMINATE"],
                            swallowStdErr=True)
            except:
                # No terminate signal sent yet
                pass
            else:
                # Terminate signal received
                break
            # Check that the DB is still alive
            if process.poll() is not None or isKtServerFailed(logPath):
                with open(logPath) as f:
                    raise RuntimeError("KTServer failed. Log: %s" % f.read())
            sleep(60)
        process.send_signal(signal.SIGINT)
        process.wait()
        blockUntilKtserverIsFinished(logPath)
        if snapshotExportID is not None:
            if not os.path.exists(snapshotPath):
                with open(logPath) as f:
                    raise RuntimeError("KTServer did not leave a snapshot on termination,"
                                       " but a snapshot was requested. Log: %s" % f.read())
            if len(glob(os.path.join(snapshotDir, "*.ktss"))) != 1:
                # More than one snapshot file. It's not clear what
                # conditions trigger this--if any--but we
                # don't support it right now.
                with open(logPath) as f:
                    raise RuntimeError("KTServer left more than one snapshot. Log: %s" % f.read())

            # Export the snapshot file to the file store
            fileStore.jobStore.updateFile(snapshotExportID, snapshotPath)
Code Example #33
 def testCalculateMappingQualities(self):
     with open(self.simpleInputCigarPath, 'w') as fH:
         fH.write("\n".join(self.sortedNonOverlappingInputCigars) + "\n")
            
     cactus_call(parameters=[ "cactus_calculateMappingQualities", 
                              self.logLevelString, 
                              '1', '0', "1.0",
                              self.simpleOutputCigarPath,
                              self.simpleInputCigarPath ])
     
     with open(self.simpleOutputCigarPath, 'r') as fh:
         outputCigars = [ cigar[:-1] for cigar in fh.readlines() ] # Remove new lines
     
     self.assertEqual(self.filteredSortedNonOverlappingInputCigars, outputCigars)
Code Example #34
File: commonTest.py Project: benedictpaten/cactus
 def testCactusCallPipes(self):
     inputFile = getTempFile(rootDir=self.tempDir)
     with open(inputFile, 'w') as f:
         f.write('foobar\n')
     # using 'cat' here rather than infile is intentional; it tests
     # whether the directory is mounted into containers correctly.
     output = cactus_call(parameters=[['cat', inputFile],
                                      ['sed', 's/foo/baz/g'],
                                      ['awk', '{ print "quux" $0 }']],
                          check_output=True)
      self.assertEqual(output, 'quuxbazbar\n')
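The list-of-lists parameters form behaves like a shell pipeline. A rough pure-subprocess equivalent of the call above, shown as a sketch of the semantics rather than how cactus_call is implemented:

import subprocess

p1 = subprocess.Popen(['cat', inputFile], stdout=subprocess.PIPE)
p2 = subprocess.Popen(['sed', 's/foo/baz/g'], stdin=p1.stdout, stdout=subprocess.PIPE)
p3 = subprocess.Popen(['awk', '{ print "quux" $0 }'], stdin=p2.stdout, stdout=subprocess.PIPE)
p1.stdout.close()  # allow p1 to see SIGPIPE if p2 exits early
p2.stdout.close()
output = p3.communicate()[0].decode()  # 'quuxbazbar\n'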
Code Example #35
File: commonTest.py Project: benedictpaten/cactus
    def testCactusCall(self):
        inputFile = getTempFile(rootDir=self.tempDir)

        with open("/dev/urandom") as randText:
            with open(inputFile, 'w') as fh:
                fh.write(randText.read(1024).encode('base64'))
        input = "".join(open(inputFile).read().split("\n"))

        #Send input to container's stdin through a file, get output
        #from stdout
        output = "".join(cactus_call(infile=inputFile, check_output=True,
                                     parameters=["docker_test_script"]).split("\n"))
        self.assertEqual(input, output)


        #Send input as string, get output from stdout
        output = "".join(cactus_call(stdin_string=input, check_output=True,
                             parameters=["docker_test_script"]).split("\n"))

        self.assertEqual(input, output)
Code Example #36
File: blast.py Project: benedictpaten/cactus
 def run(self, fileStore):   
     blastResultsFile = fileStore.getLocalTempFile()
     seqFile = fileStore.readGlobalFile(self.seqFileID)
     runSelfLastz(seqFile, blastResultsFile, lastzArguments=self.blastOptions.lastzArguments)
     if self.blastOptions.realign:
         realignResultsFile = fileStore.getLocalTempFile()
         runCactusSelfRealign(seqFile, inputAlignmentsFile=blastResultsFile,
                              outputAlignmentsFile=realignResultsFile,
                              realignArguments=self.blastOptions.realignArguments)
         blastResultsFile = realignResultsFile
     resultsFile = fileStore.getLocalTempFile()
     cactus_call(parameters=["cactus_blast_convertCoordinates",
                             blastResultsFile,
                             resultsFile,
                             str(self.blastOptions.roundsOfCoordinateConversion)])
     if self.blastOptions.compressFiles:
         #TODO: This throws away the compressed file
         seqFile = compressFastaFile(seqFile)
     logger.info("Ran the self blast okay")
     return fileStore.writeGlobalFile(resultsFile)
Code Example #37
File: blast.py Project: benedictpaten/cactus
    def run(self, fileStore):
        seqFile1 = fileStore.readGlobalFile(self.seqFileID1)
        seqFile2 = fileStore.readGlobalFile(self.seqFileID2)
        if self.blastOptions.compressFiles:
            seqFile1 = decompressFastaFile(seqFile1, fileStore.getLocalTempFile())
            seqFile2 = decompressFastaFile(seqFile2, fileStore.getLocalTempFile())
        blastResultsFile = fileStore.getLocalTempFile()

        runLastz(seqFile1, seqFile2, blastResultsFile, lastzArguments=self.blastOptions.lastzArguments)
        if self.blastOptions.realign:
            realignResultsFile = fileStore.getLocalTempFile()
            runCactusRealign(seqFile1, seqFile2, inputAlignmentsFile=blastResultsFile,
                             outputAlignmentsFile=realignResultsFile,
                             realignArguments=self.blastOptions.realignArguments)
            blastResultsFile = realignResultsFile
            
        resultsFile = fileStore.getLocalTempFile()
        cactus_call(parameters=["cactus_blast_convertCoordinates",
                                blastResultsFile,
                                resultsFile,
                                str(self.blastOptions.roundsOfCoordinateConversion)])
        logger.info("Ran the blast okay")
        return fileStore.writeGlobalFile(resultsFile)
Code Example #38
def findOccupiedPorts():
    """Attempt to find all currently taken TCP ports.

    Returns a set of ints, representing taken ports."""
    netstatOutput = cactus_call(parameters=["netstat", "-tuplen"], check_output=True)
    ports = set()
    for line in netstatOutput.split("\n"):
        fields = line.split()
        if len(fields) != 9:
            # Header or other garbage line
            continue
        port = int(fields[3].split(':')[-1])
        ports.add(port)
    logger.debug('Detected ports in use: %s' % repr(ports))
    return ports
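netstat is not installed by default on many modern systems; a cross-platform alternative sketch, assuming the third-party psutil package is available:

import psutil

def findOccupiedPortsPsutil():
    """Return the set of locally bound TCP ports."""
    return {conn.laddr.port for conn in psutil.net_connections(kind='tcp') if conn.laddr}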
Code Example #39
def exportHal(job, project, event=None, cacheBytes=None, cacheMDC=None, cacheRDC=None, cacheW0=None, chunk=None, deflate=None, inMemory=False):

    HALPath = "tmp_alignment.hal"

    # traverse tree to make sure we are going breadth-first
    tree = project.mcTree

    # find subtree if event specified
    rootNode = None
    if event is not None:
        assert event in tree.nameToId and not tree.isLeaf(tree.nameToId[event])
        rootNode = tree.nameToId[event]

    for node in tree.breadthFirstTraversal(rootNode):
        genomeName = tree.getName(node)
        if genomeName in project.expMap:
            experimentFilePath = job.fileStore.readGlobalFile(project.expIDMap[genomeName])
            experiment = ExperimentWrapper(ET.parse(experimentFilePath).getroot())

            outgroups = experiment.getOutgroupEvents()
            experiment.setConfigPath(job.fileStore.readGlobalFile(experiment.getConfigID()))
            expTreeString = NXNewick().writeString(experiment.getTree(onlyThisSubtree=True))
            assert len(expTreeString) > 1
            assert experiment.getHalID() is not None
            assert experiment.getHalFastaID() is not None
            subHALPath = job.fileStore.readGlobalFile(experiment.getHalID())
            halFastaPath = job.fileStore.readGlobalFile(experiment.getHalFastaID())

            args = [os.path.basename(subHALPath), os.path.basename(halFastaPath), expTreeString, os.path.basename(HALPath)]

            if len(outgroups) > 0:
                args += ["--outgroups", ",".join(outgroups)]
            if cacheBytes is not None:
                args += ["--cacheBytes", cacheBytes]
            if cacheMDC is not None:
                args += ["--cacheMDC", cacheMDC]
            if cacheRDC is not None:
                args += ["--cacheRDC", cacheRDC]
            if cacheW0 is not None:
                args += ["--cacheW0", cacheW0]
            if chunk is not None:
                args += ["--chunk", chunk]
            if deflate is not None:
                args += ["--deflate", deflate]
            if inMemory is True:
                args += ["--inMemory"]

            cactus_call(parameters=["halAppendCactusSubtree"] + args)

    cactus_call(parameters=["halSetMetadata", HALPath, "CACTUS_COMMIT", cactus_commit])
    with job.fileStore.readGlobalFileStream(project.configID) as configFile:
        cactus_call(parameters=["halSetMetadata", HALPath, "CACTUS_CONFIG", b64encode(configFile.read())])

    return job.fileStore.writeGlobalFile(HALPath)
Code Example #40
def logAssemblyStats(job, message, name, sequenceID, preemptable=True):
    sequenceFile = job.fileStore.readGlobalFile(sequenceID)
    analysisString = cactus_call(parameters=["cactus_analyseAssembly", sequenceFile], check_output=True)
    job.fileStore.logToMaster("%s, got assembly stats for genome %s: %s" % (message, name, analysisString))
Code Example #41
File: blast.py Project: benedictpaten/cactus
    def run(self, fileStore):
        # Trim outgroup, convert outgroup coordinates, and add to
        # outgroup fragments dir

        outgroupSequenceFiles = [fileStore.readGlobalFile(fileID) for fileID in self.outgroupSequenceIDs]
        mostRecentResultsFile = fileStore.readGlobalFile(self.mostRecentResultsID)
        trimmedOutgroup = fileStore.getLocalTempFile()
        outgroupCoverage = fileStore.getLocalTempFile()
        calculateCoverage(outgroupSequenceFiles[0],
                          mostRecentResultsFile, outgroupCoverage)
        # The windowSize and threshold are fixed at 1: anything more
        # and we will run into problems with alignments that aren't
        # covered in a matching trimmed sequence.
        trimSequences(outgroupSequenceFiles[0], outgroupCoverage,
                      trimmedOutgroup, flanking=self.blastOptions.trimOutgroupFlanking,
                      windowSize=1, threshold=1)
        outgroupConvertedResultsFile = fileStore.getLocalTempFile()
        with open(outgroupConvertedResultsFile, 'w') as f:
            upconvertCoords(cigarPath=mostRecentResultsFile,
                            fastaPath=trimmedOutgroup,
                            contigNum=1,
                            outputFile=f)

        self.outgroupFragmentIDs.append(fileStore.writeGlobalFile(trimmedOutgroup))
        sequenceFiles = [fileStore.readGlobalFile(path) for path in self.sequenceIDs]
        untrimmedSequenceFiles = [fileStore.readGlobalFile(path) for path in self.untrimmedSequenceIDs]

        # Report coverage of the latest outgroup on the trimmed ingroups.
        for trimmedIngroupSequence, ingroupSequence, ingroupName in zip(sequenceFiles, untrimmedSequenceFiles, self.ingroupNames):
            tmpIngroupCoverage = fileStore.getLocalTempFile()
            calculateCoverage(trimmedIngroupSequence, mostRecentResultsFile,
                              tmpIngroupCoverage)
            fileStore.logToMaster("Coverage on %s from outgroup #%d, %s: %s%% (current ingroup length %d, untrimmed length %d). Outgroup trimmed to %d bp from %d" % (ingroupName, self.outgroupNumber, self.outgroupNames[self.outgroupNumber - 1], percentCoverage(trimmedIngroupSequence, tmpIngroupCoverage), sequenceLength(trimmedIngroupSequence), sequenceLength(ingroupSequence), sequenceLength(trimmedOutgroup), sequenceLength(outgroupSequenceFiles[0])))

        # Convert the alignments' ingroup coordinates.
        ingroupConvertedResultsFile = fileStore.getLocalTempFile()
        if self.sequenceIDs == self.untrimmedSequenceIDs:
            # No need to convert ingroup coordinates on first run.
            shutil.copy(outgroupConvertedResultsFile,
                        ingroupConvertedResultsFile)
        else:
            cactus_call(parameters=["cactus_blast_convertCoordinates",
                                    "--onlyContig1",
                                    outgroupConvertedResultsFile,
                                    ingroupConvertedResultsFile,
                                    "1"])
        # Append the latest results to the accumulated outgroup coverage file
        if self.outgroupResultsID:
            outgroupResultsFile = fileStore.readGlobalFile(self.outgroupResultsID, mutable=True)
        else:
            outgroupResultsFile = fileStore.getLocalTempFile()
        with open(ingroupConvertedResultsFile) as results:
            with open(outgroupResultsFile, 'a') as output:
                output.write(results.read())

        self.outgroupResultsID = fileStore.writeGlobalFile(outgroupResultsFile)

        # Report coverage of all the outgroup alignments so far on the ingroups.
        ingroupCoverageFiles = []
        self.ingroupCoverageIDs = []
        for ingroupSequence, ingroupName in zip(untrimmedSequenceFiles, self.ingroupNames):
            ingroupCoverageFile = fileStore.getLocalTempFile()
            calculateCoverage(sequenceFile=ingroupSequence, cigarFile=outgroupResultsFile,
                              outputFile=ingroupCoverageFile, depthById=self.blastOptions.trimOutgroupDepth > 1)
            ingroupCoverageFiles.append(ingroupCoverageFile)
            self.ingroupCoverageIDs.append(fileStore.writeGlobalFile(ingroupCoverageFile))
            fileStore.logToMaster("Cumulative coverage of %d outgroups on ingroup %s: %s" % (self.outgroupNumber, ingroupName, percentCoverage(ingroupSequence, ingroupCoverageFile)))

        if len(self.outgroupSequenceIDs) > 1:
            # Trim ingroup seqs and recurse on the next outgroup.
            trimmedSeqs = []
            # Use the accumulated results so far to trim away the
            # aligned parts of the ingroups.
            for i, sequenceFile in enumerate(untrimmedSequenceFiles):
                outgroupCoverageFile = ingroupCoverageFiles[i]
                selfCoverageFile = fileStore.getLocalTempFile()
                coverageFile = fileStore.getLocalTempFile()
                if self.blastOptions.keepParalogs:
                    subtractBed(outgroupCoverageFile, selfCoverageFile, coverageFile)
                else:
                    coverageFile = outgroupCoverageFile

                trimmed = fileStore.getLocalTempFile()
                trimSequences(sequenceFile, coverageFile, trimmed,
                              complement=True, flanking=self.blastOptions.trimFlanking,
                              minSize=self.blastOptions.trimMinSize,
                              threshold=self.blastOptions.trimThreshold,
                              windowSize=self.blastOptions.trimWindowSize,
                              depth=self.blastOptions.trimOutgroupDepth)
                trimmedSeqs.append(trimmed)
            trimmedSeqIDs = [fileStore.writeGlobalFile(path, cleanup=True) for path in trimmedSeqs]
            return self.addChild(BlastFirstOutgroup(
                ingroupNames=self.ingroupNames,
                untrimmedSequenceIDs=self.untrimmedSequenceIDs,
                sequenceIDs=trimmedSeqIDs,
                outgroupNames=self.outgroupNames,
                outgroupSequenceIDs=self.outgroupSequenceIDs[1:],
                outgroupFragmentIDs=self.outgroupFragmentIDs,
                outgroupResultsID=self.outgroupResultsID,
                blastOptions=self.blastOptions,
                outgroupNumber=self.outgroupNumber + 1,
                ingroupCoverageIDs=self.ingroupCoverageIDs)).rv()
        else:
            # Finally, put together the ingroup and outgroup results.
            return (self.outgroupResultsID, self.outgroupFragmentIDs, self.ingroupCoverageIDs)
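The tail call above is the standard Toil idiom for an iterative pipeline: each round schedules the next round as a child job and forwards that child's promised return value with rv(). A minimal sketch of the idiom in isolation (the job class and its fields are illustrative, not Cactus code):

from toil.job import Job

class OneRound(Job):
    """Illustrative job that consumes one work item per round, mirroring
    the one-outgroup-per-round recursion above."""
    def __init__(self, items, accumulated):
        Job.__init__(self)
        self.items = items
        self.accumulated = accumulated

    def run(self, fileStore):
        accumulated = self.accumulated + [self.items[0]]
        if len(self.items) > 1:
            # Schedule the next round; rv() is a promise that resolves
            # once the child (and its own children) have run.
            return self.addChild(OneRound(self.items[1:], accumulated)).rv()
        return accumulated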
Code Example #45
def stopKtserver(dbElem):
    """Attempt to send the terminate signal to a ktserver."""
    cactus_call(parameters=['ktremotemgr', 'set'] + getRemoteParams(dbElem) + ['TERMINATE', '1'])
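stopKtserver only sets a sentinel key; shutdown happens when whatever process is supervising the ktserver notices that key. The loop below is an illustrative sketch of such a poll, assuming Kyoto Tycoon's standard ktremotemgr CLI; the helper name and polling details are assumptions, not Cactus code:

import subprocess
import time

def waitForTerminate(host, port, interval=5.0):
    # Poll until the TERMINATE key set by stopKtserver() appears;
    # 'ktremotemgr get' exits non-zero while the key is absent.
    while True:
        result = subprocess.run(['ktremotemgr', 'get',
                                 '-host', host, '-port', str(port),
                                 'TERMINATE'],
                                stdout=subprocess.DEVNULL,
                                stderr=subprocess.DEVNULL)
        if result.returncode == 0:
            return
        time.sleep(interval)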
Code Example #46
    def testCuTest(self):
        cactus_call(parameters=["referenceTests", getLogLevelString()])
Code Example #47
    def testPosetAlignerAPI(self):
        """Run all the cactus base aligner CuTests, fail if any of them fail.
        """
        cactus_call(parameters=["cactus_barTests", getLogLevelString()])
Code Example #48
    def testSplitAlignmentsOverlaps(self):
        self.inputCigars = [
            'cigar: simpleSeqB1 9 18 + simpleSeqA1 2 6 + 1.000000 M 3 I 5 M 1',
            'cigar: simpleSeqB1 9 18 + simpleSeqA1 2 6 + 4.000000 M 1 I 5 M 3',
            'cigar: simpleSeqZ1 0 1 + simpleSeqA1 6 7 + 3.000000 M 1',
            'cigar: simpleSeqB1 18 28 + simpleSeqA2 0 10 + 8.000000 M 1 I 2 M 2 D 2 M 5',
            'cigar: simpleSeqB1 28 30 + simpleSeqA2 6 8 + 3.000000 M 2',
            'cigar: simpleSeqB1 32 30 - simpleSeqA2 7 9 + 72.000000 M 2',
            'cigar: simpleSeqBC 9 0 - simpleSeqAC 0 10 + 5.000000 M 1 D 1 M 8',
            'cigar: simpleSeqA1 2 6 + simpleSeqB1 9 18 + 1.000000 M 3 D 5 M 1',
            'cigar: simpleSeqA1 2 6 + simpleSeqB1 9 18 + 4.000000 M 1 D 5 M 3',
            'cigar: simpleSeqA2 0 10 + simpleSeqB1 18 28 + 8.000000 M 1 D 2 M 2 I 2 M 5',
            'cigar: simpleSeqA2 6 8 + simpleSeqB1 28 30 + 3.000000 M 2',
            'cigar: simpleSeqA2 9 7 - simpleSeqB1 30 32 + 72.000000 M 2',
            'cigar: simpleSeqAC 10 0 - simpleSeqBC 0 9 + 5.000000 M 8 I 1 M 1',
            'cigar: simpleSeqD 0 5 + simpleSeqC1 0 5 + 2.000000 M 5',
            'cigar: simpleSeqNonExistent 0 10 + simpleSeqC1 0 10 + 0.500000 M 10',
            'cigar: simpleSeqD 5 10 + simpleSeqC1 5 10 + 8.000000 M 5',
            'cigar: simpleSeqC1 15 20 + simpleSeqC1 10 15 + 19.000000 M 5',
            'cigar: simpleSeqC1 10 15 + simpleSeqC1 15 20 + 19.000000 M 5',
            'cigar: simpleSeqC1 0 5 + simpleSeqD 0 5 + 2.000000 M 5',
            'cigar: simpleSeqC1 5 10 + simpleSeqD 5 10 + 8.000000 M 5',
            'cigar: simpleSeqC1 0 10 + simpleSeqNonExistent 0 10 + 0.500000 M 10',
            'cigar: simpleSeqA1 6 7 + simpleSeqZ1 0 1 + 3.000000 M 1'
        ]
        with open(self.simpleInputCigarPath, 'w') as fH:
            fH.write("\n".join(self.inputCigars) + "\n")

        cactus_call(parameters=["cactus_splitAlignmentOverlaps",
                                self.logLevelString,
                                self.simpleInputCigarPath,
                                self.simpleOutputCigarPath])

        with open(self.simpleOutputCigarPath, 'r') as fh:
            outputCigars = [cigar[:-1] for cigar in fh.readlines()]  # Remove newlines

        # Get the start and end coordinates of the input cigars on the
        # first sequence; these are the points at which overlapping
        # alignments must be chopped.
        ends = set()
        for inputCigar in self.inputCigars:
            name1, start1, end1, strand1 = inputCigar.split()[5:9]
            ends.add((name1, int(start1)))
            ends.add((name1, int(end1)))
            assert strand1 == "+"

        # Count of the expected number of chopped-up cigars
        totalExpectedCigars = 0

        # Split a flat list of ops into a prefix covering cutPoint bases
        # of the first sequence and the remaining suffix.
        def splitPrefixOps(ops, cutPoint):
            pOps, sOps = [], []
            j = 0
            for i in range(0, len(ops), 2):
                op, length = ops[i], int(ops[i + 1])
                assert op in ("I", "D", "M")
                if op == "I":
                    # Insertions consume no bases of the first sequence,
                    # so they always stay in the prefix.
                    pOps.append(op)
                    pOps.append(length)
                    continue
                if j + length <= cutPoint:
                    pOps.append(op)
                    pOps.append(length)
                    j += length
                    if j == cutPoint:
                        break
                else:
                    assert j + length > cutPoint
                    pOps.append(op)
                    pOps.append(cutPoint - j)
                    sOps.append(op)
                    sOps.append(length - (cutPoint - j))
                    break
            sOps += ops[i + 2:]

            return pOps, sOps

        # For each cigar:
        for inputCigar in self.inputCigars:
            name1, start1, end1, strand1 = inputCigar.split()[5:9]
            start1, end1 = int(start1), int(end1)
            assert strand1 == "+"
            name2, start2, end2, strand2 = inputCigar.split()[1:5]
            start2, end2 = int(start2), int(end2)
            score = float(inputCigar.split()[9])
            ops = inputCigar.split()[10:]

            # For each intermediate chop point
            i = start1
            for j in range(start1 + 1, end1 + 1):
                if (name1, j) in ends:
                    # Chop up the cigar
                    coordinates1 = name1, i, j, "+"

                    # Get the sublist of ops covering the interval [i, j)
                    pOps, subOps = splitPrefixOps(ops, i - start1)
                    subOps, sOps = splitPrefixOps(subOps, j - i)

                    # Bases of the second sequence consumed by a list of
                    # ops (deletions consume none)
                    x = lambda ops: sum([int(ops[k + 1]) for k in range(0, len(ops), 2) if ops[k] != 'D'])
                    k = x(pOps)
                    l = k + x(subOps)

                    # Get the second sequence's coordinates
                    if strand2 == "+":
                        coordinates2 = name2, start2 + k, start2 + l, strand2
                    else:
                        assert strand2 == "-"
                        coordinates2 = name2, start2 - k, start2 - l, strand2

                    choppedCigar = self.makeCigar(coordinates1, coordinates2, score, subOps)

                    # Check each chopped-up cigar is in the output
                    self.assertIn(choppedCigar, outputCigars)

                    # Increment the number of expected cigars
                    totalExpectedCigars += 1

                    # Advance the previous chop point
                    i = j

        # Check we have the expected number of cigars
        self.assertEqual(totalExpectedCigars, len(outputCigars))
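As a worked instance of the model this test encodes: the input alignment covering simpleSeqA2[0,10) (ops M 1 I 2 M 2 D 2 M 5) overlaps other input alignments whose simpleSeqA2 endpoints are 6, 8 and 7, 9, so the expected output chops it into the pieces [0,6), [6,7), [7,8), [8,9) and [9,10), each written with the corresponding sub-list of operations and rebased simpleSeqB1 coordinates.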
Code Example #49
    def testHalGeneratorFunctions(self):
        """Run all the CuTests, fail if any of them fail.
        """
        cactus_call(parameters=["cactus_halGeneratorTests", getLogLevelString()])