Example #1
def merge_pafs(job, paf_file_id_map):
    """ merge up some pafs """
    paf_paths = [
        job.fileStore.readGlobalFile(paf_id)
        for paf_id in paf_file_id_map.values()
    ]
    merged_path = job.fileStore.getLocalTempFile()
    catFiles(paf_paths, merged_path)
    return job.fileStore.writeGlobalFile(merged_path)
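A minimal sketch of how a job function like merge_pafs might be driven from a top-level Toil workflow; the paf_urls mapping and options.outputPaf below are illustrative assumptions, not part of the examples in this listing (makeURL and merge_pafs are the names used above).

from toil.common import Toil
from toil.job import Job

def run_merge_pafs(options, paf_urls):
    # hedged sketch: import the PAFs, run merge_pafs as the root job, export the result
    with Toil(options) as toil:
        paf_file_id_map = {name: toil.importFile(makeURL(path))
                           for name, path in paf_urls.items()}
        merged_id = toil.start(Job.wrapJobFn(merge_pafs, paf_file_id_map))
        toil.exportFile(merged_id, makeURL(options.outputPaf))  # outputPaf is a hypothetical option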
Example #2
def runCactusProgressive(options):
    with Toil(options) as toil:
        importSingularityImage()
        #Run the workflow
        if options.restart:
            halID = toil.restart()
        else:
            options.cactusDir = getTempDirectory()
            #Create the progressive cactus project
            projWrapper = ProjectWrapper(options)
            projWrapper.writeXml()

            pjPath = os.path.join(
                options.cactusDir, ProjectWrapper.alignmentDirName,
                '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)

            project = MultiCactusProject()

            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)

            project.readXML(pjPath)
            #import the sequences
            for genome, seq in project.inputSequenceMap.items():
                if os.path.isdir(seq):
                    tmpSeq = getTempFile()
                    catFiles([
                        os.path.join(seq, subSeq) for subSeq in os.listdir(seq)
                    ], tmpSeq)
                    seq = tmpSeq
                seq = makeURL(seq)
                project.inputSequenceIDMap[genome] = toil.importFile(seq)

            #import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(
                    makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()

            project.writeXML(pjPath)
            halID = toil.start(
                RunCactusPreprocessorThenProgressiveDown(
                    options, project, memory=configWrapper.getDefaultMemory()))

        toil.exportFile(halID, makeURL(options.outputHal))
Example #3
def combine_ref_contig_splits(job, original_ref_entry, remap_ref_entry):
    """ combine fa and paf files for splits for a ref contig """
    work_dir = job.fileStore.getLocalTempDir()

    # combine the event fas
    for event in original_ref_entry['fa']:
        if event in remap_ref_entry['fa']:
            orig_fa_path = os.path.join(work_dir, event + '.fa')
            remap_fa_path = os.path.join(work_dir, event + '.remap.fa')
            new_fa_path = os.path.join(work_dir, event + '.combine.fa')
            job.fileStore.readGlobalFile(original_ref_entry['fa'][event], orig_fa_path, mutable=True)
            job.fileStore.readGlobalFile(remap_ref_entry['fa'][event], remap_fa_path, mutable=True)
            catFiles([orig_fa_path, remap_fa_path], new_fa_path)
            original_ref_entry['fa'][event] = job.fileStore.writeGlobalFile(new_fa_path)
            os.remove(orig_fa_path)
            os.remove(remap_fa_path)
                
    return original_ref_entry
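catFiles comes from the shared Cactus utilities in all of these examples; a minimal stand-in that concatenates files byte-for-byte (a sketch, not the library implementation) would be:

import shutil

def cat_files(input_paths, output_path):
    # concatenate input_paths into output_path, mirroring how catFiles is used above
    with open(output_path, 'wb') as out_f:
        for path in input_paths:
            with open(path, 'rb') as in_f:
                shutil.copyfileobj(in_f, out_f)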
Example #4
def split_gfa(job, config, gfa_id, paf_ids, ref_contigs, other_contig,
              reference_event, mask_bed_id):
    """ Use rgfa-split to divide a GFA and PAF into chromosomes.  The GFA must be in minigraph RGFA output using
    the desired reference. """

    if not paf_ids:
        # we can bypass when, e.g., doing the second pass on ambiguous sequences but none are present
        return [None, None]

    work_dir = job.fileStore.getLocalTempDir()
    gfa_path = os.path.join(work_dir, "mg.gfa")
    paf_path = os.path.join(work_dir, "mg.paf")
    out_prefix = os.path.join(work_dir, "split_")
    bed_path = os.path.join(work_dir, "mask.bed")
    log_path = os.path.join(work_dir, "split.log")
    if (mask_bed_id):
        job.fileStore.readGlobalFile(mask_bed_id, bed_path)

    if gfa_id:
        job.fileStore.readGlobalFile(gfa_id, gfa_path)

    paf_paths = []
    for i, paf_id in enumerate(paf_ids):
        paf_paths.append(
            '{}.{}'.format(paf_path, i) if len(paf_ids) > 1 else paf_path)
        job.fileStore.readGlobalFile(paf_id, paf_paths[-1])
    if len(paf_paths) > 1:
        catFiles(paf_paths, paf_path)

    # get the minigraph "virtual" assembly name
    graph_event = getOptionalAttrib(findRequiredNode(config.xmlRoot,
                                                     "graphmap"),
                                    "assemblyName",
                                    default="_MINIGRAPH_")
    # and look up its unique id prefix.  this will be needed to pick its contigs out of the list
    mg_id = graph_event

    # get the specificity filters
    query_coverage = getOptionalAttrib(findRequiredNode(
        config.xmlRoot, "graphmap_split"),
                                       "minQueryCoverage",
                                       default="0")
    small_query_coverage = getOptionalAttrib(findRequiredNode(
        config.xmlRoot, "graphmap_split"),
                                             "minQuerySmallCoverage",
                                             default="0")
    small_coverage_threshold = getOptionalAttrib(findRequiredNode(
        config.xmlRoot, "graphmap_split"),
                                                 "minQuerySmallThreshold",
                                                 default="0")
    query_uniqueness = getOptionalAttrib(findRequiredNode(
        config.xmlRoot, "graphmap_split"),
                                         "minQueryUniqueness",
                                         default="0")
    max_gap = getOptionalAttrib(findRequiredNode(config.xmlRoot,
                                                 "graphmap_split"),
                                "maxGap",
                                default="0")
    amb_name = getOptionalAttrib(findRequiredNode(config.xmlRoot,
                                                  "graphmap_split"),
                                 "ambiguousName",
                                 default="_AMBIGUOUS_")

    cmd = [
        'rgfa-split', '-p', paf_path, '-b', out_prefix, '-n', query_coverage,
        '-N', small_query_coverage, '-T', small_coverage_threshold, '-Q',
        query_uniqueness, '-P', max_gap, '-a', amb_name, '-L', log_path
    ]
    if gfa_id:
        cmd += ['-g', gfa_path, '-G']
    if other_contig:
        cmd += ['-o', other_contig]
    if reference_event:
        cmd += ['-r', 'id={}|'.format(reference_event)]
    if mask_bed_id:
        cmd += ['-B', bed_path]
    min_mapq = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"),
                                 "minMAPQ")
    if min_mapq:
        cmd += ['-A', min_mapq]

    for contig in ref_contigs:
        cmd += ['-c', contig]

    cactus_call(parameters=cmd, work_dir=work_dir)

    output_id_map = {}
    for out_name in os.listdir(work_dir):
        file_name, ext = os.path.splitext(out_name)
        if file_name.startswith(os.path.basename(out_prefix)) and ext in [
                ".gfa", ".paf", ".fa_contigs"
        ]:
            name = file_name[len(os.path.basename(out_prefix)):]
            if name not in output_id_map:
                output_id_map[name] = {}
            output_id_map[name][ext[1:]] = job.fileStore.writeGlobalFile(
                os.path.join(work_dir, out_name))

    return output_id_map, job.fileStore.writeGlobalFile(log_path)
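split_gfa pulls its tuning parameters out of the cactus config XML via findRequiredNode and getOptionalAttrib. Minimal stand-ins with the behavior assumed here (find a child element or fail, read an attribute or fall back to a default; the real helpers also handle type conversion such as booleans) might look like:

def find_required_node(root, name):
    # return the first child element with the given tag, or fail loudly
    node = root.find(name)
    if node is None:
        raise RuntimeError("required config node {} not found".format(name))
    return node

def get_optional_attrib(node, attrib_name, typeFn=None, default=None):
    # return the attribute if present (optionally converted), otherwise the default
    if attrib_name not in node.attrib:
        return default
    value = node.attrib[attrib_name]
    return typeFn(value) if typeFn else value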
Example #5
def cat_beds(job, bed_ids):
    in_beds = [job.fileStore.readGlobalFile(bed_id) for bed_id in bed_ids]
    out_bed = job.fileStore.getLocalTempFile()
    catFiles(in_beds, out_bed)
    return job.fileStore.writeGlobalFile(out_bed)
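cat_beds is a typical Toil fan-in: upstream jobs each promise a BED file ID, and a follow-on concatenates them. A sketch of that wiring, with make_bed standing in for a hypothetical per-input child job:

def bed_pipeline(job, inputs):
    # fan out one make_bed child per input, then fan in with cat_beds on the promised IDs
    bed_promises = [job.addChildJobFn(make_bed, item).rv() for item in inputs]
    return job.addFollowOnJobFn(cat_beds, bed_promises).rv()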
Example #6
def runCactusGraphMapSplit(options):
    with Toil(options) as toil:
        importSingularityImage(options)
        #Run the workflow
        if options.restart:
            wf_output = toil.restart()
        else:
            options.cactusDir = getTempDirectory()

            #load cactus config
            configNode = ET.parse(options.configFile).getroot()
            config = ConfigWrapper(configNode)
            config.substituteAllPredefinedConstantsWithLiterals()

            # load up the contigs if any
            ref_contigs = set(options.refContigs)
            # todo: use import?
            if options.refContigsFile:
                with open(options.refContigsFile, 'r') as rc_file:
                    for line in rc_file:
                        if len(line.strip()):
                            ref_contigs.add(line.strip().split()[0])

            if options.otherContig:
                assert options.otherContig not in ref_contigs

            # get the minigraph "virtual" assembly name
            graph_event = getOptionalAttrib(findRequiredNode(
                configNode, "graphmap"),
                                            "assemblyName",
                                            default="_MINIGRAPH_")

            # load the seqfile
            seqFile = SeqFile(options.seqFile)

            #import the graph
            gfa_id = toil.importFile(makeURL(options.minigraphGFA))

            #import the paf
            paf_id = toil.importFile(makeURL(options.graphmapPAF))

            #import the sequences (that we need to align for the given event, ie leaves and outgroups)
            seqIDMap = {}
            leaves = set([
                seqFile.tree.getName(node)
                for node in seqFile.tree.getLeaves()
            ])

            if graph_event not in leaves:
                raise RuntimeError(
                    "Minigraph name {} not found in seqfile".format(
                        graph_event))
            if options.reference and options.reference not in leaves:
                raise RuntimeError(
                    "Name given with --reference {} not found in seqfile".
                    format(options.reference))

            for genome, seq in seqFile.pathMap.items():
                if genome in leaves:
                    if os.path.isdir(seq):
                        tmpSeq = getTempFile()
                        catFiles([
                            os.path.join(seq, subSeq)
                            for subSeq in os.listdir(seq)
                        ], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)
                    logger.info("Importing {}".format(seq))
                    seqIDMap[genome] = (seq, toil.importFile(seq))

            # run the workflow
            wf_output = toil.start(
                Job.wrapJobFn(graphmap_split_workflow, options, config,
                              seqIDMap, gfa_id, options.minigraphGFA, paf_id,
                              options.graphmapPAF, ref_contigs,
                              options.otherContig))

        #export the split data
        export_split_data(toil, wf_output[0], wf_output[1], wf_output[2:],
                          options.outDir, config)
Example #7
def make_align_job(options, toil):
    options.cactusDir = getTempDirectory()

    # apply path overrides.  this was necessary for wdl which doesn't take kindly to
    # text files of local paths (ie seqfile).  one way to fix would be to add support
    # for s3 paths and force wdl to use it.  a better way would be a more fundamental
    # interface shift away from files of paths throughout all of cactus
    if options.pathOverrides:
        seqFile = SeqFile(options.seqFile)
        configNode = ET.parse(options.configFile).getroot()
        config = ConfigWrapper(configNode)
        tree = MultiCactusTree(seqFile.tree)
        tree.nameUnlabeledInternalNodes(
            prefix=config.getDefaultInternalNodePrefix())
        for name, override in zip(options.pathOverrideNames,
                                  options.pathOverrides):
            seqFile.pathMap[name] = override
        override_seq = os.path.join(options.cactusDir, 'seqFile.override')
        with open(override_seq, 'w') as out_sf:
            out_sf.write(str(seqFile))
        options.seqFile = override_seq

    if not options.root:
        seqFile = SeqFile(options.seqFile)
        configNode = ET.parse(options.configFile).getroot()
        config = ConfigWrapper(configNode)
        mcTree = MultiCactusTree(seqFile.tree)
        mcTree.nameUnlabeledInternalNodes(
            prefix=config.getDefaultInternalNodePrefix())
        options.root = mcTree.getRootName()

    if options.acyclic:
        seqFile = SeqFile(options.seqFile)
        tree = MultiCactusTree(seqFile.tree)
        leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
        if options.acyclic not in leaves:
            raise RuntimeError(
                "Genome specified with --acyclic, {}, not found in tree leaves"
                .format(options.acyclic))

    #to be consistent with all-in-one cactus, we make sure the project
    #isn't limiting itself to the subtree (todo: parameterize so root can
    #be passed through from prepare to blast/align)
    proj_options = copy.deepcopy(options)
    proj_options.root = None
    #Create the progressive cactus project (as we do in runCactusProgressive)
    projWrapper = ProjectWrapper(proj_options,
                                 proj_options.configFile,
                                 ignoreSeqPaths=options.root)
    projWrapper.writeXml()

    pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                          '%s_project.xml' % ProjectWrapper.alignmentDirName)
    assert os.path.exists(pjPath)

    project = MultiCactusProject()

    if not os.path.isdir(options.cactusDir):
        os.makedirs(options.cactusDir)

    project.readXML(pjPath)

    # open up the experiment (as we do in ProgressiveUp.run)
    # note that we copy the path into the options here
    experimentFile = project.expMap[options.root]
    expXml = ET.parse(experimentFile).getroot()
    experiment = ExperimentWrapper(expXml)
    configPath = experiment.getConfigPath()
    configXml = ET.parse(configPath).getroot()

    seqIDMap = dict()
    tree = MultiCactusTree(experiment.getTree()).extractSubTree(options.root)
    leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
    outgroups = experiment.getOutgroupGenomes()
    genome_set = set(leaves + outgroups)

    # this is a hack to allow specifying all the input on the command line, rather than using suffix lookups
    def get_input_path(suffix=''):
        base_path = options.cigarsFile[0]
        for input_path in options.cigarsFile:
            if suffix and input_path.endswith(suffix):
                return input_path
            if os.path.basename(base_path).startswith(
                    os.path.basename(input_path)):
                base_path = input_path
        return base_path + suffix

    # import the outgroups
    outgroupIDs = []
    outgroup_fragment_found = False
    for i, outgroup in enumerate(outgroups):
        try:
            outgroupID = toil.importFile(
                makeURL(get_input_path('.og_fragment_{}'.format(i))))
            outgroupIDs.append(outgroupID)
            experiment.setSequenceID(outgroup, outgroupID)
            outgroup_fragment_found = True
            assert not options.pangenome
        except:
            # we assume that input is not coming from cactus blast, so we'll treat output
            # sequences normally and not go looking for fragments
            outgroupIDs = []
            break

    #import the sequences (that we need to align for the given event, ie leaves and outgroups)
    for genome, seq in list(project.inputSequenceMap.items()):
        if genome in leaves or (not outgroup_fragment_found
                                and genome in outgroups):
            if os.path.isdir(seq):
                tmpSeq = getTempFile()
                catFiles(
                    [os.path.join(seq, subSeq) for subSeq in os.listdir(seq)],
                    tmpSeq)
                seq = tmpSeq
            seq = makeURL(seq)

            logger.info("Importing {}".format(seq))
            experiment.setSequenceID(genome, toil.importFile(seq))

    if not outgroup_fragment_found:
        outgroupIDs = [
            experiment.getSequenceID(outgroup) for outgroup in outgroups
        ]

    # write back the experiment, as CactusWorkflowArguments wants a path
    experiment.writeXML(experimentFile)

    #import cactus config
    if options.configFile:
        cactusConfigID = toil.importFile(makeURL(options.configFile))
    else:
        cactusConfigID = toil.importFile(makeURL(project.getConfigPath()))
    project.setConfigID(cactusConfigID)

    project.syncToFileStore(toil)
    configNode = ET.parse(project.getConfigPath()).getroot()
    configWrapper = ConfigWrapper(configNode)
    configWrapper.substituteAllPredefinedConstantsWithLiterals()

    if options.singleCopySpecies:
        findRequiredNode(
            configWrapper.xmlRoot,
            "caf").attrib["alignmentFilter"] = "singleCopyEvent:{}".format(
                options.singleCopySpecies)

    if options.barMaskFilter:
        findRequiredNode(
            configWrapper.xmlRoot,
            "bar").attrib["partialOrderAlignmentMaskFilter"] = str(
                options.barMaskFilter)

    if options.pangenome:
        # turn off the megablock filter as it ruins non-all-to-all alignments
        findRequiredNode(configWrapper.xmlRoot,
                         "caf").attrib["minimumBlockHomologySupport"] = "0"
        findRequiredNode(
            configWrapper.xmlRoot,
            "caf").attrib["minimumBlockDegreeToCheckSupport"] = "9999999999"
        # turn off mapq filtering
        findRequiredNode(configWrapper.xmlRoot,
                         "caf").attrib["runMapQFiltering"] = "0"
        # more iterations here helps quite a bit to reduce underalignment
        findRequiredNode(configWrapper.xmlRoot,
                         "caf").attrib["maxRecoverableChainsIterations"] = "50"
        # turn down minimum block degree to get a fat ancestor
        findRequiredNode(configWrapper.xmlRoot,
                         "bar").attrib["minimumBlockDegree"] = "1"
        # turn on POA
        findRequiredNode(configWrapper.xmlRoot,
                         "bar").attrib["partialOrderAlignment"] = "1"
        # save it
        if not options.batch:
            pg_file = options.outHal + ".pg-conf.xml"
            if pg_file.startswith('s3://'):
                pg_temp_file = getTempFile()
            else:
                pg_temp_file = pg_file
            configWrapper.writeXML(pg_temp_file)
            if pg_file.startswith('s3://'):
                write_s3(pg_temp_file,
                         pg_file,
                         region=get_aws_region(options.jobStore))
            logger.info("pangenome configuration overrides saved in {}".format(
                pg_file))

    workFlowArgs = CactusWorkflowArguments(options,
                                           experimentFile=experimentFile,
                                           configNode=configNode,
                                           seqIDMap=project.inputSequenceIDMap)

    #import the files that cactus-blast made
    workFlowArgs.alignmentsID = toil.importFile(makeURL(get_input_path()))
    workFlowArgs.secondaryAlignmentsID = None
    if not options.pafInput:
        try:
            workFlowArgs.secondaryAlignmentsID = toil.importFile(
                makeURL(get_input_path('.secondary')))
        except:
            pass
    workFlowArgs.outgroupFragmentIDs = outgroupIDs
    workFlowArgs.ingroupCoverageIDs = []
    if outgroup_fragment_found and len(outgroups) > 0:
        for i in range(len(leaves)):
            workFlowArgs.ingroupCoverageIDs.append(
                toil.importFile(
                    makeURL(get_input_path('.ig_coverage_{}'.format(i)))))

    align_job = Job.wrapJobFn(run_cactus_align,
                              configWrapper,
                              workFlowArgs,
                              project,
                              checkpointInfo=options.checkpointInfo,
                              doRenaming=options.nonCactusInput,
                              pafInput=options.pafInput,
                              pafSecondaries=options.usePafSecondaries,
                              doVG=options.outVG,
                              doGFA=options.outGFA,
                              delay=options.stagger,
                              eventNameAsID=options.eventNameAsID,
                              acyclicEvent=options.acyclic)
    return align_job
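A sketch of how the job returned by make_align_job might be started from a driver, mirroring the restart pattern of the other examples; exactly what the job returns and how its outputs are exported is left out here rather than guessed at.

def runCactusAlign(options):
    with Toil(options) as toil:
        importSingularityImage(options)
        if options.restart:
            results = toil.restart()
        else:
            align_job = make_align_job(options, toil)
            results = toil.start(align_job)
        # exporting the outputs (e.g. the HAL) would follow here, as in the other examples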
Example #8
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument("outputHal", type=str, help="Output HAL file")

    #Progressive Cactus Options
    parser.add_argument("--database",
                        dest="database",
                        help="Database type: tokyo_cabinet or kyoto_tycoon"
                        " [default: %(default)s]",
                        default="kyoto_tycoon")
    parser.add_argument("--configFile",
                        dest="configFile",
                        help="Specify cactus configuration file",
                        default=None)
    parser.add_argument(
        "--root",
        dest="root",
        help="Name of ancestral node (which"
        " must appear in NEWICK tree in <seqfile>) to use as a "
        "root for the alignment.  Any genomes not below this node "
        "in the tree may be used as outgroups but will never appear"
        " in the output.  If no root is specifed then the root"
        " of the tree is used. ",
        default=None)
    parser.add_argument("--latest",
                        dest="latest",
                        action="store_true",
                        help="Use the latest, locally-built docker container "
                        "rather than pulling from quay.io")
    parser.add_argument("--binariesMode",
                        choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries",
                        default=None)

    options = parser.parse_args()
    options.cactusDir = getTempDirectory()

    setupBinaries(options)
    setLoggingFromOptions(options)

    # Mess with some toil options to create useful defaults.

    # Caching generally slows down the cactus workflow, plus some
    # methods like readGlobalFileStream don't support forced
    # reads directly from the job store rather than from cache.
    options.disableCaching = True
    # Job chaining breaks service termination timing, causing unused
    # databases to accumulate and waste memory for no reason.
    options.disableChaining = True
    # The default deadlockWait is currently 60 seconds. This can cause
    # issues if the database processes take a while to actually begin
    # after they're issued. Change it to at least an hour so that we
    # don't preemptively declare a deadlock.
    if options.deadlockWait is None or options.deadlockWait < 3600:
        options.deadlockWait = 3600
    if options.retryCount is None:
        # If the user didn't specify a retryCount value, make it 5
        # instead of Toil's default (1).
        options.retryCount = 5

    #Create the progressive cactus project
    projWrapper = ProjectWrapper(options)
    projWrapper.writeXml()

    pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                          '%s_project.xml' % ProjectWrapper.alignmentDirName)
    assert os.path.exists(pjPath)

    project = MultiCactusProject()

    if not os.path.isdir(options.cactusDir):
        os.makedirs(options.cactusDir)

    with Toil(options) as toil:
        importSingularityImage()
        #Run the workflow
        if options.restart:
            halID = toil.restart()
        else:
            project.readXML(pjPath)
            #import the sequences
            seqIDs = []
            for seq in project.getInputSequencePaths():
                if os.path.isdir(seq):
                    tmpSeq = getTempFile()
                    catFiles([
                        os.path.join(seq, subSeq) for subSeq in os.listdir(seq)
                    ], tmpSeq)
                    seq = tmpSeq
                seq = makeURL(seq)
                seqIDs.append(toil.importFile(seq))
            project.setInputSequenceIDs(seqIDs)

            #import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(
                    makeURL(project.getConfigPath()))
            logger.info("Setting config id to: %s" % cactusConfigID)
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()

            project.writeXML(pjPath)
            halID = toil.start(
                RunCactusPreprocessorThenProgressiveDown(
                    options, project, memory=configWrapper.getDefaultMemory()))

        toil.exportFile(halID, makeURL(options.outputHal))
Example #9
def runCactusGraphMap(options):
    with Toil(options) as toil:
        importSingularityImage(options)
        #Run the workflow
        if options.restart:
            paf_id, gfa_fa_id, gaf_id_map = toil.restart()
        else:
            options.cactusDir = getTempDirectory()

            # apply path overrides.  this was necessary for wdl which doesn't take kindly to
            # text files of local paths (ie seqfile).  one way to fix would be to add support
            # for s3 paths and force wdl to use it.  a better way would be a more fundamental
            # interface shift away from files of paths throughout all of cactus
            if options.pathOverrides:
                seqFile = SeqFile(options.seqFile)
                configNode = ET.parse(options.configFile).getroot()
                config = ConfigWrapper(configNode)
                tree = MultiCactusTree(seqFile.tree)
                tree.nameUnlabeledInternalNodes(prefix = config.getDefaultInternalNodePrefix())
                for name, override in zip(options.pathOverrideNames, options.pathOverrides):
                    seqFile.pathMap[name] = override
                override_seq = os.path.join(options.cactusDir, 'seqFile.override')
                with open(override_seq, 'w') as out_sf:
                    out_sf.write(str(seqFile))
                options.seqFile = override_seq

            #load cactus config
            configNode = ET.parse(options.configFile).getroot()
            config = ConfigWrapper(configNode)
            config.substituteAllPredefinedConstantsWithLiterals()

            #apply the maskfilter override
            if options.maskFilter is not None:
                findRequiredNode(configNode, "graphmap").attrib["maskFilter"] = str(options.maskFilter)

            # get the minigraph "virtual" assembly name
            graph_event = getOptionalAttrib(findRequiredNode(configNode, "graphmap"), "assemblyName", default="_MINIGRAPH_")

            # load the seqfile
            seqFile = SeqFile(options.seqFile)

            if options.refFromGFA:
                if options.refFromGFA not in seqFile.pathMap:
                    raise RuntimeError("{}, specified with --refFromGFA, was not found in the seqfile".format(options.refFromGFA))
                # we're not going to need the fasta for anything, so forget about it now
                del seqFile.pathMap[options.refFromGFA]
                
            if not options.outputFasta and graph_event not in seqFile.pathMap:
                raise RuntimeError("{} assembly not found in seqfile so it must be specified with --outputFasta".format(graph_event))

            #import the graph
            gfa_id = toil.importFile(makeURL(options.minigraphGFA))

            #import the sequences (that we need to align for the given event, ie leaves and outgroups)
            seqIDMap = {}
            leaves = set([seqFile.tree.getName(node) for node in seqFile.tree.getLeaves()])
            for genome, seq in seqFile.pathMap.items():
                if genome != graph_event and genome in leaves and genome != options.refFromGFA:
                    if os.path.isdir(seq):
                        tmpSeq = getTempFile()
                        catFiles([os.path.join(seq, subSeq) for subSeq in os.listdir(seq)], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)
                    logger.info("Importing {}".format(seq))
                    seqIDMap[genome] = (seq, toil.importFile(seq))

            # run the workflow
            paf_id, gfa_fa_id, gaf_id_map = toil.start(Job.wrapJobFn(minigraph_workflow, options, config, seqIDMap, gfa_id, graph_event))

        #export the paf
        toil.exportFile(paf_id, makeURL(options.outputPAF))
        if gfa_fa_id:
            toil.exportFile(gfa_fa_id, makeURL(options.outputFasta))

        #export the gafs
        if options.outputGAFDir:
            for event, gaf_id in gaf_id_map.items():
                gaf_path = os.path.join(options.outputGAFDir, '{}.gaf.gz'.format(event))
                toil.exportFile(gaf_id, makeURL(gaf_path))

        # update the input seqfile (in place!)
        if options.outputFasta:
            add_genome_to_seqfile(options.seqFile, makeURL(options.outputFasta), graph_event)
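add_genome_to_seqfile itself is not shown above. Assuming it follows the same SeqFile round-trip used for path overrides elsewhere in these examples, an illustrative version could be as small as:

def add_genome_to_seqfile(seqfile_path, fasta_url, name):
    # illustrative sketch only: point `name` at the new fasta and rewrite the seqfile in place
    seq_file = SeqFile(seqfile_path)
    seq_file.pathMap[name] = fasta_url
    with open(seqfile_path, 'w') as out_sf:
        out_sf.write(str(seq_file))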
Example #10
def split_gfa(job, config, gfa_id, paf_ids, ref_contigs, other_contig, reference_event, mask_bed_id):
    """ Use rgfa-split to divide a GFA and PAF into chromosomes.  The GFA must be in minigraph RGFA output using
    the desired reference. """

    if not paf_ids:
        # we can bypass when, e.g., doing the second pass on ambiguous sequences but none are present
        return [None, None]

    if not gfa_id and not getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "remap", typeFn=bool, default=False):
        # also bypass if remapping is off in the config (we know it's the second pass because gfa_id is None)
        return [None, None]
    
    work_dir = job.fileStore.getLocalTempDir()
    gfa_path = os.path.join(work_dir, "mg.gfa")
    paf_path = os.path.join(work_dir, "mg.paf")
    out_prefix = os.path.join(work_dir, "split_")
    bed_path = os.path.join(work_dir, "mask.bed")
    log_path = os.path.join(work_dir, "split.log")
    if (mask_bed_id):
        job.fileStore.readGlobalFile(mask_bed_id, bed_path)

    if gfa_id:
        job.fileStore.readGlobalFile(gfa_id, gfa_path)
        
    paf_paths = []
    for i, paf_id in enumerate(paf_ids):
        paf_paths.append('{}.{}'.format(paf_path, i) if len(paf_ids) > 1 else paf_path)
        job.fileStore.readGlobalFile(paf_id, paf_paths[-1])
    if len(paf_paths) > 1:
        catFiles(paf_paths, paf_path)
    
    # get the minigraph "virtual" assembly name
    graph_event = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "assemblyName", default="_MINIGRAPH_")
    # and look up its unique id prefix.  this will be needed to pick its contigs out of the list
    mg_id = graph_event

    # get the specificity filters
    query_coverage = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "minQueryCoverage", default="0")
    small_query_coverage = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "minQuerySmallCoverage", default="0")
    small_coverage_threshold = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "minQuerySmallThreshold", default="0")
    query_uniqueness = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "minQueryUniqueness", default="0")
    max_gap = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "maxGap", default="0")
    amb_name = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "ambiguousName", default="_AMBIGUOUS_")

    cmd = ['rgfa-split',
           '-p', paf_path,
           '-b', out_prefix,
           '-n', query_coverage,
           '-N', small_query_coverage,
           '-T', small_coverage_threshold,
           '-Q', query_uniqueness,
           '-P', max_gap,
           '-a', amb_name,
           '-L', log_path]
    if gfa_id:
        cmd += ['-g', gfa_path, '-G']
    if other_contig:
        cmd += ['-o', other_contig]
    if reference_event:
        cmd += ['-r', 'id={}|'.format(reference_event)]
    if mask_bed_id:
        cmd += ['-B', bed_path]
    min_mapq = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "minMAPQ")
    if min_mapq:
        cmd += ['-A', min_mapq]
    # optional stuff added to second pass:
    if not gfa_id:
        remap_opts = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "remapSplitOptions", default=None)
        if remap_opts:
            cmd += remap_opts.split(' ')        
    for contig in ref_contigs:
        cmd += ['-c', contig]

    cactus_call(parameters=cmd, work_dir=work_dir)

    output_id_map = {}
    for out_name in os.listdir(work_dir):
        file_name, ext = os.path.splitext(out_name)
        if file_name.startswith(os.path.basename(out_prefix)) and ext in [".gfa", ".paf", ".fa_contigs"] and \
           os.path.isfile(os.path.join(work_dir, file_name + ".fa_contigs")):
            name = file_name[len(os.path.basename(out_prefix)):]
            if name not in output_id_map:
                output_id_map[name] = {}
            if ext == '.paf':
                # apply the hacky naming correction so that subpaths have no special characters in the hal (to make hubs happy)
                # this gets undone by hal2vg
                cactus_call(parameters=['sed', '-i', '-e', 's/\([^:]*\):\([0-9]*\)-\([0-9]*\)/echo "\\1_sub_$((\\2-1))_\\3"/e',
                                        '-e', 's/ /\t/g', os.path.join(work_dir, out_name)]) 
            output_id_map[name][ext[1:]] = job.fileStore.writeGlobalFile(os.path.join(work_dir, out_name))
            
    return output_id_map, job.fileStore.writeGlobalFile(log_path)
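The sed expression near the end of this second-pass split_gfa is terse: it rewrites names of the form name:start-end into name_sub_{start-1}_{end} and then restores tab separators. A rough Python equivalent of that renaming, applied here only to the PAF query-name column for illustration:

import re

def rename_paf_subpaths(in_path, out_path):
    # sketch of the sed rename above: "chr1:1000-2000" -> "chr1_sub_999_2000"
    sub_re = re.compile(r'^([^:\t]*):([0-9]+)-([0-9]+)$')
    with open(in_path) as in_f, open(out_path, 'w') as out_f:
        for line in in_f:
            fields = line.rstrip('\n').split('\t')
            m = sub_re.match(fields[0])
            if m:
                fields[0] = '{}_sub_{}_{}'.format(m.group(1), int(m.group(2)) - 1, m.group(3))
            out_f.write('\t'.join(fields) + '\n')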
Example #11
def runCactusBlastOnly(options):
    with Toil(options) as toil:
        importSingularityImage(options)
        #Run the workflow
        if options.restart:
            outWorkFlowArgs = toil.restart()
        else:
            options.cactusDir = getTempDirectory()

            # apply path overrides.  this was necessary for wdl which doesn't take kindly to
            # text files of local paths (ie seqfile).  one way to fix would be to add support
            # for s3 paths and force wdl to use it.  a better way would be a more fundamental
            # interface shift away from files of paths throughout all of cactus
            if options.pathOverrides:
                seqFile = SeqFile(options.seqFile)
                configNode = ET.parse(options.configFile).getroot()
                config = ConfigWrapper(configNode)
                tree = MultiCactusTree(seqFile.tree)
                tree.nameUnlabeledInternalNodes(
                    prefix=config.getDefaultInternalNodePrefix())
                for name, override in zip(options.pathOverrideNames,
                                          options.pathOverrides):
                    seqFile.pathMap[name] = override
                override_seq = os.path.join(options.cactusDir,
                                            'seqFile.override')
                with open(override_seq, 'w') as out_sf:
                    out_sf.write(str(seqFile))
                options.seqFile = override_seq

            #to be consistent with all-in-one cactus, we make sure the project
            #isn't limiting itself to the subtree (todo: parameterize so root can
            #be passed through from prepare to blast/align)
            proj_options = copy.deepcopy(options)
            proj_options.root = None
            #Create the progressive cactus project (as we do in runCactusProgressive)
            projWrapper = ProjectWrapper(proj_options,
                                         proj_options.configFile,
                                         ignoreSeqPaths=options.root)
            projWrapper.writeXml()

            pjPath = os.path.join(
                options.cactusDir, ProjectWrapper.alignmentDirName,
                '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)

            project = MultiCactusProject()

            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)

            project.readXML(pjPath)

            # open up the experiment (as we do in ProgressiveUp.run)
            # note that we copy the path into the options here
            experimentFile = project.expMap[options.root]
            expXml = ET.parse(experimentFile).getroot()
            logger.info("Experiment {}".format(ET.tostring(expXml)))
            experiment = ExperimentWrapper(expXml)
            configPath = experiment.getConfigPath()
            configXml = ET.parse(configPath).getroot()

            seqIDMap = dict()
            tree = MultiCactusTree(experiment.getTree()).extractSubTree(
                options.root)
            leaves = tree.getChildNames(tree.getRootName())
            outgroups = experiment.getOutgroupGenomes()
            genome_set = set(leaves + outgroups)
            logger.info("Genomes in blastonly, {}: {}".format(
                options.root, list(genome_set)))

            print(str(project.inputSequenceMap))

            #import the sequences (that we need to align for the given event, ie leaves and outgroups)
            for genome, seq in list(project.inputSequenceMap.items()):
                if genome in genome_set:
                    if os.path.isdir(seq):
                        tmpSeq = getTempFile()
                        catFiles([
                            os.path.join(seq, subSeq)
                            for subSeq in os.listdir(seq)
                        ], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)
                    project.inputSequenceIDMap[genome] = toil.importFile(seq)
                else:
                    # out-of-scope sequences will only cause trouble later on
                    del project.inputSequenceMap[genome]

            #import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(
                    makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()

            workFlowArgs = CactusWorkflowArguments(
                options,
                experimentFile=experimentFile,
                configNode=configNode,
                seqIDMap=project.inputSequenceIDMap)

            outWorkFlowArgs = toil.start(
                CactusTrimmingBlastPhase(standAlone=True,
                                         cactusWorkflowArguments=workFlowArgs,
                                         phaseName="trimBlast"))

        # export the alignments
        toil.exportFile(outWorkFlowArgs.alignmentsID,
                        makeURL(options.outputFile))
        # optional secondary alignments
        if outWorkFlowArgs.secondaryAlignmentsID:
            toil.exportFile(outWorkFlowArgs.secondaryAlignmentsID,
                            makeURL(options.outputFile) + '.secondary')
        # outgroup fragments and coverage are necessary for cactus-align, as the sequence names got changed in the alignments above
        for i, outgroupFragmentID in enumerate(
                outWorkFlowArgs.outgroupFragmentIDs):
            toil.exportFile(
                outgroupFragmentID,
                makeURL(options.outputFile) + '.og_fragment_{}'.format(i))
        # cactus-align can recompute coverage on the fly, but we save them because we have them
        for i, ingroupCoverageID in enumerate(
                outWorkFlowArgs.ingroupCoverageIDs):
            toil.exportFile(
                ingroupCoverageID,
                makeURL(options.outputFile) + '.ig_coverage_{}'.format(i))
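The files exported here are what Example #7's get_input_path suffix lookups expect to find: the primary alignments file plus optional .secondary, .og_fragment_{i} and .ig_coverage_{i} companions. A small illustration of the correspondence (the path is hypothetical):

# hypothetical blast output prefix, showing how the exports above map onto cactus-align's suffix lookups
blast_output = 'steps/anc0.cigar'
align_inputs = [
    blast_output,                     # get_input_path()                 -> primary alignments
    blast_output + '.secondary',      # get_input_path('.secondary')     -> secondary alignments
    blast_output + '.og_fragment_0',  # get_input_path('.og_fragment_0') -> outgroup fragment
    blast_output + '.ig_coverage_0',  # get_input_path('.ig_coverage_0') -> ingroup coverage
]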
Example #12
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help = "Seq file")
    parser.add_argument("outputHal", type=str, help = "Output HAL file")

    #Progressive Cactus Options
    parser.add_argument("--database", dest="database",
                      help="Database type: tokyo_cabinet or kyoto_tycoon"
                      " [default: %(default)s]",
                      default="kyoto_tycoon")
    parser.add_argument("--configFile", dest="configFile",
                      help="Specify cactus configuration file",
                      default=None)
    parser.add_argument("--root", dest="root", help="Name of ancestral node (which"
                      " must appear in NEWICK tree in <seqfile>) to use as a "
                      "root for the alignment.  Any genomes not below this node "
                      "in the tree may be used as outgroups but will never appear"
                      " in the output.  If no root is specifed then the root"
                      " of the tree is used. ", default=None)   
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the the specified pre-built containter image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)

    options = parser.parse_args()

    setupBinaries(options)
    setLoggingFromOptions(options)

    # Mess with some toil options to create useful defaults.

    # Caching generally slows down the cactus workflow, plus some
    # methods like readGlobalFileStream don't support forced
    # reads directly from the job store rather than from cache.
    options.disableCaching = True
    # Job chaining breaks service termination timing, causing unused
    # databases to accumulate and waste memory for no reason.
    options.disableChaining = True
    # The default deadlockWait is currently 60 seconds. This can cause
    # issues if the database processes take a while to actually begin
    # after they're issued. Change it to at least an hour so that we
    # don't preemptively declare a deadlock.
    if options.deadlockWait is None or options.deadlockWait < 3600:
        options.deadlockWait = 3600
    if options.retryCount is None:
        # If the user didn't specify a retryCount value, make it 5
        # instead of Toil's default (1).
        options.retryCount = 5

    with Toil(options) as toil:
        importSingularityImage()
        #Run the workflow
        if options.restart:
            halID = toil.restart()
        else:
            options.cactusDir = getTempDirectory()
            #Create the progressive cactus project 
            projWrapper = ProjectWrapper(options)
            projWrapper.writeXml()

            pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                                  '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)

            project = MultiCactusProject()

            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)

            project.readXML(pjPath)
            #import the sequences
            seqIDs = []
            print "Importing %s sequences" % (len(project.getInputSequencePaths()))
            for seq in project.getInputSequencePaths():
                if os.path.isdir(seq):
                    tmpSeq = getTempFile()
                    catFiles([os.path.join(seq, subSeq) for subSeq in os.listdir(seq)], tmpSeq)
                    seq = tmpSeq
                seq = makeURL(seq)
                seqIDs.append(toil.importFile(seq))
            project.setInputSequenceIDs(seqIDs)

            #import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()


            project.writeXML(pjPath)
            halID = toil.start(RunCactusPreprocessorThenProgressiveDown(options, project, memory=configWrapper.getDefaultMemory()))

        toil.exportFile(halID, makeURL(options.outputHal))