예제 #1
0
def run_prepend_unique_ids(job, cactusWorkflowArguments, project, cactus_blast_input):
    """ prepend the unique ids on the input fasta.  this is required for cactus to work (would be great to relax it though)

    Toil job function.

    Parameters:
        job: the Toil job, used for fileStore access.
        cactusWorkflowArguments: workflow state; its experimentWrapper
            sequence ids, alignmentsID/secondaryAlignmentsID/
            outgroupFragmentIDs and totalSequenceSize are updated in place.
        project: unused in this function; kept for interface compatibility
            with callers.
        cactus_blast_input: if True, the alignment cigar files already use
            the uniquified names, so they are left untouched.

    Returns:
        the mutated cactusWorkflowArguments.
    """

    # note, there is an order dependence to everything where we have to match what was done in cactus_workflow
    # (so the code is pasted exactly as it is there)
    # this is horrible and needs to be fixed via drastic interface refactor
    exp = cactusWorkflowArguments.experimentWrapper
    # (event, file-store id) pairs for every ingroup genome that has sequence
    ingroupsAndOriginalIDs = [(g, exp.getSequenceID(g)) for g in exp.getGenomesWithSequence() if g not in exp.getOutgroupGenomes()]
    # localize the ingroup fastas, preserving the pair order above
    sequences = [job.fileStore.readGlobalFile(id) for id in map(itemgetter(1), ingroupsAndOriginalIDs)]
    cactusWorkflowArguments.totalSequenceSize = sum(os.stat(x).st_size for x in sequences)
    renamedInputSeqDir = job.fileStore.getLocalTempDir()
    # filled by prependUniqueIDs: original contig name -> uniquified name
    id_map = {}
    uniqueFas = prependUniqueIDs(sequences, renamedInputSeqDir, id_map)
    uniqueFaIDs = [job.fileStore.writeGlobalFile(seq, cleanup=True) for seq in uniqueFas]
    # Set the uniquified IDs for the ingroups and outgroups
    ingroupsAndNewIDs = list(zip(list(map(itemgetter(0), ingroupsAndOriginalIDs)), uniqueFaIDs[:len(ingroupsAndOriginalIDs)]))
    for event, sequenceID in ingroupsAndNewIDs:
        cactusWorkflowArguments.experimentWrapper.setSequenceID(event, sequenceID)

    # if we're not taking the blast input, then we have to apply to the cigar files too
    if not cactus_blast_input:
        alignments = job.fileStore.readGlobalFile(cactusWorkflowArguments.alignmentsID)
        renamed_alignments = prepend_cigar_ids([alignments], renamedInputSeqDir, id_map)
        cactusWorkflowArguments.alignmentsID = job.fileStore.writeGlobalFile(renamed_alignments[0], cleanup=True)
        if cactusWorkflowArguments.secondaryAlignmentsID:
            sec_alignments = job.fileStore.readGlobalFile(cactusWorkflowArguments.secondaryAlignmentsID)
            renamed_sec_alignments = prepend_cigar_ids([sec_alignments], renamedInputSeqDir, id_map)
            cactusWorkflowArguments.secondaryAlignmentsID = job.fileStore.writeGlobalFile(renamed_sec_alignments[0], cleanup=True)
        # NOTE(review): outgroupFragmentIDs is a list of ids, but it is passed
        # whole to readGlobalFile here -- confirm this is intended.
        if cactusWorkflowArguments.outgroupFragmentIDs:
            og_alignments= job.fileStore.readGlobalFile(cactusWorkflowArguments.outgroupFragmentIDs)
            renamed_og_alignments = prepend_cigar_ids(og_alignments, renamedInputSeqDir, id_map)
            cactusWorkflowArguments.outgroupFragmentIDs = [job.fileStore.writeGlobalFile(rga, cleanup=True) for rga in renamed_og_alignments]

    return cactusWorkflowArguments
예제 #2
0
def run_prepend_unique_ids(job, assembly_files):
    """
    Ensures all input sequences from assembly_files have unique names.

    This is adapted from run_prepend_unique_ids in cactus_align, in order to
    maintain order-dependent renamings.  Because cactus-reference-align
    assumes that all input sequences are ingroups, it does not attempt to
    avoid renaming outgroup genomes.

    Parameters:
        job: the Toil job, used for fileStore access.
        assembly_files: dict mapping event name -> fasta file-store id;
            mutated in place so each value points at the renamed fasta.

    Returns:
        assembly_files, with every value replaced by the file-store id of the
        corresponding renamed fasta.
    """
    # Fix the event order once: the position of each event determines the
    # numeric id it is given, so the same ordered view must be used for both
    # the download and the write-back below.
    events = list(assembly_files)
    sequences = [job.fileStore.readGlobalFile(assembly_files[event]) for event in events]
    renamedInputSeqDir = job.fileStore.getLocalTempDir()
    # filled by prependUniqueIDs: original contig name -> uniquified name
    id_map = {}
    uniqueFas = prependUniqueIDs(sequences, renamedInputSeqDir, id_map)
    uniqueFaIDs = [job.fileStore.writeGlobalFile(seq, cleanup=True) for seq in uniqueFas]
    # Replace each original file-store id with the id of its renamed fasta,
    # pairing by position (the renamed files come back in input order).
    for event, unique_id in zip(events, uniqueFaIDs):
        assembly_files[event] = unique_id
    return assembly_files
예제 #3
0
    def testPrependUniqueIDs(self):
        """Check that prependUniqueIDs prefixes each header with its event id."""
        # Two throwaway FASTA inputs whose headers exercise awkward cases:
        # internal double spaces, a blank line inside a record, a record with
        # no sequence, a header starting with a space, and a header that
        # already contains an 'id=' pattern.
        first_fasta = dedent("""
        >C10856240  2.0
        ACTAGAGG
        G

        GG
        >foo bar  baz
        ACTGACGATgacgat
        >emptyseq
        """)
        second_fasta = dedent("""
        > space
        GTGC
        >id=1||header
        ATCC
        """)
        outDir = mkdtemp()
        with NamedTemporaryFile(mode='w+') as srcA, NamedTemporaryFile(mode='w+') as srcB:
            srcA.write(first_fasta)
            srcB.write(second_fasta)
            srcA.flush()
            srcB.flush()
            # events 0 and 1 map to the two temp fasta files
            outputPaths = prependUniqueIDs({0: srcA.name, 1: srcB.name}, outDir)

        assert len(outputPaths) == 2
        expected_first = dedent("""
        >id=0|C10856240  2.0
        ACTAGAGG
        G

        GG
        >id=0|foo bar  baz
        ACTGACGATgacgat
        >id=0|emptyseq
        """)
        expected_second = dedent("""
        >id=1| space
        GTGC
        >id=1|id=1||header
        ATCC
        """)
        with open(outputPaths[0]) as renamed:
            self.assertEqual(renamed.read(), expected_first)
        with open(outputPaths[1]) as renamed:
            self.assertEqual(renamed.read(), expected_second)
        shutil.rmtree(outDir)
예제 #4
0
def minigraph_map_one(job, config, event_name, fa_path, fa_file_id, gfa_file_id, gaf_output, paf_output):
    """ Run minigraph to map a Fasta file to a GFA graph, producing a GAF output

    Parameters:
        job: the Toil job (fileStore access; job.cores sets the thread count).
        config: workflow config wrapper; options are read from its
            <graphmap> XML node.
        event_name: genome/event name, used both for the output file name and
            as the unique-id prefix prepended to the fasta headers.
        fa_path: original fasta path (only its basename is reused locally).
        fa_file_id: file-store id of the fasta to map.
        gfa_file_id: file-store id of the minigraph GFA.
        gaf_output: if True, export the GAF to the file store.
        paf_output: if True, also convert the GAF to PAF and export that.

    Returns:
        (gaf_id, paf_id) file-store ids; either may be None depending on the
        gaf_output/paf_output flags.
    """

    work_dir = job.fileStore.getLocalTempDir()
    gfa_path = os.path.join(work_dir, "mg.gfa")
    fa_dir = job.fileStore.getLocalTempDir()
    fa_path = os.path.join(fa_dir, os.path.basename(fa_path))
    gaf_path = os.path.join(work_dir, "{}.gaf".format(event_name))

    job.fileStore.readGlobalFile(gfa_file_id, gfa_path)
    job.fileStore.readGlobalFile(fa_file_id, fa_path)

    # decompress first if the fasta came in gzipped (must happen before the
    # unique-id renaming below)
    if fa_path.endswith('.gz'):
        fa_path = fa_path[:-3]
        cactus_call(parameters = ['gzip', '-d', '-c', fa_path + '.gz'], outfile=fa_path)

    # prepend the unique id before mapping so the GAF has cactus-compatible event names
    # (the renamed copy is written into work_dir)
    fa_path = prependUniqueIDs({event_name : fa_path}, work_dir, eventNameAsID=True)[event_name]

    # parse options from the config
    xml_node = findRequiredNode(config.xmlRoot, "graphmap")
    minigraph_opts = getOptionalAttrib(xml_node, "minigraphMapOptions", str, default="")
    opts_list = minigraph_opts.split()
    # add required options if not present
    if "-S" not in opts_list:
        opts_list += ["-S"]
    if "--write-mz" not in opts_list:
        opts_list += ["--write-mz"]
    if "-t" not in opts_list:
        opts_list += ["-t", str(int(job.cores))]

    cmd = ["minigraph",
           os.path.basename(gfa_path),
           os.path.basename(fa_path),
           "-o", os.path.basename(gaf_path)] + opts_list

    # optionally hard-mask soft-masked regions before mapping: cmd[2] (the
    # fasta argument) becomes '-' so minigraph reads from the upstream
    # command.  NOTE(review): presumably cactus_call treats a list of command
    # lists as a shell pipeline -- confirm against cactus_call's contract.
    mask_filter = getOptionalAttrib(xml_node, "maskFilter", int, default=-1)
    if mask_filter >= 0:
        cmd[2] = '-'
        cmd = [['cactus_softmask2hardmask', os.path.basename(fa_path), '-m', str(mask_filter)], cmd]

    cactus_call(work_dir=work_dir, parameters=cmd)

    paf_id, gaf_id = None, None
    if paf_output:
        # optional gaf->paf step.  we are not piping directly out of minigraph because mzgaf2paf's overlap filter
        # (which is usually on) requires 2 passes so it won't read stdin when it's enabled
        paf_id =  merge_gafs_into_paf(job, config, None, [gaf_path])
    if gaf_output:
        gaf_id = job.fileStore.writeGlobalFile(gaf_path)

    return gaf_id, paf_id
예제 #5
0
def run_prepend_unique_ids(job, cactusWorkflowArguments, project, renameCigars, eventNameAsID):
    """ prepend the unique ids on the input fasta.  this is required for cactus to work (would be great to relax it though)

    Parameters:
        job: the Toil job, used for fileStore access.
        cactusWorkflowArguments: workflow state; its experimentWrapper
            sequence ids, alignment ids and totalSequenceSize are updated in
            place.
        project: its inputSequenceMap is consulted to tell whether each input
            fasta is gzipped.
        renameCigars: if True, also rewrite contig names inside the primary,
            secondary and outgroup-fragment cigar files.
        eventNameAsID: passed through to prependUniqueIDs; use the event name
            rather than a numeric index as the unique prefix.

    Returns:
        the mutated cactusWorkflowArguments.
    """

    # note, there is an order dependence to everything where we have to match what was done in cactus_workflow
    # (so the code is pasted exactly as it is there)
    # this is horrible and needs to be fixed via drastic interface refactor
    # update: this has been somewhat fixed with a minor refactor: prependUniqueIDs is no longer order dependent (but takes dict instead of list)
    exp = cactusWorkflowArguments.experimentWrapper
    ingroupsAndOriginalIDs = [(g, exp.getSequenceID(g)) for g in exp.getGenomesWithSequence() if g not in exp.getOutgroupGenomes()]
    eventToSequence = {}
    for g, seqID in ingroupsAndOriginalIDs:
        seqPath = job.fileStore.getLocalTempFile() + '.fa'
        # carry the .gz suffix through so the decompression step below fires
        if project.inputSequenceMap[g].endswith('.gz'):
            seqPath += '.gz'
        job.fileStore.readGlobalFile(seqID, seqPath)
        if seqPath.endswith('.gz'):
            cactus_call(parameters=['gzip', '-d', '-c', seqPath], outfile=seqPath[:-3])
            seqPath = seqPath[:-3]
        eventToSequence[g] = seqPath
    cactusWorkflowArguments.totalSequenceSize = sum(os.stat(x).st_size for x in eventToSequence.values())
    # need to have outgroups in there just for id naming (don't need their sequence)
    for g in exp.getOutgroupGenomes():
        eventToSequence[g] = None
    renamedInputSeqDir = job.fileStore.getLocalTempDir()
    # filled by prependUniqueIDs: original contig name -> uniquified name
    id_map = {}
    eventToUnique = prependUniqueIDs(eventToSequence, renamedInputSeqDir, idMap=id_map, eventNameAsID=eventNameAsID)
    # Set the uniquified IDs for the ingroups and outgroups
    for event, uniqueFa in eventToUnique.items():
        uniqueFaID = job.fileStore.writeGlobalFile(uniqueFa, cleanup=True)
        cactusWorkflowArguments.experimentWrapper.setSequenceID(event, uniqueFaID)

    # if we're not taking cactus-[blast|refmap] input, then we have to apply to the cigar files too
    if renameCigars:
        alignments = job.fileStore.readGlobalFile(cactusWorkflowArguments.alignmentsID)
        renamed_alignments = prepend_cigar_ids([alignments], renamedInputSeqDir, id_map)
        cactusWorkflowArguments.alignmentsID = job.fileStore.writeGlobalFile(renamed_alignments[0], cleanup=True)
        if cactusWorkflowArguments.secondaryAlignmentsID:
            sec_alignments = job.fileStore.readGlobalFile(cactusWorkflowArguments.secondaryAlignmentsID)
            renamed_sec_alignments = prepend_cigar_ids([sec_alignments], renamedInputSeqDir, id_map)
            cactusWorkflowArguments.secondaryAlignmentsID = job.fileStore.writeGlobalFile(renamed_sec_alignments[0], cleanup=True)
        # NOTE(review): outgroupFragmentIDs is a list of ids, but it is passed
        # whole to readGlobalFile here -- confirm this is intended.
        if cactusWorkflowArguments.outgroupFragmentIDs:
            og_alignments= job.fileStore.readGlobalFile(cactusWorkflowArguments.outgroupFragmentIDs)
            renamed_og_alignments = prepend_cigar_ids(og_alignments, renamedInputSeqDir, id_map)
            cactusWorkflowArguments.outgroupFragmentIDs = [job.fileStore.writeGlobalFile(rga, cleanup=True) for rga in renamed_og_alignments]

    return cactusWorkflowArguments
예제 #6
0
    def testPrependUniqueIDs(self):
        """prependUniqueIDs must prefix every FASTA header with 'id=<n>|'.

        The headers include trailing/internal whitespace, an empty record and
        a header that already carries an 'id=' pattern, to check the prefix
        is prepended verbatim with no other rewriting.
        """
        # Create fake FASTA files with some interesting headers.
        # mode='w+' is required: NamedTemporaryFile defaults to binary mode
        # ('w+b'), and writing str data to a binary file raises TypeError.
        with NamedTemporaryFile(mode='w+') as fasta1, NamedTemporaryFile(
                mode='w+') as fasta2:
            fasta1.write(dedent("""
            >C10856240  2.0
            ACTAGAGG
            G

            GG
            >foo bar  baz
            ACTGACGATgacgat
            >emptyseq
            """))
            fasta2.write(dedent("""
            > space
            GTGC
            >id=1||header
            ATCC
            """))
            fasta1.flush()
            fasta2.flush()
            outDir = mkdtemp()
            outputPaths = prependUniqueIDs([fasta1.name, fasta2.name], outDir)

        assert len(outputPaths) == 2
        # assertEqual, not the deprecated assertEquals alias (removed in
        # Python 3.12)
        with open(outputPaths[0]) as f:
            self.assertEqual(f.read(), dedent("""
            >id=0|C10856240  2.0
            ACTAGAGG
            G

            GG
            >id=0|foo bar  baz
            ACTGACGATgacgat
            >id=0|emptyseq
            """))
        with open(outputPaths[1]) as f:
            self.assertEqual(f.read(), dedent("""
            >id=1| space
            GTGC
            >id=1|id=1||header
            ATCC
            """))
        shutil.rmtree(outDir)
예제 #7
0
def run_prepend_unique_ids(job, assembly_files):
    """
    Ensures all input sequences from assembly_files have unique names.
    This is adapted from run_prepend_unique_ids in cactus_align, in order to maintain order-dependent renamings.
    Because cactus-reference-align assumes that all input sequences are ingroups, it does not attempt to avoid renaming outgroup genomes.  
    """
    # download all the sequence files
    event_to_path = {}
    for event, assembly_id in assembly_files.items():
        event_to_path[event] = job.fileStore.readGlobalFile(assembly_id)

    # prepend unique id to each one (using event name instead of numeric id, as it's more stable across tools)
    event_to_unique_path = prependUniqueIDs(event_to_path,
                                            job.fileStore.getLocalTempDir(),
                                            eventNameAsID=True)

    # write the prepended files back to the job store and return the dict
    for event, prepended_sequence_path in event_to_unique_path.items():
        assembly_files[event] = job.fileStore.writeGlobalFile(
            prepended_sequence_path, cleanup=True)
    return assembly_files
예제 #8
0
def run_prepend_unique_ids(job, cactusWorkflowArguments, project, renameCigars,
                           eventNameAsID, mask_beds):
    """ prepend the unique ids on the input fasta.  this is required for cactus to work (would be great to relax it though)

    Parameters:
        job: the Toil job, used for fileStore access.
        cactusWorkflowArguments: workflow state; its experimentWrapper
            sequence ids, alignment ids and totalSequenceSize are updated in
            place.
        project: unused in this function; kept for interface compatibility.
        renameCigars: if True, also rewrite contig names inside the primary,
            secondary and outgroup-fragment cigar files.
        eventNameAsID: passed through to prependUniqueIDs; use the event name
            rather than a numeric index as the unique prefix.
        mask_beds: optional dict mapping event name -> bed file-store id;
            when given, the beds are merged into one file whose contig names
            are rewritten through the same id map.

    Returns:
        (cactusWorkflowArguments, mask_bed_id), where mask_bed_id is the
        file-store id of the merged renamed bed, or None when mask_beds is
        empty.

    Raises:
        RuntimeError: if a bed record names a contig that was never seen in
            the input fastas.
    """

    # note, there is an order dependence to everything where we have to match what was done in cactus_workflow
    # (so the code is pasted exactly as it is there)
    # this is horrible and needs to be fixed via drastic interface refactor
    # update: this has been somewhat fixed with a minor refactor: prependUniqueIDs is no longer order dependent (but takes dict instead of list)
    exp = cactusWorkflowArguments.experimentWrapper
    ingroupsAndOriginalIDs = [(g, exp.getSequenceID(g))
                              for g in exp.getGenomesWithSequence()
                              if g not in exp.getOutgroupGenomes()]
    eventToSequence = {}
    for g, seqID in ingroupsAndOriginalIDs:
        seqPath = job.fileStore.getLocalTempFile() + '.fa'
        job.fileStore.readGlobalFile(seqID, seqPath)
        eventToSequence[g] = seqPath
    cactusWorkflowArguments.totalSequenceSize = sum(
        os.stat(x).st_size for x in eventToSequence.values())
    # need to have outgroups in there just for id naming (don't need their sequence)
    for g in exp.getOutgroupGenomes():
        eventToSequence[g] = None
    renamedInputSeqDir = job.fileStore.getLocalTempDir()
    # filled by prependUniqueIDs: original contig name -> uniquified name
    id_map = {}
    eventToUnique = prependUniqueIDs(eventToSequence,
                                     renamedInputSeqDir,
                                     idMap=id_map,
                                     eventNameAsID=eventNameAsID)
    # Set the uniquified IDs for the ingroups and outgroups
    for event, uniqueFa in eventToUnique.items():
        uniqueFaID = job.fileStore.writeGlobalFile(uniqueFa, cleanup=True)
        cactusWorkflowArguments.experimentWrapper.setSequenceID(
            event, uniqueFaID)

    # if we're not taking cactus-[blast|refmap] input, then we have to apply to the cigar files too
    if renameCigars:
        alignments = job.fileStore.readGlobalFile(
            cactusWorkflowArguments.alignmentsID)
        renamed_alignments = prepend_cigar_ids([alignments],
                                               renamedInputSeqDir, id_map)
        cactusWorkflowArguments.alignmentsID = job.fileStore.writeGlobalFile(
            renamed_alignments[0], cleanup=True)
        if cactusWorkflowArguments.secondaryAlignmentsID:
            sec_alignments = job.fileStore.readGlobalFile(
                cactusWorkflowArguments.secondaryAlignmentsID)
            renamed_sec_alignments = prepend_cigar_ids([sec_alignments],
                                                       renamedInputSeqDir,
                                                       id_map)
            cactusWorkflowArguments.secondaryAlignmentsID = job.fileStore.writeGlobalFile(
                renamed_sec_alignments[0], cleanup=True)
        # NOTE(review): outgroupFragmentIDs is a list of ids, but it is
        # passed whole to readGlobalFile here -- confirm this is intended.
        if cactusWorkflowArguments.outgroupFragmentIDs:
            og_alignments = job.fileStore.readGlobalFile(
                cactusWorkflowArguments.outgroupFragmentIDs)
            renamed_og_alignments = prepend_cigar_ids(og_alignments,
                                                      renamedInputSeqDir,
                                                      id_map)
            cactusWorkflowArguments.outgroupFragmentIDs = [
                job.fileStore.writeGlobalFile(rga, cleanup=True)
                for rga in renamed_og_alignments
            ]

    # if we have some masking beds (maps event name to bed file id), then apply the naming there too
    mask_bed_id = None
    if mask_beds:
        mask_bed_path = job.fileStore.getLocalTempFile()
        with open(mask_bed_path, 'w') as mask_bed_file:
            for g, bed_id in mask_beds.items():
                bed_path = job.fileStore.readGlobalFile(bed_id)
                with open(bed_path, 'r') as bed_file:
                    for line in bed_file:
                        toks = line.split('\t')
                        # only records with at least 3 columns are kept
                        if len(toks) > 2:
                            if toks[0] not in id_map:
                                # bug fix: this previously referenced the
                                # undefined name 'idMap' and so raised a
                                # NameError instead of this RuntimeError
                                raise RuntimeError(
                                    'bed id {} not found in id-map {}'.format(
                                        toks[0],
                                        str(id_map)[:1000]))
                            toks[0] = id_map[toks[0]]
                            mask_bed_file.write('\t'.join(toks))
        mask_bed_id = job.fileStore.writeGlobalFile(mask_bed_path)

    return cactusWorkflowArguments, mask_bed_id