Example #1
0
def align_unaligned_seqs(seqs, moltype, params=None):
    """Returns an Alignment object from seqs.

    seqs: cogent.core.alignment.SequenceCollection object, or data that can be
    used to build one.
    
    moltype: a MolType object.  DNA, RNA, or PROTEIN.

    params: dict of parameters to pass in to the Clustal app controller.
    
    Result will be a cogent.core.alignment.Alignment object.
    """
    # create SequenceCollection object from seqs
    seq_collection = SequenceCollection(seqs, MolType=moltype)
    # Create mapping between abbreviated IDs and full IDs
    int_map, int_keys = seq_collection.getIntMap()
    # Create SequenceCollection from int_map.
    int_map = SequenceCollection(int_map, MolType=moltype)
    # Create Clustalw app.
    app = Clustalw(InputHandler="_input_as_multiline_string", params=params)
    # Get results using int_map as input to app
    res = app(int_map.toFasta())
    # Get alignment as dict out of results
    alignment = dict(ClustalParser(res["Align"].readlines()))
    # Make new dict mapping original IDs
    new_alignment = {}
    for k, v in alignment.items():
        new_alignment[int_keys[k]] = v
    # Create an Alignment object from alignment dict
    new_alignment = Alignment(new_alignment, MolType=moltype)
    # Clean up
    res.cleanUp()
    del (seq_collection, int_map, int_keys, app, res, alignment)

    return new_alignment
Example #2
0
def build_tree_from_alignment(aln, moltype, best_tree=False, params=None):
    """Returns a tree from alignment
    
    Will check MolType of aln object
    """
    if params is None:
        params = {}

    if moltype == DNA or moltype == RNA:
        params['-nt'] = True
    elif moltype == PROTEIN:
        params['-nt'] = False
    else:
        raise ValueError, \
                "FastTree does not support moltype: %s" % moltype.label

    if best_tree:
        params['-slow'] = True

    #Create mapping between abbreviated IDs and full IDs
    int_map, int_keys = aln.getIntMap()
    #Create SequenceCollection from int_map.
    int_map = SequenceCollection(int_map, MolType=moltype)

    app = FastTree(params=params)

    result = app(int_map.toFasta())
    tree = DndParser(result['Tree'].read(), constructor=PhyloNode)
    #remap tip names
    for tip in tree.tips():
        tip.Name = int_keys[tip.Name]

    return tree
    def test_seqs_to_flows(self):
        """seqs_to_flows should take a list of seqs and probs and return """
        seqs = [('a', 'ATCGT'), ('b', 'ACCCAG'), ('c', 'GTAATG')]
        a = SequenceCollection(seqs)

        flows = seqs_to_flows(a.items())
        assert isinstance(flows, FlowgramCollection)

        for f, i in zip(flows, [
                '0.0 1.0 0.0 0.0 1.0 0.0 1.0 1.0 1.0 0.0 0.0 0.0',
                '0.0 1.0 3.0 0.0 0.0 1.0 0.0 1.0',
                '0.0 0.0 0.0 1.0 1.0 2.0 0.0 0.0 1.0 0.0 0.0 1.0'
        ]):
            self.assertEqual(f, i)

        probs = {
            0: [1.0, 0, 0, 0, 0],
            1: [0, 1.0, 0, 0, 0],
            2: [0, 0, 1.0, 0, 0],
            3: [0, 0, 0, 1.0, 0]
        }

        flows = seqs_to_flows(a.items(), probs=probs, bin_size=1.0)
        assert isinstance(flows, FlowgramCollection)

        for f, i in zip(flows, [
                '0.0 1.0 0.0 0.0 1.0 0.0 1.0 1.0 1.0 0.0 0.0 0.0',
                '0.0 1.0 3.0 0.0 0.0 1.0 0.0 1.0',
                '0.0 0.0 0.0 1.0 1.0 2.0 0.0 0.0 1.0 0.0 0.0 1.0'
        ]):
            self.assertEqual(f, i)
Example #4
0
def build_tree_from_alignment(aln, moltype, best_tree=False, params=None):
    """Returns a tree from alignment
    
    Will check MolType of aln object
    """
    if params is None:
        params = {}

    if moltype == DNA or moltype == RNA:
        params["-nt"] = True
    elif moltype == PROTEIN:
        params["-nt"] = False
    else:
        raise ValueError, "FastTree does not support moltype: %s" % moltype.label

    if best_tree:
        params["-slow"] = True

    # Create mapping between abbreviated IDs and full IDs
    int_map, int_keys = aln.getIntMap()
    # Create SequenceCollection from int_map.
    int_map = SequenceCollection(int_map, MolType=moltype)

    app = FastTree(params=params)

    result = app(int_map.toFasta())
    tree = DndParser(result["Tree"].read(), constructor=PhyloNode)
    # remap tip names
    for tip in tree.tips():
        tip.Name = int_keys[tip.Name]

    return tree
    def test_seqs_to_flows(self):
        """seqs_to_flows should take a list of seqs and probs and return """
        seqs = [("a", "ATCGT"), ("b", "ACCCAG"), ("c", "GTAATG")]
        a = SequenceCollection(seqs)

        flows = seqs_to_flows(a.items())
        assert isinstance(flows, FlowgramCollection)

        for f, i in zip(
            flows,
            [
                "0.0 1.0 0.0 0.0 1.0 0.0 1.0 1.0 1.0 0.0 0.0 0.0",
                "0.0 1.0 3.0 0.0 0.0 1.0 0.0 1.0",
                "0.0 0.0 0.0 1.0 1.0 2.0 0.0 0.0 1.0 0.0 0.0 1.0",
            ],
        ):
            self.assertEqual(f, i)

        probs = {0: [1.0, 0, 0, 0, 0], 1: [0, 1.0, 0, 0, 0], 2: [0, 0, 1.0, 0, 0], 3: [0, 0, 0, 1.0, 0]}

        flows = seqs_to_flows(a.items(), probs=probs, bin_size=1.0)
        assert isinstance(flows, FlowgramCollection)

        for f, i in zip(
            flows,
            [
                "0.0 1.0 0.0 0.0 1.0 0.0 1.0 1.0 1.0 0.0 0.0 0.0",
                "0.0 1.0 3.0 0.0 0.0 1.0 0.0 1.0",
                "0.0 0.0 0.0 1.0 1.0 2.0 0.0 0.0 1.0 0.0 0.0 1.0",
            ],
        ):
            self.assertEqual(f, i)
Example #6
0
def build_tree_from_alignment(aln, moltype, best_tree=False, params=None):
    """Returns a tree from Alignment object aln.

    aln: an cogent.core.alignment.Alignment object, or data that can be used
    to build one.

    moltype: cogent.core.moltype.MolType object

    best_tree: if True (default:False), uses a slower but more accurate
    algorithm to build the tree.

    params: dict of parameters to pass in to the Clustal app controller.

    The result will be an cogent.core.tree.PhyloNode object, or None if tree
    fails.
    """
    # Create instance of app controller, enable tree, disable alignment
    app = Clustalw(InputHandler="_input_as_multiline_string", params=params, WorkingDir="/tmp")
    app.Parameters["-align"].off()

    # Set params to empty dict if None.
    if params is None:
        params = {}

    if moltype == DNA or moltype == RNA:
        params["-type"] = "d"
    elif moltype == PROTEIN:
        params["-type"] = "p"
    else:
        raise ValueError, "moltype must be DNA, RNA, or PROTEIN"

    # best_tree -> bootstrap
    if best_tree:
        if "-bootstrap" not in params:
            app.Parameters["-bootstrap"].on(1000)
        if "-seed" not in params:
            app.Parameters["-seed"].on(randint(0, 1000))
        if "-bootlabels" not in params:
            app.Parameters["-bootlabels"].on("nodes")
    else:
        app.Parameters["-tree"].on()

    # Setup mapping. Clustalw clips identifiers. We will need to remap them.
    seq_collection = SequenceCollection(aln)
    int_map, int_keys = seq_collection.getIntMap()
    int_map = SequenceCollection(int_map)

    # Collect result
    result = app(int_map.toFasta())

    # Build tree
    tree = DndParser(result["Tree"].read(), constructor=PhyloNode)
    for node in tree.tips():
        node.Name = int_keys[node.Name]

    # Clean up
    result.cleanUp()
    del (seq_collection, app, result, int_map, int_keys)

    return tree
Example #7
0
def add_seqs_to_alignment(seqs, aln, moltype, params=None):
    """Returns an Alignment object from seqs and existing Alignment.

    seqs: a cogent.core.alignment.SequenceCollection object, or data that can
    be used to build one.

    aln: a cogent.core.alignment.Alignment object, or data that can be used to
    build one

    params: dict of parameters to pass in to the Clustal app controller.
    """
    # create SequenceCollection object from seqs
    seq_collection = SequenceCollection(seqs, MolType=moltype)
    # Create mapping between abbreviated IDs and full IDs
    seq_int_map, seq_int_keys = seq_collection.getIntMap()
    # Create SequenceCollection from int_map.
    seq_int_map = SequenceCollection(seq_int_map, MolType=moltype)

    # create Alignment object from aln
    aln = Alignment(aln, MolType=moltype)
    # Create mapping between abbreviated IDs and full IDs
    aln_int_map, aln_int_keys = aln.getIntMap(prefix="seqn_")
    # Create SequenceCollection from int_map.
    aln_int_map = Alignment(aln_int_map, MolType=moltype)

    # Update seq_int_keys with aln_int_keys
    seq_int_keys.update(aln_int_keys)

    # Create Mafft app.
    app = Clustalw(InputHandler="_input_as_multiline_string", params=params, SuppressStderr=True)
    app.Parameters["-align"].off()
    app.Parameters["-infile"].off()
    app.Parameters["-sequences"].on()

    # Add aln_int_map as profile1
    app.Parameters["-profile1"].on(app._tempfile_as_multiline_string(aln_int_map.toFasta()))

    # Add seq_int_map as profile2
    app.Parameters["-profile2"].on(app._tempfile_as_multiline_string(seq_int_map.toFasta()))
    # Get results using int_map as input to app
    res = app()

    # Get alignment as dict out of results
    alignment = dict(ClustalParser(res["Align"].readlines()))

    # Make new dict mapping original IDs
    new_alignment = {}
    for k, v in alignment.items():
        new_alignment[seq_int_keys[k]] = v
    # Create an Alignment object from alignment dict
    new_alignment = Alignment(new_alignment, MolType=moltype)
    # Clean up
    res.cleanUp()
    remove(app.Parameters["-profile1"].Value)
    remove(app.Parameters["-profile2"].Value)
    del (seq_collection, seq_int_map, seq_int_keys, aln, aln_int_map, aln_int_keys, app, res, alignment)

    return new_alignment
Example #8
0
def bootstrap_tree_from_alignment(aln, seed=None, num_trees=None, params=None):
    """Returns a tree from Alignment object aln with bootstrap support values.

    aln: an cogent.core.alignment.Alignment object, or data that can be used
    to build one.

    seed: an interger, seed value to use

    num_trees: an integer, number of trees to bootstrap against

    params: dict of parameters to pass in to the Clustal app controller.

    The result will be an cogent.core.tree.PhyloNode object, or None if tree
    fails.

    If seed is not specifed in params, a random integer between 0-1000 is used.
    """
    # Create instance of controllor, enable bootstrap, disable alignment,tree
    app = Clustalw(InputHandler='_input_as_multiline_string', params=params, \
                   WorkingDir='/tmp')
    app.Parameters['-align'].off()
    app.Parameters['-tree'].off()

    if app.Parameters['-bootstrap'].isOff():
        if num_trees is None:
            num_trees = 1000

        app.Parameters['-bootstrap'].on(num_trees)

    if app.Parameters['-seed'].isOff():
        if seed is None:
            seed = randint(0,1000)

        app.Parameters['-seed'].on(seed)

    if app.Parameters['-bootlabels'].isOff():
        app.Parameters['-bootlabels'].on("node")

    # Setup mapping. Clustalw clips identifiers. We will need to remap them.
    seq_collection = SequenceCollection(aln)
    int_map, int_keys = seq_collection.getIntMap()
    int_map = SequenceCollection(int_map)

    # Collect result
    result = app(int_map.toFasta())

    # Build tree
    tree = DndParser(result['Tree'].read(), constructor=PhyloNode)
    for node in tree.tips():
        node.Name = int_keys[node.Name]

    # Clean up
    result.cleanUp()
    del(seq_collection, app, result, int_map, int_keys)

    return tree
Example #9
0
def clustal_from_alignment(aln, interleave_len=None):
    """Returns a string in Clustal format.
    
        - aln: can be an Alignment object or a dict.
        - interleave_len: sequence line width.  Only available if sequences are
            aligned.
    """
    if not aln:
        return ''
    
     # get seq output order
    try:
        order = aln.RowOrder
    except:
        order = aln.keys()
        order.sort()
    
    seqs = SequenceCollection(aln)
    clustal_list = ["CLUSTAL\n"]
    
    if seqs.isRagged():
        raise ValueError,\
             "Sequences in alignment are not all the same length." +\
             "Cannot generate Clustal format."
    
    aln_len = seqs.SeqLen
    #Get all labels
    labels = copy(seqs.Names)
    
    #Find all label lengths in order to get padding.
    label_lengths = [len(l) for l in labels]
    label_max = max(label_lengths)
    max_spaces = label_max+4
    
    #Get ordered seqs
    ordered_seqs = [seqs.NamedSeqs[label] for label in order]
    
    if interleave_len is not None:
        curr_ix = 0
        while curr_ix < aln_len:
            clustal_list.extend(["%s%s%s"%(x,' '*(max_spaces-len(x)),\
                y[curr_ix:curr_ix+ \
                interleave_len]) for x,y in zip(order,ordered_seqs)])
            clustal_list.append("")
            curr_ix += interleave_len
    else:
        clustal_list.extend(["%s%s%s"%(x,' '*(max_spaces-len(x)),y) \
            for x,y in zip(order,ordered_seqs)])
        clustal_list.append("")
    
    return '\n'.join(clustal_list)    
        
        
Example #10
0
def clustal_from_alignment(aln, interleave_len=None):
    """Returns a string in Clustal format.
    
        - aln: can be an Alignment object or a dict.
        - interleave_len: sequence line width.  Only available if sequences are
            aligned.
    """
    if not aln:
        return ''
    
     # get seq output order
    try:
        order = aln.RowOrder
    except:
        order = aln.keys()
        order.sort()
    
    seqs = SequenceCollection(aln)
    clustal_list = ["CLUSTAL\n"]
    
    if seqs.isRagged():
        raise ValueError,\
             "Sequences in alignment are not all the same length." +\
             "Cannot generate Clustal format."
    
    aln_len = seqs.SeqLen
    #Get all labels
    labels = copy(seqs.Names)
    
    #Find all label lengths in order to get padding.
    label_lengths = [len(l) for l in labels]
    label_max = max(label_lengths)
    max_spaces = label_max+4
    
    #Get ordered seqs
    ordered_seqs = [seqs.NamedSeqs[label] for label in order]
    
    if interleave_len is not None:
        curr_ix = 0
        while curr_ix < aln_len:
            clustal_list.extend(["%s%s%s"%(x,' '*(max_spaces-len(x)),\
                y[curr_ix:curr_ix+ \
                interleave_len]) for x,y in zip(order,ordered_seqs)])
            clustal_list.append("")
            curr_ix += interleave_len
    else:
        clustal_list.extend(["%s%s%s"%(x,' '*(max_spaces-len(x)),y) \
            for x,y in zip(order,ordered_seqs)])
        clustal_list.append("")
    
    return '\n'.join(clustal_list)    
Example #11
0
def load_from_clustal(data, seq_constructor=Sequence, strict=True):
    recs = [(name, seq_constructor(seq, )) for name, seq in\
        ClustalParser(data, strict)]
    lengths = [len(i[1]) for i in recs]
    if lengths and max(lengths) == min(lengths):
        return Alignment(recs, MolType=BYTES)
    else:
        return SequenceCollection(recs, MolType=BYTES)
Example #12
0
def build_tree_from_alignment(aln, moltype, best_tree=False, params=None):
    """Returns a tree from Alignment object aln.
    
    aln: a cogent.core.alignment.Alignment object, or data that can be used 
    to build one.
    
    moltype: cogent.core.moltype.MolType object

    best_tree: unsupported
    
    params: dict of parameters to pass in to the Muscle app controller.
    
    The result will be an cogent.core.tree.PhyloNode object, or None if tree 
    fails.
    """
    # Create instance of app controller, enable tree, disable alignment
    app = Muscle(InputHandler='_input_as_multiline_string', params=params, \
                   WorkingDir='/tmp')

    app.Parameters['-clusteronly'].on()
    app.Parameters['-tree1'].on(get_tmp_filename(app.WorkingDir))
    app.Parameters['-seqtype'].on(moltype.label)

    seq_collection = SequenceCollection(aln, MolType=moltype)

    #Create mapping between abbreviated IDs and full IDs
    int_map, int_keys = seq_collection.getIntMap()
    #Create SequenceCollection from int_map.
    int_map = SequenceCollection(int_map,MolType=moltype)


    # Collect result
    result = app(int_map.toFasta())

    # Build tree
    tree = DndParser(result['Tree1Out'].read(), constructor=PhyloNode)
    
    for tip in tree.tips():
        tip.Name = int_keys[tip.Name]

    # Clean up
    result.cleanUp()
    del(seq_collection, app, result)

    return tree
Example #13
0
def align_unaligned_seqs(seqs,moltype,params=None,accurate=False):
    """Aligns unaligned sequences

    seqs: either list of sequence objects or list of strings
    add_seq_names: boolean. if True, sequence names are inserted in the list
        of sequences. if False, it assumes seqs is a list of lines of some
        proper format that the program can handle
    """
    #create SequenceCollection object from seqs
    seq_collection = SequenceCollection(seqs,MolType=moltype)
    #Create mapping between abbreviated IDs and full IDs
    int_map, int_keys = seq_collection.getIntMap()
    #Create SequenceCollection from int_map.
    int_map = SequenceCollection(int_map,MolType=moltype)
    #Create Mafft app.
    app = Mafft(InputHandler='_input_as_multiline_string',params=params)
    
    #Turn on correct moltype
    moltype_string = moltype.label.upper()
    app.Parameters[MOLTYPE_MAP[moltype_string]].on()
    
    #Do not report progress
    app.Parameters['--quiet'].on()
    
    #More accurate alignment, sacrificing performance.
    if accurate:
        app.Parameters['--globalpair'].on()
        app.Parameters['--maxiterate'].Value=1000
    
    #Get results using int_map as input to app
    res = app(int_map.toFasta())
    #Get alignment as dict out of results
    alignment = dict(parse_fasta(res['StdOut']))
    #Make new dict mapping original IDs
    new_alignment = {}
    for k,v in alignment.items():
        new_alignment[int_keys[k]]=v
    #Create an Alignment object from alignment dict
    new_alignment = Alignment(new_alignment,MolType=moltype)
    #Clean up
    res.cleanUp()
    del(seq_collection,int_map,int_keys,app,res,alignment)

    return new_alignment
Example #14
0
def cdhit_from_seqs(seqs, moltype, params=None):
    """Returns the CD-HIT results given seqs

    seqs    : dict like collection of sequences
    moltype : cogent.core.moltype object
    params  : cd-hit parameters

    NOTE: This method will call CD_HIT if moltype is PROTIEN,
        CD_HIT_EST if moltype is RNA/DNA, and raise if any other
        moltype is passed.
    """
    # keys are not remapped. Tested against seq_ids of 100char length
    seqs = SequenceCollection(seqs, MolType=moltype)

    # setup params and make sure the output argument is set
    if params is None:
        params = {}
    if '-o' not in params:
        params['-o'] = get_tmp_filename()

    # call the correct version of cd-hit base on moltype
    working_dir = get_tmp_filename()
    if moltype is PROTEIN:
        app = CD_HIT(WorkingDir=working_dir, params=params)
    elif moltype is RNA:
        app = CD_HIT_EST(WorkingDir=working_dir, params=params)
    elif moltype is DNA:
        app = CD_HIT_EST(WorkingDir=working_dir, params=params)
    else:
        raise ValueError, "Moltype must be either PROTEIN, RNA, or DNA"

    # grab result
    res = app(seqs.toFasta())
    new_seqs = dict(MinimalFastaParser(res['FASTA'].readlines()))

    # perform cleanup
    res.cleanUp()
    shutil.rmtree(working_dir)
    remove(params['-o'] + '.bak.clstr')

    return SequenceCollection(new_seqs, MolType=moltype)
 def getSeqCollection(self, feature_types=None, where_feature=None):
     """returns a SequenceCollection instance of the unaligned sequences"""
     seqs = []
     for member in self.Members:
         if feature_types:
             seq = member.getAnnotatedSeq(feature_types, where_feature)
         else:
             seq = member.Seq
         if seq is None:
             continue
         seqs.append((seq.Name, seq))
     return SequenceCollection(data=seqs, MolType=DNA)
Example #16
0
def align_unaligned_seqs(seqs, moltype, params=None):
    """Returns an Alignment object from seqs.

    seqs: cogent.core.alignment.SequenceCollection object, or data that can be
    used to build one.
    
    moltype: a MolType object.  DNA, RNA, or PROTEIN.

    params: dict of parameters to pass in to the Clustal app controller.
    
    Result will be a cogent.core.alignment.Alignment object.
    """
    #create SequenceCollection object from seqs
    seq_collection = SequenceCollection(seqs, MolType=moltype)
    #Create mapping between abbreviated IDs and full IDs
    int_map, int_keys = seq_collection.getIntMap()
    #Create SequenceCollection from int_map.
    int_map = SequenceCollection(int_map, MolType=moltype)
    #Create Clustalw app.
    app = Clustalw(InputHandler='_input_as_multiline_string', params=params)
    #Get results using int_map as input to app
    res = app(int_map.toFasta())
    #Get alignment as dict out of results
    alignment = dict(ClustalParser(res['Align'].readlines()))
    #Make new dict mapping original IDs
    new_alignment = {}
    for k, v in alignment.items():
        new_alignment[int_keys[k]] = v
    #Create an Alignment object from alignment dict
    new_alignment = Alignment(new_alignment, MolType=moltype)
    #Clean up
    res.cleanUp()
    del (seq_collection, int_map, int_keys, app, res, alignment)

    return new_alignment
Example #17
0
def align_unaligned_seqs(seqs, moltype, params=None):
    """Returns an Alignment object from seqs.

    seqs: SequenceCollection object, or data that can be used to build one.
    
    moltype: a MolType object.  DNA, RNA, or PROTEIN.

    params: dict of parameters to pass in to the Muscle app controller.
    
    Result will be an Alignment object.
    """
    if not params:
        params = {}
    #create SequenceCollection object from seqs
    seq_collection = SequenceCollection(seqs,MolType=moltype)
    #Create mapping between abbreviated IDs and full IDs
    int_map, int_keys = seq_collection.getIntMap()
    #Create SequenceCollection from int_map.
    int_map = SequenceCollection(int_map,MolType=moltype)
    #get temporary filename
    params.update({'-out':get_tmp_filename()})
    #Create Muscle app.
    app = Muscle(InputHandler='_input_as_multiline_string',\
                 params=params)
    #Get results using int_map as input to app
    res = app(int_map.toFasta())
    #Get alignment as dict out of results
    alignment = dict(MinimalFastaParser(res['MuscleOut'].readlines()))
    #Make new dict mapping original IDs
    new_alignment = {}
    for k,v in alignment.items():
        new_alignment[int_keys[k]]=v
    #Create an Alignment object from alignment dict
    new_alignment = Alignment(new_alignment,MolType=moltype)
    #Clean up
    res.cleanUp()
    del(seq_collection,int_map,int_keys,app,res,alignment,params)

    return new_alignment
    def test_seqs_to_flows(self):
        """seqs_to_flows should take a list of seqs and probs and return """
        seqs = [('a','ATCGT'), ('b','ACCCAG'), ('c','GTAATG')]
        a = SequenceCollection(seqs)

        flows = seqs_to_flows(a.items())
        assert isinstance(flows,FlowgramCollection)
        
        for f,i in zip(flows,['0.0 1.0 0.0 0.0 1.0 0.0 1.0 1.0 1.0 0.0 0.0 0.0',
                            '0.0 1.0 3.0 0.0 0.0 1.0 0.0 1.0',
                            '0.0 0.0 0.0 1.0 1.0 2.0 0.0 0.0 1.0 0.0 0.0 1.0']):
            self.assertEqual(f,i)

        probs ={0:[1.0,0,0,0,0],1:[0,1.0,0,0,0],2:[0,0,1.0,0,0],3:[0,0,0,1.0,0]}
        
        flows = seqs_to_flows(a.items(), probs = probs, bin_size = 1.0)
        assert isinstance(flows,FlowgramCollection)
        
        for f,i in zip(flows,['0.0 1.0 0.0 0.0 1.0 0.0 1.0 1.0 1.0 0.0 0.0 0.0',
                            '0.0 1.0 3.0 0.0 0.0 1.0 0.0 1.0',
                            '0.0 0.0 0.0 1.0 1.0 2.0 0.0 0.0 1.0 0.0 0.0 1.0']):
            self.assertEqual(f,i)
Example #19
0
def bootstrap_tree_from_alignment(aln, seed=None, num_trees=None, params=None):
    """Returns a tree from Alignment object aln with bootstrap support values.

    aln: an cogent.core.alignment.Alignment object, or data that can be used
    to build one.

    seed: an interger, seed value to use
    
    num_trees: an integer, number of trees to bootstrap against

    params: dict of parameters to pass in to the Clustal app controller.

    The result will be an cogent.core.tree.PhyloNode object, or None if tree
    fails.

    If seed is not specifed in params, a random integer between 0-1000 is used.
    """
    # Create instance of controllor, enable bootstrap, disable alignment,tree
    app = Clustalw(InputHandler='_input_as_multiline_string', params=params, \
                   WorkingDir='/tmp')
    app.Parameters['-align'].off()
    app.Parameters['-tree'].off()

    if app.Parameters['-bootstrap'].isOff():
        if num_trees is None:
            num_trees = 1000

        app.Parameters['-bootstrap'].on(num_trees)

    if app.Parameters['-seed'].isOff():
        if seed is None:
            seed = randint(0, 1000)

        app.Parameters['-seed'].on(seed)

    if app.Parameters['-bootlabels'].isOff():
        app.Parameters['-bootlabels'].on("node")

    # Setup mapping. Clustalw clips identifiers. We will need to remap them.
    seq_collection = SequenceCollection(aln)
    int_map, int_keys = seq_collection.getIntMap()
    int_map = SequenceCollection(int_map)

    # Collect result
    result = app(int_map.toFasta())

    # Build tree
    tree = DndParser(result['Tree'].read(), constructor=PhyloNode)
    for node in tree.tips():
        node.Name = int_keys[node.Name]

    # Clean up
    result.cleanUp()
    del (seq_collection, app, result, int_map, int_keys)

    return tree
Example #20
0
def LoadSeqs(filename=None, format=None, data=None, moltype=None,
            name=None, aligned=True, label_to_name=None, parser_kw={},
            constructor_kw={}, **kw):
    """Initialize an alignment or collection of sequences.
    
    Arguments:
    - filename: name of the sequence file
    - format: format of the sequence file
    - data: optional explicit provision of sequences
    - moltype: the MolType, eg DNA, PROTEIN
    - aligned: set True if sequences are already aligned and have the same
      length, results in an Alignment object. If False, a SequenceCollection
      instance is returned instead. If callable, will use as a constructor
      (e.g. can pass in DenseAlignment or CodonAlignment).
    - label_to_name: function for converting original name into another
      name. Default behavior is to preserve the original FASTA label and
      comment. 
      To remove all FASTA label comments, and pass in only the label, pass in: 
            label_to_name=lambda x: x.split()[0]
      To look up names in a dict, pass in:
            label_to_name = lambda x: d.get(x, default_name)
      ...where d is a dict that's in scope, and default_name is what you want
      to assign any sequence that isn't in the dict.
    
    If format is None, will attempt to infer format from the filename
    suffix. If label_to_name is None, will attempt to infer correct
    conversion from the format.
    """
    
    if filename is None:
        assert data is not None
        assert format is None
        assert not kw, kw
    else:
        assert data is None, (filename, data)
        data = list(FromFilenameParser(filename, format, **parser_kw))

    # the following is a temp hack until we have the load API sorted out.
    if aligned: #if callable, call it -- expect either f(data) or bool
        if hasattr(aligned, '__call__'):
            return aligned(data=data, MolType=moltype, Name=name,
                label_to_name=label_to_name, **constructor_kw)
        else:   #was not callable, but wasn't False
            return Alignment(data=data, MolType=moltype, Name=name,
                label_to_name=label_to_name, **constructor_kw)
    else:   #generic case: return SequenceCollection
        return SequenceCollection(data, MolType=moltype, Name=name,
            label_to_name=label_to_name, **constructor_kw)
Example #21
0
def cdhit_clusters_from_seqs(seqs, moltype, params=None):
    """Returns the CD-HIT clusters given seqs

    seqs        : dict like collection of sequences
    moltype     : cogent.core.moltype object
    params      : cd-hit parameters

    NOTE: This method will call CD_HIT if moltype is PROTIEN,
        CD_HIT_EST if moltype is RNA/DNA, and raise if any other
        moltype is passed.
    """
    # keys are not remapped. Tested against seq_ids of 100char length
    seqs = SequenceCollection(seqs, MolType=moltype)
    #Create mapping between abbreviated IDs and full IDs
    int_map, int_keys = seqs.getIntMap()
    #Create SequenceCollection from int_map.
    int_map = SequenceCollection(int_map,MolType=moltype)
    
    # setup params and make sure the output argument is set
    if params is None:
        params = {}
    if '-o' not in params:
        params['-o'] = get_tmp_filename()

    # call the correct version of cd-hit base on moltype
    working_dir = get_tmp_filename()
    if moltype is PROTEIN:
        app = CD_HIT(WorkingDir=working_dir, params=params)
    elif moltype is RNA:
        app = CD_HIT_EST(WorkingDir=working_dir, params=params)
    elif moltype is DNA:
        app = CD_HIT_EST(WorkingDir=working_dir, params=params)
    else:
        raise ValueError, "Moltype must be either PROTEIN, RNA, or DNA"

    # grab result
    res = app(int_map.toFasta())
    clusters = parse_cdhit_clstr_file(res['CLSTR'].readlines())

    remapped_clusters = []
    for c in clusters:
        curr = [int_keys[i] for i in c]
        remapped_clusters.append(curr)

    # perform cleanup
    res.cleanUp()
    shutil.rmtree(working_dir)
    remove(params['-o'] + '.bak.clstr')

    return remapped_clusters
Example #22
0
def build_tree_from_alignment(aln, moltype, best_tree=False, params=None):
    """Returns a tree from Alignment object aln.
    
    aln: a cogent.core.alignment.Alignment object, or data that can be used 
    to build one.
    
    moltype: cogent.core.moltype.MolType object

    best_tree: unsupported
    
    params: dict of parameters to pass in to the Muscle app controller.
    
    The result will be an cogent.core.tree.PhyloNode object, or None if tree 
    fails.
    """
    # Create instance of app controller, enable tree, disable alignment
    app = Muscle(InputHandler='_input_as_multiline_string', params=params, \
                   WorkingDir='/tmp')

    app.Parameters['-cluster'].on()
    app.Parameters['-tree1'].on(get_tmp_filename(app.WorkingDir))
    app.Parameters['-seqtype'].on(moltype.label)

    seq_collection = SequenceCollection(aln, MolType=moltype)

    #Create mapping between abbreviated IDs and full IDs
    int_map, int_keys = seq_collection.getIntMap()
    #Create SequenceCollection from int_map.
    int_map = SequenceCollection(int_map,MolType=moltype)


    # Collect result
    result = app(int_map.toFasta())

    # Build tree
    tree = DndParser(result['Tree1Out'].read(), constructor=PhyloNode)
    
    for tip in tree.tips():
        tip.Name = int_keys[tip.Name]

    # Clean up
    result.cleanUp()
    del(seq_collection, app, result)

    return tree
Example #23
0
def align_unaligned_seqs(seqs,moltype,params=None,accurate=False):
    """Aligns unaligned sequences

    seqs: either list of sequence objects or list of strings
    add_seq_names: boolean. if True, sequence names are inserted in the list
        of sequences. if False, it assumes seqs is a list of lines of some
        proper format that the program can handle
    """
    #create SequenceCollection object from seqs
    seq_collection = SequenceCollection(seqs,MolType=moltype)
    #Create mapping between abbreviated IDs and full IDs
    int_map, int_keys = seq_collection.getIntMap()
    #Create SequenceCollection from int_map.
    int_map = SequenceCollection(int_map,MolType=moltype)
    #Create Mafft app.
    app = Mafft(InputHandler='_input_as_multiline_string',params=params)
    
    #Turn on correct moltype
    moltype_string = moltype.label.upper()
    app.Parameters[MOLTYPE_MAP[moltype_string]].on()
    
    #Do not report progress
    app.Parameters['--quiet'].on()
    
    #More accurate alignment, sacrificing performance.
    if accurate:
        app.Parameters['--globalpair'].on()
        app.Parameters['--maxiterate'].Value=1000
    
    #Get results using int_map as input to app
    res = app(int_map.toFasta())
    #Get alignment as dict out of results
    alignment = dict(MinimalFastaParser(res['StdOut'].readlines()))
    #Make new dict mapping original IDs
    new_alignment = {}
    for k,v in list(alignment.items()):
        new_alignment[int_keys[k]]=v
    #Create an Alignment object from alignment dict
    new_alignment = Alignment(new_alignment,MolType=moltype)
    #Clean up
    res.cleanUp()
    del(seq_collection,int_map,int_keys,app,res,alignment)

    return new_alignment
Example #24
0
def align_unaligned_seqs(seqs, moltype, params=None):
    """Returns an Alignment object from seqs.

    seqs: SequenceCollection object, or data that can be used to build one.
    
    moltype: a MolType object.  DNA, RNA, or PROTEIN.

    params: dict of parameters to pass in to the Muscle app controller.
    
    Result will be an Alignment object.
    """
    if not params:
        params = {}
    #create SequenceCollection object from seqs
    seq_collection = SequenceCollection(seqs, MolType=moltype)
    #Create mapping between abbreviated IDs and full IDs
    int_map, int_keys = seq_collection.getIntMap()
    #Create SequenceCollection from int_map.
    int_map = SequenceCollection(int_map, MolType=moltype)
    #get temporary filename
    params.update({'-out': get_tmp_filename()})
    #Create Muscle app.
    app = Muscle(InputHandler='_input_as_multiline_string',\
                 params=params)
    #Get results using int_map as input to app
    res = app(int_map.toFasta())
    #Get alignment as dict out of results
    alignment = dict(MinimalFastaParser(res['MuscleOut'].readlines()))
    #Make new dict mapping original IDs
    new_alignment = {}
    for k, v in alignment.items():
        new_alignment[int_keys[k]] = v
    #Create an Alignment object from alignment dict
    new_alignment = Alignment(new_alignment, MolType=moltype)
    #Clean up
    res.cleanUp()
    del (seq_collection, int_map, int_keys, app, res, alignment, params)

    return new_alignment
Example #25
0
def create_locarnap_alignment(seqs,moltype,struct=False,params=None):
    """Returns mlocarna results given an unaligned SequenceCollection.
    
        - seqs: A SequenceCollection object or something that behaves like one.
        - moltype: cogent.core.moltype object.
        -struct: Boolean whether or not to also output vienna structure string
    """
    #Construct SequenceCollection object.
    seqs = SequenceCollection(seqs,MolType=moltype)

    #need to make int map.
    int_map, int_keys = seqs.getIntMap()
    #construct SequenceCollection object from int map to use functionality
    int_map = SequenceCollection(int_map, MolType=moltype)
    
    #Create application.
    app = MLocarna(InputHandler='_input_as_multiline_string',params=params)
    #Get temporary directory to write all mlocarna files.
    mlocarna_dir = get_tmp_filename(suffix='')
    app.Parameters['--tgtdir'].on(mlocarna_dir)

    #set parameters to run locarna-p
    app.Parameters['--write-structure'].on()
    app.Parameters['--probabilistic'].on()
    app.Parameters['--consistency-transformation'].on()
    res = app(int_map.toFasta())
    #get the structure from the results if necessary
    if struct:
        structfile = open(res['ProbabilisticAlignment'].name, 'U')
        structure = ""
        newstrline = True
        for line in structfile:
            line = line.strip()
            #read in structure lines of alignment (--write-structure)
            if len(line) > 0 and (line[0] == "." or line[0] == "("):
                #only append if new structure aspect, since struct is 
                #written both above and below blocks in alignment
                if newstrline:
                    structure += line
                    newstrline = not newstrline
                else:
                    newstrline = not newstrline
        


    aligned = dict(ClustalParser(res['ClustalAlignment']))
    
    #Make new dict mapping original IDs
    new_alignment={}
    for k,v in aligned.items():
        new_alignment[int_keys.get(k,k)]=v
    #Create an Alignment object from alignment dict
    new_alignment = Alignment(new_alignment,MolType=moltype)

    #Clean up after MlocARNA
    res.cleanUp()
    shutil.rmtree(mlocarna_dir)

    #output alignment and structure if asked for, else outout just alignment
    if struct:
        return new_alignment, structure
    else:
        return new_alignment
Example #26
0
def align_two_alignments(aln1, aln2, params=None):
    """Returns an Alignment object from two existing Alignments.
    
    aln1, aln2: cogent.core.alignment.Alignment objects, or data that can be 
    used to build them.
    
    params: dict of parameters to pass in to the Muscle app controller.
    """
    if not params:
        params = {}

    #create SequenceCollection object from aln1
    aln1_collection = SequenceCollection(aln1)
    #Create mapping between abbreviated IDs and full IDs
    aln1_int_map, aln1_int_keys = aln1_collection.getIntMap(prefix='aln1_')
    #Create SequenceCollection from int_map.
    aln1_int_map = SequenceCollection(aln1_int_map)

    #create SequenceCollection object from aln2
    aln2_collection = SequenceCollection(aln2)
    #Create mapping between abbreviated IDs and full IDs
    aln2_int_map, aln2_int_keys = aln2_collection.getIntMap(prefix='aln2_')
    #Create SequenceCollection from int_map.
    aln2_int_map = SequenceCollection(aln2_int_map)

    #set output and profile options
    params.update({'-out':get_tmp_filename(), '-profile':True})

    #save aln1 to tmp file
    aln1_filename = get_tmp_filename()
    aln1_out = open(aln1_filename,'w')
    aln1_out.write(aln1_int_map.toFasta())
    aln1_out.close()

    #save aln2 to tmp file
    aln2_filename = get_tmp_filename()
    aln2_out = open(aln2_filename, 'w')
    aln2_out.write(aln2_int_map.toFasta())
    aln2_out.close()

    #Create Muscle app and get results
    app = Muscle(InputHandler='_input_as_multifile', params=params)
    res = app((aln1_filename, aln2_filename))

    #Get alignment as dict out of results
    alignment = dict(MinimalFastaParser(res['MuscleOut'].readlines()))

    #Make new dict mapping original IDs
    new_alignment = {}
    for k,v in alignment.items():
        if k in aln1_int_keys:
            new_alignment[aln1_int_keys[k]] = v
        else:
            new_alignment[aln2_int_keys[k]] = v

    #Create an Alignment object from alignment dict
    new_alignment = Alignment(new_alignment)

    #Clean up
    res.cleanUp()
    del(aln1_collection, aln1_int_map, aln1_int_keys)
    del(aln2_collection, aln2_int_map, aln2_int_keys)
    del(app, res, alignment, params)
    remove(aln1_filename)
    remove(aln2_filename)

    return new_alignment
Example #27
0
def add_seqs_to_alignment(seqs, aln, moltype, params=None, accurate=False):
    """Returns an Alignment object from seqs and existing Alignment.

    seqs: a cogent.core.sequence.Sequence object, or data that can be used
    to build one.

    aln: an cogent.core.alignment.Alignment object, or data that can be used
    to build one

    params: dict of parameters to pass in to the Mafft app controller.
    """
    #create SequenceCollection object from seqs
    seq_collection = SequenceCollection(seqs,MolType=moltype)
    #Create mapping between abbreviated IDs and full IDs
    seq_int_map, seq_int_keys = seq_collection.getIntMap()
    #Create SequenceCollection from int_map.
    seq_int_map = SequenceCollection(seq_int_map,MolType=moltype)
    
    #create Alignment object from aln
    aln = Alignment(aln,MolType=moltype)
    #Create mapping between abbreviated IDs and full IDs
    aln_int_map, aln_int_keys = aln.getIntMap(prefix='seqn_')
    #Create SequenceCollection from int_map.
    aln_int_map = Alignment(aln_int_map,MolType=moltype)
    
    #Update seq_int_keys with aln_int_keys
    seq_int_keys.update(aln_int_keys)
    
    #Create Mafft app.
    app = Mafft(InputHandler='_input_as_multiline_string',\
        params=params,
        SuppressStderr=True)
    
    #Turn on correct moltype
    moltype_string = moltype.label.upper()
    app.Parameters[MOLTYPE_MAP[moltype_string]].on()
    
    #Do not report progress
    app.Parameters['--quiet'].on()
    
    #Add aln_int_map as seed alignment
    app.Parameters['--seed'].on(\
        app._tempfile_as_multiline_string(aln_int_map.toFasta()))
        
    #More accurate alignment, sacrificing performance.
    if accurate:
        app.Parameters['--globalpair'].on()
        app.Parameters['--maxiterate'].Value=1000
    
    #Get results using int_map as input to app
    res = app(seq_int_map.toFasta())
    #Get alignment as dict out of results
    alignment = dict(MinimalFastaParser(res['StdOut'].readlines()))
    
    #Make new dict mapping original IDs
    new_alignment = {}
    for k,v in list(alignment.items()):
        key = k.replace('_seed_','')
        new_alignment[seq_int_keys[key]]=v
    #Create an Alignment object from alignment dict
    new_alignment = Alignment(new_alignment,MolType=moltype)
    #Clean up
    res.cleanUp()
    remove(app.Parameters['--seed'].Value)
    del(seq_collection,seq_int_map,seq_int_keys,\
        aln,aln_int_map,aln_int_keys,app,res,alignment)

    return new_alignment
Example #28
0
 def __call__(self, seq_path, result_path=None, log_path=None, \
     failure_path=None, cmbuild_params=None, cmalign_params=None):
     
     log_params = []
     # load candidate sequences
     candidate_sequences = dict(MinimalFastaParser(open(seq_path,'U')))
     
     # load template sequences
     try:
         info, template_alignment, struct = list(MinimalRfamParser(open(\
             self.Params['template_filepath'],'U'),\
             seq_constructor=ChangedSequence))[0]
     except RecordError:
         raise ValueError, "Template alignment must be in Stockholm format with corresponding secondary structure annotation when using InfernalAligner."
     
     moltype = self.Params['moltype']
     
     #Need to make separate mapping for unaligned sequences
     unaligned = SequenceCollection(candidate_sequences,MolType=moltype)
     int_map, int_keys = unaligned.getIntMap(prefix='unaligned_')
     int_map = SequenceCollection(int_map,MolType=moltype)
     
     #Turn on --gapthresh option in cmbuild to force alignment to full model
     if cmbuild_params is None:
         cmbuild_params = {}
     cmbuild_params.update({'--gapthresh':1.0})
     
     #record cmbuild parameters
     log_params.append('cmbuild parameters:')
     log_params.append(str(cmbuild_params))
     
     #Turn on --sub option in Infernal, since we know the unaligned sequences
     # are fragments.
     #Also turn on --gapthresh to use same gapthresh as was used to build
     # model
     
     if cmalign_params is None:
         cmalign_params = {}
     cmalign_params.update({'--sub':True,'--gapthresh':1.0})
     
     #record cmalign parameters
     log_params.append('cmalign parameters:')
     log_params.append(str(cmalign_params))
     
     #Align sequences to alignment including alignment gaps.
     aligned, struct_string = cmalign_from_alignment(aln=template_alignment,\
         structure_string=struct,\
         seqs=int_map,\
         moltype=moltype,\
         include_aln=True,\
         params=cmalign_params,\
         cmbuild_params=cmbuild_params)
     
     #Pull out original sequences from full alignment.
     infernal_aligned={}
     aligned_dict = aligned.NamedSeqs
     for key in int_map.Names:
         infernal_aligned[int_keys.get(key,key)]=aligned_dict[key]
     
     #Create an Alignment object from alignment dict
     infernal_aligned = Alignment(infernal_aligned,MolType=moltype)
     
     if log_path is not None:
         log_file = open(log_path,'w')
         log_file.write('\n'.join(log_params))
         log_file.close()
     
     if result_path is not None:
         result_file = open(result_path,'w')
         result_file.write(infernal_aligned.toFasta())
         result_file.close()
         return None
     else:
         try:
             return infernal_aligned
         except ValueError:
             return {}
Example #29
0
def align_two_alignments(aln1, aln2, params=None):
    """Returns an Alignment object from two existing Alignments.
    
    aln1, aln2: cogent.core.alignment.Alignment objects, or data that can be 
    used to build them.
    
    params: dict of parameters to pass in to the Muscle app controller.
    """
    if not params:
        params = {}

    #create SequenceCollection object from aln1
    aln1_collection = SequenceCollection(aln1)
    #Create mapping between abbreviated IDs and full IDs
    aln1_int_map, aln1_int_keys = aln1_collection.getIntMap(prefix='aln1_')
    #Create SequenceCollection from int_map.
    aln1_int_map = SequenceCollection(aln1_int_map)

    #create SequenceCollection object from aln2
    aln2_collection = SequenceCollection(aln2)
    #Create mapping between abbreviated IDs and full IDs
    aln2_int_map, aln2_int_keys = aln2_collection.getIntMap(prefix='aln2_')
    #Create SequenceCollection from int_map.
    aln2_int_map = SequenceCollection(aln2_int_map)

    #set output and profile options
    params.update({'-out': get_tmp_filename(), '-profile': True})

    #save aln1 to tmp file
    aln1_filename = get_tmp_filename()
    aln1_out = open(aln1_filename, 'w')
    aln1_out.write(aln1_int_map.toFasta())
    aln1_out.close()

    #save aln2 to tmp file
    aln2_filename = get_tmp_filename()
    aln2_out = open(aln2_filename, 'w')
    aln2_out.write(aln2_int_map.toFasta())
    aln2_out.close()

    #Create Muscle app and get results
    app = Muscle(InputHandler='_input_as_multifile', params=params)
    res = app((aln1_filename, aln2_filename))

    #Get alignment as dict out of results
    alignment = dict(MinimalFastaParser(res['MuscleOut'].readlines()))

    #Make new dict mapping original IDs
    new_alignment = {}
    for k, v in alignment.items():
        if k in aln1_int_keys:
            new_alignment[aln1_int_keys[k]] = v
        else:
            new_alignment[aln2_int_keys[k]] = v

    #Create an Alignment object from alignment dict
    new_alignment = Alignment(new_alignment)

    #Clean up
    res.cleanUp()
    del (aln1_collection, aln1_int_map, aln1_int_keys)
    del (aln2_collection, aln2_int_map, aln2_int_keys)
    del (app, res, alignment, params)
    remove(aln1_filename)
    remove(aln2_filename)

    return new_alignment
Example #30
0
def add_seqs_to_alignment(seqs, aln, moltype, params=None):
    """Returns an Alignment object from seqs and existing Alignment.

    seqs: a cogent.core.alignment.SequenceCollection object, or data that can
    be used to build one.

    aln: a cogent.core.alignment.Alignment object, or data that can be used to
    build one

    params: dict of parameters to pass in to the Clustal app controller.
    """
    #create SequenceCollection object from seqs
    seq_collection = SequenceCollection(seqs, MolType=moltype)
    #Create mapping between abbreviated IDs and full IDs
    seq_int_map, seq_int_keys = seq_collection.getIntMap()
    #Create SequenceCollection from int_map.
    seq_int_map = SequenceCollection(seq_int_map, MolType=moltype)

    #create Alignment object from aln
    aln = Alignment(aln, MolType=moltype)
    #Create mapping between abbreviated IDs and full IDs
    aln_int_map, aln_int_keys = aln.getIntMap(prefix='seqn_')
    #Create SequenceCollection from int_map.
    aln_int_map = Alignment(aln_int_map, MolType=moltype)

    #Update seq_int_keys with aln_int_keys
    seq_int_keys.update(aln_int_keys)

    #Create Mafft app.
    app = Clustalw(InputHandler='_input_as_multiline_string',\
        params=params,
        SuppressStderr=True)
    app.Parameters['-align'].off()
    app.Parameters['-infile'].off()
    app.Parameters['-sequences'].on()

    #Add aln_int_map as profile1
    app.Parameters['-profile1'].on(\
        app._tempfile_as_multiline_string(aln_int_map.toFasta()))

    #Add seq_int_map as profile2
    app.Parameters['-profile2'].on(\
        app._tempfile_as_multiline_string(seq_int_map.toFasta()))
    #Get results using int_map as input to app
    res = app()

    #Get alignment as dict out of results
    alignment = dict(ClustalParser(res['Align'].readlines()))

    #Make new dict mapping original IDs
    new_alignment = {}
    for k, v in alignment.items():
        new_alignment[seq_int_keys[k]] = v
    #Create an Alignment object from alignment dict
    new_alignment = Alignment(new_alignment, MolType=moltype)
    #Clean up
    res.cleanUp()
    remove(app.Parameters['-profile1'].Value)
    remove(app.Parameters['-profile2'].Value)
    del(seq_collection,seq_int_map,seq_int_keys,\
        aln,aln_int_map,aln_int_keys,app,res,alignment)

    return new_alignment
Example #31
0
def stockholm_from_alignment(aln, interleave_len=None, GC_annotation=None):
    """Returns a string in Stockholm format.
    
        - aln: can be an Alignment object or a dict.
        - interleave_len: sequence line width.  Only available if sequences are
            aligned.
        - GC_annotation: dict containing Per-column annotation {<tag>:<s>},
            added to Stockholm file in the following format: #=GC <tag> <s>
            - <s> is an aligned text line of annotation type <tag>.
            - #=GC lines are associated with a sequence alignment block;
            - <s> is aligned to the residues in the alignment block, and has the 
            same length as the rest of the block. #=GC lines are
            placed at the end of each block. 
    """
    if not aln:
        return ''

    # get seq output order
    try:
        order = aln.RowOrder
    except:
        order = aln.keys()
        order.sort()

    seqs = SequenceCollection(aln)
    stockholm_list = ["# STOCKHOLM 1.0\n"]

    if seqs.isRagged():
        raise ValueError,\
             "Sequences in alignment are not all the same length." +\
             "Cannot generate Stockholm format."

    aln_len = seqs.SeqLen
    #Get all labels
    labels = copy(seqs.Names)

    #Get ordered seqs
    ordered_seqs = [seqs.NamedSeqs[label] for label in order]

    if GC_annotation is not None:
        GC_annotation_list = \
            [(k,GC_annotation[k]) for k in sorted(GC_annotation.keys())]
        #Add GC_annotation to list of labels.
        labels.extend(['#=GC ' + k for k in GC_annotation.keys()])
        for k, v in GC_annotation.items():
            if len(v) != aln_len:
                raise ValueError, """GC annotation %s is not same length as alignment. Cannot generate Stockholm format.""" % (
                    k)

    #Find all label lengths in order to get padding.
    label_lengths = [len(l) for l in labels]
    label_max = max(label_lengths)
    max_spaces = label_max + 4

    if interleave_len is not None:
        curr_ix = 0
        while curr_ix < aln_len:
            stockholm_list.extend(["%s%s%s"%(x,' '*(max_spaces-len(x)),\
                y[curr_ix:curr_ix+ \
                interleave_len]) for x,y in zip(order, ordered_seqs)])
            if GC_annotation is not None:
                stockholm_list.extend(["#=GC %s%s%s"%(x,\
                    ' '*(max_spaces-len(x)-5),\
                    y[curr_ix:curr_ix + interleave_len]) for x,y in\
                    GC_annotation_list])
            stockholm_list.append("")
            curr_ix += interleave_len
    else:
        stockholm_list.extend(["%s%s%s"%(x,' '*(max_spaces-len(x)),y) \
            for x,y in zip(order, ordered_seqs)])
        if GC_annotation is not None:
            stockholm_list.extend(["#=GC %s%s%s"%(x,' '*(max_spaces-len(x)-5),\
                y) for x,y in GC_annotation_list])
        stockholm_list.append("")

    return '\n'.join(stockholm_list) + '//'
Example #32
0
def build_tree_from_alignment(aln, moltype, best_tree=False, params=None):
    """Returns a tree from Alignment object aln.

    aln: an cogent.core.alignment.Alignment object, or data that can be used
    to build one.

    moltype: cogent.core.moltype.MolType object

    best_tree: if True (default:False), uses a slower but more accurate
    algorithm to build the tree.

    params: dict of parameters to pass in to the Clustal app controller.

    The result will be an cogent.core.tree.PhyloNode object, or None if tree
    fails.
    """
    # Create instance of app controller, enable tree, disable alignment
    app = Clustalw(InputHandler='_input_as_multiline_string', params=params, \
                   WorkingDir='/tmp')
    app.Parameters['-align'].off()

    #Set params to empty dict if None.
    if params is None:
        params = {}

    if moltype == DNA or moltype == RNA:
        params['-type'] = 'd'
    elif moltype == PROTEIN:
        params['-type'] = 'p'
    else:
        raise ValueError, "moltype must be DNA, RNA, or PROTEIN"

    # best_tree -> bootstrap
    if best_tree:
        if '-bootstrap' not in params:
            app.Parameters['-bootstrap'].on(1000)
        if '-seed' not in params:
            app.Parameters['-seed'].on(randint(0, 1000))
        if '-bootlabels' not in params:
            app.Parameters['-bootlabels'].on('nodes')
    else:
        app.Parameters['-tree'].on()

    # Setup mapping. Clustalw clips identifiers. We will need to remap them.
    seq_collection = SequenceCollection(aln)
    int_map, int_keys = seq_collection.getIntMap()
    int_map = SequenceCollection(int_map)

    # Collect result
    result = app(int_map.toFasta())

    # Build tree
    tree = DndParser(result['Tree'].read(), constructor=PhyloNode)
    for node in tree.tips():
        node.Name = int_keys[node.Name]

    # Clean up
    result.cleanUp()
    del (seq_collection, app, result, int_map, int_keys)

    return tree
Example #33
0
def stockholm_from_alignment(aln, interleave_len=None, GC_annotation=None):
    """Returns a string in Stockholm format.
    
        - aln: can be an Alignment object or a dict.
        - interleave_len: sequence line width.  Only available if sequences are
            aligned.
        - GC_annotation: dict containing Per-column annotation {<tag>:<s>},
            added to Stockholm file in the following format: #=GC <tag> <s>
            - <s> is an aligned text line of annotation type <tag>.
            - #=GC lines are associated with a sequence alignment block;
            - <s> is aligned to the residues in the alignment block, and has the 
            same length as the rest of the block. #=GC lines are
            placed at the end of each block. 
    """
    if not aln:
        return ""

    # get seq output order
    try:
        order = aln.RowOrder
    except:
        order = aln.keys()
        order.sort()

    seqs = SequenceCollection(aln)
    stockholm_list = ["# STOCKHOLM 1.0\n"]

    if seqs.isRagged():
        raise ValueError, "Sequences in alignment are not all the same length." + "Cannot generate Stockholm format."

    aln_len = seqs.SeqLen
    # Get all labels
    labels = copy(seqs.Names)

    # Get ordered seqs
    ordered_seqs = [seqs.NamedSeqs[label] for label in order]

    if GC_annotation is not None:
        GC_annotation_list = [(k, GC_annotation[k]) for k in sorted(GC_annotation.keys())]
        # Add GC_annotation to list of labels.
        labels.extend(["#=GC " + k for k in GC_annotation.keys()])
        for k, v in GC_annotation.items():
            if len(v) != aln_len:
                raise ValueError, """GC annotation %s is not same length as alignment. Cannot generate Stockholm format.""" % (
                    k
                )

    # Find all label lengths in order to get padding.
    label_lengths = [len(l) for l in labels]
    label_max = max(label_lengths)
    max_spaces = label_max + 4

    if interleave_len is not None:
        curr_ix = 0
        while curr_ix < aln_len:
            stockholm_list.extend(
                [
                    "%s%s%s" % (x, " " * (max_spaces - len(x)), y[curr_ix : curr_ix + interleave_len])
                    for x, y in zip(order, ordered_seqs)
                ]
            )
            if GC_annotation is not None:
                stockholm_list.extend(
                    [
                        "#=GC %s%s%s" % (x, " " * (max_spaces - len(x) - 5), y[curr_ix : curr_ix + interleave_len])
                        for x, y in GC_annotation_list
                    ]
                )
            stockholm_list.append("")
            curr_ix += interleave_len
    else:
        stockholm_list.extend(["%s%s%s" % (x, " " * (max_spaces - len(x)), y) for x, y in zip(order, ordered_seqs)])
        if GC_annotation is not None:
            stockholm_list.extend(
                ["#=GC %s%s%s" % (x, " " * (max_spaces - len(x) - 5), y) for x, y in GC_annotation_list]
            )
        stockholm_list.append("")

    return "\n".join(stockholm_list) + "//"
Example #34
0
def add_seqs_to_alignment(seqs, aln, params=None):
    """Returns an Alignment object from seqs and existing Alignment.

    seqs: a cogent.core.alignment.SequenceCollection object, or data that can
    be used to build one.

    aln: a cogent.core.alignment.Alignment object, or data that can be used
    to build one

    params: dict of parameters to pass in to the Muscle app controller.
    """
    if not params:
        params = {}

    #create SequenceCollection object from seqs
    seqs_collection = SequenceCollection(seqs)
    #Create mapping between abbreviated IDs and full IDs
    seqs_int_map, seqs_int_keys = seqs_collection.getIntMap(prefix='seq_')
    #Create SequenceCollection from int_map.
    seqs_int_map = SequenceCollection(seqs_int_map)

    #create SequenceCollection object from aln
    aln_collection = SequenceCollection(aln)
    #Create mapping between abbreviated IDs and full IDs
    aln_int_map, aln_int_keys = aln_collection.getIntMap(prefix='aln_')
    #Create SequenceCollection from int_map.
    aln_int_map = SequenceCollection(aln_int_map)

    #set output and profile options
    params.update({'-out':get_tmp_filename(), '-profile':True})

    #save seqs to tmp file
    seqs_filename = get_tmp_filename()
    seqs_out = open(seqs_filename,'w')
    seqs_out.write(seqs_int_map.toFasta())
    seqs_out.close()

    #save aln to tmp file
    aln_filename = get_tmp_filename()
    aln_out = open(aln_filename, 'w')
    aln_out.write(aln_int_map.toFasta())
    aln_out.close()

    #Create Muscle app and get results
    app = Muscle(InputHandler='_input_as_multifile', params=params,
                 WorkingDir=tempfile.gettempdir())
    res = app((aln_filename, seqs_filename))

    #Get alignment as dict out of results
    alignment = dict(parse_fasta(res['MuscleOut']))
    #Make new dict mapping original IDs
    new_alignment = {}
    for k,v in alignment.items():
        if k in seqs_int_keys:
            new_alignment[seqs_int_keys[k]] = v
        else:
            new_alignment[aln_int_keys[k]] = v

    #Create an Alignment object from alignment dict
    new_alignment = Alignment(new_alignment)

    #Clean up
    res.cleanUp()
    del(seqs_collection, seqs_int_map, seqs_int_keys)
    del(aln_collection, aln_int_map, aln_int_keys)
    del(app, res, alignment, params)
    remove(seqs_filename)
    remove(aln_filename)

    return new_alignment
Example #35
0
 def toSequenceCollection(self, Bases = False):
     names = self.Names
     flow_dict = self.NamedFlows
     flows = [flow_dict[f].toSeq(Bases = Bases) for f in names]
     return SequenceCollection(flows)
Example #36
0
def create_locarnap_alignment(seqs, moltype, struct=False, params=None):
    """Returns mlocarna results given an unaligned SequenceCollection.
    
        - seqs: A SequenceCollection object or something that behaves like one.
        - moltype: cogent.core.moltype object.
        -struct: Boolean whether or not to also output vienna structure string
    """
    #Construct SequenceCollection object.
    seqs = SequenceCollection(seqs, MolType=moltype)

    #need to make int map.
    int_map, int_keys = seqs.getIntMap()
    #construct SequenceCollection object from int map to use functionality
    int_map = SequenceCollection(int_map, MolType=moltype)

    #Create application.
    app = MLocarna(InputHandler='_input_as_multiline_string', params=params)
    #Get temporary directory to write all mlocarna files.
    mlocarna_dir = get_tmp_filename(suffix='')
    app.Parameters['--tgtdir'].on(mlocarna_dir)

    #set parameters to run locarna-p
    app.Parameters['--write-structure'].on()
    app.Parameters['--probabilistic'].on()
    app.Parameters['--consistency-transformation'].on()
    res = app(int_map.toFasta())
    #get the structure from the results if necessary
    if struct:
        structfile = open(res['ProbabilisticAlignment'].name, 'U')
        structure = ""
        newstrline = True
        for line in structfile:
            line = line.strip()
            #read in structure lines of alignment (--write-structure)
            if len(line) > 0 and (line[0] == "." or line[0] == "("):
                #only append if new structure aspect, since struct is
                #written both above and below blocks in alignment
                if newstrline:
                    structure += line
                    newstrline = not newstrline
                else:
                    newstrline = not newstrline

    aligned = dict(ClustalParser(res['ClustalAlignment']))

    #Make new dict mapping original IDs
    new_alignment = {}
    for k, v in aligned.items():
        new_alignment[int_keys.get(k, k)] = v
    #Create an Alignment object from alignment dict
    new_alignment = Alignment(new_alignment, MolType=moltype)

    #Clean up after MlocARNA
    res.cleanUp()
    shutil.rmtree(mlocarna_dir)

    #output alignment and structure if asked for, else outout just alignment
    if struct:
        return new_alignment, structure
    else:
        return new_alignment
Example #37
0
def add_seqs_to_alignment(seqs, aln, moltype, params=None, accurate=False):
    """Returns an Alignment object from seqs and existing Alignment.

    seqs: a cogent.core.sequence.Sequence object, or data that can be used
    to build one.

    aln: an cogent.core.alignment.Alignment object, or data that can be used
    to build one

    params: dict of parameters to pass in to the Mafft app controller.
    """
    #create SequenceCollection object from seqs
    seq_collection = SequenceCollection(seqs,MolType=moltype)
    #Create mapping between abbreviated IDs and full IDs
    seq_int_map, seq_int_keys = seq_collection.getIntMap()
    #Create SequenceCollection from int_map.
    seq_int_map = SequenceCollection(seq_int_map,MolType=moltype)
    
    #create Alignment object from aln
    aln = Alignment(aln,MolType=moltype)
    #Create mapping between abbreviated IDs and full IDs
    aln_int_map, aln_int_keys = aln.getIntMap(prefix='seqn_')
    #Create SequenceCollection from int_map.
    aln_int_map = Alignment(aln_int_map,MolType=moltype)
    
    #Update seq_int_keys with aln_int_keys
    seq_int_keys.update(aln_int_keys)
    
    #Create Mafft app.
    app = Mafft(InputHandler='_input_as_multiline_string',\
        params=params,
        SuppressStderr=True)
    
    #Turn on correct moltype
    moltype_string = moltype.label.upper()
    app.Parameters[MOLTYPE_MAP[moltype_string]].on()
    
    #Do not report progress
    app.Parameters['--quiet'].on()
    
    #Add aln_int_map as seed alignment
    app.Parameters['--seed'].on(\
        app._tempfile_as_multiline_string(aln_int_map.toFasta()))
        
    #More accurate alignment, sacrificing performance.
    if accurate:
        app.Parameters['--globalpair'].on()
        app.Parameters['--maxiterate'].Value=1000
    
    #Get results using int_map as input to app
    res = app(seq_int_map.toFasta())
    #Get alignment as dict out of results
    alignment = dict(parse_fasta(res['StdOut']))
    
    #Make new dict mapping original IDs
    new_alignment = {}
    for k,v in alignment.items():
        key = k.replace('_seed_','')
        new_alignment[seq_int_keys[key]]=v
    #Create an Alignment object from alignment dict
    new_alignment = Alignment(new_alignment,MolType=moltype)
    #Clean up
    res.cleanUp()
    remove(app.Parameters['--seed'].Value)
    del(seq_collection,seq_int_map,seq_int_keys,\
        aln,aln_int_map,aln_int_keys,app,res,alignment)

    return new_alignment