Ejemplo n.º 1
0
def remove(sequence, hmmfile, domNumber):
    """
    Cuts one domain out of the input sequence, together with at most one
    neighbouring stretch of 5 residues (the linker).

    Args:
        sequence: The full gene sequence (including both domain and linker regions)
        hmmfile: hmm file containing hmm of domain to be removed
        domNumber: The position of the domain to be removed w.r.t. the other domains

    Returns:
        The sequence with the chosen domain deleted. For domNumber > 0 the
        5 residues before the domain are deleted as well; otherwise, when
        domNumber is not the last domain, the 5 residues after the domain
        are deleted; otherwise only the domain itself is deleted.
    """
    locate = findDomains if HMMER else findMotifs
    starts, ends = locate(sequence, hmmfile)[:2]

    # ends[domNumber] is used as the index of the domain's last residue,
    # so the cut stops one position past it.
    cutFrom = starts[domNumber]
    cutTo = ends[domNumber] + 1

    #Removes one of the linkers if necessary
    if domNumber > 0:
        cutFrom -= 5
    elif domNumber < len(starts) - 1:
        cutTo += 5

    return sequence[:cutFrom] + sequence[cutTo:]
Ejemplo n.º 2
0
def genRandomSequence(numDoms):
    """
    Generates a random zf-C2H2 protein sequence with the given number of domains.
    Includes sequence before and after zf domain.

    A random file under DATAPATH is opened, a random line is drawn from
    every second line of it (starting with the second), alignment gaps
    ('-') are stripped, and domains are located with findDomains using the
    module-level hmmfile. The draw is repeated until the sequence has at
    least numDoms domains and a non-empty prefix and suffix. The result is
    prefix + numDoms randomly chosen domains joined by randomly chosen
    linkers + suffix.

    Args:
        numDoms: Number of domains the generated sequence should contain

    Returns:
        The generated sequence with gaps, lowercase letters and the
        characters 'BJOUXZ' removed.
    """
    #Gaps, lowercase letters and illegal AA characters are all deleted
    unwanted = set('-' + string.ascii_lowercase + 'BJOUXZ')
    files = ls(DATAPATH)

    #Redraw (iteratively, so a long run of rejections cannot exhaust the
    #recursion limit) until a usable template sequence is found
    while True:
        with open(DATAPATH + choice(files)) as handle:
            records = handle.readlines()[1::2]
        #str methods return a new string; the result has to be kept for
        #the gaps to actually be gone before domains are searched
        sequence = choice(records).strip().replace('-', '')

        starts, ends, seqs = findDomains(sequence, hmmfile)
        if len(starts) < numDoms:
            continue
        prefix = sequence[:starts[0]]
        # NOTE(review): the linker slices below use ends[i] + 1, i.e. ends
        # look inclusive, so this suffix seems to start on the last residue
        # of the final domain -- confirm whether ends[-1] + 1 was intended.
        suffix = sequence[ends[-1]:]
        if prefix != '' and suffix != '':
            break

    linkers = [sequence[ends[i] + 1:starts[i + 1]]
               for i in range(len(starts) - 1)]

    pieces = []
    for _ in range(numDoms - 1):
        pieces.append(choice(seqs))
        pieces.append(choice(linkers))
    pieces.append(choice(seqs))

    newSeq = prefix + ''.join(pieces) + suffix
    return ''.join(aa for aa in newSeq if aa not in unwanted)
Ejemplo n.º 3
0
def duplicate(sequence, hmmfile, domNumber, length):
    """
    Duplicates a run of `length` consecutive domains, starting at domain
    number `domNumber`, inside the sequence.

    The copy is placed right after the last domain of the run, separated
    from it by the fixed linker 'TGEVK'.

    Args:
        sequence: The full gene sequence (including both domain and linker regions)
        hmmfile: hmm file containing hmm of domain to be duplicated
        domNumber: The position of the first domain to be duplicated w.r.t. the other domains
        length: Length (#domains) involved in duplication

    Returns:
        sequence (str ): The sequence after the specified duplication.
    """
    BASELINKER = 'TGEVK'

    locate = findDomains if HMMER else findMotifs
    starts, ends = locate(sequence, hmmfile)[:2]

    # Everything up to and including the last duplicated domain ...
    lastDuplicated = domNumber + length - 1
    head = sequence[:ends[lastDuplicated] + 1]
    # ... followed by everything from the first duplicated domain onwards,
    # which repeats the run and then continues with the rest of the gene.
    tail = sequence[starts[domNumber]:]

    return head + BASELINKER + tail
Ejemplo n.º 4
0
def generateIQTree():
    """
    Simulates a host tree and guest tree starting from one domain, evolves a
    root sequence along them, adds an 'Outgroup' node, writes the trees and
    the domain sequences to disk and runs mlTree on the result.

    Files touched by name in this function: 'testtree.nwk', 'hosttree.nwk',
    'testfasta.fa' (via writeFasta) and 'testfasta.fa.treefile', which is
    read back after mlTree (presumably mlTree produces it -- confirm).

    Returns:
        (hostTree, guestTree, iqtree): the simulated host tree, the pruned
        guest tree with the outgroup attached, and the tree loaded from
        'testfasta.fa.treefile' with 'Outgroup' set as its outgroup.
    """
    startingDomains = 1

    hostTree = createRandomTopology(1, 1, lambda x: x)
    guestTree, nodeMap = buildGuestTree(hostTree, s2, expfunc, .2, gaussNoise,
                                        startingDomains)

    rootSequence = grs(startingDomains)
    evolveAlongTree(hostTree, guestTree, nodeMap, rootSequence, hmmfile,
                    emissionProbs, transmat)

    names = []
    seqs = []
    for hostNode in hostTree:
        finder = findDomains if HMMER else findMotifs
        seqs.extend(finder(hostNode.sequence, hmmfile)[2])
        # Surviving guest leaves of this host node, ordered by position
        surviving = sorted((leaf.position, leaf.name)
                           for leaf in findLeaves(nodeMap[hostNode])
                           if leaf.event != 'LOSS')
        names.extend(label for _, label in surviving)

    guestTree = prune(guestTree, names)

    # Attach an extra node named 'Outgroup' directly under the guest root
    outgroup = Tree()
    outgroup.up = guestTree
    guestTree.children.append(outgroup)
    outgroup.name = 'Outgroup'

    # The outgroup sequence is the first domain of an evolved root sequence
    evolvedRoot = evolveSequence(rootSequence, .1, 2, emissionProbs, hmmfile,
                                 transmat)
    finder = findDomains if HMMER else findMotifs
    outseq = finder(evolvedRoot, hmmfile)[2][0]
    outgroup.add_feature('sequence', outseq)
    seqs.insert(0, outseq)
    names.insert(0, 'Outgroup')

    guestTree.write(outfile='testtree.nwk')
    hostTree.write(outfile='hosttree.nwk')
    addRandomTrees('testtree.nwk')

    writeFasta(names, seqs, 'testfasta.fa', False)
    mlTree('testfasta.fa', 'testtree.nwk', True)
    iqtree = Tree('testfasta.fa.treefile')
    iqtree.set_outgroup(iqtree&('Outgroup'))

    return hostTree, guestTree, iqtree
Ejemplo n.º 5
0
def evolveSequence(sequence, rate, branchLength, emissionProbs, hmmfile,
                   transmat):
    """
    Putting the previous steps together, simulates evolution of a full sequence
    including both domains and non-domain sequence.

    The domains are located, the sequence is split around them, every domain
    is passed through evolveDomain and every remaining fragment through
    evolveLinker (or evolveEmptyLinker when the fragment is empty), and the
    pieces are joined again in their original order.

    Args:
        sequence (str):  The full sequence to be evolved
        rate:            Forwarded unchanged to evolveDomain
        branchLength:    Forwarded to evolveDomain, evolveLinker and
                         evolveEmptyLinker
        emissionsProbs:  matrix with dimensions (n x 20) where n is the length of
                         the domain. Each row contains the probability of each
                         aa appearing at that position (in pfam hmm order)
        hmmfile:         hmm file used to locate the domains; also forwarded
                         to evolveDomain
        transmat (list): aa transition matrix with dimensions (20 x 20)

    Returns:
        str: The reassembled sequence after evolution.

    Raises:
        Any exception raised while pairing fragments with domains is
        re-raised unchanged after diagnostic output has been printed.
    """
    #Kept only so the untouched input can be shown in the diagnostics below
    original_sequence = sequence

    #Find domains
    finder = findDomains if HMMER else findMotifs
    domains = finder(sequence, hmmfile)[2]

    #split on all domains
    for seq in domains:
        sequence = sequence.replace(seq, "xxx")
    sequences = sequence.split("xxx")

    #Evolve sequence fragments individually
    for i, domain in enumerate(domains):
        domains[i] = evolveDomain(domain, rate, branchLength, emissionProbs,
                                  transmat, hmmfile)

    for i, fragment in enumerate(sequences):
        if fragment == '':
            sequences[i] = evolveEmptyLinker(branchLength)
        else:
            sequences[i] = evolveLinker(fragment, branchLength)

    #Reassemble full sequence post evolution
    pieces = []
    try:
        for i in range(len(domains)):
            pieces.append(sequences[i] + domains[i])
    except Exception:
        print(original_sequence)
        print(domains)
        print(sequences)
        printDomSeq(original_sequence, hmmfile)
        #Re-raise the real error instead of replacing it with a blank
        #Exception, so the cause and traceback are not lost
        raise

    pieces.append(
        sequences[-1] if len(sequences) > len(domains) else domains[-1])

    return ''.join(pieces)
Ejemplo n.º 6
0
def findAndAlign(hmmfile, sequence):
    """
    Collects the domains of `sequence` that are exactly 23 characters long
    and contain no lowercase characters, together with their start and end
    positions as reported by findDomains.

    NOTE(review): as it stands in this file the function ends after counting
    the kept domains and returns nothing -- the remainder of the body appears
    to be missing.

    Args:
        hmmfile: hmm file handed to findDomains
        sequence: sequence to search for domains
    """
    possStarts, possEnds, possSequences = findDomains(sequence, hmmfile)

    starts, ends, sequences = [], [], []
    for idx, candidate in enumerate(possSequences):
        if len(candidate) != 23:
            continue
        if any(base.islower() for base in candidate):
            continue
        starts.append(possStarts[idx])
        ends.append(possEnds[idx])
        sequences.append(candidate)

    origCount = len(sequences)
Ejemplo n.º 7
0
def genRandomSequence2(numDoms):
    """
    Generates a zf-C2H2 protein sequence with the given number of domains.
    Picks 1 domain per sequence, one sequence per orthogroup

    numDoms sequences are drawn, each from a random file under DATAPATH
    whose name contains '.fa' (a random line out of every second line,
    starting with the second). A drawn sequence is only accepted when
    findDomains (with the module-level hmmfile) reports more than one domain
    in it. The prefix and suffix come from the first accepted sequence; from
    every accepted sequence the stretch from the start of its first domain
    to the start of its second domain is passed through fixzf and appended.
    The whole draw is repeated when the prefix or suffix is empty.

    Args:
        numDoms: Number of domains the generated sequence should contain

    Returns:
        The generated sequence with gaps, lowercase letters and the
        characters 'BJOUXZ' removed.
    """
    unwanted = set('-' + string.ascii_lowercase + 'BJOUXZ')
    files = [fname for fname in ls(DATAPATH) if '.fa' in fname]

    #Redraw iteratively so repeated rejections cannot hit the recursion limit
    while True:
        pool = []
        while len(pool) < numDoms:
            with open(DATAPATH + choice(files)) as handle:
                records = handle.readlines()[1::2]
            #Gaps are stripped before counting domains: the stripped string
            #is the one indexed with starts[1] below, so it is the one that
            #must be known to hold at least two domains
            candidate = choice(records).strip().replace('-', '')
            if len(findDomains(candidate, hmmfile)[0]) > 1:
                pool.append(candidate)

        starts, ends = findDomains(pool[0], hmmfile)[:2]
        prefix = pool[0][:starts[0]]
        # NOTE(review): other slices in this file use ends[i] + 1 (inclusive
        # ends); if that holds here the suffix starts on the last residue of
        # the final domain -- confirm whether ends[-1] + 1 was intended.
        suffix = pool[0][ends[-1]:]
        if prefix != '' and suffix != '':
            break

    #First domain plus the following linker of every pooled sequence
    pieces = [fixzf(pool[0][starts[0]:starts[1]])]
    for sequence in pool[1:]:
        starts = findDomains(sequence, hmmfile)[0]
        pieces.append(fixzf(sequence[starts[0]:starts[1]]))

    newSeq = prefix + ''.join(pieces) + suffix
    return ''.join(aa for aa in newSeq if aa not in unwanted)
Ejemplo n.º 8
0
def seqDiff(n=10, bl=1):
    """
    Prints how the first domain of a random one-domain sequence changes
    under repeated, independent runs of evolveSequence.

    The original domain is printed first. Then, for each successful run, one
    line is printed holding: the evolved domain with mutated positions
    wrapped in red ANSI escape codes, the number of mutated domain
    positions, the number of differing positions over the whole sequence,
    and the rounded value of totalMuts / nMuts / (len(evolved) / 23.).

    A run that raises (for example a division by zero when the domain did
    not mutate, or an index error when lengths differ) prints nothing and
    is retried, so only successful runs count towards n.

    Args:
        n: Number of successful runs to print
        bl: Branch length handed to evolveSequence
    """
    RED = '\033[91m'
    NORMAL = '\033[0m'

    seq = grs(1)
    if HMMER:
        dom = findDomains(seq, hmmfile)[2][0]
    else:
        dom = findMotifs(seq, hmmfile)[2][0]
    print(dom)

    iterations = 0
    while iterations < n:
        try:
            temp = evolveSequence(seq, .05, bl, emissionProbs, hmmfile,
                                  transmat)
            if HMMER:
                tempdom = findDomains(temp, hmmfile)[2][0]
            else:
                tempdom = findMotifs(temp, hmmfile)[2][0]

            out = ""
            nMuts = 0
            for i in range(len(dom)):
                if tempdom[i] == dom[i]:
                    out += tempdom[i]
                else:
                    out += RED + tempdom[i] + NORMAL
                    nMuts += 1

            totalMuts = 0
            for i in range(len(temp)):
                if temp[i] != seq[i]:
                    totalMuts += 1

            #The whole line is built before printing so a failing ratio
            #cannot leave a half-written line behind
            ratio = round(totalMuts / float(nMuts) / (len(temp) / 23.))
            print("%s %s %s %s" % (out, nMuts, totalMuts, ratio))
        except Exception:
            #Best effort: skip failed runs, but let KeyboardInterrupt and
            #SystemExit through so the loop can still be stopped
            continue
        iterations += 1
Ejemplo n.º 9
0
def withHost(numLeaves = 4, bl = .5, hostTree = None):
    """
    Simulates a guest tree inside a host tree, evolves a root sequence along
    both, and writes the trees and the resulting domain sequences to
    'host.nwk', 'guest.nwk' and 'sequences.fa'.

    Args:
        numLeaves: Handed to createRandomTopology when no host tree is given
        bl: Handed to createRandomTopology when no host tree is given
        hostTree: Optional existing host tree. When omitted a random one is
                  created and .05 is added to the dist of each node yielded
                  by iterating it.

    Returns:
        (hostTree, guestTree, names, seqs): the host tree, the guest tree
        re-rooted at the first child of the simulated guest root, the
        alphabetically sorted names of guest nodes whose event is not
        'LOSS', and the domain sequences of the host nodes taken in order of
        sorted host node name.
    """
    startingDomains = 1
    leafPadding = .05

    if hostTree is None:
        hostTree = createRandomTopology(numLeaves, bl, lambda x: x)
        for hostLeaf in hostTree:
            hostLeaf.dist += leafPadding

    constantOne = lambda x, y: 1
    guestTree, nodeMap = buildGuestTree(hostTree, s2, constantOne, .1,
                                        gaussNoise, startingDomains)

    for guestLeaf in guestTree:
        guestLeaf.dist += leafPadding

    rootSequence = gfs('60_emissions.fa', 40)

    evolveAlongTree(hostTree, guestTree, nodeMap, rootSequence, hmmfile,
                    emissionProbs, transmat)

    # Ordered by position first, then the bare names are ordered on their own
    surviving = sorted((leaf.position, leaf.name)
                       for leaf in guestTree if leaf.event != 'LOSS')
    names = sorted(pair[1] for pair in surviving)

    seqs = []
    for hostName in sorted(leaf.name for leaf in hostTree):
        finder = findDomains if HMMER else findMotifs
        seqs.extend(finder((hostTree&hostName).sequence, hmmfile)[2])

    for hostNode in hostTree.traverse():
        hostNode.del_feature('leaves')

    # Drop the simulated guest root and continue with its first child
    guestTree = guestTree.children[0]
    guestTree.up = None

    writeTree(hostTree, 'host.nwk')
    writeTree(guestTree, 'guest.nwk')
    writeFasta(names, seqs, 'sequences.fa')

    return hostTree, guestTree, names, seqs
Ejemplo n.º 10
0
def groupDomains(names, sequences, hmmfile, domLength=23):
    """
    Takes a list of input sequences from an msa and returns a list of domain strings for
    each. Leaves an empty string at position i of the jth list if the jth sequence does
    not have a copy of domain i. This aligns all existing domains and makes it clear
    which domains are present in which sequence

    Domains are matched across sequences by their start index: two domains
    land in the same column exactly when findDomains reports the same start
    for them.

    Example (domains marked as xxx):

    sequences = ["AAxxxAAxxxAAAAAAA",
                 "AAAAAAAxxxAAxxxAA",
                 "AAxxxAAAAAAAxxxAA"]


    grouped = [[dom1, dom2, ''  ],
               [''  , dom2, dom3],
               [dom1, ''  , dom3]]

    Args:
        names     (list): One name per sequence, used to build the domain names
        sequences (list): A list of sequences
        hmmfile   (str ): The name of the hmmfile containing the desired domain model
        domLength (int ): Number of characters taken from each domain start
                          (default 23)

    Returns:
        grouped   (list): A list of lists of all domain sequences from each domain
        domNames  (list): A list of lists of domain names for each domain in each
                          sequence, formed as <name>_<start>

        Both lists are empty when no sequences are given.
    """
    domStarts = [findDomains(seq, hmmfile)[0] for seq in sequences]

    #set().union() also works with no arguments, so empty input yields no
    #columns instead of raising
    allStarts = sorted(set().union(*domStarts))
    #Start index -> column number, looked up once per domain
    column = dict((start, col) for col, start in enumerate(allStarts))

    grouped = []
    domNames = []
    for i, seqStarts in enumerate(domStarts):
        domains = [''] * len(allStarts)
        dnames = [''] * len(allStarts)
        for start in seqStarts:
            col = column[start]
            domains[col] = sequences[i][start:start + domLength]
            dnames[col] = names[i] + "_" + str(start)
        grouped.append(domains)
        domNames.append(dnames)
    return grouped, domNames
Ejemplo n.º 11
0
def selfSimilarity(name, sequence, hmmfile, heatmap=False):
    """
    Given a single sequence, checks the level of self similarity between
    its constituent domains. Optionally creates a heatmap of this similarity

    Args:
        name     (str ): Label for the sequence; not used by the current code
        sequence (str ): An amino acid string representing a protein
        hmmfile  (str ): File path of the hmm used to find domains
        heatmap  (bool): (optional, default False) If true, displays a heatmap
                         of self similarity between domains on the sequence

    Output:
        simMatrix (numpy.ndarray): Square symmetric matrix whose entry (i, j)
                         is domainSim of domains i and j. When there is more
                         than one domain the matrix is min-max rescaled to
                         [0, 1]; if all entries are equal it is all zeros.
    """
    domains = findDomains(sequence, hmmfile)[2]
    numDomains = len(domains)
    simMatrix = np.zeros((numDomains, numDomains))

    #Similarity is filled in for the upper triangle and mirrored
    for i in range(numDomains):
        for j in range(i, numDomains):
            simMatrix[i, j] = simMatrix[j, i] = domainSim(domains[i],
                                                          domains[j])

    #Normalization step
    if numDomains > 1:
        bias = np.min(simMatrix)
        scale = np.max(simMatrix) - bias
        simMatrix -= bias
        #A constant matrix has zero range; dividing would turn every entry
        #into NaN, so it is left at zero instead
        if scale != 0:
            simMatrix /= scale

    if heatmap:
        sns.heatmap(simMatrix, cmap='viridis')
        plt.show()
        plt.close()

    return simMatrix
Ejemplo n.º 12
0
def domainEvolution(host, guest, hmmfile, nodemap, sequence):
    """
    Evolves the domains and linkers of `sequence` along a host tree while
    applying the events of the guest-tree nodes mapped onto each host node.

    Host nodes are visited in host.traverse() order. The host root starts
    from the domains found in `sequence`; every other host node starts from
    a copy of its parent's stored state with linkers and domains evolved
    over the parent-to-node distance. For each host node the guest nodes
    mapped to it are then processed in order of branch distance and their
    DUPLICATION / LOSS / SPECIATION events are applied to the working lists
    of domain sequences, starts, ends and linkers. Progress is printed
    throughout.

    Args:
        host: Host tree; must offer traverse(), is_root(), is_leaf(),
              add_feature(), get_distance() and get_tree_root()
              (presumably an ete Tree -- confirm)
        guest: Guest tree whose nodes carry `event`, `name`, `dist`,
               `children` and `up`
        hmmfile: hmm file handed to findDomains and mutateDomain
        nodemap: Mapping from host node to the guest nodes mapped onto it
        sequence: Root sequence in which the initial domains are searched

    Returns:
        orthogroup (list): final sequences of host leaves that are longer
                           than 10 characters
        bookkeeping (dict): host leaf name -> [sequences, starts, ends]
        internalNodes (dict): guest node name -> reconstructed sequence
        guestData (dict): guest node name -> [sequence, start, end] taken
                          at the current index
    """
    possStarts, possEnds, possSequences = findDomains(sequence, hmmfile)
    starts = []
    ends = []
    sequences = []
    positions = {}
    # Keep only hits that are exactly 23 characters long and contain no
    # lowercase characters
    for i in range(len(possSequences)):
        if len(possSequences[i]) == 23 and any(
                base.islower() for base in possSequences[i]) == False:
            # if len(possSequences[i]) == 23 and any(base.islower() for base in possSequences[i]) == False:
            starts.append(possStarts[i])
            ends.append(possEnds[i])
            sequences.append(possSequences[i])

    origCount = len(sequences)
    print("ORIG: " + str(origCount))
    linkerStarts, linkerEnds, linkerSequences = findLinkers(
        starts, ends, sequence)

    # Result containers (see Returns in the docstring)
    orthogroup = []
    bookkeeping = {}
    internalNodes = {}

    guestData = {}
    print("*******************************************")
    print(host.get_tree_root())
    for node in host.traverse():
        print("HOST NODE: " + node.name)
        print("HOST NODE CHILDREN: " + str(node.children))

        # find the root sequence
        mapped_guest = nodemap[node]
        print("NODEMAP: " + str(mapped_guest))
        guestRoots = []

        # Guest roots for this host node: the guest tree root at the host
        # root, otherwise every mapped guest node whose parent is not
        # mapped to this host node
        if node.is_root():
            guestRoots.append(guest.get_tree_root())
        else:
            for guestNode in mapped_guest:

                if guestNode.up not in mapped_guest:
                    guestRoots.append(guestNode)

        print("GUEST ROOTS: " + str(guestRoots))

        # # map guestTree node to domains
        if node.is_root():
            node.add_feature('sequences', sequences[:])
            node.add_feature('starts', starts[:])
            node.add_feature('ends', ends[:])
            node.add_feature('positions', dict(positions))
            node.add_feature('linkerStarts', linkerStarts[:])
            node.add_feature('linkerEnds', linkerEnds[:])
            node.add_feature('linkerSequences', linkerSequences[:])
            # Each guest root receives a single domain (the i-th element),
            # not a list
            for i in range(len(guestRoots)):
                guestRoots[i].add_feature('sequences', sequences[:][i])
                guestRoots[i].add_feature('starts', starts[:][i])
                guestRoots[i].add_feature('ends', ends[:][i])
        else:
            # Start from copies of the parent's state, evolved over the
            # distance between parent and this node
            betweenNodeLinkers = node.up.linkerSequences[:]
            betweenNodeSequences = node.up.sequences[:]
            distance = node.up.get_distance(node)
            # sequence level linker evolution
            for i in range(len(betweenNodeLinkers)):
                betweenNodeLinkers[i] = evolveLinker(betweenNodeLinkers[i],
                                                     distance)

            # domain level evolution
            for i in range(len(betweenNodeSequences)):
                betweenNodeSequences[i] = mutateDomain(betweenNodeSequences[i],
                                                       hmmfile, distance)
            node.add_feature('sequences', betweenNodeSequences[:])
            node.add_feature('starts', node.up.starts[:])
            node.add_feature('ends', node.up.ends[:])
            node.add_feature('positions', dict(node.up.positions))
            node.add_feature('linkerStarts', node.up.linkerStarts[:])
            node.add_feature('linkerEnds', node.up.linkerEnds[:])
            node.add_feature('linkerSequences', betweenNodeLinkers[:])

        # Working state for this host node; these names refer to the same
        # objects stored on the node until they are rebound further down
        current_sequences = node.sequences
        current_starts = node.starts
        current_ends = node.ends
        current_positions = node.positions
        current_linkerStarts = node.linkerStarts
        current_linkerEnds = node.linkerEnds
        current_linkerSequences = node.linkerSequences

        print("INPUT POSITIONS: " + str(current_positions))

        print("INPUT: " + str(sequences))

        print("MAPPING: " + str(mapped_guest))

        # initialize positions
        pos_init = 0

        # NOTE(review): `extra` is never read afterwards
        extra = dict({})

        print("-------------------------------------------")
        for root in guestRoots:
            print("CURRENTLY EXAMINING GUEST ROOT: " + root.name)
            # for a subtree by traversing from a root node,
            # see if node has chilren that are in the list,
            # cut off when it doesn't to form subtree

            # NOTE(review): `subtree` is pruned here but not referenced
            # again below; the nodes of a deep copy are presumably never
            # members of mapped_guest, so every descendant would be
            # detached -- confirm intent
            subtree = root.copy("deepcopy")
            # print(subtree.write(format=8))
            for newbie in subtree.iter_descendants():
                if newbie not in mapped_guest:
                    newbie.detach()
            # print(subtree.write(format=8))
            # initialize distances
            # `distances` holds [guest node, remaining distance] pairs still
            # waiting to be processed
            distances = []
            distances.append([root, 0])
            closestNode = distances[0][0]
            closestDistance = distances[0][1]
            index = 0

            # obtain domain information (to be updated later)
            if node.is_root():
                root_sequences = root.sequences
                root_starts = root.starts
                root_ends = root.ends
                # index = guestRoots.index(root)
            else:
                # Move the position entry of the guest root's parent (matched
                # by name) over to the guest root itself.
                # NOTE(review): the dict is modified while it is being
                # iterated -- confirm this is safe on the interpreter in use
                for i in current_positions:
                    if i.name == root.up.name:
                        index = current_positions[i]
                        del current_positions[i]
                        current_positions[root] = index
                # index = current_positions[root.up]
                # if root.up in current_positions:
                #     del current_positions[root.up]
                root_sequences = current_sequences[index]
                root_starts = current_starts[index]
                root_ends = current_ends[index]

            if pos_init == 0 and node.is_root():
                current_positions[root] = 0
                pos_init = 1

            # root_sequences is a single domain string here, so this is the
            # domain's length
            length = len(root_sequences)

            # NOTE(review): `count` is never read afterwards
            count = 1

            # iterate by minimum distance
            while True:
                print("EVENT TITLE: " + closestNode.event)
                # print("CURRENT POSITIONS: " + str(current_positions))
                # print(index)
                # check event node, update positions list
                if closestNode.event == "DUPLICATION":
                    # Shift every position after the duplicated one
                    for position in current_positions:
                        if current_positions[position] > index:
                            current_positions[position] += 1
                    # NOTE(review): is_root is referenced without being
                    # called, so this condition is always truthy and the
                    # else branch below never runs -- confirm whether
                    # node.is_root() and root.is_root() was intended
                    if node.is_root and root.is_root:
                        oldPosition = current_positions[closestNode]
                        del current_positions[closestNode]
                    else:
                        oldPosition = current_positions[closestNode.up]
                        del current_positions[closestNode.up]
                    current_positions[closestNode.children[0]] = oldPosition
                    current_positions[
                        closestNode.children[1]] = oldPosition + 1
                    linkerLength = 0
                    if index == 0:
                        linkerLength = 5
                    else:
                        # NOTE(review): this assigns `linkedLength`, which is
                        # never read; `linkerLength` stays 0 on this path --
                        # looks like a typo, confirm
                        linkedLength = len(current_linkerSequences[index])

                    # Append the copy's coordinates and sequence, offset by
                    # linker length plus domain length
                    current_starts.append(root_starts + linkerLength + length)
                    current_ends.append(root_ends + linkerLength + length)
                    current_sequences.append(root_sequences)
                    current_linkerStarts.append(current_linkerStarts[index] +
                                                linkerLength + length)
                    current_linkerEnds.append(current_linkerEnds[index] +
                                              linkerLength + length)
                    current_linkerSequences.append(
                        current_linkerSequences[index])

                elif closestNode.event == "LOSS":
                    # Drop the domain and linker at `index` and close the gap
                    # in the position numbering
                    current_starts.pop(index)
                    current_ends.pop(index)
                    current_sequences.pop(index)
                    current_linkerStarts.pop(index)
                    current_linkerEnds.pop(index)
                    current_linkerSequences.pop(index)
                    for position in current_positions:
                        if current_positions[position] > index:
                            current_positions[position] -= 1
                    # NOTE(review): indexing keys() requires it to be a list
                    # and picks the entry by dict iteration order, not by
                    # stored position value -- confirm
                    del current_positions[current_positions.keys()[index]]
                elif closestNode.event == "SPECIATION":
                    closestNodeSearch = guest & closestNode.name
                    if closestNodeSearch.up in current_positions:
                        del current_positions[closestNodeSearch.up]
                        current_positions[closestNode] = index

                # # sort to maintain original order in case of duplication
                # From here on the current_* names for sequences, positions
                # and linker sequences are rebound to new objects
                current_sequences = sortBy(current_sequences, current_starts)
                current_starts.sort()
                current_ends.sort()
                # NOTE(review): the sorted order only survives if dict keeps
                # insertion order on the interpreter in use -- confirm
                current_positions = dict(
                    sorted(current_positions.items(), key=lambda x: x[1]))
                current_linkerSequences = sortBy(current_linkerSequences,
                                                 current_linkerStarts)
                current_linkerStarts.sort()
                current_linkerEnds.sort()

                print("POSITIONS: " + str(current_positions))
                print("STARTS: " + str(current_starts))
                print("ENDS: " + str(current_ends))

                print("CLOSEST NAME AND DISTANCE:")
                print(closestNode.name)
                print(closestDistance)
                # Name of the guest node just processed; used as the key for
                # internalNodes further down
                guestNode = closestNode.name
                guestData[closestNode.name] = [
                    current_sequences[index], current_starts[index],
                    current_ends[index]
                ]

                print("PRE-DELETION DISTANCES:")
                print(distances)

                del distances[distances.index([closestNode, closestDistance])]
                print("POST-DELETION DISTANCES:")
                print(distances)
                closestChildren = closestNode.children
                print("CLOSEST CHILDREN:")
                print(closestChildren)

                # subtract distance to closest from every remaining considered gene
                for gene in distances:
                    gene[1] -= closestDistance
                # add the closest's childrento the list
                for child in closestChildren:
                    # print(mapped_guest)
                    if child in mapped_guest:
                        distances.append([child, child.dist])
                print("UNSORTED DISTANCES: " + str(distances))
                sortedDistances = sorted(distances, key=lambda x: x[1])
                print("SORTED DISTANCES: " + str(sortedDistances))
                if len(sortedDistances) > 0:
                    # Advance to the nearest pending guest node and evolve
                    # all linkers and domains over that distance
                    closestNode = sortedDistances[0][0]
                    closestDistance = sortedDistances[0][1]
                    # sequence level linker evolution
                    for i in range(len(current_linkerSequences)):
                        current_linkerSequences[i] = evolveLinker(
                            current_linkerSequences[i], closestDistance)

                    # domain level evolution
                    for i in range(len(current_sequences)):
                        # NOTE(review): `old` is never read afterwards
                        old = current_sequences[i]
                        current_sequences[i] = mutateDomain(
                            current_sequences[i], hmmfile, closestDistance)

                    distances = sortedDistances
                    sequence = reconstructSequence(current_starts,
                                                   current_ends,
                                                   current_sequences,
                                                   current_linkerStarts,
                                                   current_linkerEnds,
                                                   current_linkerSequences)
                    internalNodes[guestNode] = sequence
                    guestData[closestNode.name] = [
                        current_sequences[index], current_starts[index],
                        current_ends[index]
                    ]

                    print("-------------------------------------------")
                    current_positions[closestNode] = index + 1
                    index += 1
                    if closestNode not in mapped_guest:
                        break
                else:
                    # Nothing left to process under this guest root: record
                    # the final state and stop
                    distances = sortedDistances
                    sequence = reconstructSequence(current_starts,
                                                   current_ends,
                                                   current_sequences,
                                                   current_linkerStarts,
                                                   current_linkerEnds,
                                                   current_linkerSequences)
                    internalNodes[guestNode] = sequence
                    guestData[closestNode.name] = [
                        current_sequences[index], current_starts[index],
                        current_ends[index]
                    ]
                    print("-------------------------------------------")
                    break

            print("Finished examining guest root: " + root.name)

            # print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")

        # Store the updated state on the host node so its children can start
        # from it (linker features are not re-stored here)
        node.add_feature("positions", current_positions)
        node.add_feature("sequences", current_sequences)
        node.add_feature("starts", current_starts)
        node.add_feature("ends", current_ends)

        if node.is_leaf() == 1:
            bookkeeping[node.name] = [
                current_sequences, current_starts, current_ends
            ]
        print("*******************************************")
        print("ENDING sequences: " + str(current_sequences))
        finalSequence = reconstructSequence(current_starts, current_ends,
                                            current_sequences,
                                            current_linkerStarts,
                                            current_linkerEnds,
                                            current_linkerSequences)
        node.add_feature("final", finalSequence)
        # NOTE(review): a bare `print` emits an empty line only when print is
        # a statement; as a function reference it does nothing
        print
        print

        # Only leaf sequences longer than 10 characters are reported
        if node.is_leaf() == True:
            if len(finalSequence) > 10:
                orthogroup.append(finalSequence)
            # orthogroup[node.name] = finalSequence
        # print(orthogroup)

    return orthogroup, bookkeeping, internalNodes, guestData