Example #1
0
def CyclicWithRC():
    """Build a cyclic superstring from the reads and export it.

    The reads are turned into a graph with ``p.DeBruijnRC``; a walk of
    ``len(graph) // 2`` nodes is taken from a randomly chosen start node,
    the visited nodes are merged with ``p.Combine``, and the longest
    prefix that also appears as a suffix is trimmed off before the result
    is written to 'rosalind_gasm_output.txt'.  If the merged string has no
    prefix/suffix overlap at all, nothing is written.
    """
    reads = f.LoadFile('\\rosalind_gasm.txt').splitlines()
    graph = p.DeBruijnRC(reads)

    # Start the walk from an arbitrary node.
    current = random.choice(list(graph.keys()))
    path = []

    # Follow the first outgoing edge of each node for half the graph size.
    for _ in range(len(graph) // 2):
        path.append(current)
        successor = graph[current][0]
        if successor not in graph:
            # The successor is not a node itself: fall back to the node
            # MaxOverlap selects for it.
            successor = MaxOverlap(successor, graph)
        current = successor

    # Fold the walk into a single string, merging on overlaps.
    merged = path[0]
    for piece in path[1:]:
        merged = p.Combine(merged, piece)

    # The string is cyclic, so drop the tail that repeats the head.
    total = len(merged)
    for overlap in range(total - 1, 0, -1):
        head = merged[:overlap]
        tail = merged[total - overlap:]
        if head == tail:
            f.ExportToFile('rosalind_gasm_output.txt',
                           merged[:total - overlap])
            return
Example #2
0
def TT():
    """Export the transition/transversion ratio of two DNA strings.

    The two strings are the first two FASTA records of the input file and
    are compared position by position over the length of the first one.
    Transitions: A<->G, C<->T.  Every other mismatch is a transversion.
    Raises ZeroDivisionError when there are no transversions.
    """
    fasta_text = f.LoadFile('\\rosalind_tran.txt')
    [Labels, DNA] = f.FASTA(fasta_text)
    first, second = DNA[0], DNA[1]

    purines, pyrimidines = 'AG', 'CT'
    n_transitions = 0
    n_transversions = 0
    for pos, base_a in enumerate(first):
        base_b = second[pos]
        if base_a == base_b:
            continue
        # A mismatch inside one chemical class is a transition.
        both_purines = base_a in purines and base_b in purines
        both_pyrimidines = base_a in pyrimidines and base_b in pyrimidines
        if both_purines or both_pyrimidines:
            n_transitions += 1
        else:
            n_transversions += 1

    f.ExportToFile('rosalind_tran_output.txt',
                   str(n_transitions / n_transversions))
Example #3
0
def Lex():
    """Export every string of length n over the given alphabet.

    The input file holds the alphabet symbols (whitespace separated) on the
    first line and the positive integer n on the second.  The strings are
    written one per line to 'rosalind_lexf_output.txt', ordered
    lexicographically with respect to the order in which the symbols are
    listed in the input.

    The previous hand-rolled construction ordered the first position with
    ``sorted()`` but every later position by the input order, which gave an
    inconsistent ordering whenever the alphabet was not already sorted.
    ``itertools.product`` applies the same (input) order to every position.
    """
    from itertools import product

    lines = f.LoadFile('\\rosalind_lexf.txt').splitlines()
    alphabet = lines[0].split()
    length = int(lines[1])

    # product() varies the right-most position fastest, which is exactly
    # lexicographic order for the alphabet as listed.
    words = [''.join(combo) for combo in product(alphabet, repeat=length)]

    f.ExportToFile('rosalind_lexf_output.txt', '\n'.join(words))
    return
Example #4
0
def UnrootedTree():
    """Export the internal-node count of an unrooted binary tree.

    Reads the leaf count n from the input file and writes n - 2
    (n leaves --> n - 2 internal nodes) to 'rosalind_inod_output.txt'.
    """
    leaf_count = int(f.LoadFile('\\rosalind_inod.txt'))
    internal_nodes = leaf_count - 2
    f.ExportToFile('rosalind_inod_output.txt', str(internal_nodes))
Example #5
0
def Founder():
    """Export a matrix of log10 probabilities computed by DriftToNone.

    The first input line holds N and m, the second an array A.  Row i
    (1 <= i <= m) and column j of the output hold
    log10(DriftToNone(N, i, A[j])): the log-probability that after i
    generations no copies of the recessive allele remain.  Rows are written
    one per line, values separated by spaces.
    """
    lines = f.LoadFile('\\rosalind_foun.txt').splitlines()
    N, m = (int(token) for token in lines[0].split())
    A = [int(token) for token in lines[1].split()]

    # One output row per generation, one column per initial copy count.
    rows = []
    for generation in range(1, m + 1):
        cells = [str(log10(DriftToNone(N, generation, copies)))
                 for copies in A]
        rows.append(' '.join(cells))

    f.ExportToFile('rosalind_foun_output.txt', '\n'.join(rows))
Example #6
0
def DeBruijnRC():
    """Export the adjacency list of the reads and their reverse complements.

    Every string in the de-duplicated union of the reads and their reverse
    complements contributes one edge from its prefix (all but the last
    character) to its suffix (all but the first).  Edges are written as
    '(prefix, suffix)', one per line.
    """
    reads = f.LoadFile('\\rosalind_dbru.txt').splitlines()

    # Reverse complements first, then the reads themselves, without repeats.
    pool = RemoveDuplicates([ReverseComplement(read) for read in reads] + reads)

    # Group suffixes under their prefix; keys appear in first-seen order.
    edges = {}
    for kmer in pool:
        edges.setdefault(kmer[:-1], []).append(kmer[1:])

    lines = ['(%s, %s)' % (head, tail)
             for head, tails in edges.items()
             for tail in tails]
    f.ExportToFile('rosalind_dbru_output.txt', '\n'.join(lines))
Example #7
0
def OverlapGraph():
    """Export the overlap graph (overlap length 3) of FASTA records.

    An edge 'source target' is written, by label, whenever the last three
    characters of one sequence equal the first three of a different
    sequence.  Labels are looked up through the first position at which a
    sequence occurs in the record list.
    """
    fasta_text = f.LoadFile('\\rosalind_grph.txt')
    [Labels, DNA] = f.FASTA(fasta_text)

    # Sequence -> sequences it runs into; sources without targets are dropped.
    adjacency = {}
    for source in dict.fromkeys(DNA):
        targets = [candidate for candidate in DNA
                   if candidate != source              # no self-edges
                   and source[-3:] == candidate[:3]]   # overlap by 3
        if targets != []:
            adjacency[source] = targets

    def label_of(sequence):
        # Label of the first record carrying this sequence.
        return Labels[DNA.index(sequence)]

    # Same graph, keyed by record labels instead of sequences.
    named = {}
    for source, targets in adjacency.items():
        target_labels = [label_of(target) for target in targets]
        named[label_of(source)] = target_labels

    lines = [' '.join([source_label, target_label])
             for source_label, target_labels in named.items()
             for target_label in target_labels]
    f.ExportToFile('rosalind_grph_output.txt', '\n'.join(lines))
Example #8
0
def CompareSpectra():
    """Export the spectral convolution peak of two spectra.

    The input file holds two whitespace-separated lists of masses, one per
    line.  Writes to 'rosalind_conv_output.txt':
      1. the largest multiplicity of set1 (-) set2, i.e. how often the most
         frequent pairwise difference (rounded to 5 decimals) occurs;
      2. abs(x) for the difference x that attains that multiplicity.

    Raises:
        ValueError: if either spectrum is empty, so there is no difference
            to report.
    """
    from collections import Counter

    lines = f.LoadFile('\\rosalind_conv.txt').splitlines()
    first_spec = [float(x) for x in lines[0].split()]
    second_spec = [float(x) for x in lines[1].split()]

    # Tally every pairwise difference in one pass; rounding makes float
    # differences that should be equal compare equal.
    tally = Counter(round(a - b, 5) for a in first_spec for b in second_spec)
    if not tally:
        raise ValueError('both spectra must contain at least one mass')

    shift, multiplicity = tally.most_common(1)[0]

    # The contract asks for the absolute value of the maximizing shift.
    f.ExportToFile('rosalind_conv_output.txt',
                   '\n'.join([str(multiplicity), str(abs(shift))]))
    return
Example #9
0
def Drift():
    """Export a Wright-Fisher drift probability.

    Predicts the probability that in a population of N diploid individuals
    initially possessing m copies of a dominant allele, we will observe
    after g generations at least k copies of a recessive allele.  The input
    file holds N, m, g and k separated by whitespace; the probability is
    written to 'rosalind_wfmd_output.txt'.
    """
    tokens = f.LoadFile('\\rosalind_wfmd.txt').split()
    N = int(tokens[0]) * 2  # number of allele copies (two per individual)
    m = int(tokens[1])  # initial number of copies of the dominant allele
    g = int(tokens[2])  # number of generations
    k = int(tokens[3])  # minimum number of recessive copies

    size = N + 1

    # curr_gen[c] = probability of having exactly c dominant copies.
    # Generation 0: we know for certain there are m copies.
    curr_gen = [0] * size
    curr_gen[m] = 1

    if g > 0:
        # The transition probabilities do not depend on the generation, so
        # compute them once: transition[i][j] = P(i copies | j copies before)
        coefficients = [nCr(N, i) for i in range(size)]
        transition = [
            [coefficients[i] * (j / N)**i * (1 - (j / N))**(N - i)
             for j in range(size)]
            for i in range(size)
        ]

        for _ in range(g):
            next_gen = [0] * size
            for i in range(size):  # ending point
                row = transition[i]
                for j in range(size):  # starting point
                    next_gen[i] += row[j] * curr_gen[j]
            curr_gen = next_gen

    # "At least k recessive" means at most N - k dominant copies.  An
    # explicit bound is used because curr_gen[:-k] is empty for k == 0,
    # which would report 0 instead of 1.
    dominant_limit = max(N - k + 1, 0)
    prob = str(sum(curr_gen[:dominant_limit]))
    f.ExportToFile('rosalind_wfmd_output.txt', prob)
    return
Example #10
0
def Subsets():
    """Export the number of subsets of {1, ..., n} modulo 1000000.

    Reads the positive integer n from the input file and writes
    2**n % 1000000 to 'rosalind_sset_output.txt'.
    """
    size = int(f.LoadFile('\\rosalind_sset.txt'))
    subset_count = (2 ** size) % 1000000
    f.ExportToFile('rosalind_sset_output.txt', str(subset_count))
Example #11
0
def MatchSpectrum():
    """Export the best spectrum match among n protein strings.

    Given:
    1) A positive integer n
    2) n protein strings
    3) A multiset corresponding to the complete spectrum of some
    unknown protein string...
    ... writes the maximum multiplicity, and the string where this occurs,
    to 'rosalind_prsm_output.txt' (one per line).

    NOTE(review): assumes CompareSpectra(masses, spectrum) returns the
    multiplicity for one protein; confirm against its definition.
    """
    lines = f.LoadFile('\\rosalind_prsm.txt').splitlines()
    n = int(lines[0])
    proteins = lines[1:n + 1]
    # The spectrum starts right after the last protein.  Blank lines are
    # skipped so the first mass is kept whether or not the file separates
    # the two sections with an empty line.
    spectrum = [float(x) for x in lines[n + 1:] if x.strip()]

    # Score every protein's masses against the spectrum.
    modes = [CompareSpectra(GetMasses(protein), spectrum)
             for protein in proteins]

    # Pick the protein with the highest score.
    max_mode = max(modes)
    max_protein = proteins[modes.index(max_mode)]

    # Report the multiplicity itself, not the position of the protein.
    f.ExportToFile('rosalind_prsm_output.txt',
                   '\n'.join([str(max_mode), max_protein]))
    return
Example #12
0
def DistanceMatrix():
    """Export the pairwise distance matrix of the FASTA records.

    Entry (i, j) is HammingDistance(DNA[i], DNA[j]) divided by the length
    of the first sequence.  Rows are written one per line with
    space-separated values.
    """
    fasta_text = f.LoadFile('\\rosalind_pdst.txt')
    [Label, DNA] = f.FASTA(fasta_text)

    rows = []
    for seq_a in DNA:
        # Normalise each Hamming distance by the sequence length.
        cells = [str(HammingDistance(seq_a, seq_b) / len(DNA[0]))
                 for seq_b in DNA]
        rows.append(' '.join(cells))

    f.ExportToFile('rosalind_pdst_output.txt', '\n'.join(rows))
Example #13
0
def NewickDistanceWeights():
    """Export the distance between a pair of nodes for each Newick tree.

    Lines containing ';' are trees; every other non-empty line is a node
    pair.  The i-th tree is matched with the i-th pair, and the distances
    are written space-separated to 'rosalind_nkew_output.txt'.
    """
    lines = f.LoadFile('\\rosalind_nkew.txt').splitlines()

    # Split the file into tree lines and node-pair lines.
    Trees = [line for line in lines if ';' in line]
    Pairs = [line.split() for line in lines
             if ';' not in line and line != '']

    distances = []
    for idx, newick in enumerate(Trees):
        tree = Phylo.read(io.StringIO(newick), 'newick')
        # If the trees carry no edge weights, set branch_length = 1 on every
        # clade from tree.find_clades() here before measuring.
        node_a = Pairs[idx][0]
        node_b = Pairs[idx][1]
        distances.append(str(tree.distance(node_a, node_b)))

    f.ExportToFile('rosalind_nkew_output.txt', ' '.join(distances))
Example #14
0
def Sets():
    """Export six set operations on A and B, subsets of {1, ..., n}.

    The input file holds n on the first line and the sets A and B, written
    as '{a, b, ...}', on the next two.  The following are written, one per
    line, in the same brace notation:
    1. A U B
    2. A intersection B
    3. A - B
    4. B - A
    5. Ac (complement of A in {1, ..., n})
    6. Bc (complement of B in {1, ..., n})
    """
    lines = f.LoadFile('\\rosalind_seto.txt').splitlines()
    n = int(lines[0])
    A = lines[1].replace('{', '').replace('}', '').split(', ')
    B = lines[2].replace('{', '').replace('}', '').split(', ')

    # Hash-based lookups keep the membership tests cheap; the lists still
    # drive the iteration so the output order is unchanged.
    in_A = set(A)
    in_B = set(B)

    AB_union = RemoveDuplicates(A + B)  # either A or B (or both)
    AB_intersect = [x for x in A if x in in_B]  # both A & B
    AB_diff = [x for x in A if x not in in_B]  # A not B
    BA_diff = [x for x in B if x not in in_A]  # B not A

    universe = [str(i) for i in range(1, n + 1)]  # for set complements
    A_comp = [x for x in universe if x not in in_A]  # U not A
    B_comp = [x for x in universe if x not in in_B]  # U not B

    results = [AB_union, AB_intersect, AB_diff, BA_diff, A_comp, B_comp]
    output = ''.join('{%s}\n' % ', '.join(group) for group in results)
    f.ExportToFile('rosalind_seto_output.txt', output)
    return
Example #15
0
def ErrorCorrection():
    """Export corrections for the erroneous reads in a FASTA file.

    A read is treated as correct when Freq(read, DNA) exceeds 1.  Correct
    reads and their reverse complements form the reference set; every read
    outside that set is written as 'read->match', where match is the
    reference string chosen by MinimumDistance.
    """
    fasta_text = f.LoadFile('\\rosalind_corr.txt')
    [Labels, DNA] = f.FASTA(fasta_text)

    # Reads seen more than once are trusted.
    trusted = [read for read in DNA if Freq(read, DNA) > 1]

    # Extend the trusted set with reverse complements, without repeats.
    expanded = []
    for read in trusted:
        expanded += [read, ReverseComplement(read)]
    trusted = RemoveDuplicates(expanded)

    # Map every untrusted read to its closest trusted string.
    fixes = ['%s->%s' % (read, MinimumDistance(read, trusted))
             for read in DNA
             if read not in trusted]
    f.ExportToFile('rosalind_corr_output.txt', '\n'.join(fixes))
Example #16
0
def InterleavingMotifs():
    """Export the string InterpretMatrixSCS derives for two input strings.

    The input file must contain exactly two lines.  A matrix is built with
    MakeMatrixSCS and decoded with InterpretMatrixSCS; the result
    (presumably a shortest common supersequence, judging by the helper
    names; confirm) is written to 'rosalind_scsp_output.txt'.
    """
    first, second = f.LoadFile('\\rosalind_scsp.txt').splitlines()
    len_first, len_second = len(first), len(second)

    table = MakeMatrixSCS([], len_first, len_second, first, second)
    result = InterpretMatrixSCS(table, len_first, len_second, first, second)

    f.ExportToFile('rosalind_scsp_output.txt', result)
Example #17
0
def CompletingaTree():
    """Export how many edges must be added to turn a forest into a tree.

    The input file holds the node count n on the first line followed by one
    edge per line of an acyclic graph.  A tree on n nodes has n - 1 edges,
    so the answer written is n - edges - 1.
    """
    lines = f.LoadFile('\\rosalind_tree.txt').splitlines()
    node_count = int(lines[0])
    edge_count = len(lines) - 1  # every line after the first is an edge
    missing = node_count - edge_count - 1
    f.ExportToFile('rosalind_tree_output.txt', str(missing))
Example #18
0
def HammingDistance():
    """Export the Hamming distance between the first two input lines.

    Positions are compared over the length of the first string; the count
    of mismatches is written to 'rosalind_hamm_output.txt'.
    """
    lines = f.LoadFile('\\rosalind_hamm.txt').splitlines()
    first, second = lines[0], lines[1]
    mismatches = sum(1 for pos, symbol in enumerate(first)
                     if symbol != second[pos])
    f.ExportToFile('rosalind_hamm_output.txt', str(mismatches))
Example #19
0
def ProteinTomRNA():
    """Export the number of RNA strings that translate to the protein.

    Multiplies the mRNA_dict count of every residue of the protein, plus
    one 'X' appended for the stop codon, and writes the product modulo
    1000000.  Characters missing from mRNA_dict are ignored.
    """
    residues = f.LoadFile('\\rosalind_mrna.txt') + 'X'  # 'X' = stop codon
    total = 1
    for residue in residues:
        if residue not in mRNA_dict:
            continue
        total *= mRNA_dict[residue]
    f.ExportToFile('rosalind_mrna_output.txt', str(total % 1000000))
Example #20
0
def ExpectedVal():
    """Export the expected allele counts for the next generation.

    The input file holds a positive integer n on the first line and an
    array P of probabilities on the second.  Each output value is
    P[i] * n rounded to 4 decimals; values are space separated.
    """
    lines = f.LoadFile('\\rosalind_ebin.txt').splitlines()
    n = int(lines[0])
    expected = []
    for token in lines[1].split():
        probability = float(token)
        expected.append(str(round(probability * n, 4)))
    f.ExportToFile('rosalind_ebin_output.txt', ' '.join(expected))
Example #21
0
def Spectrum():
    """Export the protein string inferred from a prefix spectrum.

    The masses (one per input line) are sorted, the difference between
    each pair of neighbours is rounded to 4 decimals and looked up in
    inv_massdict, and the residues are written in order of increasing mass.
    """
    lines = f.LoadFile('\\rosalind_spec.txt').splitlines()
    descending = sorted((float(x) for x in lines), reverse=True)

    # Gaps between neighbouring masses, heaviest pair first.
    gaps = [round(heavier - lighter, 4)
            for heavier, lighter in zip(descending, descending[1:])]
    residues = [inv_massdict[gap] for gap in gaps]

    # Residues were collected from the heavy end; emit them light-to-heavy.
    f.ExportToFile('rosalind_spec_output.txt', ''.join(reversed(residues)))
Example #22
0
def IndependentAlleles():
    """Export a binomial tail probability for k generations.

    Reads k and N from the input file and writes the probability of at
    least N successes among 2**k independent trials, each succeeding with
    probability 0.25 (formula for Mendel's 2nd Law).
    """
    tokens = f.LoadFile('\\rosalind_lia.txt').split()
    k, N = int(tokens[0]), int(tokens[1])

    population = 2**k
    tail = 0
    hits = N
    while hits <= population:
        misses = population - hits
        tail += nCr(population, hits) * (0.25**hits) * (0.75**misses)
        hits += 1

    f.ExportToFile('rosalind_lia_output.txt', str(tail))
Example #23
0
def GlobalAlignment():
    """Export the value MakeMatrixGlobal computes for two FASTA strings.

    The input must contain exactly two FASTA records; the result (the
    maximum alignment score, per MakeMatrixGlobal) is written to
    'rosalind_glob_output.txt'.
    """
    fasta_text = f.LoadFile('\\rosalind_glob.txt')
    Labels, (first, second) = f.FASTA(fasta_text)

    score = MakeMatrixGlobal([], len(first), len(second), first, second)
    f.ExportToFile('rosalind_glob_output.txt', str(score))
Example #24
0
def Splicing():
    """Export the sum of C(n, k) for m <= k <= n, modulo 1000000.

    n and m are the two whitespace-separated integers of the input file.
    """
    raw_n, raw_m = f.LoadFile('\\rosalind_aspc.txt').split()
    n, m = int(raw_n), int(raw_m)

    total = 0
    size = m
    while size <= n:
        total += nCr(n, size)
        size += 1

    f.ExportToFile('rosalind_aspc_output.txt', str(total % 1000000))
Example #25
0
def Splicing():
    """Export the protein obtained after removing introns from a DNA string.

    The first FASTA record is the DNA string; every later record is an
    intron, all occurrences of which are deleted in order.  The remainder
    is passed through DNAtoRNA and RNAtoProtein and written to
    'rosalind_splc_output.txt'.
    """
    fasta_text = f.LoadFile('\\rosalind_splc.txt')
    [Label, DNA] = f.FASTA(fasta_text)

    exons = DNA[0]
    for intron in DNA[1:]:
        exons = exons.replace(intron, '')  # cut the intron out

    protein = RNAtoProtein(DNAtoRNA(exons))
    f.ExportToFile('rosalind_splc_output.txt', protein)
Example #26
0
def EditDistance():
    """Export the value MakeMatrixDist computes for two FASTA strings.

    The input must contain exactly two FASTA records; the result (the edit
    distance, per MakeMatrixDist) is written to 'rosalind_edit_output.txt'.
    """
    fasta_text = f.LoadFile('\\rosalind_edit.txt')
    Labels, (first, second) = f.FASTA(fasta_text)

    distance = MakeMatrixDist([], len(first), len(second), first, second)
    f.ExportToFile('rosalind_edit_output.txt', str(distance))
Example #27
0
def GenotypeFromPedigree(newick):
    """Reduce a pedigree tree to its final values and export them.

    The tree text is read from the input file (the ``newick`` argument is
    not used).  After ReduceTree, SolveProbabilities is applied for as long
    as CountParantheses reports a truthy value; the remaining '(', ')' and
    ';' characters are stripped and the comma-separated values are written
    space-separated to 'rosalind_mend_output.txt'.
    """
    pedigree = ReduceTree(f.LoadFile('\\rosalind_mend.txt'))
    while CountParantheses(pedigree):
        pedigree = SolveProbabilities(pedigree)

    # Strip the Newick punctuation that is left over.
    flat = pedigree
    for symbol in ('(', ')', ';'):
        flat = flat.replace(symbol, '')

    f.ExportToFile('rosalind_mend_output.txt', ' '.join(flat.split(',')))
Example #28
0
def RNAtoProtein():
    """Export the protein encoded by the RNA string in the input file.

    The string is read three characters at a time; each codon is
    translated with RNA_dict until a stop codon (UAA, UAG, UGA) or the end
    of the string is reached.
    """
    rna = f.LoadFile('\\rosalind_prot.txt')
    stop_codons = ('UAA', 'UAG', 'UGA')

    residues = []
    for start in range(0, len(rna), 3):
        codon = rna[start:start + 3]
        if codon in stop_codons:  # translation ends here
            break
        residues.append(RNA_dict[codon])

    f.ExportToFile('rosalind_prot_output.txt', ''.join(residues))
Example #29
0
def SortingByReversals():
    """Export the result of ReversalDistanceWithPairs for two permutations.

    The first input line is passed as the second argument and the second
    line as the first.  The output is the reported distance followed by
    one line per returned index group, values separated by spaces.
    """
    lines = f.LoadFile('\\rosalind_sort.txt').splitlines()
    target = [int(token) for token in lines[0].split()]
    source = [int(token) for token in lines[1].split()]

    distance, indices = ReversalDistanceWithPairs(source, target)

    report = [str(distance)]
    report.extend(' '.join(map(str, pair)) for pair in indices)
    f.ExportToFile('rosalind_sort_output.txt', '\n'.join(report))
Example #30
0
def Trie():
    """Build a trie from the input strings and export its edges.

    Starting from a root entry, each string is inserted by locating its
    attachment point with FindParent and extending the trie with BranchOff.
    Every entry except the root is written as one space-separated line.
    """
    words = f.LoadFile('\\rosalind_trie.txt').splitlines()

    trie = [(0, 1, '')]  # root!
    for word in words:
        parent, x = FindParent(trie, word)
        trie = BranchOff(trie, word, parent, x)

    # One line per edge; the root entry is bookkeeping only.
    rows = [' '.join(map(str, edge)) for edge in trie[1:]]
    f.ExportToFile('rosalind_trie_output.txt', '\n'.join(rows))