Ejemplo n.º 1
0
def DistanceMatrix():
    """Export the pairwise distance matrix of the FASTA sequences.

    The sequences are read from the rosalind_pdst input file via
    ``f.LoadFile`` / ``f.FASTA``. Each matrix cell is
    ``HammingDistance(a, b)`` divided by the length of the first sequence,
    so all sequences are presumably expected to share that length.
    One space-separated row per sequence is written to
    'rosalind_pdst_output.txt'. Returns None.
    """
    raw_text = f.LoadFile('\\rosalind_pdst.txt')
    _labels, sequences = f.FASTA(raw_text)

    rows = []
    for left in sequences:
        # One row per sequence: its normalised distance to every sequence
        # (including itself), already converted to text for joining.
        cells = [
            str(HammingDistance(left, right) / len(sequences[0]))
            for right in sequences
        ]
        rows.append(' '.join(cells))

    f.ExportToFile('rosalind_pdst_output.txt', '\n'.join(rows))
    return
Ejemplo n.º 2
0
def OverlapGraph():
    """Export the overlap-graph adjacency list of the FASTA sequences.

    An edge ``s -> t`` is reported when the last three characters of ``s``
    equal the first three characters of ``t`` and the two strings differ.
    Edges are written as 'label_s label_t' lines to
    'rosalind_grph_output.txt'. Returns None.

    Sequences are keyed by their string value, so a repeated sequence is
    only considered once as a source and is always reported under the label
    of its first occurrence.
    """
    raw_text = f.LoadFile('\\rosalind_grph.txt')
    labels, sequences = f.FASTA(raw_text)

    # Distinct sequences in first-seen order (dicts preserve insertion order).
    sources = dict.fromkeys(sequences)

    edges_by_label = {}
    for source in sources:
        targets = [
            candidate for candidate in sequences
            if source[-3:] == candidate[:3]  # overlap of length 3
            and source != candidate          # never link a string to itself
        ]
        if not targets:
            continue  # sequences without outgoing edges are not reported
        source_label = labels[sequences.index(source)]
        edges_by_label[source_label] = [
            labels[sequences.index(target)] for target in targets
        ]

    lines = [
        ' '.join([source_label, target_label])
        for source_label, target_labels in edges_by_label.items()
        for target_label in target_labels
    ]
    f.ExportToFile('rosalind_grph_output.txt', '\n'.join(lines))
    return
Ejemplo n.º 3
0
def ErrorCorrection():
    """Export 'bad_read->corrected_read' lines for the erroneous reads.

    A read is treated as correct when ``Freq(read, reads)`` exceeds 1
    (presumably counting reverse complements too; ``Freq`` is defined
    elsewhere). The trusted set is every correct read plus its reverse
    complement, de-duplicated with ``RemoveDuplicates``. Every read outside
    that set is paired with the result of ``MinimumDistance`` against the
    trusted set, and the pairs are written to 'rosalind_corr_output.txt'.
    Returns None.
    """
    raw_text = f.LoadFile('\\rosalind_corr.txt')
    _labels, reads = f.FASTA(raw_text)

    # Reads seen more than once are taken as error-free.
    repeated = [read for read in reads if Freq(read, reads) > 1]

    # A correct read's reverse complement is just as trustworthy.
    with_complements = []
    for read in repeated:
        with_complements.extend((read, ReverseComplement(read)))
    trusted = RemoveDuplicates(with_complements)

    # Anything not trusted gets mapped to its best-matching trusted read.
    corrections = [
        '%s->%s' % (read, MinimumDistance(read, trusted))
        for read in reads
        if read not in trusted
    ]
    f.ExportToFile('rosalind_corr_output.txt', '\n'.join(corrections))
    return
Ejemplo n.º 4
0
def TT():
    """Export the transition/transversion ratio of two FASTA DNA strings.

    Transitions:   A<->G, C<->T (both bases are in the same class)
    Transversions: A<->T, A<->C, G<->C, G<->T (any other mismatch)

    The first two sequences of the rosalind_tran input are compared position
    by position over the length of the first one. The ratio is written to
    'rosalind_tran_output.txt'. Returns None.

    Raises ZeroDivisionError when no transversion is found, and IndexError
    when the second string is shorter than the first.
    """
    raw_text = f.LoadFile('\\rosalind_tran.txt')
    _labels, sequences = f.FASTA(raw_text)
    first = sequences[0]
    second = sequences[1]

    purines = 'AG'
    pyrimidines = 'CT'
    transitions = 0
    transversions = 0
    for position, base in enumerate(first):
        other = second[position]
        if base == other:
            continue  # matching positions contribute to neither count
        both_purines = base in purines and other in purines
        both_pyrimidines = base in pyrimidines and other in pyrimidines
        if both_purines or both_pyrimidines:
            transitions += 1
        else:
            transversions += 1

    f.ExportToFile('rosalind_tran_output.txt',
                   str(transitions / transversions))
    return
Ejemplo n.º 5
0
def Splicing():
    """Export the protein obtained after removing introns from a DNA string.

    The first FASTA record is the full DNA string and every following record
    is an intron. Each intron is deleted (all occurrences, in file order),
    and the remainder is passed through ``DNAtoRNA`` and ``RNAtoProtein``.
    The result is written to 'rosalind_splc_output.txt'. Returns None.
    """
    raw_text = f.LoadFile('\\rosalind_splc.txt')
    _label, sequences = f.FASTA(raw_text)

    spliced = sequences[0]
    introns = sequences[1:]
    for intron in introns:
        # str.replace drops every occurrence of the intron, leaving exons.
        spliced = spliced.replace(intron, '')

    protein = RNAtoProtein(DNAtoRNA(spliced))
    f.ExportToFile('rosalind_splc_output.txt', protein)
    return
Ejemplo n.º 6
0
def EditDistance():
    """Export the edit distance between the two FASTA strings.

    The two sequences of the rosalind_edit input are handed to
    ``MakeMatrixDist`` and only the resulting distance is written to
    'rosalind_edit_output.txt'. Returns None.
    """
    raw_text = f.LoadFile('\\rosalind_edit.txt')
    [_labels, [p, q]] = f.FASTA(raw_text)

    # The other callers of MakeMatrixDist in this file unpack its result as
    # [distance, matrix]; exporting the whole return value would dump the
    # matrix into the answer file as well, so keep only the distance.
    [editdistance, _matrix] = MakeMatrixDist([], len(p), len(q), p, q)

    f.ExportToFile('rosalind_edit_output.txt', str(editdistance))
    return
Ejemplo n.º 7
0
def GlobalAlignment():
    """Export the global alignment score of the two FASTA strings.

    The score is whatever ``MakeMatrixGlobal`` returns for the two sequences
    of the rosalind_glob input (presumably the maximum alignment score); its
    string form is written to 'rosalind_glob_output.txt'. Returns None.
    """
    raw_text = f.LoadFile('\\rosalind_glob.txt')
    _labels, (p, q) = f.FASTA(raw_text)

    # MakeMatrixGlobal receives an empty list to use as its matrix storage.
    score = MakeMatrixGlobal([], len(p), len(q), p, q)

    f.ExportToFile('rosalind_glob_output.txt', str(score))
    return
Ejemplo n.º 8
0
def CountOptimalAlignments():
    """Return the optimal-alignment count of two FASTA strings, modulo 134217727.

    Builds the matrix with ``MakeMatrixDist`` for the two sequences of the
    rosalind_ctea input and passes it to ``ConstructPath``, which presumably
    counts the optimal alignments. Unlike the sibling solvers this function
    returns its result instead of writing an output file.
    """
    raw_text = f.LoadFile('\\rosalind_ctea.txt')
    _labels, (p, q) = f.FASTA(raw_text)

    rows, cols = len(p), len(q)
    # Only the filled matrix is needed here; the distance itself is unused.
    _distance, table = MakeMatrixDist([], rows, cols, p, q)

    path_count = ConstructPath(table, rows, cols, p, q)
    return path_count % 134217727
Ejemplo n.º 9
0
def EditDistanceAlignment():
    """Export the edit distance and one alignment of two FASTA strings.

    ``MakeMatrixDist`` supplies the distance and the matrix;
    ``InterpretMatrix`` turns the matrix into the two aligned sequences.
    Three lines are written to 'rosalind_edta_output.txt': the distance,
    the first aligned string and the second aligned string. Returns None.
    """
    raw_text = f.LoadFile('\\rosalind_edta.txt')
    _labels, (p, q) = f.FASTA(raw_text)

    rows, cols = len(p), len(q)
    distance, table = MakeMatrixDist([], rows, cols, p, q)
    aligned_p, aligned_q = InterpretMatrix(table, rows, cols, p, q)

    # The aligned sequences are joined into plain strings before export.
    report = '\n'.join(
        [str(distance), ''.join(aligned_p), ''.join(aligned_q)])
    f.ExportToFile('rosalind_edta_output.txt', report)
    return
Ejemplo n.º 10
0
def SharedMotif():
    """Export a longest motif shared by all FASTA DNA strings.

    Candidate substrings are taken from the shortest sequence, longest
    first, and the first one accepted by ``InAll`` is written to
    'rosalind_lcsm_output.txt'. Nothing is written when no substring is
    shared. Returns None.

    Raises ValueError when the input contains no sequences.
    """
    raw_text = f.LoadFile('\\rosalind_lcsm.txt')
    _labels, sequences = f.FASTA(raw_text)

    # key=len is essential: a bare min() picks the lexicographically smallest
    # string, which may be far longer than the shortest one and wastes work
    # on candidates that cannot fit in every sequence.
    shortest = min(sequences, key=len)
    max_length = len(shortest)

    # Try the longest window first so the first hit is a longest motif.
    for size in range(max_length, 0, -1):
        # Slide the window across the shortest sequence.
        for start in range(max_length - size + 1):
            candidate = shortest[start:start + size]
            if InAll(candidate, sequences):
                f.ExportToFile('rosalind_lcsm_output.txt', candidate)
                return
Ejemplo n.º 11
0
def MultipleAlignment():
    """Export the best multiple-alignment score and its four aligned strings.

    Brute force: ``GappedStrings`` supplies candidate gapped versions of each
    of the four input strings, and every combination is scored with
    ``MultiAlignScore``. The first combination with the highest score is
    kept. Only practical for very short inputs.

    The score followed by the four aligned strings (one per line) is written
    to 'rosalind_mult_output.txt'. Returns None.

    Raises ValueError when there is no combination to score.
    """
    raw_text = f.LoadFile('\\rosalind_mult.txt')
    _labels, strings = f.FASTA(raw_text)
    a, b, c, d = GappedStrings(strings)

    best_score = -100000000000000000
    best_alignment = None
    # Iterate the product lazily instead of materialising every combination,
    # and score each combination exactly once.
    for combo in k.product(a, b, c, d):
        score = MultiAlignScore(combo)
        if score > best_score:  # strict '>' keeps the first best combination
            best_score = score
            best_alignment = combo

    if best_alignment is None:
        raise ValueError('no alignment combinations to score')

    output = [str(best_score), '\n'.join(best_alignment)]
    f.ExportToFile('rosalind_mult_output.txt', '\n'.join(output))
    return
Ejemplo n.º 12
0
def MaxMatching():
    """Export the number of maximum matchings for an RNA string.

    Counts the A, U, C and G symbols of the sequence read from the
    rosalind_mmch input, then multiplies ``nPr(max, min)`` of the A/U counts
    by ``nPr(max, min)`` of the C/G counts. The product is written to
    'rosalind_mmch_output.txt'. Returns None.

    Raises KeyError for any symbol other than A, U, C or G.
    """
    raw_text = f.LoadFile('\\rosalind_mmch.txt')
    _label, rna = f.FASTA(raw_text)

    counts = dict.fromkeys('AUCG', 0)
    for base in rna:
        counts[base] += 1

    # Order each complementary pair as (fewer, more): every base of the
    # rarer kind gets matched with a distinct base of the commoner kind.
    au_low, au_high = sorted((counts['A'], counts['U']))
    cg_low, cg_high = sorted((counts['C'], counts['G']))

    total = nPr(au_high, au_low) * nPr(cg_high, cg_low)
    f.ExportToFile('rosalind_mmch_output.txt', str(total))
    return
Ejemplo n.º 13
0
def kmerComp():
    """Export the 4-mer composition of a FASTA DNA string.

    For every 4-mer produced by ``Lex('ACGT', 4)`` (presumably in
    lexicographic order), counts its overlapping occurrences in the sequence
    and writes the counts, space separated and in that same order, to
    'rosalind_kmer_output.txt'. Returns None.
    """
    raw_text = f.LoadFile('\\rosalind_kmer.txt')
    _label, sequence = f.FASTA(raw_text)

    counts = []
    for kmer in Lex('ACGT', 4):
        # len - 3 == len - k + 1 window starts for k = 4.
        occurrences = sum(
            1
            for start in range(len(sequence) - 3)
            if sequence[start:start + 4] == kmer
        )
        counts.append(str(occurrences))

    f.ExportToFile('rosalind_kmer_output.txt', ' '.join(counts))
    return
Ejemplo n.º 14
0
def ReversePalindromes():
    """Export position and length of every reverse palindrome of length 4-12.

    A window is a reverse palindrome when it equals its own
    ``ReverseComplement``. For each hit a line '<start> <length>' is written
    to 'rosalind_revp_output.txt', where start is the 1-based position of the
    window in the sequence. Hits are ordered by start, then by length.
    Returns None.
    """
    raw_text = f.LoadFile('\\rosalind_revp.txt')
    _label, s = f.FASTA(raw_text)

    min_len = 4
    max_len = 12
    total = len(s)
    hits = []
    for start in range(total):
        # Only windows of 4..12 symbols that still fit inside the sequence.
        last_end = min(start + max_len, total)
        for end in range(start + min_len, last_end + 1):
            window = s[start:end]
            if window == ReverseComplement(window):
                # Report the window length (end - start), not its end index.
                hits.append('%d %d' % (start + 1, end - start))

    f.ExportToFile('rosalind_revp_output.txt', '\n'.join(hits))
    return
Ejemplo n.º 15
0
def ConsensusandProfile():
    """Export a consensus string and the profile matrix of FASTA DNA strings.

    The profile holds, for each of A, C, G and T, the number of sequences
    carrying that base at each position (positions are taken from the length
    of the first sequence). The consensus takes the most frequent base per
    position; ties go to the first of A, C, G, T.

    Output ('rosalind_cons_output.txt'): the consensus line followed by
    'A: ...', 'C: ...', 'G: ...' and 'T: ...' count rows. Returns None.
    """
    raw_text = f.LoadFile('\\rosalind_cons.txt')
    _labels, sequences = f.FASTA(raw_text)

    width = len(sequences[0])
    bases = ('A', 'C', 'G', 'T')

    # One zero-filled counter row per base.
    profile = {base: [0] * width for base in bases}
    for sequence in sequences:
        for column, base in enumerate(sequence):
            profile[base][column] += 1

    # max() returns the first maximal item, so ties resolve in A, C, G, T
    # order.
    consensus = ''.join(
        max(bases, key=lambda base: profile[base][column])
        for column in range(width)
    )

    def counts_row(base):
        return ' '.join(map(str, profile[base]))

    report = [
        consensus,
        'A: %s' % counts_row('A'),
        'C: %s' % counts_row('C'),
        'G: %s' % counts_row('G'),
        'T: %s' % counts_row('T'),
    ]

    f.ExportToFile('rosalind_cons_output.txt', '\n'.join(report))
    return
Ejemplo n.º 16
0
def GC():
    """Export the label and GC percentage of the most GC-rich FASTA string.

    GC content is the share of 'G' and 'C' symbols in a sequence, expressed
    as a percentage. The label of the first sequence with the highest value
    and that value are written on two lines to 'rosalind_gc_output.txt'.
    Returns None.

    Raises ZeroDivisionError for an empty sequence and ValueError when the
    input holds no sequences.
    """
    raw_text = f.LoadFile('\\rosalind_gc.txt')
    labels, sequences = f.FASTA(raw_text)

    percentages = [
        sum(1 for base in sequence if base in 'GC') / len(sequence) * 100
        for sequence in sequences
    ]

    best = max(percentages)
    # index() returns the first occurrence, i.e. the earliest record on ties.
    best_label = labels[percentages.index(best)]

    f.ExportToFile('rosalind_gc_output.txt',
                   '\n'.join([best_label, str(best)]))
    return
Ejemplo n.º 17
0
def FailureArray():
    """Export the failure array of a FASTA DNA string.

    Entry i holds the length of the longest proper prefix of the sequence
    that is also a suffix of its first i + 1 symbols. The array is written
    space separated to 'rosalind_kmp_output.txt'. Returns None.
    """
    raw_text = f.LoadFile('\\rosalind_kmp.txt')
    _label, sequence = f.FASTA(raw_text)

    failure = [0] * len(sequence)
    border = 0  # length of the border carried over from the previous position

    for position in range(1, len(sequence)):
        # Fall back to shorter borders until one can be extended.
        while border > 0 and sequence[border] != sequence[position]:
            border = failure[border - 1]

        if sequence[border] == sequence[position]:
            border += 1
        failure[position] = border

    f.ExportToFile('rosalind_kmp_output.txt',
                   ' '.join(str(value) for value in failure))
    return
Ejemplo n.º 18
0
def ORF():
    """Export the distinct candidate proteins of all ORFs of a DNA string.

    Both the given strand and its reverse complement are scanned. In each
    strand, an open reading frame runs from a start codon (AUG) to the first
    in-frame stop codon (UAA, UAG or UGA) after it; the stop codon itself is
    not part of the frame. Each frame is translated with ``RNAtoProtein``,
    duplicates are dropped with ``RemoveDuplicates`` and the proteins are
    written one per line to 'rosalind_orf_output.txt'. Returns None.

    NOTE(review): assumes ``f.FASTA`` yields the single sequence as a string
    here, as the existing ``ReverseComplement(DNA)`` call implies - confirm.
    """
    raw_text = f.LoadFile('\\rosalind_orf.txt')
    _labels, DNA = f.FASTA(raw_text)

    # ORFs can lie on either strand, so scan the sequence itself as well as
    # its reverse complement.
    strands = [DNA, ReverseComplement(DNA)]
    stop_codons = ('UAA', 'UAG', 'UGA')

    frames = []
    for strand in strands:
        rna = DNAtoRNA(strand)

        # Positions of every start and stop codon in this strand.
        starts = []
        stops = []
        for position in range(len(rna) - 2):
            codon = rna[position:position + 3]
            if codon == 'AUG':
                starts.append(position)
            elif codon in stop_codons:
                stops.append(position)

        # Pair each start with the first in-frame stop codon after it. This
        # must run per strand: the positions only make sense for this rna.
        for begin in starts:
            for end in stops:
                if end > begin and (end - begin) % 3 == 0:
                    frames.append(rna[begin:end])
                    break

    proteins = RemoveDuplicates([RNAtoProtein(frame) for frame in frames])
    f.ExportToFile('rosalind_orf_output.txt', '\n'.join(proteins))
    return
Ejemplo n.º 19
0
def SplicedMotif():
    """Export positions where the second FASTA string occurs as a subsequence.

    Greedy scan of the first string: each symbol of the second string is
    matched at the earliest possible position after the previous match. The
    1-based positions are written space separated to
    'rosalind_sseq_output.txt'. If the second string is not fully matched,
    only the positions found so far are written. Returns None.

    Raises IndexError when the second string is empty and the first is not.
    """
    raw_text = f.LoadFile('\\rosalind_sseq.txt')
    _labels, sequences = f.FASTA(raw_text)
    haystack = sequences[0]
    needle = sequences[1]

    positions = []
    matched = 0  # number of needle symbols located so far
    for position, symbol in enumerate(haystack, start=1):
        if symbol == needle[matched]:
            positions.append(str(position))
            matched += 1
            if matched == len(needle):
                break  # the whole needle has been placed

    f.ExportToFile('rosalind_sseq_output.txt', ' '.join(positions))
    return
Ejemplo n.º 20
0
def SuperString():
    """Export a superstring assembled greedily from the FASTA DNA strings.

    While more than two strings remain, the pair singled out by
    ``MaxMatrix`` on the pairwise ``Overlap`` matrix is merged with
    ``Combine`` and replaces its two parts. The last two strings are then
    combined (a single input string is exported unchanged) and the result is
    written to 'rosalind_long_output.txt'. Returns None.

    Raises ValueError when the input contains no sequences.
    """
    raw_text = f.LoadFile('\\rosalind_long.txt')
    _labels, DNA = f.FASTA(raw_text)

    if not DNA:
        raise ValueError('no DNA strings found in input')

    while len(DNA) > 2:  # repeat until at most two strings are left
        # Pairwise overlaps, row = left string, column = right string.
        # Built positionally, so duplicate strings get their own cells and
        # no per-cell index lookups are needed.
        overlap_matrix = [
            [Overlap(left, right) for right in DNA] for left in DNA
        ]

        # MaxMatrix reports the (row, column) indices of the best pair.
        best = MaxMatrix(overlap_matrix)
        first = DNA[best[0]]
        second = DNA[best[1]]

        # Replace the two parts with their merged superstring.
        merged = Combine(first, second)
        DNA.remove(first)
        DNA.remove(second)
        DNA.append(merged)

    if len(DNA) == 1:
        superstring = DNA[0]  # nothing left to combine
    else:
        superstring = Combine(DNA[0], DNA[1])
    f.ExportToFile('rosalind_long_output.txt', superstring)
    return
Ejemplo n.º 21
0
def LCS():
    """Export a longest common subsequence of the first two FASTA strings.

    ``MakeMatrix`` fills a comparison matrix for the two strings and its
    largest entry is taken as the subsequence length. ``LastElement``
    supplies the starting state and ``ExpandLCS`` is applied until the
    collected subsequence reaches that length. The collected symbols are
    reversed before being written to 'rosalind_lcsq_output.txt'.
    Returns None.
    """
    raw_text = f.LoadFile('\\rosalind_lcsq.txt')
    _labels, sequences = f.FASTA(raw_text)
    p, q = sequences[0], sequences[1]

    # Matrix comparing the two strings.
    table = MakeMatrix([], len(p), len(q), p, q)

    # The largest entry is the number of symbols that must be collected.
    target_length = max(max(row) for row in table)
    maxcol, maxrow, lcs, query = LastElement(table, p, target_length)

    # Keep expanding until the subsequence is complete.
    while len(lcs) < target_length:
        maxcol, maxrow, lcs, query = ExpandLCS(
            table, p, maxcol, maxrow, query, lcs)

    # Symbols were collected from the end, so flip them for output.
    f.ExportToFile('rosalind_lcsq_output.txt', ''.join(reversed(lcs)))
    return
Ejemplo n.º 22
0
def NglyMotif():
    """Export N-glycosylation motif locations for a list of UniProt IDs.

    Motif: N, then anything but P, then S or T, then anything but P.

    Each line of the rosalind_mprt input is treated as a UniProt ID whose
    FASTA record is downloaded from uniprot.org. For every protein with at
    least one match, the ID and the space-separated 1-based match positions
    are written on two consecutive lines of 'rosalind_mprt_output.txt'.
    Overlapping matches are all reported. Returns None.

    Network errors raised by ``urllib.request.urlopen`` are not caught.
    """
    motif_length = 4
    uniprot_ids = f.LoadFile('\\rosalind_mprt.txt').splitlines()

    output = []
    for uniprot_id in uniprot_ids:
        url = 'http://www.uniprot.org/uniprot/' + str(uniprot_id) + '.fasta'
        # Close the HTTP response deterministically once it has been read.
        with urllib.request.urlopen(url) as response:
            fasta = response.read().decode("utf-8")
        _labels, protein = f.FASTA(fasta)

        # Slide a 4-symbol window; the "+ 1" keeps the final window, which
        # ends on the last residue, in range.
        locs = []
        for start in range(len(protein) - motif_length + 1):
            window = protein[start:start + motif_length]
            # With N first and S/T third, "no P anywhere" is exactly the
            # constraint on the second and fourth residues.
            if window[0] == 'N' and window[2] in 'ST' and 'P' not in window:
                locs.append(str(start + 1))

        if locs:
            output.append(uniprot_id)
            output.append(' '.join(locs))

    f.ExportToFile('rosalind_mprt_output.txt', '\n'.join(output))
    return