コード例 #1
0
ファイル: PDST.py プロジェクト: houdak/Rosalind
def DistanceMatrix():
    """Write the pairwise distance matrix of the FASTA strings to a file.

    The dataset is read with ``f.LoadFile`` and parsed with ``f.FASTA``.
    For every ordered pair of sequences the value returned by
    ``HammingDistance`` is divided by the length of the first sequence.
    Each matrix row is written as one space-separated line of
    ``rosalind_pdst_output.txt``.
    """
    raw_text = f.LoadFile('\\rosalind_pdst.txt')
    _labels, sequences = f.FASTA(raw_text)

    # One row per sequence; every cell already holds the text to print.
    rows = [
        [
            str(HammingDistance(first, second) / len(sequences[0]))
            for second in sequences
        ]
        for first in sequences
    ]

    report = '\n'.join(' '.join(row) for row in rows)
    f.ExportToFile('rosalind_pdst_output.txt', report)
コード例 #2
0
ファイル: GRPH.py プロジェクト: houdak/Rosalind
def OverlapGraph():
    """Write the overlap-by-3 adjacency list of the FASTA records to a file.

    A string points at another one when its last three characters equal the
    other's first three characters and the two strings are not equal.
    Strings are translated back to labels through their first position in
    the parsed list.  Every edge becomes one ``source target`` line of
    ``rosalind_grph_output.txt``.
    """
    raw_text = f.LoadFile('\\rosalind_grph.txt')
    labels, sequences = f.FASTA(raw_text)

    # Distinct strings (first-seen order) -> the strings they overlap into.
    successors = {}
    for source in dict.fromkeys(sequences):
        targets = [
            other for other in sequences
            if source[-3:] == other[:3] and source != other  # no self edges
        ]
        if targets:  # strings without any match are left out
            successors[source] = targets

    # Swap the strings for their labels.
    edges_by_label = {}
    for source, targets in successors.items():
        source_label = labels[sequences.index(source)]
        edges_by_label[source_label] = [
            labels[sequences.index(target)] for target in targets
        ]

    lines = [
        ' '.join([source_label, target_label])
        for source_label, target_labels in edges_by_label.items()
        for target_label in target_labels
    ]
    f.ExportToFile('rosalind_grph_output.txt', '\n'.join(lines))
コード例 #3
0
ファイル: CORR.py プロジェクト: houdak/Rosalind
def ErrorCorrection():
    """Write ``read->correction`` lines for the reads considered erroneous.

    A read is kept as correct when ``Freq(read, reads)`` is greater than 1
    (the original note says this may include reverse complements; that
    depends on ``Freq``, which is not visible here).  The correct reads and
    their reverse complements, passed through ``RemoveDuplicates``, form
    the reference collection.  Every read that is not in that collection is
    paired with the result of ``MinimumDistance`` and written to
    ``rosalind_corr_output.txt``, one pair per line.
    """
    raw_text = f.LoadFile('\\rosalind_corr.txt')
    _labels, reads = f.FASTA(raw_text)

    trusted = [read for read in reads if Freq(read, reads) > 1]

    # Each trusted read is followed by its reverse complement.
    both_strands = []
    for read in trusted:
        both_strands.extend((read, ReverseComplement(read)))
    reference = RemoveDuplicates(both_strands)

    corrections = []
    for read in reads:
        if read in reference:
            continue  # already a correct read
        closest = MinimumDistance(read, reference)
        corrections.append('%s->%s' % (read, closest))

    f.ExportToFile('rosalind_corr_output.txt', '\n'.join(corrections))
コード例 #4
0
ファイル: TRAN.py プロジェクト: houdak/Rosalind
def TT():
    """Write the transition/transversion ratio of two FASTA DNA strings.

    The first string is walked position by position and compared with the
    symbol at the same index of the second string.  Equal symbols are
    skipped.  A mismatch is a transition when both symbols are in 'AG' or
    both are in 'CT' (A<->G, C<->T); every other mismatch is a
    transversion.  The ratio is written to ``rosalind_tran_output.txt``.
    """
    raw_text = f.LoadFile('\\rosalind_tran.txt')
    _labels, sequences = f.FASTA(raw_text)
    first, second = sequences[0], sequences[1]

    transitions = 0
    transversions = 0
    for position, left in enumerate(first):
        right = second[position]
        if left == right:
            continue
        both_purines = (left in 'AG') and (right in 'AG')
        both_pyrimidines = (left in 'CT') and (right in 'CT')
        if both_purines or both_pyrimidines:
            transitions += 1
        else:
            transversions += 1

    f.ExportToFile('rosalind_tran_output.txt',
                   str(transitions / transversions))
コード例 #5
0
ファイル: SPLC.py プロジェクト: houdak/Rosalind
def Splicing():
    """Write the protein obtained after removing introns from a DNA string.

    The first FASTA record is the full string; every following record is
    removed from it wherever it occurs.  What remains is passed through
    ``DNAtoRNA`` and ``RNAtoProtein`` and written to
    ``rosalind_splc_output.txt``.
    """
    raw_text = f.LoadFile('\\rosalind_splc.txt')
    _labels, sequences = f.FASTA(raw_text)

    exons = sequences[0]
    for intron in sequences[1:]:
        exons = exons.replace(intron, '')

    transcript = DNAtoRNA(exons)
    protein = RNAtoProtein(transcript)
    f.ExportToFile('rosalind_splc_output.txt', protein)
コード例 #6
0
ファイル: EDIT.py プロジェクト: houdak/Rosalind
def EditDistance():
    """Write the ``MakeMatrixDist`` result for two FASTA strings to a file.

    ``MakeMatrixDist`` receives an empty list, both string lengths and both
    strings; whatever it returns is converted with ``str`` and written to
    ``rosalind_edit_output.txt``.
    """
    raw_text = f.LoadFile('\\rosalind_edit.txt')
    _labels, (first, second) = f.FASTA(raw_text)

    outcome = MakeMatrixDist([], len(first), len(second), first, second)
    f.ExportToFile('rosalind_edit_output.txt', str(outcome))
コード例 #7
0
def GlobalAlignment():
    """Write the ``MakeMatrixGlobal`` score for two FASTA strings to a file.

    ``MakeMatrixGlobal`` receives an empty list, both string lengths and
    both strings; its return value is written as text to
    ``rosalind_glob_output.txt``.
    """
    raw_text = f.LoadFile('\\rosalind_glob.txt')
    _labels, (first, second) = f.FASTA(raw_text)

    score = MakeMatrixGlobal([], len(first), len(second), first, second)
    f.ExportToFile('rosalind_glob_output.txt', str(score))
コード例 #8
0
ファイル: CTEA.py プロジェクト: houdak/Rosalind
def CountOptimalAlignments():
    """Return the ``ConstructPath`` count for two FASTA strings, reduced
    modulo 134217727.

    The matrix produced by ``MakeMatrixDist`` (second element of its
    result) is handed to ``ConstructPath`` together with both string
    lengths and both strings.  Nothing is written to disk.
    """
    raw_text = f.LoadFile('\\rosalind_ctea.txt')
    _labels, (first, second) = f.FASTA(raw_text)
    rows, cols = len(first), len(second)

    # Only the matrix is needed here; the distance itself is not used.
    _distance, table = MakeMatrixDist([], rows, cols, first, second)

    path_count = ConstructPath(table, rows, cols, first, second)
    return path_count % 134217727
コード例 #9
0
def EditDistanceAlignment():
    """Write the edit distance and one alignment of two FASTA strings.

    ``MakeMatrixDist`` supplies the distance and the matrix;
    ``InterpretMatrix`` turns the matrix into the two aligned sequences.
    The output file ``rosalind_edta_output.txt`` holds three lines: the
    distance, the first aligned string and the second aligned string.
    """
    raw_text = f.LoadFile('\\rosalind_edta.txt')
    _labels, (first, second) = f.FASTA(raw_text)
    rows, cols = len(first), len(second)

    distance, table = MakeMatrixDist([], rows, cols, first, second)
    first_aligned, second_aligned = InterpretMatrix(table, rows, cols,
                                                    first, second)

    report = '\n'.join([
        str(distance),
        ''.join(first_aligned),
        ''.join(second_aligned),
    ])
    f.ExportToFile('rosalind_edta_output.txt', report)
コード例 #10
0
ファイル: LCSM.py プロジェクト: houdak/Rosalind
def SharedMotif():
    """Write the longest motif shared by all FASTA strings to a file.

    Candidate motifs are the substrings of the shortest input string,
    tried from the longest window down to single characters and, within
    one length, from left to right.  The first candidate accepted by
    ``InAll(candidate, sequences)`` is written to
    ``rosalind_lcsm_output.txt`` and the function returns.  If no candidate
    is accepted, nothing is written.
    """
    raw_text = f.LoadFile('\\rosalind_lcsm.txt')
    _labels, sequences = f.FASTA(raw_text)

    # A shared motif can never be longer than the shortest string, so that
    # string gives the smallest candidate set.  A bare min() would compare
    # the strings alphabetically and pick the wrong one.
    template = min(sequences, key=len)
    size = len(template)

    for width in range(size, 0, -1):  # longest candidates first
        for start in range(size - width + 1):  # slide the window
            candidate = template[start:start + width]
            # InAll is presumably a boolean predicate; any truthy result
            # is treated as "present in every string".
            if InAll(candidate, sequences):
                f.ExportToFile('rosalind_lcsm_output.txt', candidate)
                return
コード例 #11
0
ファイル: MULT.py プロジェクト: houdak/Rosalind
def MultipleAlignment():
    """Write the best multiple-alignment score and its aligned strings.

    ``GappedStrings`` must yield four collections of candidate gapped
    strings, one per input sequence.  Every combination of one candidate
    per sequence is scored with ``MultiAlignScore``; the first combination
    reaching the highest score wins.  The score followed by the four
    aligned strings is written to ``rosalind_mult_output.txt``.

    This is a brute-force search meant for very short inputs only.

    Raises:
        ValueError: if there is no combination to score.
    """
    raw_text = f.LoadFile('\\rosalind_mult.txt')
    _labels, strings = f.FASTA(raw_text)
    a, b, c, d = GappedStrings(strings)

    best_score = None
    best_combo = None
    # Walk the product lazily and score every combination exactly once.
    # NOTE(review): ``k`` is presumably itertools under an alias; confirm
    # against the file's imports.
    for combo in k.product(a, b, c, d):
        score = MultiAlignScore(combo)
        # Strict comparison keeps the earliest combination on ties.
        if best_score is None or score > best_score:
            best_score = score
            best_combo = combo

    if best_combo is None:
        raise ValueError('no candidate alignments to score')

    output = [str(best_score), '\n'.join(best_combo)]
    f.ExportToFile('rosalind_mult_output.txt', '\n'.join(output))
    return
コード例 #12
0
def MaxMatching():
    """Write the number of maximum matchings for an RNA string (FASTA).

    The symbols A, U, C and G are counted.  For each complementary pair
    (A/U and C/G) ``nPr`` is called with the larger count first and the
    smaller count second; the product of both results is written to
    ``rosalind_mmch_output.txt``.
    """
    raw_text = f.LoadFile('\\rosalind_mmch.txt')
    _label, rna = f.FASTA(raw_text)

    tally = dict.fromkeys('AUCG', 0)
    for base in rna:
        tally[base] += 1

    # sorted() puts the smaller count first, the larger one second.
    few_au, many_au = sorted((tally['A'], tally['U']))
    few_cg, many_cg = sorted((tally['C'], tally['G']))

    total = nPr(many_au, few_au) * nPr(many_cg, few_cg)
    f.ExportToFile('rosalind_mmch_output.txt', str(total))
コード例 #13
0
ファイル: KMER.py プロジェクト: houdak/Rosalind
def kmerComp():
    """Write the 4-mer composition of a DNA string (FASTA) to a file.

    ``Lex('ACGT', 4)`` supplies the 4-mers in output order.  For each of
    them, the number of windows of length four in the string that equal it
    is counted (overlapping windows included).  The counts are written
    space-separated to ``rosalind_kmer_output.txt``.
    """
    raw_text = f.LoadFile('\\rosalind_kmer.txt')
    _label, s = f.FASTA(raw_text)

    all_kmers = Lex('ACGT', 4)
    window_starts = range(len(s) - 3)  # len - k + 1 windows of size k = 4

    counts = [
        str(sum(1 for start in window_starts if s[start:start + 4] == kmer))
        for kmer in all_kmers
    ]
    f.ExportToFile('rosalind_kmer_output.txt', ' '.join(counts))
コード例 #14
0
def ReversePalindromes():
    """Write position and length of every reverse palindrome of length
    4 to 12 in a DNA string (FASTA).

    A window is a reverse palindrome when it equals the result of
    ``ReverseComplement`` applied to itself.  Each hit is written to
    ``rosalind_revp_output.txt`` as ``position length`` with a 1-based
    position, ordered by position and then by length.
    """
    shortest = 4
    longest = 12

    raw_text = f.LoadFile('\\rosalind_revp.txt')
    _label, s = f.FASTA(raw_text)

    total = len(s)
    hits = []
    for start in range(total):
        # Never look at windows longer than ``longest`` or past the end.
        last_end = min(start + longest, total)
        for end in range(start + shortest, last_end + 1):
            window = s[start:end]
            if window == ReverseComplement(window):
                # Second field is the window length (end - start); the
                # previous ``end - 1`` was neither a length nor a
                # consistent end position.
                hits.append(' '.join((str(start + 1), str(end - start))))

    f.ExportToFile('rosalind_revp_output.txt', '\n'.join(hits))
    return
コード例 #15
0
def ConsensusandProfile():
    """Write a consensus string and the profile matrix of FASTA DNA strings.

    The profile holds, for each of A, C, G and T, how often the symbol
    occurs in every column; the column count is the length of the first
    string.  The consensus takes, per column, the first symbol in the order
    A, C, G, T that reaches the column maximum.  The output file
    ``rosalind_cons_output.txt`` contains the consensus followed by one
    ``X: counts`` line per symbol.
    """
    raw_text = f.LoadFile('\\rosalind_cons.txt')
    _labels, sequences = f.FASTA(raw_text)

    width = len(sequences[0])

    # Zero-filled count rows, in the order A, C, G, T.
    profile = {base: [0] * width for base in 'ACGT'}
    for sequence in sequences:
        for column, base in enumerate(sequence):
            profile[base][column] += 1

    # Per column: first symbol (A, C, G, T order) holding the top count.
    picks = []
    for column in range(width):
        peak = max(counts[column] for counts in profile.values())
        picks.append(next(
            base for base, counts in profile.items()
            if counts[column] == peak
        ))
    consensus = ''.join(picks)

    def counts_text(base):
        return ' '.join(map(str, profile[base]))

    report = '\n'.join([
        consensus,
        'A: %s' % counts_text('A'),
        'C: %s' % counts_text('C'),
        'G: %s' % counts_text('G'),
        'T: %s' % counts_text('T'),
    ])
    f.ExportToFile('rosalind_cons_output.txt', report)
コード例 #16
0
def GC():
    """Write the label and GC percentage of the most GC-rich FASTA string.

    For each sequence the share of symbols that are G or C is computed as
    a percentage.  The label belonging to the first occurrence of the
    highest percentage and that percentage are written on two lines to
    ``rosalind_gc_output.txt``.
    """
    raw_text = f.LoadFile('\\rosalind_gc.txt')
    labels, sequences = f.FASTA(raw_text)

    percentages = [
        sum(1 for base in sequence if base in 'GC') / len(sequence) * 100
        for sequence in sequences
    ]

    top = max(percentages)
    winner = labels[percentages.index(top)]
    f.ExportToFile('rosalind_gc_output.txt', '\n'.join([winner, str(top)]))
コード例 #17
0
def FailureArray():
    """Write the failure array of a DNA string (FASTA) to a file.

    Entry ``i`` of the array is the length of the longest proper prefix of
    the string that also ends at position ``i``.  The entries are written
    space-separated to ``rosalind_kmp_output.txt``.
    """
    raw_text = f.LoadFile('\\rosalind_kmp.txt')
    _label, s = f.FASTA(raw_text)

    failure = [0] * len(s)
    matched = 0  # length of the prefix currently matched

    for pos in range(1, len(s)):
        # Fall back to shorter borders until the next symbol fits.
        while matched > 0 and s[matched] != s[pos]:
            matched = failure[matched - 1]

        if s[matched] == s[pos]:
            matched += 1
        failure[pos] = matched

    f.ExportToFile('rosalind_kmp_output.txt', ' '.join(map(str, failure)))
コード例 #18
0
ファイル: ORF.py プロジェクト: houdak/Rosalind
def ORF():
    """Write all distinct candidate proteins from the ORFs of a DNA string.

    Both the given strand and its reverse complement are converted to RNA
    and scanned.  For every start codon (AUG) the sequence up to, but not
    including, the first in-frame stop codon (UAA, UAG, UGA) after it is
    collected.  The collected sequences are translated with
    ``RNAtoProtein``, deduplicated with ``RemoveDuplicates`` and written
    one per line to ``rosalind_orf_output.txt``.
    """
    stop_codons = ('UAA', 'UAG', 'UGA')

    raw_text = f.LoadFile('\\rosalind_orf.txt')
    _labels, dna = f.FASTA(raw_text)

    # The forward strand must be scanned as well; previously only the
    # reverse complement was searched.
    # NOTE(review): assumes f.FASTA yields a single sequence here, as the
    # original ReverseComplement(DNA) call implies.
    strands = [dna, ReverseComplement(dna)]

    reading_frames = []
    for strand in strands:
        rna = DNAtoRNA(strand)

        # Positions of every start and stop codon on this strand.
        starts = []
        stops = []
        for pos in range(len(rna)):
            codon = rna[pos:pos + 3]
            if codon == 'AUG':
                starts.append(pos)
            elif codon in stop_codons:
                stops.append(pos)

        # Pair each start with its first in-frame stop.  This stays inside
        # the strand loop so starts/stops of different strands never mix.
        for start in starts:
            for stop in stops:
                if stop > start and (stop - start) % 3 == 0:
                    reading_frames.append(rna[start:stop])
                    break

    proteins = RemoveDuplicates(
        [RNAtoProtein(frame) for frame in reading_frames])
    f.ExportToFile('rosalind_orf_output.txt', '\n'.join(proteins))
    return
コード例 #19
0
ファイル: SSEQ.py プロジェクト: houdak/Rosalind
def SplicedMotif():
    """Write 1-based positions at which the second FASTA string occurs as a
    subsequence of the first.

    The first string is scanned once from left to right; each time the
    current symbol equals the next unmatched symbol of the second string,
    its position is recorded.  Scanning stops as soon as the whole second
    string is matched.  The positions are written space-separated to
    ``rosalind_sseq_output.txt``.
    """
    raw_text = f.LoadFile('\\rosalind_sseq.txt')
    _labels, sequences = f.FASTA(raw_text)
    s, t = sequences[0], sequences[1]

    positions = []
    matched = 0  # number of symbols of t already located
    for position, symbol in enumerate(s, start=1):
        if symbol != t[matched]:
            continue
        positions.append(str(position))
        matched += 1
        if matched == len(t):
            break

    f.ExportToFile('rosalind_sseq_output.txt', ' '.join(positions))
コード例 #20
0
def SuperString():
    """Write a superstring assembled greedily from FASTA DNA strings.

    While more than two strings remain, ``Overlap`` is evaluated for every
    ordered pair, ``MaxMatrix`` picks a pair of indices from the resulting
    matrix, and that pair is replaced by the result of ``Combine``.  The
    last two strings are combined once more and the result is written to
    ``rosalind_long_output.txt``.
    """
    raw_text = f.LoadFile('\\rosalind_long.txt')
    _labels, fragments = f.FASTA(raw_text)

    while len(fragments) > 2:
        size = len(fragments)
        overlaps = [[0] * size for _ in range(size)]

        # Cells are addressed through the first index of each string.
        for left in fragments:
            for right in fragments:
                score = Overlap(left, right)
                overlaps[fragments.index(left)][fragments.index(right)] = score

        # First two entries of the MaxMatrix result are the pair's indices.
        best = MaxMatrix(overlaps)
        row, col = best[0], best[1]
        merged = Combine(fragments[row], fragments[col])

        # Take both operands before removal shifts the indices.
        first = copy.copy(fragments[row])
        second = copy.copy(fragments[col])
        fragments.remove(first)
        fragments.remove(second)
        fragments.append(merged)

    f.ExportToFile('rosalind_long_output.txt',
                   Combine(fragments[0], fragments[1]))
コード例 #21
0
ファイル: LCSQ.py プロジェクト: houdak/Rosalind
def LCS():
    """Write a longest common subsequence of two FASTA strings to a file.

    ``MakeMatrix`` builds the comparison matrix; its largest entry is used
    as the target length.  ``LastElement`` provides the starting state and
    ``ExpandLCS`` is applied until the collected sequence reaches the
    target length.  The collected symbols are reversed, joined and written
    to ``rosalind_lcsq_output.txt``.
    """
    raw_text = f.LoadFile('\\rosalind_lcsq.txt')
    _labels, sequences = f.FASTA(raw_text)
    p, q = sequences[0], sequences[1]

    table = MakeMatrix([], len(p), len(q), p, q)

    # Largest entry of the matrix = number of symbols to collect.
    target_length = max(max(row) for row in table)
    maxcol, maxrow, lcs, query = LastElement(table, p, target_length)

    while len(lcs) < target_length:
        maxcol, maxrow, lcs, query = ExpandLCS(table, p, maxcol, maxrow,
                                               query, lcs)

    # Symbols were gathered back to front.
    f.ExportToFile('rosalind_lcsq_output.txt', ''.join(reversed(lcs)))
コード例 #22
0
def NglyMotif():
    """Write the N-glycosylation motif positions of UniProt proteins.

    Each non-blank line of the dataset is used as an ID to download a FASTA
    record from uniprot.org.  The protein is scanned for the motif
    N{P}[ST]{P}: 'N', then anything but 'P', then 'S' or 'T', then anything
    but 'P'.  Overlapping matches are reported.  For every protein with at
    least one match, ``rosalind_mprt_output.txt`` receives the ID on one
    line and the 1-based match positions, space-separated, on the next.

    Network errors raised by ``urllib.request.urlopen`` are not caught.
    """
    motif_length = 4

    protein_ids = f.LoadFile('\\rosalind_mprt.txt').splitlines()
    output = []
    for protein_id in protein_ids:
        if not protein_id.strip():
            continue  # a blank line would request a meaningless URL

        url = 'http://www.uniprot.org/uniprot/' + str(protein_id) + '.fasta'
        # The context manager closes the HTTP response even on errors.
        with urllib.request.urlopen(url) as response:
            fasta = response.read().decode("utf-8")
        _labels, protein = f.FASTA(fasta)

        # len - motif_length + 1 windows, so the final four residues are
        # checked too (the previous bound stopped one window early).
        locs = []
        for start in range(len(protein) - motif_length + 1):
            window = protein[start:start + motif_length]
            if (window[0] == 'N' and window[1] != 'P'
                    and window[2] in 'ST' and window[3] != 'P'):
                locs.append(str(start + 1))

        if locs:
            output.append(protein_id)
            output.append(' '.join(locs))

    f.ExportToFile('rosalind_mprt_output.txt', '\n'.join(output))
    return