Beispiel #1
0
def q2():
    def q2_helper(str1, str2):
        if len(str1) != len(str2):
            return 0
        else:
            ctr = 0
            for char in range(len(str1)):
                if str1[char] == str2[char]:
                    ctr += 1

        return (ctr * 1.0 / len(str1)) * 100

    human = data.read_protein(data.HUMAN_EYELESS_URL)
    fly = data.read_protein(data.FRUITFLY_EYELESS_URL)
    scores = data.read_scoring_matrix(data.PAM50_URL)
    c_pax = data.read_protein(data.CONSENSUS_PAX_URL)

    # get local alignment of human and fly
    a_matrix = soln.compute_alignment_matrix(human, fly, scores, False)
    l_score, l_h, l_ff = soln.compute_local_alignment(human, fly, scores, a_matrix)

    # removing the dashes
    l_h = l_h.replace("-", "")
    l_ff = l_ff.replace("-", "")

    # get global alignment matrix for each local string and pax
    pax_a_h_matrix = soln.compute_alignment_matrix(l_h, c_pax, scores, True)
    pax_a_ff_matrix = soln.compute_alignment_matrix(l_ff, c_pax, scores, True)

    # compute global alignment
    h_ga = soln.compute_global_alignment(l_h, c_pax, scores, pax_a_h_matrix)
    ff_ga = soln.compute_global_alignment(l_ff, c_pax, scores, pax_a_ff_matrix)

    print "human:\t\t", q2_helper(h_ga[1], h_ga[2])
    print "fruit fly:\t", q2_helper(ff_ga[1], ff_ga[2])
Beispiel #2
0
def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials):
    """
    return a dictionary scoring_distribution that represents an un-normalized distribution generated by performing the
    following process num_trials times
    - Generate a random permutation rand_y of the sequence seq_y using random.shuffle().
    - Compute the maximum value score for the local alignment of seq_x and rand_y using the score matrix scoring_matrix.
    - Increment the entry score in the dictionary scoring_distribution by one.

    A progress line is printed roughly every 5% of the trials (every trial
    when num_trials is smaller than 20).

    :rtype : dict
    :param seq_x: first input sequence
    :param seq_y: second input sequence
    :param scoring_matrix: scoring matrix
    :param num_trials: number of trials
    :return: dictionary scoring_distribution mapping each observed score to the number of trials that produced it
    """
    scoring_distribution = {}

    # Floor division with a lower bound of 1: the previous "num_trials / 20"
    # evaluated to 0 for num_trials < 20 under Python 2 and made the modulo
    # below raise ZeroDivisionError.
    progress_step = max(1, num_trials // 20)

    for idx in range(num_trials):
        # Shuffle a fresh copy each trial so seq_y itself is never modified.
        rand_y = list(seq_y)
        random.shuffle(rand_y)
        rand_y = "".join(rand_y)

        a_matrix = soln.compute_alignment_matrix(seq_x, rand_y, scoring_matrix, False)
        # Only the score (first element of the result) is needed here.
        score = soln.compute_local_alignment(seq_x, rand_y, scoring_matrix, a_matrix)[0]
        scoring_distribution[score] = scoring_distribution.get(score, 0) + 1

        if (idx + 1) % progress_step == 0:
            # A single pre-formatted argument prints the same bytes as the old
            # 'print "done: ", idx' statement and also parses as a print() call.
            # The value shown is the zero-based index of the trial just finished.
            print("done:  %d" % idx)

    return scoring_distribution
def analyze_data():
    """
    Load the eyeless proteins and the PAM50 scoring matrix and store them,
    together with their local alignment, in module-level globals.

    Assigns the globals human_protein, fruitfly_protein, scoring_matrix and
    alignment (the value returned by p.compute_local_alignment).
    """
    global human_protein, fruitfly_protein, alignment, scoring_matrix

    human_protein = read_protein(HUMAN_EYELESS_URL)
    fruitfly_protein = read_protein(FRUITFLY_EYELESS_URL)
    scoring_matrix = read_scoring_matrix(PAM50_URL)

    # The matrix is only an intermediate result and stays local.
    local_matrix = p.compute_alignment_matrix(
        human_protein, fruitfly_protein, scoring_matrix, False)
    alignment = p.compute_local_alignment(
        human_protein, fruitfly_protein, scoring_matrix, local_matrix)
Beispiel #4
0
def q7():
    x = "ABCABC"
    y = "ABCABC"

    sm = soln.build_scoring_matrix("ABC-", 2, 1, 0)
    print sm

    # global
    am = soln.compute_alignment_matrix(x, y, sm, True)
    print soln.compute_global_alignment(x, y, sm, am)
def q2_solution():
    consensus = read_protein(CONSENSUS_PAX_URL)
    #print "Consensus PAX domain: \n", consensus
    #print "\nLength of census PAX domain", len(consensus)

    local_alignment_human = alignment[1]
    local_alignment_human_no_dashes = local_alignment_human.replace("-", "")
    print "\nLocal_alignment_1: \n", local_alignment_human_no_dashes
    alignment_matrix1 = p.compute_alignment_matrix(
        consensus, local_alignment_human_no_dashes, scoring_matrix, True)
    global_alignment_consensus_vs_human = p.compute_global_alignment(
        consensus, local_alignment_human_no_dashes, scoring_matrix,
        alignment_matrix1)
    print "\nGlobal alignment consensus vs human:\n", global_alignment_consensus_vs_human

    global_human1 = global_alignment_consensus_vs_human[1]
    global_human2 = global_alignment_consensus_vs_human[2]
    num_agree1 = 0
    for indx in range(len(global_human1)):
        if global_human1[indx] == global_human2[indx]:
            num_agree1 += 1
    print "Pencentage of ageeing letters in global alignment of local human VS consensus: {}%".format(
        num_agree1 / float(len(global_human1)) * 100)

    local_alignment_fruitfly = alignment[2]
    local_alignment_fruitfly_no_dashes = local_alignment_fruitfly.replace(
        "-", "")
    print "\nLocal_alignment_2:\n", local_alignment_fruitfly_no_dashes
    alignment_matrix2 = p.compute_alignment_matrix(
        consensus, local_alignment_fruitfly_no_dashes, scoring_matrix, True)
    global_alignment_consensus_vs_fruitfly = p.compute_global_alignment(
        consensus, local_alignment_fruitfly_no_dashes, scoring_matrix,
        alignment_matrix2)
    print "\nGlobal alignment consensus vs chimp:\n", global_alignment_consensus_vs_fruitfly

    global_fruitfly1 = global_alignment_consensus_vs_fruitfly[1]
    global_fruitfly2 = global_alignment_consensus_vs_fruitfly[2]
    num_agree2 = 0
    for indx in range(len(global_fruitfly1)):
        if global_fruitfly1[indx] == global_fruitfly2[indx]:
            num_agree2 += 1
    print "Pencentage of ageeing letters in global alignment of local fruitfly VS consensus: {}%".format(
        num_agree2 / float(len(global_fruitfly1)) * 100)
Beispiel #6
0
def q1():
    human = data.read_protein(data.HUMAN_EYELESS_URL)
    fly = data.read_protein(data.FRUITFLY_EYELESS_URL)
    scores = data.read_scoring_matrix(data.PAM50_URL)

    a_matrix = soln.compute_alignment_matrix(human, fly, scores, False)
    print soln.compute_local_alignment(human, fly, scores, a_matrix)

    a_matrix = soln.compute_alignment_matrix(human, fly, scores, True)
    print soln.compute_global_alignment(human, fly, scores, a_matrix)

    # local answer
    b = (875,
         'HSGVNQLGGVFVNGRPLPDSTRQKIVELAHSGARPCDISRILQVSNGCVSKILGRYYETGSIRPRAIGGSKPRVATPEVVSKIAQYKRECPSIFAWEIRDRLLSEGVCTNDNIPSVSSINRVLRNLASEK-QQ',
         'HSGVNQLGGVFVGGRPLPDSTRQKIVELAHSGARPCDISRILQVSNGCVSKILGRYYETGSIRPRAIGGSKPRVATAEVVSKISQYKRECPSIFAWEIRDRLLQENVCTNDNIPSVSSINRVLRNLAAQKEQQ')

    # global answer
    a = (4,
         'MQN--------------------------------------S--------------HSGVNQLGGVFVNGRPLPDSTRQKIVELAHSGARPCDISRILQVSNGCVSKILGRYYETGSIRPRAIGGSKPRVATPEVVSKIAQYKRECPSIFAWEIRDRLLSEGVCTNDNIPSVSSINRVLRNLASEK-QQ--------------------------------------------M------------GA----DG-----MYDKLRMLN-------G--Q----T---G-S---WGTR---P----G------------W----YPG----T--------------SV---------------P---------G-Q---P--T-------Q-DGCQQ-QE-G-G-GENTNSISSN-GEDSDEAQMRLQLKRKLQRNRTSFTQEQIEALEKEFERTHYPDVFARERLAAKIDLPEARIQVWFSNRRAKWRREEKLRNQRR--Q-----A-----S---N-T--P------SH-I------P----I---SS-S-FSTSVYQP-----I--PQ-PT-TP-V-SSFTSGSMLGR-T-D-----T--AL-T----NT-Y--S-------AL-P---P-M---P-SF-TM-AN--N--LPM-Q------P-P------V-----PS----Q---T-SS-YSC-M-L---PTSPS----V--N-GR--------------------S-YD--T-YT--PPHM------Q-------------T--H-M--NS-Q-P-MGTS--GTT-STGL----ISPGV-S---V----P--VQ-V-P----G-S---EPDMSQ------YWPRLQ',
         'MRNLPCLGTAGGSGLGGIAGKPSPTMEAVEASTASHPHSTSSYFATTYYHLTDDECHSGVNQLGGVFVGGRPLPDSTRQKIVELAHSGARPCDISRILQVSNGCVSKILGRYYETGSIRPRAIGGSKPRVATAEVVSKISQYKRECPSIFAWEIRDRLLQENVCTNDNIPSVSSINRVLRNLAAQKEQQSTGSGSSSTSAGNSISAKVSVSIGGNVSNVASGSRGTLSSSTDLMQTATPLNSSESGGASNSGEGSEQEAIYEKLRLLNTQHAAGPGPLEPARAAPLVGQSPNHLGTRSSHPQLVHGNHQALQQHQQQSWPPRHYSGSWYPTSLSEIPISSAPNIASVTAYASGPSLAHSLSPPNDIESLASIGHQRNCPVATEDIHLKKELDG-HQSDETGSGEGENSNGGASNIG-NTEDDQARLILKRKLQRNRTSFTNDQIDSLEKEFERTHYPDVFARERLAGKIGLPEARIQVWFSNRRAKWRREEKLRNQRRTPNSTGASATSSSTSATASLTDSPNSLSACSSLLSGSAGGPSVSTINGLSSPSTLSTNVNAPTLGAGIDSSESPTPIPHIRPSCTSDNDNGRQSEDCRRVCSPCPLGVGGHQNTHHIQSNGHAQGHALVPAISPRLNFNSGSFGAMYSNMHHTALSMSDSYGAVTPIPSFNHSAVGPLAPPSPIPQQGDLTPSSLYPCHMTLRPPPMAPAHHHIVPGDGGRPAGVGLGSGQSANLGASCSGSGYEVLSAYALPPPPMASSSAADSSFSAASSASANVTPHHTIAQESCPSPCSSASHFGVAHSSGFSSDPISPAVSSYAHMSYNYASSANTMTPSSASGTSAHVAPGKQQFFASCFYSPWV-')
def check_spelling(checked_word, dist, word_list):
    """
    Return the set of words from word_list whose edit distance to
    checked_word is at most dist.

    The edit distance of a word is computed as
    len(word) + len(checked_word) - score, where score is the first element
    of the global alignment of checked_word and the word under a scoring
    matrix built over the lowercase letters with the arguments (2, 1, 0).
    """
    alphabet = list("qwertyuiopasdfghjklzxcvbnm")
    scoring = p.build_scoring_matrix(alphabet, 2, 1, 0)

    def distance_to(candidate):
        """Edit distance between checked_word and candidate."""
        global_matrix = p.compute_alignment_matrix(
            checked_word, candidate, scoring, True)
        alignment_score = p.compute_global_alignment(
            checked_word, candidate, scoring, global_matrix)[0]
        return len(candidate) + len(checked_word) - alignment_score

    return set(
        candidate for candidate in word_list
        if distance_to(candidate) <= dist)
def q7():
    seq_x = "kqistian"
    seq_y = "kristian"
    scor_matrix = p.build_scoring_matrix(
        ["a", "b", "e", "k", "q", "t", "r", "i", "t", "n", "s"], 2, 1, 0)
    #print scor_matrix
    #print ""
    align_matrix = p.compute_alignment_matrix(seq_x, seq_y, scor_matrix, True)
    print align_matrix
    score = p.compute_global_alignment(seq_x, seq_y, scor_matrix, align_matrix)
    print score
    #q7()

    #Question 8
    """
Beispiel #9
0
def check_spelling(checked_word, dist, word_list):
    wordlist = []
    word_len = len(checked_word)

    # global
    for word in word_list:
        am = soln.compute_alignment_matrix(checked_word, word, word_sm, True)
        g_al, dummy_x, dummy_y = soln.compute_global_alignment(checked_word, word, word_sm, am)

        score = word_len + len(word) - g_al

        if score <= dist:
            wordlist.append(word)
            print word, score

    return wordlist
def generate_null_distribution(seq_x, seq_y, scoring_matrix, num_trials):
    """
    Build an un-normalized distribution of local alignment scores of seq_x
    against random permutations of seq_y.

    The following is repeated num_trials times:

    - a random permutation rand_y of seq_y is produced with random.shuffle();
    - the local alignment score of seq_x and rand_y under scoring_matrix is
      computed;
    - the count stored for that score in the result is increased by one.

    Returns the dictionary mapping each observed score to its count.
    """
    scoring_distribution = {}

    for dummy_trial in range(num_trials):
        # A fresh copy per trial: every shuffle starts from the original order.
        shuffled = list(seq_y)
        random.shuffle(shuffled)
        rand_y = ''.join(shuffled)

        local_matrix = p.compute_alignment_matrix(
            seq_x, rand_y, scoring_matrix, False)
        local_result = p.compute_local_alignment(
            seq_x, rand_y, scoring_matrix, local_matrix)
        score = local_result[0]

        if score in scoring_distribution:
            scoring_distribution[score] += 1
        else:
            scoring_distribution[score] = 1

    return scoring_distribution
Beispiel #11
0
def q5():
    # calculate mean/standard deviation
    dist2 = {38: 3, 40: 7, 41: 13, 42: 23, 43: 27, 44: 29, 45: 45, 46: 74, 47: 69, 48: 72, 49: 71, 50: 65, 51: 52,
             52: 64, 53: 57, 54: 49, 55: 43, 56: 28, 57: 40, 58: 31, 59: 17, 60: 18, 61: 14, 62: 12, 63: 9, 64: 9,
             65: 4, 66: 10, 67: 8, 68: 5, 69: 6, 70: 2, 71: 3, 72: 4, 73: 1, 75: 3, 77: 4, 79: 2, 80: 1, 81: 1, 82: 2,
             84: 1, 85: 1, 93: 1}

    avg = numpy.mean(dist2.keys())
    std_d = numpy.std(dist2.keys())

    print "mean:", avg
    print "standard deviation: ", std_d

    # get local scores
    human = data.read_protein(data.HUMAN_EYELESS_URL)
    fly = data.read_protein(data.FRUITFLY_EYELESS_URL)
    scores = data.read_scoring_matrix(data.PAM50_URL)

    # get local alignment of human and fly
    a_matrix = soln.compute_alignment_matrix(human, fly, scores, False)
    l_score, l_h, l_ff = soln.compute_local_alignment(human, fly, scores, a_matrix)

    print "local score: ", l_score
    print "z score: ", (l_score - avg) / std_d