Beispiel #1
0
def do(sequences, query_seq, position, pattern, changed_pattern, SW=0):
    '''Return the change in score at `position` when the query is mutated.

    The column at `position` is scored twice with `calculate_sop`: once
    with the original `query_seq` added to `sequences`, and once with a
    copy of the query in which `util.strsub` has placed `changed_pattern`
    at `position`.

    sequences       -- aligned sequences; NOT modified by this function
    query_seq       -- aligned query sequence
    position        -- column to score (index into the aligned sequences)
    pattern         -- unused; kept so existing callers keep working
    changed_pattern -- replacement written into the query at `position`
    SW              -- truthy: weight sequences with
                       landgraf_sequence_weights; falsy: every sequence
                       gets weight 1

    Returns score(mutant) - score(original).
    '''

    def column_score(candidate):
        # Score on a fresh list: the caller's `sequences` used to be
        # appended to in place and came back with the mutant query added.
        pool = list(sequences)
        pool.append(candidate)
        patterns = check_mutation_position.do(pool, position)
        if SW:
            sw = landgraf_sequence_weights.do(pool)
        else:
            sw = list(np.ones(len(pool)))
        return calculate_sop(patterns, sw)

    # case 1: for original amino acid
    entropy1 = column_score(query_seq)

    # case 2: for mutant amino acid
    new_query_seq = util.strsub(query_seq, position, changed_pattern)
    entropy2 = column_score(new_query_seq)

    return (entropy2 - entropy1)
Beispiel #2
0
def do(file1, file2, winsize):
    '''Write window-based gapped-frequency scores for each mutation.

    file1   -- CSV input; each non-blank line is split on ',' and read as
               protein, position, original amino acid, mutant amino acid
               (any further fields are carried through to the output)
    file2   -- CSV output; one line per scored mutation holding the input
               fields followed by the scores
    winsize -- number of neighbours scored on each side of the mutation

    Score order per line: the mutant score at the mutation position, then
    for w = 1..winsize the original-residue score of the left neighbour
    followed by that of the right neighbour.

    A mutation is skipped (nothing written) when its alignment file does
    not exist, when the sequence weights cannot be computed, or when any
    position of the window cannot be scored, including a window that
    runs past either end of the FASTA sequence.
    '''
    mutations = []
    with open(file1) as fp:
        for line in fp:
            line = line.strip()
            if line:  # blank lines (e.g. a trailing newline) hold no mutation
                mutations.append(line.split(','))

    with open(file2, 'w') as ofp:
        for mut in mutations:

            print(mut)

            protein = mut[0]
            # NOTE(review): presumably converts a 1-based position to a
            # 0-based index -- confirm against the input format.
            position = int(mut[1]) - 1
            orig_aa = mut[2]
            mut_aa = mut[3]

            fasta_seq = util.read_sequence('../fasta/%s.fasta' % protein)

            alfile = '../alignments/%s' % protein
            if not os.path.isfile(alfile):
                continue

            proteins = load_alignments.do(alfile)
            proteins = util.prune_proteins_list(proteins)
            query, proteins = util.fetch_query_protein_in_alignments(proteins)
            query_seq = query['alignment']

            alignments = [a['alignment'] for a in proteins]
            try:
                sequence_weights = landgraf_sequence_weights.do(alignments)
            except Exception as e:
                print(str(e))
                continue

            scores = []
            try:
                # 0,1,2,3 for winsize=3, hence the +1
                for w in range(winsize + 1):
                    if w == 0:
                        # the mutation position itself: keep the mutant score
                        mod_pos = correct_alignment_position.do(
                            query_seq, position)
                        aa = check_mutation_position.do(alignments, mod_pos)
                        _, m = gapped_frequency_score.do(
                            list(aa), orig_aa, mut_aa, sequence_weights)
                        scores.append(m)
                        continue

                    # left neighbour first, then right neighbour, both at
                    # distance w from the mutation position
                    for neighbour in (position - w, position + w):
                        if neighbour < 0:
                            # A negative index would silently wrap around
                            # to the end of the sequence; treat it like
                            # running off the right end instead.
                            raise IndexError(
                                'window position %d is before the start '
                                'of the sequence' % neighbour)
                        mod_pos = correct_alignment_position.do(
                            query_seq, neighbour)
                        aa = check_mutation_position.do(alignments, mod_pos)
                        aa_in_fasta = fasta_seq[neighbour]
                        o, _ = gapped_frequency_score.do(
                            list(aa), aa_in_fasta, mut_aa, sequence_weights)
                        scores.append(o)
            except Exception as e:
                # best effort: report why the mutation is dropped, then
                # move on to the next one
                print(str(e))
                continue

            ofp.write(','.join([
                str(item) for item in list(chain.from_iterable([mut, scores]))
            ]) + '\n')
def do(file1, file2):
    '''Score every mutation listed in file1 and write the results to file2.

    file1 -- CSV input; each non-blank line must have at least five
             comma-separated fields, the first four being protein,
             position, original amino acid and mutant amino acid
             (IndexError is raised for shorter lines)
    file2 -- CSV output; one line per scored mutation holding the five
             input fields followed by thirteen scores

    Score order per line: simple frequency (original, mutant),
    pseudo-count (original, mutant), sequence-weighted frequency
    (original, mutant), gapped frequency (original, mutant), Shannon
    entropy, weighted Shannon entropy, von Neumann entropy, sum-of-pairs,
    weighted sum-of-pairs.

    A mutation is skipped when its alignment file does not exist, when no
    aligned sequences remain besides the query, or when the mutation
    position cannot be mapped onto the aligned query.
    '''
    # fetch all mutations
    mutations = []
    with open(file1) as fp:
        for line in fp:
            line = line.strip()
            if not line:
                # blank lines (e.g. a trailing newline) hold no mutation
                continue
            parts = line.split(',')
            mutations.append(
                [parts[0], parts[1], parts[2], parts[3], parts[4]])

    # the context manager closes the output even if a scorer raises midway
    with open(file2, 'w') as ofp:
        for mut in mutations:
            # fetch reqd info
            protein = mut[0]
            # NOTE(review): presumably converts a 1-based position to a
            # 0-based index -- confirm against the input format.
            position = int(mut[1]) - 1
            orig_aa = mut[2]
            mut_aa = mut[3]

            # if protein not aligned - pass this
            alignment_file = '../alignments/%s' % protein
            if not os.path.isfile(alignment_file):
                continue

            print(mut)

            # load the aligned sequences and prune the proteins
            alignments = load_alignments.do(alignment_file)
            alignments = util.prune_proteins_list(alignments)

            # fetch query sequence
            query, alignments = util.fetch_query_protein_in_alignments(
                alignments)
            if len(alignments) == 0:
                continue
            query_seq = query['alignment']

            # fetch actual position of mutation in aligned query sequence
            try:
                actual_pos = correct_alignment_position.do(query_seq,
                                                           position)
            except Exception:
                print(query_seq)
                continue

            # fetch the corresponding sequences; every scorer below gets
            # its own copy, as in the original call pattern
            sequences = [a['alignment'] for a in alignments]

            # compute frequency of amino acid at desired position
            aa = check_mutation_position.do(list(sequences), actual_pos)

            # compute simple frequency of original & mutant amino acid
            o_score, m_score = simple_frequency_score.do(
                list(aa), orig_aa, mut_aa)

            # compute score using pseudo-counts in order to account for
            # missing aa
            o_ps_score, m_ps_score = pseudo_count_score.do(
                list(aa), orig_aa, mut_aa)

            # compute simple sequence-weighted frequency score
            sequence_weights = landgraf_sequence_weights.do(list(sequences))
            o_sw_score, m_sw_score = simple_frequency_score.do(
                list(aa), orig_aa, mut_aa, sequence_weights)
            # using gap frequencies
            o_gf_score, m_gf_score = gapped_frequency_score.do(
                list(aa), orig_aa, mut_aa, sequence_weights)

            # calculate shannon entropy score w/o sequence weights
            shannon = shannon_entropy_score.do(
                list(sequences), query_seq, actual_pos, orig_aa, mut_aa)
            # shannon entropy score with sequence weights
            shannon_weighted = shannon_entropy_score.do(
                list(sequences), query_seq, actual_pos, orig_aa, mut_aa, 1)

            # calculate von-neumann entropy score
            von_neumann_score = von_neumann_entropy_score.do(
                list(aa), orig_aa, mut_aa)

            # calculate sum-of-pairs scores
            sop = sum_of_pairs_score.do(
                list(sequences), query_seq, actual_pos, orig_aa, mut_aa,
                0)  # wo seq wg
            sop_wg = sum_of_pairs_score.do(
                list(sequences), query_seq, actual_pos, orig_aa, mut_aa,
                1)  # w seq wg

            # append all scores together and write to file
            scores = [
                o_score, m_score, o_ps_score, m_ps_score, o_sw_score,
                m_sw_score, o_gf_score, m_gf_score, shannon,
                shannon_weighted, von_neumann_score, sop, sop_wg
            ]

            ofp.write(','.join(
                [str(item)
                 for item in list(chain.from_iterable([mut, scores]))]) +
                      '\n')
def do(file1, file2):
    '''Debug variant: print the frequency scores of one mutation.

    Reads the mutation table from file1 (at least five comma-separated
    fields per line: protein, position, original amino acid, mutant amino
    acid, plus one more field), then prints the eight frequency-based
    scores of the first mutation that can be scored and stops.

    file2 is accepted for interface compatibility but never opened:
    writing the scores to disk is disabled in this variant.
    '''
    # read the mutation table, keeping the first five fields of each line
    with open(file1) as handle:
        mutations = [
            [fields[idx] for idx in range(5)]
            for fields in (row.strip().split(',') for row in handle)
        ]

    for record in mutations:
        protein, raw_position, orig_aa, mut_aa = record[:4]
        position = int(raw_position) - 1

        # proteins without an alignment file cannot be scored
        alignment_file = '../alignments/%s' % protein
        if not os.path.isfile(alignment_file):
            continue

        print(record)

        # load the aligned sequences and drop duplicates
        candidates = uniquify_alignments.do(load_alignments.do(alignment_file))

        # separate the query from the remaining aligned sequences
        query, candidates = util.fetch_query_protein_in_alignments(candidates)
        if len(candidates) == 0:
            continue

        # map the mutation position onto the aligned query sequence
        try:
            column = correct_alignment_position.do(query['alignment'],
                                                   position)
        except Exception:
            print(query['alignment'])
            continue

        # amino acids observed at that column of the alignment
        residues = check_mutation_position.do(
            [entry['alignment'] for entry in candidates], column)

        scores = []

        # simple frequency of original & mutant amino acid
        orig_val, mut_val = simple_frequency_score.do(residues, orig_aa,
                                                      mut_aa)
        scores += [orig_val, mut_val]

        # pseudo-count score, to account for missing amino acids
        orig_val, mut_val = pseudo_count_score.do(residues, orig_aa, mut_aa)
        scores += [orig_val, mut_val]

        # sequence-weighted simple frequency
        weights = landgraf_sequence_weights.do(
            [entry['alignment'] for entry in candidates])
        orig_val, mut_val = simple_frequency_score.do(residues, orig_aa,
                                                      mut_aa, weights)
        scores += [orig_val, mut_val]

        # sequence-weighted frequency using gap frequencies
        orig_val, mut_val = gapped_frequency_score.do(residues, orig_aa,
                                                      mut_aa, weights)
        scores += [orig_val, mut_val]

        print(scores)

        # only the first scored mutation is reported
        return