Beispiel #1
0
def do(sequences, query_seq, position, pattern, changed_pattern, SW=0):
    '''Return the change in the column score when the query is mutated.

    The column at `position` is scored twice with `calculate_sop`: once
    with `query_seq` added to `sequences`, and once with a copy of the query
    in which `changed_pattern` has been substituted at `position` (via
    `util.strsub`). The result is `mutant score - original score`.

    Args:
    sequences: aligned sequences, not including the query. This list is
        never modified; scoring is done on private copies.
    query_seq: aligned query sequence.
    position: column index inside the alignment.
    pattern: original residue; accepted for interface compatibility but not
        used by the computation.
    changed_pattern: replacement residue for the query at `position`.
    SW: when truthy, sequences are weighted with
        `landgraf_sequence_weights.do`; otherwise every sequence gets
        weight 1.

    Return Values:
    Score of the mutated column minus score of the original column.

    '''

    def column_score(query):
        # Build a fresh list for every evaluation so that the caller's
        # `sequences` is left untouched, even if a scoring step raises.
        candidates = list(sequences)
        candidates.append(query)

        patterns = check_mutation_position.do(candidates, position)
        if SW:
            sw = landgraf_sequence_weights.do(candidates)
        else:
            sw = list(np.ones(len(candidates)))
        return calculate_sop(patterns, sw)

    # case 1: for original amino acid
    original_score = column_score(query_seq)

    # case 2: for mutant amino acid
    new_query_seq = util.strsub(query_seq, position, changed_pattern)
    mutant_score = column_score(new_query_seq)

    return mutant_score - original_score
Beispiel #2
0
def main():
    '''Run the score on one hard-coded example mutation and print it.

    Loads the alignment stored in '../alignments/Q92481', orders the entries
    by descending 'match_percentage', removes duplicates, locates the query
    entry, and scores the substitution to 'R' at query position 72.
    '''
    import load_alignments
    import uniquify_alignments
    import check_mutation_position
    import correct_alignment_position

    filename = '../alignments/Q92481'
    position = 72
    changed_pattern = 'R'

    # Ascending sort followed by a reversal (not `reverse=True`), so that
    # entries with equal match percentage keep the same relative order as
    # before.
    by_match = sorted(load_alignments.do(filename),
                      key=itemgetter('match_percentage'))
    proteins = uniquify_alignments.do(by_match[::-1])

    query, _ = util.fetch_query_protein_in_alignments(proteins)
    query_alignment = query['alignment']

    # translate the query position into an alignment column
    mod_position = correct_alignment_position.do(query_alignment, position)
    pattern = query_alignment[mod_position]

    aligned = [prot['alignment'] for prot in proteins]
    patterns = check_mutation_position.do(aligned, mod_position)

    print(do(patterns, pattern, changed_pattern))
Beispiel #3
0
def do(sequences):
    '''Return one weight per sequence, averaged over all alignment columns.

    For every column the residues are fetched with
    `check_mutation_position.do`. A sequence showing '-' or 'X' in that
    column gets weight 0 there; any other residue gets
    1 / (len(util.unique(column)) * freq[residue]), where `freq` comes from
    `util.calc_frequency`. 'B' is looked up as 'N' and 'Z' as 'Q'.

    Args:
    sequences: aligned sequences; the first one defines the number of
        columns.

    Return Values:
    numpy array with one entry per sequence: the mean of its per-column
    weights.

    '''
    n_columns = len(sequences[0])  # length of aligned sequences
    n_sequences = len(sequences)  # number of sequences

    ambiguity_codes = {'B': 'N', 'Z': 'Q'}

    per_column = []  # one list of n_sequences weights for each column
    for col in range(n_columns):
        aa = check_mutation_position.do(sequences, col)
        freq = util.calc_frequency(aa)
        uniq_aa = util.unique(aa)

        column_weights = []
        for row in range(n_sequences):
            if aa[row] in ('-', 'X'):
                column_weights.append(0)
                continue

            # NOTE(review): `freq` and `uniq_aa` were built before this
            # substitution, so the lookup below relies on 'N' / 'Q' being
            # present in `freq` -- confirm against util.calc_frequency.
            if aa[row] in ambiguity_codes:
                aa[row] = ambiguity_codes[aa[row]]
            column_weights.append(1.0 / (len(uniq_aa) * freq[aa[row]]))

        per_column.append(column_weights)

    # average the per-column weights over all positions
    avg_weight = np.zeros(n_sequences)
    for column_weights in per_column:
        avg_weight += np.array(column_weights)
    avg_weight *= 1.0 / n_columns

    return avg_weight
Beispiel #4
0
        if len(alignments) == 0:
            continue

        # fetch actual position of mutation in aligned query sequence
        try:
            actual_pos = correct_alignment_position.do(query['alignment'],
                                                       position)
        except Exception as e:
            print(query['alignment'])
            continue

        # fetch the corresponding sequences
        sequences = [a['alignment'] for a in alignments]

        # compute frequency of amino acid at desired position
        aa = check_mutation_position.do([k['alignment'] for k in alignments],
                                        actual_pos)

        # compute mean value in the column
        mean = {}
        cnt = 0
        for key in values.keys():  # for each physco property
            first = 1
            for z in aa:
                if first:
                    if z != '-':
                        mean[key] = values[key][z]
                        cnt += 1
                        first = 0
                    else:
                        continue
                else:
Beispiel #5
0
def work(line):
    '''Compute all scores for one mutation and return them with its fields.

    Args:
    line: comma-separated mutation record,
        <PROTEIN_ID, POSITION, ORIGINAL_AMINO_ACID, MUTANT_AMINO_ACID>.
        POSITION is 1-based. The original residue is taken from the
        protein's FASTA sequence, not from the record.

    Return Values:
    Flat list of strings: the fields of `line` followed by, for every
    directory in `align_dirs`, `len(alignments) + 1` and five scores
    (Shannon entropy without and with sequence weights, von Neumann
    entropy, sum-of-pairs without and with sequence weights).
    An empty list is returned as soon as one directory has no alignment
    file for the protein.

    '''

    parts = line.strip().split(',')

    protein = parts[0]
    position = int(parts[1]) - 1  # correction for indexing python lists
    mut_aa = parts[3]

    result = []
    # The FASTA sequence does not depend on the alignment directory, so it
    # is read at most once, and only after an alignment file was found.
    fasta = None

    print(protein)

    for alignment_dir in align_dirs:

        alignment_file = os.path.join(alignment_dir, protein)
        if not os.path.isfile(alignment_file):
            # NOTE(review): this discards scores already collected from
            # earlier directories -- confirm that this is intended.
            return []

        if fasta is None:
            fasta = util.read_sequence('../fasta/%s.fasta' % protein)

        proteins = load_alignments.do(alignment_file)

        # sort & prune the list of proteins
        proteins = prune_proteins_list(proteins)

        # fetch the record pertaining to the query protein
        query, _ = util.fetch_query_protein_in_alignments(proteins)
        query_alignment = query['alignment']

        alignments = [prot["alignment"] for prot in proteins]

        result.append(len(alignments) + 1)

        orig_aa = fasta[position]
        # correct the position of query protein wrt to alignment
        mod_pos = correct_alignment_position.do(query_alignment, position)
        aa = check_mutation_position.do(alignments, mod_pos)

        # Each scorer receives its own copy of the list: the scorers may
        # modify the list they are given.

        # calculate shannon entropy score w/o sequence weights
        result.append(
            shannon_entropy_score.do(list(alignments), query_alignment,
                                     mod_pos, orig_aa, mut_aa))
        # shannon entropy score with sequence weights
        result.append(
            shannon_entropy_score.do(list(alignments), query_alignment,
                                     mod_pos, orig_aa, mut_aa, 1))

        # calculate von-neumann entropy score
        result.append(von_neumann_entropy_score.do(list(aa), orig_aa, mut_aa))

        # calculate sum-of-pairs scores
        result.append(
            sum_of_pairs_score.do(list(alignments), query_alignment,
                                  mod_pos, orig_aa, mut_aa, 0))  # wo seq wg
        result.append(
            sum_of_pairs_score.do(list(alignments), query_alignment,
                                  mod_pos, orig_aa, mut_aa, 1))  # w seq wg

    # return mutation information and scores for recording in a file
    return [str(item) for item in chain(parts, result)]
def do(file1, file2):
    '''Score every mutation listed in `file1` and write the rows to `file2`.

    Args:
    file1: path of a comma-separated file; the first five fields of each
        line are kept, of which the first four are read as
        <PROTEIN_ID, POSITION (1-based), ORIGINAL_AMINO_ACID,
        MUTANT_AMINO_ACID>.
    file2: path of the output file (overwritten). Each written line holds
        the five input fields followed by thirteen scores.

    Mutations are skipped when '../alignments/<PROTEIN_ID>' does not exist,
    when no sequence other than the query is left after pruning, or when
    the mutation position cannot be mapped onto the aligned query.

    '''
    # fetch all mutations
    mutations = []
    with open(file1) as fp:
        for line in fp:
            parts = line.strip().split(',')
            mutations.append(
                [parts[0], parts[1], parts[2], parts[3], parts[4]])

    # The context manager closes the output file even when one of the
    # scoring steps raises, so rows written so far are not lost.
    with open(file2, 'w') as ofp:
        for mut in mutations:
            # fetch reqd info
            protein = mut[0]
            position = int(mut[1]) - 1
            orig_aa = mut[2]
            mut_aa = mut[3]

            # if protein not aligned - pass this
            alignment_file = '../alignments/%s' % protein
            if not os.path.isfile(alignment_file):
                continue

            print(mut)

            # load the aligned sequences
            alignments = load_alignments.do(alignment_file)

            # prune the proteins
            alignments = util.prune_proteins_list(alignments)

            # fetch query sequence
            query, alignments = util.fetch_query_protein_in_alignments(
                alignments)
            if len(alignments) == 0:
                continue
            query_alignment = query['alignment']

            # fetch actual position of mutation in aligned query sequence
            try:
                actual_pos = correct_alignment_position.do(query_alignment,
                                                           position)
            except Exception:
                # best effort: show the offending alignment and move on
                print(query_alignment)
                continue

            # fetch the corresponding sequences (built once, reused below)
            sequences = [a['alignment'] for a in alignments]

            # residues found at the desired position
            aa = check_mutation_position.do(sequences, actual_pos)

            # compute simple frequency of original & mutant amino acid
            o_score, m_score = simple_frequency_score.do(
                list(aa), orig_aa, mut_aa)

            # compute score using pseudo-counts in order to account for
            # missing aa
            o_ps_score, m_ps_score = pseudo_count_score.do(
                list(aa), orig_aa, mut_aa)

            # compute simple sequence-weighted frequency score
            sequence_weights = landgraf_sequence_weights.do(sequences)
            o_sw_score, m_sw_score = simple_frequency_score.do(
                list(aa), orig_aa, mut_aa, sequence_weights)
            # using gap frequencies
            o_gf_score, m_gf_score = gapped_frequency_score.do(
                list(aa), orig_aa, mut_aa, sequence_weights)

            # The scorers below get their own copy of `sequences` because
            # they may modify the list they are given.

            # calculate shannon entropy score w/o sequence weights
            shannon = shannon_entropy_score.do(
                list(sequences), query_alignment, actual_pos, orig_aa,
                mut_aa)
            # shannon entropy score with sequence weights
            shannon_weighted = shannon_entropy_score.do(
                list(sequences), query_alignment, actual_pos, orig_aa,
                mut_aa, 1)

            # calculate von-neumann entropy score
            von_neumann_score = von_neumann_entropy_score.do(
                list(aa), orig_aa, mut_aa)

            # calculate sum-of-pairs scores
            sop = sum_of_pairs_score.do(
                list(sequences), query_alignment, actual_pos, orig_aa,
                mut_aa, 0)  # wo seq wg
            sop_wg = sum_of_pairs_score.do(
                list(sequences), query_alignment, actual_pos, orig_aa,
                mut_aa, 1)  # w seq wg

            # append all scores together and write to file
            scores = [
                o_score, m_score, o_ps_score, m_ps_score, o_sw_score,
                m_sw_score, o_gf_score, m_gf_score, shannon,
                shannon_weighted, von_neumann_score, sop, sop_wg
            ]

            ofp.write(
                ','.join([str(item) for item in chain(mut, scores)]) + '\n')
Beispiel #7
0
def do(file1, file2, winsize):
    '''Write gapped-frequency scores in a window around each mutation.

    Args:
    file1: path of a comma-separated file whose lines start with
        <PROTEIN_ID, POSITION (1-based), ORIGINAL_AMINO_ACID,
        MUTANT_AMINO_ACID>.
    file2: path of the output file (overwritten).
    winsize: number of neighbouring positions scored on each side of the
        mutation.

    For every mutation the output line holds the input fields followed by
    the mutant score at the mutation position and then, for w = 1..winsize,
    the original-residue score of the left and of the right neighbour at
    distance w (the residue being the one found in the FASTA sequence).

    A mutation is skipped (no line written) when its alignment file is
    missing, when the sequence weights cannot be computed, when the window
    would start before the first residue, or when any score in the window
    fails to compute.

    '''
    mutations = []
    with open(file1) as fp:
        for line in fp:
            mutations.append(line.strip().split(','))

    with open(file2, 'w') as ofp:
        for mut in mutations:

            print(mut)

            protein = mut[0]
            position = int(mut[1]) - 1
            orig_aa = mut[2]
            mut_aa = mut[3]

            fasta_seq = util.read_sequence('../fasta/%s.fasta' % protein)

            alfile = '../alignments/%s' % protein
            if not os.path.isfile(alfile):
                continue

            proteins = load_alignments.do(alfile)
            proteins = util.prune_proteins_list(proteins)
            query, proteins = util.fetch_query_protein_in_alignments(proteins)
            query_seq = query['alignment']

            alignments = [a['alignment'] for a in proteins]
            try:
                sequence_weights = landgraf_sequence_weights.do(alignments)
            except Exception as e:
                # best effort: report the problem and skip this mutation
                print(str(e))
                continue

            # A negative index would not fail: Python would silently read
            # from the END of the sequence. Skip such mutations, exactly as
            # a window running past the last residue is skipped below.
            if position - max(winsize, 0) < 0:
                continue

            scores = []
            try:
                # 0,1,2,3 for winsize=3, hence the +1
                for w in range(winsize + 1):
                    if w == 0:
                        _, m = _gapped_scores_at(
                            query_seq, alignments, sequence_weights,
                            position, orig_aa, mut_aa)
                        scores.append(m)
                    else:
                        # left neighbour first, then right neighbour
                        for neighbour in (position - w, position + w):
                            o, _ = _gapped_scores_at(
                                query_seq, alignments, sequence_weights,
                                neighbour, fasta_seq[neighbour], mut_aa)
                            scores.append(o)
            except Exception:
                # any failure inside the window drops the whole mutation
                continue

            ofp.write(
                ','.join([str(item) for item in chain(mut, scores)]) + '\n')


def _gapped_scores_at(query_seq, alignments, sequence_weights, pos, residue,
                      mut_aa):
    '''Return gapped_frequency_score.do's (original, mutant) pair at `pos`.'''
    mod_pos = correct_alignment_position.do(query_seq, pos)
    aa = check_mutation_position.do(alignments, mod_pos)
    return gapped_frequency_score.do(list(aa), residue, mut_aa,
                                     sequence_weights)
def do(file1, file2):
    '''Print the frequency-based scores of the first scorable mutation.

    Debug variant: nothing is written to `file2`, and the loop stops after
    the first mutation whose scores were printed.

    Args:
    file1: path of a comma-separated file; the first five fields of each
        line are kept, of which the first four are read as
        <PROTEIN_ID, POSITION (1-based), ORIGINAL_AMINO_ACID,
        MUTANT_AMINO_ACID>.
    file2: output path; currently unused.

    '''
    # fetch all mutations
    with open(file1) as fp:
        rows = [line.strip().split(',') for line in fp]
    mutations = [[r[0], r[1], r[2], r[3], r[4]] for r in rows]

    for mut in mutations:
        protein, position = mut[0], int(mut[1]) - 1
        orig_aa, mut_aa = mut[2], mut[3]

        # proteins without an alignment file are passed over
        alignment_file = '../alignments/%s' % protein
        if not os.path.isfile(alignment_file):
            continue

        print(mut)

        # load the aligned sequences and drop duplicates
        alignments = uniquify_alignments.do(
            load_alignments.do(alignment_file))

        # split off the query sequence
        query, alignments = util.fetch_query_protein_in_alignments(alignments)
        if len(alignments) == 0:
            continue

        # map the mutation position onto the aligned query sequence
        try:
            actual_pos = correct_alignment_position.do(query['alignment'],
                                                       position)
        except Exception as e:
            print(query['alignment'])
            continue

        # residues found at the desired position
        column_input = [k['alignment'] for k in alignments]
        aa = check_mutation_position.do(column_input, actual_pos)

        # plain frequency of original & mutant amino acid
        o_score, m_score = simple_frequency_score.do(aa, orig_aa, mut_aa)

        # pseudo-count score, to account for missing amino acids
        o_ps_score, m_ps_score = pseudo_count_score.do(aa, orig_aa, mut_aa)

        # sequence-weighted frequency score
        weight_input = [a['alignment'] for a in alignments]
        sequence_weights = landgraf_sequence_weights.do(weight_input)
        o_sw_score, m_sw_score = simple_frequency_score.do(
            aa, orig_aa, mut_aa, sequence_weights)

        # sequence-weighted score using gap frequencies
        o_gf_score, m_gf_score = gapped_frequency_score.do(
            aa, orig_aa, mut_aa, sequence_weights)

        # show all scores together
        print([
            o_score, m_score, o_ps_score, m_ps_score, o_sw_score, m_sw_score,
            o_gf_score, m_gf_score
        ])

        # only the first scorable mutation is processed
        break