def calculate_physiochemical_features(temp_dict, sequence):
    """Add physicochemical descriptors of ``sequence`` to ``temp_dict`` in place.

    The descriptors come from ``ProteinAnalysis(sequence)``: charge at pH 7,
    instability index, molecular weight, aromaticity, the mean of the two
    molar extinction coefficients, GRAVY, isoelectric point and the three
    secondary-structure fractions, followed by whatever per-residue entries
    ``get_amino_acids_percent()`` returns.

    Args:
        temp_dict: dictionary that receives the new entries (mutated).
        sequence: amino acid sequence handed to ``ProteinAnalysis``.

    Returns:
        None; the result is delivered through ``temp_dict``.
    """
    protein = ProteinAnalysis(sequence)

    net_charge = protein.charge_at_pH(7)
    instability = protein.instability_index()
    weight = protein.molecular_weight()
    aromatic_fraction = protein.aromaticity()

    # The coefficient is reported as a pair of values; only their mean is kept.
    coeff_low, coeff_high = protein.molar_extinction_coefficient()
    mean_extinction = (float(coeff_low) + float(coeff_high)) / 2

    # Grand average of hydropathy: a higher value means more hydrophobic.
    hydropathy = protein.gravy()
    pi_value = protein.isoelectric_point()
    helix, turn, sheet = protein.secondary_structure_fraction()

    temp_dict.update({
        "Charge at pH7": net_charge,
        "Instability Index": instability,
        "Molecular Wt": weight,
        "Aromaticity": aromatic_fraction,
        "Molar Extinction Coeff": mean_extinction,
        "Gravy": hydropathy,
        "Isoelectric pt": pi_value,
        "Helix Fraction": helix,
        "Turn Fraction": turn,
        "Sheet Fraction": sheet
    })

    # get_amino_acids_percent() already yields a dictionary, so it is merged
    # in a separate step.
    temp_dict.update(protein.get_amino_acids_percent())
Esempio n. 2
0
    def _protein_parameters(self, sequence):
        """Build the vector of physicochemical descriptors for a sequence.

        Args:
            sequence: str, amino acid sequence.

        Returns:
            np array with, in order: molecular weight, aromaticity,
            instability index, GRAVY, isoelectric point, the three values of
            ``secondary_structure_fraction()``, the two values of
            ``molar_extinction_coefficient()`` and the result of
            ``self._net_charge(sequence)``.

        """
        protein = ProteinAnalysis(sequence)

        values = [
            protein.molecular_weight(),
            protein.aromaticity(),
            protein.instability_index(),
            protein.gravy(),
            protein.isoelectric_point(),
        ]

        fractions = protein.secondary_structure_fraction()
        values.extend((fractions[0], fractions[1], fractions[2]))

        extinction = protein.molar_extinction_coefficient()
        values.extend((extinction[0], extinction[1]))

        values.append(self._net_charge(sequence))

        return np.array(values)
Esempio n. 3
0
    def get_molar_extinction_coefficient(self):
        """
        Calculates the molar extinction coefficient (2 values) from biopython

        The coefficient is computed once for ``self.ProteinSequence`` and the
        resulting pair is split into its two components.

        :return: dictionary with the value for reduced cysteines
                 (``Molar_extinction_coefficient_reduced``) and for oxidized
                 cysteines, i.e. with disulfide bridges
                 (``Molar_extinction_coefficient_oxidized``)
        """
        analysed_seq = ProteinAnalysis(self.ProteinSequence)
        # One call yields both values; there is no need to evaluate it twice.
        reduced, oxidized = analysed_seq.molar_extinction_coefficient()
        return {
            'Molar_extinction_coefficient_reduced': reduced,
            'Molar_extinction_coefficient_oxidized': oxidized,
        }
def phyChemProps(seq):
    """Return ten physicochemical descriptors of ``seq`` as a list.

    The list holds, in order: aromaticity, the three values of
    ``secondary_structure_fraction()``, GRAVY, instability index,
    isoelectric point, molecular weight and the two values of
    ``molar_extinction_coefficient()``.

    Args:
        seq: amino acid sequence handed to ``ProteinAnalysis``.

    Returns:
        list of ten values in the order described above.
    """
    analysis = ProteinAnalysis(seq)

    props = [analysis.aromaticity()]
    # Each multi-valued descriptor is computed once and then indexed, instead
    # of being recomputed for every component.
    fractions = analysis.secondary_structure_fraction()
    props.extend((fractions[0], fractions[1], fractions[2]))
    props.append(analysis.gravy())
    props.append(analysis.instability_index())
    props.append(analysis.isoelectric_point())
    props.append(analysis.molecular_weight())
    extinction = analysis.molar_extinction_coefficient()
    props.extend((extinction[0], extinction[1]))
    return props
Esempio n. 5
0
def biopython_proteinanalysis_seq(seq, scaling=False):
    """Collect ``ProteinAnalysis`` descriptors of ``seq`` in a dictionary.

    Keys are added in this order: min/max/std of the flexibility profile,
    GRAVY, instability index, isoelectric point, the two molar extinction
    coefficients, molecular weight, the three secondary-structure fractions,
    one ``percent:<residue>`` entry per residue (sorted by residue) and one
    ``prop_res_<name>`` entry per item of the module-level
    ``property_residues`` mapping, holding the summed percentage of the
    residues listed for that name.

    Args:
        seq: amino acid sequence handed to ``ProteinAnalysis``.
        scaling: accepted but not used by this function.

    Returns:
        dict mapping feature name to value.
    """
    analysis = ProteinAnalysis(seq)
    flexibility = np.array(analysis.flexibility())

    features = {
        'flex:min': flexibility.min(),
        'flex:max': flexibility.max(),
        'flex:std': flexibility.std(),
        'gravy': analysis.gravy(),
        'instability_index': analysis.instability_index(),
        'isoelectric_point': analysis.isoelectric_point(),
    }

    reduced, cystines = analysis.molar_extinction_coefficient()
    features['molar_extinction_coefficient_reduced'] = reduced
    features['molar_extinction_coefficient_cysteines'] = cystines
    features['molecular_weight'] = analysis.molecular_weight()

    helix, turn, strand = analysis.secondary_structure_fraction()
    features['percent_helix_naive'] = helix
    features['percent_turn_naive'] = turn
    features['percent_strand_naive'] = strand

    composition = analysis.get_amino_acids_percent()
    for residue in sorted(composition):
        features['percent:%s' % residue] = composition[residue]

    # Residues missing from the composition contribute 0 to their group.
    for group, members in property_residues.items():
        features['prop_res_%s' % group] = sum(
            composition.get(member, 0) for member in members)
    return features
Esempio n. 6
0
    def extract(self):
        """Compute the feature matrix for the protein records in ``self.infile``.

        Every record that passes ``self.prot_check`` contributes one row of
        10409 values: 2401 tetramer frequencies over the seven-letter class
        alphabet ``SC``, 8000 tripeptide frequencies and 8 whole-sequence
        values (isoelectric point, instability index, length, aromaticity,
        the two molar extinction coefficients, GRAVY, molecular weight).
        Records rejected by ``self.prot_check`` are reported on stdout and
        skipped.  Progress goes to ``self.g_socketio`` when
        ``self.g_is_socket == 1`` and to stdout otherwise.

        Returns:
            (names, arr): ``names`` is an object array of shape (n, 1) with
            the record ids (a repeated id gets a ``_<k>`` suffix) and ``arr``
            the float array of shape (n, 10409); ``n`` is the number of
            records that were actually featurised.
        """
        AA = ["A", "C", "D", "E", "F", "G", "H", "I", "K", "L",
              "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y"]
        SC = ["1", "2", "3", "4", "5", "6", "7"]
        tri_pep = [''.join(i) for i in itertools.product(AA, repeat=3)]
        tetra_sc = [''.join(i) for i in itertools.product(SC, repeat=4)]
        # Translates every residue letter into one of the class digits in SC.
        trantab2 = str.maketrans("AILMVNQSTGPCHKRDEFWY", "11111222233455566777")

        total_fasta = self.g_total_fasta
        sec_code = 0  # index of the next free row in arr / names
        record_current = 0
        # numpy.float was removed in NumPy 1.24; it was an alias of the
        # builtin float, so the dtype is unchanged.
        arr = numpy.empty((total_fasta, 10409), dtype=float)
        names = numpy.empty((total_fasta, 1), dtype=object)
        names_dic = dict()
        for record in SeqIO.parse(self.infile, "fasta"):
            data = (record_current / total_fasta) * 100
            if self.g_is_socket == 1:
                self.g_socketio.emit('set bar', {'data': data}, room=self.g_sid)
            else:
                print('extracting features of seq ' + str(record_current+1) + ' of ' + str(total_fasta),end='\r')
            record_current += 1

            ll = len(record.seq)
            if not self.prot_check(str(record.seq)):
                print("Warning: " + record.id + " is not a valid protein sequence")
                continue

            # Keep names unique: the k-th repeat of an id is suffixed "_k".
            if record.id in names_dic:
                seq_name = record.id + '_' + str(names_dic[record.id])
                names_dic[record.id] = names_dic[record.id] + 1
            else:
                seq_name = record.id
                names_dic[record.id] = 1

            seqq = str(record.seq).upper()
            # Ambiguous letters are substituted only for ProteinAnalysis; the
            # k-mer counts below use the unsubstituted sequence.
            seqqq = seqq.replace('X','A').replace('J','L').replace('*','A').replace('Z','E').replace('B','D')
            X = ProteinAnalysis(seqqq)
            myseq = seqq.translate(trantab2)

            extinction = X.molar_extinction_coefficient()
            tt = [X.isoelectric_point(), X.instability_index(), ll, X.aromaticity(),
                  extinction[0], extinction[1],
                  X.gravy(), X.molecular_weight()]
            tt_n = numpy.asarray(tt, dtype=float)

            # Number of k-mer positions; clamped to 1 so that sequences
            # shorter than k + 1 yield zero frequencies instead of dividing
            # by zero (or by a negative number).
            tri_positions = max(ll - 2, 1)
            tetra_positions = max(ll - 3, 1)

            tri_pep_count_n = numpy.asarray(
                [seqq.count(i) / tri_positions for i in tri_pep], dtype=float)
            tetra_sc_count_n = numpy.asarray(
                [myseq.count(i) / tetra_positions for i in tetra_sc], dtype=float)

            arr[sec_code, :] = numpy.concatenate(
                (tetra_sc_count_n, tri_pep_count_n, tt_n))
            names[sec_code, 0] = seq_name
            sec_code += 1

        if self.g_is_socket == 1:
            self.g_socketio.emit('set bar', {'data': 100}, room=self.g_sid)
            self.g_socketio.emit('done features', 1, room=self.g_sid)
        print("\nDone")
        # Rows reserved for skipped records were never written and contain
        # uninitialised memory, so only the filled rows are returned.
        return (names[:sec_code], arr[:sec_code])
Esempio n. 7
0
def get_phanns_input(fasta_list, d2vmodel):
    """Build one feature vector per record of the given FASTA files.

    Each vector is the concatenation of 8000 tripeptide frequencies, 2401
    tetramer frequencies over the seven-letter class alphabet ``SC``, 8
    whole-sequence values (isoelectric point, instability index, length,
    aromaticity, the two molar extinction coefficients, GRAVY, molecular
    weight) and an embedding obtained from ``d2vmodel``.

    Args:
        fasta_list: iterable of FASTA file paths.
        d2vmodel: object exposing ``infer_vector(list_of_kmers)``
            (presumably a trained doc2vec model; confirm with the caller).

    Returns:
        list of ``(feature_vector, record)`` tuples in reading order.
    """
    AA = [
        "A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q",
        "R", "S", "T", "V", "W", "Y"
    ]
    SC = ["1", "2", "3", "4", "5", "6", "7"]
    tri_pep = [''.join(i) for i in itertools.product(AA, repeat=3)]
    tetra_sc = [''.join(i) for i in itertools.product(SC, repeat=4)]
    # Translates every residue letter into one of the class digits in SC.
    trantab2 = str.maketrans("AILMVNQSTGPCHKRDEFWY", "11111222233455566777")
    kmer_size = 3
    vectors = []
    for file in fasta_list:
        print('####################' + file)
        this_prot = 0  # per-file counter used only for progress output
        for record in SeqIO.parse(file, "fasta"):
            ll = len(record.seq)
            seqq = str(record.seq).upper()
            # Ambiguous letters are substituted for ProteinAnalysis and the
            # embedding; the k-mer counts use the unsubstituted sequence.
            seqqq = seqq.replace('X', 'A').replace('J', 'L').replace(
                '*', 'A').replace('Z', 'E').replace('B', 'D')
            X = ProteinAnalysis(seqqq)
            extinction = X.molar_extinction_coefficient()
            tt = [
                X.isoelectric_point(),
                X.instability_index(), ll,
                X.aromaticity(),
                extinction[0],
                extinction[1],
                X.gravy(),
                X.molecular_weight()
            ]
            # np.float was removed in NumPy 1.24; it was an alias of the
            # builtin float, so the dtype is unchanged.
            tt_n = np.asarray(tt, dtype=float)
            myseq = seqq.translate(trantab2)

            # Number of k-mer positions; clamped to 1 so that very short
            # sequences yield zero frequencies instead of dividing by zero.
            tri_positions = max(ll - 2, 1)
            tetra_positions = max(ll - 3, 1)

            # count tripeptides
            tri_pep_count_n = np.asarray(
                [seqq.count(i) / tri_positions for i in tri_pep], dtype=float)

            # count tetramers over the class alphabet
            tetra_sc_count_n = np.asarray(
                [myseq.count(i) / tetra_positions for i in tetra_sc],
                dtype=float)

            # get embedding vector: average of the vectors inferred for the
            # kmer_size reading frames of the sequence
            vec = d2vmodel.infer_vector([
                seqqq[k:k + kmer_size] for k in range(0, len(seqqq), kmer_size)
            ])
            for s in range(1, kmer_size):
                vec = vec + d2vmodel.infer_vector([
                    seqqq[k:k + kmer_size]
                    for k in range(s, len(seqqq), kmer_size)
                ])
            vec = vec / kmer_size

            cat_n = np.concatenate(
                (tri_pep_count_n, tetra_sc_count_n, tt_n, vec))
            vectors.append((cat_n, record))

            this_prot += 1
            if this_prot % 500 == 0:
                print("processing sequence # " + str(this_prot), end="\r")
    return vectors
def main():
    """Build a per-protein feature table from ``protein_data.csv``.

    The input CSV has no header; column 0 holds the protein sequence and
    column 1 the class label.  The function

    1. records the length of every raw sequence (and, when the sequence
       contains no ``X``/``Z``, its molecular weight),
    2. strips the letters ``B``, ``U``, ``X`` and ``Z`` from the sequences
       and writes the result to ``updated_protein_data.csv``,
    3. reads that file back and derives, per protein, the most frequent
       residue and dipeptide, the residue counts and the ``ProteinAnalysis``
       descriptors,
    4. writes the resulting table to ``features.csv``.
    """
    aa = [
        'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q',
        'R', 'S', 'T', 'V', 'W', 'Y'
    ]
    # All 400 ordered residue pairs, in the order 'AA', 'AC', ..., 'YY'.
    dipeptide = [first + second for first in aa for second in aa]

    sequences = pandas.read_csv('protein_data.csv', header=None)

    lengths = []
    weights = []
    for protein in sequences.itertuples():
        raw_sequence = str(protein[1])
        lengths.append(len(raw_sequence))  # length of protein sequence
        analyzed_protein = ProteinAnalysis(raw_sequence)
        # Sequences containing X or Z get a "?" placeholder instead of a
        # molecular weight.
        if re.search("X+|Z+", raw_sequence):
            molecular_weight = "?"
        else:
            molecular_weight = analyzed_protein.molecular_weight()
        weights.append(molecular_weight)

    # remove bad amino acids from sequences
    # The whole column is assigned at once: element-wise chained assignment
    # (sequences[0][i] = ...) does not write through under pandas
    # copy-on-write.
    sequences[0] = sequences[0].str.replace("[BUXZ]", "", regex=True)
    sequences.to_csv('updated_protein_data.csv', header=False, index=False)

    data = pandas.read_csv('updated_protein_data.csv', header=None)

    # most common amino acid and dipeptide of every cleaned sequence
    most_frequent_di = []
    most_frequent = []
    for sequence in data[0]:
        # max() returns the first candidate with the highest count.  The two
        # searches are independent, so the residue search no longer starts
        # from the best dipeptide's count and index.
        most_frequent_di.append(max(dipeptide, key=sequence.count))
        most_frequent.append(max(aa, key=sequence.count))

    # more features
    residue_counts = {residue: [] for residue in aa}
    first_aa = []
    last_aa = []
    arom = []
    ii = []
    ip = []
    mec_rc = []
    mec_db = []
    ssf_helix = []
    ssf_turn = []
    ssf_sheet = []
    gravy = []
    ph_0 = []
    ph_7 = []
    ph_14 = []
    classes = []
    for protein in data.itertuples():
        sequence = str(protein[1])
        analyzed_protein = ProteinAnalysis(sequence)

        counts = analyzed_protein.count_amino_acids()
        for residue in aa:
            residue_counts[residue].append(counts.get(residue))

        first_aa.append(sequence[0])
        last_aa.append(sequence[-1])
        arom.append(analyzed_protein.aromaticity())
        ii.append(analyzed_protein.instability_index())
        ip.append(analyzed_protein.isoelectric_point())

        # Multi-valued descriptors are computed once and then split.
        extinction = analyzed_protein.molar_extinction_coefficient()
        mec_rc.append(extinction[0])
        mec_db.append(extinction[1])
        fractions = analyzed_protein.secondary_structure_fraction()
        ssf_helix.append(fractions[0])
        ssf_turn.append(fractions[1])
        ssf_sheet.append(fractions[2])

        gravy.append(analyzed_protein.gravy())
        ph_0.append(analyzed_protein.charge_at_pH(0.0))
        ph_7.append(analyzed_protein.charge_at_pH(7.0))
        ph_14.append(analyzed_protein.charge_at_pH(14.0))
        classes.append(protein[2])

    # `weights` and `first_aa` are collected above but are currently not
    # exported as columns.
    columns = {
        "LENGTH": lengths,
        "most frequent aa": most_frequent,
        "last amino acid": last_aa,
        "most frequence dipeptide": most_frequent_di,
        "aromaticity": arom,
        "instability index": ii,
        "isolectric point": ip,
        "molecular extinction coefficient - reduced cysteines": mec_rc,
        "molecular extinction coefficient - disulfid bridges": mec_db,
        "secondary structure fraction helix": ssf_helix,
        "secondary structure fraction turn": ssf_turn,
        "secondary structure fraction sheet": ssf_sheet,
        "gravy": gravy,
        "charge at ph 0": ph_0,
        "charge at ph 7": ph_7,
        "charge at ph 14": ph_14,
    }
    columns.update(residue_counts)  # one count column per residue, 'A'..'Y'
    columns["CLASS"] = classes

    features = pandas.DataFrame(columns)
    features.to_csv('features.csv', index=False)
Esempio n. 9
0
    print('\nResults for record: {} ###'.format(record.id))
    print(X.count_amino_acids()['A'])
    print(X.count_amino_acids()['E'])
    print("A percentage :%0.2f" % X.get_amino_acids_percent()['A'])
    print("T percentage :%0.2f" % X.get_amino_acids_percent()['T'])

    print("C percentage :%0.2f" % X.get_amino_acids_percent()['C'])
    print("G percentage :%0.2f" % X.get_amino_acids_percent()['G'])

    print("%0.2f" % X.molecular_weight())
    print("%0.2f" % X.aromaticity())
    print("%0.2f" % X.instability_index())
    print("%0.2f" % X.isoelectric_point())
    sec_struc = X.secondary_structure_fraction()
    print("%0.2f" % sec_struc[0])
    epsilon_prot = X.molar_extinction_coefficient()
    print(epsilon_prot[0])
    print(epsilon_prot[1])
    composition1 = X.count_amino_acids()

print("*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*\nTCF7L2:")
# Report residue counts and composition for each record of TCF7L2.fasta.
for record in SeqIO.parse('TCF7L2.fasta', 'fasta'):
    X = ProteinAnalysis(str(record.seq))

    print('\nResults for record: {} ###'.format(record.id))

    # Absolute counts of alanine and glutamate.
    residue_counts = X.count_amino_acids()
    print(residue_counts['A'])
    print(residue_counts['E'])

    # Composition values as returned by get_amino_acids_percent().
    residue_percent = X.get_amino_acids_percent()
    print("A percentage :%0.2f" % residue_percent['A'])
    print("T percentage :%0.2f" % residue_percent['T'])

    print("C percentage :%0.2f" % residue_percent['C'])
Esempio n. 10
0
    "Gravy index", "pI", "Molar exctinction coefficient"
]
# Write one CSV row of ProteinAnalysis descriptors per FASTA record and echo
# the same values to stdout.  The csv module requires the file to be opened
# with newline="" so that the writer controls the line endings itself.
with open(csvfilename, "w", newline="") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=field_names)
    writer.writeheader()
    for inputfile in inputfiles:
        for record in SeqIO.parse(inputfile, "fasta"):
            print(record.id)
            protein = ProteinAnalysis(str(record.seq))
            # Every descriptor is computed once and reused for both the CSV
            # row and the console output.
            row = {
                "Accession number": record.id,
                "Molecular Weight": protein.molecular_weight(),
                "Aromaticity": protein.aromaticity(),
                "Instability index": protein.instability_index(),
                "Gravy index": protein.gravy(),
                "pI": protein.isoelectric_point(),
                # Second of the two values returned by
                # molar_extinction_coefficient().
                "Molar exctinction coefficient":
                protein.molar_extinction_coefficient()[1],
            }
            writer.writerow(row)
            print("%s %s %s %s %s %s" %
                  (row["Molecular Weight"], row["Aromaticity"],
                   row["Instability index"], row["Gravy index"],
                   row["pI"], row["Molar exctinction coefficient"]))
Esempio n. 11
0
    fasta = SeqIO.parse(fasta_file, "fasta")

    header = [
        "protein_id", "MW", "length", "aromaticity", "II", "GRAVY", "pI",
        "helix", "turn", "sheet", "extinction"
    ]
    out_file.write("\t".join(header) + "\n")

    for rec in fasta:
        sequence = (str(rec.seq).upper().replace("*", "").replace(
            "X", "").replace("J", "L").replace("B",
                                               "N").replace("Z", "Q").replace(
                                                   "U", "C").replace("O", "K"))
        ID = rec.id.split("|")[-1]
        length = len(sequence)
        anal = ProteinAnalysis(sequence)
        mw = anal.molecular_weight()
        aro = anal.aromaticity()
        insta = anal.instability_index()
        grev = anal.gravy()
        ie = anal.isoelectric_point()
        sec = anal.secondary_structure_fraction()
        helix = sec[0]
        turn = sec[1]
        sheet = sec[2]
        ext = anal.molar_extinction_coefficient()[0]
        row = [ID, mw, length, aro, insta, grev, ie, helix, turn, sheet, ext]
        row = [str(n) for n in row]
        out_file.write("\t".join(row) + "\n")
Esempio n. 12
0
# number of alanine residues
print('Število A:', analysis_seq.count_amino_acids()['A'])

# share of all residues that are alanine
print('Delež  A: %0.2f' % analysis_seq.get_amino_acids_percent()['A'])

# isoelectric point
print('Izoelektrična točka (pI): %0.2f' % analysis_seq.isoelectric_point())

# fraction of amino-acid residues that preferentially occur in a given secondary-structure element [helix, turn, sheet]
sec_struc = analysis_seq.secondary_structure_fraction()
print('Delež alfa-vijačnic: %0.2f' % sec_struc[0])

# extinction coefficient
excoeff_prot = analysis_seq.molar_extinction_coefficient()
# extinction coefficient with all cysteine residues reduced (free)
print("Molarni ekstinkcijski koeficient, red. [1/(M cm)]:", excoeff_prot[0])
# extinction coefficient with all cysteine residues present as cystines
print("Molarni ekstinkcijski koeficient, oks. [1/(M cm)]:", excoeff_prot[1])


# This is an analysis similar to the one performed by the ProtParam web service at https://web.expasy.org/protparam/.

# ---
# ## Sequence analysis with a sliding window
# 
# As an example we will analyse the hydrophobicity of a protein by plotting its hydrophobicity profile. Analyses of this kind usually use a sliding window of a suitable size: a parameter is averaged over the window and assigned to the amino-acid residue at the centre of that window. To make the residue at the centre of the window unambiguous, an odd number of residues is used as the window size.
# 
# Sliding-window analysis was already discussed in the lectures:
# ![Sliding-window analysis](slike/drsece_okno.png)
Esempio n. 13
0
def extract_all(fasta_list):
    """Featurise every record of every FASTA file in ``fasta_list``.

    Each record becomes one row of 11201 values: dipeptide (400) and
    tripeptide (8000) frequencies, di-/tri-/tetramer frequencies over the
    seven-letter class alphabet ``SC`` (49 + 343 + 2401) and 8
    whole-sequence values (isoelectric point, instability index, length,
    aromaticity, the two molar extinction coefficients, GRAVY, molecular
    weight).

    Args:
        fasta_list: sequence of FASTA file paths.  It is traversed twice
            (once to count the records, once to featurise them), so it must
            be re-iterable.

    Returns:
        (arr, class_arr, group_arr, id_arr, df):
        ``arr`` is the float feature matrix; ``class_arr`` and ``group_arr``
        hold the file index modulo 11 and integer-divided by 11 for each
        row; ``id_arr`` holds the running row index; ``df`` is an empty
        DataFrame with the columns ``seq_description``, ``seq_id`` and
        ``sec_code`` (it is never filled here).
    """
    d = {'seq_description': [], 'seq_id': [], "sec_code": []}
    df = pd.DataFrame(data=d)

    # First pass: count the records so the output arrays can be pre-sized.
    total_fasta = sum(
        1 for file in fasta_list for _ in SeqIO.parse(file, "fasta"))

    AA = [
        "A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q",
        "R", "S", "T", "V", "W", "Y"
    ]
    SC = ["1", "2", "3", "4", "5", "6", "7"]
    di_pep = [''.join(i) for i in itertools.product(AA, repeat=2)]
    tri_pep = [''.join(i) for i in itertools.product(AA, repeat=3)]
    di_sc = [''.join(i) for i in itertools.product(SC, repeat=2)]
    tri_sc = [''.join(i) for i in itertools.product(SC, repeat=3)]
    tetra_sc = [''.join(i) for i in itertools.product(SC, repeat=4)]
    # Translates every residue letter into one of the class digits in SC.
    trantab2 = str.maketrans("AILMVNQSTGPCHKRDEFWY", "11111222233455566777")

    # numpy.float / numpy.int were removed in NumPy 1.24; they were aliases
    # of the builtins, so the dtypes are unchanged.
    arr = numpy.empty((total_fasta, 11201), dtype=float)
    class_arr = numpy.empty((total_fasta), dtype=int)
    group_arr = numpy.empty((total_fasta), dtype=int)
    id_arr = numpy.empty((total_fasta), dtype=int)

    sec_code = 0  # running row index across all files
    prot_class = 0  # index of the file currently being read
    for file in fasta_list:
        print('####################' + file)
        this_prot = 0  # per-file counter used only for progress output
        for record in SeqIO.parse(file, "fasta"):
            ll = len(record.seq)
            seqq = str(record.seq).upper()
            # Ambiguous letters are substituted only for ProteinAnalysis; the
            # k-mer counts below use the unsubstituted sequence.
            seqqq = seqq.replace('X', 'A').replace('J', 'L').replace(
                '*', 'A').replace('Z', 'E').replace('B', 'D')
            X = ProteinAnalysis(seqqq)
            extinction = X.molar_extinction_coefficient()
            tt = [
                X.isoelectric_point(),
                X.instability_index(), ll,
                X.aromaticity(),
                extinction[0],
                extinction[1],
                X.gravy(),
                X.molecular_weight()
            ]
            tt_n = numpy.asarray(tt, dtype=float)
            myseq = seqq.translate(trantab2)

            # Number of k-mer positions for k = 2, 3, 4; clamped to 1 so that
            # very short sequences yield zero frequencies instead of
            # dividing by zero.
            di_positions = max(ll - 1, 1)
            tri_positions = max(ll - 2, 1)
            tetra_positions = max(ll - 3, 1)

            di_pep_count_n = numpy.asarray(
                [seqq.count(i) / di_positions for i in di_pep], dtype=float)
            tri_pep_count_n = numpy.asarray(
                [seqq.count(i) / tri_positions for i in tri_pep], dtype=float)
            di_sc_count_n = numpy.asarray(
                [myseq.count(i) / di_positions for i in di_sc], dtype=float)
            tri_sc_count_n = numpy.asarray(
                [myseq.count(i) / tri_positions for i in tri_sc], dtype=float)
            tetra_sc_count_n = numpy.asarray(
                [myseq.count(i) / tetra_positions for i in tetra_sc],
                dtype=float)

            arr[sec_code, :] = numpy.concatenate(
                (di_pep_count_n, tri_pep_count_n, di_sc_count_n,
                 tri_sc_count_n, tetra_sc_count_n, tt_n))
            class_arr[sec_code] = prot_class % 11
            group_arr[sec_code] = prot_class // 11
            id_arr[sec_code] = sec_code

            sec_code += 1
            this_prot += 1
            if this_prot % 500 == 0:
                print("processing sequence # " + str(this_prot), end="\r")

        prot_class += 1
    return (arr, class_arr, group_arr, id_arr, df)
Esempio n. 14
0
    def featurise(self, data):
        """
        Turn sequence records into a table of numeric features.

        For every record three ``ProteinAnalysis`` objects are built: one for
        the whole sequence, one for its first 50 characters and one for its
        last 50 characters.  Whole-sequence descriptors, mean
        ``protein_scale`` values (window 5, edge 1.0) for each of the three
        regions and the residue percentages of each region (multiplied by 5)
        are collected column by column.

        Parameters:
        -----------
        data : `list` of `Bio.SeqRecord.SeqRecord`
            The data to be featurised.

        Returns:
        -------
        featurised_data : `pandas.DataFrame`
            (num_data, features) The featurised data.
        """
        # (key into self.dicts, column names for whole / first 50 / last 50)
        scale_columns = (
            ('kd', ("hydrophobicity", "first50_hydrophobicity",
                    "last50_hydrophobicity")),
            ('flex', ("flexibility", "first50_flexibility",
                      "last50_flexibility")),
            ('hw', ("hydrophilicity", "first50_hydrophilicity",
                    "last50_hydrophilicity")),
            ('em', ("surface_accessibility", "first50_surface_accessibility",
                    "last50_surface_accessibility")),
            ('ja', ("janin", "first50_janin", "last50_janin")),
        )
        charge_columns = (("charge_at_ph1", 1), ("charge_at_ph7", 7),
                          ("charge_at_ph12", 12))

        columns = collections.defaultdict(list)

        for example in data:
            # ProteinAnalysis works on plain strings, not SeqRecord objects.
            sequence = str(example.seq)
            whole = ProteinAnalysis(sequence)
            head = ProteinAnalysis(sequence[:50])
            tail = ProteinAnalysis(sequence[-50:])

            columns["length"].append(whole.length)
            columns["molecular_weight"].append(whole.molecular_weight())
            columns["isoelectric_point"].append(whole.isoelectric_point())
            columns["aromaticity"].append(whole.aromaticity())
            columns["instability_index"].append(whole.instability_index())
            columns["gravy"].append(whole.gravy())

            reduced, oxidised = whole.molar_extinction_coefficient()
            columns["reduced"].append(reduced)
            columns["oxidised"].append(oxidised)

            helix, turn, sheet = whole.secondary_structure_fraction()
            columns["helix"].append(helix)
            columns["turn"].append(turn)
            columns["sheet"].append(sheet)

            for column, ph in charge_columns:
                columns[column].append(whole.charge_at_pH(ph))

            # Mean scale value per region: all scales for the whole sequence
            # first, then for the first 50 and finally for the last 50.
            for region_index, region in enumerate((whole, head, tail)):
                for scale_key, column_names in scale_columns:
                    profile = region.protein_scale(self.dicts[scale_key],
                                                   window=5,
                                                   edge=1.0)
                    columns[column_names[region_index]].append(
                        np.mean(profile))

            # Residue percentages, scaled by 5, for each region.
            for residue, share in whole.get_amino_acids_percent().items():
                columns[residue].append(share * 5)
            for prefix, region in (("first_50_", head), ("last_50_", tail)):
                for residue, share in region.get_amino_acids_percent().items():
                    columns[prefix + str(residue)].append(share * 5)
        return pd.DataFrame.from_dict(columns)
Esempio n. 15
0
class ProteinFeatureExtractor:
    """
    Feature extraction from protein sequence for
    Machine Learning classification or deeper analysis

    The sequence is normalized once in the constructor and wrapped in a
    ``ProteinAnalysis`` object; every ``_calculate_*`` helper reads from
    that object and ``get_features()`` collects the results in a dict
    keyed by ``FEATURE_NAMES``.

    Example usage:

        from features.extractors.proteins import ProteinFeatureExtractor

        pfe = ProteinFeatureExtractor(protein_sequence='MAKINELLRESTTTNSNSIGRPNLVALTRATTKLIYSDIVATQRTNQPVAA')
        pfe.get_features()
    """

    # Keys of the dict returned by get_features(), in output order.
    # The helpers below slice this list, so positions matter:
    #   0-6  scalar descriptors, 7-8 molar extinction coefficients,
    #   9-11 secondary structure fractions.
    FEATURE_NAMES = [
        "protein_length",
        "gravy",
        "molecular_weight",
        "aromaticity",
        "instability_index",
        "isoelectric_point",
        "flexibility",
        "mec_cysteines",
        "mec_cystines",
        "ssf_helix",
        "ssf_turn",
        "ssf_sheet",
    ]

    def __init__(self, protein_sequence: Union[str, "SeqRecord"]):
        """
        Normalize the input and build the analysis object

        protein_sequence may be a plain string or an object exposing
        the sequence through a ``seq`` attribute (see ``_normalize``)
        """

        self.protein_sequence = self._normalize(protein_sequence)

        self.protein_analysis = ProteinAnalysis(self.protein_sequence)

    @staticmethod
    def _normalize(source: Union[str, "SeqRecord"]) -> str:
        """
        Normalize each protein sequence
        to uppercase and without blank chars

        All whitespace is removed, not only leading/trailing: a sequence
        pasted from a wrapped FASTA body contains newlines that would
        otherwise be counted as residues
        """

        # If source is a string
        if isinstance(source, str):
            entry = source
        # If source is a BioPython object with seq field; objects without
        # one (e.g. a bare sequence object) are used as they are
        else:
            entry = getattr(source, "seq", source)

        # str.split() with no argument splits on any run of whitespace,
        # so joining the pieces drops spaces, tabs and line breaks alike
        return "".join(str(entry).split()).upper()

    def _get_protein_length(self) -> int:
        """
        Protein length (number of characters in the normalized sequence)
        """

        return len(self.protein_sequence)

    def _calculate_gravy(self) -> float:
        """
        GRAVY (Grand Average of Hydropathy) index score
        is calculated by adding the hydropathy value for
        each residue and then dividing by the length of
        the protein sequence

        Positive GRAVY value indicates that the protein
        is hydrophobic (non-polar) and Negative value
        indicates that the protein is hydrophilic (polar)
        """

        return self.protein_analysis.gravy()

    def _calculate_molecular_weight(self) -> float:
        """
        Molecular Weight is calculated as the sum
        of atomic masses of all atoms in the molecule
        """

        return self.protein_analysis.molecular_weight()

    def _calculate_aromaticity(self) -> float:
        """
        Aromaticity is used to describe a planar, cyclic
        molecule with a ring of resonance bonds which is
        more stable when compared to other connective or
        geometric arrangements consisting of the same set
        of atoms
        """

        return self.protein_analysis.aromaticity()

    def _calculate_instability_index(self) -> float:
        """
        Instability index gives an estimate of the stability
        of the protein in a test tube

        Any value above 40 means that the protein is unstable
        (has a short half life)
        """

        return self.protein_analysis.instability_index()

    def _calculate_isoelectric_point(self) -> float:
        """
        Isoelectric point (pI) is the pH at which net charge of
        the protein is zero. Isoelectric point is widely useful
        for choosing a buffer system for purification and
        crystallisation of a given protein
        """

        return self.protein_analysis.isoelectric_point()

    def _calculate_flexibility(self) -> float:
        """
        Flexibility is of overwhelming importance for protein function,
        because of the changes in protein structure during interactions
        with binding partners

        The returned value is the sum (a total, not an average) of the
        scores produced by ``ProteinAnalysis.flexibility()``
        """

        return sum(self.protein_analysis.flexibility())

    def _calculate_molar_extinction_coefficient(self) -> Mapping[str, float]:
        """
        Molar extinction coefficient of a protein sequence can be calculated
        from the molar extinction coefficient of amino acids which are
        Cystine, Tyrosine and Tryptophan

        Returns a dict with the two values reported by the analysis
        object, stored under "mec_cysteines" and "mec_cystines"
        """

        # NOTE(review): presumably (reduced cysteines, disulfide-bonded
        # cystines) -- confirm against the Biopython documentation
        cysteines, cystines = self.protein_analysis.molar_extinction_coefficient()

        cysteines_key, cystines_key = self.FEATURE_NAMES[7:9]

        return {cysteines_key: cysteines, cystines_key: cystines}

    def _calculate_secondary_structure_fraction(self) -> Mapping[str, float]:
        """
        This function returns a dict of the fraction of amino acids which
        tend to be in Helix, Turn or Sheet, stored under "ssf_helix",
        "ssf_turn" and "ssf_sheet"

        NOTE(review): this comment originally listed
        Turn  = Asparagine (N), Proline (P), Glycine (G), Serine (S)
        Sheet = Glutamic acid (E), Methionine (M), Alanine (A), Leucine (L)
        The residue sets are defined by the analysis library and may differ
        between releases -- confirm against the installed version
        """

        helix, turn, sheet = self.protein_analysis.secondary_structure_fraction()

        helix_key, turn_key, sheet_key = self.FEATURE_NAMES[9:12]

        return {helix_key: helix, turn_key: turn, sheet_key: sheet}

    def get_features(self) -> Mapping[str, Union[int, float, None]]:
        """
        Return full feature space for single protein as Python dict

        Keys are exactly the entries of ``FEATURE_NAMES``, in that order
        """

        # Values for FEATURE_NAMES[0:7]; zip() stops at the shorter
        # iterable, so only the first seven names are consumed here
        scalar_values = (
            self._get_protein_length(),
            self._calculate_gravy(),
            self._calculate_molecular_weight(),
            self._calculate_aromaticity(),
            self._calculate_instability_index(),
            self._calculate_isoelectric_point(),
            self._calculate_flexibility(),
        )

        features = dict(zip(self.FEATURE_NAMES, scalar_values))

        features.update(self._calculate_molar_extinction_coefficient())

        features.update(self._calculate_secondary_structure_fraction())

        return features