def get_features(seq):
    from pydpi.pypro import PyPro
    p = PyPro()
    try:
        p.ReadProteinSequence(seq)
        features = list(p.GetALL().values())
        return features
    except Exception:
        return ''
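# Hedged usage sketch (the sequence below is illustrative, not from the original):
# feats = get_features("MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ")
# if feats:
#     print(len(feats))  # GetALL() yields on the order of 2000 pydpi descriptors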
Example #2
def get_paac_sequence(sequence, lamb):
    from pydpi.pypro import PyPro
    protein = PyPro()
    protein.ReadProteinSequence(sequence)
    dictionary = protein.GetPAAC(lamda=lamb)
    result = []
    for i in range(1, 20 + lamb + 1):
        result.append(dictionary['PAAC' + str(i)])
    return result
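# Hedged usage sketch (sequence and lamb are illustrative assumptions): the keys
# are named PAAC1 ... PAAC(20 + lamb), so with lamb = 4 the list holds 24 values.
# paac = get_paac_sequence("MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ", 4)
# assert len(paac) == 24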
def get_aa_frequency(sequence):
    from pydpi.pypro import PyPro
    protein = PyPro()
    protein.ReadProteinSequence(sequence)
    dictionary = protein.GetAAComp()
    AA_list = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N',
               'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
    result = []
    for aa in AA_list:
        # GetAAComp() reports percentages, so divide by 100 to get frequencies
        result.append(dictionary[aa] / 100.0)
    return result
Example #4
def find_unusual_sequence(sequence_list):
    import copy
    from pydpi.pypro import PyPro
    sequence_list_temp = copy.copy(sequence_list)
    unusual_sequence_index = []
    for i in range(len(sequence_list_temp)):
        try:
            protein = PyPro()
            protein.ReadProteinSequence(sequence_list[i][1:])
            # an AttributeError here is treated as the signal that the
            # sequence contains letters outside the 20 standard amino acids
            dictionary = protein.GetAAComp()
        except AttributeError:
            print('Sequence %d contains amino acid letters outside the standard 20' % i)
            unusual_sequence_index.append(i)
    return unusual_sequence_index
Example #5
    def get_features(self, seq, smi):
        p = PyPro()
        try:
            p.ReadProteinSequence(seq)
            features = list(p.GetALL().values())

            smi_features = self.get_smi_features(smi)
            smi_features2 = list(
                np.array([f for f in smi_features], dtype=np.float32))

            total_features = np.array(features + smi_features2)[np.newaxis, :]
            #      total_features = np.array(smi_features2+features)[np.newaxis, :] # does not work...!
            return total_features
        except Exception as e:
            print(str(e))
        return None
    def uniprot_converter(self, ps, ido, L2):
        try:
            dpi = pydpi.PyDPI()
            #         print prot
            #         ps=dpi.GetProteinSequenceFromID(prot)
            #         print ps
            dpi.ReadProteinSequence(ps)
            protein = PyPro()
            protein.ReadProteinSequence(ps)
            allp = protein.GetALL()
            allp["ID"] = ido
            L2.append(allp)

    #      print(smi)
        except Exception:
            logger.info("Unable to convert the sequence into features")
            return False
Example #7
def three_parts_representation_list(sequence_list, class_notation_index=[]):
    import copy
    from pydpi.pypro import PyPro
    sequence_list_temp = copy.copy(sequence_list)
    for i in range(len(class_notation_index) - 1, -1, -1):
        sequence_list_temp.pop(class_notation_index[i])
    print('length of sequence list temp: %d' % len(sequence_list_temp))
    result = []
    for sequence in sequence_list_temp:
        try:
            protein = PyPro()
            protein.ReadProteinSequence(sequence[1:])
            dictionary = protein.GetAAComp()
        except AttributeError:
            sequence = delete_unusual_aa(sequence)
            sequence = '>' + sequence
        temp = three_parts_representation_sequence(sequence[1:])
        result.append(temp)
    return result
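# The helpers three_parts_representation_sequence() and delete_unusual_aa() are
# not shown here; a minimal sketch of delete_unusual_aa(), assuming it simply
# strips letters outside the 20 standard amino acids from the sequence body
# (the caller re-adds the '>' prefix afterwards):
def delete_unusual_aa(sequence):
    standard = set('ACDEFGHIKLMNPQRSTVWY')
    return ''.join(aa for aa in sequence[1:] if aa in standard)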
    def descriptor_generator(self, protein_sequence):
        obj_PyPro = PyPro()
        obj_PyPro.ReadProteinSequence(protein_sequence)

        ds1 = obj_PyPro.GetAAComp()
        ds2 = obj_PyPro.GetDPComp()
        ds3 = obj_PyPro.GetPAAC(lamda=30)
        ds4 = obj_PyPro.GetCTD()
        ds5 = obj_PyPro.GetQSO()
        ds6 = obj_PyPro.GetTriad()

        ds_all = []
        # Append the descriptor dicts in order so the groups stay sequential;
        # merging with dict.update() would not preserve that ordering.
        for ds in (ds1, ds2, ds3, ds4, ds5, ds6):
            ds_all.append(ds)

        return ds_all
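# Hedged usage sketch (names are illustrative): descriptor_generator() returns a
# list of descriptor dicts in a fixed group order, so a flat, consistently
# ordered feature vector can be built by sorting the keys within each group.
# names, values = [], []
# for ds in ds_all:
#     for key in sorted(ds):
#         names.append(key)
#         values.append(ds[key])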
Example #9
def Protein_gen(data, Proteingroup):
    import numpy as np
    from pydpi.pypro import PyPro
    protein = PyPro()

    HP_list, D_list = [], []
    for ii in range(len(data)):
        p = data[ii]
        protein.ReadProteinSequence(p)
        keys, values = [], []
        for jj in Proteingroup:
            if jj == '0':  #All descriptors          2049
                res = protein.GetALL()
            elif jj == '1':  #amino acid composition   20
                res = protein.GetAAComp()
            elif jj == '2':  #dipeptide composition    400
                res = protein.GetDPComp()
            elif jj == '3':  #Tripeptide composition   8000
                res = protein.GetTPComp()
            elif jj == '4':  #Moreau-Broto autocorrelation  240
                res = protein.GetMoreauBrotoAuto()
            elif jj == '5':  #Moran autocorrelation       240
                res = protein.GetMoranAuto()
            elif jj == '6':  #Geary autocorrelation       240
                res = protein.GetGearyAuto()
            elif jj == '7':  #composition,transition,distribution  21+21+105
                res = protein.GetCTD()
            elif jj == '8':  #conjoint triad features     343
                res = protein.GetTriad()
            elif jj == '9':  #sequence order coupling number  60
                res = protein.GetSOCN(30)
            elif jj == '10':  #quasi-sequence order descriptors   100
                res = protein.GetQSO()
            elif jj == '11':  #pseudo amino acid composition   50
                res = protein.GetPAAC(30)

            keys.extend(res.keys())
            values.extend(res.values())
        if ii == 0:
            HP_list = keys  # descriptor names are identical for every sequence
        D_list.append(values)

    D_Pro = np.zeros((len(D_list), len(HP_list)), dtype=float)
    for k in range(len(D_list)):
        D_Pro[k, :] = D_list[k]

    #Variance threshold       std > 0.01
    import Descriptors_Selection as DesSe
    ind_var = DesSe.VarinceThreshold(D_Pro)
    D_Pro = D_Pro[:, ind_var]
    HP_list = np.array(HP_list)[ind_var]

    H_Pro = np.reshape(HP_list, (1, len(HP_list)))
    Array_Pro = np.append(H_Pro, D_Pro, axis=0)

    return Array_Pro
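# Descriptors_Selection is not included in these snippets; a minimal sketch of
# what its VarinceThreshold() helper presumably does (keep the columns whose
# standard deviation exceeds 0.01, per the comment above):
import numpy as np

def VarinceThreshold(X, cutoff=0.01):
    """Return the indices of columns of X whose standard deviation exceeds cutoff."""
    X = np.asarray(X, dtype=float)
    return np.where(X.std(axis=0) > cutoff)[0]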
Example #10
def describe_sequences():
    path = r"C:\Users\Patrick\OneDrive - University College Dublin\Bioinformatics\HemolyticStudies\BOTH_peptides.json"

    aa_letters = [
        'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q',
        'R', 'S', 'T', 'V', 'W', 'Y'
    ]
    di_letters = ["%s%s" % (a, b) for a in aa_letters for b in aa_letters]
    tri_letters = [
        "%s%s%s" % (a, b, c) for a in aa_letters for b in aa_letters
        for c in aa_letters
    ]
    conjoint_letters = ["A", "I", "Y", "H", "R", "D", "C"]
    letters = {
        1: aa_letters,
        2: di_letters,
        3: tri_letters,
        4: conjoint_letters
    }

    #Conjoint src = https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-015-0828-1

    conjoint_dict = {
        "A": "A",
        "G": "A",
        "V": "A",
        "I": "I",
        "L": "I",
        "F": "I",
        "P": "I",
        "Y": "Y",
        "M": "Y",
        "T": "Y",
        "S": "Y",
        "H": "H",
        "N": "H",
        "Q": "H",
        "W": "H",
        "R": "R",
        "K": "R",
        "D": "D",
        "E": "D",
        "C": "C",
    }

    def counter(string, seq_type):
        '''
        Count the letters (or k-mers) in a string.

        Returns a dict mapping each letter/k-mer to its relative frequency.
        '''
        l = len(string)
        d = {i: 0 for i in letters[seq_type]}
        if seq_type == 1:
            for s in string:
                try:
                    d[s] += 1.0
                except KeyError:
                    d[s] = 1.0
            d = {k: d[k] / l for k in d}
        if seq_type == 2:
            for a in range(l - 1):
                s = string[a:a + seq_type]
                try:
                    d[s] += 1.0
                except KeyError:
                    d[s] = 1.0
            d = {k: d[k] / (l - 1) for k in d}
        if seq_type == 3:
            for a in range(l - 2):
                s = string[a:a + seq_type]
                try:
                    d[s] += 1.0
                except KeyError:
                    d[s] = 1.0
            d = {k: d[k] / (l - 2) for k in d}
        return d

    def counter_boolean(string, seq_type):
        '''
        Record which letters (or k-mers) occur in a string.

        Returns a dict mapping each letter/k-mer to 1.0 if present, else 0.
        '''
        l = len(string)
        d = {i: 0 for i in letters[seq_type]}
        if seq_type == 1:
            for s in string:
                try:
                    d[s] = 1.0
                except KeyError:
                    d[s] = 1.0
        if seq_type == 2:
            for a in range(l - 1):
                s = string[a:a + seq_type]
                try:
                    d[s] = 1.0
                except KeyError:
                    d[s] = 1.0
        return d

    def counter_abs(string, seq_type):
        '''
        Count the letters (or k-mers) in a string.

        Returns a dict mapping each letter/k-mer to its absolute count.
        '''
        l = len(string)
        d = {i: 0 for i in letters[seq_type]}
        if seq_type == 1:
            for s in string:
                try:
                    d[s] = d[s] + 1.0
                except KeyError:
                    d[s] = 1.0
        if seq_type == 2:
            for a in range(l - 1):
                s = string[a:a + seq_type]
                try:
                    d[s] = d[s] + 1.0
                except KeyError:
                    d[s] = 1.0
        return d

    def residue_distribution(all_residues, seq_type, dp):
        '''
        Takes a string of letters and the type of sequence represented.
        Returns a 1 x n numpy array of relative frequencies, ordered alphabetically.
        '''
        d = counter(all_residues, seq_type)
        if seq_type == 1:
            residue_counts = list(
                sorted([(i, d[i]) for i in letters[seq_type]
                        ]))  ##Removes ambiguous letters
        elif seq_type == 2:
            residue_counts = list(
                sorted([(i, d[i]) for i in letters[seq_type] if dp[i] >= 50]))
        elif seq_type == 3:
            residue_counts = list(
                sorted([(i, d[i]) for i in letters[seq_type] if tp[i] >= 20]))
        elif seq_type == 4:
            residue_counts = list(
                sorted([(i, d[i]) for i in letters[seq_type]]))

        r_c = [i[1] for i in residue_counts]
        dis = np.array([
            r_c,
        ])
        return dis

    def residue_boolean(all_residues, seq_type, dp):
        '''
        Takes a string of letters and the type of sequence represented.
        Returns a 1 x n numpy array of presence indicators (1.0 or 0), ordered alphabetically.
        '''
        d = counter_boolean(all_residues, seq_type)
        if seq_type == 1:
            residue_counts = list(
                sorted([(i, d[i]) for i in letters[seq_type]
                        ]))  ##Removes ambiguous letters
        elif seq_type == 2:
            residue_counts = list(
                sorted([(i, d[i]) for i in letters[seq_type] if dp[i] >= 50]))
        r_c = [i[1] for i in residue_counts]
        dis = np.array([
            r_c,
        ])
        return dis

    def residue_abs(all_residues, seq_type, dp):
        '''
        Takes a string of letters and the type of sequence represented.
        Returns a 1 x n numpy array of absolute counts, ordered alphabetically.
        '''
        d = counter_abs(all_residues, seq_type)
        if seq_type == 1:
            residue_counts = list(
                sorted([(i, d[i]) for i in letters[seq_type]
                        ]))  ##Removes ambiguous letters
        elif seq_type == 2:
            residue_counts = list(
                sorted([(i, d[i]) for i in letters[seq_type] if dp[i] >= 50]))
        r_c = [i[1] for i in residue_counts]
        dis = np.array([
            r_c,
        ])
        return dis

    with open(path, "r") as f:
        text = f.read()

    peptides = eval(text)["Peptides"]

    train_peptides, test_peptides = train_test_split(peptides,
                                                     test_size=0.15,
                                                     random_state=42)

    train_peptides_seqs = [peptide["seq"] for peptide in train_peptides]

    for peptide in peptides:
        if peptide["seq"] in train_peptides_seqs:
            peptide["train"] = True
        else:
            peptide["train"] = False

    print(len([p for p in peptides if p["train"] == True]))
    print(len([p for p in peptides if p["train"] == False]))

    new_peptides = []
    for peptide in peptides:
        if peptide["train"] == True:
            new_peptide = peptide.copy()
            new_seq = ''.join(reversed(peptide["seq"]))
            new_peptide["seq"] = new_seq
            new_peptides.append(new_peptide)

    #peptides.extend(new_peptides)
    random.shuffle(peptides)

    print(len([p for p in peptides if p["train"] == True]))
    print(len([p for p in peptides if p["train"] == False]))
    print("doubling complete")

    dp = {i: 0 for i in letters[2]}
    tp = {i: 0 for i in letters[3]}

    name_i = 0

    for peptide in peptides:
        temp_set = set()
        seq = peptide["seq"]
        l = len(seq)
        for a in range(l - 1):
            s = seq[a:a + 2]
            temp_set.add(s)
        for s in temp_set:
            dp[s] = dp[s] + 1

    for peptide in peptides:
        temp_set = set()
        seq = peptide["seq"]
        l = len(seq)
        for a in range(l - 2):
            s = seq[a:a + 3]
            temp_set.add(s)
        for s in temp_set:
            tp[s] = tp[s] + 1

    for peptide in peptides:
        peptide["conjoint_seq"] = "".join(
            [conjoint_dict[letter] for letter in peptide["seq"]])

    for peptide in peptides:

        globdesc = GlobalDescriptor(peptide["seq"])
        globdesc.calculate_all(amide=peptide["cTer"] == "Amidation")

        ctdc = CTD.CalculateC(peptide["seq"])
        ctdc_keys = sorted(ctdc)
        ctdc_vals = np.array([[ctdc[key] for key in ctdc_keys]])

        conjointtriad = ConjointTriad.CalculateConjointTriad(peptide["seq"])
        conjointtriad_keys = sorted(conjointtriad)
        conjointtriad_vals = np.array(
            [[conjointtriad[key] for key in conjointtriad_keys]])

        conjoint_dis = residue_distribution(peptide["conjoint_seq"], 4, None)

        #peptide["GlobalDescriptor"] = globdesc

        #print(peptide["GlobalDescriptor"].descriptor)

        #Eisenberg hydrophobicity consensus
        #Take most of the values from here

        pepdesc = PeptideDescriptor(peptide["seq"], "eisenberg")
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)
        pepdesc.calculate_moment(modality="max", append=True)
        pepdesc.calculate_moment(modality="mean", append=True)
        #pepdesc.calculate_profile(append=True, prof_type = "uH")

        pepdesc.load_scale("Ez")
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)

        pepdesc.load_scale("aasi")
        pepdesc.calculate_global(append=True)
        pepdesc.calculate_moment(modality="max", append=True)
        pepdesc.calculate_moment(modality="mean", append=True)

        pepdesc.load_scale("abhprk")
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)

        pepdesc.load_scale("charge_acid")
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)
        pepdesc.calculate_moment(modality="max", append=True)
        pepdesc.calculate_moment(modality="mean", append=True)

        pepdesc.load_scale("cougar")
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)

        pepdesc.load_scale("gravy")
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)
        pepdesc.calculate_moment(modality="max", append=True)
        pepdesc.calculate_moment(modality="mean", append=True)

        pepdesc.load_scale("hopp-woods")
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)
        pepdesc.calculate_moment(modality="max", append=True)
        pepdesc.calculate_moment(modality="mean", append=True)

        pepdesc.load_scale("kytedoolittle")
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)
        pepdesc.calculate_moment(modality="max", append=True)
        pepdesc.calculate_moment(modality="mean", append=True)

        pepdesc.load_scale("ppcali")
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)

        pepdesc.load_scale("msw")
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)

        pepdesc.load_scale("charge_phys")
        pepdesc.calculate_moment(modality="max", append=True)
        pepdesc.calculate_moment(modality="mean", append=True)
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)

        pepdesc.load_scale("flexibility")
        pepdesc.calculate_moment(modality="max", append=True)
        pepdesc.calculate_moment(modality="mean", append=True)
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)

        pepdesc.load_scale("bulkiness")
        pepdesc.calculate_moment(modality="max", append=True)
        pepdesc.calculate_moment(modality="mean", append=True)
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)

        pepdesc.load_scale("TM_tend")
        pepdesc.calculate_moment(modality="max", append=True)
        pepdesc.calculate_moment(modality="mean", append=True)
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)

        pepdesc.load_scale("mss")
        pepdesc.calculate_moment(modality="max", append=True)
        pepdesc.calculate_moment(modality="mean", append=True)
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)

        pepdesc.load_scale("t_scale")
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)

        pepdesc.load_scale("peparc")
        pepdesc.calculate_arc(modality="max", append=True)
        pepdesc.calculate_arc(modality="mean", append=True)

        pepdesc.load_scale("msw")
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)

        pepdesc.load_scale("polarity")
        pepdesc.calculate_moment(modality="max", append=True)
        pepdesc.calculate_moment(modality="mean", append=True)
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)

        pepdesc.load_scale("pepcats")
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)

        pepdesc.load_scale("isaeci")
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)

        pepdesc.load_scale("refractivity")
        pepdesc.calculate_moment(modality="max", append=True)
        pepdesc.calculate_moment(modality="mean", append=True)
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)

        pepdesc.load_scale("z3")
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)

        pepdesc.load_scale("z5")
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)

        #pepdesc.load_scale("PPCALI")
        #pepdesc.calculate_autocorr(2)
        #peptide["PeptideDescriptor"] = pepdesc

        protein = PyPro()
        protein.ReadProteinSequence(peptide["seq"])
        paac = protein.GetPAAC(lamda=1, weight=0.05)
        paac2 = [[
            paac[a] for a in list(
                sorted([k for k in paac],
                       key=lambda x: int(x.replace("PAAC", ""))))
        ]]

        cTer = np.array([[1 if peptide["cTer"] == "Amidation" else 0]])

        paac = np.array(paac2)

        analysed_seq = ProteinAnalysis(peptide["seq"])
        secondary_structure_fraction = np.array(
            [analysed_seq.secondary_structure_fraction()])

        peptide["TotalDescriptor"] = str(
            np.concatenate((pepdesc.descriptor, globdesc.descriptor), axis=1))

        try:
            pepid = np.array([[
                int(peptide["id"].replace("HEMOLYTIK", "").replace(
                    "DRAMP", "").replace("DBAASP", ""))
            ]])
        except KeyError:
            pepid = np.array([[0]])  # keep the 2-D shape expected by np.concatenate below

        pep_train = np.array([[1 if peptide["train"] == True else 0]])

        freq_1d = residue_distribution(peptide["seq"], 1, dp)
        freq_2d = residue_distribution(peptide["seq"], 2, dp)
        freq_3d = residue_distribution(peptide["seq"], 3, dp)
        freq_1dbool = residue_boolean(peptide["seq"], 1, dp)
        freq_2dbool = residue_boolean(peptide["seq"], 2, dp)
        freq_1dabs = residue_abs(peptide["seq"], 1, dp)
        freq_2dabs = residue_abs(peptide["seq"], 2, dp)

        len_peptide = np.array([[len(peptide["seq"])]])

        if peptide["activity"] == "YES":
            pepact = 1
        else:
            pepact = 0
        pepact = np.array([[pepact]])

        peptide_di2 = di2(peptide["seq"])
        peptide_di3 = di3(peptide["conjoint_seq"])

        ####################### AAindex #########################
        to_get = [
            ("CHAM810101", "mean"),  #Steric Hinderance
            ("CHAM810101", "total"),  #Steric Hinderance
            ("KYTJ820101", "mean"),  #Hydropathy
            ("KLEP840101", "total"),  #Charge
            ("KLEP840101", "mean"),  #Charge
            ("MITS020101", "mean"),  #Amphiphilicity
            ("FAUJ830101", "mean"),  #Hydrophobic parameter pi
            ("GOLD730102", "total"),  #Residue volume
            ("MEEJ800101", "mean"),  #Retention coefficient in HPLC
            ("OOBM850105",
             "mean"),  #Optimized side chain interaction parameter
            ("OOBM850105",
             "total"),  #Optimized side chain interaction parameter
            ("VELV850101", "total"),  #Electron-ion interaction parameter
            ("VELV850101", "mean"),  #Electron-ion interaction parameter
            ("PUNT030102",
             "mean"),  #Knowledge-based membrane-propensity scale from 3D_Helix
            ("BHAR880101", "mean"),  #Average flexibility indeces
            ("KRIW790102", "mean"),  #Fraction of site occupied by water
            ("PLIV810101", "mean"),  #Partition coefficient
            ("ZIMJ680102", "mean"),  #Bulkiness
            ("ZIMJ680102", "total"),  #Bulkiness
            ("ZHOH040101", "mean"),  #Stability scale
            ("CHAM820102", "total"),  #Free energy solubility in water
            #From HemoPi: src = https://github.com/riteshcanfly/Hemopi/blob/master/pcCalculator.java
            ("HOPT810101", "mean"),  #Hydrophilicity 
            ("EISD840101", "mean"),  #Hydrophobicity
            ("FAUJ880109", "total"),  #Net Hydrogen
            ("EISD860101", "mean"),  #Solvation
        ]

        tetra_peptides = [
            "KLLL",  # src = https://github.com/riteshcanfly/Hemopi/blob/master/tetrapos.txt
            "GCSC",
            "AAAK",
            "KLLS",
            "LGKL",
            "VLKA",
            "LLGK",
            "LVGA",
            "LSDF",
            "SDFK",
            "SWLR",
            "WLRD",
        ]

        tp_bin = []
        for t_p in tetra_peptides:
            if t_p in peptide["seq"]:
                tp_bin.append(1)
            else:
                tp_bin.append(0)
        tp_bin = np.array([tp_bin])

        aminoacidindeces = np.array([[
            aaf(peptide["seq"], identifier, mode)
            for identifier, mode in to_get
        ]])

        peptide["array"] = np.concatenate(
            (
                pepid,
                pep_train,
                pepdesc.descriptor,
                globdesc.descriptor,
                len_peptide,
                cTer,
                secondary_structure_fraction,
                aminoacidindeces,
                ctdc_vals,
                conjointtriad_vals,
                tp_bin,
                freq_1d,
                freq_2d,
                freq_3d,
                freq_1dbool,
                freq_2dbool,
                freq_1dabs,
                freq_2dabs,
                peptide_di2,
                peptide_di3,  #Conjoint Alphabet
                paac,
                pepact,
            ),
            axis=1)
        #print(peptide["TotalDescriptor"])

    x = np.concatenate([peptide["array"] for peptide in peptides], axis=0)

    np.save("peptides_array", x, allow_pickle=False)
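# The aaf() helper used for the AAindex block above is not defined in this
# snippet; a minimal sketch, assuming a preloaded (hypothetical) lookup table
# mapping each AAindex accession to a {residue: value} scale:
def aaf(seq, accession, mode, aaindex_table=None):
    scale = aaindex_table[accession]  # hypothetical {accession: {residue: value}} table
    vals = [scale[aa] for aa in seq if aa in scale]
    return sum(vals) / len(vals) if mode == "mean" else sum(vals)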
Example #11
fig, axes = plt.subplots(4, 1)
X = df_p.iloc[:, 1:].values
y = df_p['e']
pca = PCA(n_components=2)
X_r = pca.fit(X).transform(X)
plt.figure()
for color, i, target_name in zip(['navy', 'darkorange'], [0, 1], ['0', '1']):
    plt.scatter(X_r[y == i, 0], X_r[y == i, 1], color=color, alpha=.8, lw=2,label=target_name)
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.title('PCA of hmmscan df')

# Collect physicochemical features homology -- pydpi ##
print('Collecting physicochemical stats per protein...')
with open('./physicochem_annot_training.csv', 'w') as f:
    for record in SeqIO.parse(all_fasta, "fasta"):
        protein = PyPro()
        protein.ReadProteinSequence(str(record.seq))
        desc = protein.GetGearyAuto()
        desc2 = protein.GetDPComp()
        z = desc.copy()
        z.update(desc2)
        len_p = str(len(record.seq))
        label = str(record.description).strip().split(':')[1]
        id = str(record.id)
        row = [id, label, len_p] + [str(i) for i in z.values()]
        f.write(','.join(row) + '\n')

df_desc = pd.read_table('./physicochem_annot_training.csv', header=None, sep=',')
header = list(z.keys())
df_desc.columns = ['id', 'label', 'seq_length'] + header
print('Pydpi table: {}'.format(df_desc.columns))
Example #12

##############################################################################
                      # PROTEIN FEATURES
#############################################################################

origin = pd.read_csv('domain_sequences.csv')    # file containing the protein domain sequences
origin.drop(origin.columns[0], axis=1, inplace=True)
target, ids = origin.iloc[:, 0], origin.iloc[:, 2]

dprot={}

for i in range(len(ids)):
    nome = target[i]
    ps=ids[i]
    protein=PyPro()
    protein.ReadProteinSequence(ps)
    
    temp=pypro.ProteinCheck(ps)
    aacomp=AAComposition.CalculateAAComposition(ps)   
    mor=protein.GetMoranAuto()
    moran=[mor[x] for x in mor]
    pseudo_aa=protein.GetPAAC(lamda=5, weight=0.05)
    dprot[i]=(nome,ps,aacomp,moran,pseudo_aa)
    
feat_prot=pd.DataFrame.from_dict(dprot, orient='index', columns=['target_id','ps','aacomp','moran','pseudo_aa'])    

train=pd.read_csv('features1_2.csv')
train.drop(train.columns[0], axis=1, inplace=True)

final = pd.merge(train, feat_prot, on='target_id', how='left')
    def Decriptor_generator(self, ps):

        protein = PyPro()
        protein.ReadProteinSequence(ps)
        moran = protein.GetPAAC(lamda=5, weight=0.5)  # note: despite the name, these are PAAC descriptors
        DS_1 = protein.GetAPAAC(lamda=5,weight=0.5)
        DS_2 = protein.GetCTD()
        DS_3 = protein.GetDPComp()
        DS_4 = protein.GetGearyAuto()
        DS_5 = protein.GetMoranAuto()
        DS_6 = protein.GetMoreauBrotoAuto()
        DS_7 = protein.GetQSO()
        DS_8 = protein.GetSOCN()
        DS_9 = protein.GetTPComp()

        DS_ALL = {}

        for DS in (DS_1,DS_2,DS_3,DS_4,DS_5,DS_6,DS_7,DS_8,DS_9,moran):
            DS_ALL.update(DS)
            
        return DS_ALL
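# Hedged usage sketch (object and sequence names are illustrative): DS_ALL is a
# flat {descriptor_name: value} dict, so it can become a one-row DataFrame.
# import pandas as pd
# row = pd.DataFrame(generator.Decriptor_generator(sequence), index=[0])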
Example #14
from sklearn.neural_network import MLPClassifier
from scipy import interp
from sklearn.metrics import matthews_corrcoef as mcc
from pydpi.pypro import PyPro
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# from classifier.classical_classifiers import RFClassifier, SVM
# from make_representations.sequencelist_representation import SequenceKmerRep, SequenceKmerEmbRep
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import precision_score, recall_score
import argparse
import pickle
import sys

import pandas as pd

protein = PyPro()


def readAAP(file):  # read AAP features from the AAP text file
    try:
        aapdic = {}
        with open(file, 'r') as aapdata:
            for l in aapdata:
                aapdic[l.split()[0]] = float(l.split()[1])
        return aapdic
    except Exception:
        print("Error in reading AAP feature file. Please make sure that the AAP file is correctly formatted")
        sys.exit()

def Decriptor_generator(infile, lamda, weight, maxlag, destype, out_file):

    list_pep_name = []
    with open(infile) as f:
        for line in f:
            if ">" in line:
                continue
            list_pep_name.append(line.strip('\n'))

    out_df = pd.DataFrame()

    for seq in list_pep_name:

        protein = PyPro()
        protein.ReadProteinSequence(seq)

        if destype == "GetAAComp":
            DS = protein.GetAAComp()
            df = pd.DataFrame(DS, index=[0])
        elif destype == "GetDPComp":
            DS = protein.GetDPComp()
            df = pd.DataFrame(DS, index=[0])
        elif destype == "GetTPComp":
            DS = protein.GetTPComp()
            df = pd.DataFrame(DS, index=[0])
        elif destype == "GetMoreauBrotoAuto":
            DS = protein.GetMoreauBrotoAuto()
            df = pd.DataFrame(DS, index=[0])
        elif destype == "GetMoranAuto":
            DS = protein.GetMoranAuto()
            df = pd.DataFrame(DS, index=[0])
        elif destype == "GetGearyAuto":
            DS = protein.GetGearyAuto()
            df = pd.DataFrame(DS, index=[0])
        elif destype == "GetCTD":
            DS = protein.GetCTD()
            df = pd.DataFrame(DS, index=[0])
        elif destype == "GetPAAC":
            DS = protein.GetPAAC(lamda=int(lamda), weight=float(weight))
            df = pd.DataFrame(DS, index=[0])
        elif destype == "GetAPAAC":
            DS = protein.GetAPAAC(lamda=int(lamda), weight=float(weight))
            df = pd.DataFrame(DS, index=[0])
        elif destype == "GetSOCN":
            DS = protein.GetSOCN(maxlag=int(maxlag))
            df = pd.DataFrame(DS, index=[0])
        elif destype == "GetQSO":
            DS = protein.GetQSO(maxlag=int(maxlag), weight=float(weight))
            df = pd.DataFrame(DS, index=[0])
        elif destype == "GetTriad":
            DS = protein.GetTriad()
            df = pd.DataFrame(DS, index=[0])
        elif destype == "All":
            DS1 = protein.GetAAComp()
            DS2 = protein.GetDPComp()
            DS3 = protein.GetTPComp()
            DS4 = protein.GetMoreauBrotoAuto()
            DS5 = protein.GetMoranAuto()
            DS6 = protein.GetGearyAuto()
            DS7 = protein.GetCTD()
            DS8 = protein.GetPAAC(lamda=int(lamda), weight=float(weight))
            DS9 = protein.GetAPAAC(lamda=int(lamda), weight=float(weight))
            DS10 = protein.GetSOCN(maxlag=int(maxlag))
            DS11 = protein.GetQSO(maxlag=int(maxlag), weight=float(weight))
            DS12 = protein.GetTriad()

            DS = {}

            for D in (DS1, DS2, DS3, DS4, DS5, DS6, DS7, DS8, DS9, DS10, DS11,
                      DS12):
                print(D)
                DS.update(D)
            df = pd.DataFrame(DS, index=[0])

        if destype == 'BinaryDescriptor':
            out_df = BinaryDescriptor(list_pep_name)
        else:
            out_df = pd.concat([out_df, df], axis=0)

    out_df.to_csv(out_file, index=False, sep='\t')
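# Hedged usage sketch (file names are placeholders, not from the original):
# Decriptor_generator("peptides.fasta", lamda=5, weight=0.05, maxlag=30,
#                     destype="GetPAAC", out_file="paac_features.tsv")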
Example #16
    def Decriptor_generator(self, ps):
        protein = PyPro()
        protein.ReadProteinSequence(ps)
        DS_1 = protein.GetAAComp()
        # print len(DS_1)
        #DS_2 = protein.GetDPComp()

        # print len(DS_2)
        #DS_3 = protein.GetTPComp() # takes time
        # print len(DS_3)
        DS_4 = protein.GetTriad()

        DS_5 = protein.GetPAAC(lamda=5, weight=0.5)  # takes time

        DS_6 = protein.GetAPAAC(lamda=5, weight=0.5)  # takes time

        DS_7 = protein.GetCTD()

        DS_8 = protein.GetGearyAuto()

        DS_9 = protein.GetMoranAuto()

        DS_10 = protein.GetMoreauBrotoAuto()

        DS_11 = protein.GetQSO()

        DS_12 = protein.GetSOCN()

        DS_ALL = {}

        for DS in (DS_1, DS_4, DS_5, DS_6, DS_7, DS_8, DS_9, DS_10, DS_11,
                   DS_12):
            DS_ALL.update(DS)
        # print len(DS_ALL)
        return DS_ALL
Example #17
     seq= pdbSeq2Fasta(filep)
     #filep.close()
     
     #seq = getpdb.GetSeqFromPDB(path1+'\\'+val[:4]+".pdb")
     f15=len(seq)
     #calculating amino acid composition descriptors  total 20 descriptors
     res=AAComposition.CalculateAAComposition(seq)
     k_list1=','.join(c for c in res.keys())
     v_list1=','.join(str(c) for c in res.values())
     #CTD  calculates 147 descriptors
     ctd = CTD.CalculateCTD(seq)
     k_list2=','.join(c for c in ctd.keys())
     v_list2=','.join(str(c) for c in ctd.values())
     
     #pseudo amino acid composition descriptors  total 30
     protein=PyPro()   #protein object
     protein.ReadProteinSequence(seq)
     paac = protein.GetPAAC(lamda=10,weight=0.05)
     k_list3=','.join(c for c in paac.keys())
     v_list3=','.join(str(c) for c in paac.values())
     #all protein descriptors total 2049
     allp=protein.GetALL()
     k_list4=','.join(c for c in allp.keys())
     v_list4=','.join(str(c) for c in allp.values())
 except Exception:
     print("\nerror while calculating features for:\t", f)
     continue
 #fp.write(str(key+"_"+f.replace(".pdb",""))+","+str((line[0])[:locSlash(line[0])-1])+","+str((line[1])[:locSlash(line[1])-1])+","+str((line[2])[:(locSlash(line[2])-1)])+","+str(f15)+","+str(v_list1)+","+str(v_list2)+"\n")
 fp.write(str(key+"_"+f.replace(".pdb",""))+","+str((line[0])[:locSlash(line[0])-1])+","+str((line[1])[:locSlash(line[1])-1])+","+str((line[2])[:(locSlash(line[2])-1)])+","+str(f15)+","+str(v_list1)+","+str(v_list2)+","+str(v_list3)+","+str(v_list4)+"\n")
 fileTime=time.time()-time1
 print "\ntime for file\t"+f+"is\t",fileTime