def get_features(seq):
    """Return every PyPro descriptor value for a protein sequence.

    Args:
        seq: Protein sequence handed to ``PyPro.ReadProteinSequence``.

    Returns:
        A list with the values of ``PyPro.GetALL()`` on success, or the
        empty string ``''`` when reading the sequence or computing the
        descriptors raises. The ``''`` sentinel is kept so existing callers
        that test for it keep working.
    """
    p = PyPro()
    try:
        p.ReadProteinSequence(seq)
        return list(p.GetALL().values())
    except Exception:
        # Only ordinary errors are swallowed; KeyboardInterrupt and
        # SystemExit are allowed to propagate instead of being hidden.
        return ''
Ejemplo n.º 2
0
def Protein_gen(data, Proteingroup):
    """Build a labelled protein-descriptor table for a list of sequences.

    For each sequence in ``data`` the descriptor groups selected by
    ``Proteingroup`` are computed with ``PyPro`` and concatenated, in the
    order given, into one feature row.  Columns are then filtered with
    ``Descriptors_Selection.VarinceThreshold``.

    Args:
        data: Sequence of protein sequences accepted by
            ``PyPro.ReadProteinSequence``.
        Proteingroup: Iterable of descriptor-group codes, each one of the
            strings ``'0'`` .. ``'11'`` (see the table in the body).

    Returns:
        A 2-D array produced by ``np.append``: the first row holds the names
        of the retained descriptors (taken from the first sequence), and each
        following row holds the retained descriptor values of one sequence.
        NOTE(review): names and values share one array, so numpy coerces
        them to a common dtype -- confirm what downstream code expects.

    Raises:
        ValueError: If ``Proteingroup`` contains a code that is not in the
            table.  Previously such a code either raised ``NameError`` or
            silently re-used the previous group's result.
    """
    import numpy as np
    from pydpi.pypro import PyPro
    import Descriptors_Selection as DesSe

    # group code -> (PyPro method name, positional arguments).
    # Descriptor counts are the ones noted by the original author.
    group_calls = {
        '0': ('GetALL', ()),              # all descriptors, 2049
        '1': ('GetAAComp', ()),           # amino acid composition, 20
        '2': ('GetDPComp', ()),           # dipeptide composition, 400
        '3': ('GetTPComp', ()),           # tripeptide composition, 8000
        '4': ('GetMoreauBrotoAuto', ()),  # Moreau-Broto autocorrelation, 240
        '5': ('GetMoranAuto', ()),        # Moran autocorrelation, 240
        '6': ('GetGearyAuto', ()),        # Geary autocorrelation, 240
        '7': ('GetCTD', ()),              # composition/transition/distribution, 21+21+105
        '8': ('GetTriad', ()),            # conjoint triad features, 343
        '9': ('GetSOCN', (30,)),          # sequence order coupling number, 60
        '10': ('GetQSO', ()),             # quasi-sequence order descriptors, 100
        '11': ('GetPAAC', (30,)),         # pseudo amino acid composition, 50
    }

    protein = PyPro()

    HP_list, D_list = [], []
    for sequence in data:
        protein.ReadProteinSequence(sequence)
        keys, values = [], []
        for code in Proteingroup:
            if code not in group_calls:
                raise ValueError(
                    "Unknown protein descriptor group: %r" % (code,))
            method_name, args = group_calls[code]
            # Methods are looked up lazily so only the selected groups
            # need to exist on the PyPro object.
            res = getattr(protein, method_name)(*args)
            keys.extend(res.keys())
            values.extend(res.values())
        if not D_list:
            # Column names come from the first sequence only.
            HP_list = keys
        D_list.append(values)

    # Assigning row by row makes a row of the wrong length fail loudly.
    D_Pro = np.zeros((len(D_list), len(HP_list)), dtype=float)
    for k, row in enumerate(D_list):
        D_Pro[k, :] = row

    # Variance threshold (original note: std > 0.01); the actual criterion
    # lives in Descriptors_Selection.VarinceThreshold.
    ind_var = DesSe.VarinceThreshold(D_Pro)
    D_Pro = D_Pro[:, ind_var]
    HP_list = np.array(HP_list)[ind_var]

    # Put the header row on top of the value rows.
    H_Pro = np.reshape(HP_list, (1, len(HP_list)))
    Array_Pro = np.append(H_Pro, D_Pro, axis=0)

    return Array_Pro
Ejemplo n.º 3
0
    def get_features(self, seq, smi):
        """Combine protein descriptors and SMILES features into one row.

        The protein part is every value of ``PyPro.GetALL()`` for ``seq``;
        the compound part is ``self.get_smi_features(smi)`` converted to
        ``float32``.  Both are concatenated (protein first) and returned as
        an array with a leading axis of length one.

        Returns ``None`` after printing the error message if any step inside
        the guarded section raises.
        """
        descriptor_calc = PyPro()
        try:
            descriptor_calc.ReadProteinSequence(seq)
            protein_part = list(descriptor_calc.GetALL().values())

            raw_smi = self.get_smi_features(smi)
            smi_part = list(np.array(list(raw_smi), dtype=np.float32))

            # Protein descriptors go first: the original author noted that
            # the reverse concatenation order "does not work".
            combined = np.array(protein_part + smi_part)
            return combined[np.newaxis, :]
        except Exception as err:
            print(str(err))
            return None
Ejemplo n.º 4
0
    def uniprot_converter(self, ps, ido, L2):
        """Compute all PyPro descriptors for a sequence and append them to ``L2``.

        Args:
            ps: Protein sequence to convert.
            ido: Identifier stored under the ``"ID"`` key of the descriptor
                dict.
            L2: List that receives the descriptor dict; it is mutated in
                place.

        Returns:
            ``False`` if any step of the conversion raises (a message is
            logged at INFO level); otherwise ``None`` implicitly, which is
            kept as-is so existing truthiness checks are unaffected.
        """
        try:
            # The PyDPI object is not used afterwards; the call is kept
            # because it can raise for a sequence PyDPI rejects.
            # NOTE(review): confirm whether this extra read is still needed.
            dpi = pydpi.PyDPI()
            dpi.ReadProteinSequence(ps)

            protein = PyPro()
            protein.ReadProteinSequence(ps)
            allp = protein.GetALL()
            allp["ID"] = ido
            L2.append(allp)
        except Exception:
            # Only ordinary errors are treated as a failed conversion;
            # KeyboardInterrupt and SystemExit propagate.
            logger.info("Unable to convert the sequence into features")
            return False
Ejemplo n.º 5
0
            res=AAComposition.CalculateAAComposition(seq)
            k_list1=','.join(c for c in res.keys())
            v_list1=','.join(str(c) for c in res.values())
            #CTD  calculates 147 descriptors
            ctd = CTD.CalculateCTD(seq)
            k_list2=','.join(c for c in ctd.keys())
            v_list2=','.join(str(c) for c in ctd.values())
            
            #pseudo amino acid composition descriptors  total 30
            protein=PyPro()   #protein object
            protein.ReadProteinSequence(seq)
            paac = protein.GetPAAC(lamda=10,weight=0.05)
            k_list3=','.join(c for c in paac.keys())
            v_list3=','.join(str(c) for c in paac.values())
            #all protein descriptors total 2049
            allp=protein.GetALL()
            k_list4=','.join(c for c in allp.keys())
            v_list4=','.join(str(c) for c in allp.values())
        except:
            print "\nerror while calculation of features in:\t",f
            continue
        #fp.write(str(key+"_"+f.replace(".pdb",""))+","+str((line[0])[:locSlash(line[0])-1])+","+str((line[1])[:locSlash(line[1])-1])+","+str((line[2])[:(locSlash(line[2])-1)])+","+str(f15)+","+str(v_list1)+","+str(v_list2)+"\n")
        fp.write(str(key+"_"+f.replace(".pdb",""))+","+str((line[0])[:locSlash(line[0])-1])+","+str((line[1])[:locSlash(line[1])-1])+","+str((line[2])[:(locSlash(line[2])-1)])+","+str(f15)+","+str(v_list1)+","+str(v_list2)+","+str(v_list3)+","+str(v_list4)+"\n")
        fileTime=time.time()-time1
        print "\ntime for file\t"+f+"is\t",fileTime
    folderTime=time.time()-time1
    print "\ncompletion time for folder\t"+key+"is\t",folderTime
fp.close()


print "done\n"