Example #1
from typing import Any

import freesasa
from Bio.PDB.Structure import Structure
from Bio.PDB.vectors import calc_dihedral


def featurize(structure: Structure) -> list[Any]:
    """
    Calculates 3D ML features from the `structure`.
    """
    # NOTE: pdbpath (the path of the PDB file `structure` was parsed from) is assumed
    # to be defined in the enclosing scope.
    structure1 = freesasa.Structure(pdbpath)
    result = freesasa.calc(structure1)
    area_classes = freesasa.classifyResults(result, structure1)

    Total_area = []
    Total_area.append(result.totalArea())

    Polar_Apolar = []

    for key in area_classes:
        # print( key, ": %.2f A2" % area_classes[key])
        Polar_Apolar.append(area_classes[key])
    # get all the residues
    residues = [res for res in structure.get_residues()]
    seq_length = []
    seq_length.append(len(residues))
    # calculate some random 3D features (you should be smarter here!)
    protein_length = residues[1]["CA"] - residues[-2]["CA"]
    angle = calc_dihedral(
        residues[1]["CA"].get_vector(),
        residues[2]["CA"].get_vector(),
        residues[-3]["CA"].get_vector(),
        residues[-2]["CA"].get_vector(),
    )
    # create the feature vector
    features = [Total_area, Polar_Apolar, protein_length, seq_length, angle]

    return features
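# Usage sketch (not part of the original snippet; the file name is an assumption).
# featurize() reads the PDB file again through freesasa, so `pdbpath` has to point to
# the same file the Bio.PDB structure was parsed from.
from Bio.PDB import PDBParser

pdbpath = "example.pdb"  # assumed file name
structure = PDBParser(QUIET=True).get_structure("example", pdbpath)
print(featurize(structure))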
def get_area_classes(file):
    struct = freesasa.Structure(file)
    result = freesasa.calc(struct)
    area_classes = freesasa.classifyResults(result, struct)
    list_areas = [(list(area_classes.values())[0]),
                  (list(area_classes.values())[1]),
                  result.totalArea()]
    return list_areas
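# A keyed variant (not in the original listing): freesasa.classifyResults returns a dict,
# so the 'Polar' and 'Apolar' areas can be read by name (as calculate_SAS below does)
# instead of relying on the order of .values().
def get_area_classes_by_key(file):
    struct = freesasa.Structure(file)
    result = freesasa.calc(struct)
    area_classes = freesasa.classifyResults(result, struct)
    return [area_classes['Polar'], area_classes['Apolar'], result.totalArea()]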
def calculate_SAS(temp_dict, pdb_path, seq_len):
    struct = freesasa.Structure(str(pdb_path))
    result = freesasa.calc(struct)
    area_classes = freesasa.classifyResults(result, struct)
    polar = area_classes['Polar']
    apolar = area_classes['Apolar']
    sasa_fraction = (polar + apolar) / seq_len
    temp_dict.update({
        "Polar": polar,
        "Apolar": apolar,
        "SASA Fraction": sasa_fraction
    })
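# Usage sketch (file name and sequence length are assumptions): calculate_SAS mutates
# the dict that is passed in instead of returning a value.
features = {}
calculate_SAS(features, "example.pdb", seq_len=154)
print(features)  # {'Polar': ..., 'Apolar': ..., 'SASA Fraction': ...}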
import argparse
import zipfile

import freesasa
import temppathlib

parser = argparse.ArgumentParser()
parser.add_argument("--infile", type=str, default="data/test.zip")
parser.add_argument("--model", type=str, default="model.pkl")
args = parser.parse_args()

#protein_parser = PDBParser()

polar_area = []
apolar_area = []
total_area = []

with temppathlib.TemporaryDirectory() as tmpdir:
    # unzip the file with all the test PDBs
    with zipfile.ZipFile(args.infile, "r") as zip_:
        zip_.extractall(tmpdir.path)
        for test_pdb in tmpdir.path.glob("*.pdb"):
            struct = freesasa.Structure(str(test_pdb))
            result = freesasa.calc(struct)
            areas_classes = freesasa.classifyResults(result, struct)
            list_areas = [(list(areas_classes.values())[0]),
                          (list(areas_classes.values())[1]),
                          result.totalArea()]

            polar_area.append(list_areas[0])
            apolar_area.append(list_areas[1])
            total_area.append(list_areas[2])

print('done')
with temppathlib.TemporaryDirectory() as tmpdir:
    # unzip the file with all the test PDBs
    with zipfile.ZipFile(args.infile, "r") as zip_:
        zip_.extractall(tmpdir.path)

        for test_pdb in tmpdir.path.glob("*.pdb"):
            ...  # truncated in the original listing
Example #5
import math

import freesasa
from tkinter import Tk
from Bio.PDB import PDBParser, PPBuilder


def openfile():
    global prob, probab, te
    global my_seq
    global anti
    global structure, structure_id, filename
    global antigenicity, hydro, flex, sec
    global m, a, c, b, length, j, k
    global hydroph, flexi, access
    anti = []
    sec = []
    probab = []
    from tkinter import filedialog
    root = Tk()
    root.filename = filedialog.askopenfilename(
        initialdir="/",
        title="Select file",
        filetypes=(("pdb files", "*.pdb"), ("pdb files", "*.pdb")))
    filename = root.filename
    print(filename)
    structure_id = "1e6j"
    structure = PDBParser().get_structure(structure_id, root.filename)
    ppb = PPBuilder()
    for pp in ppb.build_peptides(structure):
        my_seq = pp.get_sequence()  # type: Seq
        print(my_seq)
    for model in structure:
        for chain in model:
            print(chain)
    sequence = list(my_seq)
    m = ''.join(sequence)
    print(m)
    length = len(m)  # type: int
    print("Sequence consist of", length, "Amino Acids")
    from Bio.SeqUtils.ProtParam import ProteinAnalysis
    analysed_seq = ProteinAnalysis(m)
    print("Molecular weight = ", analysed_seq.molecular_weight())
    print("Amino Acid Count = ", analysed_seq.count_amino_acids())
    print("Secondary structure fraction =",
          analysed_seq.secondary_structure_fraction())
    kd = {
        'A': 1.8,
        'R': -4.5,
        'N': -3.5,
        'D': -3.5,
        'C': 2.5,
        'Q': -3.5,
        'E': -3.5,
        'G': -0.4,
        'H': -3.2,
        'I': 4.5,
        'L': 3.8,
        'K': -3.9,
        'M': 1.9,
        'F': 2.8,
        'P': -1.6,
        'S': -0.8,
        'T': -0.7,
        'W': -0.9,
        'Y': -1.3,
        'V': 4.2
    }
    c = list(analysed_seq.flexibility())
    b = list(analysed_seq.protein_scale(kd, 10, 1.0))
    hydro = list(analysed_seq.protein_scale(kd, 10, 1.0))
    flex = list(analysed_seq.flexibility())
    hydroph = list(analysed_seq.protein_scale(kd, 10, 1.0))
    flexi = list(analysed_seq.flexibility())

    # slide a 10-residue window along the sequence
    i = 1
    j = -1  # type: int
    k = 9
    while i <= (length - 10):
        print("Sequence is = ", m[j + 1:k + 1])
        print("Flexibility value = ", c[j + 1])
        print("Hydrophilicity value = ", b[j + 1])
        ana_seq = ''.join(m[j + 1:k + 1])
        analyze_seq = ProteinAnalysis(ana_seq)
        # For Secondary structure Analysis
        print("Secondary structure fraction =",
              analyze_seq.secondary_structure_fraction())
        a = list(analyze_seq.secondary_structure_fraction())
        a = a[0]
        sec.append(a)
        i += 1
        j += 1
        k += 1
    f = length
    r = 1
    y = 10
    global acc, logacc
    acc = []
    for i in range(0, f):
        str1 = "accessibility, resi "
        str2 = str(r) + "-" + str(y)
        saving = str1 + str2
        print(saving)
        r = r + 1
        y = y + 1
        structure = freesasa.Structure("1e6j.pdb")
        resulta = freesasa.calc(structure)
        area_classes = freesasa.classifyResults(resulta, structure)
        print("Total : %.2f A2" % resulta.totalArea())
        for key in area_classes:
            print(key, ": %.2f A2" % area_classes[key])
        resulta = freesasa.calc(
            structure,
            freesasa.Parameters({
                'algorithm': freesasa.LeeRichards,
                'n-slices': 10
            }))
        selections = freesasa.selectArea(('alanine, resn ala', saving),
                                         structure, resulta)
        for key in selections:
            print(key, ": %.2f A2" % selections[key])
            a = selections[key]
            acc.append(a)

    # selectArea returns two areas per window ('alanine ...' and the residue-range
    # selection); keep every other entry so there is one value per window
    l = acc[0::2]
    access = l
    print(acc)
    print(l)
    logacc = [math.log(y, 10) for y in l]

    print(logacc)
def ClassifySASA(self, SASA, SASAStruct):
    area_classes = freesasa.classifyResults(SASA, SASAStruct)
    return area_classes
def calculate_sasa(pdbfile, chain, multichain=True, relative_type='sidechain'):
    """
    Calculate per-residue SASA with freesasa.

    :param pdbfile: String of PDB file name.
    :param chain: String or List of chain identifiers.
    :param multichain: Boolean. True to separate chains, which allows SASA calculation for a
        single unattached monomer. False to calculate SASA for the structure 'as-is'.
    :param relative_type: 'sidechain' to normalise by the side-chain max ASA; otherwise the
        total residue area is used.
    :return: Pandas DataFrame with residue number, type, and SASA values as columns.
    """
    import freesasa as fs
    import numpy as np
    import pandas as pd
    dict_max_acc = {
        # Miller max acc: Miller et al. 1987 https://doi.org/10.1016/0022-2836(87)90038-6
        # Wilke: Tien et al. 2013 https://doi.org/10.1371/journal.pone.0080635
        # Sander: Sander & Rost 1994 https://doi.org/10.1002/prot.340200303
        "Miller": {
            "ALA": 113.0,
            "ARG": 241.0,
            "ASN": 158.0,
            "ASP": 151.0,
            "CYS": 140.0,
            "GLN": 189.0,
            "GLU": 183.0,
            "GLY": 85.0,
            "HIS": 194.0,
            "ILE": 182.0,
            "LEU": 180.0,
            "LYS": 211.0,
            "MET": 204.0,
            "PHE": 218.0,
            "PRO": 143.0,
            "SER": 122.0,
            "THR": 146.0,
            "TRP": 259.0,
            "TYR": 229.0,
            "VAL": 160.0,
        },
        "Wilke": {
            "ALA": 129.0,
            "ARG": 274.0,
            "ASN": 195.0,
            "ASP": 193.0,
            "CYS": 167.0,
            "GLN": 225.0,
            "GLU": 223.0,
            "GLY": 104.0,
            "HIS": 224.0,
            "ILE": 197.0,
            "LEU": 201.0,
            "LYS": 236.0,
            "MET": 224.0,
            "PHE": 240.0,
            "PRO": 159.0,
            "SER": 155.0,
            "THR": 172.0,
            "TRP": 285.0,
            "TYR": 263.0,
            "VAL": 174.0,
            "MSE": 224.0,
            "SEC": 167.0,
        },
        "Sander": {
            "ALA": 106.0,
            "ARG": 248.0,
            "ASN": 157.0,
            "ASP": 163.0,
            "CYS": 135.0,
            "GLN": 198.0,
            "GLU": 194.0,
            "GLY": 84.0,
            "HIS": 184.0,
            "ILE": 169.0,
            "LEU": 164.0,
            "LYS": 205.0,
            "MET": 188.0,
            "PHE": 197.0,
            "PRO": 136.0,
            "SER": 130.0,
            "THR": 142.0,
            "TRP": 227.0,
            "TYR": 222.0,
            "VAL": 142.0,
        },
    }
    theoreticalMaxASA = dict_max_acc["Wilke"]

    # Calculates SASA for unseparated chains.
    if not multichain:
        structure = fs.Structure(pdbfile)
    else:
        # Separate chains if multichain structure. This allows SASA calculation for a single unattached monomer.
        structures = fs.structureArray(pdbfile, options={"separate-chains": True})
        chains = []
        for c in range(len(structures)):
            chains.append(structures[c].chainLabel(1))
        structure = structures[chains.index(chain)]
        print("using {} separating chains {}".format(chains.index(chain), chains))

    print("Number of atoms of {}: {}".format(pdbfile, structure.nAtoms()))
    result = fs.calc(structure, fs.Parameters({'algorithm': fs.ShrakeRupley, 'n-points': 10000}))
    res = result.residueAreas()
    residue = []
    resnum = []
    total = []
    apolar = []
    mainchain = []
    sidechain = []
    ratio = []

    for idx, v in res[chain].items():
        residue.append(v.residueType)
        resnum.append(v.residueNumber)
        total.append(v.total)
        apolar.append(v.apolar)
        mainchain.append(v.mainChain)
        sidechain.append(v.sideChain)
        if v.residueType == 'GLY':
            ratio.append(100 * v.mainChain / theoreticalMaxASA[v.residueType])
        elif v.residueType not in theoreticalMaxASA:
            # Unknown residue type: compute the relative SASA against every theoretical
            # max SASA and use the average of those values
            possibleSASA = []
            for maxSASA in theoreticalMaxASA.values():
                possibleSASA.append(100 * v.sideChain / maxSASA)
            ratio.append(np.average(possibleSASA))
        else:
            if relative_type == 'sidechain':
                ratio.append(100 * v.sideChain / theoreticalMaxASA[v.residueType])
            else:
                ratio.append(100 * v.total / theoreticalMaxASA[v.residueType])

        # if v.hasRelativeAreas:
        #     ratio.append(v.relativeSideChain)
        # else:
        #     ratio.append(np.nan)

    df_sasa = pd.DataFrame({'Residue': residue, 'Residue_num': resnum, 'Chain': chain, 'Total': total, 'Apolar': apolar,
                            'Backbone': mainchain, 'Sidechain': sidechain, 'Ratio': ratio})
    area_class = fs.classifyResults(result, structure)
    print("Total : %.2f A2" % result.totalArea())
    for key in area_class:
        print(key, ": %.2f A2" % area_class[key])

    return df_sasa
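# Usage sketch (file name and chain ID are assumptions): per-residue SASA for chain A of
# a multichain structure, with side-chain areas normalised by the Wilke theoretical max ASA.
df_chain_a = calculate_sasa("example.pdb", chain="A", multichain=True, relative_type='sidechain')
print(df_chain_a[['Residue', 'Residue_num', 'Total', 'Ratio']].head())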
Example #8
import itertools
import os
import re

import freesasa
from biopandas.pdb import PandasPdb

# NOTE: ang() is a helper from the source project (assumed available here) that computes
# angles from the collected coordinates.


def main(args):
    try:
        os.makedirs(args.output_dir)
    except OSError:
        pass

    with open(f'../data/filtered_pdb_ID/filtered_{args.halide}.txt', 'r') as f:
        base_list = [line.rstrip() for line in f.readlines()]
        base_list = [i for i in base_list if i != '']
        print(base_list)

        # PDB identifiers sit on every third line of the filtered list (starting at index 2)
        for i in range(2, len(base_list), 3):
            if args.input_type == 'pdb_id':
                struct = PandasPdb().fetch_pdb(f'{base_list[i]}')
                model_name = args.input
                structure = freesasa.Structure(f'{base_list[i]}')
            elif args.input_type == 'structure':
                struct = PandasPdb()
                struct = struct.read_pdb(f'{args.input}/pdb{base_list[i].lower()}.ent')
                print(f'{args.input}/pdb{base_list[i].lower()}.ent')
                model_name = re.search(r'[\d\w]+$', struct.header).group()
                structure = freesasa.Structure(f'{args.input}/pdb{base_list[i].lower()}.ent')

            try:
                resolution = float(re.search(r"REMARK\s+2\s+RESOLUTION\.\s+([\d\(.)]+)\s\w+",
                                             struct.pdb_text).group(1))
            except Exception:
                resolution = 100
            print(resolution)

            result = freesasa.calc(structure)
            classArea = freesasa.classifyResults(result, structure)
            print(result.totalArea())

            halide_atoms = struct.df['HETATM'][struct.df['HETATM']['atom_name'] == args.halide]
            modern_df = struct.df['ATOM']  # make the subset of protein atoms
            dict_of_subsets = {}

            for atom_row in halide_atoms.values:
                global Coordinate
                # Coordinate[0] = halide coordinates, Coordinate[1] = the nearest atom's coordinates
                Coordinate = []
                dist = struct.distance(xyz=tuple(atom_row[11:14]), records=('ATOM',))
                modern_df['dist'] = dist  # add the distance to the halide to the subset

                if args.C == 1:
                    # all halide neighbours within the cut-off radius
                    modern_subset = modern_df[modern_df.dist < args.angstrem_radius]
                elif args.C == 2:
                    modern_df1 = modern_df[modern_df.dist < args.angstrem_radius]
                    modern_subset = modern_df1.loc[~modern_df1['element_symbol'].isin(['C', 'H'])]
                elif args.C == 3:
                    modern_df1 = modern_df[modern_df.dist < args.angstrem_radius]
                    modern_subset = modern_df1.loc[~modern_df1['element_symbol'].isin(['C'])]
                elif args.C == 4:
                    modern_df1 = modern_df[modern_df.dist < args.angstrem_radius]
                    atoms = list(map("".join, itertools.permutations('BDEGHZ', 1)))
                    atoms1 = list(map("".join, itertools.permutations('BDEGHZ123', 2)))
                    atoms2 = atoms + atoms1
                    atom2 = ['C' + atom for atom in atoms2] + ['O', 'C']
                    modern_subset = modern_df1.loc[~modern_df1['atom_name'].isin(atom2)]

                xyz = atom_row[11:14]  # halide coordinate
                Coordinate += [xyz]  # add coordinates
                try:
                    # define the nearest atom
                    nearest = modern_subset.loc[modern_subset['dist'] == min(modern_subset['dist'])].values[0][[3, 5, 11, 12, 13, 20, 21]]
                    Coordinate += [nearest[2:5]]  # add the nearest atom's coordinates
                except Exception:
                    continue
                for n in modern_subset.values:
                    Coordinate += [n[11:14]]  # add coordinates

                modern_subset1 = modern_subset.copy(deep=True)
                modern_subset1['angles'] = ang(Coordinate)  # add angles to the subset
                # modern_subset1 = modern_subset1.loc[modern_subset1['angles'] != 0]  # drop rows with angles == 0
                # {nearest[0]}:{nearest[1]}:{"%.3f" % nearest[6]}:{np.nan}
                dict_of_subsets[f'{model_name}:{"%.3f" % result.totalArea()}:{resolution}:{atom_row[3]}:{nearest[5]}'] = \
                    [f'{j[3]}:{j[5]}:{"%.3f" % j[21]}:{"%.3f" % j[22]}' for j in modern_subset1.values]

            def write_output(sfx):
                with open(f'{args.output_dir}/{args.output_file_name}_{sfx}.tsv', 'a') as w:
                    for k, v in dict_of_subsets.items():
                        w.write(f'{k}\t')
                        w.write(','.join(v))
                        w.write('\n')

            if resolution <= 1.5:
                write_output('HIGH')
            elif 1.5 < resolution < 2.5:
                write_output('MODERATE')
            else:
                write_output('LOW')

    path = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'pdb_one_halide.txt')
    os.remove(path)
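# The listing never shows how `args` is built; a minimal sketch of a parser that provides
# exactly the attributes main() reads (types, choices, and defaults are assumptions):
import argparse

if __name__ == '__main__':
    cli = argparse.ArgumentParser()
    cli.add_argument('--halide', type=str)
    cli.add_argument('--input_type', choices=['pdb_id', 'structure'])
    cli.add_argument('--input', type=str)
    cli.add_argument('--C', type=int, choices=[1, 2, 3, 4])
    cli.add_argument('--angstrem_radius', type=float)
    cli.add_argument('--output_dir', type=str)
    cli.add_argument('--output_file_name', type=str)
    main(cli.parse_args())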