Ejemplo n.º 1
0
def read_PDB_files(pdbids, directory="", show_info=True):
    """Parse PDB structures, downloading the ones that are not cached locally.

    For every id the function looks inside ``directory`` (resolved against
    the current working directory and created if missing) for
    ``pdb<id>.ent`` and then ``<id>.pdb``, with ``<id>`` lower-cased. When
    neither file exists the entry is fetched in PDB format through
    ``pdbl().retrieve_pdb_file`` into that same directory.

    Args:
        pdbids: iterable of PDB id strings.
        directory: folder used both as the local cache and as the download
            target; relative values are resolved against ``Path.cwd()``.
        show_info: currently ignored. The summary it used to control is
            disabled; the parameter is kept so existing callers keep working.

    Returns:
        dict mapping each id (exactly as given) to the structure returned by
        ``PDBParser.get_structure``.
    """
    path = Path.cwd() / directory
    path.mkdir(parents=True, exist_ok=True)

    pdb_structures = {}

    for pdb_id in pdbids:
        code = pdb_id.lower()
        candidates = (path / f"pdb{code}.ent", path / f"{code}.pdb")
        f_name = next((str(c) for c in candidates if c.exists()), None)

        if f_name is None:
            # Download into the resolved cache directory. Passing the raw
            # ``directory`` breaks for the default "" (an empty path is not a
            # usable target folder) and could diverge from the folder that
            # was just searched.
            f_name = pdbl().retrieve_pdb_file(
                pdb_code=pdb_id, pdir=str(path), file_format="pdb")

        pdb_structures[pdb_id] = PDBParser(QUIET=True).get_structure(
            pdb_id, f_name)

    return pdb_structures
Ejemplo n.º 2
0
def build_matrix(
        path: str,
        filename: str,
        truncate_log: Union[tqdm.tqdm, None] = None) -> BuildMatrixDict:
    """Encode one protein structure as ten fixed-width feature rows.

    Every residue of every chain in the first model occupies one column, in
    iteration order. A standard amino acid fills its column with the
    residue code, the rounded mean of its atom coordinates and the
    properties looked up in ``amino_acid``; any other residue leaves its
    column at 0. Processing stops at the first amino acid that would fall
    beyond the last column.

    Args:
        path: path of the pdb file.
        filename: name of the file (without extension); used as the
            structure id and in the truncation message.
        truncate_log: optional tqdm bar whose description is updated when
            the protein does not fit into the matrix.

    Returns:
        Build matrix dictionary: each of the ten rows is stored as a
        single-row pyarrow array under ``col_name[0]`` ... ``col_name[9]``.
    """
    PROTEIN_SEQ_MAX_LEN = 4000
    protein_matrix = [[0] * PROTEIN_SEQ_MAX_LEN for _ in range(10)]

    structure = PDBParser().get_structure(filename, path)
    first_model = list(structure.get_models())[0]
    chains = list(first_model.get_chains())

    col = 0

    try:
        for chain in chains:
            for residue in list(chain.get_residues()):
                resname = residue.get_resname()

                if Polypeptide.is_aa(resname, standard=True):
                    vectors = [atom.get_vector()
                               for atom in residue.get_atoms()]

                    # Residue position: rounded mean of the atom coordinates
                    # along each axis.
                    centroid = [
                        round(mean([vec[axis] for vec in vectors]))
                        for axis in range(3)
                    ]

                    # Properties are looked up by one-letter code.
                    aa = amino_acid[Polypeptide.three_to_one(resname)]

                    column_values = (
                        aa["code"],
                        centroid[0],
                        centroid[1],
                        centroid[2],
                        aa["hydropathy"],
                        aa["hydropathy_index"],
                        aa["acidity_basicity"],
                        aa["mass"],
                        aa["isoelectric_point"],
                        aa["charge"],
                    )
                    # The first write raises IndexError once col runs past
                    # the matrix width, which ends the whole traversal.
                    for row, value in zip(protein_matrix, column_values):
                        row[col] = value

                # Non amino acid residues still consume a column, which
                # keeps its initial 0.
                col += 1

    except IndexError:
        if truncate_log is not None:
            truncate_log.set_description_str(
                f"Protein {filename} is truncated.")

    # Dict layout chosen so it can be loaded into a vaex dataframe.
    result: BuildMatrixDict = {
        "seq": [[]],
        "x_pos": [[]],
        "y_pos": [[]],
        "z_pos": [[]],
        "hydropathy": [[]],
        "hydropathy_index": [[]],
        "acidity_basicity": [[]],
        "mass": [[]],
        "isoelectric_point": [[]],
        "charge": [[]],
    }

    for idx, row in enumerate(protein_matrix):
        result[col_name[idx]] = pyarrow.array([row])

    return result