def build(  # type: ignore[override]
    cls,
    structure_file: Union[Path, str],
    protein_sequence: str,
    ligand_sequence: Optional[str],
    remove_hetatms=True,
) -> ProteinSolverData:
    import proteinsolver

    structure = PDB.load(structure_file)
    pdata = extract_seq_and_adj(
        structure,
        [0] if ligand_sequence is None else [0, 1],
        remove_hetatms=remove_hetatms,
    )
    expected_sequence = protein_sequence + (ligand_sequence or "")
    if remove_hetatms:
        expected_sequence = expected_sequence.replace("X", "")
    if pdata.sequence != expected_sequence:
        raise ProteinSolverBuildError(
            f"Parsed sequence does not match provided sequence "
            f"({pdata.sequence} != {protein_sequence} + {ligand_sequence})."
        )
    data = proteinsolver.datasets.protein.row_to_data(pdata)
    data = proteinsolver.datasets.protein.transform_edge_attr(data)
    return data
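# --- Hedged usage sketch (not part of the original module) ---
# How the classmethod above might be called, assuming it lives on the ProteinSolver wrapper
# class (as in the ELASPIC2 build method further below) and that the placeholder structure
# file contains exactly the chains whose sequences are given:
#
#     data = ProteinSolver.build(
#         structure_file="1abc.pdb",     # hypothetical file; chain 0 = protein, chain 1 = ligand
#         protein_sequence="MKT...",
#         ligand_sequence=None,          # supply the second chain's sequence for interfaces
#     )
#     # `data` is the graph data object produced by proteinsolver's row_to_data /
#     # transform_edge_attr pipeline.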
def extract_protein_info(mutation_info: MutationInfo) -> Dict:
    structure_file = Path(mutation_info.structure_file)
    if not structure_file.is_file():
        raise EL2Error(f"Could not find structure for mutation: {mutation_info}.")
    if config.DATA_DIR not in structure_file.as_posix():
        raise EL2Error(
            f"Structure file is not available remotely for mutation: {mutation_info}."
        )

    @contextmanager
    def disable_logger(logger, level=logging.WARNING):
        try:
            logger.setLevel(level)
            yield
        finally:
            logger.setLevel(logging.NOTSET)

    with disable_logger(logging.getLogger("kmbio.PDB.core.atom")):
        structure = PDB.load(structure_file)

    protein_sequence, ligand_sequence = _extract_chain_sequences(
        structure, mutation_info.chain_id, mutation_info.coi
    )
    mutation = map_mutation_to_chain(
        structure, mutation_info.chain_id, mutation_info.mutation
    )

    if protein_sequence is None:
        raise EL2Error(
            f"Could not extract protein sequence for mutation: {mutation_info}."
        )
    if mutation_info.coi == COI.INTERFACE and ligand_sequence is None:
        raise EL2Error(
            f"Could not extract ligand sequence for mutation: {mutation_info}."
        )
    if protein_sequence[int(mutation[1:-1]) - 1] != mutation[0]:
        raise EL2Error(
            f"Mutation does not match extracted protein sequence: {mutation_info}."
        )

    structure_file_url = urljoin(
        config.SITE_URL,
        Path(config.SITE_DATA_DIR)
        .joinpath(structure_file.relative_to(config.DATA_DIR))
        .as_posix(),
    )
    result = {
        **{
            "protein_structure_url": structure_file_url,
            "protein_sequence": protein_sequence,
            "mutations": mutation,
        },
        **({"ligand_sequence": ligand_sequence} if ligand_sequence is not None else {}),
    }
    return result
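# --- Illustrative sketch (not part of the original module) ---
# The validation above indexes the extracted sequence with the 1-based position embedded in
# the mutation string (e.g. "G45A" = wild-type G at position 45, mutated to A). A minimal,
# self-contained version of that check, using a made-up sequence:

def _mutation_matches_sequence(sequence: str, mutation: str) -> bool:
    wt, pos = mutation[0], int(mutation[1:-1])
    return sequence[pos - 1] == wt

assert _mutation_matches_sequence("MKTG", "G4A")      # G really is the 4th residue
assert not _mutation_matches_sequence("MKTG", "A4G")  # wild-type residue does not match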
def get_interaction_dataset_wdistances(structure_file, model_id, chain_id, r_cutoff=12):
    structure = PDB.load(structure_file)
    chain = structure[0][chain_id]
    num_residues = len(list(chain.residues))
    dd = structure_tools.DomainDef(model_id, chain_id, 1, num_residues)
    domain = structure_tools.extract_domain(structure, [dd])
    distances_core = structure_tools.get_distances(
        domain.to_dataframe(), r_cutoff, groupby="residue"
    )
    assert (distances_core["residue_idx_1"] <= distances_core["residue_idx_2"]).all()
    return domain, distances_core
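# --- Hedged usage sketch (not part of the original module) ---
# How the helper above might be called, assuming a structure file whose first model contains a
# chain "A" (the file name is a placeholder):
#
#     domain, distances = get_interaction_dataset_wdistances("1abc.pdb", model_id=0, chain_id="A")
#     # `domain` is the extracted chain as a structure object; `distances` is a dataframe of
#     # residue pairs within the 12 Å cutoff, with residue_idx_1 <= residue_idx_2.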
def build(
    self,
    structure_file: Union[Path, str],
    protein_sequence: str,
    ligand_sequence: Optional[str],
    remove_hetatms=True,
) -> ELASPIC2Data:
    structure = PDB.load(structure_file)
    protein_domain_def, ligand_domain_def = guess_domain_defs(
        structure, protein_sequence, ligand_sequence, remove_hetatms=remove_hetatms
    )
    if protein_domain_def is None or (
        ligand_sequence is not None and ligand_domain_def is None
    ):
        raise ValueError(
            "Could not find protein and / or ligand sequence in the provided structure file."
        )
    domain_defs = (
        [protein_domain_def]
        if ligand_sequence is None
        else [protein_domain_def, ligand_domain_def]
    )
    structure_new = structure_tools.extract_domain(
        structure, domain_defs, remove_hetatms=remove_hetatms
    )

    with tempfile.NamedTemporaryFile(suffix=".pdb") as pdb_file_obj:
        PDB.save(structure_new, pdb_file_obj.name)
        protbert_data = ProtBert.build(protein_sequence, ligand_sequence, remove_hetatms)
        proteinsolver_data = ProteinSolver.build(
            pdb_file_obj.name, protein_sequence, ligand_sequence, remove_hetatms
        )

    data = ELASPIC2Data(ligand_sequence is not None, protbert_data, proteinsolver_data)
    return data
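# --- Hedged usage sketch (not part of the original module) ---
# The method above ties the two feature extractors together. A rough illustration, assuming an
# ELASPIC2 instance `el2` and a placeholder structure file containing the protein chain followed
# by an optional ligand chain:
#
#     data = el2.build(
#         structure_file="complex.pdb",     # hypothetical file
#         protein_sequence="MKT...",        # chain whose mutations will be scored
#         ligand_sequence="GSH...",         # or None for core (single-chain) mutations
#     )
#     # The first ELASPIC2Data field records whether a ligand chain was supplied; the ProtBert
#     # and ProteinSolver payloads are built from the same extracted domain(s).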
def get_adjacency_with_distances_and_orientations(
    row, max_cutoff=12, min_cutoff=None, structure_url_prefix="rcsb://"
):
    """Extract residue and residue-pair features (distances and orientations) for a template domain."""
    missing_attributes = [
        attr
        for attr in GET_ADJACENCY_WITH_DISTANCES_ROW_ATTRIBUTES
        if not hasattr(row, attr)
    ]
    assert not missing_attributes, missing_attributes

    # === Parse input structure ===

    # Load structure
    url = f"{structure_url_prefix}{row.structure_id.lower()}.cif.gz"
    structure = PDB.load(url)

    # Template sequence
    chain_sequence = structure_tools.get_chain_sequence(
        structure[row.model_id][row.chain_id], if_unknown="replace"
    )
    template_sequence = chain_sequence[int(row.s_start - 1) : int(row.s_end)]
    assert len(template_sequence) == len(row.a2b)

    # Target sequence
    target_sequence = row.sequence[int(row.q_start - 1) : int(row.q_end)]
    assert len(target_sequence) == len(row.b2a)

    # Extract domain
    dd = structure_tools.DomainDef(row.model_id, row.chain_id, int(row.s_start), int(row.s_end))
    domain = structure_tools.extract_domain(structure, [dd])
    assert template_sequence == structure_tools.get_chain_sequence(domain, if_unknown="replace")
    assert template_sequence == row.sseq.replace("-", "")

    # === Generate mdtraj trajectory ===
    with tempfile.NamedTemporaryFile(suffix=".pdb") as pdb_file:
        PDB.save(domain, pdb_file.name)
        traj = mdtraj.load(pdb_file.name)
    assert template_sequence == traj.top.to_fasta()[0]

    # === Extract residues and residue-residue interactions ===

    # Residue info
    residue_df = construct_residue_df(traj)
    validate_residue_df(residue_df)
    residue_df["residue_idx_corrected"] = pd.array(
        residue_df["residue_idx"].apply(
            lambda idx: sequence_tools.convert_residue_index_a2b(idx, row.b2a)
        ),
        dtype=pd.Int64Dtype(),
    )

    # Residue pair info
    residue_pairs_df = construct_residue_pairs_df(traj)
    validate_residue_pairs_df(residue_pairs_df)
    for i in [1, 2]:
        residue_pairs_df[f"residue_idx_{i}_corrected"] = pd.array(
            residue_pairs_df[f"residue_idx_{i}"].apply(
                lambda idx: sequence_tools.convert_residue_index_a2b(idx, row.b2a)
            ),
            dtype=pd.Int64Dtype(),
        )

    # === Sanity check ===

    # Get the set of interactions
    interactions_1 = set(
        residue_pairs_df[
            (
                residue_pairs_df["residue_idx_1_corrected"]
                < residue_pairs_df["residue_idx_2_corrected"]
            )
            & (residue_pairs_df["distance"] <= 5.0)
        ][["residue_idx_1_corrected", "residue_idx_2_corrected"]].apply(tuple, axis=1)
    )

    # Get the reference set of interactions
    interactions_2 = {
        (int(r1), int(r2)) if r1 <= r2 else (int(r2), int(r1))
        for r1, r2 in zip(row.residue_idx_1_corrected, row.residue_idx_2_corrected)
        if pd.notnull(r1) and pd.notnull(r2)
    }
    assert not interactions_1 ^ interactions_2, interactions_1 ^ interactions_2

    return {**residue_df_to_row(residue_df), **residue_pairs_df_to_row(residue_pairs_df)}
def get_adjacency_with_distances(
    row, max_cutoff=12, min_cutoff=None, structure_url_prefix="rcsb://"
):
    """
    Notes:
        - This is the 2018 version, where we calculated distances only.
    """
    missing_attributes = [
        attr
        for attr in GET_ADJACENCY_WITH_DISTANCES_ROW_ATTRIBUTES
        if not hasattr(row, attr)
    ]
    assert not missing_attributes, missing_attributes

    # Load structure
    url = f"{structure_url_prefix}{row.structure_id.lower()}.cif.gz"
    structure = PDB.load(url)

    # Template sequence
    chain_sequence = structure_tools.get_chain_sequence(
        structure[row.model_id][row.chain_id], if_unknown="replace"
    )
    template_sequence = chain_sequence[int(row.s_start - 1) : int(row.s_end)]
    assert len(template_sequence) == len(row.a2b)

    # Target sequence
    target_sequence = row.sequence[int(row.q_start - 1) : int(row.q_end)]
    assert len(target_sequence) == len(row.b2a)

    # Extract domain
    dd = structure_tools.DomainDef(row.model_id, row.chain_id, int(row.s_start), int(row.s_end))
    domain = structure_tools.extract_domain(structure, [dd])
    assert template_sequence == structure_tools.get_chain_sequence(domain, if_unknown="replace")
    assert template_sequence == row.sseq.replace("-", "")

    # Get interactions
    distances_core = structure_tools.get_distances(
        domain, max_cutoff, min_cutoff, groupby="residue"
    )
    assert (distances_core["residue_idx_1"] <= distances_core["residue_idx_2"]).all()

    # Map interactions to target
    for i in [1, 2]:
        distances_core[f"residue_idx_{i}_corrected"] = distances_core[
            f"residue_idx_{i}"
        ].apply(lambda idx: sequence_tools.convert_residue_index_a2b(idx, row.b2a))

    # Remove missing values
    distances_core = distances_core[
        distances_core["residue_idx_1_corrected"].notnull()
        & distances_core["residue_idx_2_corrected"].notnull()
    ]

    # Convert to integers
    distances_core[["residue_idx_1_corrected", "residue_idx_2_corrected"]] = distances_core[
        ["residue_idx_1_corrected", "residue_idx_2_corrected"]
    ].astype(int)

    # Sanity check
    assert (
        distances_core["residue_idx_1_corrected"] < distances_core["residue_idx_2_corrected"]
    ).all()

    # Get the set of interactions
    interactions_1 = set(
        distances_core[(distances_core["distance"] <= 5)][
            ["residue_idx_1_corrected", "residue_idx_2_corrected"]
        ].apply(tuple, axis=1)
    )

    # Get the reference set of interactions
    interactions_2 = {
        (int(r1), int(r2)) if r1 <= r2 else (int(r2), int(r1))
        for r1, r2 in zip(row.residue_idx_1_corrected, row.residue_idx_2_corrected)
        if pd.notnull(r1) and pd.notnull(r2)
    }
    assert not interactions_1 ^ interactions_2

    return (
        distances_core["residue_idx_1_corrected"].values,
        distances_core["residue_idx_2_corrected"].values,
        distances_core["distance"].values,
    )
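# --- Hedged sketch (not part of the original module) ---
# Both adjacency functions above expect `row` to behave like a dataframe row or namedtuple. The
# attributes they actually dereference are collected below for reference; the authoritative list
# is GET_ADJACENCY_WITH_DISTANCES_ROW_ATTRIBUTES, and the comments reflect only what the
# assertions above imply.

from typing import NamedTuple, Sequence

class AdjacencyRow(NamedTuple):
    structure_id: str        # PDB id, lower-cased to build the "rcsb://{id}.cif.gz" URL
    model_id: int
    chain_id: str
    s_start: int             # 1-based template (subject) domain boundaries
    s_end: int
    q_start: int             # 1-based target (query) boundaries
    q_end: int
    sequence: str            # full target sequence
    sseq: str                # aligned template sequence (may contain "-")
    a2b: Sequence            # alignment mapping; len(a2b) == len(template slice)
    b2a: Sequence            # alignment mapping; len(b2a) == len(target slice)
    residue_idx_1_corrected: Sequence  # reference interactions used in the sanity check
    residue_idx_2_corrected: Sequence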
def run_modeller(structure, alignment, temp_dir: Union[str, Path, Callable]):
    """Run Modeller to create a homology model.

    Args:
        structure: Structure of the template protein.
        alignment: Alignment of the target sequence(s) to chain(s) of the template structure.
        temp_dir: Location to use for storing Modeller temporary files and output.

    Returns:
        results: A dictionary of model properties. Of particular interest are the following:
            `name`: The name of the generated PDB structure.
            `Normalized DOPE score`: DOPE score that should be comparable between structures.
            `GA341 score`: GA341 score that should be comparable between structures.
    """
    import modeller
    from modeller.automodel import assess, automodel, autosched

    if isinstance(structure, (str, Path)):
        structure = PDB.load(structure)

    if callable(temp_dir):
        temp_dir = Path(temp_dir())
    else:
        temp_dir = Path(temp_dir)

    assert len(alignment) == 2
    target_id = alignment[0].id
    template_id = alignment[1].id

    PDB.save(structure, temp_dir.joinpath(f"{template_id}.pdb"))
    alignment_file = temp_dir.joinpath(f"{template_id}-{target_id}.aln")
    structure_tools.write_pir_alignment(alignment, alignment_file)

    # Don't display log messages
    modeller.log.none()
    # Create a new MODELLER environment
    env = modeller.environ()

    # Directories for input atom files
    env.io.atom_files_directory = [str(temp_dir)]
    env.schedule_scale = modeller.physical.values(default=1.0, soft_sphere=0.7)
    # Selected atoms do not feel the neighborhood
    # env.edat.nonbonded_sel_atoms = 2
    env.io.hetatm = True  # read in HETATM records from template PDBs
    env.io.water = True  # read in WATER records (including waters marked as HETATMs)

    a = automodel(
        env,
        # alignment filename
        alnfile=str(alignment_file),
        # codes of the templates
        knowns=(str(template_id)),
        # code of the target
        sequence=str(target_id),
        # which methods for validation should be calculated
        assess_methods=(assess.DOPE, assess.normalized_dope, assess.GA341),
    )
    a.starting_model = 1  # index of the first model
    a.ending_model = 1  # index of the last model

    # Very thorough VTFM optimization:
    a.library_schedule = autosched.slow
    a.max_var_iterations = 300

    # Thorough MD optimization:
    # a.md_level = refine.slow
    a.md_level = None

    # a.repeat_optimization = 2

    # Stop if the objective function is higher than this value
    a.max_molpdf = 2e6

    with py_tools.log_print_statements(logger), system_tools.switch_paths(temp_dir):
        a.make()

    assert len(a.outputs) == 1
    return a.outputs[0]
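# --- Hedged usage sketch (not part of the original module) ---
# run_modeller expects a two-record alignment (target first, template second) whose records
# expose `.id`, plus a writable scratch location. The names below are placeholders:
#
#     results = run_modeller(
#         structure="template.pdb",                 # hypothetical template structure
#         alignment=target_template_alignment,      # 2 records: [target, template]
#         temp_dir=tempfile.mkdtemp,                # a callable is also accepted
#     )
#     print(results["name"], results["Normalized DOPE score"], results["GA341 score"])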
def uploadFile(request):
    if not request.FILES:
        raise Http404
    if "fileToUpload" not in request.FILES:
        raise Http404

    myfile = request.FILES["fileToUpload"]
    filetype = request.POST["filetype"]
    random_id = ""

    if myfile.size > 10000000:
        jsonDict = {"msg": "File is too large (>10 MB)", "error": 1}
        return HttpResponse(json.dumps(jsonDict), content_type="application/json")

    try:
        process = Popen(["/usr/bin/file", "-i", myfile.temporary_file_path()], stdout=PIPE)
        stdout, stderr = process.communicate()
        if stdout.decode().split(" ")[1].split("/")[0] not in ("text", "chemical"):
            msg = "Uploaded file has to be raw text (not '{0}')".format(
                stdout.decode().split(" ")[1][:-1]
            )
            jsonDict = {"msg": msg, "error": 1}
            return HttpResponse(json.dumps(jsonDict), content_type="application/json")

        # Protein list upload.
        if filetype == "prot":
            # Remove white-spaces and empty lines.
            lines = myfile.read().decode().split("\n")
            trimmedLines = []
            for idx, line in enumerate(lines):
                if idx >= 500:
                    break
                newline = sub(r"\s+", "", line)
                if newline:
                    trimmedLines.append(newline)
            msg = "\n".join(trimmedLines)
    except Exception as e:
        logger.error("Caught exception '%s': %s", type(e), e)
        jsonDict = {"msg": "File could not be uploaded - try again", "error": 1}
        return HttpResponse(json.dumps(jsonDict), content_type="application/json")

    if filetype == "pdb":
        try:
            random_id = fn.get_random_id()
            user_path = fn.get_user_path(random_id)
            suffix = myfile.name.split(".")[-1]
            if suffix in ["cif", "mmcif"]:
                input_pdb = op.join(user_path, "input.cif")
            else:
                input_pdb = op.join(user_path, "input.pdb")
            with open(input_pdb, "w") as ofh:
                ofh.write(myfile.read().decode())
            structure = PDB.load(input_pdb)
            # Save cleaned-up version of the file
            PDB.save(structure, Path(input_pdb).with_suffix(".pdb"))
            structure_tools.process_structure(structure)
            seq = [
                (
                    chain.id,
                    structure_tools.get_chain_sequence(
                        chain, if_unknown="replace", unknown_residue_marker="X"
                    ),
                )
                for chain in structure.chains
            ]
            logger.debug("seq: '%s'", seq)
            if len(seq) < 1:
                jsonDict = {"msg": "PDB does not have any valid chains. ", "error": 1}
                return HttpResponse(json.dumps(jsonDict), content_type="application/json")
            with open(op.join(user_path, "pdb_parsed.pickle"), "bw") as f:
                f.write(pickle.dumps(seq))
            msg = seq
        except Exception as e:
            logger.error("Caught exception '%s': %s", type(e), e)
            jsonDict = {"msg": f"PDB could not be parsed: {e}.", "error": 1}
            return HttpResponse(json.dumps(jsonDict), content_type="application/json")

    jsonDict = {
        "inputfile": myfile.name or "uploadedFile",
        "userpath": random_id,
        "msg": msg,
        "error": 0,
    }
    return HttpResponse(json.dumps(jsonDict), content_type="application/json")
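# --- Hedged usage sketch (not part of the original views module) ---
# One way to exercise the view above from a Django test, assuming it is routed at "/upload"
# (the route is a guess) and that the upload is spooled to disk so that
# `temporary_file_path()` is available:
#
#     from django.test import Client
#
#     with open("input.pdb", "rb") as fh:
#         response = Client().post("/upload", {"filetype": "pdb", "fileToUpload": fh})
#     payload = json.loads(response.content)
#     assert payload["error"] == 0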
def f(pdb_id):
    session = Session()
    exists = session.query(ProteinSolverResult).filter_by(pdb_id=pdb_id).scalar()
    if exists:
        print(f"DIDZ {pdb_id}")
        return

    # # ProteinSolver Demo
    #
    # Here, we load the ProteinSolver network and use it to design sequences that match the
    # geometry of the PDZ domain.

    # # Globals

    # PICK YOUR PROTEIN HERE!
    PDB_ID = pdb_id  # os.environ.get('PDB_ID', '2HE4')

    DATA_PATH = "/home/home3/fny/cs590/data/proteinsolver"
    PDB_PATH = Bio.PDB.PDBList().retrieve_pdb_file(PDB_ID, file_format="pdb", pdir=DATA_PATH)
    STRUCTURE = PDB.Structure(PDB_ID + "_A", PDB.load(PDB_PATH)[0].extract('A'))
    MODEL_ID = "191f05de"
    MODEL_STATE = "protein_train/191f05de/e53-s1952148-d93703104.state"

    print('Protein ID:', PDB_ID)

    # The following should return True, indicating that GPUs are available.
    device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
    torch.cuda.is_available()

    # ## Load Model

    torch.cuda.empty_cache()

    # Load the model definition (requires an IPython kernel; MODEL_ID is interpolated).
    get_ipython().run_line_magic('run', f'../protein_train/{MODEL_ID}/model.py')

    # Model configuration
    batch_size = 1
    num_features = 20
    adj_input_size = 2
    hidden_size = 128
    frac_present = 0.5
    frac_present_valid = frac_present
    info_size = 1024
    state_file = MODEL_STATE

    net = Net(
        x_input_size=num_features + 1,
        adj_input_size=adj_input_size,
        hidden_size=hidden_size,
        output_size=num_features,
    )
    net.load_state_dict(torch.load('../' + state_file, map_location=device))
    net.eval()
    net = net.to(device)

    # ## Data Preprocessing

    # Many proteins from the PDB did not work due to functional groups being placed at the
    # residue locations. This portion of the script corrects for that.

    from typing import NamedTuple

    class ProteinData(NamedTuple):
        sequence: str
        row_index: torch.LongTensor
        col_index: torch.LongTensor
        distances: torch.FloatTensor

    def extract_seq_and_adj(structure, chain_id):
        domain, result_df = get_interaction_dataset_wdistances(
            StructureWrapper(structure), 0, chain_id, r_cutoff=12
        )
        domain_sequence = structure_tools.get_chain_sequence(domain)
        assert max(result_df["residue_idx_1"].values) < len(domain_sequence)
        assert max(result_df["residue_idx_2"].values) < len(domain_sequence)
        data = ProteinData(
            domain_sequence,
            result_df["residue_idx_1"].values,
            result_df["residue_idx_2"].values,
            result_df["distance"].values,
        )
        return data

    def get_interaction_dataset_wdistances(structure, model_id, chain_id, r_cutoff=100):
        chain = structure[0][chain_id]
        num_residues = len(list(chain.residues))
        dd = structure_tools.DomainDef(model_id, chain_id, 1, num_residues)
        domain = structure_tools.extract_domain(structure, [dd])
        distances_core = structure_tools.get_distances(
            domain.to_dataframe(), r_cutoff, groupby="residue"
        )
        assert (distances_core["residue_idx_1"] <= distances_core["residue_idx_2"]).all()
        return domain, distances_core

    class StructureWrapper(object):
        def __init__(self, structure):
            self.structure = structure

        def __getitem__(self, item):
            return StructureWrapper(self.structure[item])

        def __getattr__(self, name):
            if name == 'residues':
                rs = []
                for residue in STRUCTURE.residues:
                    x, _, _ = residue.id
                    if x == ' ':
                        rs.append(residue)
                return rs
            return getattr(self.structure, name)

    def preprocess(structure):
        return extract_seq_and_adj(StructureWrapper(STRUCTURE), 'A')

    STRUCTURE_SUMMARY = preprocess(STRUCTURE)

    # ## Searching for Designs

    # The model returns probabilities for every amino acid at each residue in the sequence.
    # One method to search the space is uniform-cost search (i.e. single-source, greedy
    # Dijkstra's).
    # We start with the initial sequence and run it through the model. We then find the amino
    # acid with the highest score for each residue, create a series of new chains with those
    # residues updated, and place the newly created chains back in the priority queue, which
    # is ordered by score.

    @torch.no_grad()
    def frontier(net, x, x_score, edge_index, edge_attr, cutoff):
        index_array = torch.arange(len(x))
        mask = x == 20

        # Compute the output
        output = torch.softmax(net(x, edge_index, edge_attr), dim=1)[mask]

        # Select missing positions
        index_array = index_array[mask]

        # Find the entry with the highest probability
        max_score, max_index = output.max(dim=1)[0].max(dim=0)
        row_with_max_score = output[max_index]

        # Build nodes to search where each node updates one
        # probability from the maximum found
        nodes = []
        for i, p in enumerate(row_with_max_score):
            x_clone = x.clone()
            x_score_clone = x_score.clone()
            x_clone[index_array[max_index]] = i
            x_score_clone[index_array[max_index]] = torch.log(p)
            nodes.append((x_clone, x_score_clone))
        return nodes

    @torch.no_grad()
    def protein_search(net, x, edge_index, edge_attr, candidates, cutoff,
                       max_iters=1000000, verbose=False):
        x_score = torch.ones_like(x).to(torch.float) * cutoff
        heap = [(0, torch.randn(1), x, x_score)]
        iters = tqdm.tqdm(range(max_iters)) if verbose else range(max_iters)
        for i in iters:
            p, tiebreaker, x, x_score = heapq.heappop(heap)
            n_missing = torch.sum(x == 20)
            if verbose and i % 1000 == 0:
                print(i, p, "- Heap:", len(heap), ", Results:", len(candidates),
                      f", Missing: {n_missing}/{x.shape[0]}")
            if n_missing == 0:
                candidates.append((p.cpu(), x.data.cpu().numpy(), x_score.data.cpu().numpy()))
                continue
            for x, x_score in frontier(net, x, x_score, edge_index, edge_attr, cutoff):
                pre_p = -x_score.sum()
                heapq.heappush(heap, (-x_score.sum(), torch.randn(1), x, x_score))
            if len(heap) > 1_000_000:
                heap = heap[:700_000]
                heapq.heapify(heap)
        return candidates

    # Convert protein data and load it onto the GPU
    row_data = proteinsolver.datasets.protein.row_to_data(STRUCTURE_SUMMARY)
    data = proteinsolver.datasets.protein.transform_edge_attr(row_data)
    data.to(device)
    data.y = data.x

    candidates = []
    try:
        protein_search(
            net,
            torch.ones_like(data.x) * 20,
            data.edge_index,
            data.edge_attr,
            candidates=candidates,
            cutoff=np.log(0.15),
            verbose=False,
            max_iters=5000,
        )
    except KeyboardInterrupt:
        pass

    # ## Results

    df = pd.DataFrame(
        [
            (
                ''.join(proteinsolver.utils.AMINO_ACIDS[i] for i in candidate[1]),
                candidate[2].sum(),
                candidate[2].sum() / len(candidate[1]),
                float((candidate[1] == data.x.data.cpu().numpy()).sum().item()) / data.x.size(0),
            )
            for candidate in candidates
        ],
        columns=["sequence", "log_prob_sum", "log_prob_avg", "seq_identity"],
    )
    df = df.sort_values("log_prob_avg", ascending=False).iloc[:200_000]

    result = ProteinSolverResult(
        pdb_id=PDB_ID,
        n_results=df.shape[0],
        max_prob_avg=df['log_prob_avg'].max(),
        sequences=df['sequence'].values,
        log_prob_sums=df['log_prob_sum'].values,
        log_prob_avgs=df['log_prob_avg'].values,
        seq_identities=df['seq_identity'].values,
    )

    exists = session.query(ProteinSolverResult).filter_by(pdb_id=PDB_ID).scalar()
    if not exists:
        session.add(result)
        session.commit()

    print(f"DONE {pdb_id}")
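# --- Illustrative sketch (not part of the original script) ---
# protein_search orders its heap by the negative sum of per-residue log-probabilities, with a
# random tensor as a tiebreaker, so the most probable partial design is always expanded first.
# A tiny, self-contained demonstration of that ordering with plain floats:

import heapq

_heap = []
heapq.heappush(_heap, (-(-0.1 + -0.2), "tiebreak-a", "design A"))  # log-prob sum = -0.3
heapq.heappush(_heap, (-(-0.5 + -0.9), "tiebreak-b", "design B"))  # log-prob sum = -1.4
assert heapq.heappop(_heap)[2] == "design A"  # the higher-probability design pops first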