Example #1
    def build(  # type: ignore[override]
        cls,
        structure_file: Union[Path, str],
        protein_sequence: str,
        ligand_sequence: Optional[str],
        remove_hetatms=True,
    ) -> ProteinSolverData:
        import proteinsolver

        structure = PDB.load(structure_file)
        pdata = extract_seq_and_adj(
            structure,
            [0] if ligand_sequence is None else [0, 1],
            remove_hetatms=remove_hetatms,
        )

        expected_sequence = protein_sequence + (ligand_sequence or "")
        if remove_hetatms:
            expected_sequence = expected_sequence.replace("X", "")
        if pdata.sequence != expected_sequence:
            raise ProteinSolverBuildError(
                f"Parsed sequence does not match provided sequence "
                f"({pdata.sequence} != {protein_sequence} + {ligand_sequence})."
            )

        data = proteinsolver.datasets.protein.row_to_data(pdata)
        data = proteinsolver.datasets.protein.transform_edge_attr(data)

        return data
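
For reference, Example #4 below calls this classmethod on a temporary PDB file. A minimal sketch of such a call, where the file name and sequences are placeholders and ProteinSolver is assumed to be the class that defines build():

# Minimal usage sketch; the file name and sequences are placeholders and
# ProteinSolver is assumed to be the class that defines build().
data = ProteinSolver.build(
    "complex.pdb",
    protein_sequence="MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ",
    ligand_sequence=None,  # or the ligand chain sequence for an interface
    remove_hetatms=True,
)
print(data.x.shape, data.edge_index.shape)  # Data attributes also used in Example #9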
Example #2
def extract_protein_info(mutation_info: MutationInfo) -> Dict:
    structure_file = Path(mutation_info.structure_file)

    if not structure_file.is_file():
        raise EL2Error(
            f"Could not find structure for mutation: {mutation_info}.")
    if config.DATA_DIR not in structure_file.as_posix():
        raise EL2Error(
            f"Structure file is not available remotely for mutation: {mutation_info}."
        )

    @contextmanager
    def disable_logger(logger, level=logging.WARNING):
        try:
            logger.setLevel(level)
            yield
        finally:
            logger.setLevel(logging.NOTSET)

    with disable_logger(logging.getLogger("kmbio.PDB.core.atom")):
        structure = PDB.load(structure_file)

    protein_sequence, ligand_sequence = _extract_chain_sequences(
        structure, mutation_info.chain_id, mutation_info.coi)
    mutation = map_mutation_to_chain(structure, mutation_info.chain_id,
                                     mutation_info.mutation)

    if protein_sequence is None:
        raise EL2Error(
            f"Could not extract protein sequence for mutation: {mutation_info}."
        )
    if mutation_info.coi == COI.INTERFACE and ligand_sequence is None:
        raise EL2Error(
            f"Could not extract ligand sequence for mutation: {mutation_info}."
        )
    if protein_sequence[int(mutation[1:-1]) - 1] != mutation[0]:
        raise EL2Error(
            f"Mutation does not match extracted protein sequence: {mutation_info}."
        )

    structure_file_url = urljoin(
        config.SITE_URL,
        Path(config.SITE_DATA_DIR).joinpath(
            structure_file.relative_to(config.DATA_DIR)).as_posix(),
    )
    result = {
        **{
            "protein_structure_url": structure_file_url,
            "protein_sequence": protein_sequence,
            "mutations": mutation,
        },
        **({
            "ligand_sequence": ligand_sequence
        } if ligand_sequence is not None else {}),
    }

    return result
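
A hedged sketch of calling this helper; the MutationInfo constructor below only mirrors the attributes accessed above (structure_file, chain_id, mutation, coi), so its real signature, the path, and the mutation string are assumptions:

# Hedged usage sketch; the MutationInfo fields, path, and mutation string are
# assumptions inferred from the attribute accesses above.
mutation_info = MutationInfo(
    structure_file=f"{config.DATA_DIR}/structures/1abc.pdb",
    chain_id="A",
    mutation="G45A",  # wild-type residue, 1-based position, mutant residue
    coi=COI.CORE,  # COI.CORE is assumed; only COI.INTERFACE appears above
)
info = extract_protein_info(mutation_info)
print(info["protein_structure_url"], info["mutations"])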
Example #3
def get_interaction_dataset_wdistances(structure_file,
                                       model_id,
                                       chain_id,
                                       r_cutoff=12):
    structure = PDB.load(structure_file)
    chain = structure[0][chain_id]
    num_residues = len(list(chain.residues))
    dd = structure_tools.DomainDef(model_id, chain_id, 1, num_residues)
    domain = structure_tools.extract_domain(structure, [dd])
    distances_core = structure_tools.get_distances(domain.to_dataframe(),
                                                   r_cutoff,
                                                   groupby="residue")
    assert (distances_core["residue_idx_1"] <=
            distances_core["residue_idx_2"]).all()
    return domain, distances_core
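
A brief usage sketch for this helper; the structure file and chain ID are placeholders:

# Hedged usage sketch; the file name and chain ID are placeholders.
domain, distances = get_interaction_dataset_wdistances(
    "input.pdb", model_id=0, chain_id="A", r_cutoff=12)
# Each row of `distances` pairs two residue indices (residue_idx_1 <= residue_idx_2)
# with the distance between them.
contacts = distances[["residue_idx_1", "residue_idx_2", "distance"]]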
Example #4
    def build(
        self,
        structure_file: Union[Path, str],
        protein_sequence: str,
        ligand_sequence: Optional[str],
        remove_hetatms=True,
    ) -> ELASPIC2Data:
        structure = PDB.load(structure_file)
        protein_domain_def, ligand_domain_def = guess_domain_defs(
            structure,
            protein_sequence,
            ligand_sequence,
            remove_hetatms=remove_hetatms)
        if protein_domain_def is None or (ligand_sequence is not None
                                          and ligand_domain_def is None):
            raise ValueError(
                "Cound not find protein and / or ligand sequence in the provided structure file."
            )

        domain_defs = ([protein_domain_def] if ligand_sequence is None else
                       [protein_domain_def, ligand_domain_def])
        structure_new = structure_tools.extract_domain(
            structure, domain_defs, remove_hetatms=remove_hetatms)

        with tempfile.NamedTemporaryFile(suffix=".pdb") as pdb_file_obj:
            PDB.save(structure_new, pdb_file_obj.name)
            protbert_data = ProtBert.build(protein_sequence, ligand_sequence,
                                           remove_hetatms)
            proteinsolver_data = ProteinSolver.build(pdb_file_obj.name,
                                                     protein_sequence,
                                                     ligand_sequence,
                                                     remove_hetatms)

        data = ELASPIC2Data(ligand_sequence is not None, protbert_data,
                            proteinsolver_data)
        return data
Example #5
def get_adjacency_with_distances_and_orientations(
        row, max_cutoff=12, min_cutoff=None, structure_url_prefix="rcsb://"):
    """"""
    missing_attributes = [
        attr for attr in GET_ADJACENCY_WITH_DISTANCES_ROW_ATTRIBUTES
        if not hasattr(row, attr)
    ]
    assert not missing_attributes, missing_attributes
    # === Parse input structure ===
    # Load structure
    url = f"{structure_url_prefix}{row.structure_id.lower()}.cif.gz"
    structure = PDB.load(url)
    # Template sequence
    chain_sequence = structure_tools.get_chain_sequence(
        structure[row.model_id][row.chain_id], if_unknown="replace")
    template_sequence = chain_sequence[int(row.s_start - 1):int(row.s_end)]
    assert len(template_sequence) == len(row.a2b)
    # Target sequence
    target_sequence = row.sequence[int(row.q_start - 1):int(row.q_end)]
    assert len(target_sequence) == len(row.b2a)
    # Extract domain
    dd = structure_tools.DomainDef(row.model_id, row.chain_id,
                                   int(row.s_start), int(row.s_end))
    domain = structure_tools.extract_domain(structure, [dd])
    assert template_sequence == structure_tools.get_chain_sequence(
        domain, if_unknown="replace")
    assert template_sequence == row.sseq.replace("-", "")

    # === Generate mdtraj trajectory ===
    with tempfile.NamedTemporaryFile(suffix=".pdb") as pdb_file:
        PDB.save(domain, pdb_file.name)
        traj = mdtraj.load(pdb_file.name)
    assert template_sequence == traj.top.to_fasta()[0]

    # === Extract residues and residue-residue interactions ===
    # Residue info
    residue_df = construct_residue_df(traj)
    validate_residue_df(residue_df)
    residue_df["residue_idx_corrected"] = pd.array(
        residue_df["residue_idx"].apply(
            lambda idx: sequence_tools.convert_residue_index_a2b(idx, row.b2a
                                                                 )),
        dtype=pd.Int64Dtype(),
    )

    # Residue pair info
    residue_pairs_df = construct_residue_pairs_df(traj)
    validate_residue_pairs_df(residue_pairs_df)
    for i in [1, 2]:
        residue_pairs_df[f"residue_idx_{i}_corrected"] = pd.array(
            residue_pairs_df[f"residue_idx_{i}"].apply(
                lambda idx: sequence_tools.convert_residue_index_a2b(
                    idx, row.b2a)),
            dtype=pd.Int64Dtype(),
        )

    # === Sanity check ===
    # Get the set of interactions
    interactions_1 = set(
        residue_pairs_df[(residue_pairs_df["residue_idx_1_corrected"] <
                          residue_pairs_df["residue_idx_2_corrected"])
                         & (residue_pairs_df["distance"] <= 5.0)][[
                             "residue_idx_1_corrected",
                             "residue_idx_2_corrected"
                         ]].apply(tuple, axis=1))
    # Get the reference set of interactions
    interactions_2 = {(int(r1), int(r2)) if r1 <= r2 else (int(r2), int(r1))
                      for r1, r2 in zip(row.residue_idx_1_corrected,
                                        row.residue_idx_2_corrected)
                      if pd.notnull(r1) and pd.notnull(r2)}
    assert not interactions_1 ^ interactions_2, interactions_1 ^ interactions_2

    return {
        **residue_df_to_row(residue_df),
        **residue_pairs_df_to_row(residue_pairs_df)
    }
Example #6
def get_adjacency_with_distances(row,
                                 max_cutoff=12,
                                 min_cutoff=None,
                                 structure_url_prefix="rcsb://"):
    """
    Notes:
        - This is the 2018 version, where we calculated distances only.
    """
    missing_attributes = [
        attr for attr in GET_ADJACENCY_WITH_DISTANCES_ROW_ATTRIBUTES
        if not hasattr(row, attr)
    ]
    assert not missing_attributes, missing_attributes
    # Load structure
    url = f"{structure_url_prefix}{row.structure_id.lower()}.cif.gz"
    structure = PDB.load(url)
    # Template sequence
    chain_sequence = structure_tools.get_chain_sequence(
        structure[row.model_id][row.chain_id], if_unknown="replace")
    template_sequence = chain_sequence[int(row.s_start - 1):int(row.s_end)]
    assert len(template_sequence) == len(row.a2b)
    # Target sequence
    target_sequence = row.sequence[int(row.q_start - 1):int(row.q_end)]
    assert len(target_sequence) == len(row.b2a)
    # Extract domain
    dd = structure_tools.DomainDef(row.model_id, row.chain_id,
                                   int(row.s_start), int(row.s_end))
    domain = structure_tools.extract_domain(structure, [dd])
    assert template_sequence == structure_tools.get_chain_sequence(
        domain, if_unknown="replace")
    assert template_sequence == row.sseq.replace("-", "")
    # Get interactions
    distances_core = structure_tools.get_distances(domain,
                                                   max_cutoff,
                                                   min_cutoff,
                                                   groupby="residue")
    assert (distances_core["residue_idx_1"] <=
            distances_core["residue_idx_2"]).all()
    # Map interactions to target
    for i in [1, 2]:
        distances_core[f"residue_idx_{i}_corrected"] = distances_core[
            f"residue_idx_{i}"].apply(lambda idx: sequence_tools.
                                      convert_residue_index_a2b(idx, row.b2a))
    # Remove missing values
    distances_core = distances_core[
        distances_core["residue_idx_1_corrected"].notnull()
        & distances_core["residue_idx_2_corrected"].notnull()]
    # Convert to integers
    distances_core[["residue_idx_1_corrected",
                    "residue_idx_2_corrected"]] = distances_core[[
                        "residue_idx_1_corrected", "residue_idx_2_corrected"
                    ]].astype(int)
    # Sanity check
    assert (distances_core["residue_idx_1_corrected"] <
            distances_core["residue_idx_2_corrected"]).all()
    # Get the set of interactions
    interactions_1 = set(distances_core[(distances_core["distance"] <= 5)][[
        "residue_idx_1_corrected", "residue_idx_2_corrected"
    ]].apply(tuple, axis=1))
    # Get the reference set of interactions
    interactions_2 = {(int(r1), int(r2)) if r1 <= r2 else (int(r2), int(r1))
                      for r1, r2 in zip(row.residue_idx_1_corrected,
                                        row.residue_idx_2_corrected)
                      if pd.notnull(r1) and pd.notnull(r2)}
    assert not interactions_1 ^ interactions_2
    return (
        distances_core["residue_idx_1_corrected"].values,
        distances_core["residue_idx_2_corrected"].values,
        distances_core["distance"].values,
    )
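
The returned index and distance arrays can be packed into a sparse contact map. The sketch below uses scipy, which is an assumption rather than part of the original code, and `row` stands for one alignment record carrying the attributes validated at the top of the function:

# Hedged sketch; scipy is an assumption, and `row` is a placeholder for an
# alignment record with the attributes validated at the top of the function.
from scipy import sparse

idx_1, idx_2, distances = get_adjacency_with_distances(row)
size = int(max(idx_1.max(), idx_2.max())) + 1
contact_map = sparse.coo_matrix((distances, (idx_1, idx_2)), shape=(size, size))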
Example #7
def run_modeller(structure, alignment, temp_dir: Union[str, Path, Callable]):
    """Run Modeller to create a homology model.

    Args:
        structure: Structure of the template protein.
        alignment: Alignment of the target sequence(s) to chain(s) of the template structure.
        temp_dir: Location to use for storing Modeller temporary files and output.

    Returns:
        results: A dictionary of model properties. Of particular interest are the following:

            `name`: The name of the generated PDB structure.
            `Normalized DOPE score`: DOPE score that should be comparable between structures.
            `GA341 score`: GA341 score that should be comparable between structures.
    """
    import modeller
    from modeller.automodel import assess, automodel, autosched

    if isinstance(structure, (str, Path)):
        structure = PDB.load(structure)

    if callable(temp_dir):
        temp_dir = Path(temp_dir())
    else:
        temp_dir = Path(temp_dir)

    assert len(alignment) == 2
    target_id = alignment[0].id
    template_id = alignment[1].id

    PDB.save(structure, temp_dir.joinpath(f"{template_id}.pdb"))
    alignment_file = temp_dir.joinpath(f"{template_id}-{target_id}.aln")
    structure_tools.write_pir_alignment(alignment, alignment_file)

    # Don't display log messages
    modeller.log.none()

    # Create a new MODELLER environment
    env = modeller.environ()

    # Directories for input atom files
    env.io.atom_files_directory = [str(temp_dir)]
    env.schedule_scale = modeller.physical.values(default=1.0, soft_sphere=0.7)

    # Selected atoms do not feel the neighborhood
    # env.edat.nonbonded_sel_atoms = 2
    env.io.hetatm = True  # read in HETATM records from template PDBs
    env.io.water = True  # read in WATER records (including waters marked as HETATMs)

    a = automodel(
        env,
        # alignment filename
        alnfile=str(alignment_file),
        # codes of the templates
        knowns=(str(template_id)),
        # code of the target
        sequence=str(target_id),
        # which assessment methods should be calculated
        assess_methods=(assess.DOPE, assess.normalized_dope, assess.GA341),
    )
    a.starting_model = 1  # index of the first model
    a.ending_model = 1  # index of the last model

    # Very thorough VTFM optimization:
    a.library_schedule = autosched.slow
    a.max_var_iterations = 300

    # Thorough MD optimization:
    # a.md_level = refine.slow
    a.md_level = None

    # a.repeat_optimization = 2

    # Stop if the objective function is higher than this value
    a.max_molpdf = 2e6

    with py_tools.log_print_statements(logger), system_tools.switch_paths(temp_dir):
        a.make()

    assert len(a.outputs) == 1
    return a.outputs[0]
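
A hedged sketch of calling run_modeller; the structure file, sequences, IDs, and the use of plain Biopython SeqRecords for the alignment are assumptions (the type actually expected by structure_tools.write_pir_alignment is not shown in this example):

# Hedged usage sketch; the structure file, sequences, and SeqRecord-based
# alignment are assumptions, not taken from the original code.
import tempfile

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

alignment = [
    SeqRecord(Seq("MKTAYIAKQR"), id="target"),    # target sequence first
    SeqRecord(Seq("MKTAYIAKQR"), id="template"),  # template sequence second
]
results = run_modeller("template.pdb", alignment, tempfile.mkdtemp())
print(results["name"], results["Normalized DOPE score"])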
Example #8
def uploadFile(request):
    if not request.FILES:
        raise Http404
    if not ("fileToUpload" in request.FILES):
        raise Http404

    myfile = request.FILES["fileToUpload"]
    filetype = request.POST["filetype"]
    random_id = ""

    if myfile.size > 10000000:
        jsonDict = {"msg": "File is too large (>10 MB)", "error": 1}
        return HttpResponse(json.dumps(jsonDict),
                            content_type="application/json")

    try:
        process = Popen(["/usr/bin/file", "-i",
                         myfile.temporary_file_path()],
                        stdout=PIPE)
        stdout, stderr = process.communicate()

        if stdout.decode().split(" ")[1].split("/")[0] not in ("text",
                                                               "chemical"):
            msg = "Uploaded file has to be raw text (not '{0}')".format(
                stdout.decode().split(" ")[1][:-1])
            jsonDict = {"msg": msg, "error": 1}
            return HttpResponse(json.dumps(jsonDict),
                                content_type="application/json")

        # Protein list upload.
        if filetype == "prot":

            # Remove white-spaces and empty lines.
            lines = myfile.read().decode().split("\n")
            trimmedLines = []
            for idx, line in enumerate(lines):
                if idx >= 500:
                    break
                newline = sub(r"\s+", "", line)
                if newline:
                    trimmedLines.append(newline)
            msg = "\n".join(trimmedLines)

    except Exception as e:
        logger.error("Caught exception '%s': %s", type(e), e)
        jsonDict = {
            "msg": "File could not be uploaded - try again",
            "error": 1
        }
        return HttpResponse(json.dumps(jsonDict),
                            content_type="application/json")

    if filetype == "pdb":
        try:
            random_id = fn.get_random_id()
            user_path = fn.get_user_path(random_id)
            suffix = myfile.name.split(".")[-1]
            if suffix in ["cif", "mmcif"]:
                input_pdb = op.join(user_path, "input.cif")
            else:
                input_pdb = op.join(user_path, "input.pdb")

            with open(input_pdb, "w") as ofh:
                ofh.write(myfile.read().decode())

            structure = PDB.load(input_pdb)
            # Save cleaned up version of file
            PDB.save(structure, Path(input_pdb).with_suffix(".pdb"))
            structure_tools.process_structure(structure)
            seq = [(
                chain.id,
                structure_tools.get_chain_sequence(chain,
                                                   if_unknown="replace",
                                                   unknown_residue_marker="X"),
            ) for chain in structure.chains]
            logger.debug("seq: '%s'", seq)

            if len(seq) < 1:
                jsonDict = {
                    "msg": "PDB does not have any valid chains. ",
                    "error": 1
                }
                return HttpResponse(json.dumps(jsonDict),
                                    content_type="application/json")

            with open(op.join(user_path, "pdb_parsed.pickle"), "bw") as f:
                f.write(pickle.dumps(seq))

            msg = seq

        except Exception as e:
            logger.error("Caught exception '%s': %s", type(e), e)
            jsonDict = {"msg": f"PDB could not be parsed: {e}.", "error": 1}
            return HttpResponse(json.dumps(jsonDict),
                                content_type="application/json")

    jsonDict = {
        "inputfile": myfile.name or "uploadedFile",
        "userpath": random_id,
        "msg": msg,
        "error": 0,
    }

    return HttpResponse(json.dumps(jsonDict), content_type="application/json")
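
This view is a standard Django file-upload endpoint; a minimal sketch of wiring it into a URLconf, where the module path and route name are placeholders:

# Hedged sketch of a URLconf entry for this view; names are placeholders.
from django.urls import path

from . import views

urlpatterns = [
    path("upload-file/", views.uploadFile, name="uploadFile"),
]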
Example #9
def f(pdb_id):
    session = Session()
    exists = session.query(ProteinSolverResult).filter_by(
        pdb_id=pdb_id).scalar()
    if exists:
        print(f"DIDZ {pdb_id}")
        return

    # # ProteinSolver Demo
    #
    # Here, we load the ProteinSolver network and use it to design sequences that match the geometry of the PDZ domain.

    # In[1]:

    # In[2]:

    #
    # Globals
    #

    # PICK YOUR PROTEIN HERE!
    PDB_ID = pdb_id  #os.environ.get('PDB_ID', '2HE4')

    DATA_PATH = "/home/home3/fny/cs590/data/proteinsolver"
    PDB_PATH = Bio.PDB.PDBList().retrieve_pdb_file(PDB_ID,
                                                   file_format="pdb",
                                                   pdir=DATA_PATH)
    STRUCTURE = PDB.Structure(PDB_ID + "_A",
                              PDB.load(PDB_PATH)[0].extract('A'))
    MODEL_ID = "191f05de"
    MODEL_STATE = "protein_train/191f05de/e53-s1952148-d93703104.state"

    print('Protein ID:', PDB_ID)

    # The following should return True indicating GPUs are available.

    # In[3]:

    device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
    torch.cuda.is_available()

    # ## Load Model

    # In[4]:

    torch.cuda.empty_cache()

    # In[5]:

    get_ipython().run_line_magic('run', '../protein_train/{MODEL_ID}/model.py')

    # In[6]:

    # Model configuration
    batch_size = 1
    num_features = 20
    adj_input_size = 2
    hidden_size = 128
    frac_present = 0.5
    frac_present_valid = frac_present
    info_size = 1024
    state_file = MODEL_STATE

    net = Net(x_input_size=num_features + 1,
              adj_input_size=adj_input_size,
              hidden_size=hidden_size,
              output_size=num_features)
    net.load_state_dict(torch.load('../' + state_file, map_location=device))
    net.eval()
    net = net.to(device)

    # ## Data Preprocessing

    # Many proteins from the PDB did not work due to functional groups being placed at the residue locations. This portion of the script corrects for that.

    # In[7]:

    from typing import NamedTuple

    class ProteinData(NamedTuple):
        sequence: str
        row_index: torch.LongTensor
        col_index: torch.LongTensor
        distances: torch.FloatTensor

    def extract_seq_and_adj(structure, chain_id):
        domain, result_df = get_interaction_dataset_wdistances(
            StructureWrapper(structure), 0, chain_id, r_cutoff=12)
        domain_sequence = structure_tools.get_chain_sequence(domain)
        assert max(result_df["residue_idx_1"].values) < len(domain_sequence)
        assert max(result_df["residue_idx_2"].values) < len(domain_sequence)

        data = ProteinData(domain_sequence, result_df["residue_idx_1"].values,
                           result_df["residue_idx_2"].values,
                           result_df["distance"].values)
        return data

    def get_interaction_dataset_wdistances(structure,
                                           model_id,
                                           chain_id,
                                           r_cutoff=100):
        chain = structure[0][chain_id]
        num_residues = len(list(chain.residues))
        dd = structure_tools.DomainDef(model_id, chain_id, 1, num_residues)
        domain = structure_tools.extract_domain(structure, [dd])
        distances_core = structure_tools.get_distances(domain.to_dataframe(),
                                                       r_cutoff,
                                                       groupby="residue")
        assert (distances_core["residue_idx_1"] <=
                distances_core["residue_idx_2"]).all()
        return domain, distances_core

    class StructureWrapper(object):
        def __init__(self, structure):
            self.structure = structure

        def __getitem__(self, item):
            return StructureWrapper(self.structure[item])

        def __getattr__(self, name):
            if name == 'residues':
                rs = []
                for residue in STRUCTURE.residues:
                    x, _, _ = residue.id
                    if x == ' ':
                        rs.append(residue)
                return rs
            return getattr(self.structure, name)

    def preprocess(structure):
        return extract_seq_and_adj(StructureWrapper(STRUCTURE), 'A')

    STRUCTURE_SUMMARY = preprocess(STRUCTURE)

    # ## Searching for Designs

    # The model returns probabilities for every amino acid at each residue position in the sequence. One way to search the space is uniform cost search (i.e. single-source, greedy Dijkstra's).
    #
    # We start with the initial sequence and run it through the model. We then find the amino acid with the highest score for each residue, create a series of new chains with those residues updated, and place the newly created chains back in the priority queue, which is ordered by score.

    # In[8]:

    @torch.no_grad()
    def frontier(net, x, x_score, edge_index, edge_attr, cutoff):
        index_array = torch.arange(len(x))
        mask = x == 20

        # Compute the output
        output = torch.softmax(net(x, edge_index, edge_attr), dim=1)[mask]
        # Select missing positions
        index_array = index_array[mask]

        # Find the entry with the highest probability
        max_score, max_index = output.max(dim=1)[0].max(dim=0)
        row_with_max_score = output[max_index]

        # Build nodes to search where each node updates one
        # probability from the maximum found
        nodes = []
        for i, p in enumerate(row_with_max_score):
            x_clone = x.clone()
            x_score_clone = x_score.clone()
            x_clone[index_array[max_index]] = i
            x_score_clone[index_array[max_index]] = torch.log(p)
            nodes.append((x_clone, x_score_clone))
        return nodes

    @torch.no_grad()
    def protein_search(net,
                       x,
                       edge_index,
                       edge_attr,
                       candidates,
                       cutoff,
                       max_iters=1000000,
                       verbose=False):
        x_score = torch.ones_like(x).to(torch.float) * cutoff
        heap = [(0, torch.randn(1), x, x_score)]

        iters = tqdm.tqdm(range(max_iters)) if verbose else range(max_iters)

        for i in iters:
            p, tiebreaker, x, x_score = heapq.heappop(heap)
            n_missing = torch.sum(x == 20)
            if verbose and i % 1000 == 0:
                print(i, p, "- Heap:", len(heap), f", Results:",
                      len(candidates), f", Missing: {n_missing}/{x.shape[0]}")
            if n_missing == 0:
                candidates.append((p.cpu(), x.data.cpu().numpy(),
                                   x_score.data.cpu().numpy()))
                continue
            for x, x_score in frontier(net, x, x_score, edge_index, edge_attr,
                                       cutoff):
                pre_p = -x_score.sum()
                heapq.heappush(heap,
                               (-x_score.sum(), torch.randn(1), x, x_score))
            if len(heap) > 1_000_000:
                heap = heap[:700_000]
                heapq.heapify(heap)
        return candidates

    # Convert protein data and load it onto the GPU
    row_data = proteinsolver.datasets.protein.row_to_data(STRUCTURE_SUMMARY)
    data = proteinsolver.datasets.protein.transform_edge_attr(row_data)
    data.to(device)
    data.y = data.x

    candidates = []
    try:
        protein_search(net,
                       torch.ones_like(data.x) * 20,
                       data.edge_index,
                       data.edge_attr,
                       candidates=candidates,
                       cutoff=np.log(0.15),
                       verbose=False,
                       max_iters=5000)
    except KeyboardInterrupt:
        pass

    # ## Results

    # In[9]:

    df = pd.DataFrame(
        [(''.join(proteinsolver.utils.AMINO_ACIDS[i] for i in candidate[1]),
          candidate[2].sum(), candidate[2].sum() / len(candidate[1]),
          float((candidate[1] == data.x.data.cpu().numpy()).sum().item()) /
          data.x.size(0)) for candidate in candidates],
        columns=["sequence", "log_prob_sum", "log_prob_avg", "seq_identity"])

    # In[10]:

    df = df.sort_values("log_prob_avg", ascending=False).iloc[:200_000]

    # In[11]:

    df

    # In[12]:

    result = ProteinSolverResult(
        pdb_id=PDB_ID,
        n_results=df.shape[0],
        max_prob_avg=df['log_prob_avg'].max(),
        sequences=df['sequence'].values,
        log_prob_sums=df['log_prob_sum'].values,
        log_prob_avgs=df['log_prob_avg'].values,
        seq_identities=df['seq_identity'].values,
    )

    exists = session.query(ProteinSolverResult).filter_by(
        pdb_id=PDB_ID).scalar()
    if not exists:
        session.add(result)
        session.commit()

    print(f"DONE {pdb_id}")