Python DomainResidueMapping 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: apo_holo_structure_stats.core.dataclasses

클래스/타입: DomainResidueMapping

hotexamples.com에서의 예제들: 7

Python DomainResidueMapping - 7개의 예제가 발견되었습니다. 이것들은 오픈소스 프로젝트에서 추출된 Python의 apo_holo_structure_stats.core.dataclasses.DomainResidueMapping에 대한 실세계 최고 등급의 예제들입니다. 예제들을 평가하여 예제의 품질 향상에 도움을 줄 수 있습니다.

자주 사용되는 메소드들

보기 숨기기

DomainResidueMapping(4)

from_domain_on_another_chain(3)

예제 #1

파일 보기

    def test_interdomain_surface_paper(self):
        """Check interdomain surface, RMSD and hinge angle of 1vr6 (apo) vs. 1rzm (holo).

        The expected numbers in the assertions are the values reported in the paper;
        the surface areas are compared with a 30 % tolerance.
        """
        apo_structure, (apo_mappings, apo_entity_poly_seqs) = self.load_test_structure('1vr6')
        holo_structure, (holo_mappings, holo_entity_poly_seqs) = self.load_test_structure('1rzm')

        # analyze just the A chain (like in the paper)
        apo_model = apo_structure[0]
        apo_chain_a = apo_model['A']
        holo_model = holo_structure[0]
        holo_chain_a = holo_model['A']

        apo_chain_mapping = apo_mappings[0]['A']
        holo_chain_mapping = holo_mappings[0]['A']

        # divide into domains exactly like in the paper
        # todo check that label_seq_ids do correspond to those (changed it now) NO they don't,
        #  use the bio residue mapping

        # the domain definitions are reused for both structures (here for testing purposes)
        domain_definitions = (
            DomainResidueMapping('D1', 'A', [1], [64]),
            DomainResidueMapping('D2', 'A', [65], [338]),
        )

        apo_d1, apo_d2 = [
            DomainResidues.from_domain(definition, apo_model, apo_chain_mapping)
            for definition in domain_definitions
        ]
        holo_d1, holo_d2 = [
            DomainResidues.from_domain(definition, holo_model, holo_chain_mapping)
            for definition in domain_definitions
        ]

        compute_interface_area = GetInterfaceBuriedArea((GetSASAForStructure(), ))
        apo_interface_area = compute_interface_area(apo_d1, apo_d2)
        holo_interface_area = compute_interface_area(holo_d1, holo_d2)

        # *2 = 218, 933 vs paper -- 288, 1024
        # (they probably do not divide by two..., but it roughly corresponds)
        self.assertAlmostEqual(288, apo_interface_area, delta=0.3 * 288)  # 218
        self.assertAlmostEqual(1024, holo_interface_area, delta=0.3 * 1024)  # 933

        # rmsd
        c_alpha_coords = GetCAlphaCoords()
        centroid = GetCentroid((c_alpha_coords, ))
        centered_c_alpha_coords = GetCenteredCAlphaCoords((c_alpha_coords, centroid))
        rotation_matrix = GetRotationMatrix((centered_c_alpha_coords, ))
        rmsd = GetRMSD((centered_c_alpha_coords, rotation_matrix))

        # 10.1 vs 8.0 in paper, todo quite a big difference...
        apo_two_domains = apo_d1 + apo_d2
        holo_two_domains = holo_d1 + holo_d2
        print(rmsd(apo_two_domains, holo_two_domains))

        # hinge
        hinge_angle = GetHingeAngle((c_alpha_coords, centroid, rotation_matrix))
        screw_motion = hinge_angle(apo_d1, apo_d2, holo_d1, holo_d2)
        angle_in_degrees = 180 / np.pi * screw_motion.angle
        self.assertAlmostEqual(147, angle_in_degrees, delta=2)

예제 #2

파일 보기

 def get_paper_domain(d: DomainResidueMapping, paper_spans, residue_id_mapping):
     """Return a copy of domain `d` whose segments are `paper_spans` translated to label seq ids.

     Each row of `paper_spans` is a (begin, end) pair; both ends are translated with
     `residue_id_mapping.find_label_seq`. Domain id and chain id are taken from `d`.
     """
     spans = np.array(paper_spans)
     to_label_seq = residue_id_mapping.find_label_seq

     # first column holds the segment beginnings, second column the segment ends
     segment_beginnings = [to_label_seq(begin) for begin in spans[:, 0].tolist()]
     segment_ends = [to_label_seq(end) for end in spans[:, 1].tolist()]

     logger.debug(segment_beginnings)
     logger.debug(segment_ends)
     return DomainResidueMapping(d.domain_id, d.chain_id, segment_beginnings, segment_ends)

예제 #3

파일 보기

        def get_2DA(row, apo_or_holo: str):
            """Build the two-domain arrangement stored in `row` for the given side.

            `apo_or_holo` selects which set of `row` fields is read (it is interpolated
            into the field names). Both domains have to come from the same PDB structure.
            """
            pdb_code = row[f'pdb_code_{apo_or_holo}_d1']
            assert pdb_code == row[f'pdb_code_{apo_or_holo}_d2'], \
                f'Domains from {apo_or_holo} 2DA should be from the same PDB structure.'

            domains = []
            for d1_or_d2 in ('d1', 'd2'):
                domain_id = row[f'domain_id_{apo_or_holo}_{d1_or_d2}']
                chain_id = row[f'chain_id_{apo_or_holo}_{d1_or_d2}']

                # spans are (begin, end) rows; split them into the two column lists
                begin_column = np.array(row[f'spans_{apo_or_holo}_{d1_or_d2}'])[:, 0]
                end_column = np.array(row[f'spans_{apo_or_holo}_{d1_or_d2}'])[:, 1]

                domains.append(DomainResidueMapping(
                    domain_id,
                    chain_id,
                    segment_beginnings=begin_column.tolist(),
                    segment_ends=end_column.tolist(),
                ))

            return TwoDomainArrangement(pdb_code, *domains)

예제 #4

파일 보기

    def test_hinge_guanylate_kinase_paper(self):
        """Check the hinge angle between 1ex6 (apo, chain B) and 1ex7 (holo, chain A).

        The expected angle of 47 degrees is the dyndom value quoted in the paper.
        """
        apo = self.load_test_structure('1ex6')
        holo = self.load_test_structure('1ex7')

        # note that a different chain is used for apo (as by dyndom), why?
        apo_chain = apo[0]['B']
        holo_chain = holo[0]['A']

        logging.root.setLevel(logging.INFO)
        self.assertTrue(sequences_same(apo_chain, holo_chain))

        # the apo segments are the holo segments shifted by 200
        apo_shift = 200
        apo_d1_definition = DomainResidueMapping(
            'D1', 'B',
            [apo_shift + 1, apo_shift + 84],
            [apo_shift + 32, apo_shift + 186],
        )
        apo_d1 = DomainResidues.from_domain(apo_d1_definition, apo)
        apo_d2_definition = DomainResidueMapping('D2', 'B', [apo_shift + 33], [apo_shift + 83])
        apo_d2 = DomainResidues.from_domain(apo_d2_definition, apo)

        # changing the chain did not help after all, so where is the catch?

        holo_d1_definition = DomainResidueMapping('D1', 'A', [1, 84], [32, 186])
        holo_d1 = DomainResidues.from_domain(holo_d1_definition, holo)
        holo_d2_definition = DomainResidueMapping('D2', 'A', [33], [83])
        holo_d2 = DomainResidues.from_domain(holo_d2_definition, holo)

        c_alpha_coords = GetCAlphaCoords()
        centroid = GetCentroid((c_alpha_coords, ))
        centered_c_alpha_coords = GetCenteredCAlphaCoords((c_alpha_coords, centroid))
        rotation_matrix = GetRotationMatrix((centered_c_alpha_coords, ))
        hinge_angle = GetHingeAngle((c_alpha_coords, centroid, rotation_matrix))

        screw_motion = hinge_angle(apo_d1, apo_d2, holo_d1, holo_d2)
        angle_in_degrees = 180 / np.pi * screw_motion.angle

        # in paper: dyndom: 47°, their principal axes 43.9
        self.assertAlmostEqual(47, angle_in_degrees, delta=0.2)

예제 #5

파일 보기

def compare_chains(chain1: Chain, chain2: Chain,
                   c1_residue_mapping: BiopythonToMmcifResidueIds.Mapping,
                   c2_residue_mapping: BiopythonToMmcifResidueIds.Mapping,
                   c1_seq: Dict[int, str], c2_seq: Dict[int, str],  # in 3-letter codes
                   comparators__residues_param: List[Analyzer],
                   comparators__residue_ids_param: List[Analyzer],
                   comparators__domains__residues_param: List[Analyzer],
                   comparators__domains__residue_ids_param: List[Analyzer],
                   comparators__2domains__residues_param: List[Analyzer],
                   serializer_or_analysis_handler: AnalysisHandler,
                   domains_info: list,
                   ) -> None:
    """ Runs comparisons between two chains. E.g. one ligand-free (apo) and another ligand-bound (holo).

    The chains are cropped to their longest common polypeptide, restricted to observed residues, and then
    compared on chain level, domain level and two-domain-arrangement level. Every result is passed to
    `serializer_or_analysis_handler.handle(level_tag, analyzer, result, *analyzer_args)`.

    :param chain1: A Bio.PDB Chain, obtained as a part of BioPython Structure object as usual
    :param chain2: A corresponding chain (same sequence), typically from a different PDB structure. See chain1.
    :param c1_residue_mapping: residue id mapping for chain1 (passed on to `get_observed_residues`,
        `DomainResidues.from_domain` and `ResidueId.from_bio_residue`)
    :param c2_residue_mapping: same as c1_residue_mapping, for chain2
    :param c1_seq: sequence of chain1, residue names in 3-letter codes (keys presumably label_seq_ids -- confirm)
    :param c2_seq: same as c1_seq, for chain2
    :param comparators__residues_param: analyzers called with the two `ChainResidues`
    :param comparators__residue_ids_param: analyzers called with the two `ChainResidueData[ResidueId]`
    :param comparators__domains__residues_param: analyzers called with a pair of corresponding `DomainResidues`
    :param comparators__domains__residue_ids_param: analyzers called with a pair of corresponding
        `DomainResidueData[ResidueId]`
    :param comparators__2domains__residues_param: analyzers called with (d1_chain1, d2_chain1, d1_chain2, d2_chain2)
    :param serializer_or_analysis_handler: receives every analysis result through its `handle` method
    :param domains_info: list that is appended to in place with one dict per found ('full_domain') and per
        analyzed ('analyzed_domain') domain
    :raises APIException: when fetching the domains fails and the cause does not mention '404'
    """
    s1_pdb_code = chain1.get_parent().get_parent().id
    s2_pdb_code = chain2.get_parent().get_parent().id

    logging.info(f'running analyses for ({s1_pdb_code}, {s2_pdb_code}) pair...')

    # c1_seq, c2_seq todo, is the order in atom_site loop guaranteed? If not, I should sort the dict by label_seq_id
    # also todo, is label_seq_id sequential, that is one-by-one always +1?
    # todo assert entity_poly_seq have no gaps (always +1), they say they're sequential, I think they mean exactly this

    # crop polypeptides to longest common substring
    c1_common_seq, c2_common_seq = get_longest_common_polypeptide(c1_seq, c2_seq)
    c1_label_seq_ids = list(c1_common_seq.keys())
    c2_label_seq_ids = list(c2_common_seq.keys())
    # (a stray debugging `return` used to sit here and made all the analyses below unreachable)
    label_seq_id_offset = c2_label_seq_ids[0] - c1_label_seq_ids[0]

    # up to this point, we have residue ids of the protein sequence in the experiment. This also includes unobserved
    # residues, but those we will exclude from our analysis as their positions weren't determined
    c1_residues, c1_label_seq_ids, c2_residues, c2_label_seq_ids = get_observed_residues(
        chain1,
        c1_label_seq_ids,
        c1_residue_mapping,
        chain2,
        c2_label_seq_ids,
        c2_residue_mapping,
    )

    c1_residues = ChainResidues(c1_residues, s1_pdb_code, chain1.id)
    c2_residues = ChainResidues(c2_residues, s2_pdb_code, chain2.id)

    # todo a bit clumsy
    c1_residue_ids = ChainResidueData[ResidueId]([ResidueId(label_seq_id, chain1.id) for label_seq_id in
                                                  c1_label_seq_ids], s1_pdb_code, chain1.id)
    c2_residue_ids = ChainResidueData[ResidueId]([ResidueId(label_seq_id, chain2.id) for label_seq_id in
                                                  c2_label_seq_ids], s2_pdb_code, chain2.id)

    # todo match the domains here using this - a Sequence Matcher could be used again
    #   - but with spans it is more complicated -> for now re-index apo or holo into the other one...

    # chain-level analyses
    for a in comparators__residues_param:
        # in future maybe pass apo and holo. Will serialize itself. And output the object in rdf for example?
        # because what I would like is to output the analysis with objects identifiers, and then output the
        # objects, what they contain (e.g. domain size?)
        serializer_or_analysis_handler.handle('chain2chain', a, a(c1_residues, c2_residues), c1_residues,
                                              c2_residues)

    for c in comparators__residue_ids_param:
        serializer_or_analysis_handler.handle('chain2chain', c, c(c1_residue_ids, c2_residue_ids), c1_residue_ids,
                                              c2_residue_ids)

    # domain-level analyses

    # get domains (set of auth_seq_id), sort them by domain id and hope they will correspond to each other
    # or could map corresponding domains by choosing the ones that have the most overlap?
    try:
        c1_domains = sorted(filter(lambda d: d.chain_id == chain1.id, get_domains(s1_pdb_code)), key=lambda d: d.domain_id)
        c2_domains = sorted(filter(lambda d: d.chain_id == chain2.id, get_domains(s2_pdb_code)), key=lambda d: d.domain_id)
        # todo record the total number of domains (for both structures), maybe write it to a different json

        for pdb_code, domains in ((s1_pdb_code, c1_domains), (s2_pdb_code, c2_domains)):
            for d in domains:
                domains_info.append(
                    {'type': 'full_domain',
                     'full_id': (pdb_code, d.chain_id, d.domain_id),
                     'pdb_code': pdb_code,
                     'chain_id': d.chain_id,
                     'domain_id': d.domain_id,
                     'spans': d.get_spans(),})

    except APIException as e:
        if e.__cause__ and '404' in str(e.__cause__):
            logging.warning(f'{s1_pdb_code} {s2_pdb_code} no domains found, skip the domain-level analysis')
            return  # no domains found, skip the domain-level analysis
        raise

    # assert len(c1_domains) == len(c2_domains) # not always true, as expected, but now OK

    # sets give O(1) membership tests in the predicates below (the ids are tested once per domain residue)
    c1_observed_label_seq_ids = set(c1_label_seq_ids)
    c2_observed_label_seq_ids = set(c2_label_seq_ids)

    c1_domains__residues = []
    c2_domains__residues = []

    for c1_d in c1_domains:  # or c2_domains:
        # first remap first domain to second (or in future use longest common substrings, but not trivial since
        # domains can be composed of multiple segments)
        # note: the offset need not be the same everywhere
        c1_domain_mapped_to_c2 = DomainResidueMapping.from_domain_on_another_chain(c1_d, chain2.id, label_seq_id_offset)

        # todo why chain.get_parent?? Probably so that the chain need not be specified (but now only chains are used..)
        # NOTE(review): the last argument is presumably a "skip this label_seq_id" predicate -- confirm against
        # DomainResidues.from_domain
        c1_d_residues = DomainResidues.from_domain(
            c1_d, chain1.get_parent(), c1_residue_mapping,
            lambda label_seq_id: label_seq_id not in c1_observed_label_seq_ids)
        c2_d_residues = DomainResidues.from_domain(
            c1_domain_mapped_to_c2, chain2.get_parent(), c2_residue_mapping,
            lambda label_seq_id: label_seq_id not in c2_observed_label_seq_ids)

        if not c1_d_residues or not c2_d_residues:
            # the domain is not within the processed LCS of both chains (empty intersection with chain residues)
            logging.warning(f'domain {c1_d.domain_id} is not within the processed LCS of both chains (empty '
                            f'intersection with '
                            f'chain residues)')
            continue

        c1_domains__residues.append(DomainResidues(c1_d_residues.data, c1_d_residues.structure_id, c1_d_residues.chain_id, c1_d_residues.domain_id))
        c2_domains__residues.append(DomainResidues(c2_d_residues.data, c2_d_residues.structure_id, c2_d_residues.chain_id, c2_d_residues.domain_id))

    for residue_mapping, domains in ((c1_residue_mapping, c1_domains__residues),
                                     (c2_residue_mapping, c2_domains__residues)):
        for d in domains:
            domains_info.append(
                {'type': 'analyzed_domain',
                 'full_id': d.get_full_id(),
                 'pdb_code': d.structure_id,
                 'chain_id': d.chain_id,
                 'domain_id': d.domain_id,
                 'spans': d.get_spans(residue_mapping),
                 'spans_auth_seq_id': d.get_spans(residue_mapping, auth_seq_id=True),
                 })

    # todo record the number of domains going into the analyses

    # todo get_interdomain_surface is not among the arguments either, it is hardcoded
    # todo this is not a pairwise comparison but a single-structure analysis (same as get_domains), so it should
    #  rather be computed once per structure and loaded from the precomputed (serialized) results than recomputed
    #  here for every pair
    for chain_domains in (c1_domains__residues, c2_domains__residues):
        for d1, d2 in itertools.combinations(chain_domains, 2):
            serializer_or_analysis_handler.handle('2DA', get_interdomain_surface, get_interdomain_surface(d1, d2),
                                                  d1, d2)

    for d_chain1, d_chain2 in zip(c1_domains__residues, c2_domains__residues):
        for a in comparators__domains__residues_param:
            serializer_or_analysis_handler.handle('domain2domain', a, a(d_chain1, d_chain2), d_chain1, d_chain2)

    # todo resolve those weird ids
    for d_chain1, d_chain2 in zip(c1_domains__residues, c2_domains__residues):
        # Convert DomainResidues to DomainResidueData[ResidueId]
        # (through the mapping again; it would be nicer if the label_seq_id was carried by the residue itself)
        d_chain1_ids = DomainResidueData[ResidueId](
            [ResidueId.from_bio_residue(r, c1_residue_mapping) for r in d_chain1],
            d_chain1.structure_id, d_chain1.chain_id, d_chain1.domain_id)
        d_chain2_ids = DomainResidueData[ResidueId](
            [ResidueId.from_bio_residue(r, c2_residue_mapping) for r in d_chain2],
            d_chain2.structure_id, d_chain2.chain_id, d_chain2.domain_id)

        for a in comparators__domains__residue_ids_param:
            serializer_or_analysis_handler.handle('domain2domain', a, a(d_chain1_ids, d_chain2_ids),
                                                  d_chain1_ids, d_chain2_ids)

    # two-domain arrangements to two-domain arrangements
    for (d1_chain1, d1_chain2), (d2_chain1, d2_chain2) in itertools.combinations(zip(c1_domains__residues, c2_domains__residues), 2):
        # (in paper considered only if both the apo and holo interdomain interface are >= 200 A^2; not applied here)

        for a in comparators__2domains__residues_param:
            serializer_or_analysis_handler.handle('chain2DA2chain2DA', a, a(d1_chain1, d2_chain1, d1_chain2,
                                                                            d2_chain2),
                                                  d1_chain1,
                                                  d2_chain1, d1_chain2, d2_chain2)

        d1d2_chain1 = d1_chain1 + d2_chain1
        d1d2_chain2 = d1_chain2 + d2_chain2
        serializer_or_analysis_handler.handle('chain2DA2chain2DA', get_rmsd, get_rmsd(d1d2_chain1, d1d2_chain2),
                                              d1d2_chain1,
                                              d1d2_chain2)  # todo hardcoded analysis

예제 #6

파일 보기

def visualize_2DA(apo_2DA, holo_2DA, paper_apo_spans):
    """ Saves the holo structure superimposed onto apo and prints a Pymol script ready to be pasted into Pymol.

     The printed script:
     1) loads both structures (superimposed holo from the filesystem, apo is fetched from the internet)
     2) defines selections for the individual domains and for the two-domain arrangements
     3) colors the selections by domain, apo/holo and paper/ours
        - our domains are saturated, the paper ones faded
            - apo: red (first domain), yellow (second domain)
            - holo: green (first domain), blue (second domain)
     4) ends with a paragraph of example usage

     :param apo_2DA: apo two-domain arrangement (its `pdb_code`, `d1` and `d2` are read)
     :param holo_2DA: holo two-domain arrangement (its `pdb_code`, `d1` and `d2` are read)
     :param paper_apo_spans: [d1_spans, d2_spans], the paper's spans of the two apo domains
     """

    # load the structures from files
    apo_parsed = parse_mmcif(apo_2DA.pdb_code)
    holo_parsed = parse_mmcif(holo_2DA.pdb_code)
    apo = apo_parsed.structure
    holo = holo_parsed.structure

    ###### pasted from main
    apo_mapping = apo_parsed.bio_to_mmcif_mappings[0][apo_2DA.d1.chain_id]
    holo_mapping = holo_parsed.bio_to_mmcif_mappings[0][holo_2DA.d1.chain_id]

    # crop polypeptides to longest common substring
    apo_poly_seq = apo_parsed.poly_seqs[apo_mapping.entity_poly_id]
    holo_poly_seq = holo_parsed.poly_seqs[holo_mapping.entity_poly_id]
    c1_common_seq, c2_common_seq = get_longest_common_polypeptide(apo_poly_seq, holo_poly_seq)
    c1_label_seq_ids = list(c1_common_seq.keys())
    c2_label_seq_ids = list(c2_common_seq.keys())

    # shift between apo and holo label seq ids; used below to transfer the paper domains onto holo
    label_seq_id_offset = c2_label_seq_ids[0] - c1_label_seq_ids[0]
    ###### end of the pasted part

    # get residues of the first domain, in both apo and holo structures
    apo_d1 = DomainResidues.from_domain(apo_2DA.d1, apo[0], apo_mapping)
    holo_d1 = DomainResidues.from_domain(holo_2DA.d1, holo[0], holo_mapping)
    # superimpose holo onto apo, using the first domain
    superimposed_holo_model = superimpose_structure(holo[0], holo_d1, apo_d1)

    # save the superimposed model as a structure of its own
    output_name = holo.id + f'_{holo_d1.domain_id}onto_{apo_d1.domain_id}'
    superimposed_holo = Structure(output_name)
    superimposed_holo.add(superimposed_holo_model)
    cif_writer = MMCIFIO()
    cif_writer.set_structure(superimposed_holo)
    sholo_file_path = Path(OUTPUT_DIR, output_name + '.cif')
    cif_writer.save(str(sholo_file_path), preserve_atom_numbering=True)

    def get_resi_selection(spans):
        # one `resi from-to` term per span, or-ed together inside parentheses
        terms = [f'resi {from_}-{to}' for from_, to in spans]
        return '(' + ' or '.join(terms) + ')'

    # convert paper spans to label seqs, so we can show them in Pymol
    def get_paper_domain(d: DomainResidueMapping, paper_spans, residue_id_mapping):
        # translate spans to label seq ids and return a domain object
        span_array = np.array(paper_spans)
        to_label_seq = residue_id_mapping.find_label_seq
        segment_beginnings = [to_label_seq(begin) for begin in span_array[:, 0].tolist()]
        segment_ends = [to_label_seq(end) for end in span_array[:, 1].tolist()]
        logger.debug(segment_beginnings)
        logger.debug(segment_ends)
        return DomainResidueMapping(d.domain_id, d.chain_id, segment_beginnings, segment_ends)

    logger.debug(paper_apo_spans)  # [d1, d2] where d1 [(), (),...]
    paper_apo_drm1 = get_paper_domain(apo_2DA.d1, paper_apo_spans[0], apo_mapping)
    paper_apo_drm2 = get_paper_domain(apo_2DA.d2, paper_apo_spans[1], apo_mapping)

    # the second domain is placed on the same chain as d1, for now
    holo_chain_id = holo_d1.chain_id
    paper_holo_drm1 = DomainResidueMapping.from_domain_on_another_chain(paper_apo_drm1, holo_chain_id, label_seq_id_offset)
    paper_holo_drm2 = DomainResidueMapping.from_domain_on_another_chain(paper_apo_drm2, holo_chain_id, label_seq_id_offset)

    # create highlight script (by the spans, or just create multiple selections)
    # copy the 2 structures to 4 (paper spans vs our spans), so we can color them differently
    # select only the domains (2), and make only them visible

    sholo = superimposed_holo

    pymol_script = f"""
fetch {apo.id}
load {sholo_file_path.absolute()}

sele apo_d1, {apo.id} and chain {apo_2DA.d1.chain_id} and {get_resi_selection(apo_2DA.d1.get_spans())}
sele apo_d2, {apo.id} and chain {apo_2DA.d2.chain_id} and {get_resi_selection(apo_2DA.d2.get_spans())}
sele apo_2DA, apo_d1 or apo_d2

sele holo_d1, {sholo.id} and chain {holo_2DA.d1.chain_id} and {get_resi_selection(holo_2DA.d1.get_spans())}
sele holo_d2, {sholo.id} and chain {holo_2DA.d2.chain_id} and {get_resi_selection(holo_2DA.d2.get_spans())}
sele holo_2DA, holo_d1 or holo_d2

# copy objects, so we can color them differently
copy paper_{apo.id}, {apo.id}
copy paper_{sholo.id}, {sholo.id}

sele paper_apo_d1, paper_{apo.id} and chain {apo_2DA.d1.chain_id} and {get_resi_selection(paper_apo_drm1.get_spans())}
sele paper_apo_d2, paper_{apo.id} and chain {apo_2DA.d2.chain_id} and {get_resi_selection(paper_apo_drm2.get_spans())}
sele paper_apo_2DA, paper_apo_d1 or paper_apo_d2

sele paper_holo_d1, paper_{sholo.id} and chain {holo_2DA.d1.chain_id} and {get_resi_selection(paper_holo_drm1.get_spans())}
sele paper_holo_d2, paper_{sholo.id} and chain {holo_2DA.d2.chain_id} and {get_resi_selection(paper_holo_drm2.get_spans())}
sele paper_holo_2DA, paper_holo_d1 or paper_holo_d2

color red, apo_d1
color yellow, apo_d2
color green, holo_d1
color blue, holo_d2

color salmon, paper_apo_d1
color paleyellow, paper_apo_d2
color palegreen, paper_holo_d1
color lightblue, paper_holo_d2

# example usage: 
hide; show surface, apo_2DA
hide; show surface, paper_apo_2DA
hide; show surface, holo_2DA
hide; show surface, paper_holo_2DA

hide; show surface, apo_2DA or holo_2DA or paper_apo_2DA or paper_holo_2DA
    """

    print(pymol_script)

예제 #7

파일 보기

def compare_chains(
    chain1: Chain,
    chain2: Chain,
    c1_residue_mapping: BiopythonToMmcifResidueIds.Mapping,
    c2_residue_mapping: BiopythonToMmcifResidueIds.Mapping,
    c1_seq: Dict[int, str],
    c2_seq: Dict[int, str],  # in 3-letter codes
    lcs_result: LCSResult,
    # comparators__residues_param: List[Analyzer],  # todo tohle v pythonu 3.8 neni,
    # # ale spis bych mel mit duck-typing type annotations, protože to muze byt wrapply v cachich/a (de)serializerech..
    # # a nebude to inheritance ale composition (napr nebudu muset delat subclassy pro kazdy analyzer na to, abych tam pridal cache funkcionalitu...
    # comparators__residue_ids_param: List[Analyzer],
    # comparators__domains__residues_param: List[Analyzer],
    # comparators__domains__residue_ids_param: List[Analyzer],
    # comparators__2DA__residues_param: List[Analyzer],
    # get_domains,
    # get_rmsd,
    # get_interdomain_surface,
    # serializer_or_analysis_handler: AnalysisHandler,
    # domains_info: list,
    # one_struct_analyses_done_set: dict,
) -> None:
    """ Runs comparisons between two chains. E.g. one ligand-free (apo) and another ligand-bound (holo).
    :param chain1: A Bio.PDB Chain, obtained as a part of BioPython Structure object as usual
    :param chain2: A corresponding chain (same sequence), typically from a different PDB structure. See chain1.

    :param c1_residue_mapping:
    :param apo_poly_seqs:
    """
    s1_pdb_code = chain1.get_parent().get_parent().id
    s2_pdb_code = chain2.get_parent().get_parent().id

    logger.info(f'running analyses for ({s1_pdb_code}, {s2_pdb_code}) pair...')
    #
    # with warnings.catch_warnings():
    #     warnings.simplefilter("ignore")
    #     pp1 = chain_to_polypeptide(chain1)
    #     pp2 = chain_to_polypeptide(chain2)

    # c1_seq, c2_seq todo, is the order in atom_site loop guaranteed? If not, I should sort the dict by label_seq_id
    # also todo, is label_seq_id sequential, that is one-by-one always +1? - I use that contiguity assumption already...
    # - todo assert in code entity_poly_seq have no gaps (always +1), they say they're sequential, I think they mean exactly this
    #    - I could have done in filter structures, just to know for sure. If it were extensible already

    # crop polypeptides to longest common substring
    # todo - vzit z inputu (mam i1 a i2, staci ziskat offset a real label seq)
    # crop polypeptides to longest common substring
    i1, i2, length = lcs_result.i1, lcs_result.i2, lcs_result.length
    c1_common_seq = dict(itertools.islice(c1_seq.items(), i1, i1 + length))
    c2_common_seq = dict(itertools.islice(c2_seq.items(), i2, i2 + length))
    c1_label_seq_ids = list(c1_common_seq.keys())
    c2_label_seq_ids = list(c2_common_seq.keys())

    label_seq_id_offset = c2_label_seq_ids[0] - c1_label_seq_ids[0]

    # up to this point, we have residue ids of the protein sequence in the experiment. This also includes unobserved
    # residues, but those we will exclude from our analysis as their positions weren't determined
    c1_residues, c1_label_seq_ids, c2_residues, c2_label_seq_ids = get_observed_residues(
        chain1,
        c1_label_seq_ids,
        c1_residue_mapping,
        chain2,
        c2_label_seq_ids,
        c2_residue_mapping,
    )

    c1_residues = ChainResidues(c1_residues, s1_pdb_code, chain1.id)
    c2_residues = ChainResidues(c2_residues, s2_pdb_code, chain2.id)

    # todo trochu nesikovny
    c1_residue_ids = ChainResidueData[ResidueId]([
        ResidueId(label_seq_id, chain1.id) for label_seq_id in c1_label_seq_ids
    ], s1_pdb_code, chain1.id)
    c2_residue_ids = ChainResidueData[ResidueId]([
        ResidueId(label_seq_id, chain2.id) for label_seq_id in c2_label_seq_ids
    ], s2_pdb_code, chain2.id)

    # [done] tady nahradit pp pomocí apo_seq nějak
    # [done] v analyzerech (APIs) nahradit author_seq_id
    # todo tady matchovaní domén pomocí tohodle - zas mohu pouzit Sequence Matcher
    #   - ale spany, je to složitější -> zatím přeindexovat apo nebo holo do druhý...

    def run_analysis(level_tag, analysis, *args):
        try:
            plocal.serializer_or_analysis_handler.handle(
                level_tag, analysis, analysis(*args), *args)
        except AnalysisException:
            logger.exception(
                'Caught exception while computing analysis, all others will be run normally'
            )

    for a in plocal.comparators__residues_param:
        # this fn (run_analyses_for_isoform_group) does not know anything about serialization?
        # But it will know how nested it is (domain->structure) and can pass full identifiers of structures/domains

        run_analysis(
            'chain2chain', a, c1_residues, c2_residues
        )  # in future maybe pass apo and holo. Will serialize itself. And output the object in rdf for example?
        # because what I would like is to output the analysis with objects identifiers, and then output the objects, what they contain (e.g. domain size?)

    for a in plocal.comparators__residue_ids_param:
        run_analysis('chain2chain', a, c1_residue_ids, c2_residue_ids)

    # domain-level analyses

    # get domains (set of auth_seq_id), sort them by domain id and hope they will correspond to each other
    # or could map corresponding domains by choosing the ones that have the most overlap?
    try:
        c1_domains = sorted(filter(lambda d: d.chain_id == chain1.id,
                                   plocal.get_domains(s1_pdb_code)),
                            key=lambda d: d.domain_id)
        c2_domains = sorted(filter(lambda d: d.chain_id == chain2.id,
                                   plocal.get_domains(s2_pdb_code)),
                            key=lambda d: d.domain_id)

        # # todo tohle muzu dat uplne jinam.. treba do 1struct remote analyses, jestli to dá do ramky celej json vubec?
        # no ale ja musim nejak vyhodnotit ty analyzy v tom jupyter notebooku, jak to vůbec nactu do ramky úplně všechno vlastně??
        # asi nenačtu... Takže to musim zrušit z jupytera, nebo si udělám interactive job a pustim jupytera na metacentru s ramkou 48 gb treba a pripojim se na nej
        # jde tam vubec delat server?

        for pdb_code, domains in ((s1_pdb_code, c1_domains), (s2_pdb_code,
                                                              c2_domains)):
            key = (plocal.get_domains.get_name(), pdb_code)

            if key not in plocal.one_struct_analyses_done_set:  # todo, tady jsem to zapomnel nastavit, takze to vlastne nepouzivam. Jsou tam dupes, ale je to jedno v podstate..
                for d in domains:
                    plocal.domains_info.append({
                        'type':
                        'full_domain',
                        'full_id': (pdb_code, d.chain_id, d.domain_id),
                        'pdb_code':
                        pdb_code,
                        'chain_id':
                        d.chain_id,
                        'domain_id':
                        d.domain_id,
                        'spans':
                        d.get_spans(),
                    })

    except APIException as e:
        if e.__cause__ and '404' in str(e.__cause__):
            logger.warning(
                f'{s1_pdb_code} {s2_pdb_code} no domains found, skip the domain-level analysis'
            )
            return  # no domains found, skip the domain-level analysis
        raise
    except MissingDataException:
        logger.warning(
            f'{s1_pdb_code} {s2_pdb_code} no domains found, skip the domain-level analysis'
        )
        return  # no domains found, skip the domain-level analysis

    # assert len(c1_domains) == len(c2_domains) # not always true, as expected, but now OK

    # SequenceMatcher on domain resiudes
    c1_domains__residues = []
    c2_domains__residues = []

    # todo zatim to necham, ale uz mam i to defakto domain lcs
    for c1_d in c1_domains:  # or c2_domains:
        # first remap first domain to second (or in future use longest common substrings, but not trivial since domains can be composed of multiple segments)
        # offset nemusí být všude stejný
        c1_domain_mapped_to_c2 = DomainResidueMapping.from_domain_on_another_chain(
            c1_d, chain2.id, label_seq_id_offset)

        # todo: why chain.get_parent?? Probably so that I don't have to specify the chain (but now I work only with chains..)
        c1_d_residues = DomainResidues.from_domain(
            c1_d, chain1.get_parent(), c1_residue_mapping,
            lambda id: id not in c1_label_seq_ids)
        c2_d_residues = DomainResidues.from_domain(
            c1_domain_mapped_to_c2, chain2.get_parent(), c2_residue_mapping,
            lambda id: id not in c2_label_seq_ids)

        if not c1_d_residues or not c2_d_residues:
            # the domain is not within the processed LCS of both chains (empty intersection with chain residues)
            logger.warning(
                f'domain {c1_d.domain_id} is not within the processed LCS of both chains (empty '
                f'intersection with '
                f'chain residues)')
            continue

        c1_domains__residues.append(
            DomainResidues(c1_d_residues.data, c1_d_residues.structure_id,
                           c1_d_residues.chain_id, c1_d_residues.domain_id))
        c2_domains__residues.append(
            DomainResidues(c2_d_residues.data, c2_d_residues.structure_id,
                           c2_d_residues.chain_id, c2_d_residues.domain_id))

    for residue_mapping, domains in ((c1_residue_mapping,
                                      c1_domains__residues),
                                     (c2_residue_mapping,
                                      c2_domains__residues)):
        for d in domains:
            # Of course it is silly to store this every time, when it can be paired with something several times...
            # EDIT: Not entirely, it is cropped to the observed residues of both members of the pair (and the LCS),
            # but I would have to modify the id and somehow put the pair into it...
            # dupes across jobs can be removed via ['pair_id', 'full_id']
            plocal.domains_info.append({
                'type':
                'analyzed_domain',
                'pair_id':
                (c1_residues.get_full_id(), c2_residues.get_full_id()
                 ),  # domain cropping depends on the paired chains
                'full_id':
                d.get_full_id(),
                'pdb_code':
                d.structure_id,
                'chain_id':
                d.chain_id,
                'domain_id':
                d.domain_id,
                'spans':
                d.get_spans(residue_mapping),
                'spans_auth_seq_id':
                d.get_spans(residue_mapping, auth_seq_id=True),
            })

    # todo: this is not in the arguments either, but hardcoded.., that is that fix of mine...
    # todo: this is in fact not a pairwise comparison.., but a 'unit' (per-structure) one
    #  - same as get domains, get_ss (not compare ss), actually also sequence etc.
    #  - so caching the surface area does not make sense, it simply gets loaded from the precomputed results, as usual
    #  - or, simply, only these structure-level things make sense to "cache", i.e. not to compute here, for every pair, but
    #  - load from a file/unpickle - probably not, rather have serialize/deserialize (I want it as readable output anyway).
    #  -  And then, for speed, just deserialize all of it into memory...
    # well, so I can look forward to json/pandas-merge hell again.. Merges everywhere.. Duplicate cols everywhere/making an index (which you then sometimes need back in the cols...)

    # TODO: either put it into 1struct (but I don't want to load mmCIFs there - I'd say that is very slow (what percentage of the time was it, 17?, certainly a lot... I don't know if I could load 40K structures that quickly.. probably not)
    # the 50th/75th percentile takes 0.5 s, so easily 40K seconds = 10 hours... long, even with many threads..
    # so I will do it once per job..
    # this is still with domain args, doesn't matter
    # probably not.. I said that each domain can be defined differently.. it depends on the other member of the pair..
    # so todo: add the pair id again?
    #                - but then I will have to modify something again...
    for chain_domains in (c1_domains__residues, c2_domains__residues):
        for d1, d2 in itertools.combinations(chain_domains, 2):
            # key = (plocal.get_interdomain_surface.get_name(),) + (d1.get_full_id(), d2.get_full_id())

            pair_id = (c1_residues.get_full_id(), c2_residues.get_full_id())
            # hack: I want the pair_id in there too
            plocal.serializer_or_analysis_handler.handle(
                '2DA', plocal.get_interdomain_surface,
                plocal.get_interdomain_surface(d1, d2), d1, d2, pair_id)
            # if key not in plocal.one_struct_analyses_done_set:
            # plocal.one_struct_analyses_done_set[key] = 1

    for d_chain1, d_chain2 in zip(c1_domains__residues, c2_domains__residues):
        for a in plocal.comparators__domains__residues_param:
            run_analysis('domain2domain', a, d_chain1, d_chain2)

    # todo: sort out those weird ids
    for d_chain1, d_chain2 in zip(c1_domains__residues, c2_domains__residues):
        # Convert DomainResidues to DomainResidueData[ResidueId]
        # probably via the mapping again... it would be better if it were implicitly hidden, e.g. on the biopython residue (
        # otherwise it would hardly be possible, unless having a CustomResidue with fields bioresidue and label_seq_id, but that is
        # rather lousy, or is it? It would not work with a chain, but I don't use that anywhere anyway...
        d_chain1 = DomainResidueData[ResidueId]([
            ResidueId.from_bio_residue(r, c1_residue_mapping) for r in d_chain1
        ], d_chain1.structure_id, d_chain1.chain_id, d_chain1.domain_id)
        d_chain2 = DomainResidueData[ResidueId]([
            ResidueId.from_bio_residue(r, c2_residue_mapping) for r in d_chain2
        ], d_chain2.structure_id, d_chain2.chain_id, d_chain2.domain_id)

        for a in plocal.comparators__domains__residue_ids_param:
            run_analysis('domain2domain', a, d_chain1, d_chain2)

    # two-domain arrangements to two-domain arrangements
    for (d1_chain1,
         d1_chain2), (d2_chain1, d2_chain2) in itertools.combinations(
             zip(c1_domains__residues, c2_domains__residues), 2):
        # (in the paper, a pair was considered only if both the apo and holo interdomain interfaces were >= 200 A^2)
        # if get_interdomain_surface(d1_chain1, d2_chain1) < 200 or get_interdomain_surface(d1_chain2, d2_chain2) < 200:
        #     continue

        for a in plocal.comparators__2DA__residues_param:
            run_analysis('chain2DA2chain2DA', a, d1_chain1, d2_chain1,
                         d1_chain2, d2_chain2)

        d1d2_chain1 = d1_chain1 + d2_chain1
        d1d2_chain2 = d1_chain2 + d2_chain2
        run_analysis('chain2DA2chain2DA', plocal.get_rmsd, d1d2_chain1,
                     d1d2_chain2)  # todo hardcoded analysis