def member_info(self, member):
        with self.session() as session:
            info = session.query(mod.IfeInfo.pdb_id.label('pdb'),
                                 mod.IfeInfo.model).\
                filter_by(ife_id=member['id']).\
                one()
            info = row2dict(info)
            info.update(member)

            with self.session() as session:
                query = session.query(mod.ChainInfo.chain_name,
                                      mod.IfeChains.is_structured,
                                      ).\
                    join(mod.IfeChains,
                         mod.IfeChains.chain_id == mod.ChainInfo.chain_id).\
                    filter_by(ife_id=member['id'])

                if not query.count():
                    raise core.InvalidState("Could not find chains for %s" %
                                            member)

                all_chains = [row2dict(c) for c in query]
                valid = op.itemgetter('is_structured')
                chains = [c['chain_name'] for c in all_chains if valid(c)]
                if not chains:
                    chains = [c['chain_name'] for c in all_chains]

            info['chains'] = chains
            loader = self._create(IfeLoader)
            info['sym_op'] = loader.sym_op(info['pdb'])

            return info
    def position_info(self, unit):
        """Get the information about a position in an experimental sequence
        using a unit id.
        """

        self.logger.debug("Finding position for %s", unit)
        try:
            with self.session() as session:
                pos = mod.ExpSeqPosition
                mapping = mod.ExpSeqUnitMapping
                result = session.query(pos.index,
                                       pos.exp_seq_id,
                                       mod.UnitInfo.chain,
                                       mod.UnitInfo.model,
                                       mod.UnitInfo.sym_op,
                                       ).\
                    join(mapping,
                         mapping.exp_seq_position_id == pos.exp_seq_position_id).\
                    join(mod.UnitInfo,
                         mod.UnitInfo.unit_id == mapping.unit_id).\
                    filter(mapping.unit_id == unit).\
                    one()

            return row2dict(result)
        except Exception:
            # handle the case where the unit id in the database table ends with ||A or ||B
            # but that is not being stored in unit.  Not sure why not.
            self.logger.info('Looking up sequence position of alternates of ' +
                             unit)
            newunit = '%' + unit + '%'
            with self.session() as session:
                pos = mod.ExpSeqPosition
                mapping = mod.ExpSeqUnitMapping
                result = session.query(pos.index,
                                       pos.exp_seq_id,
                                       mod.UnitInfo.chain,
                                       mod.UnitInfo.model,
                                       mod.UnitInfo.sym_op,
                                       ).\
                    join(mapping,
                         mapping.exp_seq_position_id == pos.exp_seq_position_id).\
                    join(mod.UnitInfo,
                         mod.UnitInfo.unit_id == mapping.unit_id).\
                    filter(mapping.unit_id.like(newunit)).\
                    first()

            if not result:
                self.logger.info('No experimental sequence position for ' +
                                 unit)
                return None

            return row2dict(result)
# Example #3
    def unit_mapping(self, pdb):
        """
        Create a dictionary that maps from data produced by `as_key` to unit
        ids that are in the database. This will look up all unit ids in the
        database and create the required mapping.

        Parameters
        ----------
        pdb : str
            The pdb id to look up a mapping for

        Returns
        -------
        mapping : dict
            The mapping dictionary to use.
        """
        mapping = coll.defaultdict(set)
        with self.session() as session:
            query = session.query(mod.UnitInfo).\
                filter_by(pdb_id=pdb)

            for result in query:
                entry = ut.row2dict(result)
                generic_key = as_key(entry, ignore_model=True)
                model_key = as_key(entry)
                mapping[generic_key].add(result.unit_id)
                mapping[model_key].add(result.unit_id)

        return mapping
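# Hedged usage sketch (not part of the original file; `loader` and the pdb id
# are hypothetical): unit_mapping above returns a dict keyed by whatever
# as_key produces, and each key maps to the set of matching unit ids, so a
# lookup can tolerate a mismatched model number via the ignore_model key.
#
# mapping = loader.unit_mapping('157D')
# for key, unit_ids in mapping.items():
#     print(key, sorted(unit_ids))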
    def load_nr_classes(self, release, resolution):
        with self.session() as session:
            query = session.query(
                mod.NrChains.rank.label('index'),
                mod.NrChains.ife_id.label('id'),
                mod.IfeInfo.pdb_id,
                mod.IfeInfo.length,
                mod.IfeChains.chain_id,
                mod.NrClasses.name,
            ).\
                join(mod.IfeInfo, mod.IfeInfo.ife_id == mod.NrChains.ife_id).\
                join(mod.IfeChains,
                     mod.IfeChains.ife_id == mod.IfeInfo.ife_id).\
                join(mod.NrClasses,
                     mod.NrClasses.nr_class_id == mod.NrChains.nr_class_id).\
                filter(mod.NrClasses.nr_release_id == release).\
                filter(mod.NrClasses.resolution == resolution).\
                filter(mod.IfeChains.index == 0)

            data = coll.defaultdict(list)
            for result in query:
                entry = row2dict(result)
                entry['rep'] = (entry['index'] == 0)
                nr = entry['name']
                data[nr].append(entry)
        return data.values()
    def normalized_mapping(self, pdb_id):
        """This produces a dictonary that can be used to correct bad unit ids.
        Some of the loops stored after we migrated the database have incorrect
        unit ids. The errors appear to be of 2 kinds, incorrect model number
        and possibly bad alt ids. By producing this mapping we try to correct
        the issue by finding the correct unit id.

        :param str pdb_id: The pdb id to get units for.
        :returns: A dictionary with Unit keys mapping to the unit id.
        """
        with self.session() as session:
            query = session.query(mod.UnitInfo.unit_id,
                                  mod.UnitInfo.pdb_id.label('pdb'),
                                  mod.UnitInfo.model,
                                  mod.UnitInfo.chain,
                                  mod.UnitInfo.number.label('component_number'),
                                  mod.UnitInfo.ins_code.label('insertion_code'),
                                  mod.UnitInfo.alt_id,
                                  mod.UnitInfo.sym_op.label('symmetry'),
                                  ).\
                filter(mod.UnitInfo.pdb_id == pdb_id)

            if not query.count():
                raise core.InvalidState("No units in %s" % pdb_id)

            mapping = {}
            for result in query:
                data = row2dict(result)
                unit_id = data.pop('unit_id')
                key = Unit(**data)
                if key in mapping:
                    raise core.InvalidState("Non unique mapping")
                mapping[key] = unit_id
        return mapping
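# Hedged usage sketch (not part of the original file; `loader` and the field
# values are hypothetical): a suspect unit id can be corrected by rebuilding
# its Unit key from the fields used above and looking it up in the mapping.
#
# mapping = loader.normalized_mapping('1GID')
# key = Unit(pdb='1GID', model=1, chain='A', component_number=103,
#            insertion_code=None, alt_id=None, symmetry='1_555')
# corrected = mapping.get(key)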
# Example #6
    def incomplete(self, pdb):
        """Load all incomplete nucleotides from the database. This will query
        the unit_incomplete for all incomplete data.

        Parameters
        ----------
        pdb : str
            The pdb id to use.

        Returns
        -------
        incomplete : set
            A set of unit ids that are incomplete.
        """
        with self.session() as session:
            query = session.query(mod.UnitIncomplete.pdb_id,
                                  mod.UnitIncomplete.model,
                                  mod.UnitIncomplete.chain,
                                  mod.UnitIncomplete.number,
                                  mod.UnitIncomplete.unit,
                                  mod.UnitIncomplete.alt_id,
                                  mod.UnitIncomplete.ins_code,
                                  ).\
                filter_by(pdb_id=pdb)
        return {Entry(**row2dict(r)) for r in query}
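# Note: `Entry` is imported elsewhere in the original module. A minimal
# sketch consistent with the columns selected above (an assumption, not the
# original definition):
#
# import collections as coll
# Entry = coll.namedtuple('Entry', ['pdb_id', 'model', 'chain', 'number',
#                                   'unit', 'alt_id', 'ins_code'])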
    def interactions(self, pdb):
        """Lookup all interactions for the given structure. This gets all
        interaction entries. If there are none this returns an empty list. The
        entries in the list are dictionaries with the same names as in
        `Exporter.headers`.

        Parameters
        ----------
        pdb : str
            The PDB id to look up interactions for

        Returns
        -------
        interactions : list
            A list of all interactions.
        """

        with self.session() as session:
            query = session.query(
                mod.UnitPairsInteractions.unit_id_1.label(self.headers[0]),
                mod.UnitPairsInteractions.unit_id_2.label(self.headers[1]),
                mod.UnitPairsInteractions.f_lwbp.label(self.headers[2]),
                mod.UnitPairsInteractions.f_stacks.label(self.headers[3]),
                mod.UnitPairsInteractions.f_bphs.label(
                    self.headers[4])).filter_by(pdb_id=pdb)

            count = query.count()
            if not count:
                self.logger.warning("No interactions found for %s", pdb)
            else:
                self.logger.info("Found %s interactions for %s", count, pdb)

            return [row2dict(result) for result in query]
    def mapping(self, pdb):
        """Create a dictionary that maps from data produced by `as_key` to unit
        ids that are in the database. This will look up all unit ids in the
        database and create the required mapping.

        Parameters
        ----------
        pdb : str
            The pdb id to look up a mapping for

        Returns
        -------
        mapping : dict
            The mapping dictionary to use.
        """

        mapping = coll.defaultdict(list)
        with self.session() as session:
            query = session.query(mod.UnitInfo).\
                filter_by(pdb_id=pdb)

            for result in query:
                key = as_key(ut.row2dict(result))
                mapping[key].append(result.unit_id)

        return mapping
# Example #10
    def load_quality(self, members):
        def as_quality(data):
            return {
                'has': {key for key, value in data.items() if value},
                'rsrz': data.get('rsrz') or 100,
                'backbone': data.get('backbone') or 100,
                'clashscore': data.get('clashscore') or 500,
            }

        known = {m['pdb'] for m in members}
        with self.session() as session:
            query = session.query(mod.PdbQuality.pdb_id,
                                  mod.PdbQuality.percent_rsrz_outliers.
                                  label('rsrz'),
                                  mod.PdbQuality.clashscore,
                                  mod.PdbQuality.percent_rota_outliers.
                                  label('backbone'),
                                  ).\
                filter(mod.PdbQuality.pdb_id.in_(known))

            measures = {}
            for result in query:
                result = row2dict(result)
                pdb_id = result.pop('pdb_id')
                measures[pdb_id] = as_quality(result)

        for member in members:
            pdb_id = member['pdb']
            member['quality'] = measures.get(pdb_id, as_quality({}))

        return members
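# Worked example (not in the original file): a member whose pdb id has no row
# in pdb_quality falls back to as_quality({}) above, i.e. an empty 'has' set
# and the permissive defaults:
#
#   as_quality({}) == {'has': set(), 'rsrz': 100, 'backbone': 100,
#                      'clashscore': 500}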
    def chain_status(self, reps, release, resolution, **kwargs):
        with self.session() as session:
            query = session.query(mod.NrClasses.name.label('Group'),
                                  mod.NrClasses.nr_release_id.label('Release'),
                                  mod.NrChains.ife_id.label('IFE'),
                                  mod.IfeInfo.bp_count.label('BP'),
                                  mod.IfeInfo.pdb_id.label('PDB'),
                                  mod.IfeInfo.length.label('NT'),
                                  mod.NrChains.rep,
                                  ).\
                join(mod.NrChains,
                     mod.NrChains.nr_class_id == mod.NrClasses.nr_class_id).\
                join(mod.IfeInfo,
                     mod.IfeInfo.ife_id == mod.NrChains.ife_id).\
                filter(mod.NrClasses.nr_release_id == release).\
                filter(mod.NrClasses.resolution == resolution)

            data = []
            for result in query:
                entry = row2dict(result)
                rep = entry.pop('rep')
                entry['Current'] = 'Member'
                entry['Ratio'] = round(float(entry['BP']) / entry['NT'], 4)
                if rep:
                    entry['Current'] = 'Representative'
                for method, status in reps[entry['IFE']].items():
                    entry[method] = 'Member'
                    if status:
                        entry[method] = 'Representative'
                data.append(entry)
            return data
    def chains(self, release_id, resolution):
        with self.session() as session:
            chains = mod.NrChains
            ife = mod.IfeInfo
            pdbs = mod.PdbInfo
            classes = mod.NrClasses
            query = session.query(chains.nr_release_id,
                                  classes.name,
                                  classes.handle,
                                  classes.version,
                                  ife.ife_id.label('id'),
                                  ife.bp_count.label('bp'),
                                  ife.length,
                                  pdbs.resolution,
                                  pdbs.experimental_technique.label('method'),
                                  ).\
                join(ife, ife.ife_id == chains.ife_id).\
                join(classes, classes.nr_class_id == chains.nr_class_id).\
                join(pdbs, pdbs.pdb_id == ife.pdb_id).\
                filter(classes.nr_release_id == release_id).\
                filter(classes.resolution == resolution).\
                order_by(classes.name)

            found = [row2dict(r) for r in query]
            grouped = it.groupby(found, op.itemgetter('name'))
            return [list(g) for n, g in grouped]
    def normalized_mapping(self, pdb_id):
        """This produces a dictonary that can be used to correct bad unit ids.
        Some of the loops stored after we migrated the database have incorrect
        unit ids. The errors appear to be of 2 kinds, incorrect model number
        and possibly bad alt ids. By producing this mapping we try to correct
        the issue by finding the correct unit id.

        :param str pdb_id: The pdb id to get units for.
        :returns: A dictionary with Unit keys mapping to the unit id.
        """
        with self.session() as session:
            query = session.query(mod.UnitInfo.unit_id,
                                  mod.UnitInfo.pdb_id.label('pdb'),
                                  mod.UnitInfo.model,
                                  mod.UnitInfo.chain,
                                  mod.UnitInfo.number.label('component_number'),
                                  mod.UnitInfo.ins_code.label('insertion_code'),
                                  mod.UnitInfo.alt_id,
                                  mod.UnitInfo.sym_op.label('symmetry'),
                                  ).\
                filter(mod.UnitInfo.pdb_id == pdb_id)

            if not query.count():
                raise core.InvalidState("No units in %s" % pdb_id)

            mapping = {}
            for result in query:
                data = row2dict(result)
                unit_id = data.pop('unit_id')
                key = Unit(**data)
                if key in mapping:
                    raise core.InvalidState("Non unique mapping")
                mapping[key] = unit_id
        return mapping
    def pdb_info(self, ifes):
        pdb_ids = self.class_property(ifes, 'pdb_id')
        with self.session() as session:
            query = session.query(
                mod.PdbInfo.pdb_id.label('PDB'),
                mod.PdbInfo.resolution.label('Resolution'),
                mod.PdbInfo.experimental_technique.label('Method'),
                mod.PdbInfo.title.label('Title'),
            ).filter(mod.PdbInfo.pdb_id.in_(pdb_ids))

            data = {}
            for result in query:
                entry = row2dict(result)
                method = entry.pop('Method')
                if method == 'X-RAY DIFFRACTION':
                    method = 'x-ray'
                elif method == 'SOLUTION NMR':
                    method = 'nmr'
                elif method == 'ELECTRON MICROSCOPY':
                    method = 'cryo-em'
                elif method == 'FIBER DIFFRACTION':
                    method = 'fib-dif'
                elif method == 'FLUORESCENCE TRANSFER':
                    method = 'fluo-trans'
                elif method == 'SOLID-STATE NMR':
                    method = 'nmr-sld-sta'
                elif method == 'SOLUTION NMR, SOLUTION SCATTERING':
                    method = 'nmr-sol-scat'
                elif method == 'SOLUTION NMR, THEORETICAL MODEL':
                    method = 'nmr-sol-theo'
                entry['Method'] = method
                data[entry['PDB']] = entry
        return data
    def units_between(self, unit1, unit2):
        """Get a list of all units between two units. This assumes they are on
        the same chain and have the same symmetry operator.
        """

        start = self.position_info(unit1)
        stop = self.position_info(unit2)
        with self.session() as session:
            units = mod.UnitInfo
            mapping = mod.ExpSeqUnitMapping
            pos = mod.ExpSeqPosition
            query = session.query(units.pdb_id,
                                  units.model,
                                  units.chain,
                                  units.number,
                                  units.unit,
                                  units.alt_id,
                                  units.ins_code,
                                  ).\
                join(mapping,
                     mapping.unit_id == units.unit_id).\
                join(pos,
                     mapping.exp_seq_position_id == pos.exp_seq_position_id).\
                filter(pos.exp_seq_id == start['exp_seq_id']).\
                filter(pos.index >= start['index']).\
                filter(pos.index <= stop['index']).\
                filter(units.chain == start['chain']).\
                filter(units.model == start['model']).\
                filter(units.sym_op == start['sym_op']).\
                distinct().\
                order_by(asc(pos.index))

            return [Entry(**row2dict(r)) for r in query]
# Example #16
    def units_between(self, unit1, unit2):
        """Get a list of all units between two units. This assumes they are on
        the same chain and have the same symmetry operator.
        """

        start = self.position_info(unit1)
        stop = self.position_info(unit2)
        with self.session() as session:
            units = mod.UnitInfo
            mapping = mod.ExpSeqUnitMapping
            pos = mod.ExpSeqPosition
            query = session.query(units.pdb_id,
                                  units.model,
                                  units.chain,
                                  units.number,
                                  units.unit,
                                  units.alt_id,
                                  units.ins_code,
                                  ).\
                join(mapping,
                     mapping.unit_id == units.unit_id).\
                join(pos,
                     mapping.exp_seq_position_id == pos.exp_seq_position_id).\
                filter(pos.exp_seq_id == start['exp_seq_id']).\
                filter(pos.index >= start['index']).\
                filter(pos.index <= stop['index']).\
                filter(units.chain == start['chain']).\
                filter(units.model == start['model']).\
                filter(units.sym_op == start['sym_op']).\
                distinct().\
                order_by(asc(pos.index))

            return [Entry(**row2dict(r)) for r in query]
# Example #17
    def positions(self, pdb, chain):
        exp_seq = self.exp_seq(pdb, chain)
        with self.session() as session:
            esum = mod.ExpSeqUnitMapping
            esp = mod.ExpSeqPosition
            escm = mod.ExpSeqChainMapping
            ci = mod.ChainInfo
            query = session.query(
                esum.unit_id,
                esp.index,
                esp.unit,
            ).join(esp, esp.exp_seq_position_id == esum.exp_seq_position_id).\
                join(escm, escm.exp_seq_chain_mapping_id == esum.exp_seq_chain_mapping_id).\
                join(ci, ci.chain_id == escm.chain_id).\
                filter(ci.pdb_id == pdb).\
                filter(ci.chain_name == chain)

            if not query.count():
                raise core.InvalidState(
                    "Could not load positions for %s|1|%s" % (pdb_id, chain))

            positions = []
            for result in query:
                entry = row2dict(result)
                entry['observed'] = int(result.unit_id is not None)
                entry['index'] = entry['index'] + 1
                positions.append(entry)
            return positions
# Example #18
    def pairs(self, pdb):
        pairs = coll.defaultdict(lambda: {
            'Pairs': set(),
            'Stacks': set(),
            'Basephosphate': set()
        })
        with self.session() as session:
            interactions = mod.UnitPairsInteractions
            query = session.query(interactions.unit_id_1,
                                  interactions.unit_id_2,
                                  interactions.f_lwbp.label('Pairs'),
                                  interactions.f_stacks.label('Stacks'),
                                  interactions.f_bphs.label('Basephosphate'),
                                  ).\
                filter(interactions.pdb_id == pdb)
            for result in query:
                data = row2dict(result)
                unit1 = data.pop('unit_id_1')
                unit2 = data.pop('unit_id_2')
                if unit1 == unit2 and data['Basephosphate'] and \
                        '0BPh' in data['Basephosphate']:
                    data['Basephosphate'] = None
                for name, value in data.items():
                    if value and not value.startswith('n'):
                        pairs[unit1][name].add(unit2)

        return pairs
    def revised_chain_info(self, ifes):
        self.logger.debug('ifes: %s' % ifes)
        ife_ids = self.class_property(ifes, 'id')
        with self.session() as session:
            query = session.query(
                mod.IfeInfo.ife_id,
                func.sum(mod.ChainInfo.chain_length).label('Exp Length (CI)'),
                func.group_concat(mod.ChainInfo.sequence.op('SEPARATOR')('+')).label('Exp Sequence (CI)'),
                func.group_concat(mod.ChainInfo.compound.op('SEPARATOR')(' + ')).label('Nucleic Acid Compound'),
                func.group_concat(mod.SpeciesMapping.species_name.op('SEPARATOR')(' / ')).label('RNA Species'),
            ).\
                join(mod.IfeChains,
                     mod.IfeChains.ife_id == mod.IfeInfo.ife_id).\
                join(mod.ChainInfo,
                     mod.ChainInfo.chain_id == mod.IfeChains.chain_id).\
                join(mod.ChainSpecies,
                     mod.ChainSpecies.chain_id == mod.ChainInfo.chain_id).\
                outerjoin(mod.SpeciesMapping,
                     mod.SpeciesMapping.species_mapping_id == mod.ChainSpecies.species_id).\
                filter(mod.IfeInfo.ife_id.in_(ife_ids)).\
                group_by(mod.IfeInfo.ife_id)

            data = {}
            for result in query:
                entry = row2dict(result)
                #entry['Exp Length (CI)'] = len(entry['Exp Sequence (CI)'])
                ife_id = entry.pop('ife_id')
                data[ife_id] = entry
        return data
# Example #20
    def positions(self, pdb, chain):
        exp_seq = self.exp_seq(pdb, chain)
        with self.session() as session:
            esum = mod.ExpSeqUnitMapping
            esp = mod.ExpSeqPosition
            escm = mod.ExpSeqChainMapping
            ci = mod.ChainInfo
            query = session.query(
                esum.unit_id,
                esp.index,
                esp.unit,
            ).join(esp, esp.exp_seq_position_id == esum.exp_seq_position_id).\
                join(escm, escm.exp_seq_chain_mapping_id == esum.exp_seq_chain_mapping_id).\
                join(ci, ci.chain_id == escm.chain_id).\
                filter(ci.pdb_id == pdb).\
                filter(ci.chain_name == chain)

            if not query.count():
                raise core.InvalidState("Could not load positions for %s|1|%s" % (pdb, chain))

            positions = []
            for result in query:
                entry = row2dict(result)
                entry['observed'] = int(result.unit_id is not None)
                entry['index'] = entry['index'] + 1
                positions.append(entry)
            return positions
    def incomplete(self, pdb):
        """Load all incomplete nucleotides from the database. This will query
        the unit_incomplete for all incomplete data.

        Parameters
        ----------
        pdb : str
            The pdb id to use.

        Returns
        -------
        incomplete : set
            A set of unit ids that are incomplete.
        """
        with self.session() as session:
            query = session.query(mod.UnitIncomplete.pdb_id,
                                  mod.UnitIncomplete.model,
                                  mod.UnitIncomplete.chain,
                                  mod.UnitIncomplete.number,
                                  mod.UnitIncomplete.unit,
                                  mod.UnitIncomplete.alt_id,
                                  mod.UnitIncomplete.ins_code,
                                  ).\
                filter_by(pdb_id=pdb)
        return {Entry(**row2dict(r)) for r in query}
# Example #22
 def test_computes_both_discrepancies(self):
     c1 = self.chain_id('1X8W', 'D')
     c2 = self.chain_id('1GRZ', 'B')
     corr_id = self.corr_id(c1, c2)
     val = [row2dict(d) for d in self.loader.data((c1, c2))]
     assert len(val) == 2
     # Remove discrepancy since it needs a different method
     assert_almost_equal(val[0].pop('discrepancy'), 0.227388, decimal=6)
     assert_almost_equal(val[1].pop('discrepancy'), 0.227388, decimal=6)
     del val[0]['chain_chain_similarity_id']
     del val[1]['chain_chain_similarity_id']
     assert val[0] == {
         'chain_id_1': c1,
         'chain_id_2': c2,
         'model_1': 1,
         'model_2': 1,
         'correspondence_id': corr_id,
         'num_nucleotides': 242
     }
     assert val[1] == {
         'chain_id_1': c2,
         'chain_id_2': c1,
         'model_1': 1,
         'model_2': 1,
         'correspondence_id': corr_id,
         'num_nucleotides': 242
     }
def dump(filename, **kwargs):
    """Dump chain chain comparison data to a file. This will dump all chain
    chain comparison data to a file for later import. The data is pickled for
    easy reading and writing in python.

    Parameters
    ----------
    filename : str
        Name of the file to write to.
    """

    session = setup(**kwargs)
    with session() as sess:
        chain1 = aliased(mod.ChainInfo)
        chain2 = aliased(mod.ChainInfo)
        query = sess.query(mod.ChainChainSimilarity.discrepancy,
                           mod.ChainChainSimilarity.num_nucleotides,
                           mod.ChainChainSimilarity.model_1,
                           mod.ChainChainSimilarity.model_2,
                           chain1.pdb_id.label('pdb_id1'),
                           chain1.chain_name.label('chain_name1'),
                           chain2.pdb_id.label('pdb_id2'),
                           chain2.chain_name.label('chain_name2'),
                           ).\
            join(chain1,
                 chain1.chain_id == mod.ChainChainSimilarity.chain_id_1).\
            join(chain2,
                 chain2.chain_id == mod.ChainChainSimilarity.chain_id_2)

        results = [row2dict(r) for r in query]
        with open(filename, 'wb') as out:
            pickle.dump(results, out)
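# Companion sketch (not in the original file): reading a dump back with the
# standard library. The pickled object is the list of row dictionaries
# written by `dump` above.
import pickle

def load_dump(filename):
    """Load a chain chain comparison dump written by `dump`."""
    with open(filename, 'rb') as handle:
        return pickle.load(handle)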
    def current(self, corr_id):
        """Get the current data for the correspondence.
        """

        with self.session() as session:
            info = session.query(mod.CorrespondenceInfo).get(corr_id)
            return utils.row2dict(info)
def dump(filename, **kwargs):
    """Dump chain chain comparison data to a file. This will dump all chain
    chain comparison data to a file for later import. The data is pickled for
    easy reading and writing in python.

    Parameters
    ----------
    filename : str
        Name of the file to write to.
    """

    session = setup(**kwargs)
    with session() as sess:
        chain1 = aliased(mod.ChainInfo)
        chain2 = aliased(mod.ChainInfo)
        query = sess.query(mod.ChainChainSimilarity.discrepancy,
                           mod.ChainChainSimilarity.num_nucleotides,
                           mod.ChainChainSimilarity.model_1,
                           mod.ChainChainSimilarity.model_2,
                           chain1.pdb_id.label('pdb_id1'),
                           chain1.chain_name.label('chain_name1'),
                           chain2.pdb_id.label('pdb_id2'),
                           chain2.chain_name.label('chain_name2'),
                           ).\
            join(chain1,
                 chain1.chain_id == mod.ChainChainSimilarity.chain_id_1).\
            join(chain2,
                 chain2.chain_id == mod.ChainChainSimilarity.chain_id_2)

        results = [row2dict(r) for r in query]
        with open(filename, 'wb') as out:
            pickle.dump(results, out)
    def info(self, chain_id):
        """Load the required information about a chain. Since we want to use
        the results of this loader for the NR stages we use the same data as
        was in the IFE's the given chain is a part of.

        Parameters
        ----------
        chain_id : int
            The chain id to look up.

        Returns
        -------
        ife_info : dict
            A dict with `chain_name`, `chain_id`, `pdb`, `model`, `ife_id`,
            `sym_op`, and `name` keys.
        """

        with self.session() as session:
            query = session.query(mod.ChainInfo.chain_name,
                                  mod.ChainInfo.chain_id,
                                  mod.IfeInfo.pdb_id.label('pdb'),
                                  mod.IfeInfo.model,
                                  mod.IfeInfo.ife_id,
                                  ).\
                join(mod.IfeChains,
                     mod.IfeChains.chain_id == mod.ChainInfo.chain_id).\
                join(mod.IfeInfo,
                     mod.IfeInfo.ife_id == mod.IfeChains.ife_id).\
                filter(mod.IfeInfo.new_style == 1).\
                filter(mod.ChainInfo.chain_id == chain_id)

        if not query.count():
            raise core.InvalidState("Could not load chain with id %s" %
                                    chain_id)
        ife = ut.row2dict(query.first())

        with self.session() as session:
            query = session.query(mod.UnitInfo.sym_op,
                                  mod.UnitInfo.alt_id,
                                  ).\
                join(mod.ChainInfo,
                     (mod.ChainInfo.pdb_id == mod.UnitInfo.pdb_id) &
                     (mod.ChainInfo.chain_name == mod.UnitInfo.chain)).\
                join(mod.UnitCenters,
                     mod.UnitCenters.unit_id == mod.UnitInfo.unit_id).\
                join(mod.UnitRotations,
                     mod.UnitRotations.unit_id == mod.UnitInfo.unit_id).\
                filter(mod.ChainInfo.chain_id == chain_id).\
                distinct()

            if not query.count():
                raise core.InvalidState("Could not get info for chain %s" %
                                        chain_id)

            ife['sym_op'] = pick(['1_555', 'P_1'], 'sym_op', query)
            ife['alt_id'] = pick([None, 'A', 'B'], 'alt_id', query)
            ife['name'] = ife['ife_id'] + '+' + ife['sym_op']
            return ife
def correspondence_id_mapping(session, data, ignore_missing=False):
    """Create a mapping from compared chain chain to correspondence ids. This
    will fail if not all chains in the input data could be mapped, if
    ignore_missing is False (the default behavior), otherwise it will only log
    the error.

    Parameters
    ----------
    session : pymotifs.core.Session
        The session to use
    data : list
        A list of dictionaries with pdb_id1, chain_name1, pdb_id2, chain_name2
        entries.
    ignore_missing : bool, optional
        A flag to make this ignore missing chains. In this case errors are only
        logged.

    Returns
    -------
    mapping : dict
        A dictionary mapping (chain_id, chain_id) to correspondence id.
    """

    entries = {(chain1(e), chain2(e)) for e in data}

    with session() as sess:
        corr = mod.CorrespondencePdbs
        query = sess.query(
            corr.correspondence_id,
            corr.pdb_id_1.label('pdb_id1'),
            corr.pdb_id_2.label('pdb_id2'),
            corr.chain_name_1.label('chain_name1'),
            corr.chain_name_2.label('chain_name2'),
            corr.chain_id_1,
            corr.chain_id_2,
        )

        mapping = {}
        for result in query:
            result = row2dict(result)
            ids = (chain1(result), chain2(result))
            if ids not in entries:
                continue
            key = (result['chain_id_1'], result['chain_id_2'])
            if key in mapping:
                raise ValueError("Duplicate mapping found %s" % ids)
            mapping[key] = result['correspondence_id']
            entries.remove(ids)

    logger.info("Found %i/%i correspondences", len(mapping), len(entries))
    if entries:
        logger.error("Could not map all correspondences %s", str(entries))
        if not ignore_missing:
            raise ValueError("Could not map all correspondences %s" %
                             str(entries))

    return mapping
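# `chain1` and `chain2` are defined elsewhere in the original module; judging
# from the docstring and the columns selected above, they presumably key an
# entry by pdb id and chain name. A minimal sketch under that assumption:
#
# def chain1(entry):
#     return (entry['pdb_id1'], entry['chain_name1'])
#
# def chain2(entry):
#     return (entry['pdb_id2'], entry['chain_name2'])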
 def test_it_assigns_valid_data(self):
     assert row2dict(self.motifs[0]) == {
         'motif_id': 'IL_85752.1',
         'ml_release_id': '0.1',
         'type': 'IL',
         'handle': '85752',
         'version': 1,
         'comment': 'New id, no parents',
     }
def correspondence_id_mapping(session, data, ignore_missing=False):
    """Create a mapping from compared chain chain to correspondence ids. This
    will fail if not all chains in the input data could be mapped, if
    ignore_missing is False (the default behavior), otherwise it will only log
    the error.

    Parameters
    ----------
    session : pymotifs.core.Session
        The session to use
    data : list
        A list of dictionaries with pdb_id1, chain_name1, pdb_id2, chain_name2
        entries.
    ignore_missing : bool, optional
        A flag to make this ignore missing chains. In this case errors are only
        logged.

    Returns
    -------
    mapping : dict
        A dictionary mapping (chain_id, chain_id) to correspondence id.
    """

    entries = {(chain1(e), chain2(e)) for e in data}

    with session() as sess:
        corr = mod.CorrespondencePdbs
        query = sess.query(corr.correspondence_id,
                           corr.pdb_id_1.label('pdb_id1'),
                           corr.pdb_id_2.label('pdb_id2'),
                           corr.chain_name_1.label('chain_name1'),
                           corr.chain_name_2.label('chain_name2'),
                           corr.chain_id_1,
                           corr.chain_id_2,
                           )

        mapping = {}
        for result in query:
            result = row2dict(result)
            ids = (chain1(result), chain2(result))
            if ids not in entries:
                continue
            key = (result['chain_id_1'], result['chain_id_2'])
            if key in mapping:
                raise ValueError("Duplicate mapping found %s" % ids)
            mapping[key] = result['correspondence_id']
            entries.remove(ids)

    logger.info("Found %i/%i correspondences", len(mapping), len(entries))
    if entries:
        logger.error("Could not map all correspondences %s", str(entries))
        if not ignore_missing:
            raise ValueError("Could not map all correspondences %s" %
                             str(entries))

    return mapping
# Example #30
 def known(self):
     annotations = {}
     with self.session() as session:
         query = session.query(self.table).\
             order_by(self.table.date.asc())
         for result in query:
             current = row2dict(result)
             for key, value in current.items():
                 if value == '':
                     current[key] = None
             annotations[result.motif_id] = current
     return annotations
# Example #31
    def known(self, pdb):
        """Determine the known
        """

        mapping = {}
        with self.session() as session:
            query = session.query(mod.LoopPositions).\
                join(mod.LoopInfo,
                     mod.LoopInfo.loop_id == mod.LoopPositions.loop_id).\
                filter(mod.LoopInfo.pdb_id == pdb)

            for result in query:
                data = utils.row2dict(result)
                mapping[(result.loop_id, result.position)] = data

        return mapping
# Example #32
    def known(self, pdb):
        """Determine the known
        """

        mapping = {}
        with self.session() as session:
            query = session.query(mod.LoopPositions).\
                join(mod.LoopInfo,
                     mod.LoopInfo.loop_id == mod.LoopPositions.loop_id).\
                filter(mod.LoopInfo.pdb_id == pdb)

            for result in query:
                data = utils.row2dict(result)
                mapping[(result.loop_id, result.position)] = data

        return mapping
 def possible_classes(self, start, stop, resolution):
     start_index = self.release_index(start)
     stop_index = self.release_index(stop)
     rel = mod.NrReleases
     with self.session() as session:
         query = session.query(mod.NrClasses.handle,
                               mod.NrClasses.nr_release_id,
                               mod.NrClasses.nr_class_id,
                               ).\
             join(rel,
                  mod.NrClasses.nr_release_id == rel.nr_release_id).\
             filter(rel.index >= start_index).\
             filter(rel.index <= stop_index).\
             filter(mod.NrClasses.resolution == resolution).\
             order_by(mod.NrClasses.nr_release_id)
         return [row2dict(r) for r in query]
    def ife_info(self, nr_class):
        ife_id = self.class_property(nr_class, 'id')
        with self.session() as session:
            data = {}

            query = session.query(mod.IfeInfo.ife_id).\
                filter(mod.IfeInfo.ife_id == ife_id)

            for result in query:
                entry = row2dict(result)
                ife_id = entry.pop('ife_id')
                chain_ids = ife_id.split('+')
                chains = [p.split('|')[-1] for p in chain_ids]
                entry['Chains'] = ', '.join(chains)
                data[ife_id] = entry
        return data
# Example #35
    def interactions(self, pdb, chain):
        c1 = aliased(mod.UnitInfo)
        c2 = aliased(mod.UnitInfo)
        interactions = []
        with self.session() as session:
            query = session.query(mod.UnitPairsInteractions).\
                join(c1, c1.id == mod.UnitPairsInteractions.unit_id_1).\
                join(c2, c2.id == mod.UnitPairsInteractions.unit_id_2).\
                filter(mod.UnitPairsInteractions.pdb_id == pdb).\
                filter(c1.chain == c2.chain, c1.chain == chain)

            for result in query:
                data = ut.row2dict(result)
                data['id'] = int(data['unit_pairs_interactions_id'])
                interactions.append(data)

        return interactions
    def load_ife_cqs_data(self, ife_list, nr_name):
        with self.session() as session:
            query = session.query(
                mod.IfeCqs.ife_id,
                mod.IfeCqs.obs_length,
                mod.IfeCqs.clashscore,
                mod.IfeCqs.average_rsr,
                mod.IfeCqs.average_rscc,
                mod.IfeCqs.percent_clash,
                mod.IfeCqs.rfree,
                mod.IfeCqs.resolution,
                ).\
                filter(mod.IfeCqs.ife_id.in_(ife_list))

            data = coll.defaultdict(list)

            max_exp_len = 0

            for result in query:
                entry = row2dict(result)
                ii = entry['ife_id']
                entry['nr_name'] = nr_name
                data[ii].append(entry)
                if result[1] > max_exp_len:
                    max_exp_len = result[1]

        for ife in ife_list:
            if data[ife]:
                ife_data = data[ife]
                obs_length = ife_data[0]['obs_length']
                ife_data[0]['max_exp_len'] = max_exp_len
            else:
                self.logger.warning("NQL: data: LICD: no data for %s" % ife)
                continue
            truth, fraction_unobserved = self.fraction_unobserved(
                obs_length, max_exp_len)
            percent_observed = (1 - fraction_unobserved)
            data[ife][0]['fraction_unobserved'] = fraction_unobserved
            data[ife][0]['percent_observed'] = percent_observed
            compscore = self.compscore(data[ife])
            data[ife][0]['compscore'] = compscore

        return data.values()
# Example #38
    def load_ife_cqs_data(self, ife_list, nr_name):
        with self.session() as session:
            query = session.query(
                mod.IfeCqs.ife_id,
                mod.IfeCqs.obs_length,
                mod.IfeCqs.clashscore,
                mod.IfeCqs.average_rsr,
                mod.IfeCqs.average_rscc,
                mod.IfeCqs.percent_clash,
                mod.IfeCqs.rfree,
                mod.IfeCqs.resolution,
                ).\
                filter(mod.IfeCqs.ife_id.in_(ife_list))

            data = coll.defaultdict(list)

            max_exp_len = 0

            for result in query: 
                entry = row2dict(result)
                ii = entry['ife_id']
                entry['nr_name'] = nr_name
                data[ii].append(entry)
                if result[1] > max_exp_len:
                    max_exp_len = result[1]

        for ife in ife_list:
            if data[ife]:
                ife_data = data[ife]
                obs_length = ife_data[0]['obs_length']
                ife_data[0]['max_exp_len'] = max_exp_len
            else:
                self.logger.warning("NQL: data: LICD: no data for %s" % ife)
                continue
            truth, fraction_unobserved = self.fraction_unobserved(obs_length, max_exp_len)
            percent_observed = (1 - fraction_unobserved)
            data[ife][0]['fraction_unobserved'] = fraction_unobserved
            data[ife][0]['percent_observed'] = percent_observed
            compscore = self.compscore(data[ife])
            data[ife][0]['compscore'] = compscore

        return data.values()
# Example #39
    def loop_quality(self, loop_release, pdb, loop_ids, **kwargs):

        if not loop_ids:
            raise core.InvalidState("No loops to get data for")

        pairs = self.pairs(pdb)
        known_positions = self.positions(pdb)
        with self.session() as session:
            info = mod.LoopInfo
            positions = mod.LoopPositions
            quality = mod.UnitQuality
            motifs = mod.MlLoops
            status = mod.LoopQa
            query = session.query(info.loop_id.label('Loop'),
                                  info.pdb_id.label('Pdb'),
                                  info.type.label('Type'),
                                  motifs.motif_id.label('Motif'),
                                  positions.unit_id.label('Nt'),
                                  status.status.label('Status'),
                                  quality.real_space_r.label('RSR'),
                                  quality.z_score.label('RSRZ'),
                                  ).\
                join(positions, positions.loop_id == info.loop_id).\
                join(status, status.loop_id == info.loop_id).\
                outerjoin(motifs, (motifs.loop_id == info.loop_id) &
                          (motifs.ml_release_id == kwargs['motif_release'])).\
                outerjoin(quality, quality.unit_id == positions.unit_id).\
                filter(info.pdb_id == pdb).\
                filter(status.loop_release_id == loop_release).\
                filter(info.loop_id.in_(loop_ids))

            as_result = ft.partial(self.as_result, pairs, known_positions)
            quality = [as_result(row2dict(r)) for r in query]

            found = {q['Loop'] for q in quality}
            required = set(loop_ids)
            if found != required:
                missing = required - found
                self.logger.error("Missing data for %s", missing)
                raise core.InvalidState("Did not find data on all loops")
        return quality
# Example #40
    def loop_quality(self, loop_release, pdb, loop_ids, **kwargs):

        if not loop_ids:
            raise core.InvalidState("No loops to get data for")

        pairs = self.pairs(pdb)
        known_positions = self.positions(pdb)
        with self.session() as session:
            info = mod.LoopInfo
            positions = mod.LoopPositions
            quality = mod.UnitQuality
            motifs = mod.MlLoops
            status = mod.LoopQa
            query = session.query(info.loop_id.label('Loop'),
                                  info.pdb_id.label('Pdb'),
                                  info.type.label('Type'),
                                  motifs.motif_id.label('Motif'),
                                  positions.unit_id.label('Nt'),
                                  status.status.label('Status'),
                                  quality.real_space_r.label('RSR'),
                                  quality.z_score.label('RSRZ'),
                                  ).\
                join(positions, positions.loop_id == info.loop_id).\
                join(status, status.loop_id == info.loop_id).\
                outerjoin(motifs, (motifs.loop_id == info.loop_id) &
                          (motifs.ml_release_id == kwargs['motif_release'])).\
                outerjoin(quality, quality.unit_id == positions.unit_id).\
                filter(info.pdb_id == pdb).\
                filter(status.loop_release_id == loop_release).\
                filter(info.loop_id.in_(loop_ids))

            as_result = ft.partial(self.as_result, pairs, known_positions)
            quality = [as_result(row2dict(r)) for r in query]

            found = {q['Loop'] for q in quality}
            required = set(loop_ids)
            if found != required:
                missing = required - found
                self.logger.error("Missing data for %s", missing)
                raise core.InvalidState("Did not find data on all loops")
        return quality
    def chains(self, class_id):
        with self.session() as session:
            chains = mod.NrChains
            ife = mod.IfeInfo
            pdbs = mod.PdbInfo
            classes = mod.NrClasses
            query = session.query(chains.nr_release_id,
                                  classes.name,
                                  classes.handle,
                                  classes.version,
                                  ife.ife_id.label('id'),
                                  ife.bp_count.label('bp'),
                                  ife.length,
                                  pdbs.resolution,
                                  pdbs.experimental_technique.label('method'),
                                  ).\
                join(ife, ife.ife_id == chains.ife_id).\
                join(classes, classes.nr_class_id == chains.nr_class_id).\
                join(pdbs, pdbs.pdb_id == ife.pdb_id).\
                filter(classes.nr_class_id == class_id)

            return [row2dict(r) for r in query]
    def chain_info(self, ifes):
        chain_ids = self.class_property(ifes, 'chain_id')
        with self.session() as session:
            query = session.query(
                mod.ChainInfo.chain_id,
                mod.ChainInfo.sequence.label('Exp Sequence (CI)'),
                mod.ChainInfo.compound.label('Nucleic Acid Compound'),
                mod.SpeciesMapping.species_name.label('RNA Species'),
            ).\
                join(mod.ChainSpecies,
                     mod.ChainSpecies.chain_id == mod.ChainInfo.chain_id).\
                join(mod.SpeciesMapping,
                     mod.SpeciesMapping.species_id == mod.ChainSpecies.species_id).\
                filter(mod.ChainInfo.chain_id.in_(chain_ids))

            data = {}
            for result in query:
                entry = row2dict(result)
                entry['Exp Length (CI)'] = len(entry['Exp Sequence (CI)'])
                chain_id = entry.pop('chain_id')
                data[chain_id] = entry
        return data
    def ife_info(self, ifes):
        ife_ids = self.class_property(ifes, 'id')
        with self.session() as session:
            query = session.query(
                mod.IfeInfo.ife_id,
                mod.IfeInfo.bp_count,
                mod.IfeInfo.length.label('Obs Length (II)')
            ).\
            filter(mod.IfeInfo.ife_id.in_(ife_ids))

            data = {}
            for result in query:
                entry = row2dict(result)
                nt = entry['Obs Length (II)']
                bp = entry.pop('bp_count')
                ife_id = entry.pop('ife_id')
                entry['BP/NT'] = float(bp) / float(nt)
                chain_ids = ife_id.split('+')
                chains = [p.split('|')[-1] for p in chain_ids]
                entry['Chains'] = ', '.join(chains)
                data[ife_id] = entry
        return data
# Example #44
    def position_info(self, unit):
        """Get the information about a position in an experimental sequence
        using a unit id.
        """

        self.logger.debug("Finding position for %s", unit)
        with self.session() as session:
            pos = mod.ExpSeqPosition
            mapping = mod.ExpSeqUnitMapping
            result = session.query(pos.index,
                                   pos.exp_seq_id,
                                   mod.UnitInfo.chain,
                                   mod.UnitInfo.model,
                                   mod.UnitInfo.sym_op,
                                   ).\
                join(mapping,
                     mapping.exp_seq_position_id == pos.exp_seq_position_id).\
                join(mod.UnitInfo,
                     mod.UnitInfo.unit_id == mapping.unit_id).\
                filter(mapping.unit_id == unit).\
                one()

            return row2dict(result)
# Example #45
    def pairs(self, pdb):
        pairs = coll.defaultdict(lambda: {'Pairs': set(), 'Stacks': set(),
                                          'Basephosphate': set()})
        with self.session() as session:
            interactions = mod.UnitPairsInteractions
            query = session.query(interactions.unit_id_1,
                                  interactions.unit_id_2,
                                  interactions.f_lwbp.label('Pairs'),
                                  interactions.f_stacks.label('Stacks'),
                                  interactions.f_bphs.label('Basephosphate'),
                                  ).\
                filter(interactions.pdb_id == pdb)
            for result in query:
                data = row2dict(result)
                unit1 = data.pop('unit_id_1')
                unit2 = data.pop('unit_id_2')
                if unit1 == unit2 and data['Basephosphate'] and \
                        '0BPh' in data['Basephosphate']:
                    data['Basephosphate'] = None
                for name, value in data.items():
                    if value and not value.startswith('n'):
                        pairs[unit1][name].add(unit2)

        return pairs
# Example #46
    def loops(self, pdb):
        """Get all loops in the current structure. If the loop is part of the
        current motif atlas release we will fetch the motif assignment as well.

        Parameters
        ----------
        pdb : str
            The pdb id to look up structures for.

        Returns
        -------
        loops : list
            A list of loop dictionaries that contain 'id', 'pdb', 'nts', and
            'motif_id' keys.
        """

        current_ml_release = self.current_ml_release()
        with self.session() as session:
            query = session.query(mod.LoopInfo.loop_id.label('id'),
                                  mod.LoopInfo.pdb_id.label('pdb'),
                                  mod.LoopInfo.unit_ids.label('nts'),
                                  mod.MlLoops.motif_id.label('motif_id')
                                  ).\
                outerjoin(mod.MlLoops,
                          (mod.MlLoops.loop_id == mod.LoopInfo.loop_id) &
                          (mod.MlLoops.ml_release_id == current_ml_release)).\
                filter(mod.LoopInfo.pdb_id == pdb).\
                order_by(mod.LoopInfo.loop_id)

            count = query.count()
            if not count:
                self.logger.info("No loops found for %s", pdb)
            else:
                self.logger.info("Found %s loops for %s", count, pdb)

            return [row2dict(result) for result in query]
# Example #47
    def lookup_sequences(self, pdb):
        """Return all exp_seq_ids for the given pdb. This only assign the
        species id from the given pdb.

        :param str pdb: The pdb id to get all sequences for.
        :returns: A list of dictionaries of unique sequences.
        """

        with self.session() as session:
            query = session.query(ExpSeqPdb.exp_seq_id.label('id'),
                                  ExpSeqInfo.normalized_length.label('length'),
                                  ChainSpecies.species_id.label('species')).\
                join(ExpSeqInfo,
                     ExpSeqInfo.exp_seq_id == ExpSeqPdb.exp_seq_id).\
                outerjoin(ChainSpecies,
                          ChainSpecies.chain_id == ExpSeqPdb.chain_id).\
                filter(ExpSeqPdb.pdb_id == pdb).\
                filter(ExpSeqInfo.was_normalized).\
                distinct()

            if not query.count():
                self.logger.warning("No sequences for %s" % pdb)

            return [ut.row2dict(result) for result in query]
 def reference(self, pdb):
     """Get all correlated reference structures.
     """
     with self.session() as session:
         query = session.query(mod.CorrespondenceInfo).filter_by(pdb2=pdb)
         return [ut.row2dict(result) for result in query]
# Example #49
 def from_dict(cls, result):
     return cls(**row2dict(result))
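# Hedged illustration (not from the original file): the from_dict classmethod
# above assumes `cls` accepts the row's column labels as keyword arguments,
# e.g. a namedtuple whose fields match the queried columns:
#
# import collections as coll
# Pair = coll.namedtuple('Pair', ['unit_id_1', 'unit_id_2', 'f_lwbp'])
# # Pair(**row2dict(row)) is what cls(**row2dict(result)) amounts to.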