Exemple #1
0
    def data(self, pair, **kwargs):
        """Compute the parentage data for one loop type and release.

        This requires that grouping data has been cached for the given loop
        type; if it has not, an exception is raised. A skip exception is
        raised if there are no parents, or if this is the first release
        (the parent release is the same as the current release).

        Parameters
        ----------
        pair : (str, str)
            Tuple of (loop_type, nr_release_id) to process.

        Raises
        ------
        core.InvalidState
            If there is no cached data for the loop type.
        core.Skip
            If this is the first release, or there are no parents.

        Returns
        -------
        data : list
            A list of dicts that can be written to the ml_parents table.
        """

        loop_type, release = pair
        cached = self.cached(loop_type)
        if not cached:
            raise core.InvalidState("No cached data")

        # The first release is its own parent, so there is nothing to link.
        if cached['release'] == cached['parent']:
            raise core.Skip("No parents for first release")
        if self.no_parents(cached):
            raise core.Skip("Parent counts show no parents")

        return self.parents(cached)
Exemple #2
0
 def data(self, release, **kwargs):
     """Build the parentage rows for the given release.

     Requires the cached NR grouping; raises ``core.InvalidState`` when it
     is missing, and ``core.Skip`` for a first release or when the parent
     counts show there is nothing to link.
     """
     grouping = self.cached(NR_CACHE_NAME)
     if not grouping:
         raise core.InvalidState("No grouping loaded")
     is_first_release = grouping['parent'] == grouping['release']
     if is_first_release:
         raise core.Skip("First release has no parents")
     if self.no_parents(grouping):
         raise core.Skip("Parent counts shows no parents")
     return self.parents(release, grouping['groups'])
Exemple #3
0
    def data(self, name, **kwargs):
        """Download and decompress the data for the given name.

        Parameters
        ----------
        name : str
            The name of the resource to download.

        Raises
        ------
        core.Skip
            If the file cannot be downloaded, or if the downloaded file is
            empty.

        Returns
        -------
        content : str
            The decompressed content of the downloaded file.
        """
        try:
            content = self.gzip(self.url(name, **kwargs))
        except Exception as err:
            self.logger.error('%s could not be downloaded', name)
            self.logger.exception(err)
            raise core.Skip("Couldn't get %s" % name)

        if not content:
            # BUG FIX: the Skip was constructed but never raised, so empty
            # downloads were silently returned as empty content.
            raise core.Skip("Downloaded empty file %s" % name)

        return content
Exemple #4
0
    def data(self, pair, **kwargs):
        """Produce new annotations for a (loop_type, release) pair.

        Raises ``core.InvalidState`` when no cached grouping exists, and
        ``core.Skip`` for the first release or when there are no existing
        annotations to carry forward.
        """
        loop_type, _release = pair
        grouping = self.cached(loop_type)
        if not grouping:
            raise core.InvalidState("No cached data")

        is_first_release = grouping['release'] == grouping['parent']
        if is_first_release:
            raise core.Skip("No annotations for first release")

        existing = self.known()
        if not existing:
            raise core.Skip("No existing annotations")

        return self.new_annotations(existing, grouping)
    def data(self, pdb, **kwargs):
        """Compute the quality assignments for residues in the structure.

        This reads the previously stored validation report for the structure
        and converts the entries there into forms suitable to write to the
        database. If the report has no RSR or DCC data then a `core.Skip`
        exception will be raised.

        Parameters
        ----------
        pdb : str
            The pdb id to use.

        Returns
        -------
        data : iterable
            An iterable of quality assignments to store in the database.
        """

        with open(self.filename(pdb), 'rb') as handle:
            report = Parser(handle.read())

        if not report.has_rsr() and not report.has_dcc():
            raise core.Skip("No RsR found for %s" % pdb)

        mapping = self.mapping(pdb)
        # Each nt may expand to several quality entries, so flatten before
        # wrapping the rows in the model class. Generator expressions keep
        # the whole pipeline lazy.
        per_nt = (self.as_quality(mapping, nt) for nt in report.nts())
        flattened = (entry for group in per_nt for entry in group)
        return (mod.UnitQuality(**entry) for entry in flattened)
Exemple #6
0
    def members(self, class_id):
        """Get all members of the class.

        Parameters
        ----------
        class_id : int
            The id of the NR class.

        Returns
        -------
        members : list
            A list of (ife_id, nr_chain_id) tuples for all members that are
            part of the class.
            ife_id is like 2A43|1|A and nr_chain_id is like 11890928
        """

        self.logger.info("members: class_id: %s" % class_id)

        with self.session() as session:
            rows = session.query(mod.NrChains.ife_id,
                                 mod.NrChains.nr_chain_id).\
                filter_by(nr_class_id=class_id)
            members = [(row.ife_id, row.nr_chain_id) for row in rows]

        # A singleton class has nothing to compare against, so skip it;
        # an empty class indicates inconsistent data.
        if len(members) == 1:
            raise core.Skip("Skip group of size 1")

        if not members:
            raise core.InvalidState("No members in NR class: %i" % class_id)

        return members
Exemple #7
0
    def distances(self, nr_release_id, class_id, members):
        """Load all compute distances for members of the NR class. This may not
        load distances for all members, as we do not compute discrepancies for
        all possible chain to chain comparisons. For example, chains with very
        poor resolution are skipped when computing discrepancies.

        Parameters
        ----------
        nr_release_id : str
            The NR release id used to restrict the chain comparisons.
        class_id : int
            The NR class id
        members : list
            A list of members as from `Loader.members`.

        Raises
        ------
        core.Skip
            If no distances at all are loaded for this class.

        Returns
        -------
        distances : collections.defaultdict
            A dict of dict's that represents the distances. The keys will be
            ife ids, and the values will be the discrepancies between each ife.
        """

        self.logger.info("distances: class_id (%s) has %s members" %
                         (class_id, len(members)))

        with self.session() as session:
            chains1 = aliased(mod.IfeChains)
            chains2 = aliased(mod.IfeChains)
            nr1 = aliased(mod.NrChains)
            nr2 = aliased(mod.NrChains)
            sim = mod.ChainChainSimilarity
            # Join similarity rows to the IFEs on both sides of the comparison
            # and keep only pairs where both chains belong to this class and
            # release.
            query = session.query(sim.discrepancy,
                                  chains1.ife_id.label('ife1'),
                                  chains2.ife_id.label('ife2'),
                                  ).\
                join(chains1, chains1.chain_id == sim.chain_id_1).\
                join(chains2, chains2.chain_id == sim.chain_id_2).\
                join(nr1, nr1.ife_id == chains1.ife_id).\
                join(nr2, nr2.ife_id == chains2.ife_id).\
                filter(nr1.nr_class_id == nr2.nr_class_id).\
                filter(nr1.nr_class_id == class_id).\
                filter(nr1.nr_release_id == nr2.nr_release_id).\
                filter(nr1.nr_release_id == nr_release_id).\
                order_by(nr1.ife_id, nr2.ife_id)

            # NOTE(review): lookups of missing pairs create 0 entries due to
            # the defaultdict(int) default — presumably intentional; confirm.
            distances = coll.defaultdict(lambda: coll.defaultdict(int))
            ifes = set(m[0] for m in members)
            for result in query:
                # Ignore comparisons involving chains outside the member list.
                if result.ife1 not in ifes or result.ife2 not in ifes:
                    continue
                distances[result.ife1][result.ife2] = result.discrepancy

        if not distances:
            raise core.Skip("No distances, skipping class: %i" % class_id)

        # Partial coverage is tolerated with a warning; only total absence of
        # distances aborts the class above.
        if set(distances.keys()) != ifes:
            missing = ', '.join(ifes - set(distances.keys()))
            self.logger.warning(
                "Did not load distances for all pairs in: %i."
                " Missing %s", class_id, missing)

        return distances
    def summary_query(self, pdb, chain, element1, element2, range_type):
        """Run the summary query for one pdb/chain and return the counts.

        Parameters
        ----------
        pdb : str
            The pdb id to query.
        chain : str
            The chain to query.
        element1 : str
            The first element of the pair being summarized.
        element2 : str
            The second element of the pair being summarized.
        range_type : str
            The range type passed to `self.build`.

        Raises
        ------
        core.Skip
            If the query produces no row.

        Returns
        -------
        (bps, stacks, bphs) : tuple
            The counts from the summary row.
        """
        query = self.build(element1, element2, range_type)
        with self.session() as session:
            results = session.execute(query, {'pdb': pdb, 'chain': chain})

            results = results.fetchone()
            if not results:
                # BUG FIX: the placeholders were never interpolated; the extra
                # values were passed as additional arguments to Skip.
                raise core.Skip("Couldn't compute %s %s" % (element1, element2))

            return results['bps'], results['stacks'], results['bphs']
Exemple #9
0
    def data(self, pdb, **kwargs):
        """Compute the interaction annotations for a pdb file.

        :pdb: The pdb id to process.
        :kwargs: Keyword arguments.
        :raises core.Skip: If the file has no nucleotides or no flanking
            interactions (matlab status 2 or 3).
        :raises core.InvalidState: For any other non-zero matlab status.
        :returns: The interaction annotations.
        """
        mlab = matlab.Matlab(str(self.config['locations']['fr3d_root']))

        self.logger.info('Running matlab on %s', pdb)
        ifn, status, err_msg = mlab.loadFlankings(pdb, nout=3)
        status = status[0][0]
        if status == 0:
            data = self.parse(ifn, pdb)
            os.remove(ifn)  # clean up matlab's intermediate file
            return data
        elif status == 2:
            raise core.Skip('PDB file %s has no nucleotides' % pdb)
        elif status == 3:
            raise core.Skip('PDB file %s has no flanking interactions' % pdb)
        # BUG FIX: '%' binds tighter than ',', so the original formatted with
        # only `status` (a TypeError for two placeholders) and passed `pdb`
        # as a second exception argument. Format with the full tuple.
        raise core.InvalidState('Matlab error code %i when analyzing %s' %
                                (status, pdb))
    def to_process(self, pdbs, **kwargs):
        """Limit the given pdbs to those without any extracted loops.

        Raises ``core.Skip`` when every given pdb already appears in the
        loop_info table.
        """

        with self.session() as session:
            known_query = session.query(mod.LoopInfo.pdb_id).\
                distinct()
            known = [entry.pdb_id for entry in known_query]

        # Keep only the pdbs that do not yet appear in loop_info.
        to_use = sorted(set(pdbs) - set(known))

        self.logger.info("Extracting loops from %s" % to_use)

        if not to_use:
            raise core.Skip("no new PDB ids that need loops extracted")
        return to_use
    def to_process(self, pdbs, **kwargs):
        """Turn the input into the list of correspondence ids to process.

        The given pdb list is deliberately ignored: pulling all stored
        correspondence ids keeps the downstream code much simpler and easier
        to understand.

        :param list pdbs: The list of pdb ids. Currently ignored.
        :param dict kwargs: The keyword arguments which are ignored.
        :returns: A list of correspondence ids to process.
        """

        with self.session() as session:
            query = session.query(mod.CorrespondenceInfo.correspondence_id)
            if query.count() == 0:
                raise core.Skip("Skipping positions, no new correspondences")
            return [entry.correspondence_id for entry in query]
    def data(self, pdb, **kwargs):
        """Compute the quality assignments for the structure.

        Parameters
        ----------
        pdb : str
            The pdb id to use.

        Raises
        ------
        core.Skip
            If no quality file exists for the structure.

        Returns
        -------
        data : mod.PdbQuality
            The quality data for the structure.
        """
        filename = self._create(Utils).filename(pdb)
        if not os.path.exists(filename):
            raise core.Skip("No quality for %s" % pdb)
        return self.parse(filename)
    def parse(self, filename):
        """Parse the file to extract the structure level data.

        Parameters
        ----------
        filename : str
            The file to parse.

        Raises
        ------
        core.Skip
            If the file does not exist.

        Returns
        -------
        data : mod.PdbQuality
            The structure level quality data.
        """
        if not os.path.exists(filename):
            raise core.Skip("Missing file %s" % filename)
        with open(filename, 'rb') as raw:
            parser = Parser(raw.read())
        entity = parser.entity()
        # The docstring previously claimed mod.UnitQuality; the method
        # actually builds the structure-level mod.PdbQuality row.
        return mod.PdbQuality(**entity)
    def to_process(self, pdbs, **kwargs):
        """Determine which structures need distances computed.

        All given pdbs are used when recalculation is requested, either via
        the ``recalculate`` keyword (True, or a collection containing this
        stage's name) or this stage's ``recompute`` config flag. Otherwise
        only the pdbs without known distances are returned.

        Parameters
        ----------
        pdbs : list
            The candidate pdb ids.

        Raises
        ------
        core.Skip
            If there is nothing new to compute.

        Returns
        -------
        pdbs : list
            The pdb ids to process.
        """
        recalculate = kwargs.get('recalculate', False)
        if recalculate is True:
            return pdbs

        if isinstance(recalculate, (set, list, tuple)):
            try:
                if self.name in recalculate:
                    return pdbs
            except TypeError:
                # Unhashable self.name; recompute everything to be safe.
                return pdbs

        if bool(self.config[self.name].get('recompute')):
            # BUG FIX: this previously returned the bare value True, which
            # downstream code would try to iterate as a list of pdb ids.
            return pdbs

        pdbs_to_process = sorted(set(pdbs) - self.known())

        if not pdbs_to_process:
            raise core.Skip("No new distances to compute")
        return pdbs_to_process
    def to_process(self, pdbs, **kwargs):
        """Limit the given PDB ids to those with loops that still need QA.

        Restricting the input this way lets this stage assert that data is
        always produced.

        Parameters
        ----------
        pdbs : list
            List of PDB ids

        Returns
        -------
        pdbs : list
            The PDBs from the original list that contain loops and have not
            been checked for quality yet.
        """

        # PDB ids whose loops already appear in loop_positions.
        with self.session() as session:
            query = session.query(mod.LoopInfo.pdb_id).\
                join(mod.LoopPositions,
                     mod.LoopPositions.loop_id == mod.LoopInfo.loop_id).\
                distinct()
            with_loops = {result.pdb_id for result in query}

        # PDB ids that already have related loop_qa entries.
        with self.session() as session:
            query = session.query(mod.LoopInfo.pdb_id).\
                join(mod.LoopQa,
                     mod.LoopQa.loop_id == mod.LoopInfo.loop_id).\
                distinct()
            already_checked = {result.pdb_id for result in query}

        # Keep pdbs that have loops but no quality checks yet.
        remaining = set(pdbs) & with_loops
        to_use = sorted(remaining - already_checked)

        if not to_use:
            raise core.Skip(
                "All loops in the current PDB list have gone through QA")
        return to_use
Exemple #16
0
    def to_process(self, pdbs, **kwargs):
        """Limit pdbs to those that still need loop positions extracted.

        Removes pdbs that already have loop_positions entries as well as
        pdbs recorded in loop_info with type 'NA' (no loops). Raises
        ``core.Skip`` when nothing remains.
        """
        with self.session() as session:
            # pdbs with corresponding entries in loop_positions
            query = session.query(mod.LoopInfo.pdb_id).\
                join(mod.LoopPositions,
                     mod.LoopPositions.loop_id == mod.LoopInfo.loop_id).\
                distinct()
            already_done = {result.pdb_id for result in query}

        remaining = set(pdbs) - already_done

        with self.session() as session:
            # pdbs with entries in loop_info marked as having no loops
            query = session.query(mod.LoopInfo.pdb_id).\
                filter(mod.LoopInfo.type == 'NA').\
                distinct()
            no_loops = {result.pdb_id for result in query}

        to_use = sorted(remaining - no_loops)

        if not to_use:
            raise core.Skip("Nothing to process")

        return to_use
 def has_data(self, *args, **kwargs):
     """Check whether there is cleanup work to do.

     Raises ``core.Skip`` when no precomputed grouping is cached; otherwise
     always returns False so the stage runs.
     """
     if not self.cached(NR_CACHE_NAME):
         raise core.Skip("No precomputed grouping to cleanup")
     return False
Exemple #18
0
    def distances_revised(self, release_id, class_id, members):
        """Load all available computed distances between members of the NR class.
        Note that all possible chain-to-chain comparisons are not computed;
        for example, chains with very poor resolution are skipped during the
        discrepancy calculations.

        Parameters
        ----------
        class_id : int
            The first class_id value for the NR class.

        release_id : str
            The first representative sets release that contains the class.

        members : list
            A list of members as from `Loader.members`.  (Actually generated
            in members_revised)

        Raises
        ------
        core.Skip
            If no distances at all are loaded for this class.

        Returns
        -------
        distances_revised : collections.defaultdict
            A dict-of-dicts that represents the distances. The keys will be
            ife_ids, and the values will be the discrepancies between each pair
            of IFEs.
        """

        # NOTE(review): members[0] assumes a non-empty list — presumably
        # guaranteed by members_revised; confirm against the caller.
        self.logger.info(
            "distances_revised: class_id (%s) has %s members including %s" %
            (class_id, len(members), members[0]))

        with self.session() as session:
            chains1 = aliased(mod.IfeChains)
            chains2 = aliased(mod.IfeChains)
            nr1 = aliased(mod.NrChains)
            nr2 = aliased(mod.NrChains)
            sim = mod.ChainChainSimilarity

            # Join similarity rows to the IFEs on both sides of the comparison
            # and keep only pairs where both chains belong to this class and
            # release.
            query = session.query(sim.discrepancy,
                                  chains1.ife_id.label('ife1'),
                                  chains2.ife_id.label('ife2'),
                                  ).\
                join(chains1, chains1.chain_id == sim.chain_id_1).\
                join(chains2, chains2.chain_id == sim.chain_id_2).\
                join(nr1, nr1.ife_id == chains1.ife_id).\
                join(nr2, nr2.ife_id == chains2.ife_id).\
                filter(nr1.nr_class_id == nr2.nr_class_id).\
                filter(nr1.nr_class_id == class_id).\
                filter(nr1.nr_release_id == nr2.nr_release_id).\
                filter(nr1.nr_release_id == release_id).\
                order_by(nr1.ife_id, nr2.ife_id)

            # NOTE(review): lookups of missing pairs create 0 entries due to
            # the defaultdict(int) default — presumably intentional; confirm.
            distances_revised = coll.defaultdict(lambda: coll.defaultdict(int))

            ifes = set(m[0] for m in members)

            for result in query:
                # Ignore comparisons involving chains outside the member list.
                if result.ife1 not in ifes or result.ife2 not in ifes:
                    continue
                distances_revised[result.ife1][
                    result.ife2] = result.discrepancy

        if not distances_revised:
            raise core.Skip("No distances, skipping class: %i" % class_id)

        # Partial coverage is tolerated with a warning; only total absence of
        # distances aborts the class above.
        if set(distances_revised.keys()) != ifes:
            missing = ', '.join(ifes - set(distances_revised.keys()))
            self.logger.warning(
                "Did not load distances for all pairs in: %i."
                " Missing %s", class_id, missing)

        return distances_revised
    def data(self, exp_seq, **kwargs):
        """Match the experimental sequence to a secondary structure.

        Parameters
        ----------
        exp_seq : str
            The experimental sequence id to align.

        Raises
        ------
        core.Skip
            If there is nothing to align the sequence to.

        Returns
        -------
        data : dict
            A dict with the matched 'ss_id' and the given 'exp_seq_id'.
        """
        ss_id = self.match(exp_seq, **kwargs)
        if ss_id is None:
            # BUG FIX: the placeholder was never interpolated; exp_seq was
            # being passed as a second argument to Skip.
            raise core.Skip("Nothing to align %s to" % exp_seq)

        return {'ss_id': ss_id, 'exp_seq_id': exp_seq}
Exemple #20
0
 def to_process(self, pdbs, **kwargs):
     """Return the single filename to import, ignoring the pdb list.

     Raises ``core.Skip`` when no ``filename`` keyword was supplied.
     """
     try:
         filename = kwargs['filename']
     except KeyError:
         raise core.Skip("No filename to import")
     return [filename]