def data(self, pair, **kwargs):
    """Compute the parentage data.

    This will raise a skip exception if there are no parents, or if this
    is the first release (parent release is the same as the current
    release). This requires that grouping data has been cached for the
    given loop type; if it has not, an exception is raised.

    Parameters
    ----------
    pair : (str, str)
        A (loop_type, nr_release_id) tuple. Only the loop type is used to
        look up the cached grouping; the release id is unused here.

    Raises
    ------
    core.InvalidState
        If there is no cached data for the loop type.
    core.Skip
        If this is the first release, or there are no parents.

    Returns
    -------
    data : list
        A list of dicts that can be written to the ml_parents table.
    """
    loop_type, release = pair
    cached = self.cached(loop_type)
    if not cached:
        raise core.InvalidState("No cached data")
    if cached['release'] == cached['parent']:
        raise core.Skip("No parents for first release")
    if self.no_parents(cached):
        raise core.Skip("Parent counts show no parents")
    return self.parents(cached)
def data(self, release, **kwargs):
    """Compute the parent rows for the given release.

    Loads the cached grouping from NR_CACHE_NAME and produces the parent
    records, skipping when this is the first release or the parent counts
    show no parents exist.

    Parameters
    ----------
    release : str
        The nr release id to process.

    Raises
    ------
    core.InvalidState
        When no grouping has been cached.
    core.Skip
        When this is the first release or there are no parents.

    Returns
    -------
    data : list
        The parent data for the cached groups.
    """
    grouping = self.cached(NR_CACHE_NAME)
    if not grouping:
        raise core.InvalidState("No grouping loaded")
    if grouping['parent'] == grouping['release']:
        raise core.Skip("First release has no parents")
    if self.no_parents(grouping):
        raise core.Skip("Parent counts shows no parents")
    return self.parents(release, grouping['groups'])
def data(self, name, **kwargs):
    """Download and decompress the file with the given name.

    Parameters
    ----------
    name : str
        The name of the file to download; passed to `self.url`.

    Raises
    ------
    core.Skip
        If the file could not be downloaded, or the download was empty.

    Returns
    -------
    content : str
        The decompressed file content.
    """
    try:
        content = self.gzip(self.url(name, **kwargs))
    except Exception as err:
        self.logger.error('%s could not be downloaded', name)
        self.logger.exception(err)
        raise core.Skip("Couldn't get %s" % name)

    if not content:
        # Bug fix: the Skip was previously constructed but never raised,
        # so empty downloads fell through and returned empty content.
        raise core.Skip("Downloaded empty file %s" % name)
    return content
def data(self, pair, **kwargs):
    """Compute annotations that are new relative to the known ones.

    Parameters
    ----------
    pair : (str, str)
        A (loop_type, nr_release_id) tuple. Only the loop type is used to
        look up the cached grouping.

    Raises
    ------
    core.InvalidState
        When there is no cached grouping for the loop type.
    core.Skip
        When this is the first release or no annotations exist yet.

    Returns
    -------
    data
        The new annotations computed from the known ones and the cache.
    """
    loop_type = pair[0]
    grouping = self.cached(loop_type)
    if not grouping:
        raise core.InvalidState("No cached data")

    is_first_release = grouping['release'] == grouping['parent']
    if is_first_release:
        raise core.Skip("No annotations for first release")

    existing = self.known()
    if not existing:
        raise core.Skip("No existing annotations")
    return self.new_annotations(existing, grouping)
def data(self, pdb, **kwargs):
    """Compute the quality assignments for residues in the structure.

    Parses the previously downloaded validation report for the structure
    and converts its entries into rows suitable for the database.

    Parameters
    ----------
    pdb : str
        The pdb id to use.

    Raises
    ------
    core.Skip
        If the report has neither RSR nor DCC data.

    Returns
    -------
    data : iterable
        An iterable of quality assignments to store in the database.
    """
    with open(self.filename(pdb), 'rb') as handle:
        parser = Parser(handle.read())

    if not (parser.has_rsr() or parser.has_dcc()):
        raise core.Skip("No RsR found for %s" % pdb)

    mapping = self.mapping(pdb)
    # Lazily expand each nucleotide into its quality entries, then wrap
    # each entry as a database model object.
    per_nt = (self.as_quality(mapping, nt) for nt in parser.nts())
    entries = it.chain.from_iterable(per_nt)
    return (mod.UnitQuality(**entry) for entry in entries)
def members(self, class_id):
    """Get all members of the NR class.

    Parameters
    ----------
    class_id : int
        The id of the NR class.

    Raises
    ------
    core.Skip
        When the class has exactly one member.
    core.InvalidState
        When the class has no members at all.

    Returns
    -------
    members : list
        Tuples of (ife_id, nr_chain_id) for all members of the class.
        ife_id is like 2A43|1|A and nr_chain_id is like 11890928.
    """
    self.logger.info("members: class_id: %s" % class_id)
    with self.session() as session:
        rows = session.query(mod.NrChains.ife_id,
                             mod.NrChains.nr_chain_id).\
            filter_by(nr_class_id=class_id)
        found = [(row.ife_id, row.nr_chain_id) for row in rows]
        if len(found) == 1:
            raise core.Skip("Skip group of size 1")
        if not found:
            raise core.InvalidState("No members in NR class: %i" % class_id)
    return found
def distances(self, nr_release_id, class_id, members):
    """Load all computed distances for members of the NR class.

    This may not load distances for all members, as we do not compute
    discrepancies for all possible chain to chain comparisons. For
    example, chains with very poor resolution are skipped when computing
    discrepancies.

    Parameters
    ----------
    nr_release_id : str
        The NR release id to restrict the comparisons to.
    class_id : int
        The NR class id.
    members : list
        A list of members as from `Loader.members`.

    Raises
    ------
    core.Skip
        When no distances at all are found for the class.

    Returns
    -------
    distances : collections.defaultdict
        A dict of dict's that represents the distances. The keys will be
        ife ids, and the values will be the discrepancies between each
        ife.
    """
    self.logger.info("distances: class_id (%s) has %s members" %
                     (class_id, len(members)))
    with self.session() as session:
        chains1 = aliased(mod.IfeChains)
        chains2 = aliased(mod.IfeChains)
        nr1 = aliased(mod.NrChains)
        nr2 = aliased(mod.NrChains)
        sim = mod.ChainChainSimilarity
        # Pair up chains within the same class and release and pull the
        # discrepancy for each pairing.
        query = session.query(sim.discrepancy,
                              chains1.ife_id.label('ife1'),
                              chains2.ife_id.label('ife2'),
                              ).\
            join(chains1, chains1.chain_id == sim.chain_id_1).\
            join(chains2, chains2.chain_id == sim.chain_id_2).\
            join(nr1, nr1.ife_id == chains1.ife_id).\
            join(nr2, nr2.ife_id == chains2.ife_id).\
            filter(nr1.nr_class_id == nr2.nr_class_id).\
            filter(nr1.nr_class_id == class_id).\
            filter(nr1.nr_release_id == nr2.nr_release_id).\
            filter(nr1.nr_release_id == nr_release_id).\
            order_by(nr1.ife_id, nr2.ife_id)

        distances = coll.defaultdict(lambda: coll.defaultdict(int))
        ifes = set(m[0] for m in members)
        for result in query:
            # Skip comparisons that involve ifes outside this class
            if result.ife1 not in ifes or result.ife2 not in ifes:
                continue
            distances[result.ife1][result.ife2] = result.discrepancy

    if not distances:
        raise core.Skip("No distances, skipping class: %i" % class_id)

    # Not all pairs may have computed discrepancies; warn but continue.
    if set(distances.keys()) != ifes:
        missing = ', '.join(ifes - set(distances.keys()))
        self.logger.warning(
            "Did not load distances for all pairs in: %i."
            " Missing %s", class_id, missing)

    return distances
def summary_query(self, pdb, chain, element1, element2, range_type):
    """Run the summary query for a chain and a pair of elements.

    Parameters
    ----------
    pdb : str
        The pdb id to query.
    chain : str
        The chain to query.
    element1, element2
        The elements the query is built over.
    range_type
        The range type passed to the query builder.

    Raises
    ------
    core.Skip
        When the query returns no row.

    Returns
    -------
    (bps, stacks, bphs) : tuple
        The counts from the summary row.
    """
    query = self.build(element1, element2, range_type)
    with self.session() as session:
        results = session.execute(query, {'pdb': pdb, 'chain': chain})
        results = results.fetchone()
        if not results:
            # Bug fix: the message must be %-formatted; exceptions do not
            # accept logger-style lazy interpolation arguments.
            raise core.Skip("Couldn't compute %s %s" % (element1, element2))
        return results['bps'], results['stacks'], results['bphs']
def data(self, pdb, **kwargs):
    """Compute the flanking interaction annotations for a pdb file.

    Runs matlab over the structure and parses the resulting file.

    Parameters
    ----------
    pdb : str
        The pdb id to process.

    Raises
    ------
    core.Skip
        When the structure has no nucleotides (status 2) or no flanking
        interactions (status 3).
    core.InvalidState
        When matlab exits with any other non-zero status.

    Returns
    -------
    data
        The interaction annotations.
    """
    mlab = matlab.Matlab(str(self.config['locations']['fr3d_root']))
    self.logger.info('Running matlab on %s', pdb)
    ifn, status, err_msg = mlab.loadFlankings(pdb, nout=3)
    status = status[0][0]
    if status == 0:
        data = self.parse(ifn, pdb)
        os.remove(ifn)  # clean up the temporary matlab output file
        return data
    elif status == 2:
        raise core.Skip('PDB file %s has no nucleotides' % pdb)
    elif status == 3:
        raise core.Skip('PDB file %s has no flanking interactions' % pdb)
    # Bug fix: the % previously bound only to `status`, leaving `pdb` as a
    # stray second positional argument and raising a TypeError ("not
    # enough arguments for format string") instead of the real message.
    raise core.InvalidState('Matlab error code %i when analyzing %s' %
                            (status, pdb))
def to_process(self, pdbs, **kwargs):
    """Limit the given pdbs to those without any extracted loops.

    Parameters
    ----------
    pdbs : list
        The candidate pdb ids.

    Raises
    ------
    core.Skip
        When every given pdb already has loops extracted.

    Returns
    -------
    pdbs : list
        Sorted pdb ids that do not yet appear in loop_info.
    """
    with self.session() as session:
        rows = session.query(mod.LoopInfo.pdb_id).\
            distinct()
        already_done = {row.pdb_id for row in rows}

    # Keep ONLY the pdbs that have no entry in loop_info yet
    pending = sorted(set(pdbs) - already_done)
    self.logger.info("Extracting loops from %s" % pending)
    if not pending:
        raise core.Skip("no new PDB ids that need loops extracted")
    return pending
def to_process(self, pdbs, **kwargs):
    """Transform the pdb list into the correspondence ids to process.

    This does not respect the pdbs given, but it makes all other code a
    lot cleaner and easier to understand. It pulls out all stored
    correspondence ids.

    Parameters
    ----------
    pdbs : list
        The list of pdb ids. Currently ignored.

    Raises
    ------
    core.Skip
        When there are no stored correspondences.

    Returns
    -------
    ids : list
        A list of correspondence ids to process.
    """
    with self.session() as session:
        rows = session.query(mod.CorrespondenceInfo.correspondence_id)
        ids = [row.correspondence_id for row in rows]
    if not ids:
        raise core.Skip("Skipping positions, no new correspondences")
    return ids
def data(self, pdb, **kwargs):
    """Compute the quality assignments for the structure.

    Parameters
    ----------
    pdb : str
        The pdb id to use.

    Raises
    ------
    core.Skip
        When no quality file has been downloaded for the structure.

    Returns
    -------
    data
        The parsed quality data for the structure.
    """
    path = self._create(Utils).filename(pdb)
    if os.path.exists(path):
        return self.parse(path)
    raise core.Skip("No quality for %s" % pdb)
def parse(self, filename):
    """Parse the file to extract the structure level quality data.

    Parameters
    ----------
    filename : str
        The file to parse.

    Raises
    ------
    core.Skip
        When the file does not exist.

    Returns
    -------
    data : mod.PdbQuality
        The structure level quality data.
    """
    if not os.path.exists(filename):
        raise core.Skip("Missing file %s" % filename)

    with open(filename, 'rb') as handle:
        report = Parser(handle.read())
    return mod.PdbQuality(**report.entity())
def to_process(self, pdbs, **kwargs):
    """Determine which pdbs need distances computed.

    Recomputation can be forced three ways: passing recalculate=True,
    passing a recalculate collection containing this stage's name, or
    setting the 'recompute' flag in this stage's configuration. Otherwise
    only pdbs without known distances are returned.

    Parameters
    ----------
    pdbs : list
        The candidate pdb ids.

    Raises
    ------
    core.Skip
        When there are no new distances to compute.

    Returns
    -------
    pdbs : list
        The pdb ids to process.
    """
    recalculate = kwargs.get('recalculate', False)
    if recalculate is True:
        return pdbs
    if isinstance(recalculate, (set, list, tuple)):
        try:
            if self.name in recalculate:
                return pdbs
        except Exception:
            # Bug fix: was a bare except; keep the conservative fallback
            # of recomputing everything, but never swallow SystemExit or
            # KeyboardInterrupt.
            return pdbs
    if bool(self.config[self.name].get('recompute')):
        # Bug fix: previously returned the bare boolean True, which breaks
        # callers that iterate the result; return the pdb list instead.
        return pdbs
    pending = sorted(set(pdbs) - self.known())
    if not pending:
        raise core.Skip("No new distances to compute")
    return pending
def to_process(self, pdbs, **kwargs):
    """Filter the pdbs down to those with loops that still need QA.

    Only structures that already have loop positions, but whose loops
    have not yet been checked for quality, are kept. By doing this the
    stage is able to assert that data is always produced.

    Parameters
    ----------
    pdbs : list
        List of PDB ids.

    Raises
    ------
    core.Skip
        When all loops in the given list have already gone through QA.

    Returns
    -------
    pdbs : list
        PDBs from the original list that contain loops and have not been
        checked for quality yet.
    """
    def loop_pdb_ids(table, loop_id_column):
        # Distinct pdb ids from loop_info joined against the given table.
        with self.session() as session:
            query = session.query(mod.LoopInfo.pdb_id).\
                join(table, loop_id_column == mod.LoopInfo.loop_id).\
                distinct()
            return {result.pdb_id for result in query}

    with_loops = loop_pdb_ids(mod.LoopPositions, mod.LoopPositions.loop_id)
    qa_checked = loop_pdb_ids(mod.LoopQa, mod.LoopQa.loop_id)

    remaining = sorted((set(pdbs) & with_loops) - qa_checked)
    if not remaining:
        raise core.Skip(
            "All loops in the current PDB list have gone through QA")
    return remaining
def to_process(self, pdbs, **kwargs):
    """Limit the pdbs to those that still need loop position data.

    Removes pdbs that already have entries in loop_positions, as well as
    pdbs whose loop_info entries are typed 'NA'.

    Parameters
    ----------
    pdbs : list
        The candidate pdb ids.

    Raises
    ------
    core.Skip
        When nothing remains to process.

    Returns
    -------
    pdbs : list
        Sorted pdb ids to process.
    """
    candidates = set(pdbs)

    # Drop pdbs that already have loop position entries
    with self.session() as session:
        query = session.query(mod.LoopInfo.pdb_id).\
            join(mod.LoopPositions,
                 mod.LoopPositions.loop_id == mod.LoopInfo.loop_id).\
            distinct()
        candidates -= {result.pdb_id for result in query}

    # Drop pdbs whose loop_info entries are typed 'NA'
    with self.session() as session:
        query = session.query(mod.LoopInfo.pdb_id).\
            filter(mod.LoopInfo.type == 'NA').\
            distinct()
        candidates -= {result.pdb_id for result in query}

    remaining = sorted(candidates)
    if not remaining:
        raise core.Skip("Nothing to process")
    return remaining
def has_data(self, *args, **kwargs):
    """Check whether there is cached grouping data to clean up.

    Raises
    ------
    core.Skip
        When there is no precomputed grouping cached.

    Returns
    -------
    bool
        Always False, so the stage runs whenever a grouping is cached.
    """
    if not self.cached(NR_CACHE_NAME):
        raise core.Skip("No precomputed grouping to cleanup")
    return False
def distances_revised(self, release_id, class_id, members):
    """Load all available computed distances between members of the NR
    class.

    Note that all possible chain-to-chain comparisons are not computed;
    for example, chains with very poor resolution are skipped during the
    discrepancy calculations.

    Parameters
    ----------
    class_id : int
        The first class_id value for the NR class.
    release_id : str
        The first representative sets release that contains the class.
    members : list
        A list of members as from `Loader.members`. (Actually generated
        in members_revised)

    Raises
    ------
    core.Skip
        When no distances at all are found for the class.

    Returns
    -------
    distances_revised : collections.defaultdict
        A dict-of-dicts that represents the distances. The keys will be
        ife_ids, and the values will be the discrepancies between each
        pair of IFEs.
    """
    self.logger.info(
        "distances_revised: class_id (%s) has %s members including %s" %
        (class_id, len(members), members[0]))
    with self.session() as session:
        chains1 = aliased(mod.IfeChains)
        chains2 = aliased(mod.IfeChains)
        nr1 = aliased(mod.NrChains)
        nr2 = aliased(mod.NrChains)
        sim = mod.ChainChainSimilarity
        # Pair up chains within the same class and release and pull the
        # discrepancy for each pairing.
        query = session.query(sim.discrepancy,
                              chains1.ife_id.label('ife1'),
                              chains2.ife_id.label('ife2'),
                              ).\
            join(chains1, chains1.chain_id == sim.chain_id_1).\
            join(chains2, chains2.chain_id == sim.chain_id_2).\
            join(nr1, nr1.ife_id == chains1.ife_id).\
            join(nr2, nr2.ife_id == chains2.ife_id).\
            filter(nr1.nr_class_id == nr2.nr_class_id).\
            filter(nr1.nr_class_id == class_id).\
            filter(nr1.nr_release_id == nr2.nr_release_id).\
            filter(nr1.nr_release_id == release_id).\
            order_by(nr1.ife_id, nr2.ife_id)

        distances_revised = coll.defaultdict(lambda: coll.defaultdict(int))
        ifes = set(m[0] for m in members)
        for result in query:
            # Skip comparisons that involve ifes outside this class
            if result.ife1 not in ifes or result.ife2 not in ifes:
                continue
            distances_revised[result.ife1][
                result.ife2] = result.discrepancy

    if not distances_revised:
        raise core.Skip("No distances, skipping class: %i" % class_id)

    # Not all pairs may have computed discrepancies; warn but continue.
    if set(distances_revised.keys()) != ifes:
        missing = ', '.join(ifes -
                            set(distances_revised.keys()))
        self.logger.warning(
            "Did not load distances for all pairs in: %i."
            " Missing %s", class_id, missing)

    return distances_revised
def data(self, exp_seq, **kwargs):
    """Match an experimental sequence to a secondary structure to align.

    Parameters
    ----------
    exp_seq : str
        The experimental sequence id to align.

    Raises
    ------
    core.Skip
        When there is nothing to align the sequence to.

    Returns
    -------
    data : dict
        A dict with 'ss_id' and 'exp_seq_id' keys.
    """
    ss_id = self.match(exp_seq, **kwargs)
    if ss_id is None:
        # Bug fix: the message must be %-formatted; exceptions do not
        # accept logger-style lazy interpolation arguments.
        raise core.Skip("Nothing to align %s to" % exp_seq)
    return {'ss_id': ss_id, 'exp_seq_id': exp_seq}
def to_process(self, pdbs, **kwargs):
    """Return the single filename to import; the pdb list is ignored.

    Raises
    ------
    core.Skip
        When no 'filename' keyword argument is given.

    Returns
    -------
    filenames : list
        A single-element list holding the filename to import.
    """
    try:
        return [kwargs['filename']]
    except KeyError:
        raise core.Skip("No filename to import")