def mapping(self, release_id, names):
    """Build a lookup from nr class name to its id in the database.

    Both a release id and a non-empty collection of names are required.
    The classes are looked up by name within the given release.

    :names: A list of names.
    :release_id: The release id to use.
    :returns: A dictionary mapping class_name to id.
    :raises core.InvalidState: If names or release id are missing, if no
        class matches, or if the number of mapped names differs from the
        number of names given.
    """
    if not (names and release_id):
        raise core.InvalidState("Must give names and release id")

    classes = mod.NrClasses
    with self.session() as session:
        query = session.query(classes.nr_class_id, classes.name).\
            filter(classes.name.in_(names)).\
            filter(classes.nr_release_id == release_id)

        if not query.count():
            # Log what was asked for so the failure can be diagnosed.
            self.logger.info(names)
            raise core.InvalidState("Found no clases with given names")

        found = {row.name: row.nr_class_id for row in query}

    if len(names) != len(found):
        raise core.InvalidState("Could not map all names")
    return found
def to_process(self, pdbs, **kwargs):
    """Determine the motif data to import.

    The given PDBs are not used. Instead the cache of every loop type in
    `ReleaseLoader.types` is examined. Each one must exist and must be for
    the current release id as reported by `ReleaseLoader`.

    Parameters
    ----------
    pdbs : list
        Ignored.

    Returns
    -------
    data : list
        A list of tuples like [('IL', '1.0'), ('HL', '1.0')] to import.

    Raises
    ------
    core.InvalidState
        If a loop type has no cached data, or the cached release is not
        the current release id.
    """
    release_id, _ = ReleaseLoader(self.config, self.session).current_id()

    to_import = []
    for kind in ReleaseLoader.types:
        entry = self.cached(kind)
        if not entry:
            raise core.InvalidState("No cached data")
        if entry['release'] != release_id:
            raise core.InvalidState("Caching does not match excepted ID")
        to_import.append((kind, release_id))
    return to_import
def normalized_mapping(self, pdb_id):
    """Produce a dictionary that can be used to correct bad unit ids.

    Some of the loops stored after we migrated the database have incorrect
    unit ids. The errors appear to be of 2 kinds, incorrect model number
    and possibly bad alt ids. This mapping is keyed by a `Unit` built from
    the stored fields of each unit, so the correct unit id can be looked
    up from those fields.

    :param str pdb_id: The pdb id to get units for.
    :returns: A dictionary with Unit keys mapping to the unit id.
    :raises core.InvalidState: If the structure has no units, or two units
        produce the same key.
    """
    info = mod.UnitInfo
    with self.session() as session:
        # Labels rename the columns to the field names that Unit expects.
        query = session.query(info.unit_id,
                              info.pdb_id.label('pdb'),
                              info.model,
                              info.chain,
                              info.number.label('component_number'),
                              info.ins_code.label('insertion_code'),
                              info.alt_id,
                              info.sym_op.label('symmetry'),
                              ).\
            filter(info.pdb_id == pdb_id)

        if not query.count():
            raise core.InvalidState("No units in %s" % pdb_id)

        lookup = {}
        for row in query:
            fields = row2dict(row)
            unit_id = fields.pop('unit_id')
            unit = Unit(**fields)
            if unit in lookup:
                raise core.InvalidState("Non unique mapping")
            lookup[unit] = unit_id
    return lookup
def correct_structure(self, pdb, mapping, units, require_all=True):
    """Correct the units that belong to a single structure.

    Units whose `pdb` differs from the given one are ignored. Each
    remaining unit is passed to `self.correct`.

    :param str pdb: The PDB id to use.
    :param dict mapping: The mapping to use.
    :param list units: The list of Unit's to correct.
    :param bool require_all: If True a unit that cannot be corrected is an
        error, otherwise the failure is logged and the unit is dropped.
    :returns: A list of the corrected units.
    :raises core.InvalidState: If a unit cannot be corrected while
        `require_all` is set, or if the corrected units are not unique.
    """
    corrected = []
    for unit in units:
        if unit.pdb != pdb:
            continue

        fixed = self.correct(mapping, unit)
        if fixed:
            corrected.append(fixed)
            continue

        msg = "Could not correct {unit}".format(unit=str(unit))
        if require_all:
            raise core.InvalidState(msg)
        self.logger.error(msg)

    # Two input units correcting to the same value means the mapping is bad.
    if len(set(corrected)) != len(corrected):
        raise core.InvalidState("Did not produce unique normalization")
    return corrected
def map(self, nts, mapping):
    """Lazily translate each nt through the given mapping.

    This is a generator, so the checks only run once it is iterated.

    :param nts: An iterable of keys to look up.
    :param dict mapping: The mapping to look the keys up in.
    :returns: A generator of the mapped values, in the order of `nts`.
    :raises core.InvalidState: If the mapping is empty or lacks an nt.
    """
    if not mapping:
        raise core.InvalidState("Given empty mapping")

    for nucleotide in nts:
        if nucleotide in mapping:
            yield mapping[nucleotide]
        else:
            raise core.InvalidState("Missing nt %s" % nucleotide)
def __init__(self, *args, **kwargs):
    """Initialize the loader and verify its required settings.

    All arguments are passed on to the parent initializer. Afterwards the
    `name`, `table` and `headers` attributes are checked, in that order,
    and each must be truthy.

    :raises core.InvalidState: If `name`, `table` or `headers` is not set.
    """
    super(CsvLoader, self).__init__(*args, **kwargs)

    required = (
        ('name', "Must specify name"),
        ('table', "Must specify the table name"),
        ('headers', "Must specify file headers"),
    )
    for attribute, problem in required:
        if not getattr(self, attribute):
            raise core.InvalidState(problem)
def __call__(self, initial, length_increase=NR_LENGTH_PERCENT_INCREASE,
             bp_increase=NR_BP_PERCENT_INCREASE):
    """Find the representative for the group.

    The group is first filtered with `filter_group_by_method` and an
    initial representative is selected. Then the candidates for the
    current representative are repeatedly examined with
    `best_above_cutoffs` until the representative no longer changes.

    Parameters
    ----------
    initial : dict
        The group to find the representative of. Its 'members' entry is
        used when inserting the representative.
    length_increase : float
        The fraction increase in resolved nucleotides that an IFE must
        have to be selected as representative.
    bp_increase : float
        The fraction increase of basepairs that an IFE must have to be
        selected as representative.

    Returns
    -------
    representative
        The result of `insert_as_representative` for the selected
        representative and the members of `initial`.

    Raises
    ------
    core.InvalidState
        If there is no initial representative or no final one.
    """
    group = self.filter_group_by_method(initial)

    naive = self.initial_representative(group)
    if not naive:
        raise core.InvalidState("No current representative")
    self.logger.debug("Naive representative: %s", naive['id'])

    rep = naive
    while True:
        candidates = self.candidates(rep, group['members'])
        self.logger.debug("Found %i representative candidates",
                          len(candidates))
        new_rep = self.best_above_cutoffs(rep, candidates,
                                          length_increase, bp_increase)
        # Stop once the selection is stable.
        if new_rep == rep:
            break
        self.logger.info("Changed representative from %s to %s",
                         rep['id'], new_rep['id'])
        rep = new_rep

    if not rep:
        raise core.InvalidState("No representative found")

    self.logger.debug("Computed representative: %s", rep['id'])
    return self.insert_as_representative(rep, initial['members'],
                                         sort=bp_per_nt)
def loops(self, loop_release_id, loop_type, ifes, size_limit=None, **kwargs):
    """Get the list of loop ids to use in clustering.

    The loops must have a position in a chain of one of the given IFE's,
    have a quality status of 1 in the given loop release, be of the given
    type and not be in BLACKLIST. Loops reported by `loops_to_exclude`
    are dropped as well.

    Parameters
    ----------
    loop_release_id : str
        The loop release id to use.
    loop_type : str
        The type of loop to use, eg, IL, HL.
    ifes : list
        A list of ife ids to find loops in.
    size_limit : int, optional
        If given, only loops with a length below this value are used.

    Returns
    -------
    loops : list
        A sorted list of loop ids to process.

    Raises
    ------
    core.InvalidState
        If no loops are found.
    """
    exclude = self.loops_to_exclude(**kwargs)

    with self.session() as session:
        loops = mod.LoopInfo
        quality = mod.LoopQa
        pos = mod.LoopPositions
        ife_chains = mod.IfeChains
        chain_info = mod.ChainInfo
        units = mod.UnitInfo

        same_chain = ((chain_info.chain_name == units.chain) &
                      (chain_info.pdb_id == units.pdb_id))
        query = session.query(loops.loop_id).\
            join(quality, quality.loop_id == loops.loop_id).\
            join(pos, pos.loop_id == loops.loop_id).\
            join(units, units.unit_id == pos.unit_id).\
            join(chain_info, same_chain).\
            join(ife_chains, ife_chains.chain_id == chain_info.chain_id).\
            filter(quality.status == 1).\
            filter(quality.loop_release_id == loop_release_id).\
            filter(loops.type == loop_type).\
            filter(ife_chains.ife_id.in_(ifes)).\
            filter(~loops.loop_id.in_(BLACKLIST)).\
            distinct()

        if size_limit is not None:
            query = query.filter(loops.length < size_limit)

        found = set(row.loop_id for row in query
                    if row.loop_id not in exclude)

    if found:
        return sorted(found)
    raise core.InvalidState("No loops to cluster for %s" % loop_release_id)
def annotations(self, pdb, remove=True):
    """Call matlab and parse the annotations to create a list of unit id to
    loop mappings.

    A directory for the structure is created under `self.precomputed` if
    it does not exist yet. Matlab's `loadLoopPositions` is then run on it
    and the file it reports is parsed with `self.parse`.

    :param str pdb: The pdb id to use.
    :param Bool remove: Flag to indicate if the produced file should be
        removed.
    :returns: The annotations produced by matlab.
    :raises core.InvalidState: If the directory could not be created.
    :raises matlab.MatlabFailed: If matlab reports an error message.
    """
    mlab = matlab.Matlab(self.config['locations']['fr3d_root'])
    path = str(os.path.join(self.precomputed, pdb))

    try:
        if not os.path.exists(path):
            os.mkdir(path)
    except OSError:
        # Only filesystem failures are translated. A bare except would
        # also swallow KeyboardInterrupt and SystemExit.
        raise core.InvalidState("Could not create %s for matlab" % path)

    [output_file, err_msg] = mlab.loadLoopPositions(path, nout=2)
    if err_msg != '':
        raise matlab.MatlabFailed(err_msg)

    data = self.parse(output_file)
    if remove:
        os.remove(output_file)
    return data
def data(self, pdb, **kwargs):
    """Compute the coordinate entries for the given PDB.

    This will exclude water molecules (units with sequence 'HOH') as those
    aren't generally worth displaying in the coordinate server.

    Parameters
    ----------
    pdb : str
        The PDB id to use.

    Yields
    ------
    coord : UnitCoordinates
        A UnitCoordinates object with the coordinates to write.

    Raises
    ------
    core.InvalidState
        If no coordinates are computed for a unit.
    """
    structure = self.structure(pdb)
    for unit in structure.residues():
        if unit.sequence == 'HOH':
            continue

        coord = self.coordinates(pdb, unit)

        # Pass the values as logger arguments so the messages are only
        # formatted when debug logging is enabled. This runs per residue.
        self.logger.debug("data: PDB: %s", pdb)
        self.logger.debug("data: unit: %s", unit)
        self.logger.debug("data: coordinates: %s", coord)

        if not coord:
            raise core.InvalidState("No coordinates computed for %s" % unit)

        yield mod.UnitCoordinates(
            unit_id=unit.unit_id(),
            coordinates=coord,
        )
def guess_loop_release(self, pdbs, loop_release=None, **kwargs):
    """Pick the loop release to use for the given structures.

    A given loop release is returned unchanged. Otherwise the releases
    that have loop quality entries for loops of the given PDBs are
    examined in descending order of release date. The first release whose
    set of PDBs equals the requested set is returned.

    :param list pdbs: The PDB ids that must be covered.
    :param loop_release: A loop release id to use without any lookup.
    :returns: The loop release id.
    :raises core.InvalidState: If no release matches all PDBs.
    """
    if loop_release:
        return loop_release

    with self.session() as session:
        status = mod.LoopQa
        releases = mod.LoopReleases
        info = mod.LoopInfo
        query = session.query(releases.loop_release_id, info.pdb_id).\
            join(status, status.loop_release_id == releases.loop_release_id).\
            join(info, info.loop_id == status.loop_id).\
            filter(info.pdb_id.in_(pdbs)).\
            distinct().\
            order_by(desc(releases.date))

        # Remember the order in which releases first appear as well as
        # the structures seen for each of them.
        by_release = coll.defaultdict(set)
        seen_order = []
        for row in query:
            release_id = row.loop_release_id
            if release_id not in by_release:
                seen_order.append(release_id)
            by_release[release_id].add(row.pdb_id)

    wanted = set(pdbs)
    for release_id in seen_order:
        if by_release[release_id] == wanted:
            return release_id

    raise core.InvalidState("No possible loop release for all PDBs")
def to_process(self, pdbs, **kwargs):
    """Collect the list of nr_class name values to process for the
    specified release. Ignores the given PDBs.

    The release is taken from kwargs['manual']['nr_release_id'] when that
    is given and truthy, otherwise from the 'release' entry of the data
    cached under NR_CACHE_NAME. Classes are listed for resolution 'all'.

    Parameters
    ----------
    pdbs : list
        Ignored.

    Returns
    -------
    classlist : list
        The list of NR class names to process.

    Raises
    ------
    core.InvalidState
        If no release is given manually and nothing is cached.
    """
    resolution = 'all'

    manual_release = kwargs.get('manual', {}).get('nr_release_id', False)
    if manual_release:
        latest = manual_release
    else:
        data = self.cached(NR_CACHE_NAME)
        if not data:
            raise core.InvalidState("No precomputed grouping to store")
        latest = data['release']

    # No session is opened here; the listing is left to list_nr_classes.
    return self.list_nr_classes(latest, resolution)
def data(self, pair, **kwargs):
    """Compute the parentage data.

    This requires cached data for the loop type of the given pair. A skip
    is raised if this is the first release (the cached parent release is
    the same as the cached release) or if `no_parents` reports that there
    are no parents.

    Parameters
    ----------
    pair : tuple
        A (loop_type, release) tuple. Only the loop type is used.

    Raises
    ------
    core.InvalidState
        If there is no cached data for the loop type.
    core.Skip
        If this is the first release, or there are no parents.

    Returns
    -------
    data
        The result of `parents` for the cached data.
    """
    loop_type, _ = pair

    cached = self.cached(loop_type)
    if not cached:
        raise core.InvalidState("No cached data")
    elif cached['release'] == cached['parent']:
        raise core.Skip("No parents for first release")
    elif self.no_parents(cached):
        raise core.Skip("Parent counts show no parents")

    return self.parents(cached)
def sequence(self, exp_id):
    """Load the positions of the experimental sequence with the given id.

    The positions are read in order of their index. The ids are a list of
    the position ids, while the sequence is a string built from the
    normalized unit of each position, using 'N' where there is none.

    :param int exp_id: The experimental sequence id.
    :returns: A dictionary of the ids and sequence for the given id.
    :raises core.InvalidState: If the sequence has no positions.
    """
    position_ids = []
    letters = []
    with self.session() as session:
        pos = mod.ExpSeqPosition
        query = session.query(pos).\
            filter(pos.exp_seq_id == exp_id).\
            order_by(pos.index)

        if not query.count():
            raise core.InvalidState("Could not get sequence for %s" % exp_id)

        for row in query:
            position_ids.append(row.exp_seq_position_id)
            letters.append(row.normalized_unit or 'N')

    return {'ids': position_ids, 'sequence': ''.join(letters)}
def as_quality(self, mapping, entry):
    """Convert an entry from the parser into a form suitable for writing to
    the units_quality table.

    Since some entries from the parser expand to more than one unit due to
    symmetry operators this will produce an iterator that may have more
    than 1 value.

    Parameters
    ----------
    mapping : dict
        The mapping as produced by `mapping`. It is looked up with the key
        that `as_key` builds from the entry id.
    entry : dict
        A dictionary from `Parser.nts`.

    Yields
    ------
    entry : dict
        A dictionary of 'unit_id', 'real_space_r', 'density_correlation',
        'real_space_r_z_score'.

    Raises
    ------
    core.InvalidState
        If the mapping has no unit ids for the entry.
    """
    unit_ids = mapping[as_key(entry['id'])]
    if not unit_ids:
        raise core.InvalidState("Could not find unit id for %s" % entry)

    copied = ('real_space_r', 'density_correlation', 'real_space_r_z_score')
    for unit_id in unit_ids:
        quality = {'unit_id': unit_id}
        for field in copied:
            quality[field] = entry.get(field)
        yield quality
def select_candidates(self, members):
    """Narrow the members down to the candidates.

    The members are filtered by method, then by nts and finally by
    resolution, each filter working on the result of the previous one.

    :param members: The members to select from.
    :returns: The members that remain after all three filters.
    :raises core.InvalidState: If nothing remains after filtering.
    """
    remaining = self.filter_by_resolution(
        self.filter_by_nts(
            self.filter_by_method(members)))

    if remaining:
        return remaining
    raise core.InvalidState("Nothing with good resolution")
def cross_chain_interactions(self, ifes, sym_op='1_555'):
    """Create a dictionary of the interactions between the chains of the
    listed ifes. This will get only the counts of cWW interactions.

    Every ordered pair of chains gets an entry. The count of a chain with
    itself is always 0. The pdb id of the first ife is used for all
    lookups.

    :ifes: A list of ifes, each with a `pdb` and `chain` attribute.
    :sym_op: The symmetry operator to count interactions in.
    :returns: A dictionary like
        { 'A': { 'A': 0, 'B': 10 }, 'B': { 'A': 10, 'B': 0 } }.
    :raises core.InvalidState: If no ifes are given.
    """
    if not ifes:
        raise core.InvalidState("No ifes to get interactions between")

    pdb = ifes[0].pdb
    helper = st.BasePairQueries(self.session)
    counter = ft.partial(helper.cross_chain, pdb, count=True, family='cWW',
                         sym_op=sym_op)

    interactions = coll.defaultdict(dict)
    pairs = it.product((ife.chain for ife in ifes), repeat=2)
    for name1, name2 in pairs:
        if name1 == name2:
            # A chain never counts against itself, so skip the lookup
            # instead of running it and discarding the result.
            count = 0
        else:
            count = counter(name1, name2)
        interactions[name1][name2] = count

    return dict(interactions)
def positions(self, pdb, chain):
    """Load the experimental sequence positions mapped to a chain.

    Each position is the query row as a dictionary with two changes: an
    'observed' entry is added, which is 1 if the position has a unit id
    and 0 otherwise, and the 'index' is increased by one.

    :param str pdb: The pdb id of the chain.
    :param str chain: The name of the chain.
    :returns: A list of position dictionaries.
    :raises core.InvalidState: If no positions are found.
    """
    # The result is not used below.
    # NOTE(review): presumably kept so a missing experimental sequence
    # fails early - confirm against exp_seq.
    self.exp_seq(pdb, chain)

    with self.session() as session:
        esum = mod.ExpSeqUnitMapping
        esp = mod.ExpSeqPosition
        escm = mod.ExpSeqChainMapping
        ci = mod.ChainInfo

        query = session.query(
            esum.unit_id,
            esp.index,
            esp.unit,
        ).join(esp, esp.exp_seq_position_id == esum.exp_seq_position_id).\
            join(escm,
                 escm.exp_seq_chain_mapping_id == esum.exp_seq_chain_mapping_id).\
            join(ci, ci.chain_id == escm.chain_id).\
            filter(ci.pdb_id == pdb).\
            filter(ci.chain_name == chain)

        if not query.count():
            # This used the undefined name `pdb_id`, which turned the
            # intended InvalidState into a NameError.
            raise core.InvalidState(
                "Could not load positions for %s|1|%s" % (pdb, chain))

        positions = []
        for result in query:
            entry = row2dict(result)
            entry['observed'] = int(result.unit_id is not None)
            entry['index'] = entry['index'] + 1
            positions.append(entry)
    return positions
def best_model(self, pdb, sym_op):
    """Determine what model to use for ifes.

    We will use the model with the most basepairs. It tiebreaks on model
    number, lower is better. If the structure has only one model that
    model is returned without counting.

    :pdb: The pdb id to use.
    :sym_op: The symmetry operator to use.
    :returns: The model number to use.
    :raises core.InvalidState: If the structure has no models.
    """
    with self.session() as session:
        query = session.query(mod.UnitInfo.model).\
            filter_by(pdb_id=pdb).\
            distinct()
        models = [result.model for result in query]

    if not models:
        # Exceptions do not interpolate arguments like logging calls do,
        # so the message has to be formatted here.
        raise core.InvalidState("No models found for %s" % pdb)

    if len(models) == 1:
        return models[0]

    helper = st.BasePairQueries(self.session)
    count = ft.partial(helper.representative, pdb, None, count=True,
                       sym_op=sym_op)

    # Negating the model number makes max() prefer the lower model when
    # the counts are tied.
    models = [(count(model=model), -1 * model) for model in models]
    return -1 * max(models)[1]
def interactions(self, pdb_id, chain, positions, remove_pseudoknots=False):
    """Load the cWW interactions within a chain.

    Only pairs where both units are selected by `__limit_units__` for the
    given structure and chain, and where both units have the same
    symmetry operator, are used.

    :param str pdb_id: The pdb id to use.
    :param str chain: The chain name to use.
    :param list positions: Position dictionaries, each with a 'unit_id'.
    :param bool remove_pseudoknots: If True only interactions with a
        crossing number below 4 are used.
    :returns: A dictionary from the unit id of the first unit to the
        position dictionary of the second unit.
    :raises core.InvalidState: If no interactions are found.
    """
    by_unit = dict((pos['unit_id'], pos) for pos in positions)

    with self.session() as session:
        pairs = mod.UnitPairsInteractions
        uid1 = aliased(mod.UnitInfo)
        uid2 = aliased(mod.UnitInfo)
        query = session.query(pairs).\
            join(uid1, uid1.unit_id == pairs.unit_id_1).\
            join(uid2, uid2.unit_id == pairs.unit_id_2).\
            filter(pairs.f_lwbp == 'cWW').\
            filter(uid1.sym_op == uid2.sym_op)

        query = self.__limit_units__(query, uid1, pdb_id, chain)
        query = self.__limit_units__(query, uid2, pdb_id, chain)

        if remove_pseudoknots:
            query = query.filter(pairs.f_crossing < 4)

        if not query.count():
            raise core.InvalidState(
                "Could not load interactions for %s|1|%s" % (pdb_id, chain))

        paired = {}
        for row in query:
            first = by_unit[row.unit_id_1]['unit_id']
            paired[first] = by_unit[row.unit_id_2]
    return paired
def __call__(self, groups, parent_groups, handles):
    """Name all of the given groups.

    How a group is named depends on the number of parents it has. Each
    named group is a copy of the group with 'parents', 'comment' and
    'name' entries set. The handle of every name is added to `handles`.

    :param list groups: The groups to name.
    :param parent_groups: The parent groups to find parents in.
    :param set handles: The known handles, this is modified.
    :returns: A list of the named groups.
    :raises core.InvalidState: If not every group was named.
    """
    named = []
    for group in groups:
        parents = self.parents(group, parent_groups)
        self.logger.info("Group with %i members", len(group['members']))

        if not parents:
            # No overlaps means new group thus new name
            name = self.new_name(0, handles)
        else:
            parent_count = len(parents)
            if parent_count == 1:
                name = self.one_parent(group, parents[0], handles)
            elif parent_count == 2:
                name = self.two_parents(group, parents, handles)
            else:
                name = self.many_parents(group, parents, handles)

        entry = dict(group)
        entry['parents'] = [parent['group'] for parent in parents]
        # The comment is moved out of the name before the name is stored.
        entry['comment'] = name.pop('comment')
        entry['name'] = dict(name)
        self.logger.info("Named group with %i members",
                         len(entry['members']))

        named.append(entry)
        handles.add(entry['name']['handle'])

    if len(groups) != len(named):
        raise core.InvalidState("Missing groups in naming")
    return named
def members_revised(self, class_id, release_id):
    """Get all members of the class.

    Unlike `members`, a class with a single member is not skipped here.

    Parameters
    ----------
    class_id : int
        The first class_id value for the NR class.
    release_id : int
        The first representative sets release that contains the class.
        This is not used in the lookup.

    Returns
    -------
    members : list
        A list of tuples (ife_id, nr_chain_id) for all members of the
        class.

    Raises
    ------
    core.InvalidState
        If the class has no members.
    """
    self.logger.info("members_revised: class_id: %s" % class_id)

    with self.session() as session:
        nch = aliased(mod.NrChains)
        query = (session.query(nch.ife_id, nch.nr_chain_id)
                 .filter(nch.nr_class_id == class_id))
        members = [(row.ife_id, row.nr_chain_id) for row in query]

    if members:
        return members
    raise core.InvalidState("No members in NR class: %i" % class_id)
def discrepancies(self, groups):
    """Load the discrepancies for the given groups.

    If use_discrepancy is False this will return an empty dictionary. The
    returned data structure is a dictionary of dictionaries where the
    final values are the stored discrepancy values. The keys of both
    levels are the chain ids, and each pair is stored in both directions.

    :param list groups: The list of groups to use, each with a 'db_id'.
    :returns: A nested dictionary of dictionaries.
    :raises core.InvalidState: If discrepancies are used but none are
        found.
    """
    if not self.use_discrepancy:
        return {}

    chain_ids = [group['db_id'] for group in groups]

    with self.session() as session:
        sim = mod.ChainChainSimilarity
        query = session.query(sim).\
            filter(sim.chain_id_1.in_(chain_ids)).\
            filter(sim.chain_id_2.in_(chain_ids))

        table = coll.defaultdict(dict)
        for row in query:
            first, second = row.chain_id_1, row.chain_id_2
            table[first][second] = row.discrepancy
            table[second][first] = row.discrepancy

    # Hand back a plain dict so missing keys fail instead of being added.
    table = dict(table)
    if not table and self.use_discrepancy:
        raise core.InvalidState("No discrepancy data to cluster with")
    return table
def ifes(self, nr_release_id):
    """Get a listing of all IFE's to use in clustering.

    The IFE's must be the representative of a class in the given nr
    release, the class must have the resolution MOTIF_RESOLUTION_CUTOFF
    and the experimental technique of the structure must be in
    MOTIF_ALLOWED_METHODS. This gives a subset of a representative set.

    :nr_release_id: The nr release id to get the IFE's from.
    :returns: A list of ife ids, ordered by ife id.
    :raises core.InvalidState: If no IFE's are found.
    """
    with self.session() as session:
        chains = mod.NrChains
        classes = mod.NrClasses
        ifes = mod.IfeInfo
        pdbs = mod.PdbInfo

        allowed = pdbs.experimental_technique.in_(MOTIF_ALLOWED_METHODS)
        query = session.query(chains).\
            join(classes, classes.nr_class_id == chains.nr_class_id).\
            join(ifes, ifes.ife_id == chains.ife_id).\
            join(pdbs, pdbs.pdb_id == ifes.pdb_id).\
            filter(chains.rep == 1).\
            filter(chains.nr_release_id == nr_release_id).\
            filter(classes.resolution == MOTIF_RESOLUTION_CUTOFF).\
            filter(allowed).\
            order_by(chains.ife_id)

        if not query.count():
            raise core.InvalidState("No ifes found for nr %s" %
                                    nr_release_id)

        return [row.ife_id for row in query]
def member_info(self, member):
    """Collect the information about a member.

    The result starts from the pdb and model of the ife and is updated
    with the entries of `member`. The 'chains' entry lists the names of
    the structured chains of the ife, or of all its chains if none is
    structured. The 'sym_op' entry comes from an `IfeLoader`.

    :param dict member: The member, must have an 'id' entry.
    :returns: A dictionary of the member information.
    :raises core.InvalidState: If the ife has no chains.
    """
    ife_id = member['id']

    with self.session() as session:
        row = session.query(mod.IfeInfo.pdb_id.label('pdb'),
                            mod.IfeInfo.model).\
            filter_by(ife_id=ife_id).\
            one()
        info = row2dict(row)
        info.update(member)

    with self.session() as session:
        query = session.query(mod.ChainInfo.chain_name,
                              mod.IfeChains.is_structured,
                              ).\
            join(mod.IfeChains,
                 mod.IfeChains.chain_id == mod.ChainInfo.chain_id).\
            filter_by(ife_id=ife_id)

        if not query.count():
            raise core.InvalidState("Could not find chains for %s" % member)

        all_chains = [row2dict(chain) for chain in query]

    structured = [c['chain_name'] for c in all_chains if c['is_structured']]
    # Fall back to every chain when none is marked as structured.
    info['chains'] = structured or [c['chain_name'] for c in all_chains]

    loader = self._create(IfeLoader)
    info['sym_op'] = loader.sym_op(info['pdb'])
    return info
def exp_id(self, chain_id):
    """Compute the experimental sequence id for the given chain id.

    This looks up the experimental sequences which have the same sequence
    as the given chain. Exactly one must match.

    Parameters
    ----------
    chain_id : int
        The chain id.

    Returns
    -------
    exp_seq_id : int
        The id of the matching experimental sequence.

    Raises
    ------
    core.InvalidState
        If the number of matching experimental sequences is not one.
    """
    with self.session() as session:
        exp = mod.ExpSeqInfo
        chain = mod.ChainInfo
        query = (session.query(exp.exp_seq_id)
                 .join(chain, chain.sequence == exp.sequence)
                 .filter(chain.chain_id == chain_id))

        if query.count() != 1:
            raise core.InvalidState("There should be exactly one matching"
                                    " experimental sequence")
        match = query.one()
        return match.exp_seq_id
def members(self, class_id):
    """Get all members of the class.

    Parameters
    ----------
    class_id : int
        The id of the NR class.

    Returns
    -------
    members : list
        A list of tuples (ife_id, nr_chain_id) for all members that are
        part of the class. ife_id is like 2A43|1|A and nr_chain_id is like
        11890928

    Raises
    ------
    core.Skip
        If the class has exactly one member.
    core.InvalidState
        If the class has no members.
    """
    self.logger.info("members: class_id: %s" % class_id)

    with self.session() as session:
        chains = mod.NrChains
        query = (session.query(chains.ife_id, chains.nr_chain_id)
                 .filter_by(nr_class_id=class_id))
        members = [(row.ife_id, row.nr_chain_id) for row in query]

    size = len(members)
    if size == 1:
        raise core.Skip("Skip group of size 1")
    if size == 0:
        raise core.InvalidState("No members in NR class: %i" % class_id)
    return members
def __call__(self, pdbs, parent_release, current_release,
             cutoffs=RESOLUTION_GROUPS, **kwargs):
    """Build the nr set.

    The structures are grouped, the groups are named using the parents,
    filtered by the cutoffs, given their parents and finally their
    representatives.

    :pdbs: The list of pdbs to process.
    :parent_release: Id of the release to load the parents from.
    :current_release: Id of the release being built.
    :cutoffs: Resolution groups to create a class for.
    :returns: A dictionary with the 'parent_counts', the 'groups' with
        their representatives, the 'release' id and the 'parent' release id.
    :raises core.InvalidState: If no pdbs are given.
    """
    if not pdbs:
        raise core.InvalidState("Must give pdbs to group")

    self.logger.info("Building nr release with %i pdbs", len(pdbs))

    grouped = self.group(pdbs, **kwargs)
    parents = self.load_parents(parent_release, cutoffs)
    # Names are assigned against the parents of the 'all' entry.
    named = self.name_groups(grouped, parents['all'])
    kept = self.filter_groups(named, cutoffs)
    parented = self.attach_parents(kept, parents)
    final = self.find_representatives(parented)

    release = {}
    release['parent_counts'] = self.counts(parents, final)
    release['groups'] = final
    release['release'] = current_release
    release['parent'] = parent_release
    return release
def nr_release_id(self, before_date=None, **kwargs):
    """Get the nr release, if not given manually.

    A release given as kwargs['manual']['nr_release'] is returned as is.
    If no before_date is given then we get the current release from
    `NrReleaseLoader`, otherwise we get the release with exactly the given
    date. If there is not exactly one release for that date then we fail.

    :param date before_date: The date to use.
    :returns: The nr release id.
    :raises core.InvalidState: If there is not exactly one release on the
        given date.
    """
    if 'nr_release' in kwargs.get('manual', {}):
        return kwargs['manual']['nr_release']

    if before_date is None:
        nr_release, _ = NrReleaseLoader(self.config, self.session).\
            current_id()
        return nr_release

    with self.session() as session:
        query = session.query(mod.NrReleases).\
            filter_by(date=before_date)

        if query.count() != 1:
            # Exceptions do not interpolate arguments like logging calls
            # do, so the message has to be formatted here.
            raise core.InvalidState("No nr release on %s" % before_date)

        return query.one().nr_release_id
def parse(self, filename, pdb):
    """Read the csv file and collect all interactions in it.

    Each row holds the two unit ids, the interaction family and the
    crossing number, in that order. Rows for the same pair of units are
    merged into one entry. The family is stored under the key given by
    `interaction_type`, if that returns one. The file itself is not
    modified or removed here.

    :filename: The input filename.
    :pdb: The pdb id.
    :returns: A list of interaction dictionaries sorted by unit ids.
    :raises core.InvalidState: If a row lacks one of the two unit ids.
    """
    # Local import so the block does not rely on a file level import.
    import sys

    # The csv module wants a binary file on Python 2 but a text file
    # opened with newline='' on Python 3.
    if sys.version_info[0] < 3:
        raw = open(filename, 'rb')
    else:
        raw = open(filename, 'r', newline='')

    data = coll.defaultdict(dict)
    with raw:
        reader = csv.reader(raw, delimiter=',', quotechar='"')
        for index, row in enumerate(reader):
            # The length check also turns blank lines into a clear error
            # instead of an IndexError.
            if len(row) < 2 or not row[0] or not row[1]:
                msg = "Line %s did not include both units"
                raise core.InvalidState(msg % index)

            interaction = data[(row[0], row[1])]
            interaction['unit_id_1'] = row[0]
            interaction['unit_id_2'] = row[1]
            interaction['f_crossing'] = int(row[3])
            interaction['pdb_id'] = pdb

            family = row[2].strip()
            inter_type = self.interaction_type(family)
            if inter_type:
                interaction[inter_type] = family

    key = op.itemgetter('unit_id_1', 'unit_id_2')
    return sorted(data.values(), key=key)