Beispiel #1
0
    def mapping(self, release_id, names):
        """Create a mapping from nr class names to their ids in the database.

        :release_id: The nr release id the classes must belong to.
        :names: A list of nr class names. A name may be listed more than once.
        :returns: A dictionary mapping class name to nr_class_id.
        :raises core.InvalidState: If names or release_id is empty, if no
        class matches at all, or if some of the names could not be found.
        """

        if not names or not release_id:
            raise core.InvalidState("Must give names and release id")

        with self.session() as session:
            query = session.query(mod.NrClasses.nr_class_id,
                                  mod.NrClasses.name,
                                  ).\
                filter(mod.NrClasses.name.in_(names)).\
                filter(mod.NrClasses.nr_release_id == release_id)

            # Build the mapping directly instead of issuing a separate COUNT
            # query first; an empty mapping means that nothing matched.
            mapping = dict((result.name, result.nr_class_id)
                           for result in query)

        if not mapping:
            self.logger.info(names)
            raise core.InvalidState("Found no clases with given names")

        # Compare against the distinct names: the mapping holds one key per
        # name, so a repeated input name must not count as a missing one.
        if len(mapping) != len(set(names)):
            raise core.InvalidState("Could not map all names")

        return mapping
    def to_process(self, pdbs, **kwargs):
        """Build the list of (loop type, release id) pairs to import. The
        given PDBs are not used; instead the cached data of every loop type in
        `ReleaseLoader.types` is checked against the current release id.

        Parameters
        ----------
        pdbs : list
            Ignored.

        Raises
        ------
        core.InvalidState
            If a loop type has no cached data, or the cached release differs
            from the current release id.

        Returns
        -------
        A list of tuples like [('IL', '1.0'), ('HL', '1.0')] to import.
        """
        loader = ReleaseLoader(self.config, self.session)
        current, _ = loader.current_id()

        def validated(loop_type):
            # Every loop type must have cached data for the current release.
            cached = self.cached(loop_type)
            if not cached:
                raise core.InvalidState("No cached data")
            if cached['release'] != current:
                raise core.InvalidState("Caching does not match excepted ID")
            return (loop_type, current)

        return [validated(loop_type) for loop_type in ReleaseLoader.types]
    def normalized_mapping(self, pdb_id):
        """Build a lookup table from Unit objects to stored unit ids for one
        structure. Some loops stored after the database migration carry
        incorrect unit ids (wrong model number and possibly bad alt ids); this
        table is meant to be used to find the correct unit id for them.

        :param str pdb_id: The pdb id to get units for.
        :returns: A dictionary with Unit keys mapping to the unit id.
        :raises core.InvalidState: If the structure has no units, or if two
        rows produce the same Unit key.
        """
        with self.session() as session:
            info = mod.UnitInfo
            columns = (
                info.unit_id,
                info.pdb_id.label('pdb'),
                info.model,
                info.chain,
                info.number.label('component_number'),
                info.ins_code.label('insertion_code'),
                info.alt_id,
                info.sym_op.label('symmetry'),
            )
            query = session.query(*columns).filter(info.pdb_id == pdb_id)

            if query.count() == 0:
                raise core.InvalidState("No units in %s" % pdb_id)

            lookup = {}
            for row in query:
                fields = row2dict(row)
                # Everything except the unit id itself makes up the key.
                stored_id = fields.pop('unit_id')
                unit = Unit(**fields)
                if unit in lookup:
                    raise core.InvalidState("Non unique mapping")
                lookup[unit] = stored_id
        return lookup
    def correct_structure(self, pdb, mapping, units, require_all=True):
        """Correct the units that belong to a single structure. Units whose
        `pdb` differs from the given one are skipped.

        :param str pdb: The PDB id to use.
        :param dict mapping: The mapping to use.
        :param list units: The list of Unit's to correct.
        :param bool require_all: If True a unit that cannot be corrected
        raises, otherwise the failure is logged and the unit is dropped.
        :returns: A list of the corrected units.
        :raises core.InvalidState: If a unit cannot be corrected while
        require_all is set, or if the corrected units are not unique.
        """

        corrected = []
        for unit in units:
            if unit.pdb != pdb:
                continue

            fixed = self.correct(mapping, unit)
            if fixed:
                corrected.append(fixed)
                continue

            msg = "Could not correct {unit}".format(unit=str(unit))
            if require_all:
                raise core.InvalidState(msg)
            self.logger.error(msg)

        # Two units collapsing onto the same corrected value is an error.
        if len(set(corrected)) != len(corrected):
            raise core.InvalidState("Did not produce unique normalization")

        return corrected
Beispiel #5
0
    def map(self, nts, mapping):
        """Lazily translate each entry of `nts` through `mapping`.

        This is a generator, so the checks below only run once iteration
        starts.

        :param nts: An iterable of keys to translate.
        :param dict mapping: The mapping to look the keys up in.
        :returns: A generator of the mapped values, in the order of `nts`.
        :raises core.InvalidState: If the mapping is empty or a key is missing.
        """
        if not mapping:
            raise core.InvalidState("Given empty mapping")

        for nt in nts:
            if nt in mapping:
                yield mapping[nt]
            else:
                raise core.InvalidState("Missing nt %s" % nt)
Beispiel #6
0
 def __init__(self, *args, **kwargs):
     """Initialize the loader and check that the `name`, `table` and
     `headers` attributes are all set to a truthy value.

     :raises core.InvalidState: If one of the attributes is missing a value.
     """
     super(CsvLoader, self).__init__(*args, **kwargs)
     # Checked in this order; the first unset attribute raises.
     required = (
         ('name', "Must specify name"),
         ('table', "Must specify the table name"),
         ('headers', "Must specify file headers"),
     )
     for attribute, message in required:
         if not getattr(self, attribute):
             raise core.InvalidState(message)
Beispiel #7
0
    def __call__(self,
                 initial,
                 length_increase=NR_LENGTH_PERCENT_INCREASE,
                 bp_increase=NR_BP_PERCENT_INCREASE):
        """
        Find the representative for the group.

        Starting from the initial representative, the best candidate above
        the cutoffs is selected repeatedly until the selection no longer
        changes.

        Parameters
        ----------
        initial : dict
            The group to find the representative of. It is filtered by method
            before searching, and its 'members' entry is what the
            representative gets inserted into.
        length_increase : float
            The fraction increase in resolved nucleotides that an IFE must have
            to be selected as representative.
        bp_increase : float
            The fraction increase of basepairs that an IFE must have to be
            selected as representative.

        Raises
        ------
        core.InvalidState
            If there is no initial representative, or if the search produces
            no representative.

        Returns
        -------
        The result of `insert_as_representative` for the selected
        representative and the members of `initial`, sorted with `bp_per_nt`.
        """

        group = self.filter_group_by_method(initial)
        best = self.initial_representative(group)
        if not best:
            raise core.InvalidState("No current representative")
        self.logger.debug("Naive representative: %s", best['id'])

        rep = best
        while True:
            candidates = self.candidates(rep, group['members'])
            self.logger.debug("Found %i representative candidates",
                              len(candidates))
            new_rep = self.best_above_cutoffs(rep, candidates, length_increase,
                                              bp_increase)
            # Validate before new_rep['id'] is used for logging below. A
            # check after the loop could never fire, as an empty result
            # would already have failed on the subscript.
            if not new_rep:
                raise core.InvalidState("No representative found")
            if new_rep == rep:
                break
            self.logger.info("Changed representative from %s to %s", rep['id'],
                             new_rep['id'])
            rep = new_rep

        self.logger.debug("Computed representative: %s", rep['id'])
        return self.insert_as_representative(rep,
                                             initial['members'],
                                             sort=bp_per_nt)
    def loops(self,
              loop_release_id,
              loop_type,
              ifes,
              size_limit=None,
              **kwargs):
        """Get the sorted loop ids to use in clustering. The loops must have
        quality status 1 in the given loop release, be of the given type,
        have positions in chains of the given IFE's and be neither in
        BLACKLIST nor in the result of `loops_to_exclude`.

        Parameters
        ----------
        loop_release_id : str
            The loop release id to use.
        loop_type : str
            The type of loop to use, eg, IL, HL.
        ifes : list
            A list of ife ids to find loops in.
        size_limit : int, optional
            If given, only loops with a length below this value are used.
        **kwargs
            Passed on to `loops_to_exclude`.

        Raises
        ------
        core.InvalidState
            If no loops remain.

        Returns
        -------
        loops : list
            A sorted list of loop ids to process.
        """

        excluded = self.loops_to_exclude(**kwargs)

        with self.session() as session:
            info = mod.LoopInfo
            qa = mod.LoopQa
            positions = mod.LoopPositions
            ife_chains = mod.IfeChains
            chains = mod.ChainInfo
            units = mod.UnitInfo

            # A unit belongs to a chain when both name and structure agree.
            same_chain = ((chains.chain_name == units.chain) &
                          (chains.pdb_id == units.pdb_id))

            query = session.query(info.loop_id)
            query = query.join(qa, qa.loop_id == info.loop_id)
            query = query.join(positions, positions.loop_id == info.loop_id)
            query = query.join(units, units.unit_id == positions.unit_id)
            query = query.join(chains, same_chain)
            query = query.join(ife_chains,
                               ife_chains.chain_id == chains.chain_id)
            query = query.filter(qa.status == 1)
            query = query.filter(qa.loop_release_id == loop_release_id)
            query = query.filter(info.type == loop_type)
            query = query.filter(ife_chains.ife_id.in_(ifes))
            query = query.filter(~info.loop_id.in_(BLACKLIST))
            query = query.distinct()

            if size_limit is not None:
                query = query.filter(info.length < size_limit)

            loop_ids = set(row.loop_id for row in query
                           if row.loop_id not in excluded)

        if not loop_ids:
            raise core.InvalidState("No loops to cluster for %s" %
                                    loop_release_id)
        return sorted(loop_ids)
Beispiel #9
0
    def annotations(self, pdb, remove=True):
        """Call matlab and parse the annotations to create a list of unit id to
        loop mappings.

        :param str pdb: The pdb id to use.
        :param Bool remove: Flag to indicate if the produced file should be
        removed.
        :returns: The annotations produced by matlab, as parsed by `parse`.
        :raises core.InvalidState: If the working directory for matlab cannot
        be created.
        :raises matlab.MatlabFailed: If matlab reports an error message.
        """

        mlab = matlab.Matlab(self.config['locations']['fr3d_root'])
        path = str(os.path.join(self.precomputed, pdb))
        try:
            if not os.path.exists(path):
                os.mkdir(path)
        except OSError:
            # Only filesystem failures are translated; a bare except would
            # also swallow KeyboardInterrupt/SystemExit and programming errors.
            raise core.InvalidState("Could not create %s for matlab" % path)

        [output_file, err_msg] = mlab.loadLoopPositions(path, nout=2)
        if err_msg != '':
            raise matlab.MatlabFailed(err_msg)

        data = self.parse(output_file)
        if remove:
            os.remove(output_file)
        return data
    def data(self, pdb, **kwargs):
        """Generate the coordinate entries for the given PDB. Units whose
        sequence is 'HOH' (water) are left out, as those aren't generally
        worth displaying in the coordinate server.

        Parameters
        ----------
        pdb : str
            The PDB id to use.

        Raises
        ------
        core.InvalidState
            If no coordinates are computed for a unit.

        Yields
        ------
        coord : UnitCoordinates
            A UnitCoordinates object with the coordinates to write.
        """

        for residue in self.structure(pdb).residues():
            if residue.sequence == 'HOH':
                continue

            coordinates = self.coordinates(pdb, residue)
            self.logger.debug("data: PDB: %s" % pdb)
            self.logger.debug("data: unit: %s" % residue)
            self.logger.debug("data: coordinates: %s" % coordinates)
            if not coordinates:
                raise core.InvalidState("No coordinates computed for %s" %
                                        residue)

            entry = mod.UnitCoordinates(
                unit_id=residue.unit_id(),
                coordinates=coordinates,
            )
            yield entry
Beispiel #11
0
    def guess_loop_release(self, pdbs, loop_release=None, **kwargs):
        """Pick the loop release to use. A given `loop_release` is returned
        as is. Otherwise the loop releases with quality entries for loops of
        the given PDBs are examined, ordered by descending release date, and
        the first one that covers exactly the given PDBs is returned.

        :param list pdbs: The pdb ids which must all be covered.
        :param loop_release: The loop release id to use, if already known.
        :returns: The loop release id.
        :raises core.InvalidState: If no release covers all the PDBs.
        """
        if loop_release:
            return loop_release

        with self.session() as session:
            qa = mod.LoopQa
            releases = mod.LoopReleases
            loops = mod.LoopInfo

            query = session.query(releases.loop_release_id, loops.pdb_id)
            query = query.join(
                qa, qa.loop_release_id == releases.loop_release_id)
            query = query.join(loops, loops.loop_id == qa.loop_id)
            query = query.filter(loops.pdb_id.in_(pdbs))
            query = query.distinct()
            query = query.order_by(desc(releases.date))

            # Remember the order in which releases were first seen, and
            # which structures each of them covers.
            seen_order = []
            covered = coll.defaultdict(set)
            for row in query:
                release_id = row.loop_release_id
                if release_id not in covered:
                    seen_order.append(release_id)
                covered[release_id].add(row.pdb_id)

            wanted = set(pdbs)
            for release_id in seen_order:
                if covered[release_id] == wanted:
                    return release_id

        raise core.InvalidState("No possible loop release for all PDBs")
Beispiel #12
0
    def to_process(self, pdbs, **kwargs):
        """Collect the list of nr_class name values to process for the
        specified release. Ignores the given PDBs.

        The release is taken from kwargs['manual']['nr_release_id'] when that
        is set, and otherwise from the 'release' entry of the data cached
        under NR_CACHE_NAME.

        Parameters
        ----------
        pdbs : list
            Ignored.

        Raises
        ------
        core.InvalidState
            If no release is given manually and there is no cached data.

        Returns
        -------
        classlist : list
            The list of NR class names to process.
        """

        resolution = 'all'

        manual_release = kwargs.get('manual', {}).get('nr_release_id', False)
        if manual_release:
            latest = manual_release
        else:
            data = self.cached(NR_CACHE_NAME)
            if not data:
                raise core.InvalidState("No precomputed grouping to store")
            latest = data['release']

        # No session is opened here: nothing in this method queries the
        # database directly, the lookup is delegated to list_nr_classes.
        return self.list_nr_classes(latest, resolution)
Beispiel #13
0
    def data(self, pair, **kwargs):
        """Compute the parentage data from the cached data of a loop type.

        Parameters
        ----------
        pair : tuple
            A (loop_type, release) pair. Only the loop type is used, to look
            up the cached data.

        Raises
        ------
        core.InvalidState
            If there is no cached data for the loop type.
        Skip
            If the cached release is the same as its parent release (first
            release), or if `no_parents` reports that there are no parents.

        Returns
        -------
        data : list
            The result of `parents` for the cached data; per the original
            documentation a list of dicts that can be written to the
            ml_parents table.
        """

        loop_type, _release = pair
        cached = self.cached(loop_type)
        if not cached:
            raise core.InvalidState("No cached data")

        is_first_release = cached['release'] == cached['parent']
        if is_first_release:
            raise core.Skip("No parents for first release")

        if self.no_parents(cached):
            raise core.Skip("Parent counts show no parents")

        return self.parents(cached)
Beispiel #14
0
    def sequence(self, exp_id):
        """Load the position ids and the sequence of the experimental
        sequence with the given id, ordered by position index. Positions
        without a normalized unit contribute an 'N' to the sequence.

        :param int exp_id: The experimental sequence id.
        :returns: A dictionary with 'ids', a list of the position ids, and
        'sequence', a string.
        :raises core.InvalidState: If the sequence has no positions.
        """

        with self.session() as session:
            position = mod.ExpSeqPosition
            query = session.query(position)
            query = query.filter(position.exp_seq_id == exp_id)
            query = query.order_by(position.index)

            if query.count() == 0:
                raise core.InvalidState("Could not get sequence for %s" %
                                        exp_id)

            entries = [(row.exp_seq_position_id, row.normalized_unit or 'N')
                       for row in query]

        return {
            'ids': [seq_id for seq_id, _ in entries],
            'sequence': ''.join(unit for _, unit in entries),
        }
    def as_quality(self, mapping, entry):
        """Turn a parser entry into rows for the units_quality table. One row
        is produced for every unit id the entry maps to, so an entry that
        expands to several units (e.g. through symmetry operators) gives more
        than one row. This is a generator, so nothing runs until iterated.

        Parameters
        ----------
        mapping : dict
            The mapping as produced by `mapping`; looked up with the key that
            `as_key` computes from the entry's 'id'.
        entry : dict
            A dictionary from `Parser.nts`.

        Raises
        ------
        core.InvalidState
            If the mapping has no unit ids for the entry.

        Yields
        ------
        entry : dict
            A dictionary of 'unit_id', 'real_space_r', 'density_correlation',
            'real_space_r_z_score'.
        """

        unit_ids = mapping[as_key(entry['id'])]
        if not unit_ids:
            raise core.InvalidState("Could not find unit id for %s" % entry)

        quality_fields = ('real_space_r',
                          'density_correlation',
                          'real_space_r_z_score')
        for unit_id in unit_ids:
            row = {'unit_id': unit_id}
            for field in quality_fields:
                row[field] = entry.get(field)
            yield row
Beispiel #16
0
 def select_candidates(self, members):
     """Narrow the members down by method, then nts, then resolution.

     :param members: The members to select from.
     :returns: The members left after all three filters.
     :raises core.InvalidState: If nothing is left after filtering.
     """
     candidates = self.filter_by_resolution(
         self.filter_by_nts(
             self.filter_by_method(members)))
     if candidates:
         return candidates
     raise core.InvalidState("Nothing with good resolution")
    def cross_chain_interactions(self, ifes, sym_op='1_555'):
        """Count the cWW interactions between every ordered pair of chains
        of the given ifes. The pdb of the first ife is used for all lookups
        and the count of a chain with itself is always reported as 0.

        :ifes: A list of ifes; their `pdb` and `chain` attributes are used.
        :sym_op: The symmetry operator to count interactions for.
        :returns: A dictionary of like { 'A': { 'B': 10 }, 'B': { 'A': 10 } }.
        :raises core.InvalidState: If no ifes are given.
        """

        if not ifes:
            raise core.InvalidState("No ifes to get interactions between")

        pdb = ifes[0].pdb
        helper = st.BasePairQueries(self.session)
        chains = [ife.chain for ife in ifes]

        counts = {}
        for first in chains:
            for second in chains:
                found = helper.cross_chain(pdb,
                                           first,
                                           second,
                                           count=True,
                                           family='cWW',
                                           sym_op=sym_op)
                if first == second:
                    found = 0
                counts.setdefault(first, {})[second] = found

        return counts
Beispiel #18
0
    def positions(self, pdb, chain):
        """Load the experimental sequence positions of a chain.

        :param str pdb: The pdb id of the chain.
        :param str chain: The name of the chain.
        :returns: A list of dictionaries, one per row as converted by
        `row2dict`, each with an added 'observed' entry (1 if the row has a
        unit id, 0 otherwise) and with 'index' increased by one.
        :raises core.InvalidState: If no positions are found for the chain.
        """
        # NOTE(review): the result is not used here; the call is kept in case
        # it validates that the sequence exists - confirm and drop otherwise.
        self.exp_seq(pdb, chain)
        with self.session() as session:
            esum = mod.ExpSeqUnitMapping
            esp = mod.ExpSeqPosition
            escm = mod.ExpSeqChainMapping
            ci = mod.ChainInfo
            query = session.query(
                esum.unit_id,
                esp.index,
                esp.unit,
            ).join(esp, esp.exp_seq_position_id == esum.exp_seq_position_id).\
                join(escm, escm.exp_seq_chain_mapping_id == esum.exp_seq_chain_mapping_id).\
                join(ci, ci.chain_id == escm.chain_id).\
                filter(ci.pdb_id == pdb).\
                filter(ci.chain_name == chain)

            if not query.count():
                # The parameter is called `pdb`; the previously used name
                # `pdb_id` does not exist here and raised a NameError.
                raise core.InvalidState(
                    "Could not load positions for %s|1|%s" % (pdb, chain))

            positions = []
            for result in query:
                entry = row2dict(result)
                entry['observed'] = int(result.unit_id is not None)
                entry['index'] = entry['index'] + 1
                positions.append(entry)
            return positions
    def best_model(self, pdb, sym_op):
        """Determine what model to use for ifes. We will use the model with the
        most basepairs. It tiebreaks on model number, lower is better. If the
        structure has only one model that model is used without counting.

        :pdb: The pdb id to use.
        :sym_op: The symmetry operator to use.
        :returns: The model number to use.
        :raises core.InvalidState: If the structure has no models.
        """

        with self.session() as session:
            query = session.query(mod.UnitInfo.model).\
                filter_by(pdb_id=pdb).\
                distinct()
            models = [result.model for result in query]

        if not models:
            # Format the message here; exceptions do not interpolate extra
            # arguments the way logger calls do.
            raise core.InvalidState("No models found for %s" % pdb)
        if len(models) == 1:
            return models[0]

        helper = st.BasePairQueries(self.session)
        count = ft.partial(helper.representative,
                           pdb,
                           None,
                           count=True,
                           sym_op=sym_op)
        # Negating the model number makes max() prefer the lower model when
        # the basepair counts are tied.
        ranked = [(count(model=model), -1 * model) for model in models]
        return -1 * max(ranked)[1]
Beispiel #20
0
    def interactions(self, pdb_id, chain, positions, remove_pseudoknots=False):
        """Load the cWW interactions within a chain, restricted to pairs of
        units that share a symmetry operator.

        :param str pdb_id: The pdb id, passed to `__limit_units__`.
        :param str chain: The chain name, passed to `__limit_units__`.
        :param list positions: Position dictionaries, each with a 'unit_id'.
        :param bool remove_pseudoknots: If set, only interactions with an
        f_crossing below 4 are used.
        :returns: A dictionary from the unit id of the first unit to the
        position dictionary of the second unit.
        :raises core.InvalidState: If no interactions are found.
        """
        by_unit = dict((entry['unit_id'], entry) for entry in positions)
        with self.session() as session:
            pairs = mod.UnitPairsInteractions
            first = aliased(mod.UnitInfo)
            second = aliased(mod.UnitInfo)

            query = session.query(pairs)
            query = query.join(first, first.unit_id == pairs.unit_id_1)
            query = query.join(second, second.unit_id == pairs.unit_id_2)
            query = query.filter(pairs.f_lwbp == 'cWW')
            query = query.filter(first.sym_op == second.sym_op)
            query = self.__limit_units__(query, first, pdb_id, chain)
            query = self.__limit_units__(query, second, pdb_id, chain)

            if remove_pseudoknots:
                query = query.filter(pairs.f_crossing < 4)

            if query.count() == 0:
                raise core.InvalidState(
                    "Could not load interactions for %s|1|%s" %
                    (pdb_id, chain))

            paired = {}
            for row in query:
                source = by_unit[row.unit_id_1]['unit_id']
                paired[source] = by_unit[row.unit_id_2]
            return paired
    def __call__(self, groups, parent_groups, handles):
        """Name every group based on how many parents it has.

        A group without parents gets a new name; groups with one, two or
        more parents are named by `one_parent`, `two_parents` and
        `many_parents` respectively. The handle of each named group is added
        to `handles`.

        :param list groups: The groups to name, each with a 'members' entry.
        :param parent_groups: The possible parents, passed to `parents`.
        :param set handles: The handles in use; updated in place.
        :returns: A list with a copy of each group, extended with 'parents',
        'comment' and 'name' entries.
        :raises core.InvalidState: If not every group was named.
        """
        results = []
        for group in groups:
            parents = self.parents(group, parent_groups)
            self.logger.info("Group with %i members", len(group['members']))

            if not parents:
                # No overlaps means new group thus new name
                naming = self.new_name(0, handles)
            else:
                parent_count = len(parents)
                if parent_count == 1:
                    naming = self.one_parent(group, parents[0], handles)
                elif parent_count == 2:
                    naming = self.two_parents(group, parents, handles)
                else:
                    naming = self.many_parents(group, parents, handles)

            entry = dict(group)
            entry['parents'] = [parent['group'] for parent in parents]
            # The comment is stored on the group, not as part of the name.
            entry['comment'] = naming.pop('comment')
            entry['name'] = dict(naming)
            self.logger.info("Named group with %i members",
                             len(entry['members']))

            results.append(entry)
            handles.add(entry['name']['handle'])

        if len(results) != len(groups):
            raise core.InvalidState("Missing groups in naming")

        return results
Beispiel #22
0
    def members_revised(self, class_id, release_id):
        """Get all members of the class. Classes with a single member are
        returned as well; no Skip is raised for them.

        Parameters
        ----------
        class_id : int
            The first class_id value for the NR class.

        release_id : int
            The first representative sets release that contains the class.
            Not used by the query.

        Raises
        ------
        core.InvalidState
            If the class has no members.

        Returns
        -------
        members : list
            A list of tuples (ife_id, nr_chain_id) for all
            members of the class.
        """

        self.logger.info("members_revised:  class_id: %s" % class_id)

        members = []
        with self.session() as session:
            nch = aliased(mod.NrChains)
            query = session.query(nch.ife_id, nch.nr_chain_id)
            query = query.filter(nch.nr_class_id == class_id)
            for row in query:
                members.append((row.ife_id, row.nr_chain_id))

        if members:
            return members

        raise core.InvalidState("No members in NR class: %i" % class_id)
Beispiel #23
0
    def discrepancies(self, groups):
        """Load the stored discrepancies between the chains of the given
        groups. If use_discrepancy is False an empty dictionary is returned
        without querying. The result is a symmetric nested dictionary keyed
        by chain id on both levels, holding the stored discrepancy value.

        :param list groups: The list of groups to use; the 'db_id' of each is
        used as the chain id.
        :returns: A nested dictionary of dictionaries.
        :raises core.InvalidState: If no discrepancy data is found.
        """

        if not self.use_discrepancy:
            return {}

        chain_ids = [group['db_id'] for group in groups]

        table = {}
        with self.session() as session:
            sim = mod.ChainChainSimilarity
            query = session.query(sim)
            query = query.filter(sim.chain_id_1.in_(chain_ids))
            query = query.filter(sim.chain_id_2.in_(chain_ids))

            for row in query:
                first = row.chain_id_1
                second = row.chain_id_2
                # Store both directions so lookups work either way round.
                table.setdefault(first, {})[second] = row.discrepancy
                table.setdefault(second, {})[first] = row.discrepancy

        if not table and self.use_discrepancy:
            raise core.InvalidState("No discrepancy data to cluster with")
        return table
Beispiel #24
0
    def ifes(self, nr_release_id):
        """Get the ids of all IFE's to use in clustering. An IFE is used if
        it is the representative (rep is 1) of a class in the given nr
        release, the class has the resolution MOTIF_RESOLUTION_CUTOFF and the
        experimental technique of its structure is in MOTIF_ALLOWED_METHODS.
        This gives a subset of a representative set.

        :nr_release_id: The nr release id to get the representatives of.
        :returns: A list of ife ids, ordered by ife id.
        :raises core.InvalidState: If no ifes are found.
        """

        with self.session() as session:
            nr_chains = mod.NrChains
            nr_classes = mod.NrClasses
            ife_info = mod.IfeInfo
            pdb_info = mod.PdbInfo

            query = session.query(nr_chains)
            query = query.join(
                nr_classes, nr_classes.nr_class_id == nr_chains.nr_class_id)
            query = query.join(ife_info, ife_info.ife_id == nr_chains.ife_id)
            query = query.join(pdb_info, pdb_info.pdb_id == ife_info.pdb_id)
            query = query.filter(nr_chains.rep == 1)
            query = query.filter(nr_chains.nr_release_id == nr_release_id)
            query = query.filter(
                nr_classes.resolution == MOTIF_RESOLUTION_CUTOFF)
            query = query.filter(
                pdb_info.experimental_technique.in_(MOTIF_ALLOWED_METHODS))
            query = query.order_by(nr_chains.ife_id)

            if query.count() == 0:
                raise core.InvalidState("No ifes found for nr %s" %
                                        nr_release_id)

            return [row.ife_id for row in query]
Beispiel #25
0
    def member_info(self, member):
        """Collect structure, chain and symmetry information for a member.

        :param dict member: The member to look up; its 'id' entry is used as
        the ife id and all of its entries are copied into the result.
        :returns: A dictionary with the ife's row (pdb id labelled 'pdb', and
        model) as converted by `row2dict`, updated with the entries of
        `member`, plus 'chains' and 'sym_op'. 'chains' lists the names of the
        structured chains, or of all chains if none is structured.
        :raises core.InvalidState: If no chains are found for the member.
        """
        with self.session() as session:
            ife_query = session.query(mod.IfeInfo.pdb_id.label('pdb'),
                                      mod.IfeInfo.model)
            ife_row = ife_query.filter_by(ife_id=member['id']).one()
            info = row2dict(ife_row)
            info.update(member)

            # The chains are loaded through a second, nested session.
            with self.session() as session:
                chain_query = session.query(mod.ChainInfo.chain_name,
                                            mod.IfeChains.is_structured)
                chain_query = chain_query.join(
                    mod.IfeChains,
                    mod.IfeChains.chain_id == mod.ChainInfo.chain_id)
                chain_query = chain_query.filter_by(ife_id=member['id'])

                if chain_query.count() == 0:
                    raise core.InvalidState("Could not find chains for %s" %
                                            member)

                all_chains = [row2dict(row) for row in chain_query]
                chains = [entry['chain_name'] for entry in all_chains
                          if entry['is_structured']]
                if not chains:
                    chains = [entry['chain_name'] for entry in all_chains]

            info['chains'] = chains
            loader = self._create(IfeLoader)
            info['sym_op'] = loader.sym_op(info['pdb'])

            return info
    def exp_id(self, chain_id):
        """Find the experimental sequence id for the given chain id. The
        experimental sequence is matched to the chain through having the same
        sequence, and exactly one match is required.

        Parameters
        ----------
        chain_id : int
            The chain id.

        Raises
        ------
        core.InvalidState
            If the number of matching experimental sequences is not one.

        Returns
        -------
        exp_seq_id : int
            The id of the matching experimental sequence.
        """

        with self.session() as session:
            exp = mod.ExpSeqInfo
            chains = mod.ChainInfo
            query = session.query(exp.exp_seq_id)
            query = query.join(chains, chains.sequence == exp.sequence)
            query = query.filter(chains.chain_id == chain_id)

            matches = query.count()
            if matches != 1:
                raise core.InvalidState("There should be exactly one matching"
                                        " experimental sequence")
            return query.one().exp_seq_id
Beispiel #27
0
    def members(self, class_id):
        """Get all members of the class.

        Parameters
        ----------
        class_id : int
            The id of the the NR class.

        Raises
        ------
        core.Skip
            If the class has exactly one member.
        core.InvalidState
            If the class has no members.

        Returns
        -------
        members : list
            A list of tuples (ife_id, nr_chain_id) for all members that are
            part of the class.
            ife_id is like 2A43|1|A and nr_chain_id is like 11890928
        """

        self.logger.info("members: class_id: %s" % class_id)

        members = []
        with self.session() as session:
            chains = mod.NrChains
            query = session.query(chains.ife_id, chains.nr_chain_id)
            query = query.filter_by(nr_class_id=class_id)
            for row in query:
                members.append((row.ife_id, row.nr_chain_id))

        member_count = len(members)
        if member_count == 1:
            raise core.Skip("Skip group of size 1")

        if member_count == 0:
            raise core.InvalidState("No members in NR class: %i" % class_id)

        return members
Beispiel #28
0
    def __call__(self,
                 pdbs,
                 parent_release,
                 current_release,
                 cutoffs=RESOLUTION_GROUPS,
                 **kwargs):
        """Build the nr set. The pdbs are grouped, the groups are named
        against the parent release, filtered by the cutoffs, connected to
        their parents and given representatives.

        :pdbs: The list of pdbs to process.
        :parent_release: Id of the release to load the parents from.
        :current_release: Id of the release being built.
        :cutoffs: Resolution groups to create a class for.
        :kwargs: Passed on to `group`.
        :returns: A dictionary with the 'parent_counts', the 'groups' with
        their representatives, the 'release' id and the 'parent' release id.
        :raises core.InvalidState: If no pdbs are given.
        """

        if not pdbs:
            raise core.InvalidState("Must give pdbs to group")

        self.logger.info("Building nr release with %i pdbs", len(pdbs))

        grouped = self.group(pdbs, **kwargs)
        parents = self.load_parents(parent_release, cutoffs)

        # Each step consumes the result of the one before it.
        result = self.name_groups(grouped, parents['all'])
        result = self.filter_groups(result, cutoffs)
        result = self.attach_parents(result, parents)
        result = self.find_representatives(result)

        parent_counts = self.counts(parents, result)
        return {
            'parent_counts': parent_counts,
            'groups': result,
            'release': current_release,
            'parent': parent_release,
        }
Beispiel #29
0
    def nr_release_id(self, before_date=None, **kwargs):
        """Get the nr release, if not given manually.
        A release given as kwargs['manual']['nr_release'] is returned as is.
        If no before_date is given then we get the latest,
        otherwise we get the release for the given date.
        If there is not exactly one release for that date then we fail.

        :param date before_date: The date to use.
        :returns: The nr release id.
        :raises core.InvalidState: If there is not exactly one release with
        the given date.
        """

        manual = kwargs.get('manual', {})
        if 'nr_release' in manual:
            return manual['nr_release']

        if before_date is None:
            nr_release, _ = NrReleaseLoader(self.config, self.session).\
                current_id()
            return nr_release

        with self.session() as session:
            query = session.query(mod.NrReleases).\
                filter_by(date=before_date)

            if query.count() != 1:
                # Format the message here; exceptions do not interpolate
                # extra arguments the way logger calls do.
                raise core.InvalidState("No nr release on %s" %
                                        (before_date,))
            return query.one().nr_release_id
    def parse(self, filename, pdb):
        """Read the csv file and collect the interactions in it. Rows for
        the same pair of units are merged into a single interaction. The
        file itself is left in place by this method.

        :filename: The input filename.
        :pdb: The pdb id.
        :returns: A list of interaction dictionaries, sorted by the two unit
        ids.
        :raises core.InvalidState: If a row has an empty first or second
        unit.
        """

        by_pair = {}
        with open(filename, 'rb') as raw:
            rows = csv.reader(raw, delimiter=',', quotechar='"')
            for line_index, row in enumerate(rows):
                if not row[0] or not row[1]:
                    msg = "Line %s did not include both units"
                    raise core.InvalidState(msg % line_index)

                entry = by_pair.setdefault((row[0], row[1]), {})
                entry['unit_id_1'] = row[0]
                entry['unit_id_2'] = row[1]
                entry['f_crossing'] = int(row[3])
                entry['pdb_id'] = pdb

                # Only families with a known interaction type are stored,
                # under the name of that type.
                family = row[2].strip()
                kind = self.interaction_type(family)
                if kind:
                    entry[kind] = family

        interactions = list(by_pair.values())
        interactions.sort(key=op.itemgetter('unit_id_1', 'unit_id_2'))
        return interactions