Ejemplo n.º 1
0
    def edge_lookup(self, keylookup_obj, id_strct, debug=False):
        """
        Follow this edge for a batch of identifiers.

        The ids held in ``id_strct.id_lst`` are fetched with a single call to
        ``self.collection_find``.  For every document returned, each original
        id that ``id_strct.find_right`` associates with the document's
        ``self.lookup`` value is paired with the document's ``self.field``
        value in the result.

        :param keylookup_obj: not used by this method
        :param id_strct: IDStruct holding the identifiers to follow
        :param debug: when true, debug information is imported from
            ``id_strct`` and a ``set_debug`` entry labelled ``self.label``
            is recorded for every pair that is added
        :return: a new IDStruct holding the pairs that were found
        :raises TypeError: if ``id_strct`` is not an IDStruct
        """
        if not isinstance(id_strct, IDStruct):
            raise TypeError("edge_lookup id_struct is of the wrong type")

        # Results are collected in a fresh structure; the input is only read
        translated = IDStruct()

        # Carry the existing debug trail over before adding new entries
        if debug:
            translated.import_debug(id_strct)

        source_ids = id_strct.id_lst
        if not source_ids:
            # Nothing to look up, so no query is issued
            return translated

        for doc in self.collection_find(source_ids, self.lookup, self.field):
            matched_key = nested_lookup(doc, self.lookup)
            for orig_id in id_strct.find_right(matched_key):
                translated.add(orig_id, nested_lookup(doc, self.field))
                if debug:
                    translated.set_debug(
                        orig_id, self.label, nested_lookup(doc, self.field))
        return translated
Ejemplo n.º 2
0
    def edge_lookup(self, keylookup_obj, id_strct):
        """
        Follow this edge for a batch of identifiers.

        One ``self.collection.find`` query is issued with an ``$in`` filter
        on ``self.lookup`` over the ids in ``id_strct.id_lst``, projecting
        only ``self.lookup`` and ``self.field``.  For every document
        returned, each original id that ``id_strct.find_right`` associates
        with the document's ``self.lookup`` value is paired with the
        document's ``self.field`` value in the result.

        :param keylookup_obj: not used by this method
        :param id_strct: IDStruct holding the identifiers to follow
        :return: a new IDStruct holding the pairs that were found
        :raises TypeError: if ``id_strct`` is not an IDStruct
        """
        if not isinstance(id_strct, IDStruct):
            raise TypeError("edge_lookup id_struct is of the wrong type")

        # Results are collected in a fresh structure; the input is only read
        translated = IDStruct()

        source_ids = id_strct.id_lst
        if len(source_ids) == 0:
            # Nothing to look up, so no query is issued
            return translated

        query = {self.lookup: {"$in": source_ids}}
        projection = {self.lookup: 1, self.field: 1}

        for doc in self.collection.find(query, projection):
            matched_key = nested_lookup(doc, self.lookup)
            for orig_id in id_strct.find_right(matched_key):
                translated.add(orig_id, nested_lookup(doc, self.field))
        return translated
Ejemplo n.º 3
0
 def _build_hit_miss_lsts(doc_lst, id_strct, debug):
     """
     Split documents into those whose identifier was replaced and those
     left unchanged.

     The value found at ``input_type[1]`` in each document is passed to
     ``id_strct.find_left``.  Each id returned produces a deep copy of the
     document whose ``_id`` is that id as a string; a document with no
     ids returned goes to the miss list as is.  ``input_type`` is not a
     parameter: it is taken from the enclosing scope.

     :param doc_lst: iterable of documents
     :param id_strct: structure queried with ``find_left`` (and
         ``get_debug`` when ``debug`` is true)
     :param debug: when true, ``start_field`` and ``debug`` entries are
         written into each copy's existing ``dt_debug`` mapping
     :return: tuple ``(hit_lst, miss_lst)``
     """
     hit_lst, miss_lst = [], []
     for doc in doc_lst:
         hits_before = len(hit_lst)
         value = nested_lookup(doc, input_type[1])
         for lookup_id in id_strct.find_left(value):
             # Each hit gets its own copy so the input document is untouched
             new_doc = copy.deepcopy(doc)
             # ensure _id is always a str
             new_doc['_id'] = str(lookup_id)
             # capture debug information
             if debug:
                 new_doc['dt_debug']['start_field'] = input_type[1]
                 new_doc['dt_debug']['debug'] = id_strct.get_debug(value)
             hit_lst.append(new_doc)
         if len(hit_lst) == hits_before:
             # No id was found for this document
             miss_lst.append(doc)
     return hit_lst, miss_lst
Ejemplo n.º 4
0
    def _init_strct(self, field, doc_lst):
        """
        Initialize the structure from a list of documents.

        For each document, the value found at ``field`` is registered with
        ``self.add`` together with its preprocessed form,
        ``self.preprocess_id(value)``.  Documents with a falsy value at
        ``field`` are skipped.

        In this class, stitch identifiers are converted to pubchem
        identifiers for keylookup; that conversion is the preprocessing
        step applied here.

        :param field: field looked up in each document via ``nested_lookup``
        :param doc_lst: iterable of documents
        """
        for doc in doc_lst:
            raw_id = nested_lookup(doc, field)
            if not raw_id:
                continue
            self.add(raw_id, self.preprocess_id(raw_id))
Ejemplo n.º 5
0
 def _copy(self, input_type, doc_lst):
     """
     Copy ids in the case where input_type == output_type.

     Each document with a truthy value at ``input_type[1]`` has its
     ``_id`` set, in place, to that value as a string and is reported as
     a hit; every other document is reported as a miss, unmodified.

     :param input_type: indexable whose element 1 is the field to read
     :param doc_lst: iterable of documents
     :return: tuple ``(hit_lst, miss_lst)``
     """
     copied, skipped = [], []
     for doc in doc_lst:
         found = nested_lookup(doc, input_type[1])
         if not found:
             skipped.append(doc)
             continue
         # ensure _id is always a str
         doc['_id'] = str(found)
         copied.append(doc)
     return (copied, skipped)
 def _copy(self, input_type, doc_lst):
     """
     Copy ids in the case where input_type == output_type.

     Each document with a truthy value at ``input_type[1]`` has its
     ``_id`` set, in place, to that value as a string and is reported as
     a hit; every other document is reported as a miss, unmodified.  The
     number of hits is then reported to ``self.histogram.update_io`` with
     ``input_type`` as both endpoints.

     :param input_type: indexable whose element 1 is the field to read
     :param doc_lst: iterable of documents
     :return: tuple ``(hit_lst, miss_lst)``
     """
     copied, skipped = [], []
     for doc in doc_lst:
         found = nested_lookup(doc, input_type[1])
         if not found:
             skipped.append(doc)
             continue
         # ensure _id is always a str
         doc['_id'] = str(found)
         copied.append(doc)
     # Keep a record of IDs copied
     copied_count = len(copied)
     self.histogram.update_io(input_type, input_type, copied_count)
     return (copied, skipped)
Ejemplo n.º 7
0
 def _copy(self, input_type, doc_lst):
     """
     Copy ids in the case where input_type == output_type.

     Each document with a truthy value at ``input_type[1]`` has its
     ``_id`` set, in place, to that value as a string and is reported as
     a hit; every other document is reported as a miss, unmodified.  When
     ``self.debug`` is true, a ``copy_from`` entry holding the source
     field and value is written into the hit's ``dt_debug`` mapping.  The
     number of hits is then reported to ``self.histogram.update_io`` with
     ``input_type`` as both endpoints.

     :param input_type: indexable whose element 1 is the field to read
     :param doc_lst: iterable of documents
     :return: tuple ``(hit_lst, miss_lst)``
     """
     copied, skipped = [], []
     for doc in doc_lst:
         found = nested_lookup(doc, input_type[1])
         if not found:
             skipped.append(doc)
             continue
         # ensure _id is always a str
         doc['_id'] = str(found)
         copied.append(doc)
         # retain debug information if available (assumed dt_debug already in place)
         if self.debug:
             doc['dt_debug']['copy_from'] = (input_type[1], found)
     # Keep a record of IDs copied
     copied_count = len(copied)
     self.histogram.update_io(input_type, input_type, copied_count)
     return (copied, skipped)
Ejemplo n.º 8
0
 def _build_hit_miss_lsts(doc_lst, id_strct):
     """
     Split documents into those whose identifier was replaced and those
     left unchanged.

     The value found at ``input_type[1]`` in each document is passed to
     ``id_strct.find_left``.  Each id returned produces a deep copy of the
     document whose ``_id`` is that id as a string; a document with no
     ids returned goes to the miss list as is.  ``input_type`` is not a
     parameter: it is taken from the enclosing scope.

     :param doc_lst: iterable of documents
     :param id_strct: structure queried with ``find_left``
     :return: tuple ``(hit_lst, miss_lst)``
     """
     hit_lst, miss_lst = [], []
     for doc in doc_lst:
         hits_before = len(hit_lst)
         value = nested_lookup(doc, input_type[1])
         for lookup_id in id_strct.find_left(value):
             # Each hit gets its own copy so the input document is untouched
             new_doc = copy.deepcopy(doc)
             # ensure _id is always a str
             new_doc['_id'] = str(lookup_id)
             hit_lst.append(new_doc)
         if len(hit_lst) == hits_before:
             # No id was found for this document
             miss_lst.append(doc)
     return hit_lst, miss_lst
Ejemplo n.º 9
0
    def travel(self, input_type, target, doc_lst):
        """
        Traverse the graph from a start key type to a target key type
        using precomputed paths.

        Every path stored in ``self.paths`` under
        ``(input_type[0], target)`` is walked edge by edge with
        ``self._edge_lookup``.  Whatever a path yields is accumulated, and
        the next path is attempted only with the document values for which
        ``saved_hits.left`` returns nothing truthy.

        :param input_type: indexable; element 0 is combined with ``target``
            to select the paths, element 1 is the document field holding
            the identifier to convert
        :param target: key type to end at
        :param doc_lst: documents to convert; iterated more than once
        :return: tuple ``(hit_lst, miss_lst)`` -- deep copies of documents
            with ``_id`` replaced, and the documents for which nothing was
            found
        """
        def _partition_docs(docs, hits_strct):
            """
            Return the documents whose identifiers were replaced, followed
            by the documents that were not changed.
            """
            replaced, unchanged = [], []
            for doc in docs:
                replaced_before = len(replaced)
                value = nested_lookup(doc, input_type[1])
                for lookup_id in hits_strct.find_left(value):
                    new_doc = copy.deepcopy(doc)
                    # ensure _id is always a str
                    new_doc['_id'] = str(lookup_id)
                    replaced.append(new_doc)
                if len(replaced) == replaced_before:
                    unchanged.append(doc)
            return replaced, unchanged

        # Keep a running list of all saved hits
        saved_hits = IDStruct()

        # Build the path structure, which will save results
        path_strct = self.idstruct_class(input_type[1], doc_lst)

        pairwise = nx.utils.misc.pairwise
        for node_path in self.paths[(input_type[0], target)]:
            for v1, v2 in pairwise(node_path):
                edge = self.G.edges[v1, v2]['object']
                ids_in = len(path_strct)
                path_strct = self._edge_lookup(edge, path_strct)
                ids_out = len(path_strct)
                if ids_in:
                    # Only edges that actually received ids are counted
                    self.histogram.update_edge(v1, v2, ids_out)

            if len(path_strct) > 0:
                saved_hits += path_strct

            # reset the state to lookup misses
            path_strct = self.idstruct_class()
            for doc in doc_lst:
                val = nested_lookup(doc, input_type[1])
                if val and not saved_hits.left(val):
                    path_strct.add(val, val)

        # Return a list of documents that have had their identifiers replaced
        # also return a list of documents that were not changed
        hit_lst, miss_lst = _partition_docs(doc_lst, saved_hits)
        self.histogram.update_io(input_type, target, len(hit_lst))
        return hit_lst, miss_lst