def edge_lookup(self, keylookup_obj, id_strct, debug=False):
    """
    Translate the identifiers held in ``id_strct`` across this edge.

    The current ids are handed to ``self.collection_find`` in a single call,
    together with ``self.lookup`` and ``self.field``. For every document that
    comes back, the value stored under ``self.lookup`` is mapped back to the
    originating ids with ``id_strct.find_right`` and each of those ids is
    paired with the document's value under ``self.field``.

    :param keylookup_obj: not used by this implementation
    :param id_strct: IDStruct holding the identifiers to look up
    :param debug: when true, import the debug information of ``id_strct``
        and record ``self.label`` plus the found value for every match
    :return: a new IDStruct pairing the original ids with the values found
    :raises TypeError: if ``id_strct`` is not an IDStruct
    """
    if not isinstance(id_strct, IDStruct):
        raise TypeError("edge_lookup id_struct is of the wrong type")

    translated = IDStruct()
    if debug:
        # Carry the debug trail accumulated so far into the new structure.
        translated.import_debug(id_strct)

    current_ids = id_strct.id_lst
    if not current_ids:
        # Nothing to look up, so the data source is never queried.
        return translated

    for record in self.collection_find(current_ids, self.lookup, self.field):
        matched_key = nested_lookup(record, self.lookup)
        for source_id in id_strct.find_right(matched_key):
            translated.add(source_id, nested_lookup(record, self.field))
            if debug:
                translated.set_debug(
                    source_id, self.label, nested_lookup(record, self.field))
    return translated
def edge_lookup(self, keylookup_obj, id_strct):
    """
    Translate the identifiers held in ``id_strct`` across this edge.

    A single ``self.collection.find`` call fetches every document whose
    ``self.lookup`` value is among the current ids, projecting only the
    ``self.lookup`` and ``self.field`` keys. Each returned document is mapped
    back to its originating ids with ``id_strct.find_right`` and those ids
    are paired with the document's value under ``self.field``.

    :param keylookup_obj: not used by this implementation
    :param id_strct: IDStruct holding the identifiers to look up
    :return: a new IDStruct pairing the original ids with the values found
    :raises TypeError: if ``id_strct`` is not an IDStruct
    """
    if not isinstance(id_strct, IDStruct):
        raise TypeError("edge_lookup id_struct is of the wrong type")

    translated = IDStruct()
    current_ids = id_strct.id_lst
    if len(current_ids) == 0:
        # Nothing to look up, so the collection is never queried.
        return translated

    query = {self.lookup: {"$in": current_ids}}
    projection = {self.lookup: 1, self.field: 1}
    for record in self.collection.find(query, projection):
        matched_key = nested_lookup(record, self.lookup)
        for source_id in id_strct.find_right(matched_key):
            translated.add(source_id, nested_lookup(record, self.field))
    return translated
def _build_hit_miss_lsts(doc_lst, id_strct, debug):
    """
    Split ``doc_lst`` into converted documents and untouched documents.

    For every document the value under ``input_type[1]`` is read and passed
    to ``id_strct.find_left``. Each identifier yielded produces one deep copy
    of the document whose ``_id`` is that identifier as a string. Documents
    for which nothing is yielded are returned unchanged in the miss list.

    NOTE: ``input_type`` is not a parameter; it is read from the enclosing
    scope.

    :param doc_lst: iterable of documents to process
    :param id_strct: structure queried via ``find_left`` (and ``get_debug``
        when ``debug`` is true)
    :param debug: when true, fill ``dt_debug`` on every copy; the
        ``dt_debug`` mapping must already exist on the document
    :return: tuple ``(hit_lst, miss_lst)``
    """
    hits = []
    misses = []
    for document in doc_lst:
        source_value = nested_lookup(document, input_type[1])
        hits_before = len(hits)
        for target_id in id_strct.find_left(source_value):
            clone = copy.deepcopy(document)
            # _id is always stored as a str
            clone['_id'] = str(target_id)
            if debug:
                clone['dt_debug']['start_field'] = input_type[1]
                clone['dt_debug']['debug'] = id_strct.get_debug(source_value)
            hits.append(clone)
        if len(hits) == hits_before:
            # No identifier was produced for this document.
            misses.append(document)
    return hits, misses
def _init_strct(self, field, doc_lst):
    """
    Populate this structure from the documents in ``doc_lst``.

    The value found under ``field`` in each document is registered through
    ``self.add`` together with its ``self.preprocess_id`` conversion.
    Documents whose value is missing or falsy are skipped.

    Per the original note, this class uses the preprocessing step to convert
    stitch identifiers to pubchem identifiers for keylookup.

    :param field: key path handed to ``nested_lookup``
    :param doc_lst: iterable of documents to read identifiers from
    """
    for document in doc_lst:
        raw_id = nested_lookup(document, field)
        if not raw_id:
            continue
        self.add(raw_id, self.preprocess_id(raw_id))
def _copy(self, input_type, doc_lst):
    """
    Copy ids in the case where input_type == output_type.

    Each document's value under ``input_type[1]`` becomes its ``_id`` (as a
    string). Documents are modified in place; those without a truthy value
    are reported as misses.

    :param input_type: sequence whose second item is the field to read
    :param doc_lst: iterable of documents
    :return: tuple ``(hit_lst, miss_lst)``
    """
    copied, skipped = [], []
    for document in doc_lst:
        source_value = nested_lookup(document, input_type[1])
        if not source_value:
            skipped.append(document)
            continue
        # _id is always stored as a str
        document['_id'] = str(source_value)
        copied.append(document)
    return (copied, skipped)
def _copy(self, input_type, doc_lst):
    """
    Copy ids in the case where input_type == output_type.

    Each document's value under ``input_type[1]`` becomes its ``_id`` (as a
    string). Documents are modified in place; those without a truthy value
    are reported as misses. The number of hits is reported to
    ``self.histogram`` with ``input_type`` as both source and target.

    :param input_type: sequence whose second item is the field to read
    :param doc_lst: iterable of documents
    :return: tuple ``(hit_lst, miss_lst)``
    """
    copied = []
    skipped = []
    for document in doc_lst:
        source_value = nested_lookup(document, input_type[1])
        if not source_value:
            skipped.append(document)
        else:
            # _id is always stored as a str
            document['_id'] = str(source_value)
            copied.append(document)

    # Record how many ids were copied straight through.
    copied_count = len(copied)
    self.histogram.update_io(input_type, input_type, copied_count)
    return (copied, skipped)
def _copy(self, input_type, doc_lst):
    """
    Copy ids in the case where input_type == output_type.

    Each document's value under ``input_type[1]`` becomes its ``_id`` (as a
    string). Documents are modified in place; those without a truthy value
    are reported as misses. When ``self.debug`` is set, the source field and
    value are stored under ``dt_debug['copy_from']``; the ``dt_debug``
    mapping must already exist on the document. The number of hits is
    reported to ``self.histogram`` with ``input_type`` as both source and
    target.

    :param input_type: sequence whose second item is the field to read
    :param doc_lst: iterable of documents
    :return: tuple ``(hit_lst, miss_lst)``
    """
    copied, skipped = [], []
    for document in doc_lst:
        source_value = nested_lookup(document, input_type[1])
        if not source_value:
            skipped.append(document)
            continue

        # _id is always stored as a str
        document['_id'] = str(source_value)
        copied.append(document)
        if self.debug:
            # dt_debug is assumed to be in place already
            document['dt_debug']['copy_from'] = (input_type[1], source_value)

    # Record how many ids were copied straight through.
    self.histogram.update_io(input_type, input_type, len(copied))
    return (copied, skipped)
def _build_hit_miss_lsts(doc_lst, id_strct):
    """
    Split ``doc_lst`` into converted documents and untouched documents.

    For every document the value under ``input_type[1]`` is read and passed
    to ``id_strct.find_left``. Each identifier yielded produces one deep copy
    of the document whose ``_id`` is that identifier as a string. Documents
    for which nothing is yielded are returned unchanged in the miss list.

    NOTE: ``input_type`` is not a parameter; it is read from the enclosing
    scope.

    :param doc_lst: iterable of documents to process
    :param id_strct: structure queried via ``find_left``
    :return: tuple ``(hit_lst, miss_lst)``
    """
    converted = []
    unchanged = []
    for document in doc_lst:
        source_value = nested_lookup(document, input_type[1])
        found_any = False
        for target_id in id_strct.find_left(source_value):
            clone = copy.deepcopy(document)
            # _id is always stored as a str
            clone['_id'] = str(target_id)
            converted.append(clone)
            found_any = True
        if found_any:
            continue
        unchanged.append(document)
    return converted, unchanged
def travel(self, input_type, target, doc_lst):
    """
    Traverse the graph from a start key type to a target key type using
    the precomputed paths in ``self.paths``.

    Every path registered for ``(input_type[0], target)`` is walked edge by
    edge via ``self._edge_lookup``. Whatever survives a full path is added to
    the running set of hits, and the next path is then attempted only with
    the values that have no hit yet.

    :param input_type: sequence ``(key type, field)``; item 0 selects the
        paths, item 1 is the document field holding the start value
    :param target: key type to end at
    :param doc_lst: documents whose identifiers should be converted
    :return: tuple ``(hit_lst, miss_lst)`` of converted document copies and
        documents left unchanged
    """

    def _split_hits_and_misses(documents, id_strct):
        """
        Return deep copies of the documents with ``_id`` replaced by each
        identifier found in ``id_strct``, plus the documents without any.
        """
        converted = []
        unchanged = []
        for document in documents:
            source_value = nested_lookup(document, input_type[1])
            found_any = False
            for target_id in id_strct.find_left(source_value):
                clone = copy.deepcopy(document)
                # _id is always stored as a str
                clone['_id'] = str(target_id)
                converted.append(clone)
                found_any = True
            if not found_any:
                unchanged.append(document)
        return converted, unchanged

    # Hits accumulated over all paths tried so far.
    all_hits = IDStruct()

    # Working structure that is pushed through the edges of each path.
    working = self.idstruct_class(input_type[1], doc_lst)

    candidate_paths = self.paths[(input_type[0], target)]
    for edge_pairs in map(nx.utils.misc.pairwise, candidate_paths):
        for src, dst in edge_pairs:
            edge_obj = self.G.edges[src, dst]['object']
            ids_in = len(working)
            working = self._edge_lookup(edge_obj, working)
            ids_out = len(working)
            if ids_in:
                self.histogram.update_edge(src, dst, ids_out)

        if len(working):
            all_hits += working

        # Start the next path with only the values still lacking a hit.
        working = self.idstruct_class()
        for document in doc_lst:
            start_value = nested_lookup(document, input_type[1])
            if start_value and not all_hits.left(start_value):
                working.add(start_value, start_value)

    # Documents with replaced identifiers, and documents left unchanged.
    hit_lst, miss_lst = _split_hits_and_misses(doc_lst, all_hits)
    self.histogram.update_io(input_type, target, len(hit_lst))
    return hit_lst, miss_lst