def lookup(self, obj, level=None):
    """
    More sophisticated RRSDatabase.contains().

    This method doesn't call RRSDatabase.contains() explicitly; it checks
    other entities and tries to find relationships between them, driven by
    the lookup rules stored in self._lookup_rules.

    Level 0 rules match rows of obj's own table on combinations of obj's
    attributes. Level 1 rules first look up the entities referenced by obj
    (recursively, with level - 1) and then search obj's table for a row
    related to the entities that were found.

    @param obj    database entity to be found; on success obj['id'] is set
    @param level  remaining recursion depth, defaults to self.lookup_level;
                  a negative level stops the search
    @returns True if found (the object now carries the ID)
             False if not found
    @raises TypeError if obj is not a database entity
    @raises RRSDatabaseEntityError if obj belongs to a meta-table
    """
    # NOTE(review): another definition of lookup() appears later in this file;
    # if both belong to the same class, the later one replaces this one.
    if level is None:
        level = self.lookup_level
    if level < 0:
        # recursion budget exhausted - report "not found" explicitly
        return False
    if not isinstance(obj, _RRSDatabaseEntity):
        raise TypeError('lookup() method can be called only on database '
                        'entity objects.')
    if obj._table_name.endswith("_meta"):
        raise RRSDatabaseEntityError('lookup() method cannot be called on meta-tables.')

    q = FluentSQLQuery()

    # LEVEL 0 rules
    try:
        lvl_zero_rules = self._lookup_rules.get_rules(type(obj), 0)
    except KeyError:
        if self.logger is not None:
            self.logger.error("Level 0 rules for '%s' not found." % obj._table_name)
        return False
    for rule in lvl_zero_rules:
        attr_present = [item for item in rule.entities if item in obj]
        # if there are no such attributes or not the requested count of them,
        # continue to the next rule
        if rule.reqcount > len(attr_present):
            continue
        self._db.refresh()
        # try the largest attribute combinations first
        for cnt in reversed(range(rule.reqcount, len(attr_present) + 1)):
            for attr_comb in combinations(attr_present, cnt):
                # now select them
                q.cleanup()
                q.select("id").from_table(obj._table_name)
                for attr in attr_comb:
                    # where() raises once a condition exists; chain with AND then
                    try:
                        q.where("%s=" % attr, obj[attr])
                    except FluentSQLQueryError:
                        q.and_("%s=" % attr, obj[attr])
                q()
                res = q.fetch_all()
                if q.count() > 1 and self.logger is not None:
                    # there shouldn't be more results than one
                    self.logger.warning("There are more than one identical "
                                        "%ss. List of ID's: %s"
                                        % (obj._table_name, str([x[0] for x in res])))
                if not res:
                    continue
                obj['id'] = res[0][0]
                return True

    # LEVEL 1 rules
    try:
        lvl_one_rules = self._lookup_rules.get_rules(type(obj), 1)
    except KeyError:
        if self.logger is not None:
            self.logger.error("Level 1 rules for '%s' not found." % obj._table_name)
        return False

    # returns type of entity mapped in ent_id_map
    def getetype(ent_id_map, ent):
        for name, mapped_type in ent_id_map:
            if ent == name:
                return mapped_type

    # these are objects which really are present in the entity
    for rule in lvl_one_rules:
        ent_present = [item for item in rule.entities if item in obj]
        # if there are no such entities or not the requested count of them,
        # continue to the next rule
        if rule.reqcount > len(ent_present):
            continue

        # get all those identifiers; keys are (attribute name, value type)
        ent_id_map = {}
        for ent_name in ent_present:
            target = obj[ent_name]
            if type(target) is list and target:
                # list of relationship objects
                key = (ent_name, type(target[0]))
                found = []
                for rel_obj in target:
                    assert len(rel_obj.get_entities()) > 0
                    e = rel_obj.get_entities()[0]
                    if self.lookup(e, level - 1):
                        found.append(e)
                # keep the key only when at least one entity was found
                if found:
                    ent_id_map[key] = found
            elif isinstance(target, _RRSDatabaseEntity):
                # this is FK - @target is RRS*** object
                if self.lookup(target, level - 1):
                    ent_id_map[(ent_name, type(target))] = [target]
            else:
                ent_id_map[(ent_name, type(target))] = [target]

        # if we did not find as many as the rule requests, continue
        if rule.reqcount > len(ent_id_map):
            continue

        # try to catch some data from the minimum count of requested entities
        # to match, probably 2
        # if this select spits out too many results (>100), the current count
        # is abandoned and the process starts again with one more entity.
        ent_keys = [x[0] for x in ent_id_map.keys()]
        for cnt in range(rule.reqcount, len(ent_id_map) + 1):
            for entity_comb in combinations(ent_keys, cnt):
                self._db.refresh()  # re-create cursors to drop the loaded data
                # construct the query
                q.cleanup()
                tg_tbl = obj._table_name
                from_lst = [tg_tbl]
                q.select("%s.id" % tg_tbl)
                # recognition of the same table in the query
                tablecounter = 1
                for ent in entity_comb:
                    etype = getetype(ent_id_map, ent)
                    # now we have key to the object -> ent_id_map[(ent, etype)]

                    # @ent is instance of RRS****** - 1:N relationship
                    # the object contains id of this entity
                    if issubclass(etype, _RRSDatabaseEntity):
                        o = ent_id_map[(ent, etype)][0]
                        try:
                            q.where("%s.%s_id=" % (tg_tbl, ent), o['id'])
                        except FluentSQLQueryError:
                            q.and_("%s.%s_id=" % (tg_tbl, ent), o['id'])

                    # @ent is fake junction table - it means, that it's
                    # the second side of 1:N relationship - N:1.
                    elif issubclass(etype, _RRSDbEntityRelationship) and etype._fake_table:
                        # TODO
                        return False

                    # @ent is true junction table - this M:N relationship.
                    elif issubclass(etype, _RRSDbEntityRelationship) and not etype._fake_table:
                        j_tbl_uniq_as = None
                        # storage of all aliases of junction tables
                        j_tbl_uniq_as_list = []
                        # join together all the found entities - for example:
                        # given publication, two persons (authors), both found
                        # in db so create query which selects ID of publication
                        # which has both - the first AND the second person.
                        for o in ent_id_map[(ent, etype)]:
                            j_tbl_uniq_as = "%s%s" % (etype._table_name, tablecounter)
                            j_tbl_uniq_as_list.append(j_tbl_uniq_as)
                            e_tbl_uniq_as = "%s%s" % (o._table_name, tablecounter)
                            # add table to the list of tables we are joining together
                            from_lst.append("%s AS %s" % (etype._table_name, j_tbl_uniq_as))
                            from_lst.append("%s AS %s" % (o._table_name, e_tbl_uniq_as))
                            try:
                                q.where("%s.id=" % e_tbl_uniq_as, o['id'])
                            except FluentSQLQueryError:
                                q.and_("%s.id=" % e_tbl_uniq_as, o['id'])
                            q.and_("%s.%s_id=" % (j_tbl_uniq_as, o._table_name),
                                   "%s.id" % e_tbl_uniq_as, True)
                            tablecounter += 1
                        # add the condition that all the junction table ID's of
                        # the entity we are looking for has to be the same - we
                        # are looking not for union, but intersection of them
                        for j1, j2 in zip(j_tbl_uniq_as_list, j_tbl_uniq_as_list[1:]):
                            q.and_("%s.%s_id=" % (j1, tg_tbl),
                                   "%s.%s_id" % (j2, tg_tbl), True)
                        # bind junction table.entity_id to id of entity we are looking for
                        q.and_("%s.%s_id=" % (j_tbl_uniq_as, tg_tbl), "%s.id" % tg_tbl, True)

                    # @ent is attribute (int, basestring etc.)
                    else:
                        attr = ent_id_map[(ent, etype)][0]
                        try:
                            q.where("%s.%s=" % (tg_tbl, ent), attr)
                        except FluentSQLQueryError:
                            q.and_("%s.%s=" % (tg_tbl, ent), attr)

                q.from_table(from_lst)
                q()
                search_sql_query = q._sql
                # now if the total count of probably identical rows is higher
                # than 100, we need to specify it more, so we jump to next
                # request count level (probably 1->2 or 2->3).
                if q.count() > 100:
                    break
                res = q.fetch_all()
                if not res:
                    continue
                if len(res) == 1:
                    obj['id'] = res[0][0]
                    if self.logger is not None:
                        self.logger.info("Found exactly one result for lookup: %s, params: %s, found ID: %s, SQL: %s" %
                                         (obj._table_name, str(entity_comb), obj['id'], search_sql_query))
                    return True

                # more candidates: compare the attributes of all returned
                # results and choose the most similar one
                q.cleanup()
                id_list = [x[0] for x in res]
                attrunion = set(["id"])
                sim_rules = self._lookup_rules.get_rules(type(obj), 0)
                # make a list of attributes needed to accomplish the rules
                # (these are all which are present in rules)
                for sim_rule in sim_rules:
                    attrunion = attrunion.union(set(sim_rule.entities))
                # construct query which loads all needed attributes of all returned ID's
                q.select(list(attrunion)).from_table(obj._table_name)
                for _id in id_list:
                    try:
                        q.where("id=", _id)
                    except FluentSQLQueryError:
                        q.or_("id=", _id)
                q()  # perform the query
                loaded_data = q.fetch_all()
                similarity = {}
                # every rule tells us what attributes have to be similar
                # (or identical)
                # NOTE(review): the first applicable level 0 rule with scored
                # rows decides - confirm this is the intended rule precedence.
                for sim_rule in sim_rules:
                    attrs = [item for item in sim_rule.entities if item in obj]
                    if sim_rule.reqcount > len(attrs):
                        continue
                    # count every row's similarity (the result is sum of
                    # similarities of their attributes); rows with an equal
                    # score overwrite each other, the last one wins
                    sim_lst = {}
                    for d in loaded_data:
                        row_similarity = 0.0
                        for attr in attrs:
                            if attr not in d or d[attr] is None or attr not in obj:
                                continue
                            if (d['id'], attr) not in similarity:
                                s = SequenceMatcher(None, d[attr], obj[attr])
                                similarity[(d['id'], attr)] = s.ratio()
                            row_similarity += similarity[(d['id'], attr)]
                        sim_lst[row_similarity] = d['id']
                    if not sim_lst:
                        # nothing was scored, max() would fail - try next rule
                        continue
                    # get the most similar row to the object
                    obj['id'] = sim_lst[max(sim_lst)]
                    if self.logger is not None:
                        self.logger.info("Found more than one result for lookup: %s, params: %s, "
                                         "Choosen ID: %s, SQL: %s" % (obj._table_name,
                                         str(entity_comb), obj['id'], search_sql_query))
                    return True

    # no rule produced a match
    return False
def lookup(self, obj, level=None):
    """
    More sophisticated RRSDatabase.contains().

    This method doesn't call RRSDatabase.contains() explicitly; it checks
    other entities and tries to find relationships between them, driven by
    the lookup rules stored in self._lookup_rules.

    Level 0 rules match rows of obj's own table on combinations of obj's
    attributes. Level 1 rules first look up the entities referenced by obj
    (recursively, with level - 1) and then search obj's table for a row
    related to the entities that were found.

    @param obj    database entity to be found; on success obj["id"] is set
    @param level  remaining recursion depth, defaults to self.lookup_level;
                  a negative level stops the search
    @returns True if found (the object now carries the ID)
             False if not found
    @raises TypeError if obj is not a database entity
    @raises RRSDatabaseEntityError if obj belongs to a meta-table
    """
    if level is None:
        level = self.lookup_level
    if level < 0:
        # recursion budget exhausted - report "not found" explicitly
        return False
    if not isinstance(obj, _RRSDatabaseEntity):
        raise TypeError("lookup() method can be called only on database " "entity objects.")
    if obj._table_name.endswith("_meta"):
        raise RRSDatabaseEntityError("lookup() method cannot be called on meta-tables.")

    q = FluentSQLQuery()

    # LEVEL 0 rules
    try:
        lvl_zero_rules = self._lookup_rules.get_rules(type(obj), 0)
    except KeyError:
        if self.logger is not None:
            self.logger.error("Level 0 rules for '%s' not found." % obj._table_name)
        return False
    for rule in lvl_zero_rules:
        attr_present = [item for item in rule.entities if item in obj]
        # if there are no such attributes or not the requested count of them,
        # continue to the next rule
        if rule.reqcount > len(attr_present):
            continue
        self._db.refresh()
        # try the largest attribute combinations first
        for cnt in reversed(range(rule.reqcount, len(attr_present) + 1)):
            for attr_comb in combinations(attr_present, cnt):
                # now select them
                q.cleanup()
                q.select("id").from_table(obj._table_name)
                for attr in attr_comb:
                    # where() raises once a condition exists; chain with AND then
                    try:
                        q.where("%s=" % attr, obj[attr])
                    except FluentSQLQueryError:
                        q.and_("%s=" % attr, obj[attr])
                q()
                res = q.fetch_all()
                if q.count() > 1 and self.logger is not None:
                    # there shouldn't be more results than one
                    self.logger.warning(
                        "There are more than one identical "
                        "%ss. List of ID's: %s" % (obj._table_name, str([x[0] for x in res]))
                    )
                if not res:
                    continue
                obj["id"] = res[0][0]
                return True

    # LEVEL 1 rules
    try:
        lvl_one_rules = self._lookup_rules.get_rules(type(obj), 1)
    except KeyError:
        if self.logger is not None:
            self.logger.error("Level 1 rules for '%s' not found." % obj._table_name)
        return False

    # returns type of entity mapped in ent_id_map
    def getetype(ent_id_map, ent):
        for name, mapped_type in ent_id_map:
            if ent == name:
                return mapped_type

    # these are objects which really are present in the entity
    for rule in lvl_one_rules:
        ent_present = [item for item in rule.entities if item in obj]
        # if there are no such entities or not the requested count of them,
        # continue to the next rule
        if rule.reqcount > len(ent_present):
            continue

        # get all those identifiers; keys are (attribute name, value type)
        ent_id_map = {}
        for ent_name in ent_present:
            target = obj[ent_name]
            if type(target) is list and target:
                # list of relationship objects
                key = (ent_name, type(target[0]))
                found = []
                for rel_obj in target:
                    assert len(rel_obj.get_entities()) > 0
                    e = rel_obj.get_entities()[0]
                    if self.lookup(e, level - 1):
                        found.append(e)
                # keep the key only when at least one entity was found
                if found:
                    ent_id_map[key] = found
            elif isinstance(target, _RRSDatabaseEntity):
                # this is FK - @target is RRS*** object
                if self.lookup(target, level - 1):
                    ent_id_map[(ent_name, type(target))] = [target]
            else:
                ent_id_map[(ent_name, type(target))] = [target]

        # if we did not find as many as the rule requests, continue
        if rule.reqcount > len(ent_id_map):
            continue

        # try to catch some data from the minimum count of requested entities
        # to match, probably 2
        # if this select spits out too many results (>100), the current count
        # is abandoned and the process starts again with one more entity.
        ent_keys = [x[0] for x in ent_id_map.keys()]
        for cnt in range(rule.reqcount, len(ent_id_map) + 1):
            for entity_comb in combinations(ent_keys, cnt):
                self._db.refresh()  # re-create cursors to drop the loaded data
                # construct the query
                q.cleanup()
                tg_tbl = obj._table_name
                from_lst = [tg_tbl]
                q.select("%s.id" % tg_tbl)
                # recognition of the same table in the query
                tablecounter = 1
                for ent in entity_comb:
                    etype = getetype(ent_id_map, ent)
                    # now we have key to the object -> ent_id_map[(ent, etype)]

                    # @ent is instance of RRS****** - 1:N relationship
                    # the object contains id of this entity
                    if issubclass(etype, _RRSDatabaseEntity):
                        o = ent_id_map[(ent, etype)][0]
                        try:
                            q.where("%s.%s_id=" % (tg_tbl, ent), o["id"])
                        except FluentSQLQueryError:
                            q.and_("%s.%s_id=" % (tg_tbl, ent), o["id"])

                    # @ent is fake junction table - it means, that it's
                    # the second side of 1:N relationship - N:1.
                    elif issubclass(etype, _RRSDbEntityRelationship) and etype._fake_table:
                        # TODO
                        return False

                    # @ent is true junction table - this M:N relationship.
                    elif issubclass(etype, _RRSDbEntityRelationship) and not etype._fake_table:
                        j_tbl_uniq_as = None
                        # storage of all aliases of junction tables
                        j_tbl_uniq_as_list = []
                        # join together all the found entities - for example:
                        # given publication, two persons (authors), both found
                        # in db so create query which selects ID of publication
                        # which has both - the first AND the second person.
                        for o in ent_id_map[(ent, etype)]:
                            j_tbl_uniq_as = "%s%s" % (etype._table_name, tablecounter)
                            j_tbl_uniq_as_list.append(j_tbl_uniq_as)
                            e_tbl_uniq_as = "%s%s" % (o._table_name, tablecounter)
                            # add table to the list of tables we are joining together
                            from_lst.append("%s AS %s" % (etype._table_name, j_tbl_uniq_as))
                            from_lst.append("%s AS %s" % (o._table_name, e_tbl_uniq_as))
                            try:
                                q.where("%s.id=" % e_tbl_uniq_as, o["id"])
                            except FluentSQLQueryError:
                                q.and_("%s.id=" % e_tbl_uniq_as, o["id"])
                            q.and_(
                                "%s.%s_id=" % (j_tbl_uniq_as, o._table_name),
                                "%s.id" % e_tbl_uniq_as,
                                True,
                            )
                            tablecounter += 1
                        # add the condition that all the junction table ID's of
                        # the entity we are looking for has to be the same - we
                        # are looking not for union, but intersection of them
                        for j1, j2 in zip(j_tbl_uniq_as_list, j_tbl_uniq_as_list[1:]):
                            q.and_("%s.%s_id=" % (j1, tg_tbl), "%s.%s_id" % (j2, tg_tbl), True)
                        # bind junction table.entity_id to id of entity we are looking for
                        q.and_("%s.%s_id=" % (j_tbl_uniq_as, tg_tbl), "%s.id" % tg_tbl, True)

                    # @ent is attribute (int, basestring etc.)
                    else:
                        attr = ent_id_map[(ent, etype)][0]
                        try:
                            q.where("%s.%s=" % (tg_tbl, ent), attr)
                        except FluentSQLQueryError:
                            q.and_("%s.%s=" % (tg_tbl, ent), attr)

                q.from_table(from_lst)
                q()
                search_sql_query = q._sql
                # now if the total count of probably identical rows is higher
                # than 100, we need to specify it more, so we jump to next
                # request count level (probably 1->2 or 2->3).
                if q.count() > 100:
                    break
                res = q.fetch_all()
                if not res:
                    continue
                if len(res) == 1:
                    obj["id"] = res[0][0]
                    if self.logger is not None:
                        self.logger.info(
                            "Found exactly one result for lookup: %s, params: %s, found ID: %s, SQL: %s"
                            % (obj._table_name, str(entity_comb), obj["id"], search_sql_query)
                        )
                    return True

                # more candidates: compare the attributes of all returned
                # results and choose the most similar one
                q.cleanup()
                id_list = [x[0] for x in res]
                attrunion = set(["id"])
                sim_rules = self._lookup_rules.get_rules(type(obj), 0)
                # make a list of attributes needed to accomplish the rules
                # (these are all which are present in rules)
                for sim_rule in sim_rules:
                    attrunion = attrunion.union(set(sim_rule.entities))
                # construct query which loads all needed attributes of all returned ID's
                q.select(list(attrunion)).from_table(obj._table_name)
                for _id in id_list:
                    try:
                        q.where("id=", _id)
                    except FluentSQLQueryError:
                        q.or_("id=", _id)
                q()  # perform the query
                loaded_data = q.fetch_all()
                similarity = {}
                # every rule tells us what attributes have to be similar
                # (or identical)
                # NOTE(review): the first applicable level 0 rule with scored
                # rows decides - confirm this is the intended rule precedence.
                for sim_rule in sim_rules:
                    attrs = [item for item in sim_rule.entities if item in obj]
                    if sim_rule.reqcount > len(attrs):
                        continue
                    # count every row's similarity (the result is sum of
                    # similarities of their attributes); rows with an equal
                    # score overwrite each other, the last one wins
                    sim_lst = {}
                    for d in loaded_data:
                        row_similarity = 0.0
                        for attr in attrs:
                            if attr not in d or d[attr] is None or attr not in obj:
                                continue
                            if (d["id"], attr) not in similarity:
                                s = SequenceMatcher(None, d[attr], obj[attr])
                                similarity[(d["id"], attr)] = s.ratio()
                            row_similarity += similarity[(d["id"], attr)]
                        sim_lst[row_similarity] = d["id"]
                    if not sim_lst:
                        # nothing was scored, max() would fail - try next rule
                        continue
                    # get the most similar row to the object
                    obj["id"] = sim_lst[max(sim_lst)]
                    if self.logger is not None:
                        self.logger.info(
                            "Found more than one result for lookup: %s, params: %s, "
                            "Choosen ID: %s, SQL: %s"
                            % (obj._table_name, str(entity_comb), obj["id"], search_sql_query)
                        )
                    return True

    # no rule produced a match
    return False
def _bind_entity_to_name(self, namedentity, source_module):
    """
    Create the connection between an entity and its name, which is stored
    in another database table. These tables are:
     - person vs person_name
     - organization vs organization_name
     - event vs event_name

    @param namedentity    entity (person, event or organization) carrying an ID
    @param source_module  module passed to self._rrsdb.insert() when a new
                          event/organization name row is inserted
    @returns for event/organization: ID of the row in the name-table, or
             False when the entity has no title or the name object is empty.
             The person branch has no explicit return value (None).
    @raises TypeError if namedentity is not a database entity
    @raises DatabaseError if namedentity carries no ID
    @raises RRSDatabaseEntityError if the entity's table is not a named one
    """
    # NOTE(review): another definition of _bind_entity_to_name() appears later
    # in this file; if both belong to the same class, the later one wins.
    ACRONYM = 'acronym'  # to be easily changed to abbreviation or whatever needed..
    TITLE = 'title'  # will be name? or what?
    if not isinstance(namedentity, _RRSDatabaseEntity):
        raise TypeError("Named object has to be instance of subclass of _RRSDatabaseEntity")
    if 'id' not in namedentity:
        raise DatabaseError("Named object has to contain ID!")
    q = FluentSQLQuery()

    if namedentity._table_name == "person":
        # act like person and handle person_name
        # this is slightly different because there is N:N relationship

        # create new person name object
        pname = RRSPerson_name()
        for attr in ('first_name', 'middle_name', 'last_name', 'full_name'):
            if attr in namedentity:
                pname[attr] = namedentity[attr]
        # create relationship object
        rel_obj = RRSRelationshipPersonPerson_name()
        rel_obj.set_entity(pname)
        namedentity['person_name'] = rel_obj

        # look for this name in database
        if self.lookup(pname):
            # it is in db already, just check if the relationship exists
            q.select("person_id").from_table("j__person__person_name")
            q.where("person_id=", namedentity['id']).and_("person_name_id=", pname['id'])
            q()
            if not q.count():
                # if the relationship doesn't exist, create new one
                self._rrsdb.relationship("person_name", rel_obj)
            elif q.count() > 1 and self.logger is not None:
                self.logger.warning("There are more than one relationship "
                                    "entries in table 'j__person__person_name"
                                    " between person.id=%s and person_name.id=%s"
                                    % (namedentity['id'], pname['id']))
        else:
            # insert new person_name and create the relationship
            # NOTE(review): this uses self.module while the event/organization
            # branch uses the source_module argument - confirm which is meant.
            self._rrsdb.insert(pname, self.module)
            self._rrsdb.relationship("person_name", rel_obj)

        # get the reference out of which is this name extracted and assign
        # the person name to the reference (j__person_name__reference)
        try:
            refe = namedentity['publication'][0].get_entities()[0]['reference_reference'][0].get_entities()[0]
        except (KeyError, TypeError, IndexError):
            # no reference reachable from this person - nothing to bind
            pass
        else:
            pname_ref_rel = RRSRelationshipPerson_nameReference()
            pname_ref_rel.set_entity(refe)
            pname['reference'] = pname_ref_rel
            try:
                self._rrsdb.relationship('reference', pname_ref_rel)
            except DatabaseError:
                # hand the failed call over to the wait queue
                self._queue.wait(WQEntry(self._rrsdb.relationship, ('reference', pname_ref_rel)))

    elif namedentity._table_name in ("event", "organization"):
        if TITLE not in namedentity:
            # this violates constraint... raise exception?? Or return false?
            return False
        name_tbl = "%s_name" % namedentity._table_name
        # if there in the database is no title like this, insert it
        q.select(("id", "%s_id" % namedentity._table_name, ACRONYM, TITLE)).from_table(name_tbl)
        q.where("%s=" % TITLE, namedentity[TITLE])
        if ACRONYM in namedentity:
            q.or_("%s=" % ACRONYM, namedentity[ACRONYM])
        q()
        if q.count():
            # check the parent id if it matches
            for row in q.fetch_all():
                if namedentity['id'] != row[1]:
                    continue
                # if it matched on acronym, check the titles if they are the same
                if row[TITLE] != namedentity[TITLE]:
                    # if not, check the rest and maybe add new row into table
                    continue
                # add the missing acronym if needed; the membership test keeps
                # entities without an acronym from being indexed by it
                if (row[ACRONYM] is None and ACRONYM in namedentity
                        and namedentity[ACRONYM] is not None):
                    # update the row
                    q.cleanup()
                    q.update(name_tbl, {ACRONYM: namedentity[ACRONYM]})
                    q.where("id=", row['id'])
                    q()
                return row['id']
        # if nothing matched, insert new name
        name_obj = self._table_to_class_map[name_tbl]()
        for attr in (TITLE, ACRONYM):
            if attr in namedentity:
                name_obj[attr] = namedentity[attr]
        if name_obj.empty():
            return False
        name_obj[namedentity._table_name] = namedentity
        self._rrsdb.insert(name_obj, source_module)
        return name_obj['id']

    else:
        raise RRSDatabaseEntityError("%s is not a named entity." % type(namedentity))

    # only the person branch gets here; the other branches return or raise
    self._db.refresh()
def _bind_entity_to_name(self, namedentity, source_module):
    """
    Create the connection between an entity and its name, which is stored
    in another database table. These tables are:
     - person vs person_name
     - organization vs organization_name
     - event vs event_name

    @param namedentity    entity (person, event or organization) carrying an ID
    @param source_module  module passed to self._rrsdb.insert() when a new
                          event/organization name row is inserted
    @returns for event/organization: ID of the row in the name-table, or
             False when the entity has no title or the name object is empty.
             The person branch has no explicit return value (None).
    @raises TypeError if namedentity is not a database entity
    @raises DatabaseError if namedentity carries no ID
    @raises RRSDatabaseEntityError if the entity's table is not a named one
    """
    ACRONYM = "acronym"  # to be easily changed to abbreviation or whatever needed..
    TITLE = "title"  # will be name? or what?
    if not isinstance(namedentity, _RRSDatabaseEntity):
        raise TypeError("Named object has to be instance of subclass of _RRSDatabaseEntity")
    if "id" not in namedentity:
        raise DatabaseError("Named object has to contain ID!")
    q = FluentSQLQuery()

    if namedentity._table_name == "person":
        # act like person and handle person_name
        # this is slightly different because there is N:N relationship

        # create new person name object
        pname = RRSPerson_name()
        for attr in ("first_name", "middle_name", "last_name", "full_name"):
            if attr in namedentity:
                pname[attr] = namedentity[attr]
        # create relationship object
        rel_obj = RRSRelationshipPersonPerson_name()
        rel_obj.set_entity(pname)
        namedentity["person_name"] = rel_obj

        # look for this name in database
        if self.lookup(pname):
            # it is in db already, just check if the relationship exists
            q.select("person_id").from_table("j__person__person_name")
            q.where("person_id=", namedentity["id"]).and_("person_name_id=", pname["id"])
            q()
            if not q.count():
                # if the relationship doesn't exist, create new one
                self._rrsdb.relationship("person_name", rel_obj)
            elif q.count() > 1 and self.logger is not None:
                self.logger.warning(
                    "There are more than one relationship "
                    "entries in table 'j__person__person_name"
                    " between person.id=%s and person_name.id=%s"
                    % (namedentity["id"], pname["id"])
                )
        else:
            # insert new person_name and create the relationship
            # NOTE(review): this uses self.module while the event/organization
            # branch uses the source_module argument - confirm which is meant.
            self._rrsdb.insert(pname, self.module)
            self._rrsdb.relationship("person_name", rel_obj)

        # get the reference out of which is this name extracted and assign
        # the person name to the reference (j__person_name__reference)
        try:
            refe = namedentity["publication"][0].get_entities()[0]["reference_reference"][0].get_entities()[0]
        except (KeyError, TypeError, IndexError):
            # no reference reachable from this person - nothing to bind
            pass
        else:
            pname_ref_rel = RRSRelationshipPerson_nameReference()
            pname_ref_rel.set_entity(refe)
            pname["reference"] = pname_ref_rel
            try:
                self._rrsdb.relationship("reference", pname_ref_rel)
            except DatabaseError:
                # hand the failed call over to the wait queue
                self._queue.wait(WQEntry(self._rrsdb.relationship, ("reference", pname_ref_rel)))

    elif namedentity._table_name in ("event", "organization"):
        if TITLE not in namedentity:
            # this violates constraint... raise exception?? Or return false?
            return False
        name_tbl = "%s_name" % namedentity._table_name
        # if there in the database is no title like this, insert it
        q.select(("id", "%s_id" % namedentity._table_name, ACRONYM, TITLE)).from_table(name_tbl)
        q.where("%s=" % TITLE, namedentity[TITLE])
        if ACRONYM in namedentity:
            q.or_("%s=" % ACRONYM, namedentity[ACRONYM])
        q()
        if q.count():
            # check the parent id if it matches
            for row in q.fetch_all():
                if namedentity["id"] != row[1]:
                    continue
                # if it matched on acronym, check the titles if they are the same
                if row[TITLE] != namedentity[TITLE]:
                    # if not, check the rest and maybe add new row into table
                    continue
                # add the missing acronym if needed; the membership test keeps
                # entities without an acronym from being indexed by it
                if (
                    row[ACRONYM] is None
                    and ACRONYM in namedentity
                    and namedentity[ACRONYM] is not None
                ):
                    # update the row
                    q.cleanup()
                    q.update(name_tbl, {ACRONYM: namedentity[ACRONYM]})
                    q.where("id=", row["id"])
                    q()
                return row["id"]
        # if nothing matched, insert new name
        name_obj = self._table_to_class_map[name_tbl]()
        for attr in (TITLE, ACRONYM):
            if attr in namedentity:
                name_obj[attr] = namedentity[attr]
        if name_obj.empty():
            return False
        name_obj[namedentity._table_name] = namedentity
        self._rrsdb.insert(name_obj, source_module)
        return name_obj["id"]

    else:
        raise RRSDatabaseEntityError("%s is not a named entity." % type(namedentity))

    # only the person branch gets here; the other branches return or raise
    self._db.refresh()