Ejemplo n.º 1
0
    def lookup(self, obj, level=None):
        """
        Try to find the database ID of @obj and store it in obj['id'].

        More sophisticated RRSDatabase.contains(). This method doesn't call
        RRSDatabase.contains() explicitly; it is driven by the lookup rules
        returned by self._lookup_rules for the type of @obj and works in two
        stages:

        LEVEL 0 - attributes of @obj itself are compared for equality with
                  the rows of its own table.
        LEVEL 1 - entities related to @obj are looked up first (recursively,
                  with level-1) and the row of @obj is then searched through
                  those relationships. When more than one row matches, the
                  row whose level 0 attributes are the most similar to @obj
                  is chosen.

        @param obj    database entity object to be looked up
        @param level  remaining depth of recursion; None means
                      self.lookup_level. A negative level ends the lookup
                      immediately (before @obj is validated).
        @returns True if found (the object now carries the ID)
                 False if not found
        @raises TypeError if @obj is not a database entity object
        @raises RRSDatabaseEntityError if @obj belongs to a meta-table
        """
        if level is None:
            level = self.lookup_level
        if level < 0:
            # recursion budget exhausted - report "not found" as documented
            return False
        if not isinstance(obj, _RRSDatabaseEntity):
            raise TypeError('lookup() method can be called only on database '
                            'entity objects.')
        if obj._table_name.endswith("_meta"):
            raise RRSDatabaseEntityError('lookup() method cannot be called on meta-tables.')

        q = FluentSQLQuery()

        def add_condition(cond, value):
            # The first condition of a query goes through where(); presumably
            # where() raises FluentSQLQueryError once the query already has
            # a condition, so every following one is chained with AND.
            try:
                q.where(cond, value)
            except FluentSQLQueryError:
                q.and_(cond, value)

        # LEVEL 0 rules
        try:
            lvl_zero_rules = self._lookup_rules.get_rules(type(obj), 0)
        except KeyError:
            if self.logger is not None:
                self.logger.error("Level 0 rules for '%s' not found." % obj._table_name)
            return False
        for rule in lvl_zero_rules:
            attr_present = [item for item in rule.entities if item in obj]
            # if there are no such attributes or not the requested count of
            # them, continue to the next rule
            if rule.reqcount > len(attr_present):
                continue
            self._db.refresh()
            # the biggest combinations of attributes are tried first
            for cnt in reversed(range(rule.reqcount, len(attr_present)+1)):
                for attr_comb in combinations(attr_present, cnt):
                    # now select them
                    q.cleanup()
                    q.select("id").from_table(obj._table_name)
                    for attr in attr_comb:
                        add_condition("%s=" % attr, obj[attr])
                    q()
                    res = q.fetch_all()
                    # there shouldn't be more results than one
                    if q.count() > 1 and self.logger is not None:
                        self.logger.warning(
                            "There are more than one identical "
                            "%ss. List of ID's: %s" % (obj._table_name, str([x[0] for x in res])))
                    if not res:
                        continue
                    obj['id'] = res[0][0]
                    return True

        # LEVEL 1 rules
        try:
            lvl_one_rules = self._lookup_rules.get_rules(type(obj), 1)
        except KeyError:
            if self.logger is not None:
                self.logger.error("Level 1 rules for '%s' not found." % obj._table_name)
            return False

        tg_tbl = obj._table_name
        for rule in lvl_one_rules:
            # these are objects which really are present in the entity
            ent_present = [item for item in rule.entities if item in obj]
            # if there are no such entities or not the requested count of
            # them, continue to the next rule
            if rule.reqcount > len(ent_present):
                continue

            # Map (entity name, type of its value) -> list of usable values.
            # Related entities get into the map only when their own lookup
            # succeeded, i.e. when they already carry an ID.
            ent_id_map = {}
            for ent_name in ent_present:
                target = obj[ent_name]
                if type(target) is list and target:
                    # list of relationship objects
                    key = (ent_name, type(target[0]))
                    ent_id_map[key] = []
                    for rel_obj in target:
                        entities = rel_obj.get_entities()
                        assert len(entities) > 0
                        e = entities[0]
                        if self.lookup(e, level-1):
                            ent_id_map[key].append(e)
                    if not ent_id_map[key]:
                        del ent_id_map[key]
                elif isinstance(target, _RRSDatabaseEntity):
                    # this is FK - @target is RRS*** object
                    if self.lookup(target, level-1):
                        ent_id_map[(ent_name, type(target))] = [target]
                else:
                    # plain attribute value
                    # NOTE(review): an empty list also ends up here and is
                    # then used as an attribute value - confirm it is intended
                    ent_id_map[(ent_name, type(target))] = [target]

            # if we did not find as much as the rule requests, continue
            if rule.reqcount > len(ent_id_map):
                continue

            ent_keys = [x[0] for x in ent_id_map.keys()]
            # entity name -> type of the value stored in ent_id_map
            ent_types = dict((name, etype) for name, etype in ent_id_map)

            # Start with the minimum count of requested entities. If a select
            # spits out too many results (>100), the rest of this count is
            # skipped and the process goes on with bigger combinations.
            for cnt in range(rule.reqcount, len(ent_id_map)+1):
                for entity_comb in combinations(ent_keys, cnt):
                    self._db.refresh() # re-create cursors to drop the loaded data
                    # construct the query
                    q.cleanup()
                    from_lst = [tg_tbl]
                    q.select("%s.id" % tg_tbl)
                    # recognition of the same table in the query
                    tablecounter = 1
                    for ent in entity_comb:
                        etype = ent_types[ent]
                        found = ent_id_map[(ent, etype)]

                        # @ent is instance of RRS****** - 1:N relationship
                        # the object contains id of this entity
                        if issubclass(etype, _RRSDatabaseEntity):
                            add_condition("%s.%s_id=" % (tg_tbl, ent), found[0]['id'])

                        # @ent is fake junction table - it means, that it's
                        # the second side of 1:N relationship - N:1.
                        elif issubclass(etype, _RRSDbEntityRelationship) and etype._fake_table:
                            # TODO
                            return False

                        # @ent is true junction table - this M:N relationship.
                        elif issubclass(etype, _RRSDbEntityRelationship):
                            j_tbl_uniq_as = None
                            # storage of all aliases of junction tables
                            j_tbl_uniq_as_list = []

                            # join together all the found entities - for example:
                            # given publication, two persons (authors), both found
                            # in db so create query which selects ID of publication
                            # which has both - the first AND the second person.
                            for o in found:
                                j_tbl_uniq_as = "%s%s" % (etype._table_name, tablecounter)
                                j_tbl_uniq_as_list.append(j_tbl_uniq_as)
                                e_tbl_uniq_as = "%s%s" % (o._table_name, tablecounter)
                                # add table to the list of tables we are joining together
                                from_lst.append("%s AS %s" % (etype._table_name, j_tbl_uniq_as))
                                from_lst.append("%s AS %s" % (o._table_name, e_tbl_uniq_as))
                                add_condition("%s.id=" % e_tbl_uniq_as, o['id'])
                                # the third argument presumably marks the value
                                # as a column reference, not a literal
                                q.and_("%s.%s_id=" % (j_tbl_uniq_as, o._table_name), "%s.id" % e_tbl_uniq_as, True)
                                tablecounter += 1

                            # add the condition that all the junction table ID's of
                            # the entity we are looking for have to be the same - we
                            # are looking not for union, but intersection of them
                            for j1, j2 in zip(j_tbl_uniq_as_list, j_tbl_uniq_as_list[1:]):
                                q.and_("%s.%s_id=" % (j1, tg_tbl), "%s.%s_id" % (j2, tg_tbl), True)
                            # bind junction table.entity_id to id of entity we are looking for
                            q.and_("%s.%s_id=" % (j_tbl_uniq_as, tg_tbl), "%s.id" % tg_tbl, True)

                        # @ent is attribute (int, basestring etc.)
                        else:
                            add_condition("%s.%s=" % (tg_tbl, ent), found[0])

                    q.from_table(from_lst)
                    q()
                    search_sql_query = q._sql
                    # now if the total count of probably identical rows is higher
                    # than 100, we need to specify it more, so we jump to next
                    # request count level (probably 1->2 or 2->3).
                    if q.count() > 100:
                        break
                    res = q.fetch_all()
                    if not res:
                        continue
                    if len(res) == 1:
                        obj['id'] = res[0][0]
                        if self.logger is not None:
                            self.logger.info("Found exactly one result for lookup: %s, params: %s, found ID: %s, SQL: %s" % \
                                            (obj._table_name, str(entity_comb), obj['id'], search_sql_query))
                        return True

                    # More candidates were returned: compare the level 0
                    # attributes of all of them with @obj and choose the
                    # most similar one.
                    q.cleanup()
                    id_list = [x[0] for x in res]
                    attrunion = set(["id"])
                    lvl_zero_rules = self._lookup_rules.get_rules(type(obj), 0)
                    # make a list of attributes needed to accomplish the rules
                    # (these are all which are present in rules)
                    for zero_rule in lvl_zero_rules:
                        attrunion.update(zero_rule.entities)
                    # construct query which loads all needed attributes of all returned ID's
                    q.select(list(attrunion)).from_table(obj._table_name)
                    for _id in id_list:
                        try:
                            q.where("id=", _id)
                        except FluentSQLQueryError:
                            q.or_("id=", _id)
                    q() # perform the query
                    loaded_data = q.fetch_all()

                    # cache of computed ratios, key is (row id, attribute)
                    similarity = {}
                    # every rule tells us what attributes have to be similar
                    # (or identical)
                    for zero_rule in lvl_zero_rules:
                        attrs = [item for item in zero_rule.entities if item in obj]
                        if zero_rule.reqcount > len(attrs):
                            continue

                        # count every row's similarity (the result is sum of
                        # similarities of their attributes); rows with the
                        # same sum overwrite each other, the last one stays
                        sim_lst = {}
                        for d in loaded_data:
                            row_similarity = 0.0
                            for attr in attrs:
                                if attr not in d or d[attr] is None or attr not in obj:
                                    continue
                                if (d['id'], attr) not in similarity:
                                    s = SequenceMatcher(None, d[attr], obj[attr])
                                    similarity[(d['id'], attr)] = s.ratio()
                                row_similarity += similarity[(d['id'], attr)]
                            sim_lst[row_similarity] = d['id']
                        if not sim_lst:
                            # nothing was loaded, this rule cannot decide
                            continue
                        # get the most similar row to the object
                        obj['id'] = sim_lst[max(sim_lst.keys())]
                        if self.logger is not None:
                            self.logger.info("Found more than one result for lookup: %s, params: %s, "\
                                             "Choosen ID: %s, SQL: %s" % (obj._table_name, str(entity_comb), obj['id'], search_sql_query))
                        return True

        # no rule led to a match
        return False
    def lookup(self, obj, level=None):
        """
        Try to find the database ID of @obj and store it in obj["id"].

        More sophisticated RRSDatabase.contains(). This method doesn't call
        RRSDatabase.contains() explicitly; it is driven by the lookup rules
        returned by self._lookup_rules for the type of @obj and works in two
        stages:

        LEVEL 0 - attributes of @obj itself are compared for equality with
                  the rows of its own table.
        LEVEL 1 - entities related to @obj are looked up first (recursively,
                  with level - 1) and the row of @obj is then searched
                  through those relationships. When more than one row
                  matches, the row whose level 0 attributes are the most
                  similar to @obj is chosen.

        @param obj    database entity object to be looked up
        @param level  remaining depth of recursion; None means
                      self.lookup_level. A negative level ends the lookup
                      immediately (before @obj is validated).
        @returns True if found (the object now carries the ID)
                 False if not found
        @raises TypeError if @obj is not a database entity object
        @raises RRSDatabaseEntityError if @obj belongs to a meta-table
        """
        if level is None:
            level = self.lookup_level
        if level < 0:
            # recursion budget exhausted - report "not found" as documented
            return False
        if not isinstance(obj, _RRSDatabaseEntity):
            raise TypeError("lookup() method can be called only on database " "entity objects.")
        if obj._table_name.endswith("_meta"):
            raise RRSDatabaseEntityError("lookup() method cannot be called on meta-tables.")

        q = FluentSQLQuery()

        def add_condition(cond, value):
            # The first condition of a query goes through where(); presumably
            # where() raises FluentSQLQueryError once the query already has
            # a condition, so every following one is chained with AND.
            try:
                q.where(cond, value)
            except FluentSQLQueryError:
                q.and_(cond, value)

        # LEVEL 0 rules
        try:
            lvl_zero_rules = self._lookup_rules.get_rules(type(obj), 0)
        except KeyError:
            if self.logger is not None:
                self.logger.error("Level 0 rules for '%s' not found." % obj._table_name)
            return False
        for rule in lvl_zero_rules:
            attr_present = [item for item in rule.entities if item in obj]
            # if there are no such attributes or not the requested count of
            # them, continue to the next rule
            if rule.reqcount > len(attr_present):
                continue
            self._db.refresh()
            # the biggest combinations of attributes are tried first
            for cnt in reversed(range(rule.reqcount, len(attr_present) + 1)):
                for attr_comb in combinations(attr_present, cnt):
                    # now select them
                    q.cleanup()
                    q.select("id").from_table(obj._table_name)
                    for attr in attr_comb:
                        add_condition("%s=" % attr, obj[attr])
                    q()
                    res = q.fetch_all()
                    # there shouldn't be more results than one
                    if q.count() > 1 and self.logger is not None:
                        self.logger.warning(
                            "There are more than one identical "
                            "%ss. List of ID's: %s" % (obj._table_name, str([x[0] for x in res]))
                        )
                    if not res:
                        continue
                    obj["id"] = res[0][0]
                    return True

        # LEVEL 1 rules
        try:
            lvl_one_rules = self._lookup_rules.get_rules(type(obj), 1)
        except KeyError:
            if self.logger is not None:
                self.logger.error("Level 1 rules for '%s' not found." % obj._table_name)
            return False

        tg_tbl = obj._table_name
        for rule in lvl_one_rules:
            # these are objects which really are present in the entity
            ent_present = [item for item in rule.entities if item in obj]
            # if there are no such entities or not the requested count of
            # them, continue to the next rule
            if rule.reqcount > len(ent_present):
                continue

            # Map (entity name, type of its value) -> list of usable values.
            # Related entities get into the map only when their own lookup
            # succeeded, i.e. when they already carry an ID.
            ent_id_map = {}
            for ent_name in ent_present:
                target = obj[ent_name]
                if type(target) is list and target:
                    # list of relationship objects
                    key = (ent_name, type(target[0]))
                    ent_id_map[key] = []
                    for rel_obj in target:
                        entities = rel_obj.get_entities()
                        assert len(entities) > 0
                        e = entities[0]
                        if self.lookup(e, level - 1):
                            ent_id_map[key].append(e)
                    if not ent_id_map[key]:
                        del ent_id_map[key]
                elif isinstance(target, _RRSDatabaseEntity):
                    # this is FK - @target is RRS*** object
                    if self.lookup(target, level - 1):
                        ent_id_map[(ent_name, type(target))] = [target]
                else:
                    # plain attribute value
                    # NOTE(review): an empty list also ends up here and is
                    # then used as an attribute value - confirm it is intended
                    ent_id_map[(ent_name, type(target))] = [target]

            # if we did not find as much as the rule requests, continue
            if rule.reqcount > len(ent_id_map):
                continue

            ent_keys = [x[0] for x in ent_id_map.keys()]
            # entity name -> type of the value stored in ent_id_map
            ent_types = dict((name, etype) for name, etype in ent_id_map)

            # Start with the minimum count of requested entities. If a select
            # spits out too many results (>100), the rest of this count is
            # skipped and the process goes on with bigger combinations.
            for cnt in range(rule.reqcount, len(ent_id_map) + 1):
                for entity_comb in combinations(ent_keys, cnt):
                    self._db.refresh()  # re-create cursors to drop the loaded data
                    # construct the query
                    q.cleanup()
                    from_lst = [tg_tbl]
                    q.select("%s.id" % tg_tbl)
                    # recognition of the same table in the query
                    tablecounter = 1
                    for ent in entity_comb:
                        etype = ent_types[ent]
                        found = ent_id_map[(ent, etype)]

                        # @ent is instance of RRS****** - 1:N relationship
                        # the object contains id of this entity
                        if issubclass(etype, _RRSDatabaseEntity):
                            add_condition("%s.%s_id=" % (tg_tbl, ent), found[0]["id"])

                        # @ent is fake junction table - it means, that it's
                        # the second side of 1:N relationship - N:1.
                        elif issubclass(etype, _RRSDbEntityRelationship) and etype._fake_table:
                            # TODO
                            return False

                        # @ent is true junction table - this M:N relationship.
                        elif issubclass(etype, _RRSDbEntityRelationship):
                            j_tbl_uniq_as = None
                            # storage of all aliases of junction tables
                            j_tbl_uniq_as_list = []

                            # join together all the found entities - for example:
                            # given publication, two persons (authors), both found
                            # in db so create query which selects ID of publication
                            # which has both - the first AND the second person.
                            for o in found:
                                j_tbl_uniq_as = "%s%s" % (etype._table_name, tablecounter)
                                j_tbl_uniq_as_list.append(j_tbl_uniq_as)
                                e_tbl_uniq_as = "%s%s" % (o._table_name, tablecounter)
                                # add table to the list of tables we are joining together
                                from_lst.append("%s AS %s" % (etype._table_name, j_tbl_uniq_as))
                                from_lst.append("%s AS %s" % (o._table_name, e_tbl_uniq_as))
                                add_condition("%s.id=" % e_tbl_uniq_as, o["id"])
                                # the third argument presumably marks the value
                                # as a column reference, not a literal
                                q.and_("%s.%s_id=" % (j_tbl_uniq_as, o._table_name), "%s.id" % e_tbl_uniq_as, True)
                                tablecounter += 1

                            # add the condition that all the junction table ID's of
                            # the entity we are looking for have to be the same - we
                            # are looking not for union, but intersection of them
                            for j1, j2 in zip(j_tbl_uniq_as_list, j_tbl_uniq_as_list[1:]):
                                q.and_("%s.%s_id=" % (j1, tg_tbl), "%s.%s_id" % (j2, tg_tbl), True)
                            # bind junction table.entity_id to id of entity we are looking for
                            q.and_("%s.%s_id=" % (j_tbl_uniq_as, tg_tbl), "%s.id" % tg_tbl, True)

                        # @ent is attribute (int, basestring etc.)
                        else:
                            add_condition("%s.%s=" % (tg_tbl, ent), found[0])

                    q.from_table(from_lst)
                    q()
                    search_sql_query = q._sql
                    # now if the total count of probably identical rows is higher
                    # than 100, we need to specify it more, so we jump to next
                    # request count level (probably 1->2 or 2->3).
                    if q.count() > 100:
                        break
                    res = q.fetch_all()
                    if not res:
                        continue
                    if len(res) == 1:
                        obj["id"] = res[0][0]
                        if self.logger is not None:
                            self.logger.info(
                                "Found exactly one result for lookup: %s, params: %s, found ID: %s, SQL: %s"
                                % (obj._table_name, str(entity_comb), obj["id"], search_sql_query)
                            )
                        return True

                    # More candidates were returned: compare the level 0
                    # attributes of all of them with @obj and choose the
                    # most similar one.
                    q.cleanup()
                    id_list = [x[0] for x in res]
                    attrunion = set(["id"])
                    lvl_zero_rules = self._lookup_rules.get_rules(type(obj), 0)
                    # make a list of attributes needed to accomplish the rules
                    # (these are all which are present in rules)
                    for zero_rule in lvl_zero_rules:
                        attrunion.update(zero_rule.entities)
                    # construct query which loads all needed attributes of all returned ID's
                    q.select(list(attrunion)).from_table(obj._table_name)
                    for _id in id_list:
                        try:
                            q.where("id=", _id)
                        except FluentSQLQueryError:
                            q.or_("id=", _id)
                    q()  # perform the query
                    loaded_data = q.fetch_all()

                    # cache of computed ratios, key is (row id, attribute)
                    similarity = {}
                    # every rule tells us what attributes have to be similar
                    # (or identical)
                    for zero_rule in lvl_zero_rules:
                        attrs = [item for item in zero_rule.entities if item in obj]
                        if zero_rule.reqcount > len(attrs):
                            continue

                        # count every row's similarity (the result is sum of
                        # similarities of their attributes); rows with the
                        # same sum overwrite each other, the last one stays
                        sim_lst = {}
                        for d in loaded_data:
                            row_similarity = 0.0
                            for attr in attrs:
                                if attr not in d or d[attr] is None or attr not in obj:
                                    continue
                                if (d["id"], attr) not in similarity:
                                    s = SequenceMatcher(None, d[attr], obj[attr])
                                    similarity[(d["id"], attr)] = s.ratio()
                                row_similarity += similarity[(d["id"], attr)]
                            sim_lst[row_similarity] = d["id"]
                        if not sim_lst:
                            # nothing was loaded, this rule cannot decide
                            continue
                        # get the most similar row to the object
                        obj["id"] = sim_lst[max(sim_lst.keys())]
                        if self.logger is not None:
                            self.logger.info(
                                "Found more than one result for lookup: %s, params: %s, "
                                "Choosen ID: %s, SQL: %s"
                                % (obj._table_name, str(entity_comb), obj["id"], search_sql_query)
                            )
                        return True

        # no rule led to a match
        return False