Beispiel #1
0
    def _generate(self):
        """Rebuild ``self.automaton`` from the regex terms of active entities.

        Nothing is done when ``Entity.latest()`` returns ``None`` or when the
        stored ``self.latest`` marker is already greater than or equal to the
        returned value. Otherwise the marker is updated and every regex term
        of every active entity is mapped to the list of ids of the entities
        that carry it. With no terms collected ``self.automaton`` is set to
        ``None``; otherwise a new ``Automaton`` is filled with the UTF-8
        encoded terms (value: the id list) and finalised.
        """
        newest = Entity.latest()
        if newest is None:
            return
        cached = self.latest
        if cached is not None and cached >= newest:
            # Already built from data at least as recent as ``newest``.
            return
        self.latest = newest

        # term -> ids of every active entity that lists this term
        ids_by_term = {}
        active = Entity.all().filter(Entity.state == Entity.STATE_ACTIVE)
        for entity in active:
            for term in entity.regex_terms:
                ids_by_term.setdefault(term, []).append(entity.id)

        if not ids_by_term:
            self.automaton = None
            return

        self.automaton = automaton = Automaton()
        for term, entity_ids in ids_by_term.iteritems():
            automaton.add_word(term.encode('utf-8'), entity_ids)
        automaton.make_automaton()
        log.info('Generated automaton with %s terms', len(ids_by_term))
Beispiel #2
0
    def _generate(self):
        """Rebuild ``self.matches`` and the batched regexes in ``self.regexes``.

        The rebuild is skipped only when both ``self.latest`` and the value
        returned by ``Entity.latest()`` are set and the stored one is greater
        than or equal to the new one. Otherwise ``self.latest`` is updated,
        ``self.matches`` is reset to a mapping from ``normalize_strong(term)``
        to the set of ids of active entities carrying that term, and
        ``self.regexes`` is refilled with one compiled pattern per batch of
        at most ``BATCH_SIZE`` terms longer than two characters.
        """
        latest = Entity.latest()
        # Compare only when both markers are set: ordering a value against
        # None raises TypeError (always on Python 3, and for datetime values
        # on Python 2 as well). With a None marker we fall through and
        # rebuild. NOTE(review): presumably Entity.latest() returns None when
        # there is nothing to report -- confirm against its definition.
        if (latest is not None and self.latest is not None
                and self.latest >= latest):
            return

        self.latest = latest
        self.matches = defaultdict(set)

        q = Entity.all()
        q = q.options(joinedload('other_names'))
        q = q.filter(Entity.state == Entity.STATE_ACTIVE)
        for entity in q:
            for term in entity.regex_terms:
                self.matches[normalize_strong(term)].add(entity.id)

        self.regexes = []
        terms = [t for t in self.matches if len(t) > 2]
        # One alternation per batch; each term must be delimited by a space
        # or by the start/end of the text.
        # NOTE(review): terms are joined without re.escape -- assumes the
        # normalised terms contain no regex metacharacters; confirm.
        offset = 0
        while True:
            batch = terms[offset:offset + BATCH_SIZE]
            if not batch:
                break
            body = '|'.join(batch)
            self.regexes.append(re.compile('( |^)(%s)( |$)' % body))
            offset += BATCH_SIZE

        log.info('Generating entity tagger: %r (%s terms)',
                 latest, len(terms))
Beispiel #3
0
    def _generate(self):
        """Load tagged entity terms into ``self.automaton`` when data changed.

        Returns without doing anything when ``Entity.latest()`` is ``None`` or
        when ``self.latest`` is already greater than or equal to it. Otherwise
        the marker is updated and, for every entity whose ``schema`` has an
        entry in ``self.TYPES``, each regex term is mapped to a list of
        ``(entity.name, tag)`` tuples. If any terms were found they are added
        (UTF-8 encoded) to the existing ``self.automaton``, which is then
        finalised.
        """
        newest = Entity.latest()
        if newest is None:
            return
        cached = self.latest
        if cached is not None and cached >= newest:
            return
        self.latest = newest

        # term -> [(entity name, tag), ...]
        hits = {}
        for entity in Entity.all():
            tag = self.TYPES.get(entity.schema)
            if tag is not None:
                for term in entity.regex_terms:
                    hits.setdefault(term, []).append((entity.name, tag))

        if not hits:
            return

        # NOTE(review): the automaton is reused rather than recreated here, so
        # words added by an earlier run presumably stay in it -- confirm that
        # this is intended.
        for term, tagged in hits.iteritems():
            self.automaton.add_word(term.encode('utf-8'), tagged)
        self.automaton.make_automaton()
        log.info('Generated automaton with %s terms', len(hits))
Beispiel #4
0
    def _generate(self):
        """Load tagged entity name forms into ``self.automaton`` on change.

        Returns without doing anything when ``Entity.latest()`` is ``None`` or
        when ``self.latest`` is already greater than or equal to it. Otherwise
        the marker is updated and, for every entity whose ``schema`` has an
        entry in ``self.TYPES``, each name of at most 120 characters is passed
        through ``match_form``; forms that contain a space are mapped to a
        list of ``(name, tag)`` tuples. If any forms were found they are added
        (UTF-8 encoded) to the existing ``self.automaton``, which is then
        finalised.
        """
        newest = Entity.latest()
        if newest is None:
            return
        cached = self.latest
        if cached is not None and cached >= newest:
            return
        self.latest = newest

        # match form -> [(original name, tag), ...]
        tagged_by_form = {}
        for entity in Entity.all():
            tag = self.TYPES.get(entity.schema)
            if tag is None:
                continue
            for name in entity.names:
                if name is not None and len(name) <= 120:
                    form = match_form(name)
                    # TODO: this is a weird heuristic, but to avoid overly
                    # aggressive matching it may make sense: only forms that
                    # contain a space are kept.
                    if form is not None and ' ' in form:
                        tagged_by_form.setdefault(form, []).append((name, tag))

        if not tagged_by_form:
            return

        # NOTE(review): the automaton is reused rather than recreated here, so
        # words added by an earlier run presumably stay in it -- confirm that
        # this is intended.
        for form, tagged in tagged_by_form.iteritems():
            self.automaton.add_word(form.encode('utf-8'), tagged)
        self.automaton.make_automaton()
        log.info('Generated automaton with %s terms', len(tagged_by_form))
Beispiel #5
0
    def _generate(self):
        """Refresh the term-to-entity map and the batched matching patterns.

        A refresh happens unless both ``self.latest`` and the result of
        ``Entity.latest()`` are set and the stored value is greater than or
        equal to the new one. On refresh ``self.latest`` is updated,
        ``self.matches`` becomes a mapping from ``normalize_strong(term)`` to
        the set of ids of the active entities with that term, and
        ``self.regexes`` is rebuilt with one compiled alternation per slice of
        at most ``BATCH_SIZE`` terms whose length exceeds two characters.
        """
        latest = Entity.latest()
        if latest is not None and self.latest is not None:
            # Both markers are set, so the ordering comparison is safe. It is
            # deliberately not attempted against None: that raises TypeError
            # on Python 3 (and for datetime values on Python 2 too).
            # NOTE(review): presumably Entity.latest() can return None --
            # confirm against its definition.
            if self.latest >= latest:
                return

        self.latest = latest
        self.matches = defaultdict(set)

        active = Entity.all()
        active = active.options(joinedload('other_names'))
        active = active.filter(Entity.state == Entity.STATE_ACTIVE)
        for entity in active:
            for term in entity.regex_terms:
                self.matches[normalize_strong(term)].add(entity.id)

        self.regexes = []
        terms = [t for t in self.matches if len(t) > 2]
        # Walk the term list in BATCH_SIZE steps; the first empty slice marks
        # the end of the list.
        # NOTE(review): terms are joined without re.escape -- assumes the
        # normalised terms contain no regex metacharacters; confirm.
        for start in count(0, BATCH_SIZE):
            batch = terms[start:start + BATCH_SIZE]
            if not batch:
                break
            pattern = '( |^)(%s)( |$)' % '|'.join(batch)
            self.regexes.append(re.compile(pattern))

        log.info('Generating entity tagger: %r (%s terms)', latest, len(terms))
Beispiel #6
0
    def _generate(self):
        """Rebuild ``self.automaton`` from the terms of all active entities.

        The rebuild is skipped only when both ``self.latest`` and the value
        returned by ``Entity.latest()`` are set and the stored one is greater
        than or equal to the new one. Otherwise ``self.latest`` is updated and
        each regex term of each active entity is mapped to the set of ids of
        the entities carrying it. If no terms were collected
        ``self.automaton`` becomes ``None``; otherwise a fresh ``Automaton``
        is populated with the UTF-8 encoded terms (value: the id set) and
        finalised.
        """
        latest = Entity.latest()
        # Compare only when both markers are set: ordering a value against
        # None raises TypeError on Python 3. With a None marker we fall
        # through and rebuild, which yields ``automaton = None`` when no
        # active entity has terms.
        # NOTE(review): presumably Entity.latest() can return None -- confirm
        # against its definition.
        if (latest is not None and self.latest is not None
                and self.latest >= latest):
            return
        self.latest = latest

        matches = defaultdict(set)
        q = Entity.all()
        q = q.options(joinedload('other_names'))
        q = q.filter(Entity.state == Entity.STATE_ACTIVE)
        for entity in q:
            for term in entity.regex_terms:
                matches[term].add(entity.id)

        if not matches:
            self.automaton = None
            return

        self.automaton = Automaton()
        for term, entities in matches.items():
            self.automaton.add_word(term.encode('utf-8'), entities)
        self.automaton.make_automaton()
        log.info('Generated automaton with %s terms', len(matches))
Beispiel #7
0
    def _generate(self):
        """Regenerate the term automaton when the entity data has moved on.

        No work is done when ``self.latest`` and the result of
        ``Entity.latest()`` are both set and the stored value is greater than
        or equal to the new one. Otherwise ``self.latest`` is replaced, the
        regex terms of all active entities are grouped into a mapping from
        term to the set of entity ids, and ``self.automaton`` is set to
        ``None`` (no terms) or to a newly built ``Automaton`` holding every
        UTF-8 encoded term with its id set.
        """
        latest = Entity.latest()
        if latest is not None and self.latest is not None:
            # Safe to order the two markers here. The comparison is skipped
            # when either side is None because ordering against None raises
            # TypeError on Python 3; in that case a rebuild is performed.
            # NOTE(review): presumably Entity.latest() can return None --
            # confirm against its definition.
            if self.latest >= latest:
                return
        self.latest = latest

        ids_by_term = defaultdict(set)
        active = Entity.all()
        active = active.options(joinedload('other_names'))
        active = active.filter(Entity.state == Entity.STATE_ACTIVE)
        for entity in active:
            for term in entity.regex_terms:
                ids_by_term[term].add(entity.id)

        if not ids_by_term:
            self.automaton = None
            return

        self.automaton = Automaton()
        for term, entity_ids in ids_by_term.items():
            self.automaton.add_word(term.encode('utf-8'), entity_ids)
        self.automaton.make_automaton()
        log.info('Generated automaton with %s terms', len(ids_by_term))