def all():
    """Paginated listing of all active entities visible to the current user.

    Restricts results to entities belonging to at least one collection the
    user may read, ordered by entity id.
    """
    # NOTE: this endpoint shadows the builtin `all`; kept as-is since the
    # route registration elsewhere relies on the name.
    query = Entity.all()
    query = query.filter(Entity.state == Entity.STATE_ACTIVE)
    readable = Collection.id.in_(authz.collections(authz.READ))
    query = query.filter(Entity.collections.any(readable))
    query = query.order_by(Entity.id.asc())
    return jsonify(Pager(query, limit=100))
def _generate(self): latest = Entity.latest() if self.latest is not None and self.latest >= latest: return self.latest = latest self.matches = defaultdict(set) q = Entity.all() q = q.options(joinedload('other_names')) q = q.filter(Entity.state == Entity.STATE_ACTIVE) for entity in q: for term in entity.regex_terms: self.matches[normalize_strong(term)].add(entity.id) self.regexes = [] terms = self.matches.keys() terms = [t for t in terms if len(t) > 2] for i in count(0): terms_slice = terms[i * BATCH_SIZE:(i + 1) * BATCH_SIZE] if not len(terms_slice): break body = '|'.join(terms_slice) rex = re.compile('( |^)(%s)( |$)' % body) # rex = re.compile('(%s)' % body) self.regexes.append(rex) log.info('Generating entity tagger: %r (%s terms)', latest, len(terms))
def _generate(self): latest = Entity.latest() if latest is None: return if self.latest is not None and self.latest >= latest: return self.latest = latest matches = {} q = Entity.all() q = q.filter(Entity.state == Entity.STATE_ACTIVE) for entity in q: for term in entity.regex_terms: if term in matches: matches[term].append(entity.id) else: matches[term] = [entity.id] if not len(matches): self.automaton = None return self.automaton = Automaton() for term, entities in matches.iteritems(): self.automaton.add_word(term.encode('utf-8'), entities) self.automaton.make_automaton() log.info('Generated automaton with %s terms', len(matches))
def build_automaton(self): q = Entity.all() q = q.filter(Entity.schema.in_(self.TYPES.keys())) matches = {} for entity in q: tag = self.TYPES.get(entity.schema) if tag is None: continue for name in entity.names: if name is None or len(name) > 120: continue match = self.match_form(name) if match is None: continue if match in matches: matches[match].append((name, tag)) else: matches[match] = [(name, tag)] if not len(matches): return automaton = Automaton() for term, entities in matches.iteritems(): automaton.add_word(term, entities) automaton.make_automaton() return automaton
def _generate(self): latest = Entity.latest() if latest is None: return if self.latest is not None and self.latest >= latest: return self.latest = latest matches = {} q = Entity.all() for entity in q: tag = self.TYPES.get(entity.schema) if tag is None: continue for name in entity.names: if name is None or len(name) > 120: continue match = match_form(name) # TODO: this is a weird heuristic, but to avoid overly # aggressive matching it may make sense: if match is None or ' ' not in match: continue if match in matches: matches[match].append((name, tag)) else: matches[match] = [(name, tag)] if not len(matches): return for term, entities in matches.iteritems(): self.automaton.add_word(term.encode('utf-8'), entities) self.automaton.make_automaton() log.info('Generated automaton with %s terms', len(matches))
def _generate(self): latest = Entity.latest() if latest is None: return if self.latest is not None and self.latest >= latest: return self.latest = latest matches = {} q = Entity.all() for entity in q: tag = self.TYPES.get(entity.schema) if tag is None: continue for term in entity.regex_terms: if term in matches: matches[term].append((entity.name, tag)) else: matches[term] = [(entity.name, tag)] if not len(matches): return for term, entities in matches.iteritems(): self.automaton.add_word(term.encode('utf-8'), entities) self.automaton.make_automaton() log.info('Generated automaton with %s terms', len(matches))
def load_entities():
    """Load every active entity into the graph within a single transaction."""
    tx = get_graph().begin()
    active = Entity.all().filter(Entity.state == Entity.STATE_ACTIVE)
    for entity in active:
        load_entity(tx, entity)
    tx.commit()
def load_entities():
    """Load all active entities into the graph, committing in 10k batches."""
    graph = get_graph()
    tx = graph.begin()
    active = Entity.all().filter(Entity.state == Entity.STATE_ACTIVE)
    for idx, entity in enumerate(active):
        load_entity(tx, entity)
        # Commit periodically so a large load doesn't accumulate in one
        # giant transaction; open a fresh one for the next chunk.
        if idx and idx % 10000 == 0:
            tx.commit()
            tx = graph.begin()
    tx.commit()
def _generate(self): latest = Entity.latest() if self.latest is not None and self.latest >= latest: return self.latest = latest matches = defaultdict(set) q = Entity.all() q = q.options(joinedload('other_names')) q = q.filter(Entity.state == Entity.STATE_ACTIVE) for entity in q: for term in entity.regex_terms: matches[term].add(entity.id) if not len(matches): self.automaton = None return self.automaton = Automaton() for term, entities in matches.items(): self.automaton.add_word(term.encode('utf-8'), entities) self.automaton.make_automaton() log.info('Generated automaton with %s terms', len(matches))
def index():
    """Paginated listing of entities in collections the user may read."""
    readable = match_ids('collection', authz.collections(authz.READ))
    query = Entity.all().filter(Entity.collection_id.in_(readable))
    return jsonify(Pager(query))