Example #1
0
 def __init__(self, dataset, entity, context):
     """Set up the output entity, the namespace and both tag aggregators.

     :param dataset: kept on the instance; its ``name`` is the fallback
         namespace key.
     :param entity: source entity; only ``schema`` and ``id`` are read here.
     :param context: mapping whose optional ``"namespace"`` entry takes
         precedence over ``dataset.name``.
     """
     self.dataset = dataset
     namespace_key = context.get("namespace", dataset.name)
     self.ns = Namespace(namespace_key)

     # Results are collected on a new entity built from the source's
     # schema and carrying the source's id, not on the source itself.
     target = model.make_entity(entity.schema)
     target.id = entity.id
     self.entity = target

     # One aggregator per tag source.
     self.aggregator_entities = TagAggregatorFasttext()
     self.aggregator_patterns = TagAggregator()
Example #2
0
class Analyzer(object):
    """Collect tags from entity text and store them on an output entity.

    Entities are passed to :meth:`feed`; the tags gathered from their text
    are written onto ``self.entity`` and saved to the dataset by
    :meth:`flush`.
    """

    def __init__(self, dataset, entity):
        """Create the output entity and an empty tag aggregator.

        :param dataset: object whose ``put`` receives the output entity.
        :param entity: source entity; only ``schema`` and ``id`` are read.
        """
        self.dataset = dataset
        # The output entity is built from the source's schema and is given
        # the source's id.
        target = model.make_entity(entity.schema)
        target.id = entity.id
        self.entity = target
        self.aggregator = TagAggregator()

    def feed(self, entity):
        """Extract tags from the text values of ``entity``.

        Does nothing when ``settings.ANALYZE_ENTITIES`` is falsy, when the
        entity is not a ``Document``, or when it is a ``Table``.
        """
        if not settings.ANALYZE_ENTITIES:
            return
        schema = entity.schema
        # TODO: should we have a schema called "Taggable" with
        # the XXmentioned properties?
        # HACK: Tables will be mapped, don't try to tag them here.
        if not schema.is_a('Document') or schema.is_a('Table'):
            return

        # Entity extraction runs before pattern extraction on every chunk.
        extractors = (extract_entities, extract_patterns)
        for chunk in text_chunks(entity.get_type_values(registry.text)):
            detect_languages(self.entity, chunk)
            for extract in extractors:
                for prop, tag in extract(self.entity, chunk):
                    self.aggregator.add(prop, tag)

    def flush(self):
        """Copy aggregated tags onto the output entity and store it.

        The entity is only written to the dataset when the aggregator
        reports a non-zero length.
        """
        # The aggregator yields (label, prop); ``add`` takes prop first.
        for label, prop in self.aggregator.entities:
            self.entity.add(prop, label, cleaned=True)

        tag_count = len(self.aggregator)
        if not tag_count:
            return

        log.debug("Extracted %d tags: %r", tag_count, self.entity)
        self.dataset.put(self.entity)
Example #3
0
 def __init__(self, dataset, entity):
     """Set up the output entity and an empty tag aggregator.

     :param dataset: kept on the instance unchanged.
     :param entity: source entity; only ``schema`` and ``id`` are read here.
     """
     self.dataset = dataset

     # Results are collected on a new entity built from the source's
     # schema and carrying the source's id, not on the source itself.
     target = model.make_entity(entity.schema)
     target.id = entity.id
     self.entity = target

     self.aggregator = TagAggregator()
Example #4
0
class Analyzer(object):
    """Collect tags from entity text; emit tagged entity and mentions.

    Entities are passed to :meth:`feed`. :meth:`flush` writes the
    aggregated values onto ``self.entity`` and, for the properties listed
    in ``MENTIONS``, also writes one ``Mention`` entity per result.
    """

    # Tag properties that produce a "Mention" entity, mapped to the value
    # stored in that mention's "detectedSchema" property.
    MENTIONS = {TAG_COMPANY: "Organization", TAG_PERSON: "Person"}

    def __init__(self, dataset, entity, context):
        """Create the output entity, the namespace and both aggregators.

        :param dataset: object providing ``bulk()``; its ``name`` is the
            fallback namespace key.
        :param entity: source entity; only ``schema`` and ``id`` are read.
        :param context: mapping whose optional ``"namespace"`` entry takes
            precedence over ``dataset.name``.
        """
        self.dataset = dataset
        namespace_key = context.get("namespace", dataset.name)
        self.ns = Namespace(namespace_key)

        # The output entity is built from the source's schema and is given
        # the source's id.
        target = model.make_entity(entity.schema)
        target.id = entity.id
        self.entity = target

        self.aggregator_entities = TagAggregatorFasttext()
        self.aggregator_patterns = TagAggregator()

    def feed(self, entity):
        """Extract tags from the text values of ``entity``.

        Does nothing when ``settings.ANALYZE_ENTITIES`` is falsy, when the
        entity's schema is not ``ANALYZABLE``, or when it is a ``Table``.
        """
        if not settings.ANALYZE_ENTITIES:
            return
        schema = entity.schema
        # HACK: Tables should be mapped, don't try to tag them here.
        if not schema.is_a(ANALYZABLE) or schema.is_a("Table"):
            return

        # Each extractor feeds its own aggregator; entity extraction runs
        # before pattern extraction on every chunk.
        extractors = (
            (extract_entities, self.aggregator_entities),
            (extract_patterns, self.aggregator_patterns),
        )
        for chunk in text_chunks(entity.get_type_values(registry.text)):
            detect_languages(self.entity, chunk)
            for extract, aggregator in extractors:
                for prop, tag in extract(self.entity, chunk):
                    aggregator.add(prop, tag)

    def flush(self):
        """Write aggregated results to the dataset's bulk writer.

        :return: set of mention ids, as they were before the namespace was
            applied to each mention.
        """
        writer = self.dataset.bulk()
        entity_results = self.aggregator_entities.results()
        pattern_results = self.aggregator_patterns.results()
        results = list(chain(entity_results, pattern_results))

        # Every mention carries the full set of country keys, so gather
        # them before any mention is built.
        countries = {
            key for key, prop, _ in results if prop.type == registry.country
        }

        mention_ids = set()
        for key, prop, values in results:
            label = values[0]
            if prop.type == registry.name:
                label = registry.name.pick(values)

            detected = self.MENTIONS.get(prop)
            if detected is not None and self.entity.schema.is_a(DOCUMENT):
                mention = self._build_mention(
                    key, prop, values, detected, countries
                )
                # The id is recorded before the namespace is applied.
                mention_ids.add(mention.id)
                writer.put(self.ns.apply(mention))

            self.entity.add(prop, label, cleaned=True, quiet=True)

        if not results:
            return mention_ids

        log.debug(
            "Extracted %d prop values, %d mentions [%s]: %s",
            len(results),
            len(mention_ids),
            self.entity.schema.name,
            self.entity.id,
        )
        writer.put(self.entity)
        writer.flush()
        return mention_ids

    def _build_mention(self, key, prop, values, detected_schema, countries):
        """Return the Mention entity for one result, namespace not applied."""
        mention = model.make_entity("Mention")
        mention.make_id("mention", self.entity.id, prop, key)
        mention.add("resolved", make_entity_id(key))
        mention.add("document", self.entity.id)
        mention.add("name", values)
        mention.add("detectedSchema", detected_schema)
        mention.add("contextCountry", countries)
        return mention