Example #1
0
 def get_proxy_context(self):
     """Return the metadata that is attached to every generated entity.

     The timestamps are rendered with ``iso_text``; ``role_id`` is passed
     through unchanged.
     """
     created = iso_text(self.created_at)
     updated = iso_text(self.updated_at)
     return dict(created_at=created, updated_at=updated, role_id=self.role_id)
Example #2
0
 def get_proxy_context(self):
     """Return the metadata that is attached to every generated entity.

     Besides the ``iso_text``-rendered timestamps and the ``role_id``, the
     context always flags the entity as ``mutable``.
     """
     context = {"created_at": iso_text(self.created_at)}
     context["updated_at"] = iso_text(self.updated_at)
     context["role_id"] = self.role_id
     context["mutable"] = True
     return context
Example #3
0
 def to_proxy(self):
     """Convert this object into a proxy via ``model.get_proxy``.

     The payload carries the id, schema and properties (``self.data``)
     together with the context fields: timestamps, role and mutable flag.
     """
     payload = dict(
         id=self.id,
         schema=self.schema,
         properties=self.data,
         created_at=iso_text(self.created_at),
         updated_at=iso_text(self.updated_at),
         role_id=self.role_id,
         mutable=True,
     )
     return model.get_proxy(payload)
Example #4
0
 def to_proxy(self):
     """Convert this object into a proxy via ``model.get_proxy``.

     The payload (id, schema, properties and context fields) is handed over
     with ``cleaned=False``.
     """
     return model.get_proxy(
         {
             "id": self.id,
             "schema": self.schema,
             "properties": self.data,
             "created_at": iso_text(self.created_at),
             "updated_at": iso_text(self.updated_at),
             "role_id": self.role_id,
             "mutable": True,
         },
         cleaned=False,
     )
Example #5
0
 def to_proxy(self, ns=None):
     """Build a proxy for this object from its stored metadata.

     The id, parent and ancestors are signed with ``ns``. When ``ns`` is
     falsy, ``self.collection.ns`` is used instead. Properties are filled
     from ``self.meta``; file name and MIME type fall back to the stored
     headers when the metadata does not provide them.

     :param ns: namespace object used to sign entity ids (optional).
     :return: the populated proxy, flagged as not mutable.
     """
     ns = ns or self.collection.ns
     proxy = model.get_proxy({
         "id": ns.sign(self.id),
         "schema": self.model,
         "properties": {},
         "created_at": iso_text(self.created_at),
         "updated_at": iso_text(self.updated_at),
         "role_id": self.role_id,
         "mutable": False,
     })

     # Work on a copy so that popping "headers" does not alter self.meta.
     meta = dict(self.meta)
     headers = meta.pop("headers", None)
     if is_mapping(headers):
         # Slugify header names with "_" so the lookups further down can
         # use fixed keys ("content_disposition", "content_type").
         headers = {slugify(k, sep="_"): v for k, v in headers.items()}
         proxy.set("headers", registry.json.pack(headers), quiet=True)
     else:
         headers = {}

     proxy.set("contentHash", self.content_hash)
     proxy.set("parent", ns.sign(self.parent_id))
     proxy.set("ancestors", [ns.sign(a) for a in self.ancestors])
     proxy.set("crawler", meta.get("crawler"))
     proxy.set("sourceUrl", meta.get("source_url"))
     proxy.set("title", meta.get("title"))

     proxy.set("fileName", meta.get("file_name"))
     if not proxy.has("fileName"):
         # Fall back to the filename attribute of Content-Disposition.
         disposition = headers.get("content_disposition")
         if disposition is not None:
             # NOTE(review): the cgi module is deprecated (PEP 594) and
             # removed in Python 3.13; plan a migration to email.message.
             _, attrs = cgi.parse_header(disposition)
             proxy.set("fileName", attrs.get("filename"))

     proxy.set("mimeType", meta.get("mime_type"))
     if not proxy.has("mimeType"):
         proxy.set("mimeType", headers.get("content_type"))

     # Remaining properties map one-to-one onto a metadata key.
     for prop, key in (
         ("language", "languages"),
         ("country", "countries"),
         ("keywords", "keywords"),
         ("authoredAt", "authored_at"),
         ("modifiedAt", "modified_at"),
         ("publishedAt", "published_at"),
         ("retrievedAt", "retrieved_at"),
     ):
         proxy.set(prop, meta.get(key))
     return proxy
Example #6
0
def format_proxy(proxy, collection):
    """Apply final denormalisations to the index.

    :param proxy: entity proxy to serialise (via ``to_full_dict``).
    :param collection: collection whose ``id`` is stored on the document.
    :return: a dict with ``_id``, ``_index`` and ``_source`` keys; the
        entity id is moved out of the source into ``_id``.
    """
    data = proxy.to_full_dict()
    data['schemata'] = list(proxy.schema.names)

    names = ensure_list(data.get('names'))
    fps = {fingerprints.generate(name) for name in names}
    fps.update(names)
    # Drop None entries once, so that neither the fingerprints field nor
    # the index text below receives them.
    data['fingerprints'] = [fp for fp in fps if fp is not None]

    # Slight hack: a magic property in followthemoney that gets taken out
    # of the properties and added straight to the index text.
    properties = data.get('properties')
    text = properties.pop('indexText', [])
    text.extend(data['fingerprints'])
    data['text'] = text

    # integer casting
    numeric = {}
    for prop in proxy.iterprops():
        if prop.type in NUMERIC_TYPES:
            values = proxy.get(prop)
            numeric[prop.name] = _numeric_values(prop.type, values)
    # also cast group field for dates
    numeric['dates'] = _numeric_values(registry.date, data.get('dates'))
    data['numeric'] = numeric

    # Context data - from aleph system, not followthemoney.
    # Missing timestamps default to the current time.
    now = iso_text(datetime.utcnow())
    data['created_at'] = min(ensure_list(data.get('created_at')), default=now)
    data['updated_at'] = min(ensure_list(data.get('updated_at')), default=now)
    # FIXME: Can there ever really be multiple role_ids?
    data['role_id'] = first(data.get('role_id'))
    data['mutable'] = max(ensure_list(data.get('mutable')), default=False)
    data['origin'] = ensure_list(data.get('origin'))
    data['collection_id'] = collection.id
    entity_id = data.pop('id')
    return {
        '_id': entity_id,
        '_index': entities_write_index(data.get('schema')),
        '_source': data
    }