def _merge_schemata(proxy, schemata):
    for other in schemata:
        try:
            other = model.get(other)
            proxy.schema = model.common_schema(proxy.schema, other)
        except InvalidData:
            proxy.schema = model.get(Entity.LEGAL_ENTITY)
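# Usage sketch (not part of the original module): assuming followthemoney is
# installed and _merge_schemata above is in scope, merging "Person" into a
# "LegalEntity" proxy narrows it to the more specific schema; an incompatible
# pair would fall back to LegalEntity through the InvalidData branch.
def _example_merge_schemata():
    from followthemoney import model  # presumably the same `model` the helper refers to

    proxy = model.make_entity("LegalEntity")
    _merge_schemata(proxy, ["Person"])
    assert proxy.schema == model.get("Person")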
def datagen(proxy):
    if proxy.schema.is_a(model.get("Page")):
        yield from type_datagen_page(proxy)
        return
    elif proxy.schema.is_a(model.get("Document")):
        return
    yield from type_datagen(proxy)
    yield from schema_datagen(proxy)
def ingest(self, file_path, entity):
    """Ingestor implementation."""
    if entity.schema == model.get("Document"):
        entity.schema = model.get("Folder")
    if file_path is None or not file_path.is_dir():
        return
    self.crawl(self.manager, file_path, parent=entity)
def iter_proxies(**kw):
    document = model.get(Document.SCHEMA)
    includes = ['schema', 'properties']
    for data in iter_entities(includes=includes, **kw):
        schema = model.get(data.get('schema'))
        if schema is None:
            continue
        if 'properties' not in data and schema.is_a(document):
            data.update(Document.doc_data_to_schema(data))
        yield model.get_proxy(data)
def enrich_xref(
    G,
    foreign_id,
    match_collection_ids=None,
    entity_schemata=None,
    match_schemata=None,
    min_score=0.5,
    skip_unknown_entities=True,
):
    if entity_schemata:
        entity_schema = model.get(entity_schemata)
    if match_schemata:
        match_schema = model.get(match_schemata)
    collection = alephclient.get_collection_by_foreign_id(foreign_id)
    collection_id = collection["id"]
    xrefs = alephclient.get_collection_xref(collection_id, publisher=True)
    N = 0
    for xref in tqdm(xrefs):
        if xref["score"] < min_score:
            log.debug(
                f"Stopping xref enrichment due to low xref score: {xref['score']} < {min_score}"
            )
            break
        match_collection_id = int(xref["match_collection"]["collection_id"])
        if match_collection_ids and match_collection_id not in match_collection_ids:
            log.debug(
                f"Collection not wanted: {match_collection_ids}: {match_collection_id}"
            )
            continue
        if skip_unknown_entities and xref["entity"]["id"] not in G:
            log.debug(f"Entity not in graph: {xref['entity']}")
            continue
        entity_proxy = parse_entity(xref["entity"])
        match_proxy = parse_entity(xref["match"])
        if entity_schemata and not entity_proxy.schema.is_a(entity_schema):
            log.debug(
                f"Entity is not the right schema: {entity_schema}: {entity_proxy.schema}"
            )
            continue
        if match_schemata and not match_proxy.schema.is_a(match_schema):
            log.debug(
                f"Match is not the right schema: {match_schema}: {match_proxy.schema}"
            )
            continue
        try:
            G.add_proxy(entity_proxy)
            G.add_proxy(match_proxy)
            G.merge_proxies(entity_proxy, match_proxy)
        except InvalidData:
            pass
        N += 1
    return N
def ingest(self, file_path, entity):
    try:
        entity.schema = model.get("Audio")
        metadata = MediaInfo.parse(file_path)
        for track in metadata.tracks:
            entity.add("title", track.title)
            entity.add("generator", track.writing_application)
            entity.add("generator", track.writing_library)
            entity.add("generator", track.publisher)
            entity.add(
                "authoredAt", self.parse_timestamp(track.recorded_date)
            )  # noqa
            entity.add(
                "authoredAt", self.parse_timestamp(track.tagged_date)
            )  # noqa
            entity.add(
                "authoredAt", self.parse_timestamp(track.encoded_date)
            )  # noqa
            modified_at = self.parse_timestamp(
                track.file_last_modification_date
            )  # noqa
            entity.add("modifiedAt", modified_at)
            if track.sampling_rate:
                entity.add("samplingRate", track.sampling_rate)
            entity.add("duration", track.duration)
    except Exception as ex:
        raise ProcessingException("Could not read audio: %r" % ex) from ex
def ingest(self, file_path, entity):
    entity.schema = model.get('Table')
    with io.open(file_path, 'rb') as fh:
        encoding = self.detect_stream_encoding(fh)
        log.debug("Detected encoding [%r]: %s", entity, encoding)
    fh = io.open(file_path, 'r', encoding=encoding, errors='replace')
    try:
        sample = fh.read(4096 * 10)
        fh.seek(0)
        dialect = csv.Sniffer().sniff(sample)
        # dialect.delimiter = dialect.delimiter[0]
        has_header = csv.Sniffer().has_header(sample)
        reader = csv.reader(fh, dialect=dialect)
        rows = self.generate_rows(reader, has_header=has_header)
        self.emit_row_dicts(entity, rows)
    except UnicodeDecodeError as ude:
        log.warning("Encoding error: %r", entity)
        raise ProcessingException("Could not decode CSV (%s)" % encoding) from ude  # noqa
    except Exception as err:
        log.exception("CSV error: %s", err)
        raise ProcessingException("Invalid CSV: %s" % err) from err
    finally:
        fh.close()
def ingest(self, file_path, entity):
    entity.schema = model.get("Email")
    try:
        msg = Message(file_path.as_posix())
    except Exception as exc:
        msg = "Cannot open message file: %s" % exc
        raise ProcessingException(msg) from exc
    self.extract_olefileio_metadata(msg, entity)
    try:
        self.extract_msg_headers(entity, msg.header)
    except Exception:
        log.exception("Cannot parse Outlook-stored headers")
    entity.add("subject", msg.subject)
    entity.add("threadTopic", msg.getStringField("0070"))
    entity.add("encoding", msg.encoding)
    entity.add("bodyText", msg.body)
    entity.add("bodyHtml", msg.htmlBody)
    entity.add("messageId", self.parse_message_ids(msg.message_id))
    if not entity.has("inReplyTo"):
        entity.add("inReplyTo", self.parse_references(msg.references, []))
    try:
        date = parsedate_to_datetime(msg.date).isoformat()
        entity.add("date", date)
    except Exception:
        log.warning("Could not parse date: %s", msg.date)
    # sender name and email
    sender = self.get_identities(msg.sender)
    self.apply_identities(entity, sender, "emitters", "sender")
    # received by
    sender = self.get_identity(
        msg.getStringField("0040"), msg.getStringField("0076")
    )
    self.apply_identities(entity, sender, "emitters")
    froms = self.get_identities(msg.getStringField("1046"))
    self.apply_identities(entity, froms, "emitters", "from")
    tos = self.get_identities(msg.to)
    self.apply_identities(entity, tos, "recipients", "to")
    ccs = self.get_identities(msg.cc)
    self.apply_identities(entity, ccs, "recipients", "cc")
    bccs = self.get_identities(msg.bcc)
    self.apply_identities(entity, bccs, "recipients", "bcc")
    self.resolve_message_ids(entity)
    for attachment in msg.attachments:
        if attachment.type != "data":
            continue
        name = stringify(attachment.longFilename)
        name = name or stringify(attachment.shortFilename)
        self.ingest_attachment(entity, name, attachment.type, attachment.data)
def ingest(self, file_path, entity):
    entity.schema = model.get('Pages')
    pdf_path = self.make_work_file('tiff.pdf')
    self.exec_command('tiff2pdf', file_path,
                      '-x', '300',
                      '-y', '300',
                      '-o', pdf_path)
    self.assert_outfile(pdf_path)
    self.pdf_alternative_extract(entity, pdf_path)
def convert_party(party):
    entity = model.make_entity('LegalEntity')
    party_id = party.pop('id', None)
    identifier = party.pop('identifier', {})
    if party_id is None:
        party_id = identifier.get('id')
    entity.make_id(party_id)
    convert_name(entity, party)
    convert_address(entity, party.pop('address', {}))
    convert_address(entity, party.pop('deliveryAddress', {}))
    entity.add('legalForm', party.pop('organizationType', None))
    contact = party.pop('contactPoint', {})
    entity.add('website', contact.pop('url', None))
    entity.add('phone', contact.pop('telephone', None))
    entity.add('email', contact.pop('email', None))
    convert_identifier(entity, identifier)
    for identifier in party.pop('additionalIdentifiers', []):
        convert_identifier(entity, identifier)
    yield entity
    for mem in ensure_list(party.pop('memberOf', [])):
        for other in convert_party(mem):
            other.schema = model.get('Organization')
            yield other
            mem = model.make_entity('Membership')
            mem.make_id(entity.id, other.id)
            mem.add('member', entity)
            mem.add('organization', other)
            yield mem
    party.pop('roles', None)
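# Usage sketch (hypothetical input, shaped loosely like an OCDS "party"
# object; convert_name, convert_address and convert_identifier are helpers
# from the surrounding module): the generator yields the LegalEntity first,
# then an Organization plus a Membership for each "memberOf" entry.
#
#   party = {
#       "id": "party-1",
#       "name": "Example Ltd.",
#       "memberOf": [{"id": "group-1", "name": "Example Group"}],
#   }
#   entities = list(convert_party(party))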
def entity_tags(entity, authz):
    """Do a search on tags of an entity."""
    proxy = model.get_proxy(entity)
    Thing = model.get(Entity.THING)
    types = [registry.name, registry.email, registry.identifier,
             registry.iban, registry.phone, registry.address]
    facets = []
    # Go through all the tags which apply to this entity, and find how
    # often they've been mentioned in other entities.
    for type_ in types:
        if type_.group is None:
            continue
        for fidx, value in enumerate(proxy.get_type_values(type_)):
            if type_.specificity(value) < 0.1:
                continue
            schemata = model.get_type_schemata(type_)
            schemata = [s for s in schemata if s.is_a(Thing)]
            index = entities_read_index(schemata)
            alias = '%s_%s' % (type_.name, fidx)
            facets.append((index, alias, type_.group, type_.group, value))
    res = _filters_faceted_query(authz, facets)
    for (_, alias, field, _, value) in facets:
        total = res.get(alias, 0)
        if total > 1:
            yield (field, value, total)
def ingest(self, file_path, entity):
    entity.schema = model.get('Email')
    try:
        msg = Message(file_path.as_posix())
    except Exception as exc:
        msg = "Cannot open message file: %s" % exc
        raise ProcessingException(msg) from exc
    self.extract_olefileio_metadata(msg, entity)
    try:
        self.extract_msg_headers(entity, msg.header)
    except Exception:
        log.exception("Cannot parse Outlook-stored headers")
    entity.add('subject', msg.subject)
    entity.add('threadTopic', msg.getStringField('0070'))
    entity.add('encoding', msg.encoding)
    entity.add('bodyText', msg.body)
    entity.add('bodyHtml', msg.htmlBody)
    entity.add('messageId', self.parse_message_ids(msg.message_id))
    if not entity.has('inReplyTo'):
        entity.add('inReplyTo', self.parse_references(msg.references, []))
    try:
        date = parsedate_to_datetime(msg.date).isoformat()
        entity.add('date', date)
    except Exception:
        log.warning("Could not parse date: %s", msg.date)
    # sender name and email
    sender = self.get_identities(msg.sender)
    self.apply_identities(entity, sender, 'emitters', 'sender')
    # received by
    sender = self.get_identity(msg.getStringField('0040'),
                               msg.getStringField('0076'))
    self.apply_identities(entity, sender, 'emitters')
    froms = self.get_identities(msg.getStringField('1046'))
    self.apply_identities(entity, froms, 'emitters', 'from')
    tos = self.get_identities(msg.to)
    self.apply_identities(entity, tos, 'recipients', 'to')
    ccs = self.get_identities(msg.cc)
    self.apply_identities(entity, ccs, 'recipients', 'cc')
    bccs = self.get_identities(msg.bcc)
    self.apply_identities(entity, bccs, 'recipients', 'bcc')
    self.resolve_message_ids(entity)
    for attachment in msg.attachments:
        if attachment.type != 'data':
            continue
        name = stringify(attachment.longFilename)
        name = name or stringify(attachment.shortFilename)
        self.ingest_attachment(entity, name, attachment.type, attachment.data)
def _serialize(self, obj):
    pk = obj.get('id')
    obj['id'] = str(pk)
    collection_id = obj.pop('collection_id', None)
    obj['writeable'] = request.authz.can(collection_id, request.authz.WRITE)  # noqa
    obj['collection'] = self.resolve(Collection, collection_id, CollectionSerializer)  # noqa
    ent_ids = obj.pop('entities')
    obj['entities'] = []
    for ent_id in ent_ids:
        entity = self.resolve(Entity, ent_id, DiagramEntitySerializer)
        if entity is not None:
            obj['entities'].append(entity)
    for ent in obj['entities']:
        schema = model.get(ent.get('schema'))
        properties = ent.get('properties', {})
        for prop in schema.properties.values():
            if prop.type != registry.entity:
                continue
            values = ensure_list(properties.get(prop.name))
            if values:
                properties[prop.name] = []
                for value in values:
                    entity = self.resolve(Entity, value, DiagramEntitySerializer)  # noqa
                    properties[prop.name].append(entity)
    return self._clean_response(obj)
def add_cast(
    self,
    schema: Union[str, Schema],
    prop: Union[str, Property],
    values: Any,
    cleaned: bool = False,
    fuzzy: bool = False,
    format: Optional[str] = None,
):
    """Set a property on an entity. If the entity is of a schema that doesn't
    have the given property, also modify the schema (e.g. if something has a
    birthDate, assume it's a Person, not a LegalEntity).
    """
    prop_ = self.schema.get(prop)
    if prop_ is not None:
        return self.add(prop, values, cleaned=cleaned, fuzzy=fuzzy, format=format)
    schema_ = model.get(schema)
    if schema_ is None:
        raise RuntimeError("Invalid schema: %s" % schema)
    prop_ = schema_.get(prop)
    if prop_ is None:
        raise RuntimeError("Invalid prop: %s" % prop)
    for value in self._lookup_values(prop_, values):
        clean = self._verbose_clean(prop_, value, fuzzy, format)
        if clean is not None:
            self.add_schema(schema)
            self.unsafe_add(prop_, clean, cleaned=True)
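# Usage sketch (hypothetical; assumes `entity` is an instance of the class
# that defines add_cast and currently carries the "LegalEntity" schema):
# adding a Person-only property upgrades the schema as a side effect, while
# a property the current schema already has is added without any cast.
#
#   entity.add_cast("Person", "birthDate", "1980-01-01")
#   assert entity.schema.is_a("Person")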
def expand_group(node):
    if node.type.group is None or node.value is None:
        return
    value = str(node.value)
    query = {
        'query': {'term': {node.type.group: value}},
        '_source': {'includes': ['schema', 'properties']}
    }
    for res in scan(es, index=entities_index(), query=query):
        entity_id = res.get('_id')
        source = res.get('_source')
        properties = source.get('properties')
        schema = model.get(source.get('schema'))
        for prop in schema.properties.values():
            if prop.type != node.type:
                continue
            values = properties.get(prop.name)
            values = node.type.normalize_set(values)
            if value not in values:
                continue
            if prop.reverse:
                yield Link(node, prop.reverse, entity_id)
            else:
                yield Link(node, prop, entity_id, inverted=True)
def suggest_property():
    prefix = request.args.get('prefix', '').lower().strip()
    tag_request(prefix=prefix)
    schema = request.args.get('schema', Entity.THING)
    matches = []
    for prop in model.get(schema).properties.values():
        match = not len(prefix)
        match = prefix in prop.name.lower()
        match = match or prefix in prop.label.lower()
        if match:
            matches.append({
                'id': prop.name,
                'quid': prop.name,
                'name': prop.label,
                'r:score': 100,
                'n:type': {
                    'id': '/properties/property',
                    'name': 'Property'
                }
            })
    return jsonify({
        "code": "/api/status/ok",
        "status": "200 OK",
        "prefix": request.args.get('prefix', ''),
        "result": matches
    })
def test_model_basics(self):
    assert model.schemata['Thing'], model.schemata
    thing = model.schemata['Thing']
    assert thing == model.get(thing)
    assert thing in list(model), list(model)
    assert 'Person' in model.to_dict(), model.to_dict()
    assert 'Thing' in model.to_dict(), model.to_dict()
def ingest(self, file_path, entity):
    entity.schema = model.get('Table')
    try:
        table = Table(file_path.as_posix()).open()
        self.emit_row_dicts(entity, self.generate_rows(table))
    except DbfError as err:
        raise ProcessingException('Cannot open DBF file: %s' % err) from err  # noqa
def iter_proxies(**kw):
    includes = ['schema', 'properties']
    for data in iter_entities(includes=includes, **kw):
        schema = model.get(data.get('schema'))
        if schema is None:
            continue
        yield model.get_proxy(data)
def parse_reference(context, reference, rows):
    entity = context.make("LegalEntity")
    entity.id = context.make_slug(reference)
    # entity.add("sourceUrl", context.dataset.url)
    sanction = h.make_sanction(context, entity)
    for row in rows:
        if row.pop("type") == "Individual":
            entity.schema = model.get("Person")
        name = row.pop("name_of_individual_or_entity", None)
        if row.pop("name_type") == "aka":
            entity.add("alias", name)
        else:
            entity.add("name", name)
        address = h.make_address(context, full=row.pop("address"))
        h.apply_address(context, entity, address)
        sanction.add("program", row.pop("committees"))
        citizen = multi_split(row.pop("citizenship"), ["a)", "b)", "c)", "d)"])
        entity.add("nationality", citizen, quiet=True)
        dates = clean_date(row.pop("date_of_birth"))
        entity.add("birthDate", dates, quiet=True)
        entity.add("birthPlace", row.pop("place_of_birth"), quiet=True)
        entity.add("notes", row.pop("additional_information"))
        entity.add("notes", row.pop("listing_information"), quiet=True)
        control_date = row.pop("control_date")
        sanction.add("modifiedAt", control_date)
        entity.add("modifiedAt", control_date)
        entity.context["updated_at"] = control_date.isoformat()
    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
def suggest_property():
    prefix = request.args.get("prefix", "").lower().strip()
    tag_request(prefix=prefix)
    schema = request.args.get("schema", Entity.THING)
    matches = []
    for prop in model.get(schema).properties.values():
        match = not len(prefix)
        match = prefix in prop.name.lower()
        match = match or prefix in prop.label.lower()
        if match:
            matches.append({
                "id": prop.name,
                "quid": prop.name,
                "name": prop.label,
                "r:score": 100,
                "n:type": {
                    "id": "/properties/property",
                    "name": "Property"
                },
            })
    return jsonify({
        "code": "/api/status/ok",
        "status": "200 OK",
        "prefix": request.args.get("prefix", ""),
        "result": matches,
    })
def ingest(self, file_path, entity):
    entity.schema = model.get("Workbook")
    self.ooxml_extract_metadata(file_path, entity)
    try:
        book = load_workbook(file_path, read_only=True)
    except Exception as err:
        raise ProcessingException("Invalid Excel file: %s" % err) from err
    try:
        for name in book.sheetnames:
            sheet = book[name]
            if not hasattr(sheet, "rows"):
                log.warning("Cannot parse chart sheet: %s", name)
                continue
            table = self.manager.make_entity("Table", parent=entity)
            table.make_id(entity.id, name)
            table.set("title", name)
            log.debug("Sheet: %s", name)
            self.emit_row_tuples(table, self.generate_rows(sheet))
            if table.has("csvHash"):
                self.manager.emit_entity(table)
    except Exception as err:
        raise ProcessingException("Cannot read Excel file: %s" % err) from err
    finally:
        book.close()
def entity_tags(entity, authz=None):
    """Do a search on tags of an entity."""
    proxy = model.get_proxy(entity)
    Thing = model.get(Entity.THING)
    types = [registry.name, registry.email, registry.identifier,
             registry.iban, registry.phone, registry.address]
    facets = []
    # Go through all the tags which apply to this entity, and find how
    # often they've been mentioned in other entities.
    for type_ in types:
        if type_.group is None:
            continue
        for fidx, value in enumerate(proxy.get_type_values(type_)):
            if type_.specificity(value) < 0.1:
                continue
            schemata = model.get_type_schemata(type_)
            schemata = [s for s in schemata if s.is_a(Thing)]
            index = entities_read_index(schemata)
            alias = '%s_%s' % (type_.name, fidx)
            facets.append((index, alias, type_.group, type_.group, value))
    res = _filters_faceted_query(facets, authz=authz)
    for (_, alias, field, _, value) in facets:
        total = res.get(alias, 0)
        if total > 1:
            yield (field, value, total)
def configure_schema(schema, version):
    # Generate relevant type mappings for entity properties so that
    # we can do correct searches on each.
    schema_mapping = {}
    numeric_mapping = {registry.date.group: NUMERIC}
    for prop in schema.properties.values():
        config = dict(TYPE_MAPPINGS.get(prop.type, KEYWORD))
        config["copy_to"] = ["text"]
        schema_mapping[prop.name] = config
        if prop.type in NUMERIC_TYPES:
            numeric_mapping[prop.name] = NUMERIC
    mapping = {
        "date_detection": False,
        "dynamic": False,
        "_source": {"excludes": ["text", "fingerprints"]},
        "properties": {
            "caption": KEYWORD,
            "schema": KEYWORD,
            "schemata": KEYWORD,
            registry.entity.group: KEYWORD,
            registry.language.group: KEYWORD,
            registry.country.group: KEYWORD,
            registry.checksum.group: KEYWORD,
            registry.ip.group: KEYWORD,
            registry.url.group: KEYWORD,
            registry.iban.group: KEYWORD,
            registry.email.group: KEYWORD,
            registry.phone.group: KEYWORD,
            registry.mimetype.group: KEYWORD,
            registry.identifier.group: KEYWORD,
            registry.date.group: PARTIAL_DATE,
            registry.address.group: KEYWORD,
            registry.name.group: KEYWORD,
            "fingerprints": {
                "type": "keyword",
                "normalizer": "latin_index",
                "copy_to": "text",
                "fields": {"text": LATIN_TEXT},
            },
            "text": {
                "type": "text",
                "analyzer": "latin_index",
                "search_analyzer": "latin_query",
                "search_quote_analyzer": "latin_index",
                "term_vector": "with_positions_offsets",
            },
            "properties": {"type": "object", "properties": schema_mapping},
            "numeric": {"type": "object", "properties": numeric_mapping},
            "role_id": KEYWORD,
            "collection_id": KEYWORD,
            "origin": KEYWORD,
            "created_at": {"type": "date"},
            "updated_at": {"type": "date"},
        },
    }
    index = schema_index(model.get(schema), version)
    settings = index_settings(shards=get_shard_weight(schema))
    return configure_index(index, mapping, settings)
def parse_nested(edge):
    schema = model.get(edge["schema"])
    for items in edge["properties"].values():
        for item in items:
            if isinstance(item, dict):
                log.debug(f"Found nested item: {item['id']}")
                yield parse_entity(item)
    yield parse_entity(edge)
def ingest(self, file_path, entity):
    """Ingestor implementation."""
    entity.schema = model.get("Pages")
    pdf_path = self.make_work_file("page.pdf")
    self.exec_command("ddjvu",
                      "-format=pdf",
                      "-quality=100",
                      "-skip",
                      file_path,
                      pdf_path)
    self.assert_outfile(pdf_path)
    self.pdf_alternative_extract(entity, pdf_path)
def get_schemata(dataset: Dataset) -> List[Schema]:
    schemata: List[Schema] = list()
    names = Statement.all_schemata(dataset=dataset)
    for name in names:
        schema = model.get(name)
        if schema is not None:
            schemata.append(schema)
    return schemata
def ingest(self, file_path, entity):
    entity.schema = model.get("Pages")
    pdf_path = self.make_work_file("tiff.pdf")
    self.exec_command(
        "tiff2pdf", file_path, "-x", "300", "-y", "300", "-o", pdf_path
    )
    self.assert_outfile(pdf_path)
    self.pdf_alternative_extract(entity, pdf_path)
def ingest(self, file_path, entity):
    """Ingestor implementation."""
    entity.schema = model.get('Pages')
    pdf_path = self.make_work_file('page.pdf')
    self.exec_command('ddjvu',
                      '-format=pdf',
                      '-quality=100',
                      '-skip',
                      file_path,
                      pdf_path)
    self.assert_outfile(pdf_path)
    self.pdf_alternative_extract(entity, pdf_path)
def _serialize(self, obj):
    pk = obj.get('id')
    obj['id'] = str(pk)
    authz = request.authz
    collection_id = obj.pop('collection_id', None)
    obj['collection'] = self.resolve(Collection, collection_id,
                                     CollectionSerializer)
    schema = model.get(obj.get('schema'))
    if schema is None:
        return None
    obj['schemata'] = schema.names
    properties = obj.get('properties', {})
    for prop in schema.properties.values():
        if prop.type != registry.entity:
            continue
        values = ensure_list(properties.get(prop.name))
        properties[prop.name] = []
        for value in values:
            entity = self.resolve(Entity, value, EntitySerializer)
            properties[prop.name].append(entity)
    links = {
        'self': url_for('entities_api.view', entity_id=pk),
        'references': url_for('entities_api.references', entity_id=pk),
        'tags': url_for('entities_api.tags', entity_id=pk),
        'ui': entity_url(pk)
    }
    if schema.is_a(Document.SCHEMA):
        links['content'] = url_for('entities_api.content', entity_id=pk)
        file_name = first(properties.get('fileName'))
        content_hash = first(properties.get('contentHash'))
        if content_hash:
            mime_type = first(properties.get('mimeType'))
            name = safe_filename(file_name, default=pk)
            links['file'] = archive_url(request.authz.id, content_hash,
                                        file_name=name,
                                        mime_type=mime_type)
        pdf_hash = first(properties.get('pdfHash'))
        if pdf_hash:
            name = safe_filename(file_name, default=pk, extension='.pdf')
            links['pdf'] = archive_url(request.authz.id, pdf_hash,
                                       file_name=name,
                                       mime_type=PDF)
        csv_hash = first(properties.get('csvHash'))
        if csv_hash:
            name = safe_filename(file_name, default=pk, extension='.csv')
            links['csv'] = archive_url(request.authz.id, csv_hash,
                                       file_name=name,
                                       mime_type=CSV)
    obj['links'] = links
    obj['writeable'] = authz.can(collection_id, authz.WRITE)
    obj.pop('_index', None)
    return self._clean_response(obj)
def ingest(self, file_path, entity):
    entity.schema = model.get('PlainText')
    text = self.read_file_decoded(entity, file_path)
    entity.set('bodyText', text)
    try:
        for card in vobject.readComponents(text):
            self.ingest_card(entity, card)
    except vobject.base.ParseError as err:
        raise ProcessingException('Cannot parse vcard: %s' % err) from err
def schema_scope(schema, expand=True):
    schemata = set()
    names = ensure_list(schema) or model.schemata.values()
    for schema in names:
        schema = model.get(schema)
        if schema is not None:
            schemata.add(schema)
            if expand:
                schemata.update(schema.descendants)
    for schema in schemata:
        if not schema.abstract:
            yield schema
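# Usage sketch (not part of the original module): with followthemoney's model
# available, expanding a single schema name yields that schema plus its
# concrete descendants, while abstract schemata are filtered out.
def _example_schema_scope():
    names = {schema.name for schema in schema_scope("LegalEntity")}
    # includes LegalEntity itself and descendants such as Person and Company
    assert "LegalEntity" in names and "Person" in names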
def ancestors(self):
    if self.parent_id is None:
        return []
    key = cache.key('ancestors', self.id)
    ancestors = cache.get_list(key)
    if len(ancestors):
        return ancestors
    parent_key = cache.key('ancestors', self.parent_id)
    ancestors = cache.get_list(parent_key)
    if not len(ancestors):
        ancestors = []
        parent = Document.by_id(self.parent_id)
        if parent is not None:
            ancestors = parent.ancestors
    ancestors.append(self.parent_id)
    if self.model.is_a(model.get(self.SCHEMA_FOLDER)):
        cache.set_list(key, ancestors, expire=cache.EXPIRE)
    return ancestors
def entity_references(entity, authz):
    """Given a particular entity, find all the references to it from other
    entities, grouped by the property where they are used."""
    schema = model.get(entity.get('schema'))
    group = registry.entity.group
    facets = []
    for prop in model.properties:
        if prop.type != registry.entity:
            continue
        if not schema.is_a(prop.range):
            continue
        index = entities_read_index(prop.schema)
        field = 'properties.%s' % prop.name
        value = entity.get('id')
        facets.append((index, prop.qname, group, field, value))
    res = _filters_faceted_query(authz, facets)
    for (qname, total) in res.items():
        if total > 0:
            yield (model.get_qname(qname), total)
def reconcile_index(collection=None):
    domain = settings.APP_UI_URL.strip('/')
    label = settings.APP_TITLE
    suggest_query = []
    schemata = list(model)
    if collection is not None:
        label = '%s (%s)' % (collection.get('label'), label)
        suggest_query.append(('filter:collection_id', collection.get('id')))
        schemata = [model.get(s) for s in collection.get('schemata').keys()]
    return jsonify({
        'name': label,
        'identifierSpace': 'http://rdf.freebase.com/ns/type.object.id',
        'schemaSpace': 'http://rdf.freebase.com/ns/type.object.id',
        'view': {'url': entity_url('{{id}}')},
        'preview': {
            'url': entity_url('{{id}}'),
            'width': 800,
            'height': 400
        },
        'suggest': {
            'entity': {
                'service_url': domain,
                'service_path': url_for('reconcile_api.suggest_entity',
                                        _query=suggest_query,
                                        _authorize=True,
                                        _relative=True)
            },
            'type': {
                'service_url': domain,
                'service_path': url_for('reconcile_api.suggest_type',
                                        _relative=True)
            },
            'property': {
                'service_url': domain,
                'service_path': url_for('reconcile_api.suggest_property',
                                        _relative=True)
            }
        },
        'defaultTypes': [get_freebase_type(s) for s in schemata if s.matchable]
    })
def model(self):
    return model.get(self.schema)
def configure_schema(schema, version):
    # Generate relevant type mappings for entity properties so that
    # we can do correct searches on each.
    schema_mapping = {}
    for prop in schema.properties.values():
        config = dict(TYPE_MAPPINGS.get(prop.type, KEYWORD))
        config['copy_to'] = ['text']
        schema_mapping[prop.name] = config
    mapping = {
        "date_detection": False,
        "dynamic": False,
        "_source": {
            "excludes": ["text", "fingerprints"]
        },
        "properties": {
            "name": {
                "type": "text",
                "analyzer": "icu_latin",
                "fields": {"kw": KEYWORD},
                "boost": 3.0,
                "copy_to": "text"
            },
            "schema": KEYWORD,
            "schemata": KEYWORD,
            "bulk": {"type": "boolean"},
            "status": KEYWORD,
            "error_message": {
                "type": "text",
                "copy_to": "text",
                "index": False
            },
            "foreign_id": KEYWORD,
            "document_id": KEYWORD,
            "collection_id": KEYWORD,
            "uploader_id": KEYWORD,
            "fingerprints": {
                "type": "keyword",
                "normalizer": "icu_latin",
                "copy_to": "text",
                "fields": {"text": LATIN_TEXT}
            },
            "entities": KEYWORD,
            "languages": KEYWORD,
            "countries": KEYWORD,
            "checksums": KEYWORD,
            "keywords": KEYWORD,
            "ips": KEYWORD,
            "urls": KEYWORD,
            "ibans": KEYWORD,
            "emails": KEYWORD,
            "phones": KEYWORD,
            "mimetypes": KEYWORD,
            "identifiers": KEYWORD,
            "addresses": {
                "type": "keyword",
                "fields": {"text": LATIN_TEXT}
            },
            "dates": PARTIAL_DATE,
            "names": {
                "type": "keyword",
                "fields": {"text": LATIN_TEXT},
                "copy_to": "text"
            },
            "created_at": {"type": "date"},
            "updated_at": {"type": "date"},
            "text": {
                "type": "text",
                "analyzer": "icu_latin",
                "term_vector": "with_positions_offsets",
                "store": True
            },
            "properties": {
                "type": "object",
                "properties": schema_mapping
            }
        }
    }
    index = schema_index(model.get(schema), version)
    return configure_index(
        index, mapping,
        index_settings(shards=get_shard_weight(schema))
    )
def entities_write_index(schema):
    """Index that is currently written by new queries."""
    schema = model.get(schema)
    return schema_index(schema, settings.INDEX_WRITE)
def update(self, result, key):
    try:
        result['label'] = model.get(key).plural
    except AttributeError:
        result['label'] = key