def convert_party(party):
    entity = model.make_entity('LegalEntity')
    party_id = party.pop('id', None)
    identifier = party.pop('identifier', {})
    if party_id is None:
        party_id = identifier.get('id')
    entity.make_id(party_id)
    convert_name(entity, party)
    convert_address(entity, party.pop('address', {}))
    convert_address(entity, party.pop('deliveryAddress', {}))
    entity.add('legalForm', party.pop('organizationType', None))
    contact = party.pop('contactPoint', {})
    entity.add('website', contact.pop('url', None))
    entity.add('phone', contact.pop('telephone', None))
    entity.add('email', contact.pop('email', None))
    convert_identifier(entity, identifier)
    for identifier in party.pop('additionalIdentifiers', []):
        convert_identifier(entity, identifier)
    yield entity
    for member in ensure_list(party.pop('memberOf', [])):
        for other in convert_party(member):
            other.schema = model.get('Organization')
            yield other
            mem = model.make_entity('Membership')
            mem.make_id(entity.id, other.id)
            mem.add('member', entity)
            mem.add('organization', other)
            yield mem
    party.pop('roles', None)

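# Usage sketch for convert_party, with an invented party record shaped like
# the OCDS-style input whose keys are popped above; convert_name,
# convert_address and convert_identifier are the same helpers it calls.
sample_party = {
    'id': 'GB-COH-123456',
    'name': 'Flubber Holdings Ltd.',
    'identifier': {'id': 'GB-COH-123456', 'scheme': 'GB-COH'},
    'contactPoint': {'url': 'https://example.com', 'email': 'info@example.com'},
}
for proxy in convert_party(sample_party):
    print(proxy.schema.name, proxy.id)
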
def make_relation(root_path, documents, api, cid, entity):
    relation = entity.pop('relation')
    if relation not in RELATIONS:
        return
    schema, subject_prop, object_prop, prop = RELATIONS.get(relation)
    proxy = model.make_entity(schema)
    proxy.make_id(entity.pop('edge'))
    if prop is not None:
        proxy.add(prop, relation)
    subject = model.make_entity('Thing')
    subject.make_id(entity.pop('subject'))
    proxy.add(subject_prop, subject.id)
    object_ = model.make_entity('Thing')
    object_.make_id(entity.pop('object'))
    proxy.add(object_prop, object_.id)
    for section, values in entity.items():
        for value in values:
            if section == 'Source':
                # TODO: no 'proof' on Intervals.
                upload_document(root_path, documents, api, cid, value)
                continue
            prop = PROPERTIES.get(section)
            if schema == 'Associate' and prop == 'role':
                prop = 'relationship'
            proxy.add(prop, value)
    print(repr(proxy))
    yield proxy

def test_make_id(self):
    proxy = model.make_entity("Thing")
    assert not proxy.make_id(None)
    assert proxy.make_id("banana")
    assert proxy.make_id("banana") == proxy.make_id("banana")
    ff = proxy.make_id(44)
    assert ff is not None
    proxy = model.make_entity("Thing", key_prefix="foo")
    assert proxy.make_id(44)
    assert proxy.make_id(44) != ff, proxy.make_id(44)

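# Sketch of the invariant the test above exercises: make_id hashes its
# arguments (plus any key_prefix) into a deterministic ID, so equal inputs
# produce equal IDs even across separate proxies.
a = model.make_entity("Thing")
b = model.make_entity("Thing")
assert a.make_id("banana") == b.make_id("banana")
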
def write_edges(writer, db):
    for i, edge in enumerate(db["edges"], 1):
        source_id = edge.pop("start_id", None)
        source_id = edge.pop("node_1", source_id)
        source = make_node_entity(source_id)
        target_id = edge.pop("end_id", None)
        target_id = edge.pop("node_2", target_id)
        target = make_node_entity(target_id)
        type_ = edge.pop("rel_type", None)
        type_ = edge.pop("type", type_)
        link = edge.pop("link", None)
        if type_ in IGNORE_EDGE_TYPES:
            continue
        if type_ in SAME_AS:
            source.add("sameAs", target)
            writer.put(source, fragment=target.id)
            target.add("sameAs", source)
            writer.put(target, fragment=source.id)
            continue
        schema = edge_schema(type_, link)
        # print(type_, link, schema)
        proxy = model.make_entity(schema)
        proxy.make_id(source_id, target_id, type_, link)
        proxy.add("startDate", parse_date(edge.pop("start_date", None)))
        proxy.add("endDate", parse_date(edge.pop("end_date", None)))
        proxy.add("summary", edge.pop("valid_until", None))
        proxy.add(proxy.schema.source_prop, source)
        proxy.add(proxy.schema.target_prop, target)
        proxy.add("role", link)
        if link is None:
            proxy.add("role", type_)
        writer.put(proxy)
        if i % 10000 == 0:
            print("edges: %s" % i)

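# Sketch of the `db` mapping write_edges consumes, inferred from the keys
# popped above; node IDs, types and links are invented placeholders.
db = {
    "edges": [
        {"node_1": "n-1", "node_2": "n-2", "type": "OWNS", "link": "owner",
         "start_date": "2001-01-01"},
    ]
}
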
def __init__(self, dataset, entity, context):
    self.dataset = dataset
    self.ns = Namespace(context.get("namespace", dataset.name))
    self.entity = model.make_entity(entity.schema)
    self.entity.id = entity.id
    self.aggregator_entities = TagAggregatorFasttext()
    self.aggregator_patterns = TagAggregator()

def test_de_number(self):
    phones = registry.phone
    proxy = model.make_entity("Person")
    proxy.add("country", "DE")
    self.assertEqual(phones.clean("017623423980"), None)
    self.assertEqual(phones.clean("017623423980", proxy=proxy), "+4917623423980")

def test_rdf(self):
    proxy = EntityProxy.from_dict(model, ENTITY)
    triples = list(proxy.triples())
    assert 10 == len(triples), len(triples)
    proxy = model.make_entity("Person")
    assert 0 == len(list(proxy.triples()))

def company_entity(self, data, entity=None):
    if "company" in data:
        data = ensure_dict(data.get("company", data))
    if entity is None:
        entity = model.make_entity("Company")
        entity.make_id(data.get("opencorporates_url"))
    entity.add("name", data.get("name"))
    address = ensure_dict(data.get("registered_address"))
    entity.add("country", address.get("country"))
    entity.add("jurisdiction", data.get("jurisdiction_code"))
    entity.add("alias", data.get("alternative_names"))
    entity.add("address", data.get("registered_address_in_full"))
    entity.add("sourceUrl", data.get("registry_url"))
    entity.add("legalForm", data.get("company_type"))
    entity.add("incorporationDate", data.get("incorporation_date"))
    entity.add("dissolutionDate", data.get("dissolution_date"))
    entity.add("status", data.get("current_status"))
    entity.add("registrationNumber", data.get("company_number"))
    entity.add("opencorporatesUrl", data.get("opencorporates_url"))
    source = data.get("source", {})
    entity.add("publisher", source.get("publisher"))
    entity.add("publisherUrl", source.get("url"))
    entity.add("retrievedAt", source.get("retrieved_at"))
    for code in ensure_list(data.get("industry_codes")):
        code = code.get("industry_code", code)
        entity.add("sector", code.get("description"))
    for previous in ensure_list(data.get("previous_names")):
        entity.add("previousName", previous.get("company_name"))
    for alias in ensure_list(data.get("alternative_names")):
        entity.add("alias", alias.get("company_name"))
    return entity

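# Hypothetical OpenCorporates-style record showing the nesting that
# company_entity unwraps; the keys mirror what it reads above, the values
# are invented.
record = {
    "company": {
        "name": "Flubber Holdings Ltd.",
        "company_number": "123456",
        "jurisdiction_code": "gb",
        "opencorporates_url": "https://opencorporates.com/companies/gb/123456",
    }
}
# company = crawler.company_entity(record)  # `crawler`: an instance of this class
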
def convert_party(party):
    entity = model.make_entity('LegalEntity')
    entity.make_id(party.pop('id', None))
    entity.add('name', party.pop('name', None))
    address = party.pop('address', {})
    entity.add('country', address.pop('countryName', None))
    address_text = make_address(address.pop('streetAddress', None),
                                address.pop('postalCode', None),
                                address.pop('region', None))
    entity.add('address', address_text)
    if len(address):
        log.info("Unknown address part: %r", address.keys())
    contact = party.pop('contactPoint', {})
    entity.add('website', contact.pop('url', None))
    entity.add('phone', contact.pop('telephone', None))
    entity.add('email', contact.pop('email', None))
    for identifier in party.pop('additionalIdentifiers', []):
        scheme = identifier.pop('scheme', None)
        prop = IDENTIFIERS.get(scheme, None)
        if prop is None:
            log.info("Unknown identifier scheme: %s", scheme)
            continue
        entity.add(prop, identifier.pop('id', None))
    # pprint(party)
    return entity

def parse_calls(self, call):
    entity = model.make_entity('Call')
    entity.make_id(self.project_id, call.get('id'))
    for timestamp in self._field_values(call, 'TimeStamp'):
        entity.add('date', self.parse_timestamp(timestamp))
    for duration in self._field_values(call, 'Duration'):
        entity.add('duration', self.get_seconds(duration))
    call_types = self._field_values(call, 'Type')
    if OUTGOING in call_types:
        entity.add('caller', self.owner)
        entity.add('callerNumber', self.owner.get('phone'))
    else:
        entity.add('receiver', self.owner)
        entity.add('receiverNumber', self.owner.get('phone'))
    for party in self.parse_parties(self._models(call, 'Party')):
        if OUTGOING in call_types:
            entity.add('receiver', party)
            entity.add('receiverNumber', party.get('phone'))
        else:
            entity.add('caller', party)
            entity.add('callerNumber', party.get('phone'))
        yield party
    yield entity

def update_entity(collection, entity_id=None):
    """Update xref and aggregator after an entity has been edited."""
    from aleph.logic.xref import xref_entity
    from aleph.logic.profiles import profile_fragments

    log.info("[%s] Update entity: %s", collection, entity_id)
    entity = index.get_entity(entity_id)
    proxy = model.get_proxy(entity)
    if collection.casefile:
        xref_entity(collection, proxy)

    aggregator = get_aggregator(collection, origin=MODEL_ORIGIN)
    profile_fragments(collection, aggregator, entity_id=entity_id)

    # Inline name properties from adjacent entities. See the
    # docstring on `inline_names` for a more detailed discussion.
    prop = proxy.schema.get("namesMentioned")
    if prop is not None:
        entity_ids = proxy.get_type_values(registry.entity)
        names = set()
        for related in index.entities_by_ids(entity_ids):
            related = model.get_proxy(related)
            names.update(related.get_type_values(registry.name))
        if len(names) > 0:
            name_proxy = model.make_entity(proxy.schema)
            name_proxy.id = proxy.id
            name_proxy.add(prop, names)
            aggregator.put(name_proxy, fragment="names")

    index_aggregator(collection, aggregator, entity_ids=[entity_id])
    refresh_entity(collection, proxy.id)

def transform(input):
    prev = None
    entity = None
    for (s, p, o) in parse_triples(input):
        if s != prev and entity is not None:
            # print(entity.to_dict())
            pass
        if s != prev:
            prev = s
            if s.startswith(SPECIAL):
                qid = s[len(SPECIAL):]
            else:
                qid = s[len(ENTITY):]
            entity = model.make_entity("Thing")
            entity.make_id(qid)
            entity.add("wikidataId", qid)
            if s.startswith(ENTITY):
                entity.add("sourceUrl", str(s))
        if p in [SKOS.prefLabel, RDFS.label, SCHEMA.name]:
            entity.add("name", str(o))
            continue
        print(s, p, o)
        if p == PROP.P31:
            # print((p, o))
            pass

def add_stub(self, proxy_id, schema="Thing"):
    stub = model.make_entity(schema)
    stub.id = proxy_id
    if proxy_id in self:
        return self.get_node_by_proxy(stub)
    node, _ = self.add_proxy(stub)
    self._stub_proxies.add(stub.id)
    return node

def test_language_tagging(self):
    text = "C'est le caniche d'Emmanuel Macron. " * 2
    entity = model.make_entity('PlainText')
    entity.add('bodyText', text)
    analyze_entity(entity)
    names = entity.get_type_values(registry.name)
    assert "d'Emmanuel Macron" in names, names
    assert entity.get('detectedLanguage') == ['fra'], entity.get('detectedLanguage')  # noqa

def to_proxy(self):
    if self.text is not None:
        proxy = model.make_entity(self.SCHEMA_PAGE)
        proxy.make_id('record', self.id)
        proxy.set('document', self.document_id)
        proxy.set('index', self.index)
        proxy.set('bodyText', stringify(self.text))
        return proxy
    else:
        proxy = model.make_entity(self.SCHEMA_ROW)
        proxy.make_id('record', self.id)
        proxy.set('table', self.document_id)
        proxy.set('index', self.index)
        if self.data is not None:
            values = [v for (k, v) in sorted(self.data.items())]
            proxy.set('cells', registry.json.pack(values))
        return proxy

def test_ner_extract(self):
    text = 'Das ist der Pudel von Angela Merkel. '
    text = text * 5
    entity = model.make_entity('PlainText')
    entity.add('bodyText', text)
    analyze_entity(entity)
    names = entity.get_type_values(registry.name)
    assert 'Angela Merkel' in names, names

def create_link(sources, targets):
    proxy = model.make_entity("UnknownLink")
    proxy.make_id(random.sample(string.ascii_letters, 8))
    for s in sources:
        proxy.add("subject", s)
    for t in targets:
        proxy.add("object", t)
    return proxy

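# Usage sketch for create_link; the two endpoint proxies are stand-ins
# built here only for illustration.
person = model.make_entity("Person")
person.make_id("sketch", "person")
company = model.make_entity("Company")
company.make_id("sketch", "company")
link = create_link([person], [company])
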
def get_profile(entityset_id, authz=None):
    """A profile is an entityset having a party. The idea is to cache
    profile metadata for the API, and to generate a merged view of all
    the entities the current user has access to."""
    if entityset_id is None:
        return
    key = cache.object_key(EntitySet, entityset_id)
    data = cache.get_complex(key)
    stub = Stub()
    if data is None:
        entityset = get_entityset(entityset_id)
        if entityset is None:
            return
        data = entityset.to_dict()
        data["items"] = []
        for item in entityset.items():
            data["items"].append(item.to_dict())
        cache.set_complex(key, data, expires=cache.EXPIRE)

    # Filter the subset of items the current user can access
    if authz is not None:
        items = [
            i for i in data["items"]
            if authz.can(i["collection_id"], authz.READ)
        ]
        data["items"] = items

    # Load the constituent entities for the profile and generate a
    # combined proxy with all of the given properties.
    for item in data["items"]:
        if Judgement(item["judgement"]) == Judgement.POSITIVE:
            resolver.queue(stub, Entity, item.get("entity_id"))
    resolver.resolve(stub)

    merged = None
    data["proxies"] = []
    for item in data["items"]:
        item["entity"] = resolver.get(stub, Entity, item.get("entity_id"))
        if item["entity"] is not None:
            proxy = model.get_proxy(item["entity"])
            proxy.context = {}
            data["proxies"].append(proxy)
            if merged is None:
                merged = proxy.clone()
                merged.context["entities"] = [proxy.id]
            else:
                merged.merge(proxy)
                merged.context["entities"].append(proxy.id)

    if merged is None:
        merged = model.make_entity(Entity.LEGAL_ENTITY)

    # Polish it a bit:
    merged.id = data.get("id")
    merged = name_entity(merged)
    data["merged"] = merged
    data["label"] = merged.caption
    data["shallow"] = False
    return data

def test_base_functions(self):
    data = dict(ENTITY)
    data["properties"]["banana"] = ["foo"]
    proxy = EntityProxy.from_dict(model, data)
    assert "test" in repr(proxy), repr(proxy)
    assert hash(proxy) == hash(proxy.id)
    assert proxy.get("name") == ["Ralph Tester"]
    assert proxy.first("name") == "Ralph Tester"
    prop = model.get_qname("Thing:name")
    assert proxy.get(prop) == ["Ralph Tester"]
    assert proxy.caption == "Ralph Tester"
    assert str(proxy) == "Ralph Tester"

    name = "Ralph the Great"
    proxy.add("name", name)
    assert len(proxy.get("name")) == 2
    proxy.add("name", None)
    assert len(proxy.get("name")) == 2
    proxy.add("name", "")
    assert len(proxy.get("name")) == 2
    proxy.add("name", [""])
    assert len(proxy.get("name")) == 2
    proxy.add("name", {"name": "banana"})
    assert len(proxy.get("name")) == 3, proxy.get("name")
    assert name in proxy.get("name")
    assert name in proxy.names, proxy.names

    with raises(InvalidData):
        proxy.add("banana", "yellow")
    proxy.add("banana", "yellow", quiet=True)
    mem = model.make_entity("Membership")
    mem.id = "foo"
    with raises(InvalidData):
        proxy.add("directorshipDirector", mem)
    with raises(InvalidData):
        proxy.add("sameAs", proxy)
    with raises(InvalidData):
        proxy.get("banana")
    assert [] == proxy.get("banana", quiet=True)
    with raises(InvalidData):
        proxy.first("banana")
    assert proxy.first("banana", quiet=True) is None
    assert len(proxy.get("nationality")) == 0

    double = model.get_proxy(proxy)
    assert double == proxy
    proxy.add("banana", name, quiet=True)
    with raises(InvalidData):
        proxy.add("banana", name)
    with raises(InvalidData):
        EntityProxy.from_dict(model, {})

def parse_notes(self, note):
    entity = model.make_entity('PlainText')
    entity.make_id(self.project_id, note.get('id'))
    entity.add('title', self._field_values(note, 'Title'))
    entity.add('summary', self._field_values(note, 'Summary'))
    entity.add('bodyText', self._field_values(note, 'Body'))
    for timestamp in self._field_values(note, 'Creation'):
        entity.add('date', self.parse_timestamp(timestamp))
    yield entity

def _iter_mentions(collection):
    """Combine mentions into pseudo-entities used for xref."""
    proxy = model.make_entity(Entity.LEGAL_ENTITY)
    for mention in iter_proxies(
        collection_id=collection.id,
        schemata=["Mention"],
        sort={"properties.resolved": "desc"},
    ):
        if mention.first("resolved") != proxy.id:
            if proxy.id is not None:
                yield proxy
            proxy = model.make_entity(Entity.LEGAL_ENTITY)
            proxy.id = mention.first("resolved")
        _merge_schemata(proxy, mention.get("detectedSchema"))
        proxy.add("name", mention.get("name"))
        proxy.add("country", mention.get("contextCountry"))
    if proxy.id is not None:
        yield proxy

def test_pattern_extract(self):
    text = "Mr. Flubby Flubber called the number tel:+919988111222 twice"
    entity = model.make_entity('PlainText')
    entity.add('bodyText', text)
    analyze_entity(entity)
    phones = entity.get_type_values(registry.phone)
    assert '+919988111222' in phones
    countries = entity.get_type_values(registry.country)
    assert 'in' in countries

def test_ner_extract(self):
    text = "Das ist der Pudel von Angela Merkel. "
    text = text * 5
    entity = model.make_entity("PlainText")
    entity.id = "test1"
    entity.add("bodyText", text)
    entity = self._tagged_entity(entity)
    names = entity.get_type_values(registry.name)
    assert "Angela Merkel" in names, names

def test_base_functions(self):
    proxy = EntityProxy.from_dict(model, ENTITY)
    assert 'test' in repr(proxy), repr(proxy)
    assert hash(proxy) == hash(proxy.id)
    assert proxy.get('name') == ['Ralph Tester']
    assert proxy.first('name') == 'Ralph Tester'
    prop = model.get_qname('Thing:name')
    assert proxy.get(prop) == ['Ralph Tester']
    assert proxy.caption == 'Ralph Tester'
    assert str(proxy) == 'Ralph Tester'

    name = 'Ralph the Great'
    proxy.add('name', name)
    assert len(proxy.get('name')) == 2
    proxy.add('name', None)
    assert len(proxy.get('name')) == 2
    proxy.add('name', '')
    assert len(proxy.get('name')) == 2
    proxy.add('name', [''])
    assert len(proxy.get('name')) == 2
    proxy.add('name', {'name': 'banana'}, cleaned=True)
    assert len(proxy.get('name')) == 2
    assert name in proxy.get('name')
    assert name in proxy.names, proxy.names

    with assert_raises(InvalidData):
        proxy.add('banana', 'yellow')
    proxy.add('banana', 'yellow', quiet=True)
    mem = model.make_entity('Membership')
    mem.id = 'foo'
    with assert_raises(InvalidData):
        proxy.add('directorshipDirector', mem)
    with assert_raises(InvalidData):
        proxy.add('sameAs', proxy)
    with assert_raises(InvalidData):
        proxy.get('banana')
    assert [] == proxy.get('banana', quiet=True)
    with assert_raises(InvalidData):
        proxy.first('banana')
    assert proxy.first('banana', quiet=True) is None
    assert len(proxy.get('nationality')) == 0

    double = EntityProxy.from_dict(model, proxy)
    assert double == proxy
    proxy.add('banana', name, quiet=True)
    with assert_raises(InvalidData):
        proxy.add('banana', name)
    with assert_raises(InvalidData):
        EntityProxy.from_dict(model, {})

def validate(infile, outfile):
    try:
        for entity in read_entities(infile, cleaned=False):
            clean = model.make_entity(entity.schema)
            clean.id = entity.id
            for (prop, value) in entity.itervalues():
                clean.add(prop, value)
            write_object(outfile, clean)
    except BrokenPipeError:
        raise click.Abort()

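# If wired up as a click command (the click.Abort suggests it is), this
# runs as a stream filter, e.g.:
#   cat entities.ijson | ftm validate > validated.ijson
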
def test_rdf(self):
    proxy = EntityProxy.from_dict(model, ENTITY)
    statements = list(proxy.statements)
    assert 8 == len(statements), len(statements)
    triples = list(proxy.triples)
    count = len(statements) + 2
    assert count == len(triples), len(triples)
    proxy = model.make_entity('Person')
    assert 0 == len(list(proxy.triples))

def test_pattern_extract(self):
    text = "Mr. Flubby Flubber called the number tel:+919988111222 twice"
    entity = model.make_entity("PlainText")
    entity.id = "test3"
    entity.add("bodyText", text)
    entity = self._tagged_entity(entity)
    phones = entity.get_type_values(registry.phone)
    assert "+919988111222" in phones
    countries = entity.get_type_values(registry.country)
    assert "in" in countries

def test_language_tagging(self):
    text = "C'est le caniche d'Emmanuel Macron. " * 2
    entity = model.make_entity("PlainText")
    entity.id = "test2"
    entity.add("bodyText", text)
    entity = self._tagged_entity(entity)
    names = entity.get_type_values(registry.name)
    assert "Emmanuel Macron" in names, names
    assert entity.get("detectedLanguage") == ["fra"], entity.get("detectedLanguage")  # noqa

def to_proxy(self):
    if self.text is not None:
        proxy = model.make_entity(self.SCHEMA_PAGE)
        proxy.make_id('record', self.id)
        proxy.set('document', self.document_id)
        proxy.set('index', self.index)
        proxy.set('bodyText', stringify(self.text))
        return proxy
    else:
        proxy = model.make_entity(self.SCHEMA_ROW)
        proxy.make_id('record', self.id)
        proxy.set('table', self.document_id)
        proxy.set('index', self.index)
        if self.data is not None:
            # sort values by columns
            values = [
                self.data.get(k) for k in self.document.meta.get('columns')
            ]
            proxy.set('cells', registry.json.pack(values))
        return proxy

def reconcile_op(query, collection=None):
    """Reconcile operation for a single query."""
    log.info("Reconcile: %r", query)
    args = {'limit': query.get('limit', '5')}
    if collection is not None:
        args['filter:collection_id'] = collection.get('id')
    parser = SearchQueryParser(args, request.authz)
    schema = query.get('type') or Entity.THING
    proxy = model.make_entity(schema)
    proxy.add('name', query.get('query'))
    for p in query.get('properties', []):
        proxy.add(p.get('pid'), p.get('v'), quiet=True)
    query = MatchQuery(parser, entity=proxy)
    matches = list(entity_matches(query.search()))
    return {
        'result': matches,
        'num': len(matches),
    }

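# Example of the OpenRefine-style reconciliation query this operation
# expects; the pid/v pairs map onto entity properties (values invented):
example_query = {
    'query': 'Flubber Holdings',
    'type': 'Company',
    'limit': '5',
    'properties': [{'pid': 'jurisdiction', 'v': 'gb'}],
}
# reconcile_op(example_query) -> {'result': [...], 'num': <match count>}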