def finalize_index(data, schema, texts):
    """Apply final denormalisations to the index."""
    data['schema'] = schema.name
    # Implied schemata, i.e. parents of the actual schema.
    data['schemata'] = schema.names
    properties = data.get('properties', {})
    # Property types that should not contribute to the full-text blob.
    skip_types = ['entity', 'date', 'url', 'uri', 'country']
    for prop_name, prop in schema.properties.items():
        if prop_name not in properties or prop.type_name in skip_types:
            continue
        for value in ensure_list(properties[prop_name]):
            if prop_name == 'name':
                data['name'] = value
            texts.append(value)
    data = schema.invert(data)
    data['text'] = index_form(texts)
    names = data.get('names', [])
    # Fingerprint every name, dropping failed generations.
    fps = {fingerprints.generate(n) for n in names}
    fps.discard(None)
    data['fingerprints'] = list(fps)
    # Add latinised variants of all names.
    names.extend([latinize_text(n) for n in list(names)])
    data['names'] = list(set(names))
    if 'created_at' not in data:
        data['created_at'] = data.get('updated_at')
    return data
def finalize_index(data, schema):
    """Apply final denormalisations to the index."""
    properties = data.get('properties', {})
    # Skip property types whose values are not useful as free text.
    skip_types = ('date', 'url', 'uri', 'country')
    texts = []
    for prop in schema.properties:
        if prop.name in properties and prop.type_name not in skip_types:
            texts.extend(ensure_list(properties[prop.name]))
    data['text'] = index_form(texts)
    data = schema.invert(data)
    index_names(data)
    data['schema'] = schema.name
    # Implied schemata, i.e. parents of the actual schema.
    data['schemata'] = schema.names
    # Second name field for non-tokenised sorting.
    if 'name' in data:
        data['name_sort'] = data['name']
    return data
def generate_records(document):
    """Generate index records, based on document rows or pages."""
    query = db.session.query(DocumentRecord) \
        .filter(DocumentRecord.document_id == document.id)
    # Stream records in batches to keep memory bounded.
    for record in query.yield_per(1000):
        source = {
            'document_id': document.id,
            'collection_id': document.collection_id,
            'index': record.index,
            'sheet': record.sheet,
            'text': index_form(record.texts)
        }
        yield {
            '_id': record.id,
            '_type': TYPE_RECORD,
            '_index': six.text_type(es_index),
            '_source': source
        }
def generate_records(document):
    """Generate index records, based on document rows or pages."""
    query = db.session.query(DocumentRecord) \
        .filter(DocumentRecord.document_id == document.id)
    for idx, record in enumerate(query):
        yield {
            '_id': record.id,
            '_index': record_index(),
            '_type': 'doc',
            '_source': {
                'document_id': document.id,
                'collection_id': document.collection_id,
                'index': record.index,
                'text': index_form(record.texts)
            }
        }
        # Emit periodic progress for very large documents.
        if idx and idx % 1000 == 0:
            log.info("Indexed [%s]: %s records...", document.id, idx)
def finalize_index(data, schema):
    """Apply final denormalisations to the index."""
    properties = data.get('properties', {})
    # All property values feed the full-text field.
    texts = []
    for values in properties.values():
        texts.extend(ensure_list(values))
    data['text'] = index_form(texts)
    # Generate inverted representations of the data stored in properties.
    for prop in schema.properties:
        values = properties.get(prop.name, [])
        if not values:
            continue
        # The label property supplies the entity's display name.
        if prop.is_label:
            data['name'] = values[0]
        # Inverted properties pool all values of a given type
        # (names, dates, emails etc.) into one indexed field.
        invert = prop.type.index_invert
        if invert:
            inverted = data.setdefault(invert, [])
            for norm in prop.type.normalize(values):
                if norm not in inverted:
                    inverted.append(norm)
    index_names(data)
    data['schema'] = schema.name
    # Implied schemata, i.e. parents of the actual schema (hidden ones excluded).
    data['schemata'] = [p.name for p in schema.schemata if not p.hidden]
    # Second name field for non-tokenised sorting.
    if 'name' in data:
        data['name_sort'] = data['name']
    return data
def finalize_index(proxy, context, texts):
    """Apply final denormalisations to the index."""
    # Collect free-text values, skipping types that get their own fields.
    skip_types = ('entity', 'date', 'url', 'country', 'language')
    for prop, value in proxy.itervalues():
        if prop.type.name not in skip_types:
            texts.append(value)
    data = merge_data(context, proxy.to_full_dict())
    data['name'] = proxy.caption
    data['text'] = index_form(texts)
    names = data.get('names', [])
    # Fingerprint every name, dropping failed generations.
    fps = {fingerprints.generate(name) for name in names}
    fps.discard(None)
    data['fingerprints'] = list(fps)
    # Fall back to the update timestamp when no creation time is set.
    if not data.get('created_at'):
        data['created_at'] = data.get('updated_at')
    data.pop('id', None)
    return clean_dict(data)
def index_collection(collection, sync=False):
    """Index a collection."""
    # Soft-deleted collections are purged from the index instead.
    if collection.deleted_at is not None:
        return delete_collection(collection.id)
    data = {
        'foreign_id': collection.foreign_id,
        'created_at': collection.created_at,
        'updated_at': collection.updated_at,
        'label': collection.label,
        'kind': collection.kind,
        'summary': collection.summary,
        'category': Collection.DEFAULT,
        'publisher': collection.publisher,
        'publisher_url': collection.publisher_url,
        'info_url': collection.info_url,
        'data_url': collection.data_url,
        'casefile': collection.casefile,
        'secret': collection.secret,
        'collection_id': collection.id,
        'schemata': {},
        'team': []
    }
    # Seed the full-text field with every string-valued metadata entry.
    texts = [value for value in data.values() if isinstance(value, str)]
    # Only accept known categories; otherwise the default stands.
    if collection.category in Collection.CATEGORIES:
        data['category'] = collection.category
    creator = collection.creator
    if creator is not None:
        data['creator'] = {
            'id': creator.id,
            'type': creator.type,
            'name': creator.name
        }
        texts.append(creator.name)
    for member in collection.team:
        data['team'].append({
            'id': member.id,
            'type': member.type,
            'name': member.name
        })
        texts.append(member.name)
    stats = get_collection_stats(collection.id)
    data['count'] = stats['count']
    # Expose entity counts per schema, restricted to things.
    thing = model.get(Entity.THING)
    for schema_name, count in stats['schemata'].items():
        schema = model.get(schema_name)
        if schema is not None and schema.is_a(thing):
            data['schemata'][schema.name] = count
    # If no countries or languages are given, take the most common
    # values observed in the data.
    countries = ensure_list(collection.countries) or stats['countries'].keys()
    data['countries'] = registry.country.normalize_set(countries)
    languages = ensure_list(collection.languages) or stats['languages'].keys()
    data['languages'] = registry.language.normalize_set(languages)
    # Also index ASCII-normalised variants of all text values.
    texts.extend([normalize(text, ascii=True) for text in texts])
    data['text'] = index_form(texts)
    return index_safe(collections_index(), collection.id, data, refresh=sync)
def index_collection(collection):
    """Index a collection."""
    # Soft-deleted collections are removed from the index instead.
    if collection.deleted_at is not None:
        return delete_collection(collection.id)
    data = {
        'foreign_id': collection.foreign_id,
        'created_at': collection.created_at,
        'updated_at': collection.updated_at,
        'label': collection.label,
        'kind': collection.kind,
        'summary': collection.summary,
        'category': Collection.DEFAULT,
        'publisher': collection.publisher,
        'publisher_url': collection.publisher_url,
        'info_url': collection.info_url,
        'data_url': collection.data_url,
        'casefile': collection.casefile,
        'roles': collection.roles,
        'schemata': {},
        'team': []
    }
    # Seed the searchable text with all string-valued metadata.
    texts = [v for v in data.values() if isinstance(v, str)]
    # Only accept known categories; otherwise the default stands.
    if collection.category in Collection.CATEGORIES:
        data['category'] = collection.category
    if collection.creator is not None:
        data['creator'] = {
            'id': collection.creator.id,
            'type': collection.creator.type,
            'name': collection.creator.name
        }
        texts.append(collection.creator.name)
    for role in collection.team:
        data['team'].append({
            'id': role.id,
            'type': role.type,
            'name': role.name
        })
        texts.append(role.name)
    # Compute some statistics on the content of a collection:
    # schema/country/language term aggregations over its entities.
    query = {
        'size': 0,
        'query': {
            'bool': {
                'filter': [{
                    'term': {
                        'collection_id': collection.id
                    }
                }, {
                    'term': {
                        'schemata': Entity.THING
                    }
                }]
            }
        },
        'aggs': {
            'schema': {
                'terms': {
                    'field': 'schema',
                    'size': 1000
                }
            },
            'countries': {
                'terms': {
                    'field': 'countries',
                    'size': 500
                }
            },
            'languages': {
                'terms': {
                    'field': 'languages',
                    'size': 100
                }
            },
        }
    }
    result = search_safe(index=entities_index(), body=query)
    aggregations = result.get('aggregations')
    data['count'] = result['hits']['total']
    # expose entities by schema count.
    for schema in aggregations['schema']['buckets']:
        data['schemata'][schema['key']] = schema['doc_count']
    # if no countries or langs are given, take the most common from the data.
    countries = collection.countries
    if countries is None or not len(countries):
        countries = aggregations['countries']['buckets']
        countries = [c['key'] for c in countries]
    data['countries'] = exactitude.countries.normalize_set(countries)
    languages = collection.languages
    if languages is None or not len(languages):
        languages = aggregations['languages']['buckets']
        languages = [c['key'] for c in languages]
    data['languages'] = exactitude.languages.normalize_set(languages)
    # Also index ASCII-normalised variants of all text values.
    texts.extend([normalize(t, ascii=True) for t in texts])
    data['text'] = index_form(texts)
    data = index_safe(collections_index(), collection.id, data)
    refresh_index(index=collections_index())
    return data
def index_document(document):
    """Index a single document into Elasticsearch and return the body."""
    # Documents still being processed are not indexed yet.
    if document.status == Document.STATUS_PENDING:
        return
    # FIXME: documents of TYPE_OTHER are skipped entirely — confirm intent.
    if document.type == Document.TYPE_OTHER:
        return
    log.info("Index document [%s]: %s", document.id, document.title)
    data = {
        'schema': document.SCHEMA,
        'schemata': [document.SCHEMA],
        'collection_id': document.collection_id,
        'roles': document.collection.roles,
        'type': document.type,
        'status': document.status,
        'content_hash': document.content_hash,
        'foreign_id': document.foreign_id,
        'error_message': document.error_message,
        'uploader_id': document.uploader_id,
        'created_at': document.created_at,
        'updated_at': document.updated_at,
        'title': document.title,
        # Second title field used for non-tokenised sorting.
        'name_sort': document.title,
        'summary': document.summary,
        'author': document.author,
        'file_size': document.file_size,
        'file_name': document.file_title,
        'source_url': document.source_url,
        'languages': document.languages,
        'countries': document.countries,
        'keywords': document.keywords,
        'dates': document.dates,
        'extension': document.extension,
        'encoding': document.encoding,
        'mime_type': document.mime_type,
        'pdf_version': document.pdf_version,
        'columns': document.columns,
        '$children': document.children.count(),
        'text': index_form(document.text_parts())
    }
    if document.parent_id is not None:
        data['parent'] = {
            'id': document.parent_id,
            'type': document.parent.type,
            'title': document.parent.title,
        }
    # Map document tags onto index fields via TAG_FIELDS; unknown tag
    # types are logged and skipped.
    q = db.session.query(DocumentTag)
    q = q.filter(DocumentTag.document_id == document.id)
    for tag in q.yield_per(5000):
        field = TAG_FIELDS.get(tag.type)
        if field is None:
            log.warning("Cannot index document tag: %r", tag)
            continue
        if field not in data:
            data[field] = []
        data[field].append(tag.text)
    index_names(data)
    es.index(index=es_index, doc_type=TYPE_DOCUMENT, body=data, id=document.id)
    # Return the indexed body with id/type attached for downstream use.
    data['id'] = document.id
    data['$type'] = TYPE_DOCUMENT
    return data
def index_collection(collection):
    """Index a collection."""
    # A soft-deleted collection is dropped from the index instead.
    if collection.deleted_at is not None:
        return delete_collection(collection.id)
    data = {
        'foreign_id': collection.foreign_id,
        'created_at': collection.created_at,
        'updated_at': collection.updated_at,
        'label': collection.label,
        'summary': collection.summary,
        'category': collection.category,
        'countries': collection.countries,
        'languages': collection.languages,
        'managed': collection.managed,
        'roles': collection.roles,
        'schemata': {},
    }
    texts = [
        collection.label,
        collection.foreign_id,
        collection.summary,
        collection.category
    ]
    creator = collection.creator
    if creator is not None:
        data['creator'] = {
            'id': creator.id,
            'type': creator.type,
            'name': creator.name
        }
        texts.append(creator.name)
    # Compute some statistics on the content of a collection:
    # schema/country/language term aggregations over its entities.
    filters = [
        {'term': {'collection_id': collection.id}},
        {'term': {'schemata': Entity.THING}}
    ]
    query = {
        'size': 0,
        'query': {'bool': {'filter': filters}},
        'aggs': {
            'schema': {'terms': {'field': 'schema', 'size': 1000}},
            'countries': {'terms': {'field': 'countries', 'size': 500}},
            'languages': {'terms': {'field': 'languages', 'size': 100}},
        }
    }
    result = es.search(index=entities_index(), body=query)
    aggregations = result.get('aggregations')
    data['count'] = result['hits']['total']
    # Expose the number of entities per schema.
    for bucket in aggregations['schema']['buckets']:
        data['schemata'][bucket['key']] = bucket['doc_count']
    # If no countries or languages are given, fall back to the most
    # common values found in the data.
    if not data.get('countries'):
        buckets = aggregations['countries']['buckets']
        data['countries'] = [b['key'] for b in buckets]
    if not data.get('languages'):
        buckets = aggregations['languages']['buckets']
        data['languages'] = [b['key'] for b in buckets]
    # Also index match-normalised variants of all text values.
    texts.extend([match_form(text) for text in texts])
    data['text'] = index_form(texts)
    es.index(index=collection_index(), doc_type='doc',
             id=collection.id, body=data)