def load_item(self, doc):
    """Upsert every model yielded by ``doc.traverse()`` into Elasticsearch.

    Each model is serialized with ``JsonLDSerializer``, encoded with
    ``json_encoder`` and sent to ``self.index_name`` as a partial update
    with ``doc_as_upsert`` enabled. Models containing ``'enricher_task'``
    also get a document in ``settings.RESOLVER_URL_INDEX`` whose id is
    ``get_sha1_hash(model.original_url)``.
    """
    # Recursively index associated models like attachments
    for model in doc.traverse():
        encoded = json_encoder.encode(JsonLDSerializer().serialize(model))

        log.debug('ElasticsearchUpsertLoader indexing document id: %s' % model.get_ori_identifier())

        # Update document; the encoded string is decoded again so the
        # request body carries the values produced by json_encoder.
        short_id = model.get_short_identifier()
        target_index = self.index_name
        upsert_body = {
            'doc': json.loads(encoded),
            'doc_as_upsert': True,
        }
        elasticsearch.update(id=short_id, index=target_index, body=upsert_body)

        if 'enricher_task' not in model:
            continue

        # The value seems to be enriched so add to resolver
        resolver_doc = {
            'ori_identifier': model.get_short_identifier(),
            'original_url': model.original_url,
            'file_name': model.name,
        }
        if 'content_type' in model:
            resolver_doc['content_type'] = model.content_type

        # Update if already exists
        elasticsearch.index(
            index=settings.RESOLVER_URL_INDEX,
            id=get_sha1_hash(model.original_url),
            body=resolver_doc,
        )
def load_item(self, object_id, combined_index_doc, doc):
    """Index an item twice and create resolver entries for its media URLs.

    ``combined_index_doc`` goes to ``settings.COMBINED_INDEX`` and ``doc``
    to ``self.index_name``, both under ``object_id`` with doc type
    ``'item'``. For every entry in ``doc['media_urls']`` a resolver
    document is created in ``settings.RESOLVER_URL_INDEX``; its content
    type is taken from ``doc['enrichments']['media_urls']`` when an entry
    with the same ``original_url`` provides one.
    """
    log.info('Indexing documents...')

    elasticsearch.index(
        index=settings.COMBINED_INDEX,
        doc_type='item',
        id=object_id,
        body=combined_index_doc,
    )

    # Index documents into new index
    elasticsearch.index(
        index=self.index_name,
        doc_type='item',
        body=doc,
        id=object_id,
    )

    # Map original_url -> content_type for enriched media URLs
    enrichments = doc['enrichments']
    enriched_urls = enrichments['media_urls'] if 'media_urls' in enrichments else ()
    content_types = {
        entry['original_url']: entry['content_type']
        for entry in enriched_urls
        if 'content_type' in entry
    }

    if 'media_urls' not in doc:
        return

    # For each media_urls.url, add a resolver document to the
    # RESOLVER_URL_INDEX
    for media_url in doc['media_urls']:
        # The last path segment of the media URL is used as document id
        url_hash = media_url['url'].split('/')[-1]
        original_url = media_url['original_url']

        resolver_doc = {'original_url': original_url}
        if original_url in content_types:
            resolver_doc['content_type'] = content_types[original_url]

        try:
            elasticsearch.create(
                index=settings.RESOLVER_URL_INDEX,
                doc_type='url',
                id=url_hash,
                body=resolver_doc,
            )
        except ConflictError:
            log.debug('Resolver document %s already exists' % url_hash)
def load_item(self, doc):
    """Index ``doc`` and, recursively, the values of its relations.

    The document is serialized with ``JsonLDSerializer``, encoded with
    ``json_encoder`` and indexed into ``self.index_name`` under its short
    identifier. Every value returned by
    ``doc.properties(rels=True, props=False)`` is then loaded through this
    same method; values containing ``'enricher_task'`` are also written to
    ``settings.RESOLVER_URL_INDEX``.
    """
    encoded = json_encoder.encode(JsonLDSerializer().serialize(doc))

    log.info('Indexing document id: %s' % doc.get_ori_identifier())

    # Index documents into new index
    elasticsearch.index(
        index=self.index_name,
        doc_type=doc_type(doc.verbose_name()),
        body=encoded,
        id=doc.get_short_identifier(),
    )

    # Recursively index associated models like attachments
    for _, related in doc.properties(rels=True, props=False):
        self.load_item(related)

        if 'enricher_task' not in related:
            continue

        # The value seems to be enriched so add to resolver
        resolver_doc = {
            'ori_identifier': related.get_short_identifier(),
            'original_url': related.original_url,
            'file_name': related.name,
        }
        if 'content_type' in related:
            resolver_doc['content_type'] = related.content_type

        # Update if already exists
        elasticsearch.index(
            index=settings.RESOLVER_URL_INDEX,
            doc_type='url',
            id=get_sha1_hash(related.original_url),
            body=resolver_doc,
        )
def load_item(self, doc):
    """Index ``doc`` under its ORI identifier, then recurse into relations.

    The document is serialized with ``JsonLDSerializer``, encoded with
    ``json_encoder`` and indexed into ``self.index_name`` using
    ``doc.get_ori_identifier()`` as id. Every value returned by
    ``doc.properties(rels=True, props=False)`` is loaded through this same
    method; values containing ``'enricher_task'`` are also written to
    ``settings.RESOLVER_URL_INDEX``.
    """
    encoded = json_encoder.encode(JsonLDSerializer().serialize(doc))

    log.info('Indexing document id: %s' % doc.get_ori_identifier())

    # Index documents into new index
    elasticsearch.index(
        index=self.index_name,
        doc_type=doc_type(doc.verbose_name()),
        body=encoded,
        id=doc.get_ori_identifier(),
    )

    # Recursively index associated models like attachments
    for _, related in doc.properties(rels=True, props=False):
        self.load_item(related)

        if 'enricher_task' not in related:
            continue

        # The value seems to be enriched so add to resolver
        resolver_doc = {
            'ori_identifier': related.get_ori_identifier(),
            'original_url': related.original_url,
            'file_name': related.name,
        }
        if 'content_type' in related:
            resolver_doc['content_type'] = related.content_type

        # Update if already exists
        elasticsearch.index(
            index=settings.RESOLVER_URL_INDEX,
            doc_type='url',
            id=get_sha1_hash(related.original_url),
            body=resolver_doc,
        )
def load_item(self, object_id, combined_index_doc, doc):
    """Index an item in the combined and per-source indexes.

    ``combined_index_doc`` goes to ``settings.COMBINED_INDEX``; ``doc``
    goes to an index named ``<DEFAULT_INDEX_PREFIX>_<source id>``. Both use
    doc type ``'item'`` and ``object_id`` as id. Afterwards a resolver
    document is created in ``settings.RESOLVER_URL_INDEX`` for each entry
    in ``doc['media_urls']``, if that key is present.
    """
    log.info('Indexing documents...')

    elasticsearch.index(
        index=settings.COMBINED_INDEX,
        doc_type='item',
        id=object_id,
        body=combined_index_doc,
    )

    # The source index name is only resolved after the combined index call
    source_index = '%s_%s' % (settings.DEFAULT_INDEX_PREFIX,
                              self.source_definition['id'])
    elasticsearch.index(
        index=source_index,
        doc_type='item',
        id=object_id,
        body=doc,
    )

    if 'media_urls' not in doc:
        return

    # For each media_urls.url, add a resolver document to the
    # RESOLVER_URL_INDEX
    for media_url in doc['media_urls']:
        # The last path segment of the media URL is used as document id
        url_hash = media_url['url'].split('/')[-1]
        resolver_doc = {'original_url': media_url['original_url']}

        try:
            elasticsearch.create(
                index=settings.RESOLVER_URL_INDEX,
                doc_type='url',
                id=url_hash,
                body=resolver_doc,
            )
        except ConflictError:
            log.debug('Resolver document %s already exists' % url_hash)
def load_item(self, combined_object_id, object_id, combined_index_doc, doc):
    """Index both representations of an item, then register its media URLs.

    ``combined_index_doc`` is indexed into ``settings.COMBINED_INDEX``
    under ``combined_object_id`` and ``doc`` into ``self.index_name`` under
    ``object_id``; both use ``self.doc_type``. ``doc`` is finally handed to
    ``self._create_resolvable_media_urls``.
    """
    log.info('Indexing documents...')

    combined_request = {
        'index': settings.COMBINED_INDEX,
        'doc_type': self.doc_type,
        'id': combined_object_id,
        'body': combined_index_doc,
    }
    elasticsearch.index(**combined_request)

    # Index documents into new index
    item_request = {
        'index': self.index_name,
        'doc_type': self.doc_type,
        'body': doc,
        'id': object_id,
    }
    elasticsearch.index(**item_request)

    self._create_resolvable_media_urls(doc)
def load_item(self, combined_object_id, object_id, combined_index_doc, doc):
    """Index the combined and source documents and register media URLs.

    ``combined_index_doc`` is written to ``settings.COMBINED_INDEX`` with
    id ``combined_object_id``; ``doc`` is written to ``self.index_name``
    with id ``object_id``. Both calls use ``self.doc_type``. The resolver
    entries are produced by ``self._create_resolvable_media_urls(doc)``.
    """
    log.info('Indexing documents...')

    es_type = self.doc_type
    elasticsearch.index(
        index=settings.COMBINED_INDEX,
        doc_type=es_type,
        id=combined_object_id,
        body=combined_index_doc,
    )

    # Index documents into new index
    elasticsearch.index(
        index=self.index_name,
        doc_type=self.doc_type,
        body=doc,
        id=object_id,
    )

    self._create_resolvable_media_urls(doc)
def load_item(self, combined_object_id, object_id, combined_index_doc, doc):
    """Index an item with per-document doc types and register media URLs.

    The doc type of each document is obtained from
    ``self._get_doc_type(<document>, self.doc_type)``.
    ``combined_index_doc`` goes to ``self.combined_index_name`` and ``doc``
    to ``self.index_name``. For every entry in ``doc['media_urls']`` a
    resolver document is created in ``settings.RESOLVER_URL_INDEX``; its
    content type comes from ``doc['enrichments']['media_urls']`` when an
    entry with the same ``original_url`` provides one.
    """
    log.info('Indexing documents...')

    combined_type = self._get_doc_type(combined_index_doc, self.doc_type)
    elasticsearch.index(
        index=self.combined_index_name,
        doc_type=combined_type,
        id=combined_object_id,
        body=combined_index_doc,
    )

    # Index documents into new index
    item_type = self._get_doc_type(doc, self.doc_type)
    elasticsearch.index(
        index=self.index_name,
        doc_type=item_type,
        body=doc,
        id=object_id,
    )

    # Map original_url -> content_type for enriched media URLs
    enrichments = doc['enrichments']
    enriched_urls = enrichments['media_urls'] if 'media_urls' in enrichments else ()
    content_types = {
        entry['original_url']: entry['content_type']
        for entry in enriched_urls
        if 'content_type' in entry
    }

    if 'media_urls' not in doc:
        return

    # For each media_urls.url, add a resolver document to the
    # RESOLVER_URL_INDEX
    for media_url in doc['media_urls']:
        # The last path segment of the media URL is used as document id
        url_hash = media_url['url'].split('/')[-1]
        original_url = media_url['original_url']

        resolver_doc = {'original_url': original_url}
        if original_url in content_types:
            resolver_doc['content_type'] = content_types[original_url]

        try:
            elasticsearch.create(
                index=settings.RESOLVER_URL_INDEX,
                doc_type='url',
                id=url_hash,
                body=resolver_doc,
            )
        except ConflictError:
            log.debug('Resolver document %s already exists' % url_hash)
def load_item(self, combined_object_id, object_id, combined_index_doc, doc, doc_type):
    """Index an item under ``doc_type`` and (re)write its resolver entries.

    ``combined_index_doc`` goes to ``settings.COMBINED_INDEX`` with id
    ``combined_object_id``; ``doc`` goes to ``self.index_name`` with id
    ``object_id``. For every entry in ``doc['media_urls']`` a resolver
    document is indexed into ``settings.RESOLVER_URL_INDEX``; its content
    type comes from ``doc['enrichments']['media_urls']`` when an entry with
    the same ``original_url`` provides one.
    """
    log.info('Indexing document id: %s' % object_id)

    elasticsearch.index(
        index=settings.COMBINED_INDEX,
        doc_type=doc_type,
        id=combined_object_id,
        body=combined_index_doc,
    )

    # Index documents into new index
    elasticsearch.index(
        index=self.index_name,
        doc_type=doc_type,
        body=doc,
        id=object_id,
    )

    # Map original_url -> content_type for enriched media URLs
    enrichments = doc['enrichments']
    enriched_urls = enrichments['media_urls'] if 'media_urls' in enrichments else ()
    content_types = {
        entry['original_url']: entry['content_type']
        for entry in enriched_urls
        if 'content_type' in entry
    }

    if 'media_urls' not in doc:
        return

    # For each media_urls.url, add a resolver document to the
    # RESOLVER_URL_INDEX
    for media_url in doc['media_urls']:
        # The last path segment of the media URL is used as document id
        url_hash = media_url['url'].split('/')[-1]
        original_url = media_url['original_url']

        resolver_doc = {'original_url': original_url}
        if original_url in content_types:
            resolver_doc['content_type'] = content_types[original_url]

        # Update if already exists
        elasticsearch.index(
            index=settings.RESOLVER_URL_INDEX,
            doc_type='url',
            id=url_hash,
            body=resolver_doc,
        )
def load_item(self, object_id, combined_index_doc, doc):
    """Store an item in the combined index and in its source's own index.

    Both documents are indexed with doc type ``'item'`` and id
    ``object_id``. The source index name is built from
    ``settings.DEFAULT_INDEX_PREFIX`` and ``self.source_definition['id']``.
    When ``doc`` has a ``'media_urls'`` key, one resolver document per
    entry is created in ``settings.RESOLVER_URL_INDEX``.
    """
    log.info('Indexing documents...')

    elasticsearch.index(
        index=settings.COMBINED_INDEX,
        doc_type='item',
        id=object_id,
        body=combined_index_doc,
    )

    # Built only after the combined document has been indexed
    index_name = '%s_%s' % (
        settings.DEFAULT_INDEX_PREFIX,
        self.source_definition['id'],
    )
    elasticsearch.index(index=index_name, doc_type='item', id=object_id, body=doc)

    # For each media_urls.url, add a resolver document to the
    # RESOLVER_URL_INDEX
    media_urls = doc['media_urls'] if 'media_urls' in doc else ()
    for media_url in media_urls:
        # The last path segment of the media URL is used as document id
        url_hash = media_url['url'].split('/')[-1]
        resolver_doc = {'original_url': media_url['original_url']}

        try:
            elasticsearch.create(
                index=settings.RESOLVER_URL_INDEX,
                doc_type='url',
                id=url_hash,
                body=resolver_doc,
            )
        except ConflictError:
            log.debug('Resolver document %s already exists' % url_hash)
def process(self, model, model_body):
    """Index ``model_body`` into ``self.index_name`` under the model's short identifier."""
    # Index document into new index
    target_index = self.index_name
    document_id = model.get_short_identifier()
    elasticsearch.index(index=target_index, body=model_body, id=document_id)