def load_item(self, doc):
    """Upsert ``doc`` and every associated model into Elasticsearch.

    Walks the full model tree (attachments and other related models) and
    issues one ``doc_as_upsert`` update per model; enriched models are
    additionally registered in the resolver URL index.
    """
    for model in doc.traverse():
        serialized = json_encoder.encode(JsonLDSerializer().serialize(model))
        log.debug('ElasticsearchUpsertLoader indexing document id: %s' %
                  model.get_ori_identifier())

        # Upsert: updates the document, creating it when it does not exist.
        elasticsearch.update(
            id=model.get_short_identifier(),
            index=self.index_name,
            body={'doc': json.loads(serialized), 'doc_as_upsert': True},
        )

        if 'enricher_task' not in model:
            continue

        # The model was enriched, so make its URL resolvable.
        resolver_doc = {
            'ori_identifier': model.get_short_identifier(),
            'original_url': model.original_url,
            'file_name': model.name,
        }
        if 'content_type' in model:
            resolver_doc['content_type'] = model.content_type
        # index() overwrites the resolver entry if it already exists.
        elasticsearch.index(index=settings.RESOLVER_URL_INDEX,
                            id=get_sha1_hash(model.original_url),
                            body=resolver_doc)
def get_index_doc(self):
    """Construct the document that should be inserted into the index
    belonging to the item's source.

    :returns: a dict ready for indexing.
    :rtype: dict
    """
    doc = {
        'meta': dict(self.meta),
        'enrichments': {},
        'source_data': {
            'content_type': self.data_content_type,
            'data': self.data,
        },
    }
    doc.update(dict(self.combined_index_data))
    # Keep a string copy of the combined index data on the collection
    # specific index as well: it is needed to reconstruct the combined
    # index from the individual indices.
    doc['combined_index_data'] = json_encoder.encode(
        self.get_combined_index_doc())
    doc.update(self.index_data)
    return doc
def load_item(self, doc):
    """Index ``doc`` (keyed by its short identifier) and recurse into every
    related model, e.g. attachments."""
    serialized = json_encoder.encode(JsonLDSerializer().serialize(doc))
    log.info('Indexing document id: %s' % doc.get_ori_identifier())

    # Index the document itself into the new index.
    elasticsearch.index(index=self.index_name,
                        doc_type=doc_type(doc.verbose_name()),
                        body=serialized,
                        id=doc.get_short_identifier())

    # Recurse over relation properties so associated models get indexed too.
    for _, related in doc.properties(rels=True, props=False):
        self.load_item(related)
        if 'enricher_task' not in related:
            continue
        # The related model was enriched, so register it with the resolver.
        resolver_doc = {
            'ori_identifier': related.get_short_identifier(),
            'original_url': related.original_url,
            'file_name': related.name,
        }
        if 'content_type' in related:
            resolver_doc['content_type'] = related.content_type
        # index() updates the resolver entry if it already exists.
        elasticsearch.index(index=settings.RESOLVER_URL_INDEX,
                            doc_type='url',
                            id=get_sha1_hash(related.original_url),
                            body=resolver_doc)
def load_item(self, doc):
    """Index ``doc`` (keyed by its ORI identifier) and recurse into every
    related model, e.g. attachments."""
    payload = json_encoder.encode(JsonLDSerializer().serialize(doc))
    log.info('Indexing document id: %s' % doc.get_ori_identifier())

    # Index documents into new index
    elasticsearch.index(index=self.index_name,
                        doc_type=doc_type(doc.verbose_name()),
                        body=payload,
                        id=doc.get_ori_identifier())

    # Recursively index associated models like attachments
    for _, rel_model in doc.properties(rels=True, props=False):
        self.load_item(rel_model)
        if 'enricher_task' in rel_model:
            # The model appears enriched, so register it with the resolver.
            url_entry = {
                'ori_identifier': rel_model.get_ori_identifier(),
                'original_url': rel_model.original_url,
                'file_name': rel_model.name,
            }
            if 'content_type' in rel_model:
                url_entry['content_type'] = rel_model.content_type
            # index() updates the resolver entry if it already exists.
            elasticsearch.index(index=settings.RESOLVER_URL_INDEX,
                                doc_type='url',
                                id=get_sha1_hash(rel_model.original_url),
                                body=url_entry)
def get_index_doc(self):
    """Construct the document that should be inserted into the index
    belonging to the item's source.

    :returns: a dict ready for indexing.
    :rtype: dict
    """
    index_doc = {}
    index_doc['meta'] = dict(self.meta)
    index_doc['enrichments'] = {}
    index_doc['source_data'] = {
        'content_type': self.data_content_type,
        'data': self.data,
    }
    index_doc.update(dict(self.combined_index_data))
    # A string representation of the combined index data is stored on the
    # collection specific index as well; it is needed to reconstruct the
    # combined index from the individual indices.
    index_doc['combined_index_data'] = json_encoder.encode(
        self.get_combined_index_doc())
    index_doc.update(self.index_data)
    return index_doc
def enrich_item(self, enrichments, object_id, combined_index_doc, doc):
    """Push ``doc`` to the binoas notification service (poliflw).

    Skips documents without a date and documents older than six hours
    (assumed to have been seen before). ``doc`` is mutated: ``id`` and,
    when present, ``meta.pfl_url`` are set on it.

    :returns: ``enrichments`` unchanged (this enricher only notifies).
    """
    doc['id'] = unicode(object_id)
    if 'meta' in doc:
        doc['meta']['pfl_url'] = unicode(
            "https://api.poliflw.nl/v0/%s/%s" % (
                doc['meta']['source_id'], object_id,))

    if 'date' not in doc:
        log.info(
            'Document has no date information, not enriching for binoas')
        return enrichments

    amsterdam_tz = pytz.timezone('Europe/Amsterdam')
    current_dt = datetime.datetime.now(tz=amsterdam_tz)
    # doc['date'] may be naive or tz-aware; AttributeError means it is not
    # even a datetime (e.g. a string) — treated like naive here.
    try:
        current_tz = doc['date'].tzinfo
    except AttributeError:
        current_tz = None
    if current_tz is not None:
        delay = current_dt - doc['date']
    else:
        # Naive timestamps are assumed to be Amsterdam local time (+02:00).
        adjusted_dt = iso8601.parse_date('%s+02:00' % (
            doc['date'].isoformat()))
        delay = current_dt - adjusted_dt

    if delay.total_seconds() > (6 * 3600.0):
        log.info('Document delayed for %s so we have seen it before' % (
            str(delay),))
        return enrichments

    url = 'http://binoas.openstate.eu/posts/new'
    r = {}
    resp = None
    log.info('sending to binoas: ' + str(doc))
    try:
        resp = self.http_session.post(
            url, data=json_encoder.encode({
                'application': 'poliflw',
                'payload': doc}))
        r = resp.json()
    except Exception as e:
        # str(e) instead of e.message: not every exception has .message.
        log.exception('Unexpected binoas enrichment error: %s' % (e,))
        # If the POST itself raised, resp is still None — guard to avoid
        # a secondary AttributeError inside the handler.
        if resp is not None:
            log.exception(resp.content)
        log.exception(doc)
    log.info('binoas result: ' + str(r))
    return enrichments
def load_item(self, doc):
    """Update an already-indexed document in Elasticsearch.

    Empty documents are skipped; the update fails (is not an upsert) when
    the document does not exist yet.
    """
    # Guard first: the original serialized the document before checking,
    # doing wasted (and potentially failing) work for empty documents.
    if doc == {}:
        log.info('Empty document ....')
        return

    body = json_encoder.encode(JsonLDSerializer().serialize(doc))
    log.info('Indexing document id: %s' % doc.get_ori_identifier())

    # Index documents into new index
    # NOTE(review): body is the encoded JSON *string* here, whereas the
    # upsert loader json.loads() it back into a dict — confirm intended.
    elasticsearch.update(
        id=doc.get_short_identifier(),
        index=self.index_name,
        doc_type=doc_type(doc.verbose_name()),
        body={'doc': body},
    )
def load_item(self, doc):
    """Update an already-indexed document, keyed by its ORI identifier.

    Empty documents are skipped; the update fails (is not an upsert) when
    the document does not exist yet.
    """
    # Guard first: the original serialized the document before checking,
    # doing wasted (and potentially failing) work for empty documents.
    if doc == {}:
        log.info('Empty document ....')
        return

    body = json_encoder.encode(JsonLDSerializer().serialize(doc))
    log.info('Indexing document id: %s' % doc.get_ori_identifier())

    # Index documents into new index
    # NOTE(review): body is the encoded JSON *string* here, whereas the
    # upsert loader json.loads() it back into a dict — confirm intended.
    elasticsearch.update(
        id=doc.get_ori_identifier(),
        index=self.index_name,
        doc_type=doc_type(doc.verbose_name()),
        body={'doc': body},
    )
def _perform_ner(self, doc_id, doc):
    """Run named-entity recognition on ``doc`` via the politags service.

    Posts the document to the politags container and merges the entities
    it returns into the document's existing lists. ``doc`` is mutated:
    ``id`` and ``meta.pfl_url`` are set on it as a side effect.

    :returns: dict with ``topics``, ``sentiment``, and de-duplicated
        ``parties`` and ``politicians``.
    """
    # FIXME: sometimes we use short names for parties and sometimes not
    parties2names = {
        u'Christen-Democratisch App\xe8l': u'CDA',
        u'Democraten 66': u'D66',
        u'Partij van de Arbeid': u'PvdA',
        u'Staatkundig Gereformeerde Partij': u'SGP',
        u'Socialistische Partij': u'SP',
        u'Volkspartij voor Vrijheid en Democratie': u'VVD'
    }
    url = 'http://politags_web_1:5000/api/articles/entities'
    politicians = doc.get('politicians', [])
    parties = doc.get('parties', [])
    doc['id'] = unicode(doc_id)
    # NOTE(review): assumes doc['meta']['source_id'] exists — unlike
    # enrich_item there is no 'meta' guard here; verify against callers.
    doc['meta']['pfl_url'] = unicode("https://api.poliflw.nl/v0/%s/%s" % (
        doc['meta']['source_id'], doc_id,))
    try:
        resp = self.http_session.post(
            url, data=json_encoder.encode(doc),
            headers={'Content-type': 'application/json'})
        r = resp.json()
    except Exception as e:
        # str(e) instead of e.message: not every exception has .message.
        log.exception('Unexpected NER enrichment error: %s' % (e,))
        r = {
            'parties': [],
            'politicians': [],
            'topics': [],
            'sentiment': {}
        }
    # Informational messages: log.info rather than log.exception, which
    # logs a spurious traceback when called outside an except handler.
    log.info('NER response:')
    log.info(r)
    log.info('Indexing found topics: %s' % (r.get('topics', []),))
    log.info('Indexing found sentiment: %s' % (r.get('sentiment', {}),))
    return {
        'topics': r.get('topics', []),
        'sentiment': r.get('sentiment', {}),
        # .get() everywhere so a partial service response cannot KeyError
        # (the original indexed r['parties'] / r['politicians'] directly).
        'parties': parties + [
            parties2names.get(p['name'], p['name'])
            for p in r.get('parties', [])
            if p['name'] not in parties],
        'politicians': politicians + [
            u'%s %s' % (p['initials'], p['last_name'],)
            for p in r.get('politicians', [])
            if u'%s %s' % (p['initials'], p['last_name'],) not in politicians]
    }
def load_item(self, doc):
    """Update every model of the document tree in Elasticsearch
    (update-only: no upsert, so missing documents fail).

    Walks the full model tree (attachments and other associated models)
    and issues one partial update per model.
    """
    # Guard before any work: the original checked `doc == {}` inside the
    # per-model loop, after serializing — a loop-invariant condition that
    # could never trigger usefully there.
    if doc == {}:
        log.info('Empty document ....')
        return

    # Recursively index associated models like attachments
    for model in doc.traverse():
        model_body = json_encoder.encode(
            JsonLDSerializer().serialize(model))
        log.debug(
            'ElasticsearchUpdateOnlyLoader indexing document id: %s' %
            model.get_ori_identifier())

        # Index document into new index
        elasticsearch.update(
            id=model.get_short_identifier(),
            index=self.index_name,
            body={'doc': json.loads(model_body)},
        )
def enrich_item(self, enrichments, object_id, combined_index_doc, doc):
    """Push AS2 items of the combined index doc to the binoas service.

    For each item of an allowed type: attaches its percolation topics as
    tags, then posts it to binoas unless it is too old (seen before), too
    fresh, lacks a date, or binoas is not configured.

    :returns: ``enrichments`` unchanged (this enricher only notifies).
    """
    for item in combined_index_doc.get('item', {}).get('items', []):
        if item.get('@type', 'Note') not in settings.BINOAS_AS2_TYPES:
            continue
        if 'created' not in item:
            log.info(
                'Document has no date information, not enriching for binoas'
            )
            return enrichments

        # Attach matched percolation topics as tags.
        topics = combined_index_doc.get('percolations', {}).get(
            item.get('@id', ''), [])
        for topic in topics:
            log.info('Getting link obj for %s' % (topic,))
            topic_obj = self.get_percolation(topic)
            log.info(topic_obj['@id'])
            # NOTE(review): assumes item['tag'] already exists — KeyError
            # otherwise; confirm AS2 items always carry a 'tag' list.
            item['tag'].append(topic_obj['@id'])

        amsterdam_tz = pytz.timezone(settings.BINOAS_TZ)
        current_dt = datetime.datetime.now(tz=amsterdam_tz)
        adjusted_dt = item['created']
        # item['created'] may be naive, tz-aware, or not a datetime at all.
        try:
            current_tz = item['created'].tzinfo
        except AttributeError:
            current_tz = None
        if current_tz is not None:
            delay = current_dt - item['created']
        else:
            # Naive timestamps are assumed to be Amsterdam time (+02:00).
            if item['created'] is not None:
                adjusted_dt = iso8601.parse_date(
                    '%s+02:00' % (item['created'].isoformat()))
            else:
                adjusted_dt = current_dt
            delay = current_dt - adjusted_dt

        log.info('Delay: %s (%s vs %s)' % (delay, current_dt, adjusted_dt))
        if delay.total_seconds() > settings.BINOAS_ALLOWED_DELAY:
            log.info('Document delayed for %s so we have seen it before' %
                     (str(delay),))
            return enrichments
        if delay.total_seconds() < settings.BINOAS_MINIMUM_DELAY:
            log.info(
                'Document published too short ago (%s) so we will skip it' %
                (str(delay),))
            return enrichments

        # TODO: figure out howto include classifications here ...
        url = '%s/posts/new' % (settings.BINOAS_BASE_URL,)
        r = {}
        resp = None
        if settings.BINOAS_BASE_URL is None:
            log.info('Binoas not configured, so skipping')
            return enrichments
        try:
            resp = self.http_session.post(url, data=json_encoder.encode({
                'application': settings.BINOAS_APPLICATION,
                'payload': item
            }))
            r = resp.json()
        except Exception as e:
            # str(e) instead of e.message: not every exception has .message.
            log.exception('Unexpected binoas enrichment error: %s' % (e,))
            # If the POST itself raised, resp is still None — guard to
            # avoid a secondary AttributeError inside the handler.
            if resp is not None:
                log.exception(resp.content)
            log.exception(doc)
        log.info('binoas result: ' + str(r))
    return enrichments
def enrich_item(self, enrichments, object_id, combined_index_doc, doc):
    """Translate AS2 items into the configured target languages.

    For each translatable item: first checks (via a search request) whether
    a previously stored copy already carries all required languages; if so
    the stored translations are reused, otherwise the Dutch name/content
    are sent to the translation backend. Results are collected under
    ``enrichments['translations'][item['@id']]``.
    """
    enrichments['translations'] = {}
    for item in combined_index_doc.get('item', {}).get('items', []):
        if item.get('@type', 'Note') not in settings.AS2_TRANSLATION_TYPES:
            continue
        translated = False
        try:
            # NOTE(review): this line was damaged by credential redaction —
            # the URL userinfo and the encoded query payload (presumably a
            # lookup of existing documents by @id) were replaced with
            # '*****'. Restore the original from version control; the code
            # as shown here is not syntactically valid.
            resp = self.http_session.post('http://*****:*****@id']] } } })).json()
            if resp['as:totalItems'] > 0:
                first_item = resp['as:items'][0]
                log.info(
                    'Found %s existing document(s) for %s, translated into %s' % (
                        resp['as:totalItems'], item['@id'],
                        first_item['contentMap'].keys(),))
                # Already translated only if every required target language
                # is present on the stored copy.
                translated = set(
                    settings.AS2_TRANSLATION_LANGUAGES).issubset(
                        set(first_item['contentMap'].keys()))
                if translated:
                    # Doc is already translated: reuse the stored
                    # translations instead of calling the backend again.
                    docs = []
                    for fld in ['nameMap', 'contentMap']:
                        if fld in first_item:
                            # FIXME: should copy the source language from
                            # the doc itself instead of defaulting to 'nl'.
                            docs.append({
                                'detectedLanguage': {
                                    "language": first_item.get('@language', 'nl'),
                                    "score": 1.0
                                },
                                'translations': [{
                                    'text': v,
                                    'to': k
                                } for k, v in first_item[fld].iteritems()]
                            })
                    enrichments['translations'][item['@id']] = docs
                else:
                    resp = None
        except Exception as e:
            log.error(e)
            resp = None
        if translated:
            log.info('Document %s was already translated' % (item['@id'],))
            continue

        log.info('Translating document %s now' % (item['@id'],))
        # TODO: check if item exists to prevent unecesary retranslation of
        # text we already have
        docs_for_translation = []
        # Only the Dutch ('nl') variants are used as translation sources.
        if item.get('nameMap', {}).get('nl', None) is not None:
            docs_for_translation.append(item['nameMap']['nl'])
        if item.get('contentMap', {}).get('nl', None) is not None:
            docs_for_translation.append(
                html_cleanup(item['contentMap']['nl']))
        if len(docs_for_translation) > 0:
            translations = self.translate(
                docs_for_translation,
                to_lang=settings.AS2_TRANSLATION_LANGUAGES)
            enrichments['translations'][item['@id']] = translations
    return enrichments
def enrich_item(self, enrichments, object_id, combined_index_doc, doc):
    """Push translated AS2 items to the binoas service (poliscoops).

    Merges any translations previously collected in
    ``enrichments['translations']`` back into each item's name/content
    maps, then posts the item to binoas unless it lacks a date or is
    older than six hours (assumed to have been seen before).

    :returns: ``enrichments`` unchanged (this enricher only notifies).
    """
    for item in combined_index_doc.get('item', {}).get('items', []):
        if item.get('@type', 'Note') not in settings.AS2_TRANSLATION_TYPES:
            continue
        if 'created' not in item:
            log.info(
                'Document has no date information, not enriching for binoas')
            return enrichments

        log.info('created: %s' % (item['created'],))
        amsterdam_tz = pytz.timezone('Europe/Amsterdam')
        current_dt = datetime.datetime.now(tz=amsterdam_tz)
        adjusted_dt = item['created']
        # item['created'] may be naive, tz-aware, or not a datetime at all.
        try:
            current_tz = item['created'].tzinfo
        except AttributeError:
            current_tz = None
        if current_tz is not None:
            delay = current_dt - item['created']
        else:
            # Naive timestamps are assumed to be Amsterdam time (+02:00).
            adjusted_dt = iso8601.parse_date('%s+02:00' % (
                item['created'].isoformat()))
            delay = current_dt - adjusted_dt

        if delay.total_seconds() > (6 * 3600.0):
            log.info('Document delayed for %s so we have seen it before' % (
                str(delay),))
            return enrichments

        translations = enrichments.get('translations', {}).get(
            item.get('@id', ''), [])
        # Unconditional init: the original only set this for len 0/1/2 and
        # raised NameError for three or more translation docs.
        translation_keys = {}
        if len(translations) == 1:
            translation_keys = {0: 'contentMap'}
        elif len(translations) == 2:
            translation_keys = {0: 'nameMap', 1: 'contentMap'}
        for t_idx, t_key in translation_keys.items():
            item[t_key] = {x['to']: x['text']
                           for x in translations[t_idx]['translations']}
        if translations:
            # Guarded: the original indexed translations[-1] even when the
            # list was empty, raising IndexError. Always take the language
            # of the content, since content tends to be longer than the
            # title.
            item['@language'] = \
                translations[-1]['detectedLanguage']['language']

        url = 'http://binoas.openstate.eu/posts/new'
        r = {}
        resp = None
        log.info('sending to binoas: ' + str(item))
        try:
            resp = self.http_session.post(
                url, data=json_encoder.encode({
                    'application': 'poliscoops',
                    'payload': item}))
            r = resp.json()
        except Exception as e:
            # str(e) instead of e.message: not every exception has .message.
            log.exception('Unexpected binoas enrichment error: %s' % (e,))
            # If the POST itself raised, resp is still None — guard to
            # avoid a secondary AttributeError inside the handler.
            if resp is not None:
                log.exception(resp.content)
            log.exception(doc)
        log.info('binoas result: ' + str(r))
    return enrichments