Example #1
    def load_item(self, doc):
        # Recursively index associated models like attachments
        for model in doc.traverse():
            model_body = json_encoder.encode(
                JsonLDSerializer().serialize(model))

            log.debug('ElasticsearchUpsertLoader indexing document id: %s' %
                      model.get_ori_identifier())

            # Update document
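            # doc_as_upsert makes Elasticsearch create the document if it
            # does not exist yet, instead of failing the partial update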
            elasticsearch.update(
                id=model.get_short_identifier(),
                index=self.index_name,
                body={
                    'doc': json.loads(model_body),
                    'doc_as_upsert': True,
                },
            )

            if 'enricher_task' in model:
                # The model seems to be enriched, so add it to the resolver
                url_doc = {
                    'ori_identifier': model.get_short_identifier(),
                    'original_url': model.original_url,
                    'file_name': model.name,
                }

                if 'content_type' in model:
                    url_doc['content_type'] = model.content_type

                # Update if already exists
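                # the SHA1 of the original URL is a stable id, so loading the
                # same URL again overwrites the existing resolver entry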
                elasticsearch.index(index=settings.RESOLVER_URL_INDEX,
                                    id=get_sha1_hash(model.original_url),
                                    body=url_doc)
Example #2
    def get_index_doc(self):
        """Construct the document that should be inserted into the index
        belonging to the item's source.

        :returns: a dict ready for indexing.
        :rtype: dict
        """
        item = {}

        item['meta'] = dict(self.meta)
        item['enrichments'] = {}
        item['source_data'] = {
            'content_type': self.data_content_type,
            'data': self.data
        }

        # Merge the combined index fields directly into the item
        combined_index_data = dict(self.combined_index_data)
        item.update(combined_index_data)

        # Store a string representation of the combined index data on the
        # collection specific index as well, as we need to be able to
        # reconstruct the combined index from the individual indices
        item['combined_index_data'] = json_encoder.encode(
            self.get_combined_index_doc())

        item.update(self.index_data)

        return item
Example #3
    def load_item(self, doc):
        body = json_encoder.encode(JsonLDSerializer().serialize(doc))

        log.info('Indexing document id: %s' % doc.get_ori_identifier())

        # Index documents into new index
        elasticsearch.index(index=self.index_name, doc_type=doc_type(doc.verbose_name()),
                            body=body, id=doc.get_short_identifier())

        # Recursively index associated models like attachments
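        # rels=True, props=False: iterate over related models only, skipping
        # plain properties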
        for _, value in doc.properties(rels=True, props=False):
            self.load_item(value)

            if 'enricher_task' in value:
                # The value seems to be enriched, so add it to the resolver
                url_doc = {
                    'ori_identifier': value.get_short_identifier(),
                    'original_url': value.original_url,
                    'file_name': value.name,
                }

                if 'content_type' in value:
                    url_doc['content_type'] = value.content_type

                # Update if already exists
                elasticsearch.index(index=settings.RESOLVER_URL_INDEX, doc_type='url',
                                    id=get_sha1_hash(value.original_url), body=url_doc)
Example #4
    def load_item(self, doc):
        body = json_encoder.encode(JsonLDSerializer().serialize(doc))

        log.info('Indexing document id: %s' % doc.get_ori_identifier())

        # Index documents into new index
        elasticsearch.index(index=self.index_name, doc_type=doc_type(doc.verbose_name()),
                            body=body, id=doc.get_ori_identifier())

        # Recursively index associated models like attachments
        for _, value in doc.properties(rels=True, props=False):
            self.load_item(value)

            if 'enricher_task' in value:
                # The value seems to be enriched, so add it to the resolver
                url_doc = {
                    'ori_identifier': value.get_ori_identifier(),
                    'original_url': value.original_url,
                    'file_name': value.name,
                }

                if 'content_type' in value:
                    url_doc['content_type'] = value.content_type

                # Update if already exists
                elasticsearch.index(index=settings.RESOLVER_URL_INDEX, doc_type='url',
                                    id=get_sha1_hash(value.original_url), body=url_doc)
Example #5
    def enrich_item(self, enrichments, object_id, combined_index_doc, doc):
        doc['id'] = unicode(object_id)
        if 'meta' in doc:
            doc['meta']['pfl_url'] = unicode(
                "https://api.poliflw.nl/v0/%s/%s" % (
                    doc['meta']['source_id'], object_id,))

        if 'date' not in doc:
            log.info(
                'Document has no date information, not enriching for binoas')
            return enrichments

        amsterdam_tz = pytz.timezone('Europe/Amsterdam')
        current_dt = datetime.datetime.now(tz=amsterdam_tz)
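        # doc['date'] may be timezone-naive; check for tzinfo before
        # subtracting it from the timezone-aware current time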
        try:
            current_tz = doc['date'].tzinfo
        except AttributeError:
            current_tz = None
        if current_tz is not None:
            delay = current_dt - doc['date']
        else:
            # adjust for amsterdam time
            adjusted_dt = iso8601.parse_date('%s+02:00' % (
                doc['date'].isoformat()))
            delay = current_dt - adjusted_dt

        # anything older than six hours is assumed to have been seen in an
        # earlier run
        if delay.total_seconds() > (6 * 3600.0):
            log.info('Document delayed for %s so we have seen it before' % (
                str(delay),))
            return enrichments

        url = 'http://binoas.openstate.eu/posts/new'
        r = {}
        resp = None
        log.info('sending to binoas: ' + str(doc))
        try:
            resp = self.http_session.post(
                url, data=json_encoder.encode({
                    'application': 'poliflw',
                    'payload': doc}))
            r = resp.json()
        except Exception as e:
            log.exception('Unexpected binoas enrichment error: %s' % (e,))
            # resp may be None if the request itself failed
            if resp is not None:
                log.error(resp.content)
            log.error(doc)
        log.info('binoas result: ' + str(r))
        return enrichments
Example #6
    def load_item(self, doc):
        # Bail out before serializing; an empty document has nothing to index
        if doc == {}:
            log.info('Empty document ....')
            return

        body = json_encoder.encode(JsonLDSerializer().serialize(doc))

        log.info('Indexing document id: %s' % doc.get_ori_identifier())

        # Update document in the new index; decode the serialized JSON again
        # so Elasticsearch receives an object rather than a string
        elasticsearch.update(
            id=doc.get_short_identifier(),
            index=self.index_name,
            doc_type=doc_type(doc.verbose_name()),
            body={'doc': json.loads(body)},
        )
Example #7
    def load_item(self, doc):
        # Bail out before serializing; an empty document has nothing to index
        if doc == {}:
            log.info('Empty document ....')
            return

        body = json_encoder.encode(JsonLDSerializer().serialize(doc))

        log.info('Indexing document id: %s' % doc.get_ori_identifier())

        # Update document in the new index; decode the serialized JSON again
        # so Elasticsearch receives an object rather than a string
        elasticsearch.update(
            id=doc.get_ori_identifier(),
            index=self.index_name,
            doc_type=doc_type(doc.verbose_name()),
            body={'doc': json.loads(body)},
        )
Example #8
    def _perform_ner(self, doc_id, doc):
        # FIXME: sometimes we use short names for parties and sometimes not
        parties2names = {
            u'Christen-Democratisch Appèl': u'CDA',
            u'Democraten 66': u'D66',
            u'Partij van de Arbeid': u'PvdA',
            u'Staatkundig Gereformeerde Partij': u'SGP',
            u'Socialistische Partij': u'SP',
            u'Volkspartij voor Vrijheid en Democratie': u'VVD'
        }

        # the hostname suggests a Docker Compose service running alongside
        # this stack
        url = 'http://politags_web_1:5000/api/articles/entities'
        politicians = doc.get('politicians', [])
        parties = doc.get('parties', [])
        topics = doc.get('topics', [])
        sentiment = doc.get('sentiment', {})

        # annotate the document with its id and public PoliFLW URL before
        # sending it to the NER service
        doc['id'] = unicode(doc_id)
        doc['meta']['pfl_url'] = unicode("https://api.poliflw.nl/v0/%s/%s" % (
            doc['meta']['source_id'], doc_id,))
        try:
            resp = self.http_session.post(
                url, data=json_encoder.encode(doc),
                headers={'Content-type': 'application/json'})
            r = resp.json()
        except Exception as e:
            log.exception('Unexpected NER enrichment error: %s' % (e,))
            # log.exception(resp.content)
            # log.exception(json_encoder.encode(doc))

            # fall back to empty results so the merge below still works
            r = {
                'parties': [], 'politicians': [], 'topics': [], 'sentiment': {}
            }

        # these are plain status messages, not errors
        log.info('NER response:')
        log.info(r)
        log.info('Indexing found topics: %s' % (r.get('topics', []),))
        log.info('Indexing found sentiment: %s' % (r.get('sentiment', {}),))
        return {
            'topics': r.get('topics', []),
            'sentiment': r.get('sentiment', {}),
            'parties': parties + [
                parties2names.get(p['name'], p['name'])
                for p in r.get('parties', []) if p['name'] not in parties
            ],
            'politicians': politicians + [
                u'%s %s' % (p['initials'], p['last_name'],)
                for p in r.get('politicians', [])
                if u'%s %s' % (p['initials'], p['last_name'],) not in politicians
            ],
        }
Example #9
    def load_item(self, doc):
        # Bail out before traversing; an empty document has nothing to index
        if doc == {}:
            log.info('Empty document ....')
            return

        # Recursively index associated models like attachments
        for model in doc.traverse():
            model_body = json_encoder.encode(
                JsonLDSerializer().serialize(model))

            log.debug(
                'ElasticsearchUpdateOnlyLoader indexing document id: %s' %
                model.get_ori_identifier())

            # Update document in the new index
            elasticsearch.update(
                id=model.get_short_identifier(),
                index=self.index_name,
                body={'doc': json.loads(model_body)},
            )
Example #10
    def enrich_item(self, enrichments, object_id, combined_index_doc, doc):
        # log.info('Enrichments data: %s', enrichments)

        # r holds the last binoas response; it is initialized here because it
        # is logged after the loop, which may not run at all
        r = {}
        for item in combined_index_doc.get('item', {}).get('items', []):
            if item.get('@type', 'Note') not in settings.BINOAS_AS2_TYPES:
                # log.info(
                #     'Document %s is not a translatable type (%s)' % (
                #         item.get('@id', '???'), item['@type'],))
                continue

            if 'created' not in item:
                log.info(
                    'Document has no date information, not enriching for binoas'
                )
                return enrichments

            topics = combined_index_doc.get(
                'percolations', {}).get(item.get('@id', ''), [])
            for topic in topics:
                log.info('Getting link obj for %s' % (topic,))
                topic_obj = self.get_percolation(topic)
                log.info(topic_obj['@id'])
                # create the tag list if the item does not have one yet
                item.setdefault('tag', []).append(topic_obj['@id'])

            # log.info('created: %s' % (item['created'],))
            amsterdam_tz = pytz.timezone(settings.BINOAS_TZ)
            current_dt = datetime.datetime.now(tz=amsterdam_tz)
            adjusted_dt = item['created']
            try:
                current_tz = item['created'].tzinfo
            except AttributeError:
                current_tz = None
            if current_tz is not None:
                delay = current_dt - item['created']
            else:
                # adjust for amsterdam time
                if item['created'] is not None:
                    adjusted_dt = iso8601.parse_date(
                        '%s+02:00' % (item['created'].isoformat()))
                else:
                    adjusted_dt = current_dt
                delay = current_dt - adjusted_dt

            # if len(item.get('tag', [])) > 0:
            #     log.info('sending to binoas: ' + str(item))

            log.info('Delay: %s (%s vs %s)' % (delay, current_dt, adjusted_dt))
            if delay.total_seconds() > settings.BINOAS_ALLOWED_DELAY:
                log.info('Document delayed for %s so we have seen it before' %
                         (str(delay), ))
                return enrichments
            if delay.total_seconds() < settings.BINOAS_MINIMUM_DELAY:
                log.info(
                    'Document published too recently (%s) so we will skip it'
                    % (str(delay),))
                return enrichments

            # TODO: figure out how to include classifications here ...

            # check the configuration before building the URL from it
            if settings.BINOAS_BASE_URL is None:
                log.info('Binoas not configured, so skipping')
                return enrichments

            url = '%s/posts/new' % (settings.BINOAS_BASE_URL,)

            resp = None

            try:
                resp = self.http_session.post(
                    url,
                    data=json_encoder.encode({
                        'application': settings.BINOAS_APPLICATION,
                        'payload': item,
                    }))
                r = resp.json()
            except Exception as e:
                log.exception('Unexpected binoas enrichment error: %s' % (e,))
                # resp may be None if the request itself failed
                if resp is not None:
                    log.error(resp.content)
                log.error(doc)
        log.info('binoas result: ' + str(r))
        return enrichments
Example #11
    def enrich_item(self, enrichments, object_id, combined_index_doc, doc):
        enrichments['translations'] = {}
        for item in combined_index_doc.get('item', {}).get('items', []):
            if item.get('@type', 'Note') not in settings.AS2_TRANSLATION_TYPES:
                # log.info(
                #     'Document %s is not a translatable type (%s)' % (
                #         item.get('@id', '???'), item['@type'],))
                continue

            translated = False
            try:
                resp = self.http_session.post('http://*****:*****@id']]
                                                      }
                                                  }
                                              })).json()
                if resp['as:totalItems'] > 0:
                    first_item = resp['as:items'][0]
                    log.info(
                        'Found %s existing document(s) for %s, translated into %s'
                        % (
                            resp['as:totalItems'],
                            item['@id'],
                            first_item['contentMap'].keys(),
                        ))
                    #translated = sorted(first_item['contentMap'].keys()) == sorted(settings.AS2_TRANSLATION_LANGUAGES)
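                    # require every configured target language to be present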
                    translated = set(
                        settings.AS2_TRANSLATION_LANGUAGES).issubset(
                            set(first_item['contentMap'].keys()))
                    if translated:
                        # if the doc is already translated, reuse the
                        # translations we stored before
                        docs = []
                        for fld in ['nameMap', 'contentMap']:
                            if fld in first_item:
                                docs.append({
                                    # FIXME: should copy the source language
                                    # from the doc!
                                    'detectedLanguage': {
                                        'language': first_item.get(
                                            '@language', 'nl'),
                                        'score': 1.0,
                                    },
                                    'translations': [
                                        {'text': v, 'to': k}
                                        for k, v in
                                        first_item[fld].iteritems()
                                    ],
                                })
                        enrichments['translations'][item['@id']] = docs
                else:
                    resp = None
            except Exception as e:
                log.error(e)
                resp = None

            if translated:
                log.info('Document %s was already translated' %
                         (item['@id'], ))
                continue

            log.info('Translating document %s now' % (item['@id'], ))

            # TODO: check if the item exists to prevent unnecessary retranslation of text we already have
            # print >>sys.stderr, item
            docs_for_translation = []
            if item.get('nameMap', {}).get('nl', None) is not None:
                docs_for_translation.append(item['nameMap']['nl'])
            # print >>sys.stderr, "Combined doc before translation: %s" % (combined_index_doc,)
            if item.get('contentMap', {}).get('nl', None) is not None:
                docs_for_translation.append(
                    html_cleanup(item['contentMap']['nl']))
            if len(docs_for_translation) > 0:
                translations = self.translate(
                    docs_for_translation,
                    to_lang=settings.AS2_TRANSLATION_LANGUAGES)
                enrichments['translations'][item['@id']] = translations
                #print >>sys.stderr, "Enrichments: %s" % (enrichments,)

        #log.info(enrichments)
        return enrichments
Example #12
    def enrich_item(self, enrichments, object_id, combined_index_doc, doc):
        # r holds the last binoas response; it is initialized here because it
        # is logged after the loop, which may not run at all
        r = {}
        for item in combined_index_doc.get('item', {}).get('items', []):
            if item.get('@type', 'Note') not in settings.AS2_TRANSLATION_TYPES:
                # log.info(
                #     'Document %s is not a translatable type (%s)' % (
                #         item.get('@id', '???'), item['@type'],))
                continue

            if 'created' not in item:
                log.info(
                    'Document has no date information, not enriching for binoas')
                return enrichments
            log.info('created: %s' % (item['created'],))
            amsterdam_tz = pytz.timezone('Europe/Amsterdam')
            current_dt = datetime.datetime.now(tz=amsterdam_tz)
            adjusted_dt = item['created']
            try:
                current_tz = item['created'].tzinfo
            except AttributeError:
                current_tz = None
            if current_tz is not None:
                delay = current_dt - item['created']
            else:
                # adjust for amsterdam time
                adjusted_dt = iso8601.parse_date('%s+02:00' % (
                    item['created'].isoformat()))
                delay = current_dt - adjusted_dt

            #log.info('Delay: %s (%s vs %s)' % (delay, current_dt, adjusted_dt))
            # anything older than six hours is assumed to have been seen in
            # an earlier run
            if delay.total_seconds() > (6 * 3600.0):
                log.info('Document delayed for %s so we have seen it before' % (
                    str(delay),))
                return enrichments

            translations = enrichments.get(
                'translations', {}).get(item.get('@id', ''), [])
            # one translation result means only the content was translated;
            # two means both the name and the content were
            if len(translations) == 1:
                translation_keys = {0: 'contentMap'}
            elif len(translations) == 2:
                translation_keys = {0: 'nameMap', 1: 'contentMap'}
            else:
                translation_keys = {}
            for t_idx, t_key in translation_keys.iteritems():
                item[t_key] = {
                    x['to']: x['text']
                    for x in translations[t_idx]['translations']}

                # always take the language of the content, since content tends to
                # be longer than the title
                item['@language'] = translations[-1]['detectedLanguage']['language']

            url = 'http://binoas.openstate.eu/posts/new'
            #url = 'http://binoas_app-binoas_1:5000/posts/new'
            r = {}
            resp = None
            log.info('sending to binoas: ' + str(item))
            try:
                resp = self.http_session.post(
                    url, data=json_encoder.encode({
                        'application': 'poliscoops',
                        'payload': item}))
                r = resp.json()
            except Exception as e:
                log.exception('Unexpected binoas enrichment error: %s' % (e,))
                # resp may be None if the request itself failed
                if resp is not None:
                    log.error(resp.content)
                log.error(doc)
        log.info('binoas result: ' + str(r))
        return enrichments