Exemple #1
0
    def _create_resolvable_media_urls(self, doc):
        """Create a resolver document for each media URL of ``doc``.

        Content types listed under ``doc['enrichments']['media_urls']`` are
        copied onto the resolver document whose ``original_url`` matches.
        A ``ConflictError`` raised on creation is logged at debug level and
        otherwise ignored.
        """
        # Collect original_url -> content_type from the enriched media URLs
        enrichments = doc['enrichments']
        enriched_media = ()
        if 'media_urls' in enrichments:
            enriched_media = enrichments['media_urls']
        content_types = dict(
            (enriched['original_url'], enriched['content_type'])
            for enriched in enriched_media
            if 'content_type' in enriched
        )

        # Without media URLs there is nothing to add to RESOLVER_URL_INDEX
        if 'media_urls' not in doc:
            return

        for media in doc['media_urls']:
            # The last path segment of the media URL is the resolver id
            resolver_id = media['url'].rsplit('/', 1)[-1]
            original_url = media['original_url']
            resolver_doc = {'original_url': original_url}
            if original_url in content_types:
                resolver_doc['content_type'] = content_types[original_url]

            try:
                elasticsearch.create(index=settings.RESOLVER_URL_INDEX,
                                     doc_type='url', id=resolver_id,
                                     body=resolver_doc)
            except ConflictError:
                log.debug('Resolver document %s already exists' % resolver_id)
    def load_item(self, object_id, combined_index_doc, doc):
        """Index an item and create resolver documents for its media URLs.

        ``combined_index_doc`` is written to ``settings.COMBINED_INDEX`` and
        ``doc`` to ``self.index_name``, both under ``object_id``. Afterwards
        one resolver document per entry of ``doc['media_urls']`` is created
        in ``settings.RESOLVER_URL_INDEX``; content types found under
        ``doc['enrichments']['media_urls']`` are attached to the resolver
        document with the same ``original_url``.
        """
        log.info('Indexing documents...')
        elasticsearch.index(
            index=settings.COMBINED_INDEX, doc_type='item', id=object_id,
            body=combined_index_doc)

        # Store the item in the source specific index as well
        elasticsearch.index(
            index=self.index_name, doc_type='item', body=doc, id=object_id)

        # Look-up table: original_url -> content_type of enriched media URLs
        content_type_by_url = {}
        enrichments = doc['enrichments']
        if 'media_urls' in enrichments:
            for enriched in enrichments['media_urls']:
                if 'content_type' not in enriched:
                    continue
                content_type_by_url[enriched['original_url']] = \
                    enriched['content_type']

        # No media URLs means no resolver documents have to be created
        if 'media_urls' not in doc:
            return

        for media in doc['media_urls']:
            # The trailing path segment of the URL serves as document id
            resolver_id = media['url'].split('/')[-1]
            original_url = media['original_url']

            resolver_doc = {'original_url': original_url}
            if original_url in content_type_by_url:
                resolver_doc['content_type'] = \
                    content_type_by_url[original_url]

            try:
                elasticsearch.create(
                    index=settings.RESOLVER_URL_INDEX, doc_type='url',
                    id=resolver_id, body=resolver_doc)
            except ConflictError:
                log.debug('Resolver document %s already exists' % resolver_id)
Exemple #3
0
    def load_item(self, object_id, combined_index_doc, doc):
        """Index an item and create resolver documents for its media URLs.

        ``combined_index_doc`` goes to ``settings.COMBINED_INDEX``; ``doc``
        goes to the index named ``<DEFAULT_INDEX_PREFIX>_<source id>``. Both
        are stored under ``object_id``. For each entry of
        ``doc['media_urls']`` a resolver document holding the original URL
        is created in ``settings.RESOLVER_URL_INDEX``.
        """
        log.info('Indexing documents...')
        elasticsearch.index(index=settings.COMBINED_INDEX, doc_type='item',
                            id=object_id, body=combined_index_doc)

        # Name of the index that belongs to this source
        source_index = '%s_%s' % (settings.DEFAULT_INDEX_PREFIX,
                                  self.source_definition['id'])
        elasticsearch.index(index=source_index, doc_type='item',
                            id=object_id, body=doc)

        # Items without media URLs need no resolver documents
        if 'media_urls' not in doc:
            return

        for media in doc['media_urls']:
            # The last path segment of the media URL is the resolver id
            resolver_id = media['url'].rsplit('/', 1)[-1]
            resolver_doc = dict(original_url=media['original_url'])

            try:
                elasticsearch.create(index=settings.RESOLVER_URL_INDEX,
                                     doc_type='url', id=resolver_id,
                                     body=resolver_doc)
            except ConflictError:
                log.debug('Resolver document %s already exists' % resolver_id)
Exemple #4
0
    def load_item(self, combined_object_id, object_id, combined_index_doc,
                  doc):
        """Index an item and create resolver documents for its media URLs.

        ``combined_index_doc`` is stored in ``self.combined_index_name``
        under ``combined_object_id`` and ``doc`` in ``self.index_name`` under
        ``object_id``. The document type of each is obtained from
        ``self._get_doc_type``. Afterwards a resolver document is created in
        ``settings.RESOLVER_URL_INDEX`` for every entry of
        ``doc['media_urls']``, enriched with a content type when
        ``doc['enrichments']['media_urls']`` provides one for the same
        ``original_url``.
        """
        log.info('Indexing documents...')
        combined_doc_type = self._get_doc_type(combined_index_doc,
                                               self.doc_type)
        elasticsearch.index(
            index=self.combined_index_name, doc_type=combined_doc_type,
            id=combined_object_id, body=combined_index_doc)

        # Store the item in the source specific index as well
        item_doc_type = self._get_doc_type(doc, self.doc_type)
        elasticsearch.index(
            index=self.index_name, doc_type=item_doc_type, body=doc,
            id=object_id)

        # Look-up table: original_url -> content_type of enriched media URLs
        enrichments = doc['enrichments']
        enriched_media = []
        if 'media_urls' in enrichments:
            enriched_media = enrichments['media_urls']
        content_types = dict(
            (enriched['original_url'], enriched['content_type'])
            for enriched in enriched_media
            if 'content_type' in enriched
        )

        # No media URLs means no resolver documents have to be created
        if 'media_urls' not in doc:
            return

        for media in doc['media_urls']:
            # The trailing path segment of the URL serves as document id
            resolver_id = media['url'].split('/')[-1]
            original_url = media['original_url']

            resolver_doc = {'original_url': original_url}
            if original_url in content_types:
                resolver_doc['content_type'] = content_types[original_url]

            try:
                elasticsearch.create(
                    index=settings.RESOLVER_URL_INDEX, doc_type='url',
                    id=resolver_id, body=resolver_doc)
            except ConflictError:
                log.debug('Resolver document %s already exists' % resolver_id)
Exemple #5
0
    def load_item(self, object_id, combined_index_doc, doc):
        """Index an item and create resolver documents for its media URLs.

        Stores ``combined_index_doc`` in ``settings.COMBINED_INDEX`` and
        ``doc`` in the index named ``<DEFAULT_INDEX_PREFIX>_<source id>``,
        both under ``object_id``. Then creates, for each entry of
        ``doc['media_urls']``, a resolver document with the original URL in
        ``settings.RESOLVER_URL_INDEX``.
        """
        log.info('Indexing documents...')
        elasticsearch.index(
            index=settings.COMBINED_INDEX,
            doc_type='item',
            id=object_id,
            body=combined_index_doc)

        # Name of the index that belongs to this source
        index_name = '%s_%s' % (
            settings.DEFAULT_INDEX_PREFIX, self.source_definition['id'])
        elasticsearch.index(
            index=index_name,
            doc_type='item',
            id=object_id,
            body=doc)

        # Items without media URLs need no resolver documents
        if 'media_urls' not in doc:
            return

        for entry in doc['media_urls']:
            # The last path segment of the media URL is the resolver id
            resolver_id = entry['url'].split('/')[-1]
            resolver_doc = {'original_url': entry['original_url']}

            try:
                elasticsearch.create(
                    index=settings.RESOLVER_URL_INDEX,
                    doc_type='url',
                    id=resolver_id,
                    body=resolver_doc)
            except ConflictError:
                log.debug('Resolver document %s already exists' % resolver_id)
Exemple #6
0
def create_queries(mapping_dir, doc_type, index_name):
    """
    Create queries for which a json file is available.

    Every ``*.json`` file found directly in ``mapping_dir`` is loaded and
    created as a document of type ``doc_type`` in ``index_name``. The part of
    the file name before its first dot is used as the document id. Progress
    and errors are reported with ``click.echo``; an already existing query or
    a request error for one file does not stop the remaining files from
    being processed.
    """
    click.echo(
        'Creating queries for ES queries in index %s (%s) (doc type: %s)' % (
            index_name,
            mapping_dir,
            doc_type,
        ))

    try:
        es.indices.create(index=index_name)  # use template
    except Exception:
        # Deliberately best effort: a failure here (presumably because the
        # index already exists) must not prevent the queries from being
        # created below.
        pass

    for mapping_file_path in glob('%s/*.json' % mapping_dir):
        # Extract the query id from the filename
        query_id = os.path.basename(mapping_file_path).split('.')[0]
        click.echo('Creating ES query %s' % query_id)

        # The context manager closes the file even when the JSON is invalid
        with open(mapping_file_path, 'rb') as mapping_file:
            mapping = json.load(mapping_file)

        try:
            r = es.create(index=index_name,
                          doc_type=doc_type,
                          body=mapping,
                          id=query_id)
            click.echo('Query %s was %s' % (
                query_id,
                r['result'],
            ))
        except ConflictError:
            click.echo('Query already existed')
        except RequestError as e:
            error_msg = click.style('Failed to create query %s due to ES '
                                    'error: %s' % (query_id, e.error),
                                    fg='red')
            click.echo(error_msg)
            click.echo(e)