Example #1
0
def provider_map(delete=False):
    ''' Adds favicons and metadata for harvesters to Elasticsearch

    For every harvester in the registry whose lookup in the
    ``share_providers`` index comes back empty, the favicon is read from
    ``img/favicons/<short_name>_favicon.ico``, base64-encoded, URL-quoted and
    indexed together with the harvester's short name, long name and url.
    The document count of the index is printed at the end.

    :param delete: when true, delete the ``share_providers`` index before
        indexing (a 404 from Elasticsearch is ignored).
    '''
    import base64
    from six.moves.urllib import parse as urllib_parse
    from scrapi import registry
    from scrapi.base.helpers import null_on_error
    from scrapi.processing.elasticsearch import DatabaseManager

    # base64.encodestring was removed in Python 3.9; encodebytes is the same
    # encoder under its Python 3 name. Python 2 only has encodestring.
    encode_base64 = getattr(base64, 'encodebytes', None) or base64.encodestring

    dm = DatabaseManager()
    dm.setup()
    es = dm.es
    if delete:
        es.indices.delete(index='share_providers', ignore=[404])
    from scrapi.harvesters.push_api import gen_harvesters
    gen_harvesters()

    # Presumably null_on_error turns a failed lookup (e.g. a missing document)
    # into a falsy result instead of an exception -- confirm against the helper.
    safe_get = null_on_error(es.get, log=False)

    for harvester_name, harvester in registry.items():
        already_indexed = safe_get(
            index='share_providers',
            doc_type=harvester_name,
            id=harvester_name
        )
        if already_indexed:
            continue

        favicon_path = "img/favicons/{}_favicon.ico".format(harvester.short_name)
        with open(favicon_path, "rb") as f:
            favicon = urllib_parse.quote(encode_base64(f.read()))

        es.index(
            'share_providers',
            harvester.short_name,
            body={
                'favicon': 'data:image/png;base64,' + favicon,
                'short_name': harvester.short_name,
                'long_name': harvester.long_name,
                'url': harvester.url
            },
            id=harvester.short_name,
            # refresh so the final count below already sees this document
            refresh=True
        )
    print(es.count('share_providers', body={'query': {'match_all': {}}})['count'])
Example #2
0
 def format_property(self, property):
     """Return the ``(name, (xpath, xpath, transform))`` entry for ``property``.

     The two XPath strings look the property up under the ``dc`` and ``ns0``
     prefixes. For every property except ``"date"`` the transform is
     ``self.resolve_property`` itself; for ``"date"`` the resolver is chained
     through ``coerce_to_list`` and a per-item ``datetime_formatter`` wrapped
     in ``null_on_error`` (presumably so one unparseable date becomes a null
     instead of raising -- confirm against the helper).
     """
     resolve = self.resolve_property
     if property == "date":

         def format_each(values):
             safe_format = null_on_error(datetime_formatter)
             return [safe_format(value) for value in values]

         # compose() is assumed to apply its arguments right-to-left.
         resolve = compose(format_each, coerce_to_list, resolve)

     dc_xpath = "//dc:{}/node()".format(property)
     ns0_xpath = "//ns0:{}/node()".format(property)
     return (property, (dc_xpath, ns0_xpath, resolve))
Example #3
0
 def format_property(self, property):
     '''Build the schema tuple for one property.

     Returns ``(property, (dc_xpath, ns0_xpath, fn))``. ``fn`` is
     ``self.resolve_property`` for ordinary properties. For ``'date'`` it is
     a composition of ``self.resolve_property``, ``coerce_to_list`` and a
     step that runs ``null_on_error(datetime_formatter)`` over each item and
     collects the results in a list.
     '''
     if property != 'date':
         fn = self.resolve_property
     else:
         def parse_all(raw_values):
             parse_one = null_on_error(datetime_formatter)
             parsed = []
             for raw in raw_values:
                 parsed.append(parse_one(raw))
             return parsed

         fn = compose(parse_all, coerce_to_list, self.resolve_property)

     lookups = (
         '//dc:{}/node()'.format(property),
         '//ns0:{}/node()'.format(property),
         fn,
     )
     return (property, lookups)
Example #4
0
 def format_property(self, property):
     '''Return the ``(property, (dc_xpath, ns0_xpath, fn))`` schema entry.

     ``fn`` is ``self.resolve_property`` for every property except
     ``'date'``. For ``'date'`` the resolver is composed with
     ``coerce_to_list`` and a step that applies
     ``null_on_error(date_formatter)`` to each item.
     '''
     if property == 'date':
         # list(...) is required: on Python 3 a bare map() is a lazy, one-shot
         # iterator, so callers would not receive an actual list of dates.
         fn = compose(
             lambda x: list(map(null_on_error(date_formatter), x)),
             coerce_to_list,
             self.resolve_property
         )
     else:
         fn = self.resolve_property
     return (property, (
         '//dc:{}/node()'.format(property),
         '//ns0:{}/node()'.format(property),
         fn)
     )
Example #5
0
def normalize(raw_doc, harvester_name):
    """Normalize ``raw_doc`` with the harvester registered as ``harvester_name``.

    The harvester's ``normalize`` is called through ``null_on_error``. A falsy
    result raises ``events.Skip``; otherwise the result gets a ``timestamps``
    entry from ``util.stamp_from_raw`` (which receives the time normalization
    started) and is returned as a single normalized document.
    """
    started_at = timestamp()
    safe_normalize = null_on_error(registry[harvester_name].normalize)
    document = safe_normalize(raw_doc)

    if document:
        document['timestamps'] = util.stamp_from_raw(raw_doc, normalizeStarted=started_at)
        return document

    raise events.Skip('Did not normalize document with id {}'.format(raw_doc['docID']))
Example #6
0
def normalize(raw_doc, harvester_name):
    """Run one raw document through its harvester's normalizer.

    :param raw_doc: the raw document; its ``'docID'`` is used in the skip
        message and it is passed on to ``util.stamp_from_raw``.
    :param harvester_name: key of the harvester in ``registry``.
    :returns: a single normalized document with ``'timestamps'`` set.
    :raises events.Skip: when the (error-suppressed) normalizer returns a
        falsy value.
    """
    begun = timestamp()
    harvester = registry[harvester_name]
    result = null_on_error(harvester.normalize)(raw_doc)

    if not result:
        message = 'Did not normalize document with id {}'.format(
            raw_doc['docID'])
        raise events.Skip(message)

    stamps = util.stamp_from_raw(raw_doc, normalizeStarted=begun)
    result['timestamps'] = stamps
    return result