def provider_map(delete=False):
    ''' Adds favicons and metadata for harvesters to Elasticsearch

    For every ``(name, harvester)`` pair in ``scrapi.registry`` that does not
    already have a document in the ``share_providers`` index, reads
    ``img/favicons/<short_name>_favicon.ico``, base64-encodes and URL-quotes
    it, and indexes it together with the harvester's ``short_name``,
    ``long_name`` and ``url``. Finally prints the number of documents in the
    index.

    :param delete: when truthy, delete the ``share_providers`` index first
        (a 404 from Elasticsearch is ignored).
    '''
    # Imported locally, like the other dependencies of this task.
    import base64

    from six.moves.urllib import parse as urllib_parse
    from scrapi import registry
    from scrapi.base.helpers import null_on_error
    from scrapi.processing.elasticsearch import DatabaseManager

    # ``base64.encodestring`` was removed in Python 3.9. ``encodebytes`` is
    # the same function under its Python 3 name; Python 2 only provides
    # ``encodestring``, so fall back to it there.
    encode_base64 = getattr(base64, 'encodebytes', None) or base64.encodestring

    dm = DatabaseManager()
    dm.setup()
    es = dm.es
    if delete:
        es.indices.delete(index='share_providers', ignore=[404])

    from scrapi.harvesters.push_api import gen_harvesters
    gen_harvesters()

    for harvester_name, harvester in registry.items():
        # null_on_error presumably turns a failed lookup into a falsy result,
        # so a missing document means "not indexed yet" -- confirm in helpers.
        # NOTE(review): the lookup uses the registry key while indexing below
        # uses harvester.short_name; verify these are always the same value.
        already_indexed = null_on_error(es.get, log=False)(
            index='share_providers', doc_type=harvester_name, id=harvester_name)
        if already_indexed:
            continue

        favicon_path = "img/favicons/{}_favicon.ico".format(harvester.short_name)
        with open(favicon_path, "rb") as f:
            favicon = urllib_parse.quote(encode_base64(f.read()))

        es.index(
            'share_providers',
            harvester.short_name,
            body={
                'favicon': 'data:image/png;base64,' + favicon,
                'short_name': harvester.short_name,
                'long_name': harvester.long_name,
                'url': harvester.url
            },
            id=harvester.short_name,
            refresh=True
        )

    print(es.count('share_providers', body={'query': {'match_all': {}}})['count'])
def format_property(self, property):
    """Build the ``(name, (xpath, xpath, transform))`` entry for ``property``.

    The two XPath strings select the child nodes of ``//dc:<property>`` and
    ``//ns0:<property>``. The transform is ``self.resolve_property``, except
    for the ``"date"`` property, where it is
    ``compose(<format each item>, coerce_to_list, self.resolve_property)``
    and the first step maps ``null_on_error(datetime_formatter)`` over the
    values and returns them as a list.
    """
    resolver = self.resolve_property

    if property == "date":
        def format_each(values):
            # Wrap the formatter once per call, then apply it to every value.
            safe_formatter = null_on_error(datetime_formatter)
            return [safe_formatter(value) for value in values]

        resolver = compose(format_each, coerce_to_list, resolver)

    dc_xpath = "//dc:{}/node()".format(property)
    ns0_xpath = "//ns0:{}/node()".format(property)
    return (property, (dc_xpath, ns0_xpath, resolver))
def format_property(self, property):
    '''Return the schema tuple ``(property, (dc_xpath, ns0_xpath, fn))``.

    ``dc_xpath`` and ``ns0_xpath`` address the child nodes of the
    ``dc:``- and ``ns0:``-prefixed element named ``property``. ``fn`` is
    ``self.resolve_property`` for every property except ``'date'``; for
    ``'date'`` it is built with ``compose`` from a list-producing step that
    applies ``null_on_error(datetime_formatter)`` to each item,
    ``coerce_to_list`` and ``self.resolve_property``.
    '''
    def format_dates(items):
        formatter = null_on_error(datetime_formatter)
        return [formatter(item) for item in items]

    if property != 'date':
        transform = self.resolve_property
    else:
        transform = compose(format_dates, coerce_to_list, self.resolve_property)

    xpaths = (
        '//dc:{}/node()'.format(property),
        '//ns0:{}/node()'.format(property),
    )
    return (property, xpaths + (transform,))
def format_property(self, property):
    '''Return the schema tuple ``(property, (dc_xpath, ns0_xpath, fn))``.

    ``dc_xpath`` and ``ns0_xpath`` select the child nodes of
    ``//dc:<property>`` and ``//ns0:<property>``. ``fn`` is
    ``self.resolve_property``; for the ``'date'`` property it is instead
    ``compose(<format each item>, coerce_to_list, self.resolve_property)``,
    where the first step applies ``null_on_error(date_formatter)`` to every
    value.
    '''
    if property == 'date':
        # Materialise the result: on Python 3 a bare ``map`` is a lazy,
        # one-shot iterator rather than the list it is on Python 2.
        fn = compose(
            lambda x: list(map(null_on_error(date_formatter), x)),
            coerce_to_list,
            self.resolve_property
        )
    else:
        fn = self.resolve_property

    return (property, (
        '//dc:{}/node()'.format(property),
        '//ns0:{}/node()'.format(property),
        fn
    ))
def normalize(raw_doc, harvester_name):
    """Normalize ``raw_doc`` with the harvester registered as ``harvester_name``.

    Returns a single normalized document, with its ``'timestamps'`` entry set
    from ``util.stamp_from_raw`` (passing the time normalization started as
    ``normalizeStarted``).

    Raises ``events.Skip`` when the harvester's ``normalize`` yields a falsy
    result. The call goes through ``null_on_error``, which presumably also
    turns an exception into such a result -- confirm in helpers.
    """
    started_at = timestamp()
    harvester = registry[harvester_name]
    safe_normalize = null_on_error(harvester.normalize)
    document = safe_normalize(raw_doc)

    if document:
        document['timestamps'] = util.stamp_from_raw(raw_doc, normalizeStarted=started_at)
        return document

    raise events.Skip('Did not normalize document with id {}'.format(raw_doc['docID']))
def normalize(raw_doc, harvester_name):
    """Run the named harvester's ``normalize`` on ``raw_doc``.

    The harvester is looked up in ``registry`` and its ``normalize`` is called
    through ``null_on_error``. A falsy result raises ``events.Skip`` naming
    ``raw_doc['docID']``. Otherwise the result gets a ``'timestamps'`` entry
    from ``util.stamp_from_raw`` and is returned as a single normalized
    document.
    """
    start = timestamp()
    result = null_on_error(registry[harvester_name].normalize)(raw_doc)

    if not result:
        skip_message = 'Did not normalize document with id {}'.format(
            raw_doc['docID'])
        raise events.Skip(skip_message)

    stamps = util.stamp_from_raw(raw_doc, normalizeStarted=start)
    result['timestamps'] = stamps
    return result