Ejemplo n.º 1
0
 def _read_data(self, spider, request):
     """Return the Solr document stored for ``request``, if one can be read.

     The lookup queries ``self.solr`` with ``id:<urn>``, where the urn is
     ``get_urn(request.url)``, and returns the first entry of the result's
     ``docs``.

     Returns ``None`` when any exception is raised while building the
     query, running the search, or reading the first document.
     """
     try:
         run_query = self.solr.search
         urn_query = 'id:{}'.format(get_urn(request.url))
         return run_query(q=urn_query).docs[0]
     except Exception:
         # Best-effort read: every failure is reported as "nothing found".
         return None
Ejemplo n.º 2
0
    def process_item(self, item, spider):
        """Index ``item`` in Solr and hand the untouched item back.

        A copy of the item is stamped with the current time under
        ``updated``, passed through ``self.map_to_solr_datatypes``, stripped
        of its ``pub_date_dt`` field, given an ``id`` derived from its
        ``url_s`` field, and added to ``self.solr``.

        Returns the original ``item`` so it can continue down the pipeline.
        """
        raw_fields = dict(item)
        raw_fields['updated'] = datetime.now()
        solr_doc = self.map_to_solr_datatypes(data=raw_fields)

        # The document is indexed without the publication-date field.
        solr_doc.pop('pub_date_dt', None)

        solr_doc['id'] = self.clean_str(get_urn(solr_doc['url_s']))
        self.solr.add([solr_doc])
        return item
Ejemplo n.º 3
0
 def store_response(self, spider, request, response):
     """Save ``response`` as a ``WebLink`` document keyed by its URL's urn.

     Stored fields: ``status``, ``domain`` (``get_domain(response.url)``),
     ``url``, ``html``, ``created`` (current time), plus whatever
     ``self._flatten_headers(self._clean_headers(response.headers))``
     returns.

     ``html`` is the escaped text form of the body (the inside of its
     ``bytes`` repr) with ``\\n`` and ``\\t`` escape sequences removed and
     doubled backslashes collapsed to one.
     """
     body_text = str(response.body)
     if isinstance(response.body, bytes):
         # str() of bytes yields b'...' or b"..."; slice off exactly that
         # wrapper. lstrip("b'") / strip("'") treat their argument as a
         # character set, so they also ate leading "b"/quote characters and
         # trailing quotes belonging to the page, and left stray double
         # quotes behind when the repr was double-quoted.
         body_text = body_text[2:-1]
     html = (body_text.replace("\\n", "")
             .replace("\\t", "")
             .replace("\\\\", "\\"))

     data = {
         'status': response.status,
         'domain': get_domain(response.url),
         'url': response.url,
         'html': html,
         'created': datetime.now(),
     }
     data.update(
         self._flatten_headers(self._clean_headers(response.headers)))
     WebLink(meta={'id': get_urn(response.url)}, **data).save()
Ejemplo n.º 4
0
    def store_response(self, spider, request, response):
        """Index ``response`` in Solr under the urn of its URL.

        Collected fields: ``status``, ``domain``
        (``get_domain(response.url)``), ``url``, ``html``, ``created``
        (current time), plus whatever
        ``self._flatten_headers(self._clean_headers(response.headers))``
        returns. The fields are passed through
        ``self.map_to_solr_datatypes`` before the ``id`` is set and the
        document is added to ``self.solr``.

        ``html`` is the escaped text form of the body (the inside of its
        ``bytes`` repr) with ``\\n`` and ``\\t`` escape sequences removed
        and doubled backslashes collapsed to one.
        """
        body_text = str(response.body)
        if isinstance(response.body, bytes):
            # str() of bytes yields b'...' or b"..."; slice off exactly
            # that wrapper. lstrip("b'") / strip("'") treat their argument
            # as a character set, so they also ate leading "b"/quote
            # characters and trailing quotes belonging to the page, and
            # left stray double quotes behind when the repr was
            # double-quoted.
            body_text = body_text[2:-1]
        html = (body_text.replace("\\n", "")
                .replace("\\t", "")
                .replace("\\\\", "\\"))

        data = {
            'status': response.status,
            'domain': get_domain(response.url),
            'url': response.url,
            'html': html,
            'created': datetime.now(),
        }
        data.update(
            self._flatten_headers(self._clean_headers(response.headers)))

        data = self.map_to_solr_datatypes(data=data)
        data['id'] = get_urn(response.url)
        self.solr.add([data])
Ejemplo n.º 5
0
 def process_item(self, item, spider):
     """Save ``item`` as a ``WebLinkExtracted`` document and pass it on.

     A copy of the item's fields is stamped with the current time under
     ``updated`` and saved with ``get_urn`` of its ``url`` field as the
     document id.

     Returns the original ``item`` unchanged.
     """
     fields = dict(item)
     fields.update(updated=datetime.now())
     document = WebLinkExtracted(meta={'id': get_urn(fields['url'])},
                                 **fields)
     document.save()
     return item
Ejemplo n.º 6
0
 def _read_data(self, spider, request):
     """Return the stored ``WebLink`` for ``request`` as a dict, if readable.

     The document is fetched by id, where the id is
     ``get_urn(request.url)``, and converted with ``to_dict()``.

     Returns ``None`` when any exception is raised during the fetch or
     the conversion.
     """
     try:
         fetch = WebLink.get
         stored = fetch(id=get_urn(request.url))
         return stored.to_dict()
     except Exception:
         # Best-effort read: every failure is reported as "nothing found".
         return None