def _read_data(self, spider, request):
    """Look up a previously stored Solr document for *request*'s URL.

    Returns the cached document dict, or ``None`` when the URL has not
    been stored yet or Solr cannot be reached (best-effort cache read).
    """
    try:
        result = self.solr.search(q='id:{}'.format(get_urn(request.url)))
        # An empty result set is the expected "cache miss" case — test for
        # it directly instead of letting docs[0] raise IndexError.
        if result.docs:
            return result.docs[0]
        return None
    except Exception:
        # Best-effort: any Solr/connection failure is treated as a miss so
        # the crawl can proceed without the cache.
        return None
def process_item(self, item, spider):
    """Index a scraped item into Solr.

    Stamps the item with an ``updated`` timestamp, maps its fields to
    Solr dynamic-field types, drops the ``pub_date_dt`` field (not
    indexed), derives the document id from the item's URL, and adds the
    document to Solr. Returns the item unchanged for downstream pipelines.
    """
    data = dict(item)
    data['updated'] = datetime.now()
    data = self.map_to_solr_datatypes(data=data)
    # pub_date_dt is deliberately excluded from the indexed document;
    # pop(..., None) replaces the old `if key in data: del data[key]` pair
    # (and the leftover debug print has been removed).
    data.pop('pub_date_dt', None)
    data['id'] = self.clean_str(get_urn(data['url_s']))
    self.solr.add([data])
    return item
def store_response(self, spider, request, response):
    """Persist the raw crawl response as a ``WebLink`` document in
    Elasticsearch, keyed by the URN of the response URL.

    Stores status, domain, URL, flattened/cleaned response headers, a
    created timestamp, and the HTML body with newlines/tabs stripped.
    """
    # BUG FIX: the old code did str(response.body).lstrip("b'") — but
    # str() on bytes yields the "b'...'" repr, and lstrip strips a
    # *character set* (any leading 'b' or "'"), not a prefix, so genuine
    # leading characters of the HTML could be eaten. Decode the bytes
    # instead; then \n/\t are real characters, so no repr un-escaping
    # ("\\\\" -> "\\") is needed.
    html = response.body.decode('utf-8', errors='replace')
    data = {
        'status': response.status,
        'domain': get_domain(response.url),
        'url': response.url,
        'html': html.replace('\n', '').replace('\t', ''),
        'created': datetime.now()
    }
    data.update(
        self._flatten_headers(self._clean_headers(response.headers)))
    WebLink(meta={'id': get_urn(response.url)}, **data).save()
def store_response(self, spider, request, response):
    """Persist the raw crawl response to Solr, keyed by the URN of the
    response URL.

    Stores status, domain, URL, flattened/cleaned response headers, a
    created timestamp, and the HTML body with newlines/tabs stripped,
    after mapping fields to Solr dynamic-field types.
    """
    # BUG FIX: the old code did str(response.body).lstrip("b'") — but
    # str() on bytes yields the "b'...'" repr, and lstrip strips a
    # *character set* (any leading 'b' or "'"), not a prefix, so genuine
    # leading characters of the HTML could be eaten. Decode the bytes
    # instead; then \n/\t are real characters, so no repr un-escaping
    # ("\\\\" -> "\\") is needed.
    html = response.body.decode('utf-8', errors='replace')
    data = {
        'status': response.status,
        'domain': get_domain(response.url),
        'url': response.url,
        'html': html.replace('\n', '').replace('\t', ''),
        'created': datetime.now()
    }
    data.update(
        self._flatten_headers(self._clean_headers(response.headers)))
    data = self.map_to_solr_datatypes(data=data)
    data['id'] = get_urn(response.url)
    self.solr.add([data])
def process_item(self, item, spider):
    """Save a scraped item to Elasticsearch as a ``WebLinkExtracted``
    document, stamped with the current time and keyed by the URN of the
    item's URL. Returns the item for any downstream pipelines.
    """
    doc = dict(item)
    doc['updated'] = datetime.now()
    urn = get_urn(doc['url'])
    record = WebLinkExtracted(meta={'id': urn}, **doc)
    record.save()
    return item
def _read_data(self, spider, request):
    """Fetch the cached ``WebLink`` document for *request*'s URL from
    Elasticsearch, as a plain dict.

    Best-effort: any failure (document missing, connection error, ...)
    yields ``None`` so the caller can treat it as a cache miss.
    """
    urn = get_urn(request.url)
    try:
        hit = WebLink.get(id=urn)
    except Exception:
        return None
    return hit.to_dict()