def start_requests(self):
    for url_id, raw in self.url_tuples:
        purl = get_parsed_url(raw)
        status_code = None
        if purl is None or purl.hostname is None:
            status_code = U_HTML_ERROR_INVALID_URL
            logger.debug('Invalid URL %r', raw)
        elif is_home_url(purl.path) is True:
            status_code = U_HTML_ERROR_HOME_URL
            logger.debug('Ignoring home-page URL %r', raw)
        elif belongs_to_domain(purl.hostname, self.excluded_domains) is not None:
            status_code = U_HTML_ERROR_EXCLUDED_DOMAIN
            logger.debug('Ignoring excluded domain %r', raw)
        else:
            try:
                yield scrapy.Request(
                    raw,
                    callback=self.parse,
                    meta=dict(url_id=url_id, raw=raw),
                    errback=self.errback_request)
            except Exception as e:
                logger.error('Error when sending request %r: %s', raw, e)
                status_code = U_HTML_ERROR_INVALID_URL
        # when an error happens, update the URL's status_code
        if status_code is not None:
            try:
                self.session.query(Url).filter_by(id=url_id)\
                    .update(dict(status_code=status_code),
                            synchronize_session=False)
                self.session.commit()
            except SQLAlchemyError as e:
                logger.error('Error when updating url status_code: %s', e)
                self.session.rollback()
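
# The helpers `get_parsed_url` and `is_home_url` used above are defined
# elsewhere in the project. The following is a minimal, hypothetical sketch
# of what they might look like, assuming `get_parsed_url` is a safe wrapper
# around `urllib.parse.urlparse` and `is_home_url` treats an empty or root
# path as a home page. Names and semantics here are assumptions, not the
# project's actual implementation.
from urllib.parse import urlparse


def get_parsed_url(raw):
    """Parse `raw` into a ParseResult, returning None on failure (assumed)."""
    try:
        return urlparse(raw)
    except ValueError:
        return None


def is_home_url(path):
    """Return True if `path` points at a site's home page (assumed)."""
    return path is None or path.strip('/') == ''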

def process_item(self, item, spider):
    """Main function that processes a URL item (second phase)."""
    # Canonicalize the expanded URL without considering the status_code:
    # Scrapy crawling does not guarantee success, but we still try to
    # canonicalize the URL.
    if len(item['expanded']) > MAX_URL_LEN:
        item['expanded'] = item['expanded'][:MAX_URL_LEN]
        logger.error('Expanded URL too long, truncating it! %r', item['raw'])
    item['canonical'] = canonicalize(item['expanded'])
    if item['canonical'] is None:
        item['status_code'] = U_HTML_ERROR_INVALID_URL
    # If the URL could be canonicalized and site_id is not yet determined,
    # infer it from the expanded URL.
    if item['status_code'] != U_HTML_ERROR_INVALID_URL\
            and item.get('site_id', None) is None:
        purl = get_parsed_url(item['expanded'])
        if purl is not None and purl.hostname is not None:
            if belongs_to_domain(purl.hostname,
                                 spider.excluded_domains) is not None:
                item['status_code'] = U_HTML_ERROR_EXCLUDED_DOMAIN
            else:
                item['site_id'] = belongs_to_site(purl.hostname,
                                                  self.site_tuples)
        else:
            item['status_code'] = U_HTML_ERROR_INVALID_URL
    # Remove potential NUL bytes (\x00) from the HTML.
    if 'html' in item:
        item['html'] = item['html'].replace(b'\x00', b'')
    try:
        # Update the url table in the database.
        spider.session.query(Url).filter_by(id=item['id'])\
            .update(dict(item), synchronize_session=False)
        spider.session.commit()
        logger.debug('Fetched HTML of url %r with status %i',
                     item['raw'], item['status_code'])
    except SQLAlchemyError as e:
        logger.error(e)
        spider.session.rollback()
        raise DropItem('Failed to update database for URL: %s' % item)
    return item
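
# `canonicalize` is defined elsewhere; the sketch below is one plausible
# implementation, assuming it normalizes the URL (case of scheme/host,
# sorted query arguments, percent-encoding) and returns None for input
# that cannot be parsed. It leans on `canonicalize_url` from w3lib, a
# Scrapy dependency; the wrapper itself is hypothetical.
from w3lib.url import canonicalize_url


def canonicalize(expanded):
    """Return a canonical form of `expanded`, or None if it cannot be
    canonicalized (hypothetical wrapper, not the project's actual code)."""
    try:
        return canonicalize_url(expanded)
    except Exception:
        return None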

def process_item(self, item, spider):
    """Main function that processes a URL item (first phase)."""
    # Validate URL length.
    if len(item['raw']) > MAX_URL_LEN:
        item['raw'] = item['raw'][:MAX_URL_LEN]
        logger.error('Raw URL too long, truncating it! %r', item['raw'])
    # Parse the raw URL.
    purl = get_parsed_url(item['raw'])
    if purl is None or purl.hostname is None:
        raise DropItem('Invalid URL: %s' % item['raw'])
    site_id = belongs_to_site(purl.hostname, self.site_tuples)
    if site_id is None:
        raise DropItem('Offsite domain: %s' % item)
    item['site_id'] = site_id
    # Insert the URL into the table.
    try:
        get_or_create_murl(spider.session, item, spider.platform_id)
    except SQLAlchemyError as e:
        logger.error(e)
        spider.session.rollback()
        raise DropItem('Failed to insert URL into database: %s' % item)
    return item
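
# `belongs_to_site` and `belongs_to_domain` are defined elsewhere; below is
# a minimal sketch of the presumed matching logic, assuming `site_tuples`
# is an iterable of (site_id, domain) pairs and that a hostname matches a
# domain either exactly or as a dot-boundary suffix (e.g.
# 'news.example.com' matches 'example.com'). This is an illustration, not
# the project's actual implementation.
def belongs_to_domain(hostname, domains):
    """Return the first domain in `domains` that `hostname` falls under,
    or None if there is no match (assumed semantics)."""
    for domain in domains:
        if hostname == domain or hostname.endswith('.' + domain):
            return domain
    return None


def belongs_to_site(hostname, site_tuples):
    """Return the site_id whose domain covers `hostname`, or None if no
    site matches (assumed semantics)."""
    for site_id, domain in site_tuples:
        if hostname == domain or hostname.endswith('.' + domain):
            return site_id
    return None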