Example #1
0
 def start_requests(self):
     """Yield a request for every crawlable URL in ``self.url_tuples``.

     Each ``(url_id, raw)`` pair is screened before a request is built.
     URLs that cannot be parsed (or have no hostname), that
     ``is_home_url`` flags, or whose hostname matches
     ``self.excluded_domains`` are not requested; the corresponding error
     status code is written to the matching ``Url`` row instead. The same
     happens when ``scrapy.Request`` itself rejects the URL.

     Yields:
         scrapy.Request: a request whose callback is ``self.parse`` and
         whose errback is ``self.errback_request``, carrying ``url_id``
         and ``raw`` in ``meta``.
     """
     for url_id, raw in self.url_tuples:
         purl = get_parsed_url(raw)
         status_code = None
         request = None
         if purl is None or purl.hostname is None:
             status_code = U_HTML_ERROR_INVALID_URL
             logger.debug('Invalide url %r', raw)
         elif is_home_url(purl.path) is True:
             status_code = U_HTML_ERROR_HOME_URL
             logger.debug('Ignore home kind url %r', raw)
         elif belongs_to_domain(purl.hostname,
                                self.excluded_domains) is not None:
             status_code = U_HTML_ERROR_EXCLUDED_DOMAIN
             logger.debug('Ignore excluded domain %r', raw)
         else:
             # Only the construction of the request is guarded: this is
             # the step that can reject a malformed URL.
             try:
                 request = scrapy.Request(
                     raw,
                     callback=self.parse,
                     meta=dict(url_id=url_id, raw=raw),
                     errback=self.errback_request)
             except Exception as e:
                 logger.error('Error when sending request %r: %s', raw, e)
                 status_code = U_HTML_ERROR_INVALID_URL
         if request is not None:
             # Yield outside the try block so that an exception thrown
             # into the generator by its consumer is not swallowed and
             # misreported as an invalid URL.
             yield request
         # When an error happened, record the status_code of the URL.
         if status_code is not None:
             try:
                 self.session.query(Url).filter_by(id=url_id)\
                     .update(dict(status_code=status_code),
                             synchronize_session=False)
                 self.session.commit()
             except SQLAlchemyError as e:
                 logger.error('Error when update url status_code: %s', e)
                 self.session.rollback()
Example #2
0
    def process_item(self, item, spider):
        """Canonicalize a fetched URL item and save it (second phase).

        The expanded URL is truncated to ``MAX_URL_LEN`` if needed and
        canonicalized. When the item has no ``site_id`` yet, it is
        inferred from the hostname of the expanded URL, unless that
        hostname belongs to ``spider.excluded_domains``. NUL characters
        are stripped from the HTML, and the ``Url`` row whose id is
        ``item['id']`` is then updated with all fields of the item.

        Args:
            item: dict-like URL item; ``expanded``, ``status_code``,
                ``raw`` and ``id`` are read, ``html`` and ``site_id``
                are optional.
            spider: the running spider, providing ``session`` and
                ``excluded_domains``.

        Returns:
            The (possibly modified) item.

        Raises:
            DropItem: if the database update fails; the session is
                rolled back first.
        """
        # Canonicalize the expanded URL regardless of the status_code:
        # crawling is not guaranteed to succeed, but we still try to
        # canonicalize the URL.
        if len(item['expanded']) > MAX_URL_LEN:
            item['expanded'] = item['expanded'][:MAX_URL_LEN]
            logger.error('Expanded URL too long, trucate it! %r', item['raw'])
        item['canonical'] = canonicalize(item['expanded'])
        if item['canonical'] is None:
            item['status_code'] = U_HTML_ERROR_INVALID_URL

        # If the URL could be canonicalized and site_id is not determined
        # yet, infer it from the expanded URL.
        if item['status_code'] != U_HTML_ERROR_INVALID_URL\
                and item.get('site_id', None) is None:
            purl = get_parsed_url(item['expanded'])
            if purl is not None and purl.hostname is not None:
                if belongs_to_domain(purl.hostname, spider.excluded_domains)\
                        is not None:
                    item['status_code'] = U_HTML_ERROR_EXCLUDED_DOMAIN
                else:
                    item['site_id'] = belongs_to_site(purl.hostname,
                                                      self.site_tuples)
            else:
                item['status_code'] = U_HTML_ERROR_INVALID_URL
        # Remove potential NUL characters (\x00) from the HTML. The
        # pattern and the replacement must have the same type as the
        # HTML itself; mixing bytes and text raises TypeError on Python 3.
        if 'html' in item:
            html = item['html']
            if isinstance(html, bytes):
                item['html'] = html.replace(b'\x00', b'')
            elif html is not None:
                item['html'] = html.replace(u'\x00', u'')
        try:
            # Update the row of the url table.
            spider.session.query(Url).filter_by(id=item['id'])\
                .update(dict(item), synchronize_session=False)
            spider.session.commit()
            logger.debug('Fetched html of url %r with status %i', item['raw'],
                         item['status_code'])
        except SQLAlchemyError as e:
            logger.error(e)
            spider.session.rollback()
            # Format the message explicitly: exception constructors do
            # not interpolate extra arguments the way logging calls do.
            raise DropItem('Fail to update database of url: %s' % (item,))
        return item
Example #3
0
 def process_item(self, item, spider):
     """Validate a raw URL item and insert it into the database (first phase).

     The raw URL is truncated to ``MAX_URL_LEN`` if needed and parsed.
     The item is kept only when its hostname matches one of
     ``self.site_tuples``; the matching site id is stored in
     ``item['site_id']`` before the item is handed to
     ``get_or_create_murl``.

     Args:
         item: dict-like URL item; ``raw`` is read and ``site_id`` is set.
         spider: the running spider, providing ``session`` and
             ``platform_id``.

     Returns:
         The item, with ``site_id`` filled in.

     Raises:
         DropItem: if the URL cannot be parsed or has no hostname, if the
             hostname belongs to no known site, or if the database
             insertion fails (the session is rolled back first).
     """
     # Validate the URL length.
     if len(item['raw']) > MAX_URL_LEN:
         item['raw'] = item['raw'][:MAX_URL_LEN]
         logger.error('Raw URL too long, trucate it! %r', item['raw'])
     # Parse the raw URL.
     purl = get_parsed_url(item['raw'])
     if purl is None or purl.hostname is None:
         raise DropItem('Invalide URL')
     site_id = belongs_to_site(purl.hostname, self.site_tuples)
     if site_id is None:
         # Format the message explicitly: exception constructors do not
         # interpolate extra arguments the way logging calls do.
         raise DropItem('Offsite domain: %s' % (item,))
     item['site_id'] = site_id
     # Insert the URL into the table.
     try:
         get_or_create_murl(spider.session, item, spider.platform_id)
     except SQLAlchemyError as e:
         logger.error(e)
         spider.session.rollback()
         raise DropItem('Fail to insert database of url: %s' % (item,))
     return item