Example #1
    def _build_response_data(self, req, response):

        encoding = 'utf8'
        unicode_html = u''

        # decode the raw response bytes, ignoring undecodable sequences
        try:
            unicode_html = response['data'].decode(encoding, 'ignore')
        except Exception as e:
            logger.warn('failed to decode bytes from url %s: %s', req.url, e)

        return_type = req.get('return_type') or 'doc'

        # wrap the decoded html according to the requested return type
        if return_type == 'doc':
            doc = http.Doc(url=req.url, html=unicode_html)
            doc.req = req
            doc.status.code = response['code']
            doc.status.message = response['message']
            return doc
        elif return_type == 'html':
            html = common.DataItem(unicode_html)
            html.req = req
            html.status = common.DataObject()
            html.status.code = response['code']
            html.status.message = response['message']
            return html

        else:
            self.scraper.logger.warn('unsupported return_type: %s',
                                     return_type)
            return None
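
A minimal, hypothetical sketch of the inputs this method expects, inferred only from the code above; the field names and values shown here are assumptions, not a documented API:

    # hypothetical input shapes, inferred from _build_response_data() above
    response = {
        'data': b'<html><body>hello</body></html>',  # raw bytes from the http layer
        'code': 200,
        'message': 'OK',
    }
    # req is expected to expose .url and a dict-style .get():
    # req.get('return_type') may be 'doc' (the default, wrapped in http.Doc)
    # or 'html' (wrapped in common.DataItem); any other value only logs a
    # warning and the method returns None.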
Example #2
    def save(self,
             record,
             filename='result.csv',
             max=None,
             keys=[],
             id=None,
             headers=[],
             remove_existing_file=True,
             always_quoted=True):
        # spin-wait while another thread is writing, then take the flag
        while self.writingflag:
            pass
        self.writingflag = True

        path = os.path.join(self.dir, filename)
        # infer the output format (csv/xls/xlsx) from the file extension
        format = common.DataItem(path).subreg(r'\.([a-z]{2,5})$--is').lower()

        if not self.outdb.get(path):
            # first record for this output file
            if remove_existing_file and os.path.exists(path):
                os.remove(path)

            self.outdb.update(
                {path: common.DataObject(cnt=0, data=[], ids=[], format=format)})

        trackingobj = self.outdb.get(path)

        # de-duplicate records by id (either given directly or built from keys)
        if keys or id:
            id = id or u"".join(
                [unicode(record[record.index(key) + 1]) for key in keys])
            if id in trackingobj.ids:
                # duplicate record: skip it and release the flag
                self.writingflag = False
                return
            else:
                trackingobj.ids.append(id)

        trackingobj.cnt += 1

        if format == 'csv':
            # csv rows are written to the file immediately
            common.save_csv(path, record, always_quoted=always_quoted)
        elif format in ['xls', 'xlsx']:
            # xls/xlsx rows are buffered and written out on flush()
            trackingobj.data.append(record)

        if max and trackingobj.cnt == max:
            # reached the record limit: save the output files and quit
            self.flush()
            os._exit(1)

        # release the flag
        self.writingflag = False
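
The de-duplication above implies that record is a flat list alternating field names and values (the id is built from record[record.index(key) + 1]). A minimal usage sketch under that assumption; the scraper instance s and the field values are hypothetical:

    # hypothetical usage: record is a flat [name, value, name, value, ...] list
    record = ['url', 'http://example.com/item/1', 'title', 'First item']
    s.save(record,
           filename='result.csv',  # the .csv extension selects the immediate-write branch
           keys=['url'],           # rows whose url was already seen are skipped
           always_quoted=True)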
Example #3
        def handler(doc):
            page = stats.page
            doc.page = page

            if verify:
                if not verify(
                        common.DataObject(starturl=common.DataItem(url),
                                          page=page,
                                          doc=doc)):
                    doc.ok = False
                    logger.warn("invalid doc at page {0}".format(page))

            logger.info('page %s', page)

            # queue detail pages for download and parsing
            if detail:
                # detail may be a callable or a query string for doc.q()
                if hasattr(detail, '__call__'):
                    listings = detail(
                        common.DataObject(starturl=common.DataItem(url),
                                          page=page,
                                          doc=doc))
                else:
                    listings = doc.q(detail)

                logger.info('details: %s', len(listings))

                for listing in listings:
                    # a listing may be a url string or a parsed node
                    listing_url = listing if isinstance(
                        listing, basestring) else listing.nodevalue()

                    self.downloader.put(
                        Request(url=listing_url,
                                cb=parse_detail,
                                meta=meta,
                                **options),
                        onhold=list_pages_first)

            _nexturl = None
            _next_post = None

            # resolve the next page url: a callable, an absolute url, or an xpath
            if next:
                if hasattr(next, '__call__'):
                    _nexturl = next(
                        common.DataObject(starturl=common.DataItem(url),
                                          page=page,
                                          doc=doc))
                else:
                    _nexturl = next if next.startswith('http') else doc.x(next)

            # resolve the post data for the next page request
            if next_post:
                if not next:
                    # next is not provided, re-use the current url
                    _nexturl = doc.url

                if hasattr(next_post, '__call__'):
                    _next_post = next_post(
                        common.DataObject(starturl=common.DataItem(url),
                                          page=page,
                                          doc=doc))
                else:
                    _next_post = next_post

            # pagination is done when no next url/post data could be resolved
            done = (not _next_post) if next_post else (not _nexturl)

            if not done:
                stats.page += 1

                # max_pages == 0 means no page limit
                if max_pages != 0 and stats.page > max_pages:
                    done = True
                else:
                    self.downloader.put(
                        Request(_nexturl, _next_post, cb=handler, **options))

            if parse_list:
                parse_list(doc)
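
When verify, detail, next or next_post are callables, the handler above passes them a common.DataObject carrying starturl, page and doc. A hedged sketch of what such callbacks could look like; the xpath expressions are made up for illustration:

    # hypothetical callbacks matching the DataObject built by handler()
    def verify(r):
        # r.doc is the downloaded listing page, r.page the 1-based page counter
        return len(r.doc.q("//div[@class='listing']")) > 0

    def next(r):
        # return the url of the next page, or None/'' to stop paginating
        return r.doc.x("//a[@rel='next']/@href")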
Example #4
    def pagin(self,
              url,
              next=None,
              post=None,
              next_post=None,
              parse_list=None,
              detail=None,
              parse_detail=None,
              cc=3,
              max_pages=0,
              list_pages_first=True,
              start_now=False,
              debug=True,
              verify=None,
              meta={},
              **_options):

        if cc != self.downloader.cc:
            self.downloader.set_cc(cc)

        # apply scraper-level options, allowing per-call overrides
        options = common.combine_dicts(self.config, _options)

        stats = common.DataObject(page=1)

        def handler(doc):
            page = stats.page
            doc.page = page

            if verify:
                if not verify(
                        common.DataObject(starturl=common.DataItem(url),
                                          page=page,
                                          doc=doc)):
                    doc.ok = False
                    logger.warn("invalid doc at page {0}".format(page))

            logger.info('page %s', page)

            # queue detail pages for download and parsing
            if detail:
                # detail may be a callable or a query string for doc.q()
                if hasattr(detail, '__call__'):
                    listings = detail(
                        common.DataObject(starturl=common.DataItem(url),
                                          page=page,
                                          doc=doc))
                else:
                    listings = doc.q(detail)

                logger.info('details: %s', len(listings))

                for listing in listings:
                    # a listing may be a url string or a parsed node
                    listing_url = listing if isinstance(
                        listing, basestring) else listing.nodevalue()

                    self.downloader.put(
                        Request(url=listing_url,
                                cb=parse_detail,
                                meta=meta,
                                **options),
                        onhold=list_pages_first)

            _nexturl = None
            _next_post = None

            # resolve the next page url: a callable, an absolute url, or an xpath
            if next:
                if hasattr(next, '__call__'):
                    _nexturl = next(
                        common.DataObject(starturl=common.DataItem(url),
                                          page=page,
                                          doc=doc))
                else:
                    _nexturl = next if next.startswith('http') else doc.x(next)

            # resolve the post data for the next page request
            if next_post:
                if not next:
                    # next is not provided, re-use the current url
                    _nexturl = doc.url

                if hasattr(next_post, '__call__'):
                    _next_post = next_post(
                        common.DataObject(starturl=common.DataItem(url),
                                          page=page,
                                          doc=doc))
                else:
                    _next_post = next_post

            # pagination is done when no next url/post data could be resolved
            done = (not _next_post) if next_post else (not _nexturl)

            if not done:
                stats.page += 1

                # max_pages == 0 means no page limit
                if max_pages != 0 and stats.page > max_pages:
                    done = True
                else:
                    self.downloader.put(
                        Request(_nexturl, _next_post, cb=handler, **options))

            if parse_list:
                parse_list(doc)

        ##### end of the handler function ##################################################

        # queue the initial url; handler() re-queues itself for subsequent pages
        self.downloader.put(Request(url, post, cb=handler, **options))
        if start_now:
            self.downloader.start()
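
A minimal sketch of how pagin() might be called, based only on the branches above: string values for detail/next are treated as queries (doc.q/doc.x), while callables receive the DataObject shown in handler(). The scraper instance s, the url and the xpaths are assumptions, not part of the source:

    # hypothetical usage of pagin(), inferred from the branches above
    def parse_item(doc):
        # called for every detail page queued by handler()
        print doc.url

    s.pagin('http://example.com/list?page=1',
            next="//a[@rel='next']/@href",           # xpath string -> doc.x(next)
            detail="//div[@class='item']//a/@href",  # xpath string -> doc.q(detail)
            parse_detail=parse_item,
            max_pages=5,      # 0 would mean no page limit
            start_now=True)   # start the downloader right away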