def load_html(self, req):
    """ returns a unicode html object """

    cache = self.scraper.cache

    accept_error_codes = req.get('accept_error_codes')
    if accept_error_codes is None:
        accept_error_codes = []

    # serve from cache when possible
    if cache and req.get('use_cache') and cache.exists(url=req.url, post=req.post, file_name=req.get('file_name')):
        return self._read_from_cache(url=req.url, post=req.post, file_name=req.get('file_name'))

    # cache-only mode: do not hit the network when the page is not cached yet
    if cache and req.get('use_cache') and req.get('cache_only') and not cache.exists(url=req.url, post=req.post, file_name=req.get('file_name')):
        html = common.DataItem('<html/>')
        html.status = Status()
        return html

    res = self.fetch_data(req)

    html = common.DataItem(res.data or '')
    status = res.status

    # only cache successful (or explicitly accepted) responses
    if (status.code == 200 or status.code in accept_error_codes) and cache and req.get('use_cache'):
        self._write_to_cache(url=req.url, post=req.post, data=html, status=status, file_name=req.get('file_name'))

    html.status = status

    return html
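# Note (an assumption inferred from the option reads above, not part of the
# original module): load_html expects a Request-like object exposing .url, .post
# and .get(...) for options such as 'use_cache', 'cache_only',
# 'accept_error_codes' and 'file_name'. A hypothetical call might look like
#   html = client.load_html(http.Request(url='http://example.com/', use_cache=True))
# where 'client' stands for whatever object owns this method.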
def _build_response_data(self, req, response):
    encoding = 'utf8'
    unicode_html = u''

    try:
        unicode_html = response['data'].decode(encoding, 'ignore')
    except Exception as e:
        logger.warn('failed to decode bytes from url: %s', req.url)

    return_type = req.get('return_type') or 'doc'

    if return_type == 'doc':
        doc = http.Doc(url=req.url, html=unicode_html)
        doc.req = req
        doc.status.code = response['code']
        doc.status.message = response['message']

        return doc

    elif return_type == 'html':
        html = common.DataItem(unicode_html)
        html.req = req
        html.status = common.DataObject()
        html.status.code = response['code']
        html.status.message = response['message']

        return html

    else:
        self.scraper.logger.warn('unsupported return_type: %s', return_type)
        return None
def _read_from_cache(self, url, post, filename=None):
    cache = self.scraper.cache

    cachedata = cache.read(url=url, post=post, filename=filename).split(meta_seperator)

    cachedhtml = None

    if len(cachedata) == 2:
        cachedhtml = cachedata[1]
        meta = json.loads(cachedata[0])

        #reload status
        response = Response(
            data=cachedhtml,
            code=meta['response']['code'],
            final_url=meta['response']['final_url'],
            message=meta['response'].get('message', ''))
    else:
        #no meta data
        cachedhtml = cachedata[0]
        response = Response(data=cachedhtml, code=200, final_url=None, message=None)

    html = common.DataItem(cachedhtml)
    html.response = response

    return html
def __init__(self, url='', html='<html></html>', html_clean=None, status=None):
    logger = logging.getLogger(__name__)

    if html_clean:
        html = html_clean(html)

    Node.__init__(self, html)

    self.url = common.DataItem(url)
    self.status = status or Status(final_url=url)

    #resolve relative urls
    baseurl = self.x("//base/@href").tostring()
    if not baseurl:
        baseurl = self.url

    try:
        for n in self.q('//a[@href and not(contains(@href, "javascript")) and not(starts-with(@href, "#")) and not(contains(@href, "mailto:"))]'):
            if n.href().trim() == '':
                continue
            n.set('href', urlparse.urljoin(baseurl, n.get('href').tostring()))

        for n in self.q('//iframe[@src]'):
            if n.src().trim() == '':
                continue
            n.set('src', urlparse.urljoin(baseurl, n.src()))

        for n in self.q('//form[@action]'):
            n.set('action', urlparse.urljoin(baseurl, n.get('action').tostring()))

        for n in self.q('//img[@src]'):
            n.set('src', urlparse.urljoin(baseurl, n.get('src').tostring()))

    except Exception as e:
        logger.warn('there was an error while initializing the Doc object: %s', self.url)
        logger.exception(e)
def save(self, record, filename='result.csv', max=None, keys=[], id=None, headers=[], remove_existing_file=True, always_quoted=True):
    #waiting while other thread writing
    while self.writingflag:
        pass

    #hold the flag
    self.writingflag = True

    path = os.path.join(self.dir, filename)
    format = common.DataItem(path).subreg('\.([a-z]{2,5})$--is').lower()

    if not self.outdb.get(path):
        if os.path.exists(path):
            if remove_existing_file:
                os.remove(path)

        self.outdb.update({path: common.DataObject(cnt=0, data=[], ids=[], format=format)})

    trackingobj = self.outdb.get(path)

    if keys or id:
        id = id or u"".join([unicode(record[record.index(key) + 1]) for key in keys])

        if id in trackingobj.ids:
            self.writingflag = False
            return
        else:
            trackingobj.ids.append(id)

    trackingobj.cnt += 1

    if format == 'csv':
        #for csv format, save to file immediately
        common.save_csv(path, record, always_quoted=always_quoted)
    elif format in ['xls', 'xlsx']:
        #save for later
        trackingobj.data.append(record)

    if max and trackingobj.cnt == max:
        self.flush()  #save output files and quit
        os._exit(1)

    #free the flag
    self.writingflag = False
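# Record layout note (inferred from record.index(key) + 1 above, not a documented
# contract): records appear to be flat lists alternating field name and value,
# e.g. ['name', 'Acme', 'price', '9.99'], so duplicate detection with
# keys=['name'] would build the dedup id from the element following 'name'.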
def load_oneproxy(self):
    proxy_file = self.proxy_file

    if proxy_file:
        if not os.path.exists(proxy_file):
            raise Exception('proxy_file not found: {0}'.format(proxy_file))

        proxy_auth_found = False
        proxy_selected = False

        lines = common.read_lines(proxy_file)

        while True:
            line = random.choice(lines)

            if 'proxy_auth' in line:
                if proxy_auth_found is False:
                    self.proxy_auth = common.DataItem(line).rr('proxy_auth\s*=\s*').trim()
                    proxy_auth_found = True
            else:
                if proxy_selected is False:
                    #support tab, comma separators as well
                    line = line.replace('\t', ':').replace(',', ':')
                    self.proxies.append(line)
                    proxy_selected = True

            if proxy_selected:
                if proxy_auth_found:
                    break
                else:
                    #a proxy is already picked; scan the whole file once for an
                    #optional proxy_auth line instead of sampling at random
                    for line in lines:
                        if 'proxy_auth' in line:
                            self.proxy_auth = common.DataItem(line).rr('proxy_auth\s*=\s*').trim()
                            proxy_auth_found = True
                            break

                    #break either way: the exhaustive scan settles whether the
                    #file contains a proxy_auth line at all
                    break

    return self
def body_ready(body):
    if req.get('contain') and req.get('contain') not in body:
        result = {
            'success': False,
            'data': body,
            'req': req,
            'code': 200,
            'message': 'not contain text: %s' % req.get('contain')
        }
    else:
        #success
        content_type = response.headers.getRawHeaders('content-type')
        if not content_type:
            logger.warn('no content-type header found: %s', req.url)
            content_type = ''
        else:
            content_type = content_type[0].lower()

        is_binary_data = req.get('bin') or False
        if 'image' in content_type or 'pdf' in content_type:
            is_binary_data = True

        if not is_binary_data:
            charset = common.DataItem(content_type).subreg('charset\s*=\s*([^\s]+)')
            if not charset:
                # logger.warn('no charset parsed from content_type: %s, assumed utf8, url: %s', content_type, req.url)
                charset = 'utf8'

            body = _to_utf8encoded_bytes(body, charset=charset)

        result = {
            'success': True,
            'data': body,  #in utf8-encoded bytes
            'req': req,
            'code': 200,
            'message': 'ok'
        }

    try:
        output_deferred.callback(result)
    except Exception as e:
        logger.debug(e.message)
def load_proxies(self):
    proxy_file = self.proxy_file

    if proxy_file:
        if not os.path.exists(proxy_file):
            raise Exception('proxy_file not found: {0}'.format(proxy_file))

        for line in common.read_lines(proxy_file):
            if 'proxy_auth' in line:
                self.proxy_auth = common.DataItem(line).rr('proxy_auth\s*=\s*').trim()
                continue

            #support tab, comma separators as well
            line = line.replace('\t', ':').replace(',', ':')
            self.proxies.append(line)

    return self
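# Expected proxy file layout (a sketch inferred from the parsing above, not a
# documented format): an optional "proxy_auth = user:password" line plus one
# proxy per line, with host and port separated by ':', a tab or a comma, e.g.
#
#   proxy_auth = scraper_user:secret
#   10.0.0.1:8080
#   10.0.0.2,3128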
def save_link(self, url, dir='images', file_name='auto', format='jpg', prefix='', **_options):
    fn = ''

    if file_name == 'auto':
        #special name
        fn = common.DataItem(url).rr('\?.*?$').subreg('/([^/\?\$]+\.[a-z]{2,4})$--is')
        if not fn:
            self.logger.warn('failed to parse file_name from url: %s', url)
            return None
    else:
        #file_name is a fixed name
        fn = file_name
        if not common.subreg(fn, '(\.[a-z]{2,5}$)--is'):
            fn += '.' + format

    fn = prefix + fn

    if not os.path.exists(os.path.join(self.dir, dir)):
        os.makedirs(os.path.join(self.dir, dir))

    path = os.path.join(self.dir, dir, fn)

    if os.path.exists(path):
        return fn  #already downloaded
    else:
        #start downloading the file
        options = common.combine_dicts(self.config, _options)

        res = self.client.fetch_data(http.Request(url=url, bin=True, **options))

        if res.status.code == 200 and res.data:
            common.put_bin(path, res.data)
            return fn
        else:
            return None
def _read_from_cache(self, url, post, file_name=None):
    cache = self.scraper.cache

    cachedata = cache.read(url=url, post=post, file_name=file_name).split(meta_seperator)

    cachedhtml = None
    status = Status(code=200, final_url=None, error=None)

    if len(cachedata) == 2:
        cachedhtml = cachedata[1]
        meta = json.loads(cachedata[0])

        #reload status
        status = Status(
            code=meta['status']['code'],
            final_url=meta['status']['final_url'],
            error=meta['status'].get('error', ''))
    else:
        #no meta data
        cachedhtml = cachedata[0]

    html = common.DataItem(cachedhtml)
    html.status = status

    return html
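# Cache entry layout note (inferred from the split on meta_seperator above, not a
# documented format): an entry is assumed to be a JSON metadata block, the
# meta_seperator marker, then the raw html, roughly
#   '{"status": {"code": 200, "final_url": "http://example.com/"}}' + meta_seperator + '<html>...</html>'
# Entries without the marker are treated as legacy html-only data and get a
# default 200 status.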
def handler(doc):
    page = stats.page
    doc.page = page

    if verify:
        if not verify(common.DataObject(starturl=common.DataItem(url), page=page, doc=doc)):
            doc.ok = False
            logger.warn("invalid doc at page {0}".format(page))

    logger.info('page %s', page)

    #download and parse details
    if detail:
        listings = detail(
            common.DataObject(starturl=common.DataItem(url), page=page, doc=doc)
        ) if hasattr(detail, '__call__') else doc.q(detail)

        logger.info('details: %s', len(listings))

        for listing in listings:
            self.downloader.put(
                Request(
                    url=listing if isinstance(listing, basestring) else listing.nodevalue(),
                    cb=parse_detail,
                    meta=meta,
                    **options),
                onhold=list_pages_first)

    done = False

    _nexturl = None
    _next_post = None

    if next:
        _nexturl = next(
            common.DataObject(starturl=common.DataItem(url), page=page, doc=doc)
        ) if hasattr(next, '__call__') else (next if next.startswith('http') else doc.x(next))

    if next_post:
        if not next:
            #next is not provided, use the original url
            _nexturl = doc.url

        _next_post = next_post(
            common.DataObject(doc=doc, page=page, starturl=common.DataItem(url))
        ) if hasattr(next_post, '__call__') else next_post

    if next_post:
        if _next_post:
            done = False
        else:
            done = True
    else:
        if not _nexturl:
            done = True
        else:
            done = False

    #if (next and _nexturl ) or (next_post and _next_post):
    if not done:
        #logger.debug('next_post: %s, _nexturl: %s', _next_post, _nexturl)
        stats.page += 1

        if max_pages != 0 and stats.page > max_pages:
            done = True
        else:
            self.downloader.put(Request(_nexturl, _next_post, cb=handler, **options))
    else:
        done = True

    if parse_list:
        parse_list(doc)
def fetch_data(self, req):
    """ processes a http request specified by the req object and returns a response object """

    req.normalize(self.scraper)

    accept_error_codes = req.get('accept_error_codes') or []

    time.sleep(self.scraper.config['delay'])

    opener = req.get('opener')
    if not opener:
        opener = create_opener(use_cookie=False) if req.get('use_cookie') is False else self.opener

    headers = req.get('headers')

    if self.scraper.config.get('use_requests') is True:
        #use requests module instead of urllib2
        return self.requests_fetch_data(req, headers)

    proxy = req.get('proxy')
    if proxy and req.get('use_proxy') is not False:
        if req.url.lower().startswith('https://'):
            opener.add_handler(urllib2.ProxyHandler({'https': proxy.full_address}))
        else:
            opener.add_handler(urllib2.ProxyHandler({'http': proxy.full_address}))

    request = urllib2.Request(req.url, req.post, headers)

    tries = req.get('retries', 0)

    status_code = 0
    error_message = ''
    final_url = None

    if self.scraper.config['debug']:
        self.logger.debug('loading %s', req.url)

    try:
        with contextlib.closing(opener.open(request, timeout=req.get('timeout', self.scraper.config['timeout']))) as res:
            final_url = res.url
            status_code = res.code

            rawdata = res.read()
            self.resp = res

            if 'gzip' in res.headers.get('content-encoding', '').lower():
                bytes = zlib.decompress(rawdata, 16 + zlib.MAX_WBITS)
            elif 'deflate' in res.headers.get('content-encoding', '').lower():
                bytes = zlib.decompressobj(-zlib.MAX_WBITS).decompress(rawdata)
            else:
                bytes = rawdata

            encoding = req.get('encoding') or common.DataItem(res.headers.get('content-type') or '').subreg('charset\s*=([^;]+)') or 'utf8'
            content_type = res.headers.get('content-type', '').lower()
            #self.logger.debug('content type: %s, encoding: %s', content_type, encoding)

            data = ''  #default is text data

            is_binary_data = req.get('bin') or False
            if 'image' in content_type or 'pdf' in content_type:
                is_binary_data = True

            if not is_binary_data:
                data = bytes.decode(encoding, 'ignore')

                #verify data
                #self.logger.debug('contain: %s', req.get('contain'))
                if req.get('contain') and req.get('contain') not in data:
                    raise Exception("invalid html, not contain: %s" % req.get('contain'))
                if req.get('not_contain') and req.get('not_contain') in data:
                    raise Exception("invalid html, contain negative string: %s" % req.get('not_contain'))

                verify = req.get('verify')
                if verify and (not verify(data)):
                    raise Exception("invalid html")
            else:
                #binary content
                data = bytes

            #log proxy and timestamp into the html file if asked
            if req.get('log_time_and_proxy'):
                data += '\n<log id="time_proxy"><time>{time}</time><proxy>{proxy}</proxy></log>'.format(
                    time=time.time(),
                    proxy=proxy if (proxy and req.get('use_proxy') is not False) else '')

            return Response(data=data, status=Status(code=status_code, final_url=final_url))

    except Exception as e:
        if status_code == 0 and hasattr(e, 'code'):
            status_code = e.code

        if hasattr(e, 'reason'):
            error_message = e.reason
        elif hasattr(e, 'line'):
            error_message = 'BadStatusLine: %s' % e.line
        elif hasattr(e, 'message'):
            error_message = e.message

        if not error_message and hasattr(e, 'args'):
            try:
                error_message = u", ".join([unicode(item) for item in e.args]).replace("''", 'unknown')
            except:
                pass

        if tries > 0 and status_code not in accept_error_codes:
            #try to open the request once again
            self.logger.debug('data fetching error: %s %s', status_code if status_code != 0 else '', error_message)
            req.update({'retries': tries - 1})
            return self.fetch_data(req)
        else:
            self.logger.warn('data fetching error: %s %s', status_code if status_code != 0 else '', error_message)

            if 'invalid html' in error_message:
                status_code = 0

            return Response(data=None, status=Status(code=status_code, final_url=final_url, error=error_message))
def export_items(self, dest_file, query=None, limit=None, sort=None, fields=None, include_hidden_fields=False, multicol_fields={}, exclude_fields=[]):
    """
    @query: None means all items
    @fields: None means all fields
    """
    if os.path.exists(dest_file):
        os.remove(dest_file)

    if not fields:
        fields = self._compile_all_fields(include_hidden_fields, exclude_fields=exclude_fields)

    format = common.DataItem(dest_file).subreg('\.([a-z]{2,5})$--is').lower()

    rows = []

    query = query or {}

    cnt = self.count_items(query)
    print('cnt: {}'.format(cnt))

    cursor = self._db.items.find(query)
    if sort:
        cursor = cursor.sort(sort)
    if limit:
        cursor = cursor.limit(limit)

    for item in cursor:
        res = []

        for field in fields:
            # value = item.get(field) or ''
            value = item.get(field)
            if value is None:
                value = ''

            if field in multicol_fields:
                maxcol = multicol_fields[field]

                parts = []
                if value is None:
                    value = []

                if isinstance(value, list):
                    parts = value
                else:
                    parts = value.split('|')

                if len(parts) < maxcol:
                    #normalize
                    for i in xrange(maxcol - len(parts)):
                        parts.append('')

                for i in xrange(maxcol):
                    res.append('{} {}'.format(field, i + 1))
                    res.append(parts[i])
            else:
                res.append(field)
                res.append(value)

        if format == 'csv':
            common.save_csv(dest_file, res)
        else:
            rows.append(res)

    if format == 'xls':
        import excellib
        excellib.save_xls(dest_file, rows)
    elif format == 'xlsx':
        import excellib
        excellib.save_xlsx(dest_file, rows)
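# Usage sketch (hypothetical names, not part of the module): exporting with
#   db.export_items('items.xlsx', multicol_fields={'images': 3})
# would split a pipe-separated 'images' value such as 'a.jpg|b.jpg' into the
# padded columns 'images 1', 'images 2', 'images 3'; 'db' and the field/file
# names here are illustrative assumptions.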