def ping(name, url, grab, thread_number=10):
    """
    Do XMLRPC ping of given site against every server in SERVER_LIST.

    :param name: site name embedded into the ping payload
    :param url: site URL embedded into the ping payload
    :param grab: Grab instance used to perform the HTTP requests
    :param thread_number: number of worker threads passed to make_work
    :return: list of ``(rpc_url, ok)`` tuples, one per pinged server
    """
    name = smart_str(name)
    url = smart_str(url)

    def worker(rpc_url):
        # Values are escaped because they are interpolated
        # into an XML document.
        post = PING_XML % {
            'url': html.escape(url),
            'name': html.escape(name),
        }
        ok = False
        try:
            grab.go(rpc_url, post=post)
        except Exception as ex:
            # BUG FIX: `unicode` is undefined on Python 3; use
            # six.text_type, consistent with the rest of this file.
            logging.error(six.text_type(ex))
        else:
            # A successful XMLRPC ping response contains
            # "<boolean>0" (flerror = 0, i.e. no error).
            if '<boolean>0' not in grab.response.body:
                logging.error('%s : FAIL' % rpc_url)
                logging.error(grab.response.body[:1000])
            else:
                ok = True
        return rpc_url, ok

    results = []
    for rpc_url, ok in make_work(worker, SERVER_LIST, thread_number):
        results.append((rpc_url, ok))
    return results
def save_list(self, list_name, path):
    """
    Save items from list to the file.

    Each item goes on its own line: text/bytes items are written
    byte-encoded as-is, everything else is JSON-serialized first.
    """
    with open(path, 'wb') as out:
        encoded = [
            smart_str(entry)
            if isinstance(entry, (six.text_type, six.binary_type))
            else smart_str(json.dumps(entry))
            for entry in self.items.get(list_name, [])
        ]
        out.write(b'\n'.join(encoded) + b'\n')
def sanitize_html(html, encoding='utf-8', return_unicode=False):
    """
    Sanitize an HTML fragment by parsing and re-rendering it.

    :param html: HTML markup, bytes or unicode
    :param encoding: encoding used when byte-encoding the input
    :param return_unicode: if True, return a unicode string,
        otherwise return a byte string
    """
    # Byte-encode the input so the parser always sees bytes.
    html = smart_str(html, encoding=encoding)
    # Only run the parse/render cycle when the input actually
    # appears to contain tags.
    if RE_TAG_START.search(html):
        html = render_html(parse_html(html))
    if return_unicode:
        # NOTE(review): decodes with hard-coded utf-8 even though the
        # input was encoded with `encoding`; this looks wrong for any
        # non-utf-8 `encoding` on the no-tags path, unless render_html
        # is known to always emit utf-8 — confirm before changing.
        return html.decode('utf-8')
    else:
        return html
def build_search_url(query, page=1, per_page=None, lang='en',
                     filter=True, region=213, **kwargs):
    """
    Build yandex search url with specified query and pagination options.

    :param per_page: 10, 20, 30, 50, 100

    213 region is Moscow
    """
    parts = ['http://yandex.ru/yandsearch?text=%s&lr=%s' % (
        quote(smart_str(query)), region)]
    if kwargs:
        parts.append('&' + urlencode(kwargs))
    # Yandex pagination is zero-based.
    parts.append('&p=%d' % (page - 1))
    return ''.join(parts)
def build_search_url(query, page=1, per_page=None, lang='en',
                     filter=True, region=213, **kwargs):
    """
    Build yandex search url with specified query and pagination options.

    :param per_page: 10, 20, 30, 50, 100

    213 region is Moscow
    """
    encoded_query = quote(smart_str(query))
    url = 'http://yandex.ru/yandsearch?text=%s&lr=%s' % (
        encoded_query, region)
    extra = ('&' + urlencode(kwargs)) if kwargs else ''
    # Yandex pagination is zero-based.
    return url + extra + '&p=%d' % (page - 1)
def build_search_url(query, page=None, per_page=None, lang=None,
                     filter=None, **kwargs):
    """
    Build google search url with specified query and pagination options.

    :param query: search query (encoded into the ``q`` parameter)
    :param page: 1-based page number (default 1)
    :param per_page: 10, 20, 30, 50, 100 (default 10)
    :param lang: interface language, used as ``hl`` default
    :param filter: when False, adds ``filter=0`` (disable
        duplicate-result filtering) unless the caller set it

    kwargs:
        tbs=qdr:h
        tbs=qdr:d
        tbs=qdr:w
        tbs=qdr:m
        tbs=qdr:y
    """
    if per_page is None:
        per_page = 10
    if page is None:
        page = 1
    if lang is None:
        lang = 'en'
    if filter is None:
        filter = True
    start = per_page * (page - 1)
    # Explicit kwargs always win over the derived defaults.
    kwargs.setdefault('hl', lang)
    kwargs.setdefault('num', per_page)
    kwargs.setdefault('start', start)
    if 'filter' not in kwargs and not filter:
        kwargs['filter'] = '0'
    url = 'http://google.com/search?q=%s' % quote(smart_str(query))
    if kwargs:
        url += '&' + urlencode(kwargs)
    return url
def process_config(self, grab):
    """
    Setup curl instance with values from ``self.config``.

    Translates the high-level Grab configuration (``grab.config``)
    into pycurl options: URL, redirect policy, timeouts, response
    body storage, User-Agent, debug/SSL settings, request method and
    payload, headers, cookies, proxy, auth and size limits.
    """
    # Copy some config for future usage
    self.config_nobody = grab.config['nobody']
    self.config_body_maxsize = grab.config['body_maxsize']
    try:
        request_url = normalize_url(grab.config['url'])
    except Exception as ex:
        raise error.GrabInvalidUrl(
            u'%s: %s' % (six.text_type(ex), grab.config['url']))
    # py3 hack: pycurl on py2 expects a byte string URL
    if not six.PY3:
        request_url = smart_str(request_url)
    self.curl.setopt(pycurl.URL, request_url)
    self.curl.setopt(pycurl.FOLLOWLOCATION,
                     1 if grab.config['follow_location'] else 0)
    self.curl.setopt(pycurl.MAXREDIRS, grab.config['redirect_limit'])
    self.curl.setopt(pycurl.CONNECTTIMEOUT, grab.config['connect_timeout'])
    self.curl.setopt(pycurl.TIMEOUT, grab.config['timeout'])
    # Force IPv4 resolution only.
    self.curl.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V4)
    # self.curl.setopt(pycurl.DNS_CACHE_TIMEOUT, 0)
    if not grab.config['connection_reuse']:
        self.curl.setopt(pycurl.FRESH_CONNECT, 1)
        self.curl.setopt(pycurl.FORBID_REUSE, 1)
    # NOSIGNAL avoids signal-based timeouts (needed for threads).
    self.curl.setopt(pycurl.NOSIGNAL, 1)
    # head_processor / body_processor are callbacks defined
    # elsewhere on this class.
    self.curl.setopt(pycurl.HEADERFUNCTION, self.head_processor)
    if grab.config['body_inmemory']:
        self.curl.setopt(pycurl.WRITEFUNCTION, self.body_processor)
    else:
        if not grab.config['body_storage_dir']:
            raise error.GrabMisuseError(
                'Option body_storage_dir is not defined')
        self.setup_body_file(grab.config['body_storage_dir'],
                             grab.config['body_storage_filename'])
        self.curl.setopt(pycurl.WRITEFUNCTION, self.body_processor)
    if grab.config['verbose_logging']:
        self.verbose_logging = True
    # User-Agent: explicit value > random line from file > random UA
    if grab.config['user_agent'] is None:
        if grab.config['user_agent_file'] is not None:
            with open(grab.config['user_agent_file']) as inf:
                lines = inf.read().splitlines()
            grab.config['user_agent'] = random.choice(lines)
        else:
            grab.config['user_agent'] = random_user_agent()
    # If value is None then set empty string
    # None is not acceptable because in such case
    # pycurl will set its default user agent "PycURL/x.xx.x"
    if not grab.config['user_agent']:
        grab.config['user_agent'] = ''
    self.curl.setopt(pycurl.USERAGENT, grab.config['user_agent'])
    if grab.config['debug']:
        self.curl.setopt(pycurl.VERBOSE, 1)
        self.curl.setopt(pycurl.DEBUGFUNCTION, self.debug_processor)
    # Ignore SSL errors
    self.curl.setopt(pycurl.SSL_VERIFYPEER, 0)
    self.curl.setopt(pycurl.SSL_VERIFYHOST, 0)
    # Disabled to avoid SSL3_READ_BYTES:sslv3 alert handshake failure error
    # self.curl.setopt(pycurl.SSLVERSION, pycurl.SSLVERSION_SSLv3)
    # Request method and payload
    if grab.request_method == 'POST':
        self.curl.setopt(pycurl.POST, 1)
        if grab.config['multipart_post']:
            if isinstance(grab.config['multipart_post'],
                          six.string_types):
                raise error.GrabMisuseError(
                    'multipart_post option could not be a string')
            post_items = normalize_http_values(
                grab.config['multipart_post'],
                charset=grab.config['charset'],
                ignore_classes=(UploadFile, UploadContent),
            )
            # py3 hack
            if six.PY3:
                post_items = decode_pairs(post_items,
                                          grab.config['charset'])
            # import pdb; pdb.set_trace()
            self.curl.setopt(pycurl.HTTPPOST,
                             process_upload_items(post_items))
        elif grab.config['post']:
            post_data = normalize_post_data(grab.config['post'],
                                            grab.config['charset'])
            # py3 hack
            # if six.PY3:
            #     post_data = smart_unicode(post_data,
            #                               grab.config['charset'])
            self.curl.setopt(pycurl.POSTFIELDS, post_data)
        else:
            # Empty POST body still requires POSTFIELDS to be set.
            self.curl.setopt(pycurl.POSTFIELDS, '')
    elif grab.request_method == 'PUT':
        data = grab.config['post']
        if isinstance(data, six.text_type):
            # py3 hack
            # if six.PY3:
            #     data = data.encode('utf-8')
            # else:
            raise error.GrabMisuseError(
                'Value of post option could be only '
                'byte string if PUT method is used')
        self.curl.setopt(pycurl.UPLOAD, 1)
        self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read)
        self.curl.setopt(pycurl.INFILESIZE, len(data))
    elif grab.request_method == 'PATCH':
        data = grab.config['post']
        if isinstance(data, six.text_type):
            raise error.GrabMisuseError(
                'Value of post option could be only byte '
                'string if PATCH method is used')
        self.curl.setopt(pycurl.UPLOAD, 1)
        self.curl.setopt(pycurl.CUSTOMREQUEST, 'PATCH')
        self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read)
        self.curl.setopt(pycurl.INFILESIZE, len(data))
    elif grab.request_method == 'DELETE':
        self.curl.setopt(pycurl.CUSTOMREQUEST, 'DELETE')
    elif grab.request_method == 'HEAD':
        self.curl.setopt(pycurl.NOBODY, 1)
    elif grab.request_method == 'UPLOAD':
        self.curl.setopt(pycurl.UPLOAD, 1)
    elif grab.request_method == 'GET':
        self.curl.setopt(pycurl.HTTPGET, 1)
    else:
        raise error.GrabMisuseError('Invalid method: %s' %
                                    grab.request_method)
    # Headers: per-request headers override the common defaults.
    headers = grab.config['common_headers']
    if grab.config['headers']:
        headers.update(grab.config['headers'])
    header_tuples = [str('%s: %s' % x) for x in headers.items()]
    self.curl.setopt(pycurl.HTTPHEADER, header_tuples)
    self.process_cookie_options(grab, request_url)
    if grab.config['referer']:
        self.curl.setopt(pycurl.REFERER, str(grab.config['referer']))
    if grab.config['proxy']:
        self.curl.setopt(pycurl.PROXY, str(grab.config['proxy']))
    else:
        # Explicitly clear any proxy left over from a previous request.
        self.curl.setopt(pycurl.PROXY, '')
    if grab.config['proxy_userpwd']:
        self.curl.setopt(pycurl.PROXYUSERPWD,
                         str(grab.config['proxy_userpwd']))
    if grab.config['proxy_type']:
        # Map e.g. 'socks5' onto the pycurl.PROXYTYPE_SOCKS5 constant.
        key = 'PROXYTYPE_%s' % grab.config['proxy_type'].upper()
        self.curl.setopt(pycurl.PROXYTYPE, getattr(pycurl, key))
    if grab.config['encoding']:
        if ('gzip' in grab.config['encoding'] and
                'zlib' not in pycurl.version):
            raise error.GrabMisuseError(
                'You can not use gzip encoding because '
                'pycurl was built without zlib support')
        self.curl.setopt(pycurl.ENCODING, grab.config['encoding'])
    if grab.config['userpwd']:
        self.curl.setopt(pycurl.USERPWD, str(grab.config['userpwd']))
    if grab.config.get('interface') is not None:
        self.curl.setopt(pycurl.INTERFACE, grab.config['interface'])
    if grab.config.get('reject_file_size') is not None:
        self.curl.setopt(pycurl.MAXFILESIZE,
                         grab.config['reject_file_size'])
def quote(data):
    """
    URL-encode ``data`` for use as a query-string value
    (spaces become "+").
    """
    # BUG FIX: `urllib.quote_plus` exists only on Python 2; on
    # Python 3 it lives in `urllib.parse`. The file supports both
    # interpreters (it branches on six.PY3 elsewhere), so use the
    # six compatibility alias.
    from six.moves.urllib.parse import quote_plus
    return quote_plus(smart_str(data))
def process_config(self, grab):
    """
    Setup curl instance with values from ``self.config``.

    Translates the high-level Grab configuration (``grab.config``)
    into pycurl options: URL, redirect policy, timeouts, response
    body storage, User-Agent, debug/SSL settings, request method and
    payload, headers, cookies, proxy, auth and size limits.
    """
    # Copy some config for future usage
    self.config_nobody = grab.config['nobody']
    self.config_body_maxsize = grab.config['body_maxsize']
    try:
        request_url = normalize_url(grab.config['url'])
    except Exception as ex:
        raise error.GrabInvalidUrl(
            u'%s: %s' % (six.text_type(ex), grab.config['url']))
    # py3 hack: pycurl on py2 expects a byte string URL
    if not six.PY3:
        request_url = smart_str(request_url)
    self.curl.setopt(pycurl.URL, request_url)
    self.curl.setopt(pycurl.FOLLOWLOCATION,
                     1 if grab.config['follow_location'] else 0)
    self.curl.setopt(pycurl.MAXREDIRS, grab.config['redirect_limit'])
    self.curl.setopt(pycurl.CONNECTTIMEOUT, grab.config['connect_timeout'])
    self.curl.setopt(pycurl.TIMEOUT, grab.config['timeout'])
    # Force IPv4 resolution only.
    self.curl.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V4)
    # self.curl.setopt(pycurl.DNS_CACHE_TIMEOUT, 0)
    if not grab.config['connection_reuse']:
        self.curl.setopt(pycurl.FRESH_CONNECT, 1)
        self.curl.setopt(pycurl.FORBID_REUSE, 1)
    # NOSIGNAL avoids signal-based timeouts (needed for threads).
    self.curl.setopt(pycurl.NOSIGNAL, 1)
    # head_processor / body_processor are callbacks defined
    # elsewhere on this class.
    self.curl.setopt(pycurl.HEADERFUNCTION, self.head_processor)
    if grab.config['body_inmemory']:
        self.curl.setopt(pycurl.WRITEFUNCTION, self.body_processor)
    else:
        if not grab.config['body_storage_dir']:
            raise error.GrabMisuseError(
                'Option body_storage_dir is not defined')
        self.setup_body_file(grab.config['body_storage_dir'],
                             grab.config['body_storage_filename'])
        self.curl.setopt(pycurl.WRITEFUNCTION, self.body_processor)
    if grab.config['verbose_logging']:
        self.verbose_logging = True
    # User-Agent: explicit value > random line from file > random UA
    if grab.config['user_agent'] is None:
        if grab.config['user_agent_file'] is not None:
            with open(grab.config['user_agent_file']) as inf:
                lines = inf.read().splitlines()
            grab.config['user_agent'] = random.choice(lines)
        else:
            grab.config['user_agent'] = random_user_agent()
    # If value is None then set empty string
    # None is not acceptable because in such case
    # pycurl will set its default user agent "PycURL/x.xx.x"
    if not grab.config['user_agent']:
        grab.config['user_agent'] = ''
    self.curl.setopt(pycurl.USERAGENT, grab.config['user_agent'])
    if grab.config['debug']:
        self.curl.setopt(pycurl.VERBOSE, 1)
        self.curl.setopt(pycurl.DEBUGFUNCTION, self.debug_processor)
    # Ignore SSL errors
    self.curl.setopt(pycurl.SSL_VERIFYPEER, 0)
    self.curl.setopt(pycurl.SSL_VERIFYHOST, 0)
    # Disabled to avoid SSL3_READ_BYTES:sslv3 alert handshake failure error
    # self.curl.setopt(pycurl.SSLVERSION, pycurl.SSLVERSION_SSLv3)
    # Request method and payload
    if grab.request_method == 'POST':
        self.curl.setopt(pycurl.POST, 1)
        if grab.config['multipart_post']:
            if isinstance(grab.config['multipart_post'],
                          six.string_types):
                raise error.GrabMisuseError(
                    'multipart_post option could not be a string')
            post_items = normalize_http_values(
                grab.config['multipart_post'],
                charset=grab.config['charset'],
                ignore_classes=(UploadFile, UploadContent),
            )
            # py3 hack
            if six.PY3:
                post_items = decode_pairs(post_items,
                                          grab.config['charset'])
            # import pdb; pdb.set_trace()
            self.curl.setopt(pycurl.HTTPPOST,
                             process_upload_items(post_items))
        elif grab.config['post']:
            post_data = normalize_post_data(grab.config['post'],
                                            grab.config['charset'])
            # py3 hack
            # if six.PY3:
            #     post_data = smart_unicode(post_data,
            #                               grab.config['charset'])
            self.curl.setopt(pycurl.POSTFIELDS, post_data)
        else:
            # Empty POST body still requires POSTFIELDS to be set.
            self.curl.setopt(pycurl.POSTFIELDS, '')
    elif grab.request_method == 'PUT':
        data = grab.config['post']
        if isinstance(data, six.text_type):
            # py3 hack
            # if six.PY3:
            #     data = data.encode('utf-8')
            # else:
            raise error.GrabMisuseError(
                'Value of post option could be only '
                'byte string if PUT method is used')
        self.curl.setopt(pycurl.UPLOAD, 1)
        self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read)
        self.curl.setopt(pycurl.INFILESIZE, len(data))
    elif grab.request_method == 'PATCH':
        data = grab.config['post']
        if isinstance(data, six.text_type):
            raise error.GrabMisuseError(
                'Value of post option could be only byte '
                'string if PATCH method is used')
        self.curl.setopt(pycurl.UPLOAD, 1)
        self.curl.setopt(pycurl.CUSTOMREQUEST, 'PATCH')
        self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read)
        self.curl.setopt(pycurl.INFILESIZE, len(data))
    elif grab.request_method == 'DELETE':
        self.curl.setopt(pycurl.CUSTOMREQUEST, 'DELETE')
    elif grab.request_method == 'HEAD':
        self.curl.setopt(pycurl.NOBODY, 1)
    elif grab.request_method == 'UPLOAD':
        self.curl.setopt(pycurl.UPLOAD, 1)
    elif grab.request_method == 'GET':
        self.curl.setopt(pycurl.HTTPGET, 1)
    else:
        raise error.GrabMisuseError('Invalid method: %s' %
                                    grab.request_method)
    # Headers: per-request headers override the common defaults.
    headers = grab.config['common_headers']
    if grab.config['headers']:
        headers.update(grab.config['headers'])
    header_tuples = [str('%s: %s' % x) for x in headers.items()]
    self.curl.setopt(pycurl.HTTPHEADER, header_tuples)
    self.process_cookie_options(grab, request_url)
    if grab.config['referer']:
        self.curl.setopt(pycurl.REFERER, str(grab.config['referer']))
    if grab.config['proxy']:
        self.curl.setopt(pycurl.PROXY, str(grab.config['proxy']))
    else:
        # Explicitly clear any proxy left over from a previous request.
        self.curl.setopt(pycurl.PROXY, '')
    if grab.config['proxy_userpwd']:
        self.curl.setopt(pycurl.PROXYUSERPWD,
                         str(grab.config['proxy_userpwd']))
    if grab.config['proxy_type']:
        # Map e.g. 'socks5' onto the pycurl.PROXYTYPE_SOCKS5 constant.
        key = 'PROXYTYPE_%s' % grab.config['proxy_type'].upper()
        self.curl.setopt(pycurl.PROXYTYPE, getattr(pycurl, key))
    if grab.config['encoding']:
        if ('gzip' in grab.config['encoding'] and
                'zlib' not in pycurl.version):
            raise error.GrabMisuseError(
                'You can not use gzip encoding because '
                'pycurl was built without zlib support')
        self.curl.setopt(pycurl.ENCODING, grab.config['encoding'])
    if grab.config['userpwd']:
        self.curl.setopt(pycurl.USERPWD, str(grab.config['userpwd']))
    if grab.config.get('interface') is not None:
        self.curl.setopt(pycurl.INTERFACE, grab.config['interface'])
    if grab.config.get('reject_file_size') is not None:
        self.curl.setopt(pycurl.MAXFILESIZE,
                         grab.config['reject_file_size'])