import traceback
import urllib
from Queue import Empty

# Assumed from the surrounding module: LRequest, check_captcha and the
# module-level output file handle `f`.


def do(queue, string_proxy):
    # Worker: drain categories from the queue, search Amazon for each and
    # record the reported result count as "<category>\t<count>".
    lr = LRequest(string_proxy=string_proxy)
    while 1:
        try:
            # e.g. https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=sheets+silk
            category = queue.get(timeout=30)
            url = ('https://www.amazon.com/s/ref=nb_sb_noss?'
                   'url=search-alias%%3Daps&field-keywords=%s'
                   % urllib.quote_plus(category))
            lr.load(url)
            if check_captcha(lr):
                # A captcha was hit and solved; reload the search page.
                lr.load(url)
            ele = lr.xpath('//h2[@id="s-result-count"]')
            # "1-48 of over 2,000 results for ..." -> "over 2000"
            count = ele.text.split('result', 1)[0].split('of')[-1].strip().replace(',', '')
            f.write('%s\t%s\n' % (category, count))
            f.flush()
            print '%s\t%s' % (category, count)
        except Empty:
            # Queue stayed empty for 30 seconds: assume the work is done.
            print 'empty'
            break
        except Exception as e:
            # Put the category back so another attempt can pick it up.
            queue.put(category)
            print 'EEEEEEEEE %s' % e
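# A hedged driver sketch (not part of the original module) showing how `do`
# is meant to be used: several threads drain one shared Queue, one proxy per
# worker, and each worker exits on its own once the queue stays empty for
# 30 seconds. The function name and arguments here are illustrative.
def run_do_workers(categories, proxies):
    import threading
    from Queue import Queue

    q = Queue()
    for category in categories:
        q.put(category)
    threads = [threading.Thread(target=do, args=(q, p)) for p in proxies]
    for t in threads:
        t.start()
    for t in threads:
        t.join()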
from urlparse import urljoin

# Assumed from the surrounding module: LRequest and logger.


def get_codes(delay=.0):
    # 20200810: ifeng needs a ~4s delay between requests.
    codes = []
    urls = [
        'http://app.finance.ifeng.com/list/stock.php?t=ha&f=symbol&o=asc',
        'http://app.finance.ifeng.com/list/stock.php?t=hs&f=symbol&o=asc',
        'http://app.finance.ifeng.com/list/stock.php?t=sa&f=symbol&o=asc',
        'http://app.finance.ifeng.com/list/stock.php?t=kcb&f=symbol&o=asc',
    ]
    lr = LRequest(delay=delay)
    try:
        # Was `for url, m in urls:`, which raises ValueError on a list of
        # plain strings.
        for url in urls:
            # logger.info('Load: %s' % url)
            lr.load(url, isdecode=True)
            while 1:
                # The last first-column anchor is pagination, not a stock code.
                for ele in lr.xpaths('//div[@class="tab01"]/table//td[1]/a')[:-1]:
                    code = ele.text.strip()
                    if code.isdigit():
                        codes.append(code)
                # Follow the "next page" link until there is none.
                next_ele = lr.xpath(u'//a[contains(text(), "下一页")]')
                if next_ele is None:
                    break
                next_url = urljoin(url, next_ele.attrib['href'])
                # logger.info('Load: %s' % next_url)
                lr.load(next_url, isdecode=True)
    except Exception:
        logger.error(traceback.format_exc())
    return codes
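# Hedged usage sketch (illustrative, not in the original source): per the
# 20200810 note, a ~4 second delay avoids throttling, and the four listings
# can overlap, so codes are deduplicated before writing.
def dump_codes(path='codes.txt'):
    codes = get_codes(delay=4.0)
    with open(path, 'w') as fp:
        for code in sorted(set(codes)):
            fp.write(code + '\n')
    return len(codes)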
class GoogleSearch(object):
    search_url = ('https://www.google.%(tld)s/search?q=%(query)s&hl=%(lang)s'
                  '&filter=%(filter)d&num=%(num)d&start=%(start)s&btnG=Google+Search')

    def __init__(self, query, *args, **kwargs):
        self.query = query
        self._tld = kwargs.get('tld', 'com')
        self._filter = kwargs.get('filter', 0)
        self._lang = kwargs.get('lang', 'en')
        self._num = kwargs.get('num', 100)
        self._page = kwargs.get('page', 0)
        timeout = kwargs.get('timeout', 90)
        string_proxy = kwargs.get('string_proxy', None)
        self.lr = LRequest(timeout=timeout, string_proxy=string_proxy,
                           handers=[GoogleHTTPErrorProcessor()])

    @property
    def page(self):
        return self._page

    @page.setter
    def page(self, value):
        self._page = value

    def _get_result(self):
        safe_url = self.search_url % {'query': urllib.quote_plus(self.query),
                                      'start': self.page * self._num,
                                      'num': self._num,
                                      'tld': self._tld,
                                      'lang': self._lang,
                                      'filter': self._filter}
        print safe_url
        self.lr.load(safe_url)

        results = []
        # Each organic result sits in an <li class="g"> element.
        for r in self.lr.xpath('//li[@class="g"]'):
            result = {
                'title': ''.join(r.xpath('./div/h3//text()')),
                'description': ''.join(r.xpath('./div//span[@class="st"]//text()')),
                'url': ''.join(r.xpath('./div/h3/a/@href')),
            }
            results.append(result)
        print len(results)
        return results

    def get_result(self):
        return self._get_result()
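# Hedged usage sketch (names are illustrative, not from the original class):
# paging is driven by assigning to the `page` property between calls, since
# the request's `start` parameter is computed as page * num.
def collect_google_results(query, max_pages=3):
    gs = GoogleSearch(query, tld='com', lang='en', num=100)
    collected = []
    for page in range(max_pages):
        gs.page = page
        results = gs.get_result()
        if not results:
            break
        collected.extend(results)
    return collected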
def do(queue, string_proxy):
    # Variant of the worker above: also averages the listed prices on the
    # first result page and emits "<category>\t<count>\t<avg price>".
    lr = LRequest(string_proxy=string_proxy)
    while 1:
        try:
            # e.g. https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=sheets+silk
            category = queue.get(timeout=30)
            url = ('https://www.amazon.com/s/ref=nb_sb_noss?'
                   'url=search-alias%%3Daps&field-keywords=%s'
                   % urllib.quote_plus(category))
            lr.load(url)
            if check_captcha(lr):
                lr.load(url)

            total_price = 0.0
            count = 0.0
            price_eles = lr.xpaths('//span[contains(@class, "s-price a-text-bold")]')
            for price_ele in price_eles:
                # e.g. "$49.99" or "$12.99 - $19.99": strip "$" and ",",
                # keep the low end of a range.
                price = price_ele.text.replace('$', '').replace(',', '').split('-', 1)[0].strip()
                try:
                    total_price += float(price)
                    count += 1
                except ValueError:
                    pass

            if count > 0:
                ave_price = total_price / count
                ele = lr.xpath('//h2[@id="s-result-count"]')
                result_count = ele.text.split('result', 1)[0].split('of')[-1].strip().replace(',', '')
                f.write('%s\t%s\t%.2f\n' % (category, result_count, ave_price))
                f.flush()
                print '%s\t%s\t%.2f' % (category, result_count, ave_price)
        except Empty:
            print 'empty'
            break
        except Exception as e:
            traceback.print_exc()
            queue.put(category)
            print 'EEEEEEEEE %s' % e
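# Hedged helper sketch (an assumption, not in the original): the inline
# replace/split chain above encodes "first dollar amount, commas dropped";
# a small regex helper keeps that intent in one place and returns None
# instead of silently skipping unparseable text.
import re

_PRICE_RE = re.compile(r'(\d[\d,]*(?:\.\d+)?)')

def parse_first_price(text):
    # "$12.99 - $19.99" -> 12.99; "$1,299.00" -> 1299.0; garbage -> None
    m = _PRICE_RE.search(text or '')
    if m is None:
        return None
    try:
        return float(m.group(1).replace(',', ''))
    except ValueError:
        return None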
import gzip
import os
import pickle
import traceback
import urllib.parse

# Assumed from the surrounding package: config, logger, LRequest, GsaCaptcha
# and the field decorators used on product_detail/product below.

# XPath for Amazon's captcha interstitial, shared by load()/check_captcha().
CAPTCHA_IMG_XPATH = ('//form[contains(@action, "Captcha")]'
                     '//img[contains(@src, "captcha")]')


class AmazonBase(object):
    CACHE_ROOT = ''
    CACHE_PAGES_ROOT = ''
    CACHE_IMAGES_ROOT = ''
    CACHE_EXPIRED_DAYS = 15
    captcha = None

    def __init__(self, **kwargs):
        self.lr = LRequest(string_proxy=kwargs.get('string_proxy', ''))
        self.captcha = GsaCaptcha(ip=kwargs.get('gsa_ip', '192.168.1.188'),
                                  port=kwargs.get('gsa_port', '8000'))

        self.CACHE_ROOT = config.AMAZON_CACHE_ROOT
        self.CACHE_PAGES_ROOT = kwargs.get(
            'cache_page', os.path.join(self.CACHE_ROOT, 'pages'))
        self.CACHE_IMAGES_ROOT = kwargs.get(
            'cache_image', os.path.join(self.CACHE_ROOT, 'images'))

        for path in (self.CACHE_ROOT, self.CACHE_PAGES_ROOT,
                     self.CACHE_IMAGES_ROOT):
            if not os.path.exists(path):
                os.makedirs(path)

        self.domain = kwargs.get('domain', 'amazon.com')
        self.CACHE_EXPIRED_DAYS = kwargs.get('cache_expired_days', 15)

    def load(self, url, is_xpath=True, is_decode=True):
        # logger.info('Load Url: %s' % url)
        url = urllib.parse.quote(url, safe='https:/')
        self.lr.load(url, is_xpath=is_xpath, is_decode=is_decode)
        if self.check_captcha():
            # A captcha was solved along the way; reload the original page.
            self.lr.load(url, is_xpath=is_xpath, is_decode=is_decode)

    def check_captcha(self):
        # Returns True if a captcha was present and solved, False if there
        # was no captcha (or solving failed).
        if self.captcha is None:
            raise RuntimeError('No captcha server configured...')
        captcha_img_ele = self.lr.xpath(CAPTCHA_IMG_XPATH)
        if captcha_img_ele is not None:
            while 1:
                logger.info('Need Captcha')
                try:
                    if captcha_img_ele is None:
                        return True
                    print('##### %s ' % captcha_img_ele.attrib['src'])
                    form = self.lr.get_forms()[0]
                    # Fetch the captcha image and have the GSA server solve it.
                    self.lr.load(captcha_img_ele.attrib['src'])
                    cap = self.captcha.decode_stream(self.lr.body)
                    logger.info('Captcha: %s' % cap)
                    form['field-keywords'] = cap
                    self.lr.load(form.click())
                    captcha_img_ele = self.lr.xpath(CAPTCHA_IMG_XPATH)
                except KeyboardInterrupt:
                    raise
                except IndexError:
                    # No form on the page: reload and look again.
                    self.lr.load(self.lr.current_url)
                    captcha_img_ele = self.lr.xpath(CAPTCHA_IMG_XPATH)
                    if captcha_img_ele is None:
                        return True
                except Exception:
                    # open(os.path.join('I:\\captcha_error_page', '%s.html' % time.time()), 'w').write(self.lr.body)
                    logger.error(traceback.format_exc())
        return False

    def _cache_path(self, root, name):
        # Caches are sharded two levels deep by the name's first two
        # characters, e.g. <root>/B/0/B01... .
        return os.path.join(root, name[0], name[1], name)

    def exists_cache(self, cache_name):
        return os.path.exists(self._cache_path(self.CACHE_PAGES_ROOT, cache_name))

    def remove_cache(self, cache_name):
        cache_path = self._cache_path(self.CACHE_PAGES_ROOT, cache_name)
        if os.path.exists(cache_path):
            try:
                os.remove(cache_path)
            except OSError:
                pass

    def load_cache(self, cache_name):
        cache_path = self._cache_path(self.CACHE_PAGES_ROOT, cache_name)
        if os.path.exists(cache_path):
            try:
                return pickle.loads(gzip.GzipFile(cache_path, 'rb').read())
            except Exception:
                return {}
        return {}

    def save_cache(self, cache_name, data):
        cache_path = self._cache_path(self.CACHE_PAGES_ROOT, cache_name)
        _p = os.path.dirname(cache_path)
        if not os.path.exists(_p):
            os.makedirs(_p)
        gzip_file = gzip.open(cache_path, 'wb')
        gzip_file.write(pickle.dumps(data))
        gzip_file.close()

    def exists_image(self, name):
        return os.path.exists(self._cache_path(self.CACHE_IMAGES_ROOT, name))

    def save_image(self, name, data):
        image_path = self._cache_path(self.CACHE_IMAGES_ROOT, name)
        _p = os.path.dirname(image_path)
        if not os.path.exists(_p):
            os.makedirs(_p)
        with open(image_path, 'wb') as fp:
            fp.write(data)

    @staticmethod
    def wrapped_url(url):
        # Strip the "/ref=..." tracking suffix from a product URL.
        return url.split('/ref', 1)[0]

    # Each decorator below scrapes one field into kwargs['product_info'];
    # the method body just returns the accumulated dict.
    @cache()
    @load_html
    @name
    @price
    @brand
    @merchant
    @sold_by
    @reviews
    @star
    @ranks_str
    @other_seller
    @weight_ounces
    def product_detail(self, asin, is_cache=True, **kwargs):
        return kwargs.get('product_info', {})

    @cache()
    @load_html
    @image_urls
    @image_data
    def product(self, asin, is_cache=True, **kwargs):
        return kwargs.get('product_info', {})
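# Hedged usage sketch (the ASIN and field names are illustrative): with the
# sharding above, a page cached as "B01EXAMPLE" would live at
# <CACHE_PAGES_ROOT>/B/0/B01EXAMPLE.
def fetch_product_example():
    amazon = AmazonBase(string_proxy='', domain='amazon.com',
                        gsa_ip='192.168.1.188', gsa_port='8000')
    info = amazon.product_detail('B01EXAMPLE', is_cache=True)
    print(info.get('name'), info.get('price'))
    return info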