# Shared imports assumed by the snippets below; the original file-level
# imports were not part of this excerpt.
import datetime
from bs4 import BeautifulSoup
try:
    from urllib.parse import urljoin  # Python 3
except ImportError:
    from urlparse import urljoin  # Python 2
from lutils.lrequest import LRequest


def get_new_stock_code(year=None):
    lr = LRequest()
    stock_codes = []
    if year is None:
        year = str(datetime.date.today().year)
    lr.load('http://quotes.money.163.com/data/ipo/shengou.html?reportdate=%s' % year)
    # First page: the third table column holds the codes
    for ele in lr.xpaths('//table[@id="plate_performance"]/tr/td[3]'):
        stock_codes.append(ele.text.strip())
    # Follow the remaining pager links (last entry skipped, as in the original)
    for ele in lr.xpaths('//div[@class="fn_cm_pages"]//a[contains(@href, "page")]')[:-1]:
        u = urljoin('http://quotes.money.163.com/data/ipo/shengou.html', ele.attrib['href'])
        lr.load(u)
        lr.loads(BeautifulSoup(lr.body, 'lxml').prettify())
        for ce in lr.xpaths('//table[@id="plate_performance"]/tr/td[3]'):
            stock_codes.append(ce.text.strip())
    return stock_codes
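# Usage sketch (an assumption, not part of the original): fetch this year's
# and last year's IPO subscription codes, relying on the imports above.
this_year = datetime.date.today().year
for y in (this_year, this_year - 1):
    new_codes = get_new_stock_code(year=str(y))
    print('%s: %d new codes' % (y, len(new_codes)))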
def do(queue, string_proxy):
    # Worker: pull a category off the queue, search Amazon for it, and record
    # the reported result count. `f` and `check_captcha` are module-level.
    lr = LRequest(string_proxy=string_proxy)
    while 1:
        try:
            # e.g. https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=sheets+silk
            category = queue.get(timeout=30)
            url = 'https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%%3Daps&field-keywords=%s' % urllib.quote_plus(category)
            lr.load(url)
            if check_captcha(lr):
                lr.load(url)  # reload once the captcha has been solved
            ele = lr.xpath('//h2[@id="s-result-count"]')
            # "1-48 of 2,000 results" -> "2000"
            result_count = ele.text.split('result', 1)[0].split('of')[-1].strip().replace(',', '')
            f.write('%s\t%s\n' % (category, result_count))
            f.flush()
            print '%s\t%s' % (category, result_count)
        except Empty:
            print 'empty'
            break
        except Exception as e:
            queue.put(category)  # re-queue the category for a retry
            print 'EEEEEEEEE %s' % e
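# A hedged driver sketch for the worker above. The output file `f`, the
# `check_captcha` helper and the proxy list live at module level in the
# original; `PROXIES` and `run_workers` here are hypothetical.
import threading
from Queue import Queue  # Python 2; "from queue import Queue" on Python 3

PROXIES = ['http://127.0.0.1:8118']

def run_workers(categories):
    queue = Queue()
    for category in categories:
        queue.put(category)
    threads = [threading.Thread(target=do, args=(queue, p)) for p in PROXIES]
    for t in threads:
        t.start()
    for t in threads:
        t.join()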
def get_codes(delay=.0):  # 20200810: needs a 4s delay
    codes = []
    urls = [
        'http://app.finance.ifeng.com/list/stock.php?t=ha&f=symbol&o=asc',
        'http://app.finance.ifeng.com/list/stock.php?t=hs&f=symbol&o=asc',
        'http://app.finance.ifeng.com/list/stock.php?t=sa&f=symbol&o=asc',
        'http://app.finance.ifeng.com/list/stock.php?t=kcb&f=symbol&o=asc',
    ]
    lr = LRequest(delay=delay)
    try:
        for url in urls:  # bug fix: was "for url, m in urls", which fails on a list of strings
            lr.load(url, isdecode=True)
            while 1:
                for ele in lr.xpaths('//div[@class="tab01"]/table//td[1]/a')[:-1]:
                    code = ele.text.strip()
                    if code.isdigit():
                        codes.append(code)
                next_ele = lr.xpath(u'//a[contains(text(), "下一页")]')  # "next page" link
                if next_ele is None:
                    break
                next_url = urljoin(url, next_ele.attrib['href'])
                lr.load(next_url, isdecode=True)
    except:
        logger.error(traceback.format_exc())
    return codes
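# Usage sketch, following the "needs a 4s delay" note above; assumes `logger`
# and `LRequest` are configured as in the original module.
codes = get_codes(delay=4.0)
print('%d codes from ifeng' % len(codes))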
def __init__(self, delay=0.0, cache=None, debuglevel=0):
    self.count = 0
    self.cache = cache
    self.debuglevel = debuglevel
    self.lr = LRequest(delay=delay)
def __init__(self, **kwargs):
    Amazon.CACHE_ROOT = kwargs.get('CACHE_ROOT', 'I:\\cache')
    Amazon.CACHE_EXPIRED_DAYS = kwargs.get('CACHE_EXPIRED_DAYS', 15)
    Amazon.max_workers = kwargs.get('max_workers', 1)
    Amazon.string_proxies = kwargs.get('string_proxies', [])
    if len(Amazon.string_proxies) > 0:
        Amazon.lr = LRequest(string_proxy=Amazon.string_proxies[0])
    Amazon.captcha = GsaCaptcha(ip='192.168.1.188', port='8000')
    self.executor = LThreadPoolExecutor(max_workers=Amazon.max_workers)
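# Hypothetical construction of the crawler above: every keyword mirrors a key
# read in __init__; the values are examples only.
amazon = Amazon(CACHE_ROOT='I:\\cache',
                CACHE_EXPIRED_DAYS=15,
                max_workers=4,
                string_proxies=['http://127.0.0.1:8118'])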
def __init__(self, query, *args, **kwargs):
    self.query = query
    self._tld = kwargs.get('tld', 'com')
    self._filter = kwargs.get('filter', 0)
    self._lang = kwargs.get('lang', 'en')
    self._num = kwargs.get('num', 100)
    self._page = kwargs.get('page', 0)
    timeout = kwargs.get('timeout', 90)
    string_proxy = kwargs.get('string_proxy', None)
    # "handers" is kept as-is: it is the parameter name LRequest is called
    # with upstream (possibly a typo for "handlers" in the library itself).
    self.lr = LRequest(timeout=timeout, string_proxy=string_proxy,
                       handers=[GoogleHTTPErrorProcessor()])
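# Instantiation sketch. The class name is not shown in this excerpt, so
# `GoogleSearch` is a placeholder; the keywords mirror those read in __init__.
gs = GoogleSearch('site:example.com python', tld='com', lang='en', num=100,
                  timeout=90, string_proxy='http://127.0.0.1:8118')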
def get_codes_sina(delay=.0):
    codes = []
    # Listing page: http://vip.stock.finance.sina.com.cn/datacenter/hqstat.html#jdgd
    url = '''http://money.finance.sina.com.cn/quotes_service/api/jsonp_v2.php/IO.XSRV2.CallbackList['ys65jC9HtVOEBgTh']/StatisticsService.getPeriodList?page=1&num=9999&sort=_5high&asc=0&node=adr_hk'''
    lr = LRequest(delay=delay)
    try:
        lr.load(url, isdecode=True)
        # The response is JSONP: drop the callback prefix up to "](" and the
        # trailing ");", then parse the remaining JSON array.
        for s in json.loads(lr.body.split('](', 1)[-1][:-2]):
            codes.append(s['symbol'])
    except:
        logger.error(traceback.format_exc())
    return codes
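# Self-contained illustration of the JSONP unwrapping used above: split once
# on "](" to drop the callback prefix, then cut the trailing ");".
import json

jsonp = """IO.XSRV2.CallbackList['ys65jC9HtVOEBgTh']([{"symbol": "hk01234"}]);"""
payload = jsonp.split('](', 1)[-1][:-2]  # '[{"symbol": "hk01234"}]'
assert [s['symbol'] for s in json.loads(payload)] == ['hk01234']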
def do(queue, string_proxy):
    # Variant of the worker above that also averages the listed prices.
    lr = LRequest(string_proxy=string_proxy)
    while 1:
        try:
            # e.g. https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=sheets+silk
            category = queue.get(timeout=30)
            url = 'https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%%3Daps&field-keywords=%s' % urllib.quote_plus(category)
            lr.load(url)
            if check_captcha(lr):
                lr.load(url)
            total_price = 0.0
            count = 0.0  # float so the Python 2 division below isn't truncated
            price_eles = lr.xpaths('//span[contains(@class, "s-price a-text-bold")]')
            for price_ele in price_eles:
                # e.g. "$49.99" or "$49.99 - $79.99": keep the low end of a range
                price = price_ele.text.replace('$', '').replace(',', '').split('-', 1)[0].strip()
                try:
                    total_price += float(price)
                    count += 1
                except ValueError:
                    pass  # skip anything that isn't a parseable price
            if count > 0:
                ave_price = total_price / count
                ele = lr.xpath('//h2[@id="s-result-count"]')
                result_count = ele.text.split('result', 1)[0].split('of')[-1].strip().replace(',', '')
                f.write('%s\t%s\t%.2f\n' % (category, result_count, ave_price))
                f.flush()
                print '%s\t%s\t%.2f' % (category, result_count, ave_price)
        except Empty:
            print 'empty'
            break
        except Exception as e:
            traceback.print_exc()
            queue.put(category)  # re-queue for a retry
            print 'EEEEEEEEE %s' % e
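# Self-contained check of the price normalization above: strip "$" and ",",
# keep the low end of a "$a - $b" range, then convert to float.
def parse_price(text):
    return float(text.replace('$', '').replace(',', '').split('-', 1)[0].strip())

assert parse_price('$49.99') == 49.99
assert parse_price('$1,049.99 - $1,199.99') == 1049.99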
def __init__(self, **kwargs):
    self.lr = LRequest(string_proxy=kwargs.get('string_proxy', ''))
    self.captcha = GsaCaptcha(ip=kwargs.get('gsa_ip', '192.168.1.188'),
                              port=kwargs.get('gsa_port', '8000'))
    self.CACHE_ROOT = kwargs.get('cache_root', 'I:\\cache_amazon')
    self.CACHE_PAGES_ROOT = kwargs.get('cache_page', os.path.join(self.CACHE_ROOT, 'pages'))
    self.CACHE_IMAGES_ROOT = kwargs.get('cache_image', os.path.join(self.CACHE_ROOT, 'images'))
    if not os.path.exists(self.CACHE_ROOT):
        os.makedirs(self.CACHE_ROOT)
    if not os.path.exists(self.CACHE_PAGES_ROOT):
        os.makedirs(self.CACHE_PAGES_ROOT)
    if not os.path.exists(self.CACHE_IMAGES_ROOT):
        os.makedirs(self.CACHE_IMAGES_ROOT)
    self.domain = kwargs.get('domain', 'amazon.com')
    self.CACHE_EXPIRED_DAYS = kwargs.get('cache_expired_days', 15)
def iter_name(string_proxy, queue):
    # Walk the Amazon best-sellers category tree breadth-first. `categories`
    # is a module-level set used to deduplicate names across workers.
    lr = LRequest(string_proxy)
    while 1:
        try:
            url, deep = queue.get(timeout=30)
            # One "ul" per level of depth under the browse root
            xp = '//ul[@id="zg_browseRoot"]/%s/li/a' % '/'.join(['ul'] * deep)
            lr.load(url.encode('utf-8'))
            next_deep = deep + 1
            for ele in lr.xpaths(xp):
                name = ele.text.strip()
                if name not in categories:
                    categories.add(name)
                    print name.encode('utf-8')
                    queue.put([ele.attrib['href'], next_deep])
        except Empty:
            print 'Empty'
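# Hedged driver for the category walker: seed the queue with the best-sellers
# root at depth 1. `categories` is the shared set the worker expects;
# ROOT_URL and the proxy string are example values.
import threading
from Queue import Queue  # Python 2; "from queue import Queue" on Python 3

categories = set()
ROOT_URL = 'https://www.amazon.com/Best-Sellers/zgbs'

queue = Queue()
queue.put([ROOT_URL, 1])
worker = threading.Thread(target=iter_name, args=('http://127.0.0.1:8118', queue))
worker.start()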
def get_all_codes():
    # Old list page: http://quote.eastmoney.com/stocklist.html
    # US stocks: http://quote.eastmoney.com/usstocklist.html
    stock_code_url = 'http://quote.eastmoney.com/center/gridlist.html'
    exchanges = ['ss', 'sz', 'hk']
    lr = LRequest()
    stock_codes = []
    lr.load(stock_code_url)
    # One <ul> per exchange, in the order of `exchanges`
    stock_exchange_eles = lr.xpaths('//div[@id="quotesearch"]/ul')
    for i, stock_exchange_ele in enumerate(stock_exchange_eles):
        stock_eles = stock_exchange_ele.xpath('./li/a[@target="_blank"]')
        for stock_ele in stock_eles:
            if stock_ele.text:
                # Link text looks like "Name(600000)"; keep the code in parentheses
                code = stock_ele.text.split('(', 1)[-1].split(')', 1)[0]
                stock_codes.append((exchanges[i], code))
    return stock_codes
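# Usage sketch: group the (exchange, code) pairs returned above. Assumes the
# eastmoney page still serves the //div[@id="quotesearch"] markup.
from collections import defaultdict

by_exchange = defaultdict(list)
for exchange, code in get_all_codes():
    by_exchange[exchange].append(code)
for exchange, codes in sorted(by_exchange.items()):
    print('%s: %d codes' % (exchange, len(codes)))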
def iter_name(string_proxy, queue):
    # Logging variant of the category walker above.
    lr = LRequest(string_proxy)
    while 1:
        try:
            url, deep = queue.get(timeout=30)
            xp = '//ul[@id="zg_browseRoot"]/%s/li/a' % '/'.join(['ul'] * deep)
            lr.load(url)
            next_deep = deep + 1
            for ele in lr.xpaths(xp):
                name = ele.text.strip()
                if name not in categories:
                    categories.add(name)
                    logger.info(name)
                    queue.put([ele.attrib['href'], next_deep])
        except Empty:  # moved before "except Exception", which otherwise swallowed it
            logger.info('Empty')
        except Exception:
            traceback.print_exc()  # bug fix: was traceback.print_exec()
def __init__(self, ip='127.0.0.1', port='80'):
    self.ip = ip
    self.port = port
    self.lr = LRequest()
# -*- coding: utf-8 -*-
__author__ = 'xtwxfxk'

import urllib.parse

from lutils.lrequest import LRequest

# url = 'https://www.amazon.com/Best-Sellers-Home-Kitchen/zgbs/home-garden/ref=zg_bs_nav_0'
url = 'https://www.amazon.com/Best-Sellers-Home-Kitchen-Décor-Products/zgbs/home-garden/1063278'
# quote() percent-encodes the "é"; safe='https:/' lists the characters to
# leave unencoded (a character set, not a prefix)
url = urllib.parse.quote(url, safe='https:/')
print(url)  # https://www.amazon.com/Best-Sellers-Home-Kitchen-D%C3%A9cor-Products/zgbs/home-garden/1063278

lr = LRequest()
lr.load(url, is_decode=True)

for ele in lr.xpaths('//ul[@id="zg_browseRoot"]/ul/ul/ul/li/a'):
    print(ele.text.strip(), ele.attrib['href'])
def __init__(self, **kwargs):
    self.lr = LRequest(string_proxy=kwargs.get('string_proxy', ''))