Example #1
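Scrapes new-share subscription stock codes from quotes.money.163.com for a given year (defaulting to the current year), then follows the pagination links and collects the codes from the third table column of each page.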
def get_new_stock_code(year=None):

    lr = LRequest()
    stock_codes = []

    if year is None:
        year = str(datetime.date.today().year)

    lr.load('http://quotes.money.163.com/data/ipo/shengou.html?reportdate=%s' %
            year)
    # lr.loads(BeautifulSoup(lr.body).prettify())

    for ele in lr.xpaths('//table[@id="plate_performance"]/tr/td[3]'):  # codes
        # print ele.text.strip()
        stock_codes.append(ele.text.strip())

    for ele in lr.xpaths(
            '//div[@class="fn_cm_pages"]//a[contains(@href, "page")]'
    )[:-1]:  # pages
        u = urljoin('http://quotes.money.163.com/data/ipo/shengou.html',
                    ele.attrib['href'])

        lr.load(u)
        lr.loads(BeautifulSoup(lr.body, 'lxml').prettify())

        for ce in lr.xpaths(
                '//table[@id="plate_performance"]/tr/td[3]'):  # codes
            # print ce.text.strip()
            stock_codes.append(ce.text.strip())

    return stock_codes
Example #2
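A queue-driven worker: for each category name it builds an Amazon search URL, reloads once if a captcha page is detected, and appends the reported result count to an open file `f` (defined elsewhere in the source module).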
def do(queue, string_proxy):
    lr = LRequest(string_proxy=string_proxy)
    while 1:
        try:
            # https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=sheets+silk
            category = queue.get(timeout=30)
            url = 'https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%%3Daps&field-keywords=%s' % urllib.quote_plus(
                category)

            lr.load(url)
            if check_captcha(lr):
                lr.load(url)
            ele = lr.xpath('//h2[@id="s-result-count"]')

            f.write('%s\t%s\n' % (category, ele.text.split(
                'result', 1)[0].split('of')[-1].strip().replace(',', '')))
            f.flush()
            print '%s\t%s' % (category, ele.text.split(
                'result', 1)[0].split('of')[-1].strip().replace(',', ''))

        except Empty:
            print 'empty'
            break
        except Exception as e:
            queue.put(category)
            print 'EEEEEEEEE %s' % e
Example #3
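Walks four ifeng.com stock-listing pages, collecting numeric codes from the first table column and following the "下一页" (next page) link until none remains; the 2020-08-10 comment suggests the site needed roughly a 4-second delay between requests.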
def get_codes(delay=.0):  # 20200810: need delay 4s
    codes = []
    urls = [
        'http://app.finance.ifeng.com/list/stock.php?t=ha&f=symbol&o=asc',
        'http://app.finance.ifeng.com/list/stock.php?t=hs&f=symbol&o=asc',
        'http://app.finance.ifeng.com/list/stock.php?t=sa&f=symbol&o=asc',
        'http://app.finance.ifeng.com/list/stock.php?t=kcb&f=symbol&o=asc',
    ]

    lr = LRequest(delay=delay)

    try:
        for url in urls:  # each entry is a plain URL string; unpacking "url, m" would raise ValueError
            # logger.info('Load: %s' % url)
            lr.load(url, isdecode=True)
            while 1:
                for ele in lr.xpaths(
                        '//div[@class="tab01"]/table//td[1]/a')[:-1]:
                    code = ele.text.strip()
                    if code.isdigit():
                        codes.append(code)

                next_ele = lr.xpath(u'//a[contains(text(), "下一页")]')
                if next_ele is None:
                    break
                next_url = urljoin(url, next_ele.attrib['href'])
                # logger.info('Load: %s' % next_url)
                lr.load(next_url, isdecode=True)
    except Exception:
        logger.error(traceback.format_exc())
    return codes
Example #4
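A constructor that stores a cache handle and debug level and creates a rate-limited LRequest.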
    def __init__(self,
                 delay=0.0,
                 cache=None,
                 debuglevel=0):  #, input, output, **kwargs):
        # threading.Thread.__init__(self)

        # self.input = input
        # self.output = output

        self.count = 0
        self.cache = cache
        self.debuglevel = debuglevel
        self.lr = LRequest(delay=delay)
Example #5
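A constructor that populates class-level cache, proxy, and captcha settings for an Amazon scraper and creates a thread pool sized by `max_workers`.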
    def __init__(self, **kwargs):

        Amazon.CACHE_ROOT = kwargs.get('CACHE_ROOT', 'I:\\cache')
        Amazon.CACHE_EXPIRED_DAYS = kwargs.get('CACHE_EXPIRED_DAYS', 15)

        Amazon.max_workers = kwargs.get('max_workers', 1)
        Amazon.string_proxies = kwargs.get('string_proxies', [])

        if len(Amazon.string_proxies) > 0:
            Amazon.lr = LRequest(string_proxy=Amazon.string_proxies[0])

        Amazon.captcha = GsaCaptcha(ip='192.168.1.188', port='8000')

        self.executor = LThreadPoolExecutor(max_workers=Amazon.max_workers)
Example #6
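A constructor for a Google search helper: it stores the query options (TLD, filter, language, result count, page) and builds an LRequest with a custom HTTP error processor.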
    def __init__(self, query, *args, **kwargs):

        self.query = query

        self._tld = kwargs.get('tld', 'com')
        self._filter = kwargs.get('filter', 0)
        self._lang = kwargs.get('lang', 'en')
        self._num = kwargs.get('num', 100)
        self._page = kwargs.get('page', 0)

        timeout = kwargs.get('timeout', 90)
        string_proxy = kwargs.get('string_proxy', None)

        self.lr = LRequest(timeout=timeout, string_proxy=string_proxy, handers=[GoogleHTTPErrorProcessor(), ])
Example #7
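Fetches Hong Kong ADR symbols from Sina's JSONP statistics API; the `split('](', 1)` call strips the JSONP callback wrapper so the remainder parses as plain JSON.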
def get_codes_sina(delay=.0):
    codes = []
    # url = 'http://vip.stock.finance.sina.com.cn/datacenter/hqstat.html#jdgd'
    url = '''http://money.finance.sina.com.cn/quotes_service/api/jsonp_v2.php/IO.XSRV2.CallbackList['ys65jC9HtVOEBgTh']/StatisticsService.getPeriodList?page=1&num=9999&sort=_5high&asc=0&node=adr_hk'''

    lr = LRequest(delay=delay)

    try:
        lr.load(url, isdecode=True)

        for s in json.loads(lr.body.split('](', 1)[-1][:-2]):
            codes.append(s['symbol'])
    except Exception:
        logger.error(traceback.format_exc())

    return codes
Example #8
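A variant of the Example #2 worker that also averages the bold list prices on the results page and writes category, result count, and average price on each line.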
def do(queue, string_proxy):
    lr = LRequest(string_proxy=string_proxy)
    while 1:
        try:
            # https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=sheets+silk
            category = queue.get(timeout=30)
            url = 'https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%%3Daps&field-keywords=%s' % urllib.quote_plus(
                category)

            lr.load(url)
            if check_captcha(lr):
                lr.load(url)

            total_price = 0.0
            count = 0.0
            ave_price = 0.0  # avoid a NameError below when no prices are found
            price_eles = lr.xpaths(
                '//span[contains(@class, "s-price a-text-bold")]')
            for price_ele in price_eles:  # $49.99
                price = price_ele.text.replace('$', '').replace(',', '').split(
                    '-', 1)[0].strip()
                try:
                    float(price)
                except ValueError:  # skip unparsable prices
                    pass
                else:
                    total_price += float(price)
                    count += 1
            if count > 0:
                ave_price = total_price / count

            ele = lr.xpath('//h2[@id="s-result-count"]')

            f.write('%s\t%s\t%.2f\n' % (category, ele.text.split(
                'result', 1)[0].split('of')[-1].strip().replace(
                    ',', ''), ave_price))
            f.flush()
            print '%s\t%s\t%.2f' % (category, ele.text.split(
                'result', 1)[0].split('of')[-1].strip().replace(',',
                                                                ''), ave_price)

        except Empty:
            print 'empty'
            break
        except Exception as e:
            traceback.print_exc()
            queue.put(category)
            print 'EEEEEEEEE %s' % e
Example #9
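A constructor that wires up a proxied LRequest, a GSA captcha client, and page/image cache directories, creating the directories if they do not exist.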
    def __init__(self, **kwargs):

        self.lr = LRequest(string_proxy=kwargs.get('string_proxy', ''))

        self.captcha = GsaCaptcha(ip=kwargs.get('gsa_ip', '192.168.1.188'), port=kwargs.get('gsa_port', '8000'))

        self.CACHE_ROOT = kwargs.get('cache_root', 'I:\\cache_amazon')
        self.CACHE_PAGES_ROOT = kwargs.get('cache_page', os.path.join(self.CACHE_ROOT, 'pages'))
        self.CACHE_IMAGES_ROOT = kwargs.get('cache_image', os.path.join(self.CACHE_ROOT, 'images'))

        if not os.path.exists(self.CACHE_ROOT): os.makedirs(self.CACHE_ROOT)
        if not os.path.exists(self.CACHE_PAGES_ROOT): os.makedirs(self.CACHE_PAGES_ROOT)
        if not os.path.exists(self.CACHE_IMAGES_ROOT): os.makedirs(self.CACHE_IMAGES_ROOT)

        self.domain = kwargs.get('domain', 'amazon.com')

        self.CACHE_EXPIRED_DAYS = kwargs.get('cache_expired_days', 15)
Example #10
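A breadth-first crawl of a category tree: each queue entry holds a URL and its depth, the XPath is built to match that depth, and unseen category names are printed and re-queued one level deeper.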
def iter_name(string_proxy, queue):
    lr = LRequest(string_proxy)

    while 1:
        try:
            url, deep = queue.get(timeout=30)
            xp = '//ul[@id="zg_browseRoot"]/%s/li/a' % '/'.join(['ul' for i in range(deep)])
            # print xp
            lr.load(url.encode('utf-8'))

            next_deep = deep + 1
            for ele in lr.xpaths(xp):
                name = ele.text.strip()
                if name not in categories:
                    categories.add(name)
                    print name.encode('utf-8')
                    queue.put([ele.attrib['href'], next_deep])

        except Empty:
            print 'Empty'
Example #11
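Loads eastmoney.com's grid-list page and extracts (exchange, code) pairs, taking the code from the parenthesized part of each link's text.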
def get_all_codes():
    stock_code_url = 'http://quote.eastmoney.com/center/gridlist.html'  # 'http://quote.eastmoney.com/stocklist.html' # us: http://quote.eastmoney.com/usstocklist.html
    exchanges = ['ss', 'sz', 'hk']

    lr = LRequest()
    stock_codes = []

    lr.load(stock_code_url)

    # stock_eles = lr.xpath('//div[@id="quotesearch"]//li/a[@target="_blank"]')
    stock_exchange_eles = lr.xpaths('//div[@id="quotesearch"]/ul')

    for i, stock_exchange_ele in enumerate(stock_exchange_eles):
        stock_eles = stock_exchange_ele.xpath('./li/a[@target="_blank"]')
        for stock_ele in stock_eles:
            # code = stock_ele.get('href').rsplit('/', 1)[-1].split('.', 1)[0]
            if stock_ele.text:
                code = stock_ele.text.split('(', 1)[-1].split(')', 1)[0]

                stock_codes.append((exchanges[i], code))

    return stock_codes
Example #12
def iter_name(string_proxy, queue):
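    # Variant of the Example #10 crawler that logs category names instead of printing them.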
    lr = LRequest(string_proxy)

    while 1:
        try:
            url, deep = queue.get(timeout=30)
            xp = '//ul[@id="zg_browseRoot"]/%s/li/a' % '/'.join(['ul' for i in range(deep)])
            # logger.info(xp)
            lr.load(url)

            next_deep = deep + 1
            for ele in lr.xpaths(xp):
                name = ele.text.strip()
                if name not in categories:
                    categories.add(name)
                    logger.info(name)
                    queue.put([ele.attrib['href'], next_deep])
        # except KeyboardInterrupt:
        #     return
        except Empty:  # must be caught before Exception, since queue.Empty subclasses it
            logger.info('Empty')
        except Exception:
            traceback.print_exc()
Example #13
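A minimal constructor that records an IP/port pair and creates a plain LRequest.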
    def __init__(self, ip='127.0.0.1', port='80'):
        self.ip = ip
        self.port = port

        self.lr = LRequest()
Example #14
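A standalone Python 3 script: it percent-encodes a best-sellers URL containing a non-ASCII character ("Décor"), loads it, and prints each third-level category name and link.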
# -*- coding: utf-8 -*-
__author__ = 'xtwxfxk'

import urllib.parse
from lutils.lrequest import LRequest

# url = 'https://www.amazon.com/Best-Sellers-Home-Kitchen/zgbs/home-garden/ref=zg_bs_nav_0'
url = 'https://www.amazon.com/Best-Sellers-Home-Kitchen-Décor-Products/zgbs/home-garden/1063278'
# url = urllib.parse.quote('https://www.amazon.com/Best-Sellers-Home-Kitchen-Décor-Products/zgbs/home-garden/1063278')
# url = urllib.parse.urlencode('https://www.amazon.com/Best-Sellers-Home-Kitchen-Décor-Products/zgbs/home-garden/1063278')
url = urllib.parse.quote(url, safe='https:/')
print(url)
lr = LRequest()

lr.load(url, is_decode=True)
eles = lr.xpaths('//ul[@id="zg_browseRoot"]/ul/ul/ul/li/a')

for ele in eles:
    print(ele.text.strip(), ele.attrib['href'])

# https://www.amazon.com/Best-Sellers-Home-Kitchen-D%C3%A9cor-Products/zgbs/home-garden/1063278/
Example #15
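A minimal constructor that creates an LRequest with an optional proxy string.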
        def __init__(self, **kwargs):

            self.lr = LRequest(string_proxy=kwargs.get('string_proxy', ''))