Esempio n. 1
0
def _update_ohlc_daily(date, symbol, table, exchange):
    """Collect the single-day OHLC row of one symbol, or of every known symbol.

    Args:
        date: Day to fetch; passed through unchanged to ``get_date_ohlc``.
        symbol: A ticker symbol, or the literal ``'ALL'`` to process every
            symbol returned by ``get_all_symbols(exchange)``.
        table: Not used by this function; kept so existing callers still work.
        exchange: Exchange identifier handed to the lookup helpers.

    Returns:
        A DataFrame built from the one-row frames returned per symbol, or
        ``None`` when no symbol produced a usable row.
    """
    if symbol == 'ALL':
        items = get_all_symbols(exchange)
    else:
        items = [symbol]

    frames = []
    total = len(items)
    for cur_idx, t in enumerate(items, 1):
        print("%s/%s .. " % (cur_idx, total))
        df = get_date_ohlc(exchange, t, date)
        if df is None:
            continue
        if len(df) == 1:
            frames.append(df)
        else:
            # Report the symbol actually being processed (`t`); the `symbol`
            # argument is just 'ALL' when running in bulk mode.
            yyhtools.info("%s %s %s wrong data." % (exchange, t, date))
            yyhtools.error(str(df))

    yyhtools.info("_update_ohlc_daily finished")
    if not frames:
        return None
    # A single concat replaces repeated DataFrame.append calls, which copy
    # the accumulated frame on every iteration.
    return pd.concat(frames)
Esempio n. 2
0
def _update_ohlc_daily(date, curr_id, table):
    """Collect the single-day OHLC row of one instrument, or of all of them.

    Args:
        date: Day to fetch; passed through unchanged to ``get_data``.
        curr_id: Instrument id looked up in ``codes.currid2item``; ``0`` means
            every entry of ``codes.all_items``.
        table: Not used by this function; kept so existing callers still work.

    Returns:
        A DataFrame built from the one-row frames returned per instrument, or
        ``None`` when ``curr_id`` is unknown or nothing usable was fetched.
    """
    if curr_id == 0:
        items = codes.all_items
    else:
        item = codes.currid2item.get(curr_id)
        if not item:
            yyhtools.error("curr_id=%s not fund" % curr_id)
            # Nothing to fetch; carrying on with [None] would only crash on
            # the first t["page_url"] lookup below.
            return None
        items = [item]

    frames = []
    total = len(items)
    for cur_idx, t in enumerate(items, 1):
        print("%s/%s .. " % (cur_idx, total))
        PAGE_URL = INVESTING_HOST + t["page_url"]
        df = get_data(PAGE_URL, API_URL, t["curr_id"], date)
        if df is None:
            continue
        if len(df) == 1:
            frames.append(df)
        else:
            yyhtools.info("%s(curr_id=%s) 获取重复数据." % (t['name'], t['curr_id']))
            yyhtools.error(str(df))
    print("_update_ohlc_daily finished")
    if not frames:
        return None
    # A single concat replaces repeated DataFrame.append calls, which copy
    # the accumulated frame on every iteration.
    return pd.concat(frames)
Esempio n. 3
0
def get_data(exchange):
    """Refresh the ``us_<exchange>`` table and fill in missing cid rows.

    Downloads the company list CSV for ``exchange``, strips whitespace from
    the ``Symbol`` column, converts ``MarketCap`` with ``marketcap_to_float``
    and replaces the ``us_<exchange>`` table with the result.  Afterwards,
    every symbol that has no row in ``us_<exchange>_cid`` yet is resolved via
    ``get_cid`` and inserted there.

    Args:
        exchange: Exchange name; interpolated into the download URL and into
            the table names.

    Returns:
        None.  Outcomes are reported through ``ytrack`` / ``yyhtools``.
    """
    csv_path = "http://www.nasdaq.com/screening/companies-by-name.aspx?letter=0&exchange=%s&render=download" % exchange
    df = None
    for _ in range(3):
        try:
            time.sleep(0.1)
            frame = pd.read_csv(csv_path)
            frame['Symbol'] = frame['Symbol'].str.strip()
            frame['MarketCap'] = frame['MarketCap'].apply(marketcap_to_float)
            frame.to_sql('us_%s' % exchange, engine, if_exists='replace', index=True, index_label='id')
            ytrack.success("us_%s 数据更新成功" % exchange)
            # Only publish the frame once the table was written, so a failed
            # attempt cannot be mistaken for a successful update below.
            df = frame
            break
        except requests.exceptions.ConnectionError:
            # Log and fall through to the next attempt; returning here would
            # make the three-attempt loop pointless.
            yyhtools.error(traceback.format_exc())
    if df is None:
        ytrack.error("us_%s 数据更新失败." % exchange)
        return

    symbols = df['Symbol'].values.tolist()
    sql = 'select Symbol, cid from us_%s_cid' % exchange
    try:
        known = set(symbol for symbol, _cid in engine.execute(sql).fetchall())
        for symbol in symbols:
            if symbol in known:
                continue
            cid = get_cid(exchange, symbol)
            # NOTE(review): this statement is assembled by string formatting
            # from downloaded data; switch to bound parameters once the
            # driver's parameter style is confirmed.
            insert_sql = 'insert into us_%s_cid(Symbol, cid) values("%s", "%s")' % (exchange, symbol, cid)
            engine.execute(insert_sql)
            # Remember the symbol so a duplicate CSV entry is not inserted twice.
            known.add(symbol)
            ytrack.success("cid(%s, %s)=%s" % (exchange, symbol, cid))
    except Exception:
        yyhtools.error(traceback.format_exc())
Esempio n. 4
0
def get_all_data(items, data_dir):
    """Fetch the history of each item and save it as ``<code>.csv``.

    Files are written to ``CURDIR/data_dir``.  An item whose CSV file already
    exists is reported through ``yyhtools.error`` and skipped.  When ``DEBUG``
    is set only the first item is processed.

    Args:
        items: Sequence of mappings providing ``code``, ``page_url`` and
            ``curr_id`` entries.
        data_dir: Directory name, relative to ``CURDIR``, receiving the files.
    """
    targets = items[:1] if DEBUG else items
    for entry in targets:
        csv_file = "%s/%s/%s.csv" % (CURDIR, data_dir, entry['code'])
        if os.path.exists(csv_file):
            yyhtools.error("%s exists.." % csv_file)
            continue

        history = get_data(INVESTING_HOST + entry["page_url"],
                           API_URL,
                           entry["curr_id"])
        if history is None:
            yyhtools.error("%s is None" % csv_file)
            continue

        history.to_csv(csv_file)
        yyhtools.info("%s finished." % csv_file)
0
def get_cid(exchange, symbol):
    """Look up the ``cid`` form value on the historical-prices page of a symbol.

    The page is requested up to three times; an attempt that raises is logged
    and retried.

    Args:
        exchange: Exchange part of the ``exchange:symbol`` query.
        symbol: Ticker part of the ``exchange:symbol`` query.

    Returns:
        The ``value`` of the first ``<input name="cid">`` element, or the
        string ``'0'`` when the page has no such element or every attempt
        failed.
    """
    page_url = 'https://www.google.com.hk/finance/historical?q=%s:%s' % (exchange, symbol)
    for _ in range(3):
        try:
            time.sleep(0.005)
            r = s.get(page_url, proxies=proxies)
            html = lxml.html.parse(StringIO(r.text))
            res = html.xpath('//input[@name=\"cid\"]')
            if len(res) > 0:
                return res[0].value
            # The page loaded but carries no cid field: retrying cannot help.
            return '0'
        except Exception:
            # Log and try again; returning here would defeat the retry loop.
            details = traceback.format_exc()
            print(details)
            yyhtools.error(details)
    return '0'
Esempio n. 6
0
def get_all_symbols(exchange):
    """Return the symbols of ``us_<exchange>_cid`` that have a usable cid.

    Rows whose cid is empty or equal to the string ``'0'`` are left out.

    Args:
        exchange: Exchange name interpolated into the table name.

    Returns:
        A list of symbols; an empty list when the query or its processing
        raises.
    """
    try:
        sql = 'select Symbol, cid from us_%s_cid' % exchange
        rows = engine.execute(sql).fetchall()
        symbols = [symbol for symbol, cid in rows if cid and cid != '0']
        yyhtools.success("get all symbols len(symbols) = %s" % (len(symbols)))
        return symbols
    except Exception:
        yyhtools.success("get empty symbols")
        # `traceback`, not `trackback`: the misspelling raised NameError from
        # inside this handler and hid the real failure.
        yyhtools.error(traceback.format_exc())
        return []
Esempio n. 7
0
def get_date_ohlc(exchange, symbol, date):
    """Fetch the open/high/low/close row of one symbol for one day.

    The historical-prices page is requested up to three times; an attempt
    that raises is logged and retried.

    Args:
        exchange: Exchange part of the ``exchange:symbol`` query.
        symbol: Ticker part of the query; also mapped to a numeric code via
            ``get_code``.
        date: Object providing ``strftime``; the day to extract.

    Returns:
        A DataFrame indexed by ``code`` with columns ``date`` (an integer in
        YYYYMMDD form), ``open``, ``high``, ``low`` and ``close`` holding the
        row for ``date``; ``None`` when the page has no price table, the table
        has no row for ``date``, the symbol has no positive code, or every
        attempt failed.
    """
    print("%s %s %s" % (exchange, symbol, date))

    def date_to_int(text):
        # "YYYY-MM-DD" -> YYYYMMDD as an integer.
        y, m, d = text.split("-")
        return int(y) * 10000 + int(m) * 100 + int(d)

    page_url = 'https://www.google.com.hk/finance/historical?q=%s:%s' % (
        exchange, symbol)
    for _ in range(3):
        try:
            time.sleep(0.005)
            r = s.get(page_url, proxies=proxies)
            html = lxml.html.parse(StringIO(r.text))
            res = html.xpath('//table[@class=\"gf-table historical_price\"]')
            sarr = ''.join([etree.tostring(node) for node in res])
            if sarr == '':
                return None
            df = pd.read_html(sarr, skiprows=[0])[0]
            df.columns = ['date', 'open', 'high', 'low', 'close', 'amount']
            df = df.drop('amount', axis=1)

            df['date'] = df['date'].apply(date_to_int)
            df = df.drop_duplicates('date')
            cmp_d = int(date.strftime("%Y%m%d"))
            # Work on a copy so the insert below does not touch a view of the
            # parsed table.  Every remaining row already has date == cmp_d.
            df = df[df.date == cmp_d].copy()
            if len(df) == 0:
                return None

            code = get_code(symbol)
            if not code > 0:
                # Explicit check instead of `assert` (which is stripped under
                # -O); an unknown symbol will not improve on a retry.
                yyhtools.error('symbol code is %s' % code)
                return None
            df.insert(0, 'code', code)
            return df.set_index('code')
        except Exception:
            # Log and try again; returning here would defeat the retry loop.
            details = traceback.format_exc()
            print(details)
            yyhtools.error(details)
    return None
Esempio n. 8
0
File: jin10.py Progetto: rainly/hq
def get_data():
    """Download the jin10.com front page and store its news items.

    Every ``<div class="newsline">`` element becomes a ``News`` row whose id
    is the element's ``id`` attribute integer-divided by 100 and whose
    ``html`` is the element's markup.  Rows are merged into the database
    session and committed together; elements that cannot be processed are
    reported through ``ytrack.fail`` and skipped.

    Returns:
        None.  The outcome is reported through ``ytrack`` / ``yyhtools``.
    """
    page_url = 'http://www.jin10.com/'
    s = requests.Session()
    s.headers.update({
        "User-Agent":
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36",
    })

    r = None
    for _ in range(3):
        try:
            time.sleep(0.5)
            r = s.get(page_url)
            break
        except requests.exceptions.ConnectionError:
            # Log and fall through to the next attempt; returning here would
            # make the three-attempt loop pointless.
            yyhtools.error("%s" % (page_url))
            yyhtools.error(traceback.format_exc())

    if r is None:
        yyhtools.error("requests.get('%s') is None" % page_url)
        return

    # r.encoding is None when the response declares no charset; fall back to
    # UTF-8 rather than letting encode() raise.
    soup = BeautifulSoup(r.text.encode(r.encoding or 'utf-8'))
    allnews = soup.findAll("div", {"class": "newsline"})
    cnt = 0
    session = Session()
    try:
        for news in allnews:
            try:
                # int() with floor division yields the same value as the
                # Python 2 expression long(x) / 100.
                news_id = int(news.attrs.get('id')) // 100
                session.merge(News(id=news_id, html=str(news)))
                cnt += 1
            except Exception:
                ytrack.fail(traceback.format_exc())
        session.commit()
    finally:
        # Release the session whether or not the commit succeeded.
        session.close()
    ytrack.success("%s 成功更新 %s 条记录." % ('jin10_news', cnt))
Esempio n. 9
0
def get_data(page_url, api_url, curr_id, end_date=None):
    """Download the daily price history of one instrument.

    The history is requested from ``api_url`` in 500-day windows, walking
    backwards from ``end_date`` until a window yields no rows (or a request
    keeps failing).  When ``DEBUG`` is set only the first window is fetched.

    Args:
        page_url: Instrument page, requested once before the API calls.
        api_url: Endpoint receiving the ``historical_data`` POST requests.
        curr_id: Instrument id sent as ``curr_id``.
        end_date: Last day to include (``datetime.datetime``).  Defaults to
            2016-12-02, the value that used to be hard-coded.

    Returns:
        A DataFrame with columns ``date``, ``close``, ``open``, ``high``,
        ``low``, (``amount``,) ``percentage`` with ``date`` parsed to
        datetimes, or ``None`` when nothing could be fetched.
    """
    s = requests.Session()
    s.headers.update({
        "User-Agent":
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36",
    })

    # NOTE(review): the page response itself is never used; presumably this
    # request only primes the session before the API calls - confirm.
    page_loaded = False
    for _ in range(3):
        try:
            time.sleep(0.5)
            s.get(page_url)
            page_loaded = True
            break
        except requests.exceptions.ConnectionError:
            # Log and retry; returning here would defeat the retry loop.
            yyhtools.error("%s %s %s" % (page_url, api_url, curr_id))
            yyhtools.error(traceback.format_exc())
    if not page_loaded:
        return None

    s.headers.update({"X-Requested-With": "XMLHttpRequest"})

    data = {
        "action": "historical_data",
        "curr_id": str(curr_id),
        "interval_sec": "Daily"
    }
    if end_date is None:
        end_date = datetime.datetime(2016, 12, 2, 0, 0)
    frames = []
    while True:
        st_date = end_date - datetime.timedelta(days=500)
        data['st_date'] = str(st_date.strftime("%Y/%m/%d"))
        data['end_date'] = str(end_date.strftime("%Y/%m/%d"))
        r = None
        for _ in range(3):
            try:
                time.sleep(0.5)
                r = s.post(api_url, data=data)
                break
            except requests.exceptions.ConnectionError:
                yyhtools.error("%s %s %s" % (page_url, api_url, curr_id))
                yyhtools.error(traceback.format_exc())
        if r is None:
            break

        html = lxml.html.parse(StringIO(r.text))
        try:
            res = html.xpath('//table[@id=\"curr_table\"]')
        except Exception:
            yyhtools.error("%s %s %s" % (page_url, api_url, curr_id))
            yyhtools.error(traceback.format_exc())
            break
        sarr = [etree.tostring(node) for node in res]
        if six.PY3:
            sarr = [chunk.decode('utf-8') for chunk in sarr]
        # Join on both interpreters: previously the Python 3 branch left
        # `sarr` as a list, so the empty check below could never match.
        sarr = ''.join(sarr)

        if sarr == '':
            break
        df = pd.read_html(sarr)[0]
        if len(df) == 0:
            break
        if len(df) == 1 and df.iloc[0][u'日期'] == 'No results found...':
            break
        frames.append(df)
        # The next window ends the day before this one started.
        end_date = st_date - datetime.timedelta(days=1)

        if len(df) < 10:
            print(df)
        if DEBUG:
            break
    if not frames:
        return None

    # A single concat replaces repeated DataFrame.append calls, which copy
    # the accumulated frame on every iteration.
    result = pd.concat(frames, ignore_index=True)
    if len(result.columns) == 6:
        result.columns = [
            'date', 'close', 'open', 'high', 'low', 'percentage'
        ]
    else:
        result.columns = [
            'date', 'close', 'open', 'high', 'low', 'amount', 'percentage'
        ]
    result['date'] = pd.to_datetime(result['date'], format=u"%Y年%m月%d日")
    return result
Esempio n. 10
0
def get_data(page_url, api_url, curr_id, end_date):
    """Fetch the daily price row of one instrument for the day ``end_date``.

    Args:
        page_url: Instrument page; only used in log messages here.
        api_url: Endpoint receiving the ``historical_data`` POST request.
        curr_id: Instrument id sent as ``curr_id`` and used as the ``code``
            index of the result.
        end_date: The day to fetch; sent as both start and end of the
            requested range and compared against the parsed ``date`` column.

    Returns:
        A DataFrame indexed by ``code`` with columns ``date`` (an integer in
        YYYYMMDD form), ``close``, ``open``, ``high`` and ``low``, holding at
        most one row; ``None`` when the request keeps failing or the response
        contains no data table.
    """
    day = str(end_date.strftime("%Y/%m/%d"))
    data = {"action": "historical_data",
            "curr_id": str(curr_id),
            "interval_sec": "Daily",
            "st_date": day,
            "end_date": day}
    r = None
    for _ in range(3):
        try:
            time.sleep(0.5)
            r = s.post(api_url, data=data)
            break
        except requests.exceptions.ConnectionError:
            yyhtools.error("%s %s %s" % (page_url, api_url, curr_id))
            yyhtools.error(traceback.format_exc())
    if r is None:
        return

    html = lxml.html.parse(StringIO(r.text))
    try:
        res = html.xpath('//table[@id=\"curr_table\"]')
    except Exception:
        yyhtools.error("%s %s %s" % (page_url, api_url, curr_id))
        yyhtools.error(traceback.format_exc())
        return
    sarr = [etree.tostring(node) for node in res]
    if six.PY3:
        sarr = [chunk.decode('utf-8') for chunk in sarr]
    # Join on both interpreters: previously the Python 3 branch left `sarr`
    # as a list, so the empty check below could never match.
    sarr = ''.join(sarr)

    if sarr == '':
        return
    df = pd.read_html(sarr)[0]
    if len(df) == 0:
        return
    if len(df) == 1 and df.iloc[0][u'日期'] == 'No results found...':
        return

    # Name the columns by table width, then keep only the date and the four
    # price columns.
    if len(df.columns) == 6:
        df.columns = ['date', 'close', 'open', 'high', 'low', 'volume']
        unwanted = ['volume']
    else:
        df.columns = ['date', 'close', 'open', 'high', 'low', 'amount', 'volume']
        unwanted = ['volume', 'amount']
    df['date'] = pd.to_datetime(df['date'], format=u"%Y年%m月%d日")
    df = df.drop(unwanted, axis=1)
    df = df[df.date == end_date]
    if len(df) > 1:
        yyhtools.error("%s %s数据重复" % (curr_id, end_date))
        yyhtools.error(str(df))
        df = df[:1]
    # Work on a copy so the column writes below do not target a slice of the
    # parsed table.
    df = df.copy()
    df.insert(0, 'code', curr_id)
    df = df.set_index('code')
    df['date'] = int(end_date.strftime("%Y%m%d"))
    return df