def test_everydayCount(self):
    """
    Print per-day row counts of HSGTCGHold and compare them with HSGTCG.

    Three passes are made, all of them only printing (no assertions):

    1. every distinct ``tradedate`` stored in HSGTCGHold with its row count;
    2. walking back from the nearest trade date while it is later than
       2018-5-1, the HSGTCGHold count next to the count of HSGTCG rows
       with ``hamount >= 8000`` for that day, and their difference;
    3. the same walk done through DataFrames of codes, additionally
       printing the codes reported by ``HSGTCGHold.dfNotInAnotherdf`` when
       the counts differ and the day is later than 2018-6-1.
    """
    one_day = datetime.timedelta(1)

    def walk_back(stop_text):
        """Yield trade dates, newest first, while later than ``stop_text``."""
        current = HSGTCGHold.getNearestTradedate()
        while current > convertToDate(stop_text):
            yield current
            current = HSGTCGHold.getNearestTradedate(current - one_day)

    Stocktradedate.importList()

    # Pass 1: row count for every distinct trade date in HSGTCGHold.
    for (day,) in set(HSGTCGHold.getlist().values_list('tradedate')):
        print(day, HSGTCGHold.getlist(day).count())

    from stocks.models import HSGTCG

    # Pass 2: compare counts computed by the database.
    for day in walk_back('2018-5-1'):
        held = HSGTCGHold.getlist(day).count()
        qualified = HSGTCG.getlist().filter(
            tradedate=day, hamount__gte=8000).count()
        print(day, held, qualified, held - qualified)

    # Pass 3: compare counts via DataFrames so mismatching codes can be listed.
    for day in walk_back('2018-5-1'):
        held_codes = pd.DataFrame(
            list(HSGTCGHold.getlist(day).values('code')))
        qualified_codes = pd.DataFrame(
            list(HSGTCG.getlist().filter(
                tradedate=day, hamount__gte=8000).values('code')))
        held = len(held_codes)
        qualified = len(qualified_codes)
        print(day, held, qualified, held - qualified)
        if held - qualified != 0 and day > convertToDate('2018-6-1'):
            print('数据不一致:', end='')
            mismatch = HSGTCGHold.dfNotInAnotherdf(held_codes, qualified_codes)
            print(list(mismatch['code']))
def get_real_datelisting(cls, start, end):
    """
    Return the stored trade-date records lying inside [start, end].

    :param start: lower bound (inclusive); converted with ``convertToDate``
    :param end: upper bound (inclusive); converted with ``convertToDate``
    :return: the result of filtering ``trade_date_sse`` of a fresh ``cls``
        instance on ``tradedate`` between the two bounds
        (NOTE(review): the earlier docstring described this as a list of
        ``datetime.date`` -- confirm what ``trade_date_sse.filter`` yields)
    """
    calendar = cls().trade_date_sse
    first_day = convertToDate(start)
    last_day = convertToDate(end)
    return calendar.filter(tradedate__gte=first_day, tradedate__lte=last_day)
def daterange(start_date, end_date):
    """
    Iterate through a range of dates, one day at a time.

    Example::

        start_date = date(2013, 1, 1)
        end_date = date(2015, 6, 2)
        for single_date in daterange(start_date, end_date):
            print(single_date.strftime("%Y-%m-%d"))

    Both bounds are normalised with ``convertToDate`` before use, so any
    input that helper accepts may be passed. An empty sequence is produced
    when ``end_date`` lies before ``start_date``.

    :param start_date: first date of the range
    :param end_date: last date of the range (included in the output)
    :return: generator over the dates from start to end, end included
    """
    from datetime import timedelta

    first = convertToDate(start_date)
    last = convertToDate(end_date)
    # The day count must be taken from the timedelta *before* adding one:
    # adding an int to a timedelta raises TypeError.
    for offset in range((last - first).days + 1):
        yield first + timedelta(offset)
def scrapjson(url):
    """
    Download one page of the JavaScript-wrapped JSON feed and tabulate it.

    The response is expected to look like ``var <name>={pages:..,data:[..]}``.
    The ``var <name>=`` prefix is dropped, the bare ``pages`` / ``data`` keys
    are quoted so the remainder parses as JSON, and the ``data`` records are
    loaded into a DataFrame. Up to three attempts are made; every failed
    attempt prints the error and waits one second.

    Derived columns:

    * ``code``      -- ``SCODE`` as ``str``
    * ``hamount``   -- ``SHAREHOLDPRICE / 10000`` rounded to 2 decimals
    * ``tradedate`` -- ``convertToDate`` of the first 10 characters of ``HDDATE``

    :param url: address of the feed page
    :return: DataFrame with columns ``code``, ``tradedate``, ``hamount``;
        empty (but with those columns) when no attempt produced any rows
    """
    import json

    import requests

    columns = ['code', 'tradedate', 'hamount']
    # Returned unchanged when every attempt fails, so callers always get a
    # frame with the expected columns instead of an UnboundLocalError/KeyError.
    df = pd.DataFrame(columns=columns)

    # At most three fetch attempts.
    for _ in range(3):
        try:
            text = requests.get(url, timeout=40).content.decode()
            # Drop everything up to the first '=' ("var <name>=") rather than
            # a fixed number of characters, so any variable name works.
            payload = text.partition('=')[2]
            # NOTE(review): this quoting also rewrites any *value* containing
            # the substrings 'pages' or 'data' -- confirm the feed never does.
            parsed = json.loads(
                payload.replace('pages', '"pages"').replace('data', ' "data"'))
            page = pd.DataFrame(parsed['data'])
            page['code'] = page.SCODE.astype(str)
            page['hamount'] = page.SHAREHOLDPRICE.apply(
                lambda x: round(x / 10000, 2)).astype(float)
            # ``datetime.date`` is not a dtype pandas understands; date values
            # are held in an object column.
            page['tradedate'] = page['HDDATE'].apply(
                lambda x: convertToDate(str(x)[:10])).astype(object)
            if len(page) > 0:
                df = page
                break
        except Exception as e:
            print('requests.get(url, timeout=40)\n{}'.format(e.args))
            time.sleep(1)
    return df[columns]
def getStockHdStatistics(cls, code, browser, retryCount=3):
    """
    Scrape the holding statistics page of one stock.

    The page is fetched through ``cls.scrap`` up to ``retryCount`` times
    until a non-empty table comes back. The first non-empty table is
    cleaned up and returned:

    * ``hvol`` and ``hamount`` are converted with ``HSGTCG.hz2Num`` to float;
    * ``close`` is converted to float;
    * ``tradedate`` is converted with ``convertToDate``;
    * rows whose ``tradedate`` is rejected by ``Stocktradedate.if_tradeday``
      are dropped (the web page is known to list non-trading days);
    * the index is renumbered from zero.

    :param code: stock code
    :param browser: webdriver browser handed to ``cls.scrap``
    :param retryCount: maximum number of scrape attempts
    :return: the cleaned DataFrame; the last (empty) scrape result when no
        attempt returned rows; an empty DataFrame when ``retryCount <= 0``
    """
    url = 'http://data.eastmoney.com/hsgtcg/StockHdStatistics.aspx?stock={}'.format(
        code)
    # Defined up front so a non-positive retryCount cannot leave it unbound.
    df = pd.DataFrame()
    for _ in range(retryCount):
        df = cls.scrap(url, browser)
        if len(df) == 0:
            continue
        # Repair the holding volume / amount columns.
        df['hvol'] = df['hvol'].apply(HSGTCG.hz2Num).astype(float)
        df['hamount'] = df['hamount'].apply(HSGTCG.hz2Num).astype(float)
        df['close'] = df['close'].astype(float)
        # ``datetime.date`` is not a dtype pandas understands; date values
        # are held in an object column.
        df['tradedate'] = df['tradedate'].apply(convertToDate).astype(object)
        # Drop rows that are not trading days (a bug of the web page).
        df = df[df['tradedate'].apply(Stocktradedate.if_tradeday)]
        df.index = pd.RangeIndex(len(df.index))
        break
    return df
def test_newcomingin(self):
    """
    Check the codes that newly appear in HSGTCGHold from one trade day to the next.

    A code is "new" on a day when it is in ``HSGTCGHold`` for that day but
    not for the preceding trade day. For every such code its HSGTCG rows
    from the preceding day onwards are loaded newest-first and the two
    oldest ``hamount`` values are compared against the 8000 threshold; any
    code not matching ``second-oldest >= 8000 and oldest < 8000`` is printed.

    The check runs once for the nearest trade date and then for every
    trade date walking back while the date is later than 2018-5-2.
    Nothing is asserted; mismatches are only printed.
    """
    Stocktradedate.importList()
    from stocks.models import HSGTCG

    # Reference data: on 2018-06-04 the newly added codes were
    # 603658, 600460, 002812, 002557, 600188, 000690, 600329
    # (use HSGTCGHold.getNearestTradedate('2018-6-4') to reproduce).
    one_day = datetime.timedelta(1)

    def new_codes(day, previous_day):
        """Codes held on ``day`` that were not held on ``previous_day``."""
        today = HSGTCGHold.getlist(day)
        before = HSGTCGHold.getlist(previous_day)
        return [
            row.code
            for row in today.exclude(code__in=before.values_list('code'))
        ]

    def history_since(code, previous_day):
        """HSGTCG rows of ``code`` from ``previous_day`` on, newest first."""
        rows = HSGTCG.getlist(code).filter(
            tradedate__gte=previous_day).values().order_by('-tradedate')
        return pd.DataFrame(list(rows))

    def crossed_threshold(df):
        """Return (ok, second_oldest, oldest) for the two oldest hamount values."""
        data1 = float(df.iloc[-2].hamount)
        data2 = float(df.iloc[-1].hamount)
        return (data1 >= 8000 and data2 < 8000), data1, data2

    # Pass 1: nearest trade date only.
    tdate = HSGTCGHold.getNearestTradedate()
    tdate1 = HSGTCGHold.getNearestTradedate(tdate - one_day)
    for code in new_codes(tdate, tdate1):
        df = history_since(code, tdate1)
        if len(df) < 2:
            # Fewer than two rows: nothing to compare (avoids IndexError).
            continue
        ok, data1, data2 = crossed_threshold(df)
        if not ok:
            print('不是新增持股金额大于八千万:{} 持股金额:{} {}'.format(code, data1, data2))

    # Pass 2: every trade date back to (but excluding) 2018-5-2.
    tdate = HSGTCGHold.getNearestTradedate()
    while tdate > convertToDate('2018-5-2'):
        tdate1 = HSGTCGHold.getNearestTradedate(tdate - one_day)
        # Does not depend on the code, so it is queried once per date.
        has_previous_rows = HSGTCG.getlist().filter(
            tradedate=tdate1).count() > 0
        for code in new_codes(tdate, tdate1):
            df = history_since(code, tdate1)
            if len(df) > 1 and has_previous_rows:
                ok, data1, data2 = crossed_threshold(df)
                if not ok:
                    print('{} 不是新增持股金额大于八千万:{} 持股金额:{} {}'.format(
                        tdate, code, data1, data2))
        tdate = HSGTCGHold.getNearestTradedate(tdate - one_day)
def test_scrapt(self):
    """
    Scrape the northbound holding table of a single stock and store it.

    Opens the StockHdStatistics page in Firefox, extracts the table with id
    ``tb_cgtj`` (trying several BeautifulSoup parsers), stores every row
    through ``HSGTCG.objects.get_or_create`` and finally asserts that more
    than 10 rows exist for the code, that ``tradedate`` is a
    ``datetime.date`` and that no row has a null ``tradedate``.
    """
    # Other codes previously used for this check: 600066, 000425, 000792.
    code = '002493'
    url = 'http://data.eastmoney.com/hsgtcg/StockHdStatistics.aspx?stock={}'.format(
        code)
    browser = webdriver.Firefox()
    browser.maximize_window()
    try:
        browser.get(url)
        table = None
        parser = None
        # With lxml newer than 4.1.1 the table may not be found, so fall
        # back to other parsers.
        for parser in ['lxml', 'xml', 'html5lib']:
            try:
                time.sleep(random.random() / 4)
                soup = BeautifulSoup(browser.page_source, parser)
                table = soup.find_all(id='tb_cgtj')[0]
                if table:
                    break
            except Exception:
                # Parser unavailable or table missing: try the next parser.
                time.sleep(0.1)
        print('using BeautifulSoup {}'.format(parser))
        # Fail with a clear message instead of a NameError further down.
        self.assertIsNotNone(table, 'table tb_cgtj not found with any parser')

        df = pd.read_html(str(table), header=1)[0]
        df.columns = [
            'date', 'related', 'close', 'zd', 'hvol', 'hamount', 'hpercent',
            'oneday', 'fiveday', 'tenday'
        ]
        for i in df.index:
            v = df.iloc[i]
            print('{} {} {} {}'.format(v.close, v.hvol, v.hamount, v.hpercent))
            HSGTCG.objects.get_or_create(code=code,
                                         close=v.close,
                                         hvol=str2Float(v.hvol),
                                         hamount=str2Float(v.hamount),
                                         hpercent=v.hpercent,
                                         tradedate=convertToDate(v.date))
    finally:
        if browser:
            browser.close()

    hsgtcg = HSGTCG.getlist(code)
    print(hsgtcg)
    self.assertGreater(hsgtcg.count(), 10, '保存的数量: {}'.format(hsgtcg.count()))
    self.assertIsInstance(hsgtcg[0].tradedate, datetime.date)
    self.assertEqual(hsgtcg.filter(tradedate=None).count(), 0)
def importjsonList(cls, enddate=None, days=5):
    """
    Import the holdings whose amount reaches ``MINHAMOUNT`` from the JSON feed.

    Source page: http://data.eastmoney.com/hsgtcg/StockStatistics.aspx
    The JSON feed is downloaded and converted directly, which is more
    efficient than ``importList``.

    If rows already exist for the end date they are returned immediately.
    Otherwise the feed is read page by page (300 rows per page, sorted by
    holding amount in descending order, at most 99 pages) for the window
    ``[end - days, end]``. From each page the rows with
    ``hamount >= MINHAMOUNT`` that fall on a trading day are de-duplicated
    and saved through ``cls.savedf``. Paging stops at the first page that
    is empty or that contains a row below ``MINHAMOUNT``.

    :param enddate: last date of the window; defaults to
        ``cls.getNearestTradedate()``
    :param days: length of the window in days, counted back from the end date
    :return: ``cls.getlist`` for the end date when data already existed,
        otherwise ``cls.getlist`` for ``cls.getNearestTradedate()``
        (NOTE(review): the second form ignores ``enddate`` -- confirm this
        is intended)
    """
    end = convertToDate(enddate) if enddate else cls.getNearestTradedate()

    hsgh = cls.getlist(tradedate=end)
    if hsgh.count() > 0:
        return hsgh

    pagesize = 300  # rows per page
    sortRule = -1  # -1: descending by amount; 1: ascending by amount
    start = end - datetime.timedelta(days)
    jsname = cls.getRandomStr('letter')

    # st=SHAREHOLDPRICE sorts by holding amount. The template contains
    # literal braces, hence str.replace instead of str.format. Everything
    # except the page number is filled in once, outside the loop.
    url_template = (
        'http://dcfm.eastmoney.com//em_mutisvcexpandinterface/api/js/'
        'get?type=HSGTHDSTA&token=70f12f2f4f091e459a279469fe49eca5&st=SHAREHOLDPRICE&sr={sortRule}'
        '&p={page}&ps={pagesize}&js=var%20{jsname}={pages:(tp),data:(x)}&filter=(MARKET%20in%20(%27001%27,%27003%27))'
        '(HDDATE%3E=^{start}^%20and%20HDDATE%3C=^{end}^)&rt=50950960'
    ).replace('{start}', str(start)).replace('{end}', str(end)) \
        .replace('{sortRule}', str(sortRule)) \
        .replace('{pagesize}', str(pagesize)) \
        .replace('{jsname}', str(jsname))

    for page in range(1, 100):
        df = cls.scrapjson(url_template.replace('{page}', str(page)))
        if len(df) == 0:
            # Nothing came back for this page: no point requesting more.
            break
        dfn = df[df['hamount'] >= MINHAMOUNT]
        if len(dfn) > 0:
            dfn = dfn[dfn['tradedate'].apply(
                lambda x: Stocktradedate.if_tradeday(x))]
            # Remove duplicated rows.
            dfn = dfn[~dfn.duplicated()]
            cls.savedf(dfn[['code', 'tradedate']])
        print('page: {}'.format(page))
        if len(df[df['hamount'] < MINHAMOUNT]):
            # Rows are sorted in descending order, so once an amount below
            # the minimum shows up the remaining pages are not needed.
            break
    return cls.getlist(tradedate=cls.getNearestTradedate())
def getlist(cls, tradedate=None):
    """
    Return the stored records, optionally restricted to one trade date.

    :param tradedate: trade date; any falsy value means "no restriction"
    :return: ``cls.objects.all()`` when ``tradedate`` is falsy, otherwise
        that result filtered by ``tradedate=convertToDate(tradedate)``
    """
    records = cls.objects.all()
    if not tradedate:
        return records
    return records.filter(tradedate=convertToDate(tradedate))
def importList(cls, firefoxHeadless=True):
    """
    Scrape the per-stock holding statistics page for every code in the
    most recent HSGTCGHold list and store the rows as HSGTCG records.

    Steps:

    1. Look back day by day (yesterday, the day before, ... up to ten
       days) for the first date with HSGTCGHold rows.
    2. If none is found, call ``HSGTCGHold.importList()`` and use
       yesterday's list.
    3. For each code of that list, scrape the StockHdStatistics page via
       ``cls.scrap``, convert ``hvol`` / ``hamount`` / ``close`` to float and
       save every row with ``HSGTCG.objects.get_or_create``.

    Errors while saving a row are printed and otherwise ignored. The
    browser is closed when the method finishes, also on error.

    :param firefoxHeadless: passed to ``cls.getBrowser``
    :return: None
    """
    i, j = 0, 0
    while i < 10 and j == 0:
        # Loop at most ten times; if j is still 0 when the loop ends,
        # there is no data.
        hsgh = HSGTCGHold.getlist(
            tradedate=datetime.datetime.now().date() -
            datetime.timedelta(i + 1))
        i += 1
        j = hsgh.count()
    if j == 0:
        HSGTCGHold.importList()
        hsgh = HSGTCGHold.getlist(
            tradedate=datetime.datetime.now().date() - datetime.timedelta(1))
    browser = cls.getBrowser(firefoxHeadless)
    try:
        # Each ``code`` is a tuple as produced by values_list; the code
        # string itself is ``code[0]``.
        for code in list(hsgh.values_list('code')):
            # NOTE(review): ``code`` (the tuple, not ``code[0]``) is passed to
            # the filter, and ``hsgh`` is the very list being iterated.
            # Presumably this was meant to skip codes already stored --
            # confirm the intended lookup.
            hsghc = hsgh.filter(code=code)
            if hsghc.count() > 0:
                continue
            url = 'http://data.eastmoney.com/hsgtcg/StockHdStatistics.aspx?stock={}'.format(
                code[0])
            df = cls.scrap(url, browser)
            # Repair the holding volume / amount columns.
            df['hvol'] = df['hvol'].apply(
                lambda x: HSGTCGHold.hz2Num(x)).astype(float)
            df['hamount'] = df['hamount'].apply(
                lambda x: HSGTCGHold.hz2Num(x)).astype(float)
            df['close'] = df['close'].astype(float)
            with transaction.atomic():
                # NOTE(review): ``i`` here reuses the name of the day counter
                # above; harmless because that loop has finished.
                for i in df.index:
                    v = df.iloc[i]
                    try:
                        print('saving ... {} {}'.format(code[0], v.close))
                        HSGTCG.objects.get_or_create(
                            code=code[0],
                            close=v.close,
                            hvol=v.hvol,
                            hamount=v.hamount,
                            hpercent=v.hpercent,
                            tradedate=convertToDate(v.date))
                    except Exception as e:
                        # NOTE(review): the exception is caught inside the
                        # atomic block -- confirm this is safe for database
                        # errors with the Django version in use.
                        # print(code[0], v, type(v.close), type(v.hpercent))
                        print(code[0], e.args)
                        # raise Exception(e.args)
    finally:
        if browser:
            browser.close()
def importStockListing(cls, start=None):
    """
    Insert the RPS preparation data (120/250-row price ratios) for all stocks.

    For every entry of ``Listing.getlist('stock')`` the forward-adjusted
    daily data is fetched locally through QUANTAXIS, starting 250 trade
    days before ``start``. When more than 120 rows are available,

    * ``rps120`` = close / close shifted by 120 rows, rounded to 3 decimals
    * ``rps250`` = close / close shifted by 250 rows, rounded to 3 decimals

    are computed, the leading rows are cut off, and the result is split by
    ``cls.dfNotInModel`` into rows to insert (saved with ``cls.savedf``) and
    rows that were stored before (collected and passed to
    ``cls.updateSaved`` after the loop).

    Any exception while handling one stock is printed and the stock's code
    is recorded in the ``delisted`` report; an exception outside the
    per-stock handling is printed and ends the import.

    :param start: first date to compute; when ``None`` it is 5 trade days
        before the latest stored ``tradedate``, or '2014-1-1' when nothing
        is stored yet
    :return: ``cls.getlist('stock')``
    """
    if start is None:
        # Latest date already computed in the database.
        latest = cls.getlist('stock').aggregate(
            Max('tradedate'))['tradedate__max']
        if latest:
            start = cls.getNearestTradedate(latest, -5)
        else:
            start = '2014-1-1'
    codelist = Listing.getlist('stock')
    # TODO: if already inserted, check whether the stored data needs updating.
    try:
        delisted = []  # codes for which processing raised (e.g. no QUANTAXIS data)
        qssaved = []  # frames whose dates fall inside the stored range
        realStart120 = cls.getNearestTradedate(start, -120)
        realStart = cls.getNearestTradedate(start, -250)
        for v in codelist.values():
            print('Dealing {} {} {}'.format(format(v['id'], '05d'), v['code'],
                                            v['name']))
            try:
                # Resolve the Listing row of this stock code.
                code = Listing.objects.get(code=v['code'], category=10)
                # Fetch the daily data locally (forward-adjusted).
                data = qa.QA_fetch_stock_day_adv(
                    v['code'], realStart,
                    datetime.datetime.now().strftime("%Y-%m-%d")).to_qfq()
                if len(data) <= 120:
                    continue
                df = pd.DataFrame(data.close)
                df['rps120'] = (df.close / df.close.shift(120)).round(3)
                df['rps250'] = (df.close / df.close.shift(250)).round(3)
                del df['close']
                # Stocks that came to market after realStart120 drop the
                # first 120 rows, all others the first 250.
                if code.timeToMarket > realStart120:
                    cutDay = 120
                else:
                    cutDay = 250
                df = df[cutDay:]
                df.reset_index(inplace=True)
                df.columns = ['tradedate', 'code', 'rps120', 'rps250']
                del df['code']
                # ``datetime.date`` is not a dtype pandas understands; date
                # values are held in an object column.
                df['tradedate'] = df['tradedate'].apply(
                    lambda x: convertToDate(str(x)[:10])).astype(object)
                df['code_id'] = code.id
                df, dfsaved = cls.dfNotInModel(df, code.id,
                                               df['tradedate'].min())
                if len(df) > 0:
                    cls.savedf(df)
                if len(dfsaved) > 0:
                    # Rows whose dates lie inside the previously stored range.
                    qssaved.append(dfsaved)
            except Exception as e:
                delisted.append(v['code'])
                print(len(delisted), e.args)
        cls.updateSaved(qssaved)
        print('保存过的数据更新数量 {} \n {}'.format(len(qssaved), qssaved))
        print('delisted count {} :\n {}'.format(len(delisted), delisted))
    except Exception as e:
        print(e.args)
    return cls.getlist('stock')