def get_dalian_ranks(year=2018, month=5, day=11): minus_month = int(month) - 1 values = { 'memberDealPosiQuotes.variety': 'b', 'memberDealPosiQuotes.trade_type': '0', 'year': str(year), 'month': str(minus_month), 'day': str(day), 'contract.contract_id': 'all', 'contract.variety_id': 'b', } headers = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate', 'Accept-Language': 'zh-CN,zh;q=0.9', 'Connection': 'keep-alive', 'Cache-Control': 'max-age=0', 'Origin': 'http://www.dce.com.cn', 'Upgrade-Insecure-Requests': '1', 'Content-Type': 'application/x-www-form-urlencoded', 'Accept-Encoding': 'gzip, deflate', 'Accept-Language': 'zh-CN,zh;q=0.9', 'Host': 'www.dce.com.cn', 'Referer': 'http://www.dce.com.cn/publicweb/quotesdata/memberDealPosiQuotes.html', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36', } url = "http://www.dce.com.cn/publicweb/quotesdata/memberDealPosiQuotes.html" base_url = "http://www.dce.com.cn/publicweb/quotesdata/memberDealPosiQuotes.html" session = requests.Session() session.headers.update(headers) base_html = session.get(base_url) print(base_html.text) print(session.headers) session.headers.update(headers) print(session.headers) variety_list = [ 'a', 'b', 'm', 'y', 'p', 'c', 'cs', 'jd', 'fb', 'bb', 'l', 'v', 'pp', 'j', 'jm', 'i' ] cn_variety_names = [ '豆一', '豆二', '豆粕', '豆油', '棕榈油', '玉米', '玉米淀粉', '鸡蛋', '纤维板', '胶合板', '聚乙烯', '聚氯乙烯', '聚丙烯', '焦炭', '焦煤', '铁矿石', ] # s = session.post(url, values) # print(s.text) # tree = html.fromstring(s.text) # table = tree.xpath('//*[@id="printData"]/div/table[2]')[0] # print(table) # data = [ # [td.text_content().strip() for td in row.findall('td')] # for row in table.findall('tr') # ] # print(data) # df = pd.DataFrame(data, ) df = None for a_var, cn_name in zip(variety_list, cn_variety_names): values['contract.variety_id'] = a_var s = session.post( url, values, ) print(s.text) tree = html.fromstring(s.text) table = tree.xpath('//*[@id="printData"]/div/table[2]')[0] print(table) data = [[td.text_content().strip() for td in row.findall('td')] for row in table.findall('tr')] print(data) col_names = [ 'rank', 'PARTICIPANTABBR1', 'CJ1', 'CJ1_CHG', 'rank2', 'PARTICIPANTABBR2', 'CJ2', 'CJ2_CHG', 'rank3', 'PARTICIPANTABBR3', 'CJ3', 'CJ3_CHG', ] t_df = pd.DataFrame(data, columns=col_names) t_df.dropna(axis='index', inplace=True) # t_df.columns = col_names print(t_df) t_df.to_csv('t_df_dalian_{instrument}.csv'.format(instrument=a_var), index=False) # df[0].str.contains(a_header) t_df.drop(t_df[t_df['rank'].str.contains('总计')].index, inplace=True, axis='index') t_df.drop(['rank2', 'rank3'], axis='columns', inplace=True) print(t_df) t_df['instrumentid'] = a_var t_df['productname'] = cn_name import datetime today = datetime.datetime.now() t_df['date'] = today if df is None: df = t_df else: df = df.append(t_df) print(df) df.to_csv('t_df_dalian_sum.csv', index=False) from db_insert2 import set_ranks_df set_ranks_df(df, year=year, month=month, day=day, exchange='DCE') return
def czce_scrape(year=2018, month=6, day=11): from lxml import html url = 'http://www.czce.com.cn/portal/DFSStaticFiles/Future/' \ '{year:>04}/{year:>04}{month:>02}{day:>02}/FutureDataHolding.htm'.format(year=year, month=month, day=day) parser = html.HTMLParser(encoding='gbk') # root = html.document_fromstring(content, parser=parser) try: tree = html.parse(url, parser=parser) except Exception as e: print(e) return # tree.docinfo table = tree.findall('//table')[1] data = [[td.text_content().strip() for td in row.findall('td')] for row in table.findall('tr')] df = pd.DataFrame(data, ) df[0] = df[0].str.replace('\xa0', '') print(df) form_header = ['品种', '合约', '合计'] temp_index = [] start_index = [] import collections header_index_dict = collections.OrderedDict() for a_header in form_header: a_index_list = df.index[df[0].str.contains(a_header)].tolist() if not a_index_list: continue header_index_dict.update({a_header: a_index_list}) if '品种' in header_index_dict: contracts = header_index_dict['品种'] if '合计' not in header_index_dict: print("Error") end_index = header_index_dict['合计'] for beg, end in zip(contracts, end_index): t_df = df[beg:end] t_df = t_df.applymap(lambda x: x.replace(',', '') if x else x) t_df.reset_index(inplace=True, drop=True) print(t_df) h_str = t_df.iat[0, 0] # '品种:苹果AP 日期:2018-05-23' import re # m = re.match( # r"品种:(?P<productname>[\u4e00-\u9fa5]+)(?P<instrumentid>[a-zA-Z]+)\W*日期:(?P<date>[\d-]+)", h_str) m = re.match( r"品种:(?P<productname>[\u4e00-\u9fa5]+)?(?P<instrumentid>[a-zA-Z]+)\W*日期:(?P<date>[\d-]+)", h_str) t_instrumentid = m.group('instrumentid') t_productname = m.group('productname') if not t_productname: t_productname = t_instrumentid t_date = m.group('date') # productname 铜 instrumentid cu1804 t_df = t_df.drop([0, 1]) t_df['instrumentid'] = t_instrumentid t_df['productname'] = t_productname t_df['date'] = t_date print(t_df) col_names = [ 'RANK', 'PARTICIPANTABBR1', 'CJ1', 'CJ1_CHG', 'PARTICIPANTABBR2', 'CJ2', 'CJ2_CHG', 'PARTICIPANTABBR3', 'CJ3', 'CJ3_CHG', 'INSTRUMENTID', 'PRODUCTNAME', 'DATE' ] t_df.columns = col_names t_df['VARIETY'] = True print(t_df) from db_insert2 import set_ranks_df set_ranks_df(t_df, year=year, month=month, day=day, exchange='CZCE') if '合约' in header_index_dict: instruments_index = header_index_dict['合约'] len_instruments_index = len(instruments_index) if '合计' not in header_index_dict: print("Error") end_index = header_index_dict['合计'] end_index = end_index[-len_instruments_index:] for beg, end in zip(instruments_index, end_index): t_df = df[beg:end] t_df = t_df.applymap(lambda x: x.replace(',', '') if x else x) t_df.reset_index(inplace=True, drop=True) print(t_df) h_str = t_df.iat[0, 0] import re # m = re.match( # r"品种:(?P<productname>[\u4e00-\u9fa5]+)(?P<instrumentid>[a-zA-Z]+)\W*日期:(?P<date>[\d-]+)", h_str) m = re.match( r"合约:(?P<productname>[\u4e00-\u9fa5]+)?(?P<instrumentid>[a-zA-Z\d]+)\W*日期:(?P<date>[\d-]+)", h_str) t_instrumentid = m.group('instrumentid') t_productname = m.group('productname') if not t_productname: t_productname = "EMPTY" t_date = m.group('date') # productname 铜 instrumentid cu1804 t_df = t_df.drop([0, 1]) t_df['INSTRUMENTID'] = t_instrumentid t_df['PRODUCTNAME'] = t_productname print(t_df) col_names = [ 'RANK', 'PARTICIPANTABBR1', 'CJ1', 'CJ1_CHG', 'PARTICIPANTABBR2', 'CJ2', 'CJ2_CHG', 'PARTICIPANTABBR3', 'CJ3', 'CJ3_CHG', 'INSTRUMENTID', 'PRODUCTNAME', ] t_df.columns = col_names t_df['VARIETY'] = False print(t_df) from db_insert2 import set_ranks_df set_ranks_df(t_df, year=year, month=month, day=day, exchange='CZCE') return
def cffex_rank_by_contract(year=2018, month=5, day=1, contract='IF'): headers = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate', 'Accept-Language': 'zh-CN,zh;q=0.9', # 'Connection': 'keep-alive', # 'Host': 'www.shfe.com.cn', 'Referer': 'http://www.cffex.com.cn/ccpm/', 'X-Requested-With': 'XMLHttpRequest', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36', } # params = {'firstname': 'Ryan', 'lastname': 'Mitchell'} # url = 'http://www.shfe.com.cn/data/instrument/ContractBaseInfo20180321.dat' # url = "http://www.shfe.com.cn/data/dailydata/kx/kx20180319.dat" url = "http://www.cffex.com.cn/sj/ccpm/201804/09/IF.xml" url = "http://www.cffex.com.cn/sj/ccpm/{year:>04}{month:>02}/{day:>02}/{contract}.xml"\ .format(year=year, month=month, day=day, contract=contract) # http://www.shfe.com.cn/data/dailydata/kx/pm20180327.dat # http://www.shfe.com.cn/data/instrument/ContractDailyTradeArgument20180321.dat request = requests.get(url, headers=headers) if "gray_404" in request.text: return print(request.encoding) # request.encoding = 'gbk' print(request.text) print(request.encoding) # 网页错误 <td class="gray_404">您要查看的网页可能已被删除、名称已被更改,或者暂时不可用。</td> # response.encoding = 'gbk' # print # response.text try: root = ET.fromstring(request.text) except xml.etree.ElementTree.ParseError as e: print(e) return for child in root: print("_" * 30) print(child.tag, child.attrib) for grandchild in child: print(grandchild.tag, grandchild.attrib) def iter_docs(root): author_attr = root.attrib for doc in root.iter('data'): doc_dict = author_attr.copy() doc_dict.update(doc.attrib) doc_dict['data'] = doc.text yield doc_dict def iter_rows(root): for data in root.findall('data'): doc_dict = {} for attr in data: # doc_dict = {} val = data.find(attr.tag).text doc_dict[attr.tag] = val # print(attr.tag, val) print(doc_dict) yield doc_dict # pass t_li = list(iter_rows(root)) print(t_li) doc_df = pd.DataFrame(t_li) # doc_df = doc_df.applymap(lambda x: x.replace(',', '') if x else x) print(doc_df) doc_df.to_csv("zhongjin_xml1.csv", encoding="gbk") df = doc_df.copy() df['rank'] = df['rank'].astype(int, copy=False) grouped = df.groupby(['instrumentid']) whole_df = None for gr in grouped.groups: print(gr) a_group_df = grouped.get_group(gr) grand_groups = a_group_df.groupby(['datatypeid']) # df.sort_values(by=['col1']) # print(a_group_df.sort_values(by=['rank'])) print("-" * 60) temp_merge_df = None for grand_group in ['0', '1', '2']: print(grand_group) grand_group_df = grand_groups.get_group(grand_group) # print(grand_group_df.sort_values(by=['rank'])) grand_group_df = grand_group_df.sort_values(by=['rank']) if temp_merge_df is None: temp_merge_df = grand_group_df else: selected_cols = [ 'rank', 'shortname', 'volume', 'varvolume', 'partyid' ] t_y_df = grand_group_df[selected_cols] name_mapper = { col: col + '_' + grand_group if col != 'rank' else col for col in selected_cols } t_y_df.rename(columns=lambda x: x.strip(), inplace=True) t_y_df.rename(columns=name_mapper, inplace=True) result = pd.merge(temp_merge_df, t_y_df, on='rank') temp_merge_df = result print(temp_merge_df) # temp_merge_df.to_csv("pandas_merge_0412.csv", encoding='gbk') if whole_df is None: whole_df = temp_merge_df else: result = pd.concat([whole_df, temp_merge_df], axis=0) whole_df = result # print(whole_df) whole_df.to_csv("whole_pandas_merge_0412_v2.csv", encoding='gbk') # break # break # whole_df['tradingday'] = pd.to_datetime(whole_df['tradingday'], format='%Y%m%d', errors='ignore') # print(whole_df['tradingday']) from db_insert2 import insert_db try: # Index(['rank', 'shortname_0', 'volume_0', 'varvolume_0', 'partyid_0', # 'shortname_1', 'volume_1', 'varvolume_1', 'partyid_1', 'shortname_2', # 'volume_2', 'varvolume_2', 'partyid_2'], # dtype='object') whole_df = whole_df[[ 'instrumentid', 'productid', 'rank', 'shortname', 'tradingday', 'varvolume', 'volume', 'shortname_1', 'volume_1', 'varvolume_1', 'shortname_2', 'volume_2', 'varvolume_2', 'partyid', 'partyid_1', 'partyid_2' ]] except Exception as e: print(e) col_mapper = { 'shortname_2': 'PARTICIPANTABBR3', 'shortname': 'PARTICIPANTABBR1', 'volume': 'CJ1', 'varvolume_2': 'CJ3_CHG', 'instrumentid': 'INSTRUMENTID', 'volume_2': 'CJ3', 'volume_1': 'CJ2', 'varvolume': 'CJ1_CHG', 'rank': 'RANK', 'shortname_1': 'PARTICIPANTABBR2', 'productid': 'PRODUCTNAME', 'varvolume_1': 'CJ2_CHG', 'tradingday': 'REPORT_DATE', 'partyid': 'PARTICIPANTID1', 'partyid_1': 'PARTICIPANTID2', 'partyid_2': 'PARTICIPANTID3' } whole_df.rename(columns=col_mapper, inplace=True) # whole_df['EXCHANGE'] = "CFFEX" whole_df['VARIETY'] = False # t_y_df.rename(columns=name_mapper, inplace=True) print(whole_df) set_ranks_df(whole_df, year=year, month=month, day=day, exchange='CFFEX') # insert_db(whole_df, tablename='ranks', con='sqlite:///exchange.sqlite') return
def get_dce_ranks( year=2018, month=6, day=14, variety_list=None, ): minus_month = int(month) - 1 values = { 'memberDealPosiQuotes.variety': 'b', 'memberDealPosiQuotes.trade_type': '0', 'year': str(year), 'month': str(minus_month), 'day': str(day), 'contract.contract_id': 'all', 'contract.variety_id': 'b', } headers = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate', 'Accept-Language': 'zh-CN,zh;q=0.9', 'Connection': 'keep-alive', 'Cache-Control': 'max-age=0', 'Origin': 'http://www.dce.com.cn', 'Upgrade-Insecure-Requests': '1', 'Content-Type': 'application/x-www-form-urlencoded', 'Accept-Encoding': 'gzip, deflate', 'Accept-Language': 'zh-CN,zh;q=0.9', 'Host': 'www.dce.com.cn', 'Referer': 'http://www.dce.com.cn/publicweb/quotesdata/memberDealPosiQuotes.html', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36', } url = "http://www.dce.com.cn/publicweb/quotesdata/memberDealPosiQuotes.html" base_url = "http://www.dce.com.cn/publicweb/quotesdata/memberDealPosiQuotes.html" session = requests.Session() session.headers.update(headers) base_html = session.get(base_url) print(base_html.text) print(session.headers) session.headers.update(headers) print(session.headers) df = None ''' memberDealPosiQuotes.variety: b memberDealPosiQuotes.trade_type: 0 year: 2018 month: 5 day: 14 contract.contract_id: b1901 contract.variety_id: b contract: ''' for a_var in variety_list: for instrument in variety_list[a_var]: values['contract.variety_id'] = a_var values['memberDealPosiQuotes.variety'] = a_var values['contract.contract_id'] = instrument s = session.post( url, values, ) print(s.text) # //*[@id="memberDealPosiQuotesForm"]/div/div[1]/div[4]/div/ul[2]/li[1] tree = html.fromstring(s.text) tables = tree.xpath('//*[@id="printData"]/div/table[2]') if not tables: continue table = tables[0] print(table) data = [[td.text_content().strip() for td in row.findall('td')] for row in table.findall('tr')] print(data) if len(data) <= 1: continue col_names = [ 'RANK', 'PARTICIPANTABBR1', 'CJ1', 'CJ1_CHG', 'RANK2', 'PARTICIPANTABBR2', 'CJ2', 'CJ2_CHG', 'RANK3', 'PARTICIPANTABBR3', 'CJ3', 'CJ3_CHG', ] t_df = pd.DataFrame(data, columns=col_names) # t_df = t_df.applymap(lambda x: x.replace(',', '')) t_df.dropna(axis='index', inplace=True) t_df = t_df.applymap(lambda x: x.replace(',', '')) # t_df.columns = col_names print(t_df) # year=2018, month=5, day=11 t_df.to_csv( './csv/dce/t_df_dce_ins_{instrument}_{year}_{month}_{day}.csv'. format(instrument=instrument, year=year, month=month, day=day), index=False) # df[0].str.contains(a_header) t_df.drop(t_df[t_df['RANK'].str.contains('总计')].index, inplace=True, axis='index') t_df.drop(['RANK2', 'RANK3'], axis='columns', inplace=True) print(t_df) t_df['INSTRUMENTID'] = instrument t_df['PRODUCTNAME'] = a_var import datetime today = datetime.datetime.now() # t_df['DATE'] = today if df is None: df = t_df else: df = df.append(t_df) print(df) df['VARIETY'] = False df.to_csv('./csv/dce/t_df_dce_ins_sum_{year}_{month}_{day}.csv'.format( year=year, month=month, day=day), index=False) from db_insert2 import set_ranks_df set_ranks_df(df, year=year, month=month, day=day, exchange='DCE') return
def czce_scrape_variety_only(year=2018, month=6, day=11): headers = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate', 'Accept-Language': 'zh-CN,zh;q=0.9', 'Connection': 'keep-alive', 'Cache-Control': 'max-age=0', 'Origin': 'http://www.czce.com.cn', 'Upgrade-Insecure-Requests': '1', 'Accept-Encoding': 'gzip, deflate', 'Accept-Language': 'zh-CN,zh;q=0.9', 'Host': 'www.czce.com.cn', 'Referer': 'http://www.czce.com.cn/portal/jysj/qhjysj/ccpm/A09112003index_1.htm', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36', } url = 'http://www.czce.com.cn/portal/DFSStaticFiles/Future/' \ '{year:>04}/{year:>04}{month:>02}{day:>02}/FutureDataHolding.htm'.format( year=year, month=month, day=day) base_url = "http://www.czce.com.cn/portal/jysj/qhjysj/ccpm/A09112003index_1.htm" session = requests.Session() session.headers.update(headers) base_html = session.get(base_url) print(base_html.text) print(session.headers) session.headers.update(headers) print(session.headers) s = session.get(url, ) # print(s.text) s.encoding = 'gbk' print(s.text) from lxml import html tree = html.fromstring(s.text) table = tree.xpath('//table[@id="senfe"]')[0] data = [[td.text_content().strip() for td in row.findall('td')] for row in table.findall('tr')] df = pd.DataFrame(data, ) df[0] = df[0].str.replace('\xa0', '') print(df) form_header = ['品种', '合约', '合计'] temp_index = [] start_index = [] import collections header_index_dict = collections.OrderedDict() for a_header in form_header: a_index_list = df.index[df[0].str.contains(a_header)].tolist() if not a_index_list: continue header_index_dict.update({a_header: a_index_list}) if '品种' in header_index_dict: contracts = header_index_dict['品种'] if '合计' not in header_index_dict: print("Error") end_index = header_index_dict['合计'] for beg, end in zip(contracts, end_index): t_df = df[beg:end] t_df = t_df.applymap(lambda x: x.replace(',', '') if x else x) t_df.reset_index(inplace=True, drop=True) print(t_df) h_str = t_df.iat[0, 0] # '品种:苹果AP 日期:2018-05-23' import re # m = re.match( # r"品种:(?P<productname>[\u4e00-\u9fa5]+)(?P<instrumentid>[a-zA-Z]+)\W*日期:(?P<date>[\d-]+)", h_str) m = re.match( r"品种:(?P<productname>[\u4e00-\u9fa5]+)?(?P<instrumentid>[a-zA-Z]+)\W*日期:(?P<date>[\d-]+)", h_str) t_instrumentid = m.group('instrumentid') t_productname = m.group('productname') if not t_productname: t_productname = t_instrumentid t_date = m.group('date') # productname 铜 instrumentid cu1804 t_df = t_df.drop([0, 1]) t_df['instrumentid'] = t_instrumentid t_df['productname'] = t_productname print(t_df) col_names = [ 'RANK', 'PARTICIPANTABBR1', 'CJ1', 'CJ1_CHG', 'PARTICIPANTABBR2', 'CJ2', 'CJ2_CHG', 'PARTICIPANTABBR3', 'CJ3', 'CJ3_CHG', 'INSTRUMENTID', 'PRODUCTNAME', ] t_df.columns = col_names t_df['VARIETY'] = True print(t_df) from db_insert2 import set_ranks_df set_ranks_df(t_df, year=year, month=month, day=day, exchange='CZCE') return