def validate_data():
    """Print how many SH A-share companies have no rows in ``fzb``.

    Loads the ``sh_stock`` and ``fzb`` collections of the ``list_company``
    database, keeps the ``sh_stock`` rows whose ``_id`` does not occur in the
    ``fzb`` column ``机构ID`` and whose ``A股上市日期`` (A-share listing date)
    is not null, and prints the per-column count of the rows whose listing
    year is before 2017.

    The download of the missing reports is currently disabled, so the
    function only reports. Returns ``None``.
    """
    db = mongodb_utility.connect_db(db_name="list_company")
    sh_df = mongodb_utility.export2df(database=db, collection="sh_stock")
    fzb_df = mongodb_utility.export2df(database=db, collection="fzb")

    missing_fzb_sh = sh_df[~sh_df['_id'].isin(fzb_df['机构ID'])]
    has_a_listing = missing_fzb_sh['A股上市日期'].notnull()
    # Explicit copy: the column is overwritten below, and assigning into a
    # filtered slice triggers pandas' SettingWithCopyWarning.
    missing_sh_a = missing_fzb_sh[has_a_listing][["_id", "A股上市日期"]].copy()

    # Keep only the first four characters of the stripped listing date.
    # NOTE(review): assumes the value is a string that starts with the year
    # -- confirm against the collection.
    missing_sh_a["A股上市日期"] = missing_sh_a["A股上市日期"].apply(
        lambda x: str(x.strip())[0:4])

    # Plain string comparison on the four-character prefix.
    missing_sh_2017 = missing_sh_a[missing_sh_a["A股上市日期"] < "2017"].copy()
    print(missing_sh_2017.count())

    # Each row becomes [_id, "2016", "2016"]; the listing-date column is
    # overwritten so both trailing fields carry the same year.
    missing_sh_2017["max_year"] = "2016"
    missing_sh_2017["A股上市日期"] = "2016"
    download_list = missing_sh_2017.values.tolist()

    # NOTE: downloading the missing data is disabled. To re-enable:
    #   frDownloader.getFR(
    #       url='http://www.cninfo.com.cn/cninfo-new/data/download',
    #       stock_list=download_list)
    return
def update_sh_b_data(update_year="2016"):
    """Download the reports of one year for the SH B stocks.

    B stocks are taken to be the rows of the ``sh_stock`` collection whose
    ``A股上市日期`` (A-share listing date) is null. For each of them a
    ``[_id, min_year, max_year]`` row is built and the whole list is handed
    to ``frDownloader.getFR``.

    Args:
        update_year: Value used for both ``min_year`` and ``max_year`` of
            every row; defaults to ``"2016"``.
    """
    url = 'http://www.cninfo.com.cn/cninfo-new/data/download'

    # Get the stock list.
    db = mongodb_utility.connect_db(db_name="list_company")
    sh_df = mongodb_utility.export2df(database=db, collection="sh_stock")

    # Download SH B stock data: rows without an A-share listing date.
    # Explicit copy because columns are assigned on the filtered frame below
    # (avoids pandas' SettingWithCopyWarning).
    sh_df_b = sh_df[sh_df['A股上市日期'].isnull()][["_id"]].copy()
    # Normalise the id to an integer-formatted string (e.g. 900901.0 -> "900901").
    sh_df_b["_id"] = sh_df_b["_id"].apply(lambda x: str(int(x)))
    sh_df_b["min_year"] = update_year
    sh_df_b["max_year"] = update_year

    shb_list = sh_df_b.values.tolist()
    frDownloader.getFR(url=url, stock_list=shb_list)
def validate_reports_download(table1='fzb', table2='lrb', auto_download=False):
    """List reports that exist in ``table1`` but not in ``table2``.

    Both collections are loaded from the ``list_company`` database and
    compared on ``_id``. The rows of ``table1`` without a counterpart are
    reduced to one entry per (``机构ID``, report year) pair. The number of
    entries is printed.

    Args:
        table1: Collection that serves as the reference.
        table2: Collection checked for missing reports.
        auto_download: When true, the missing entries are passed to
            ``frDownloader.getFR`` with ``types=[table2]``.

    Returns:
        A list of ``[机构ID, 报告年度, max_year]`` rows, where the last two
        fields are the same four-character year string.
    """
    db = mongodb_utility.connect_db(db_name="list_company")
    df1 = mongodb_utility.export2df(database=db, collection=table1)
    df2 = mongodb_utility.export2df(database=db, collection=table2)

    # Explicit copy: a column is overwritten on this filtered frame below,
    # which otherwise triggers pandas' SettingWithCopyWarning.
    t1_notin_t2 = df1[~df1['_id'].isin(df2['_id'])][
        ['_id', '机构ID', '报告年度']].copy()
    # Keep only the first four characters of the stripped report year.
    # NOTE(review): assumes 报告年度 holds strings starting with the year.
    t1_notin_t2['报告年度'] = t1_notin_t2['报告年度'].apply(
        lambda x: str(x.strip())[0:4])
    t1_notin_t2 = t1_notin_t2.drop_duplicates(['机构ID', '报告年度'])

    missing_t2 = t1_notin_t2[['机构ID', '报告年度']].copy()
    # The report year is used as both the lower and upper year bound.
    missing_t2['max_year'] = missing_t2['报告年度']
    download_list = missing_t2.values.tolist()
    print(len(download_list))

    if auto_download:
        url = 'http://www.cninfo.com.cn/cninfo-new/data/download'
        frDownloader.getFR(url=url, stock_list=download_list, types=[table2])
    return download_list
def validate_bw_reports():
    """Cross-check the ``fzb``, ``llb`` and ``lrb`` collections by ``_id``.

    For every ordered pair of the three collections, the rows of the first
    whose ``_id`` is absent from the second are written (columns ``_id``,
    ``机构ID``, ``报告年度``) to a ``missing_<first>_<second>.csv`` file in
    the current directory. The ``fzb``-vs-``lrb`` result is also printed.
    """
    db = mongodb_utility.connect_db(db_name="list_company")
    frames = {}
    # Same load order as before: fzb, llb, lrb.
    for collection in ("fzb", "llb", "lrb"):
        frames[collection] = mongodb_utility.export2df(
            database=db, collection=collection)

    report_columns = ['_id', '机构ID', '报告年度']
    # (source collection, reference collection, output file), in export order.
    comparisons = (
        ("fzb", "lrb", 'missing_fzb_lrb.csv'),
        ("fzb", "llb", 'missing_fzb_llb.csv'),
        ("lrb", "fzb", 'missing_lrb_fzb.csv'),
        ("llb", "fzb", 'missing_llb_fzb.csv'),
        ("lrb", "llb", 'missing_lrb_llb.csv'),
        ("llb", "lrb", 'missing_llb_lrb.csv'),
    )

    for position, (source, reference, csv_name) in enumerate(comparisons):
        source_df = frames[source]
        found = source_df['_id'].isin(frames[reference]['_id'])
        unmatched = source_df[~found][report_columns]
        # Only the first comparison (fzb vs lrb) is echoed to stdout.
        if position == 0:
            print(unmatched)
        unmatched.to_csv(csv_name)