def run_function(metal_lst, metal_dict, recommend, conn, client, error_path):
    '''
    :param metal_lst: list of str, the metals, e.g. ['Cu', 'Pb', ...]
    :param metal_dict: dict, maps each metal to its relevant content, e.g. {'Cu': '铜'}
    :param recommend: df, the total dataframe extracted from the html in step2
    :param conn: object, the database connection
    :param client: object, the client of the aip
    :param error_path: str, the path where the errors are recorded
    '''
    already_error = other_function.load_json(error_path)
    raise_error = main_function(metal_lst, metal_dict, recommend, conn, client)
    # carry over previously recorded errors that were not raised again this run
    if already_error != {}:
        for k, v in already_error.items():
            if k not in raise_error:
                raise_error[k] = v
    other_function.dump_json(raise_error, error_path)
    if raise_error != {}:
        print('found some errors, please check them')
    print('completed')
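# A hypothetical invocation of run_function, for illustration only; the
# recommend dataframe, database connection, and aip client are assumed to come
# from the earlier pipeline steps and are not constructed here, and the error
# path below is a made-up example, not a path used by this codebase:
#
#     metal_lst = ['Cu', 'Pb']
#     metal_dict = {'Cu': '铜', 'Pb': '铅'}
#     run_function(metal_lst, metal_dict, recommend, conn, client,
#                  './error_log.json')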
def check_function(metal_lst, metal_dict, recommend, conn, client, error_path):
    '''
    :param metal_lst: list of str, the metals, e.g. ['Cu', 'Pb', ...]
    :param metal_dict: dict, maps each metal to its relevant content, e.g. {'Cu': '铜'}
    :param recommend: df, the total dataframe extracted from the html in step2
    :param conn: object, the database connection
    :param client: object, the client of the aip
    :param error_path: str, the path where the errors are recorded
    '''
    already_error = other_function.load_json(error_path)
    if already_error == {}:
        print('no errors, no need to check')
        return
    # only re-run the rows whose urls previously failed
    recommend = recommend[recommend['url'].isin(already_error)]
    raise_error = main_function(metal_lst, metal_dict, recommend, conn, client)
    other_function.dump_json(raise_error, error_path)
    print('completed')
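# A minimal sketch of the isin-based filtering check_function relies on,
# assuming (as load_json suggests) that the error file maps url -> error
# message. filter_to_errors is a hypothetical helper, not part of this
# codebase:
def filter_to_errors(recommend, already_error):
    # pandas isin() against a dict tests membership in the dict's keys,
    # so this keeps only the rows whose url previously failed
    return recommend[recommend['url'].isin(already_error)]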
    if error_link == []:
        print('the error file is empty, no need to run it again')
    else:
        print('begin to run the error links again')
        error = []
        for website in error_link:
            tmp = crawler.crawl(wait_time, website, webdriver_link, switch)
            if tmp == 1:  # 1 signals a failed crawl
                error.append(website)
        other_function.dump_json(error, error_link_path)
        print('crawling the error links is finished; please check whether '
              'the error file is empty, and if not, run again')
elif switch == 'run':
    switch = True
    original_error = other_function.load_json(error_link_path)
    if original_error != []:
        print('the error file is not empty, please remember to check it')
    error = []
    for website in website_link_lst:
        tmp = crawler.crawl(wait_time, website, webdriver_link, True)
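# The loops above keep every link whose crawl returns the sentinel value 1
# and write the survivors back to the error file, so repeated runs shrink the
# list. A generic sketch of that pattern; retry_failed and crawl_once are
# hypothetical stand-ins, not part of this codebase:
def retry_failed(links, crawl_once):
    still_failing = []
    for link in links:
        if crawl_once(link) == 1:  # 1 signals a failed crawl
            still_failing.append(link)
    return still_failing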
current_time = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())
html_error_path = './step2_data/html/error_log_{}.json'.format(current_time)
recommend_error_path = './step2_data/recommend/error_log_{}.json'.format(
    current_time)

print('####################################################################')
print('begin to extract the info from the html')
extracter = html_extracter(conn)
df_crawl = pd.read_sql('Select * from html', con=conn)
problem = extracter.extract(df_crawl)
if problem != {}:
    other_function.dump_json(problem, html_error_path)
    print('errors found, please check the error file: {}'.format(
        html_error_path))
else:
    print('no errors found')
print('finish extracting the info from the html')
print('####################################################################')

print('begin to extract the recommend from the content')
recommend = recommend_extracter(conn)
df_content = pd.read_sql('Select * from content', conn)
keyword = [
    '震荡', '偏强', '观望', '做多', '轻仓', '反弹', '偏弱', '上涨', '企稳', '承压',
    '卖出', '短线', '短多', '整理', '止损', '多仓', '突破', '支持', '上行', '空间',
    '回补', '低位', '悲观', '回落',
def train_func(price_4e, met, metal_columns, window_list, train_period,
               predict_period, threshold, freq_win, repo_win, conn):
    '''
    :param price_4e: df, the total data we get from 4e
    :param met: str, the metal we need to predict
    :param metal_columns: str, the metal column in the LME file
    :param window_list: list, the forward-return horizons, e.g. [1, 3, 5, 10, 20, 60]
    :param train_period: list, [datetime1, datetime2], the training period
    :param predict_period: list, [datetime1, datetime2], the predicting period
    :param threshold: int, the threshold we use
    :param freq_win: int, how many reports after selection we need to consider
    :param repo_win: int, how many days we need to consider for one day's prediction
    :param conn: object, the connection between the code and the database
    :return discrete_param: dict, the discretization quantiles
    :return accur: dataframe, the accuracy table for the train period
    '''
    # Get the sentiment for the train period. The sentiment is rolling, so the
    # dividing points must be defined from the recommends that fall within the
    # train period.
    sentiment = mf.get_sentiment(met, train_period, conn)
    print('getting sentiment score')

    # Get the price for the train period. Rows near the bottom may have NaN
    # return prices because their future results are not available yet.
    # (Originally the horizons were hardcoded as [1, 3, 5, 10, 20, 60]; the
    # window_list parameter is passed through instead so that the columns
    # built below always exist.)
    price_forward = mf.get_price(price_4e, metal_columns, window_list,
                                 train_period)
    print('the length before the price_forward merge is: {}'.format(
        len(price_forward)))

    # Define the discretization points for the forward returns and for
    # Sentiment_article, both based on the train-period data.
    discrete_columns = ['return_{}d'.format(i) for i in window_list]
    discrete_param = {}
    for i in discrete_columns:
        tier_list = mf.tier_point(3)
        quan_list = mf.quan_point(tier_list,
                                  price_forward[~pd.isna(price_forward[i])][i])
        discrete_param[i + '_discrete'] = quan_list
    for i in ['Sentiment_article']:
        tier_list = mf.tier_point(3)
        quan_list = mf.quan_point(tier_list,
                                  sentiment[~pd.isna(sentiment[i])][i])
        discrete_param[i + '_discrete'] = quan_list

    # After merging the data, check the length again.
    price_sentiment = price_forward.merge(sentiment, left_on='Index',
                                          right_on='date', how='inner')
    price_sentiment.drop([
        'Index', 'title', '{}_fact'.format(met), '{}_action'.format(met),
        '{}_new_action'.format(met), 'Sentiment'
    ], axis=1, inplace=True)
    print('the length after the price_forward merge is: {}'.format(
        len(price_sentiment)))

    # Drop the NaN rows of price_sentiment and make sure the data contains no
    # duplicates.
    price_sentiment = price_sentiment.dropna()
    print('the original length of the dataframe: {}'.format(
        len(price_sentiment)))
    price_sentiment.drop_duplicates(keep='first', inplace=True)
    print('the processed length of the dataframe: {}'.format(
        len(price_sentiment)))

    # Divide the price data (after dropna and drop_duplicates) into 3 groups.
    for i in window_list:
        res = price_sentiment['return_{}d'.format(i)].apply(
            lambda x: mf.discrete_method_with_point(
                discrete_param['return_{}d_discrete'.format(i)], x))
        # shift the class labels from [0, 1, 2] onto [-1, 0, 1]
        price_sentiment['discrete_{}d'.format(i)] = [j - 1 for j in res]
        del res

    # Divide the sentiment data into 3 groups; the last step again maps the
    # classification onto [-1, 0, 1].
    res = price_sentiment['Sentiment_article'].apply(
        lambda x: mf.discrete_method_with_point(
            discrete_param['Sentiment_article_discrete'], x))
    price_sentiment['discrete_score'] = [i - 1 for i in res]
    del res

    # Define the table name, which may be saved in the intermediate backup.
    train_period_start = datetime.datetime.strftime(train_period[0], '%Y%m%d')
    train_period_end = datetime.datetime.strftime(train_period[1], '%Y%m%d')
    predict_period_start = datetime.datetime.strftime(predict_period[0],
                                                      '%Y%m%d')
    predict_period_end = datetime.datetime.strftime(predict_period[1],
                                                    '%Y%m%d')
    table_name = '{}_{}_{}_{}_{}_{}'.format(met, threshold, freq_win, repo_win,
                                            predict_period_start,
                                            predict_period_end)

    score_class = mf.Score(met, table_name, window_list,
                           keep_intermediate=True)
    accur = score_class.cal_accur(price_sentiment)
    accur = accur.sort_values(
        ['url', 'date', 'news_type', 'company', 'prec_horizon'])
    score = score_class.cal_score(price_sentiment, accur, threshold, freq_win,
                                  score_point_num=3, cal_date=repo_win)

    # Keep the final score quantiles so they can be applied in the predicting
    # period.
    for i in window_list:
        tmp = score[score['horizon'] == i].copy()
        if len(tmp) != 0:
            tier_list = mf.tier_point(3)
            quan_list = mf.quan_point(tier_list, tmp['score'])
            discrete_param['final_score_{}d'.format(i)] = quan_list
        else:
            discrete_param['final_score_{}d'.format(i)] = [0]

    other_function.dump_json(
        discrete_param,
        './discrete_param/{}/{}_{}_{}_{}_{}_{}.json'.format(
            met, met, train_period_start, train_period_end, threshold,
            freq_win, repo_win))
    return discrete_param, accur
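# A minimal sketch of what the quantile discretization above appears to do;
# mf.tier_point / mf.quan_point / mf.discrete_method_with_point are
# project-internal helpers whose source is not shown here, so this is an
# assumption about their behavior, not the actual implementation. With 3
# tiers, the cut points are the 1/3 and 2/3 quantiles of the training data,
# and a value is bucketed by counting how many cut points it exceeds, giving
# labels 0..2 that the code above shifts onto [-1, 0, 1].
import numpy as np

def quantile_cut_points(values, n_tiers=3):
    # equally spaced quantiles, e.g. n_tiers=3 -> [1/3, 2/3]
    qs = [k / n_tiers for k in range(1, n_tiers)]
    return list(np.quantile(np.asarray(values, dtype=float), qs))

def bucket(value, cut_points):
    # number of cut points the value exceeds: 0, 1, ..., len(cut_points)
    return int(sum(value > p for p in cut_points))

# usage: points = quantile_cut_points(train_returns); bucket(x, points) - 1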
            # eval restores the stringified keys/values saved in the json
            # below; ast.literal_eval would be a safer equivalent
            best_param[eval(key)] = eval(val)
    else:
        best_param, res = tpf.adjust_param(
            price_4e, met, metal_columns, short_term_horizon, train_period,
            predict_period, threshold_lst, short_term_freq_win,
            short_term_repo_win, short_term_predict_half,
            short_term_whether_use_threshold_for_horizons, conn)
        res.to_csv(
            './adjustment_intermediate/{}/{}_{}_{}_short_term_adjustment.csv'
            .format(met, met, predict_start_date, predict_end_date),
            index=False)
        # json keys must be strings, so stringify the params before saving
        best_param_out = {}
        for key, val in best_param.items():
            best_param_out[str(key)] = str(val)
        hyper_param['short_term'][met][short_period_key] = best_param_out
        other_function.dump_json(hyper_param, hyper_path)

    for hor, best_threshold in best_param.items():
        if predict_mode == 'reproduction':
            ans = online_reproduction(price_4e, met, metal_columns, [hor],
                                      train_period, predict_period,
                                      best_threshold, short_term_freq_win,
                                      short_term_repo_win, conn)
        elif predict_mode == 'run':
            ans = main_controller(price_4e, met, metal_columns, [hor],
                                  train_period, predict_period,
                                  best_threshold, short_term_freq_win,
                                  short_term_repo_win, conn)
        ans.to_csv('./predict_result/{}/{}/{}_{}_{}_{}.csv'.format(
            met, hor, predict_result_name, best_threshold, hor, predict_mode),