Example #1
0
def run_function(metal_lst, metal_dict, recommend, conn, client, error_path):
    '''
    Run the main extraction pipeline and persist the error log, merging any
    newly raised errors with the ones recorded in previous runs.

    :param metal_lst: list of str, the metals to process, e.g. ['Cu', 'Pb', ...]
    :param metal_dict: dict, maps each metal symbol to its related content,
                       e.g. {'Cu': '铜', ...}
    :param recommend: DataFrame, the total data extracted from the html
                      (produced in step2)
    :param conn: object, the database connection
    :param client: object, the client of the aip
    :param error_path: str, path of the json file used to record the errors
    '''
    already_error = other_function.load_json(error_path)

    raise_error = main_function(metal_lst, metal_dict, recommend, conn, client)

    # Carry over old errors that did not re-occur in this run; entries raised
    # again keep the fresh value from main_function.  (Iterating an empty
    # dict is a no-op, so no emptiness guard is needed.)
    for k, v in already_error.items():
        raise_error.setdefault(k, v)

    other_function.dump_json(raise_error, error_path)
    if raise_error != {}:
        print('find some error, please check it')
    print('completed')
Example #2
0
def check_function(metal_lst, metal_dict, recommend, conn, client, error_path):
    '''
    Re-run the extraction only for the entries that failed previously and
    overwrite the error log with whatever still fails.

    :param metal_lst: list of str, the metals to process, e.g. ['Cu', 'Pb', ...]
    :param metal_dict: dict, maps each metal symbol to its related content,
                       e.g. {'Cu': '铜', ...}
    :param recommend: DataFrame, the total data extracted from the html
                      (produced in step2)
    :param conn: object, the database connection
    :param client: object, the client of the aip
    :param error_path: str, path of the json file used to record the errors
    '''
    already_error = other_function.load_json(error_path)

    # Guard clause: nothing recorded means there is nothing to re-check.
    if already_error == {}:
        print('no error, no need to check')
        return

    # Restrict the dataframe to the urls that failed in the previous run.
    recommend = recommend[recommend['url'].isin(already_error)]

    raise_error = main_function(metal_lst, metal_dict, recommend, conn, client)

    other_function.dump_json(raise_error, error_path)

    print('completed')
Example #3
0
        if error_link == []:

            print('the error file is empty, it is unnecessary to run again')

        else:
            print('begin to run the error link again')

            error = []

            for website in error_link:
                tmp = crawler.crawl(wait_time, website, webdriver_link, switch)
                if tmp == 1:
                    error.append(website)

            other_function.dump_json(error, error_link_path)
            print(
                'crawling the error link is finished, please check the error file to see if it is empty, if not then run again'
            )
    elif switch == 'run':

        switch = True

        original_error = other_function.load_json(error_link_path)

        if original_error != []:
            print('error file is not empty, remember to check please')

        error = []
        for website in website_link_lst:
            tmp = crawler.crawl(wait_time, website, webdriver_link, True)
Example #4
0
    current_time = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())
    html_error_path = './step2_data/html/error_log_{}.json'.format(
        current_time)
    recommend_error_path = './step2_data/recommend/error_log_{}.json'.format(
        current_time)
    print(
        '####################################################################')
    print('begin to extract the info from the html')
    extracter = html_extracter(conn)
    df_crawl = pd.read_sql('Select * from html', con=conn)

    problem = extracter.extract(df_crawl)

    if problem != {}:
        other_function.dump_json(problem, html_error_path)
        print('error found, please check the error file:{}'.format(
            html_error_path))
    else:
        print('no error found')
    print('finish extracting the info from the html')
    print(
        '####################################################################')

    print('begin to extract the recommend from the content')
    recommend = recommend_extracter(conn)
    df_content = pd.read_sql('Select * from content', conn)

    keyword = [
        '震荡', '偏强', '观望', '做多', '轻仓', '反弹', '偏弱', '上涨', '企稳', '承压', '卖出', '短线',
        '短多', '整理', '止损', '多仓', '突破', '支持', '上行', '空间', '回补', '低位', '悲观', '回落',
def train_func(price_4e, met, metal_columns, window_list, train_period,
               predict_period, threshold, freq_win, repo_win, conn):
    '''
    Fit the train-period artefacts: the discretization quantiles for forward
    returns / sentiment / final scores, and the per-report accuracy table.
    The quantiles are also dumped to a json file so the predicting period can
    reuse them.

    :param price_4e: df, the total price data we get from 4e
    :param met: str, the metal we need to predict
    :param metal_columns: str, the metal column in the LME file
    :param window_list: list of int, the forward-return horizons,
                        e.g. [1, 3, 5, 10, 20, 60]
    :param train_period: list, [datetime1, datetime2], the training period
    :param predict_period: list, [datetime1, datetime2], the predicting period
    :param threshold: int, the threshold we use
    :param freq_win: int, how many reports after selection we need to consider
    :param repo_win: int, how many days we need to consider for one day
                     prediction
    :param conn: object, the connection between the code and the database

    :return discrete_param: dict, the discretization quantiles (forward-return
        and sentiment cut points plus the final score quantiles per horizon)
    :return accur: dataframe, the per-report accuracy over the train period
    '''
    # The sentiment is rolling according to the train period, so only the
    # recommends inside the train period define the dividing points.
    sentiment = mf.get_sentiment(met, train_period, conn)
    print('getting sentiment score')

    # Forward returns for the train period.  Rows near the bottom may be NaN
    # because the future price needed to compute the return does not exist.
    # BUGFIX: pass the caller-supplied window_list instead of the hard-coded
    # [1, 3, 5, 10, 20, 60] — the 'return_{}d' columns consumed below are
    # generated from window_list, so any other horizon set would KeyError.
    price_forward = mf.get_price(price_4e, metal_columns, window_list,
                                 train_period)
    print('the length before price_forward merge is : {}'.format(
        len(price_forward)))

    # Discretization cut points for each forward-return column, estimated on
    # the non-NaN train-period values only.
    discrete_columns = ['return_{}d'.format(i) for i in window_list]
    discrete_param = {}
    for col in discrete_columns:
        tier_list = mf.tier_point(3)
        quan_list = mf.quan_point(
            tier_list, price_forward[~pd.isna(price_forward[col])][col])
        discrete_param[col + '_discrete'] = quan_list

    # Same procedure for the article-level sentiment score.
    tier_list = mf.tier_point(3)
    quan_list = mf.quan_point(
        tier_list,
        sentiment[~pd.isna(sentiment['Sentiment_article'])]
        ['Sentiment_article'])
    discrete_param['Sentiment_article_discrete'] = quan_list

    # Merge sentiment onto the prices; the before/after lengths are printed so
    # a surprising row loss in the inner join is visible.
    price_sentiment = price_forward.merge(sentiment,
                                          left_on='Index',
                                          right_on='date',
                                          how='inner')
    price_sentiment.drop([
        'Index', 'title', '{}_fact'.format(met), '{}_action'.format(met),
        '{}_new_action'.format(met), 'Sentiment'
    ],
                         axis=1,
                         inplace=True)
    print('the length after price_forward merge is : {}'.format(
        len(price_sentiment)))

    # Drop the NaN rows and guarantee the data has no duplicates.
    price_sentiment = price_sentiment.dropna()
    print('the original length of the dataframe : {}'.format(
        len(price_sentiment)))
    price_sentiment.drop_duplicates(keep='first', inplace=True)
    print('the processsed length of the dataframe : {}'.format(
        len(price_sentiment)))

    # Bucket each forward return into 3 groups and shift the labels to
    # [-1, 0, 1].
    for i in window_list:
        res = price_sentiment['return_{}d'.format(i)].apply(
            lambda x: mf.discrete_method_with_point(
                discrete_param['return_{}d_discrete'.format(i)], x))
        price_sentiment['discrete_{}d'.format(i)] = [j - 1 for j in res]
        del res

    # Same 3-group bucketing (shifted to [-1, 0, 1]) for the sentiment.
    res = price_sentiment['Sentiment_article'].apply(
        lambda x: mf.discrete_method_with_point(
            discrete_param['Sentiment_article_discrete'], x))
    price_sentiment['discrete_score'] = [i - 1 for i in res]
    del res

    # Table name under which the intermediate backup may be saved.
    train_period_start = datetime.datetime.strftime(train_period[0], '%Y%m%d')
    train_period_end = datetime.datetime.strftime(train_period[1], '%Y%m%d')

    predict_period_start = datetime.datetime.strftime(predict_period[0],
                                                      '%Y%m%d')
    predict_period_end = datetime.datetime.strftime(predict_period[1],
                                                    '%Y%m%d')
    table_name = '{}_{}_{}_{}_{}_{}'.format(met, threshold, freq_win, repo_win,
                                            predict_period_start,
                                            predict_period_end)

    score_class = mf.Score(met,
                           table_name,
                           window_list,
                           keep_intermediate=True)

    accur = score_class.cal_accur(price_sentiment)
    accur = accur.sort_values(
        ['url', 'date', 'news_type', 'company', 'prec_horizon'])

    score = score_class.cal_score(price_sentiment,
                                  accur,
                                  threshold,
                                  freq_win,
                                  score_point_num=3,
                                  cal_date=repo_win)

    # Final score quantiles per horizon, applied later in the predicting
    # period; [0] marks a horizon that produced no score rows at all.
    for i in window_list:
        tmp = score[score['horizon'] == i].copy()
        if len(tmp) != 0:
            tier_list = mf.tier_point(3)
            quan_list = mf.quan_point(tier_list, tmp['score'])
            discrete_param['final_score_{}d'.format(i)] = quan_list
        else:
            discrete_param['final_score_{}d'.format(i)] = [0]
    other_function.dump_json(
        discrete_param, './discrete_param/{}/{}_{}_{}_{}_{}_{}.json'.format(
            met, met, train_period_start, train_period_end, threshold,
            freq_win, repo_win))

    return discrete_param, accur
Example #6
0
                best_param[eval(key)] = eval(val)
        else:
            best_param, res = tpf.adjust_param(
                price_4e, met, metal_columns, short_term_horizon, train_period,
                predict_period, threshold_lst, short_term_freq_win,
                short_term_repo_win, short_term_predict_half,
                short_term_whether_use_threshold_for_horizons, conn)
            res.to_csv(
                './adjustment_intermediate/{}/{}_{}_{}_short_term_adjustment.csv'
                .format(met, met, predict_start_date, predict_end_date),
                index=False)
            best_param_out = {}
            for key, val in best_param.items():
                best_param_out[str(key)] = str(val)
            hyper_param['short_term'][met][short_period_key] = best_param_out
            other_function.dump_json(hyper_param, hyper_path)

        for hor, best_threshold in best_param.items():
            if predict_mode == 'reproduction':
                ans = online_reproduction(price_4e, met, metal_columns, [hor],
                                          train_period, predict_period,
                                          best_threshold, short_term_freq_win,
                                          short_term_repo_win, conn)
            elif predict_mode == 'run':
                ans = main_controller(price_4e, met, metal_columns, [hor],
                                      train_period, predict_period,
                                      best_threshold, short_term_freq_win,
                                      short_term_repo_win, conn)
            ans.to_csv('./predict_result/{}/{}/{}_{}_{}_{}.csv'.format(
                met, hor, predict_result_name, best_threshold, hor,
                predict_mode),