Example #1
def test(daySpan=0):
    data = pd.read_csv('/home/mars/Data/000032.csv')
    # sort by date in ascending order
    data = data[::-1]
    # add the other technical indicators
    result = get_other_indicators(data)
    # result = pd.DataFrame(data)
    # drop rows that contain NaN values
    deal_result = result.dropna(axis=0)
    data_x = dataX_from_dataFrame(deal_result)
    if daySpan == 0:
        # build X; binarize the Y values

        data_y = dataY_from_dataFrame(deal_result)
    else:
        data_y = dataY_for_Nmean(deal_result, N=2)
    s_deal_data = (data_x, data_y)
    # feature selection
    #delete_feature = Filter(use=False).feature_RandomForest(deal_result=deal_result, final_data=s_deal_data, data_y=data_y, cicle=3)
    delete_feature = []
    # drop the redundant features
    t_deal_result = deal_result.drop(labels=delete_feature, axis=1)
    final_data_X = dataX_from_dataFrame(t_deal_result)
    final_data = (final_data_X, data_y)
    # fit the model
    print('Model test with the best feature set:')
    print('')
    random_forest(final_data)
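The random_forest helper used throughout these examples is not shown here. A minimal sketch of what such a helper might look like, assuming it takes an (X, y) tuple, splits it chronologically, fits a scikit-learn RandomForestClassifier and returns the feature importances (the name, signature and split rule are assumptions, not the original code):

import numpy as np
from sklearn.ensemble import RandomForestClassifier

def random_forest_sketch(final_data, ratio=0.7, classes=None):
    # hypothetical stand-in for the random_forest() helper used in these examples
    # (classes is accepted for the multi-class variant but unused in this sketch)
    data_x = np.asarray(final_data[0])
    data_y = np.asarray(final_data[1])
    split = int(len(data_x) * ratio)  # chronological split, no shuffling
    clf = RandomForestClassifier(n_estimators=200, random_state=0)
    clf.fit(data_x[:split], data_y[:split])
    print('test score:', clf.score(data_x[split:], data_y[split:]))
    return clf.feature_importances_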
Example #2
def test():
    data = pd.read_csv(
        '/home/mars/Data/finialData/electronic_infomation/000021.csv')
    data = data[::-1]
    result = get_other_indicators(data)

    #result = data[['price_change', 'p_change']]
    deal_result = result.dropna(axis=0)
    close = deal_result['close']
    print(close.shape)
    s_deal_data = deal_data_from_dataFrame(deal_result)
    data_x = s_deal_data[0]
    data_y = s_deal_data[1]
    # feature processing
    #t_deal_data_x = Filter(use=False).Variance_selection(threshold=3, data=s_deal_data)[0]
    # standardization
    final_data_x = nr.standardized_mars(data_x)

    pca_x = oc.LOF_PCA_for_Clustering(final_data_x)

    final_data_x_LOF = oc.replace_Singular(final_data_x, oc.get_pred_test())
    print('final_data_x_LOF', final_data_x_LOF[:16])

    print(final_data_x_LOF.shape)
    # dimensionality reduction
    #pca_x = PCA_mars.getPcaComponent(final_data_x, n_components=0.9)
    # #############################################################################
    # Compute clustering with MeanShift
    x_train = final_data_x_LOF[:int(len(data_x) * 0.7)]
    print('x_train', x_train.shape)
    x_test = final_data_x_LOF[int(len(data_x) * 0.7):]
    # The following bandwidth can be automatically detected using estimate_bandwidth
    bandwidth = estimate_bandwidth(x_train, quantile=0.2, random_state=1)

    ms = MeanShift(bandwidth=bandwidth, bin_seeding=False)
    ms.fit(final_data_x_LOF)
    labels = ms.labels_
    print('error size', labels[labels != 0].size)
    print('index of not 0 *******')
    print([i for i, x in enumerate(labels) if x != 0])
    print('*******')
    print(labels)
    print(labels.shape)
    cluster_centers = ms.cluster_centers_

    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)

    #score = metrics.silhouette_score(pca_x, labels, metric='euclidean')
    #score1 = metrics.calinski_harabaz_score(pca_x, labels)
    #print(score)
    #print(score1)

    print("number of estimated clusters : %d" % n_clusters_)
    plt.plot(range(len(close)), close)
    plt.plot(range(len(labels)), labels)
    plt.show()
    # #############################################################################
    # Plot result
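The oc.LOF_PCA_for_Clustering, oc.get_pred_test and oc.replace_Singular helpers are project code that is not part of these snippets. A rough sketch of the underlying idea, assuming LocalOutlierFactor labels (-1 for outliers, 1 for inliers) and an assumed replacement rule that overwrites flagged rows with the column medians of the inliers:

import numpy as np
from sklearn.neighbors import LocalOutlierFactor

def lof_flag_and_replace(x, n_neighbors=20, contamination=0.1):
    # sketch only: flag outliers with LOF and replace them (not the original oc module)
    x = np.asarray(x, dtype=float)
    pred = LocalOutlierFactor(n_neighbors=n_neighbors,
                              contamination=contamination).fit_predict(x)
    cleaned = x.copy()
    cleaned[pred == -1] = np.median(x[pred == 1], axis=0)  # assumed replacement rule
    return cleaned, pred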
Example #3
def deal_dataFrame(sql, del_list):
    # connect to the local database by default
    con = conn.mysql_operator()
    original_data = con.get_pd_data(sql=sql)

    s_deal_data = original_data.drop(columns=del_list)
    # add the other technical indicators
    t_deal_data = get_other_indicators(s_deal_data).dropna(axis=0)
    return t_deal_data
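conn.mysql_operator is a project-specific wrapper that is not shown here. For reference, the same DataFrame could be fetched with pandas and SQLAlchemy directly; the connection string below is a placeholder and get_other_indicators is the helper used throughout these examples:

import pandas as pd
from sqlalchemy import create_engine

def deal_dataFrame_sketch(sql, del_list):
    # sketch of deal_dataFrame on top of a plain SQLAlchemy engine instead of conn.mysql_operator
    engine = create_engine('mysql+pymysql://user:password@localhost/stocks')  # placeholder DSN
    original_data = pd.read_sql(sql, engine)
    s_deal_data = original_data.drop(columns=del_list)
    # add the other technical indicators and drop NaN rows, as in the original
    return get_other_indicators(s_deal_data).dropna(axis=0)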
Example #4
def compare_s_no(dataPath=""):
    data = pd.read_csv(dataPath)
    # add the other technical indicators
    result = get_other_indicators(data)
    deal_result = result.dropna(axis=0)[-100:]
    # rework the decision based on LOF processing of the raw data
    final_data = deal_data_from_dataFrame(deal_result)
    # data_y = final_data[1]
    final_data_x = nr.standardized_mars(final_data[0])
    print(final_data_x.shape)
    # use the PCA data directly: apply outlier handling to 100% of it, then train the random forest
    # run PCA on 100% of the data
    data_y = final_data[1]
    data_x = final_data[0]
    final_data_x = nr.standardized_mars(data_x)
    # run PCA on 100% of the data
    pca_x = PCA_mars.getPcaComponent(final_data_x, n_components=54)

    # outlier handling
    oc.LOF_PCA_for_Clustering(pca_x, isUsePCA=False)

    #random_forest((lof_data_x, new_all_y))

    lof_pred = oc.get_pred_test()
    error_index = oc.get_delete_index()
    lof_data_y = oc.replace_Singular(data_y, lof_pred)

    fig, (ax1, ax2) = plt.subplots(1, 2, sharex=True)
    ax1.scatter(range(len(data_y)), data_y, label='data_y')
    error_close = data_y[error_index]
    # ax1.plot(range(len(lof_pred)), lof_pred, label='lof_pred')

    ax1.scatter(error_index, error_close, label='error_y', c='r', alpha=0.2)
    # ax1.xlabel('x -')
    # ax1.ylabel('y -')
    # ax1.title('plot open')
    ax1.legend()
    # ax2.ylabel('close')

    error_lof_y = lof_data_y[error_index]

    ax2.scatter(range(len(lof_data_y)), lof_data_y, label='lof_data_y')
    ax2.scatter(error_index, error_lof_y, label='error_lof_y', c='r', alpha=0.2)
    # ax2.plot(close**2, label='quadratic')
    ax2.legend()
    # adjust the spacing of the canvas
    print(len(data_y))
    print(len(lof_data_y))
    plt.tight_layout()
    plt.show()
Example #5
def main_mul_class():
    data = pd.read_csv('/home/mars/Data/000032.csv')
    # sort by date in ascending order
    data = data[::-1]
    # add the other technical indicators
    result = get_other_indicators(data)
    delete_feature = []
    deal_result = result.dropna(axis=0)
    # print(deal_result)
    print('***')
    columns = list(deal_result.columns.values)
    #print(len(columns))
    final_data = deal_data_for_multiclassfy(deal_result)
    classes = range(-10,11,1)
    feature_importances = random_forest(final_data, classes=classes)
    print(feature_importances)
Example #6
def analyze_lof(dataPath=""):
    data = pd.read_csv(dataPath)
    data = data[::-1]
    # add the other technical indicators
    result = get_other_indicators(data)
    deal_result = result.dropna(axis=0)
    # rework the decision based on LOF processing of the raw data
    # final_data = deal_data_from_dataFrame(deal_result)

    # get the data for the electronic information sector
    # NDX_sql = 'SELECT open,close,low,high,volume,other,change_rate, DATE_ADD(date,INTERVAL 1 DAY) as date from global_info where industry_name = "纳斯达克" order by date asc;'
    # NDX_delete_list = ['id', 'category_name', 'industry_name', 'industry_key', 'total_money']
    # forward-fill the NaN values
    # NDXData = deal_dataFrame(NDX_sql, [])

    final_data = deal_data(deal_result)
    # data_y = final_data[1]
    final_data_x = nr.standardized_mars(final_data[0])

    pca_x = PCA_mars.getPcaComponent(final_data_x, n_components=62)
    print(pca_x.shape)
    # x_train, x_test, y_train, y_test = train_test_split(pca_x, all_y, test_size=0.3, random_state=0, shuffle=False)

    # outlier handling for x
    lof_data_x = oc.LOF_PCA_for_Clustering(pca_x, isUsePCA=False)
    error_index = oc.get_delete_index()
    print(error_index)
    result = deal_result.index.tolist()
    # write every date; rows containing an outlier are flagged as 1
    with open('300113_data.csv', 'w+') as f:
        f.write('date')
        f.write(',')
        f.write('300113_Sigular')
        f.write('\n')
        for index, date in enumerate(result):
            if index in error_index:
                f.write(date)
                f.write(',')
                f.write('1')
                f.write('\n')
            else:
                f.write(date)
                f.write(',')
                f.write('0')
                f.write('\n')
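The manual file writing above can also be expressed with pandas, which avoids the repeated f.write calls; a short sketch using the same result and error_index variables:

import pandas as pd

flags = pd.DataFrame({'date': result,
                      '300113_Sigular': [1 if i in error_index else 0
                                         for i in range(len(result))]})
flags.to_csv('300113_data.csv', index=False)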
Example #7
    def __init__(self, filepath='/home/mars/Data/002446.csv', use=True):
        '''

        :param filepath: path of the data set bundled with this class
        :param use: whether to use the bundled data set; False means a different data set is supplied
        '''
        self.use = use
        if use:
            data = pd.read_csv(filepath)
            data = data[::-1]
            result = get_other_indicators(data)
            deal_result = result.dropna(axis=0)
            # print(deal_result)
            final_data = self._deal_data_from_dataFrame(deal_result)
            self.data_x = final_data[0]
            self.data_y = final_data[1]
        else:
            pass
Example #8
def get_data(dataPath=''):

    stock_code = '\'000032'

    # load the data
    collection = pd.read_csv('/home/mars/Data/finialData/code_correlation.csv',
                             index_col=0)
    collection = collection.sort_values(stock_code, ascending=False)

    # print(collection)

    # take the top-10 most correlated stocks (only the first 5 are used below)
    top_10 = collection.index.tolist()
    top_10 = top_10[:5]
    # load the data for each of them
    dataList = []
    code_name = []
    for code in top_10:
        code_name.append(code)
        # df[(df.BoolCol==3)&(df.attr==22)].index.tolist()
        # code = code_relation[code_relation.get(stock_code)==score].index
        # print('code:', code[1:])
        path = '/home/mars/Data/finialData/electronic_infomation/' + code[
            1:] + '.csv'
        code_data = pd.read_csv(path, index_col='date')

        result = get_other_indicators(code_data)
        # collect the data
        dataList.append(result)
    # align the frames by date and drop the NaN rows
    df = pd.concat(dataList, axis=1, sort=False)

    # pandas realigns the frames on their index when concatenating
    new_df = df.sort_index()
    # print('new_df:', new_df[:5])

    new_df.dropna(axis=0, inplace=True)
    # print('new_df2:', new_df.get('price_change'))
    # print('all shape:', new_df.shape)
    deal_result = new_df

    data_x = dataX_from_dataFrame(deal_result)
    data_y = dataY_from_dataFrame(deal_result)
    return (data_x, data_y)
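code_correlation.csv is read in several examples but never built in these snippets. A correlation matrix of that shape could, for instance, be computed from the closing prices with pandas; the directory layout, the 'close' column and the leading quote in the stock codes are assumptions based on the reads above:

import glob
import os
import pandas as pd

closes = {}
for path in glob.glob('/home/mars/Data/finialData/electronic_infomation/*.csv'):
    code = os.path.splitext(os.path.basename(path))[0]
    closes["'" + code] = pd.read_csv(path, index_col='date')['close']
# pairwise correlation of the closing prices, one row/column per stock code
correlation = pd.DataFrame(closes).corr()
correlation.to_csv('/home/mars/Data/finialData/code_correlation.csv')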
Example #9
def choose_model():
    # use a genetic algorithm to select the model
    data = pd.read_csv('/home/mars/Data/000032.csv')
    # sort by date in ascending order
    data = data[::-1]
    # add the other technical indicators
    result = get_other_indicators(data)
    # drop rows that contain NaN values
    deal_result = result.dropna(axis=0)
    # binarize the Y values
    s_deal_result = deal_data_from_dataFrame(deal_result)

    # process the data over a different time span, including the Y values
    # s_deal_result = deal_data_for_Nmean(deal_result, N=3)

    # feature selection (optional)
    #final_data = Filter(use=False).Variance_selection(threshold=3, data=s_deal_result)
    final_data = s_deal_result  # skip feature selection and use the processed data directly
    data_x = final_data[0]
    data_y = final_data[1]
    ratio = 0.7
    x_train = data_x[:int(len(data_x) * ratio)]
    print(x_train.shape)
    x_test = data_x[int(len(data_x) * ratio):]
    y_train = data_y[:int(len(data_y) * ratio)]
    y_test = data_y[int(len(data_y) * ratio):]
    '''
    max_time_mins: maximum run time of the search, in minutes
    mutation_rate: mutation probability
    crossover_rate: crossover probability
    n_jobs: number of parallel jobs
    generations: number of generations of the genetic algorithm, i.e. the number of iterations
    population_size: population size in each generation
    '''
    tpot = TPOTClassifier(verbosity=2, max_time_mins=40, config_dict="TPOT light", population_size=50, mutation_rate=0.9,
                          crossover_rate=0.1, n_jobs=-1)

    tpot.fit(x_train.astype(float), y_train.astype(float))
    # evaluate the best pipeline found by the search on the test set
    print(tpot.score(x_test.astype(float), y_test.astype(float)))
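After the search finishes, TPOT can also write the winning pipeline to a standalone Python file, which makes the result of choose_model reusable:

tpot.export('tpot_best_pipeline.py')  # exports the best pipeline found by the search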
Example #10
def main_3():
    # directory holding the stock files
    path = '/home/mars/Data/finialData/electronic_infomation/'
    parents = os.listdir(path)
    # test scores for the different stocks
    scoreList = []
    for parent in parents:
        child = os.path.join(path, parent)
        m_data = pd.read_csv(child)
        m_data = m_data[::-1]
        result = get_other_indicators(m_data)
        deal_data = result.dropna(axis=0)
        # process the data over a different time span
        deal_result = deal_data_for_Nmean(deal_data, N=3)
        # feature selection
        #final_data = Filter(use=False).Variance_selection(threshold=3, data=deal_result)
        final_data = deal_result  # skip feature selection and use the processed data directly
        # print(deal_result)
        print('***')
        # print(len(columns))
        random_forest(final_data, ratio=0.7)
        scoreList.append((getScore(), parent))

    print(scoreList)
Example #11
        #print(data.loc[indexs].values)

    x = np.copy(temp_x[:-1])
    return (x, y)


# Generate sample data
# centers = [[1, 1], [-1, -1], [1, -1]]
# X, _ = make_blobs(n_samples=10000, centers=centers, cluster_std=0.6)

if __name__ == '__main__':

    data = pd.read_csv(
        '/home/mars/Data/finialData/electronic_infomation/002544.csv')
    data = data[::-1]
    result = get_other_indicators(data)

    #result = data[['price_change', 'p_change']]
    deal_result = result.dropna(axis=0)
    # close = deal_result['close']
    #
    s_deal_data = deal_data_from_dataFrame(deal_result)
    data_x = s_deal_data[0]
    data_y = s_deal_data[1]
    print('data_x', data_x.shape)
    # feature processing
    #t_deal_data_x = Filter(use=False).Variance_selection(threshold=3, data=s_deal_data)[0]
    # standardization
    final_data_x = nr.standardized_mars(data_x)
    #
    # pca_x = oc.LOF_PCA_for_Clustering(final_data_x)
Example #12
def main():
    # record the score and the corresponding deleted features for every round
    score_features = []

    data = pd.read_csv('/home/mars/Data/000032.csv')
    # sort by date in ascending order
    data = data[::-1]
    # add the other technical indicators
    result = get_other_indicators(data)
    #result = pd.DataFrame(data)
    # drop rows that contain NaN values
    deal_result = result.dropna(axis=0)
    # build X; binarize the Y values
    data_x = dataX_from_dataFrame(deal_result)
    data_y = dataY_from_dataFrame(deal_result)
    s_deal_result = (data_x, data_y)
    # process the data over a different time span, including the Y values
    #s_deal_result = deal_data_for_Nmean(deal_result, N=3)

    # feature selection (optional)
    #final_data = Filter(use=False).Variance_selection(threshold=3, data=s_deal_result)
    feature_importances = np.copy(random_forest(s_deal_result))
    delete_feature = []
    # in the initial round no features are deleted

    score_features.append((getScore(),np.copy(delete_feature)))

    columns = list(deal_result.columns.values)
    #print(feature_importances)
    number = len(feature_importances)
    # drop the features whose importance is below 1 / number
    for i in range(number):
        if feature_importances[i] < (1 / number):
            # delect_index.append(i)
            delete_feature.append(columns[i])
    # if 'price_change' in delete_feature:
    #     delete_feature.remove('price_change')

    # # collect the column names to delete
    # for i in delect_index:
    #     delete_feature.append(columns[i])

    # iteratively drop the features with very small importances
    for round in range(1, 10):
        # stop once more than two thirds of the original features have been deleted
        if len(delete_feature) <= int(number*2/3):
            if delete_feature == []:
                break
            else:
                # drop the low-importance features
                deal_result_2 = deal_result.drop(labels=delete_feature, axis=1)
                print(delete_feature)
                print(round, '>>***')
                columns_2 = list(deal_result_2.columns.values)
                print(len(columns_2))
                final_data_2_X = dataX_from_dataFrame(deal_result_2)
                final_data_2 = (final_data_2_X, data_y)
                feature_importances_2 = random_forest(final_data_2)
                score_features.append((getScore(), np.copy(delete_feature)))
                #print(feature_importances_2)
                # drop the two features with the smallest importances
                for min_index in np.argsort(feature_importances_2)[:2]:
                    delete_feature.append(columns_2[min_index])
                del(deal_result_2)
                del(columns_2)
                del(final_data_2_X)
                del(feature_importances_2)
        else:
            break
    # find the highest score and print the corresponding deleted features
    scoreList = []
    for one in score_features:
        score = one[0]
        scoreList.append(score)

    max_score = max(scoreList)
    max_index = scoreList.index(max_score)
    remove_feature = score_features[max_index][1]
    print(score_features)
    print(max_score, remove_feature)
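The elimination loop in main() is essentially a hand-rolled recursive feature elimination. scikit-learn ships this as RFECV; the sketch below is an alternative formulation, not the original code, and assumes the data_x, data_y and column names from above:

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV

def select_features_rfecv(data_x, data_y, columns):
    # cross-validated recursive feature elimination, dropping 2 features per step
    selector = RFECV(RandomForestClassifier(n_estimators=200, random_state=0),
                     step=2, cv=3)
    selector.fit(data_x, data_y)
    print('optimal number of features:', selector.n_features_)
    return [c for c, keep in zip(columns, selector.support_) if keep]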
Example #13
def other_main():

    np.random.seed(42)
    data = pd.read_csv(
        '/home/mars/Data/finialData/electronic_infomation/300297.csv')
    data = data[::-1]
    result = get_other_indicators(data)
    delete_feature = []
    deal_result = result.dropna(axis=0)
    # print(deal_result)
    print('***')
    #print(len(columns))

    final_data = deal_data_from_dataFrame(deal_result)
    data_y = final_data[1]
    final_data_x = nr.standardized_mars(final_data[0])
    print(final_data_x.shape)
    pca_x = PCA_mars.getPcaComponent(final_data_x, n_components=0.9)

    # xx, yy = np.meshgrid(np.linspace(-5, 5, 500), np.linspace(-5, 5, 500))
    # # Generate normal (not abnormal) training observations
    # X = 0.3 * np.random.randn(100, 2)
    # X_train = np.r_[X + 2, X - 2]
    # # Generate new normal (not abnormal) observations
    # X = 0.3 * np.random.randn(20, 2)
    # X_test = np.r_[X + 2, X - 2]
    # # Generate some abnormal novel observations
    # X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))

    # fit the model for novelty detection (novelty=True)
    print('pca_x', pca_x.shape)
    clf = LocalOutlierFactor(n_neighbors=20, novelty=True, contamination=0.1)
    clf.fit(pca_x)
    # note: the scikit-learn novelty-detection example warns against calling predict,
    # decision_function or score_samples on the training data; here predict is applied
    # to pca_x anyway in order to flag the outliers within it
    y_pred_test = clf.predict(pca_x)
    print(y_pred_test)
    error_index = [i for i, x in enumerate(y_pred_test) if x == -1]

    print('error size', y_pred_test[y_pred_test == -1].size)
    print('index of witch is -1 *******')
    print([i for i, x in enumerate(y_pred_test) if x == -1])
    print('*******')
    # y_pred_outliers = clf.predict(X_outliers)
    # n_error_test = y_pred_test[y_pred_test == -1].size
    # n_error_outliers = y_pred_outliers[y_pred_outliers == 1].size
    '''
    # plot the learned frontier, the points, and the nearest vectors to the plane
    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    
    plt.title("Novelty Detection with LOF")
    plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), 0, 7), cmap=plt.cm.PuBu)
    a = plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='darkred')
    plt.contourf(xx, yy, Z, levels=[0, Z.max()], colors='palevioletred')
    
    s = 40
    b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white', s=s, edgecolors='k')
    b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='blueviolet', s=s,
                     edgecolors='k')
    c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='gold', s=s,
                    edgecolors='k')
    plt.axis('tight')
    plt.xlim((-5, 5))
    plt.ylim((-5, 5))
    plt.legend([a.collections[0], b1, b2, c],
               ["learned frontier", "training observations",
                "new regular observations", "new abnormal observations"],
               loc="upper left",
               prop=matplotlib.font_manager.FontProperties(size=11))
    plt.xlabel(
        "errors novel regular: %d/40 ; errors novel abnormal: %d/40"
        % (n_error_test, n_error_outliers))
    plt.show()
    
    '''
Example #14
def fit_randomForest_MS(daySpan=0, dataPath="", stock_code=''):
    # data = pd.read_csv(dataPath)
    # data = data[::-1]
    # print(data[:10])
    # # add the other technical indicators
    # result = get_other_indicators(data)
    # deal_result = result.dropna(axis=0)
    # rework the decision based on LOF processing of the raw data
    #final_data = deal_data_from_dataFrame(deal_result)

    # look up the correlation data for stock_code in the correlation matrix
    #stock_code = '\'300017'
    collection = pd.read_csv('/home/mars/Data/finialData/code_correlation.csv',
                             index_col=0)
    collection = collection.sort_values(stock_code, ascending=False)

    #print(collection)

    # take the top-10 most correlated stocks (only the first 5 are used below)
    top_10 = collection.index.tolist()
    top_10 = top_10[:5]
    # load the data for each of them
    dataList = []
    code_name = []
    for code in top_10:
        code_name.append(code)
        # df[(df.BoolCol==3)&(df.attr==22)].index.tolist()
        # code = code_relation[code_relation.get(stock_code)==score].index
        ##print('code:', code[1:])
        path = '/home/mars/Data/finialData/electronic_infomation/' + code[
            1:] + '.csv'
        code_data = pd.read_csv(path, index_col='date')

        result = get_other_indicators(code_data)
        # collect the data
        dataList.append(result)
    # align the frames by date and drop the NaN rows
    df = pd.concat(dataList, axis=1)
    # pandas realigns the frames on their index when concatenating
    new_df = df.sort_index()
    #print('new_df:', new_df[:5])

    new_df.dropna(axis=0, inplace=True)
    #print('new_df2:', new_df.get('price_change'))
    print('all shape:', new_df.shape)
    #new_df.to_csv('300017_conbine.csv')
    deal_result = new_df

    data_x = dataX_from_dataFrame(deal_result)
    if daySpan == 0:
        #
        data_y = dataY_from_dataFrame(deal_result)
    else:

        data_y = dataY_for_Nmean(deal_result, N=daySpan)
        data_x = data_x[:len(data_y)]
    s_deal_data = (data_x, data_y)

    # data_y = final_data[1]
    final_data_x = nr.standardized_mars(s_deal_data[0])
    print(final_data_x.shape)

    all_y = s_deal_data[1]
    MSx_train, MSx_test, MSy_train, MSy_test = ms.getMS_repx_data(
        final_data_x, all_y)
    all_x = np.vstack((MSx_train, MSx_test))
    all_y = np.concatenate((MSy_train, MSy_test), axis=0)
    max_score = fit_randomForest_rep(data=(all_x, all_y))
    print('Overall best score:', max_score)
Example #15
# take the top-10 most correlated stocks (only the first 5 are used below)
top_10 = collection.index.tolist()
top_10 = top_10[:5]
# load the data for each of them
dataList = []
code_name = []
for code in top_10:
    code_name.append(code)
    # df[(df.BoolCol==3)&(df.attr==22)].index.tolist()
    # code = code_relation[code_relation.get(stock_code)==score].index
    #print('code:', code[1:])
    path = '/home/mars/Data/finialData/electronic_infomation/' + code[
        1:] + '.csv'
    code_data = pd.read_csv(path, index_col='date')

    result = get_other_indicators(code_data)
    # collect the data
    dataList.append(result)
# align the frames by date and drop the NaN rows
df = pd.concat(dataList, axis=1, sort=False)

# pandas realigns the frames on their index when concatenating
new_df = df.sort_index()
#print('new_df:', new_df[:5])

new_df.dropna(axis=0, inplace=True)
#print('new_df2:', new_df.get('price_change'))
#print('all shape:', new_df.shape)
deal_result = new_df

data_x = dataX_from_dataFrame(deal_result)
Example #16
def fit_SVM(daySpan=0, code=None):
    stock_code = '\'000021'
    collection = pd.read_csv('/home/mars/Data/finialData/code_correlation.csv',
                             index_col=0)
    collection = collection.sort_values(stock_code, ascending=False)

    print(collection)

    # take the top-10 most correlated stocks (only the first 5 are used below)
    top_10 = collection.index.tolist()
    top_10 = top_10[:5]
    # load the data for each of them
    dataList = []
    code_name = []
    for code in top_10:
        code_name.append(code)
        # df[(df.BoolCol==3)&(df.attr==22)].index.tolist()
        # code = code_relation[code_relation.get(stock_code)==score].index
        print('code:', code[1:])
        path = '/home/mars/Data/finialData/electronic_infomation/' + code[
            1:] + '.csv'
        code_data = pd.read_csv(path, index_col='date')

        result = get_other_indicators(code_data)
        # collect the data
        dataList.append(result)
    # align the frames by date and drop the NaN rows
    df = pd.concat(dataList, axis=1)

    # pandas realigns the frames on their index when concatenating
    new_df = df.sort_index()
    print('new_df:', new_df[:5])

    new_df.dropna(axis=0, inplace=True)
    print('new_df2:', new_df.get('price_change'))
    print('all shape:', new_df.shape)
    deal_result = new_df
    # rework the decision based on LOF processing of the raw data
    #final_data = deal_data_from_dataFrame(deal_result)
    data_x = dataX_from_dataFrame(deal_result)
    if daySpan == 0:
        #
        data_y = dataY_from_dataFrame(deal_result)
    else:

        data_y = dataY_for_Nmean(deal_result, N=daySpan)
        data_x = data_x[:len(data_y)]
    s_deal_data = (data_x, data_y)

    # data_y = final_data[1]
    final_data_x = nr.standardized_mars(s_deal_data[0])
    print(final_data_x.shape)

    # use the PCA data directly: handle outliers on the 0.7 training portion, then recombine for training
    # run PCA on 100% of the data
    all_y = s_deal_data[1]
    scoreListInfo = []
    for i in range(6, 40, 1):
        pca_x = PCA_mars.getPcaComponent(final_data_x, n_components=i)
        print(pca_x.shape)
        #x_train, x_test, y_train, y_test = train_test_split(pca_x, all_y, test_size=0.3, random_state=0, shuffle=False)

        # outlier handling
        lof_data_x = oc.LOF_PCA_for_Clustering(pca_x, isUsePCA=False)
        #all_x = np.vstack((lof_data_x, x_test))
        print(pca_x.shape)
        x_train, x_test, y_train, y_test = train_test_split(lof_data_x,
                                                            all_y,
                                                            test_size=0.3,
                                                            random_state=0,
                                                            shuffle=False)

        # fit the model
        #for fig_num, kernel in enumerate(('linear', 'rbf', 'poly','sigmoid')):
        for c in np.arange(0.1, 1, 0.1):
            clf = svm.SVC(gamma=c, kernel='rbf')
            clf.fit(x_train, y_train)
            score = clf.score(x_test, y_test)
            print(score)
            scoreListInfo.append((score, i, c))
    #print(scoreListInfo)
    scoreList = []
    for one in scoreListInfo:
        score = one[0]
        scoreList.append(score)
    max_score = max(scoreList)
    max_index = scoreList.index(max_score)
    # error_ratio = scoreInfoList[max_index][1]
    components = scoreListInfo[max_index][1]
    c = scoreListInfo[max_index][2]
    del scoreListInfo
    del scoreList
    print('best parameters:')
    print(max_score, c, components)
    return (max_score, c, components)
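The double loop over n_components and gamma in fit_SVM is a manual grid search. With a Pipeline and GridSearchCV the same sweep can be written more compactly; this is an alternative sketch, not the original code (it uses a plain StandardScaler instead of nr.standardized_mars and cross-validation instead of the single chronological split):

import numpy as np
from sklearn import svm
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

def svm_grid_search(data_x, data_y):
    # sweep the PCA dimensionality and the RBF gamma in one cross-validated grid search
    pipe = Pipeline([('scale', StandardScaler()),
                     ('pca', PCA()),
                     ('svc', svm.SVC(kernel='rbf'))])
    grid = GridSearchCV(pipe,
                        {'pca__n_components': list(range(6, 40)),
                         'svc__gamma': np.arange(0.1, 1, 0.1)},
                        cv=3)
    grid.fit(data_x, data_y)
    print('best parameters:', grid.best_score_, grid.best_params_)
    return grid.best_params_

Note that, unlike the original loop, GridSearchCV's default cross-validation shuffles folds by class rather than preserving the chronological order of the samples.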