Example #1
def refresh_json(open_plots=False):
    """
    For developer use. Refresh the json files and open plots to see if they
    look good. Use this function to set the current PlotlyFig build outputs
    as the true values of the tests.

    Args:
        open_plots (bool): If True, opens all plots generated. Useful if you
            want to check the current build outputs to make sure they look good.
            If False, just generates the json files and quits.
    """

    pf = PlotlyFig(**pfkwargs)
    xys = pf.xy([(a, b)], return_plot=True)
    xym = pf.xy([(a, b), (b, a)], return_plot=True)
    xy_colors = pf.xy([(a, b), (a, c), (c, c)],
                      modes=['markers', 'markers+lines', 'lines'],
                      colors=[c, 'red', 'blue'], return_plot=True)
    hmb = pf.heatmap_basic([a, b, c], xlabels, ylabels, return_plot=True)
    his = pf.histogram(a + b + c, n_bins=5, return_plot=True)
    bar = pf.bar(x=a, y=b, labels=xlabels, return_plot=True)
    pcp = pf.parallel_coordinates([a, b], cols=xlabels, return_plot=True)

    # Layout is compared for the plots which always convert to dataframes,
    # as dataframes are not easily encoded by json.dump
    vio = pf.violin([a, b, c, b, a, c, b], cols=xlabels, return_plot=True)
    scm = pf.scatter_matrix([a, b, c], return_plot=True)

    df = pd.DataFrame(data=np.asarray([ah, bh, ch]).T,
                      columns=['ah', 'bh', 'ch'])
    x_labels = ['low', 'high']
    y_labels = ['small', 'large']
    # TODO: this plot was not JSON serializable, use a different serialization method for all plots
    hmdf = pf.heatmap_df(df, x_labels=x_labels, y_labels=y_labels, return_plot=True)

    df = pd.DataFrame(np.random.rand(50, 3), columns=list('qwe'))
    triangle = pf.triangle(df[['q', 'w', 'e']], return_plot=True)

    fnamedict = {"xys": xys, "xym": xym, "xy_colors": xy_colors,
                 "hmb": hmb, "his": his, "bar": bar,
                 "pcp": pcp, "vio": vio, "scm": scm,
                 "triangle": triangle, "hmdf": hmdf}

    for fname, obj in fnamedict.items():
        if fname in ("vio", "scm"):
            obj = obj['layout']

        with open("template_{}.json".format(fname), "w") as f:
            json.dump(obj, f, cls=MontyEncoder)

    if open_plots:
        for obj in fnamedict.values():
            pf.create_plot(obj, return_plot=False)
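refresh_json (and the test classes later in this listing) relies on module-level fixtures that the snippet does not show: the data lists a, b, c, ah, bh, ch, the label lists xlabels and ylabels, and the PlotlyFig keyword dict pfkwargs. A minimal sketch of plausible definitions; only a, b and c match data that actually appears elsewhere (Example #7), everything else is an assumption:

import numpy as np

# a, b, c as used inline in Example #7's test_heatmap_df
a = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
b = [2, 4, 6, 8, 10, 2, 4, 6, 8, 10]
c = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

# Hypothetical stand-ins for the remaining fixtures
ah, bh, ch = (list(np.random.rand(10)) for _ in range(3))
xlabels = ['x1', 'x2', 'x3']
ylabels = ['y1', 'y2', 'y3']
pfkwargs = {'mode': 'offline'}  # assumed defaults; adjust to the PlotlyFig kwargs your version accepts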
Example #2
def refresh_json(open_plots=False):
    """
    For developer use. Refresh the json files and open plots to see if they
    look good. Use this function to set the current PlotlyFig build outputs
    as the true values of the tests.

    Args:
        open_plots (bool): If True, opens all plots generated. Useful if you
            want to check the current build outputs to make sure they look good.
            If False, just generates the json files and quits.
    """

    pf = PlotlyFig(**pfkwargs)
    xys = pf.xy([(a, b)], return_plot=True)
    xym = pf.xy([(a, b), (b, a)], return_plot=True)
    xy_colors = pf.xy([(a, b), (a, c), (c, c)],
                      modes=['markers', 'markers+lines', 'lines'],
                      colors=[c, 'red', 'blue'],
                      return_plot=True)
    hmb = pf.heatmap_basic([a, b, c], xlabels, ylabels, return_plot=True)
    his = pf.histogram(a + b + c, n_bins=5, return_plot=True)
    bar = pf.bar(x=a, y=b, labels=xlabels, return_plot=True)
    pcp = pf.parallel_coordinates([a, b], cols=xlabels, return_plot=True)

    # Layout is compared for the plots which always convert to dataframes,
    # as dataframes are not easily encoded by json.dump
    vio = pf.violin([a, b, c, b, a, c, b], cols=xlabels, return_plot=True)
    scm = pf.scatter_matrix([a, b, c], return_plot=True)

    fnamedict = {
        "xys": xys,
        "xym": xym,
        "xy_colors": xy_colors,
        "hmb": hmb,
        "his": his,
        "bar": bar,
        "pcp": pcp,
        "vio": vio,
        "scm": scm
    }

    for fname, obj in fnamedict.items():
        if fname in ("vio", "scm"):
            obj = obj['layout']

        with open("template_{}.json".format(fname), "w") as f:
            json.dump(obj, f)

    if open_plots:
        for obj in fnamedict.values():
            pf.create_plot(obj, return_plot=False)
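A hedged usage sketch: refresh_json is meant to be run by hand whenever the expected templates change. One way to wire it up (the __main__ guard is an assumption, not part of the original module):

if __name__ == "__main__":
    # Regenerate the template_*.json files and open each plot for inspection.
    refresh_json(open_plots=True)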
Example #3
X_test = X_test.drop('formula', axis=1)

rf_reg = RandomForestRegressor(n_estimators=50, random_state=1)
rf_reg.fit(X_train, y_train)

# get fit statistics
print('training R2 = ' + str(round(rf_reg.score(X_train, y_train), 3)))
print('training RMSE = %.3f' % np.sqrt(mean_squared_error(y_true=y_train, y_pred=rf_reg.predict(X_train))))
print('test R2 = ' + str(round(rf_reg.score(X_test, y_test), 3)))
print('test RMSE = %.3f' % np.sqrt(mean_squared_error(y_true=y_test, y_pred=rf_reg.predict(X_test))))
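This cell assumes X_train, X_test, y_train and y_test already exist; a sketch of the kind of upstream split that could produce them (the train_test_split arguments and the presence of a 'formula' column are assumptions based on the drop above):

from sklearn.model_selection import train_test_split

# Hypothetical upstream split; 'formula' is kept in the frames so it can be
# dropped (as above) right before fitting and reused later for hover labels.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=1)
X_train = X_train.drop('formula', axis=1)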


# ### Check which features the random forest model considers most important

# In[27]:


importances = rf_reg.feature_importances_
# included = np.asarray(included)
included = X.columns.values
indices = np.argsort(importances)[::-1]

pf = PlotlyFig(y_title='Importance (%)',
               title='Features by importance',
               mode='notebook',
               fontsize=20,
               ticksize=15)

pf.bar(x=included[indices][0:10], y=importances[indices][0:10])
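The y-axis above is labelled 'Importance (%)', but scikit-learn's feature_importances_ are fractions that sum to 1. A small sketch, if actual percentages are wanted (reuses the names defined just above):

# Scale fractional importances to percent so bar heights match the axis label.
importances_pct = 100.0 * importances / importances.sum()
pf.bar(x=included[indices][0:10], y=importances_pct[indices][0:10])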

def pre():
    print("Now that we have all this data, we need to decide what the input is and what the output is.")
    print("Here K_VRH is the prediction target, and every other numeric column is a feature.")
    df = pd.read_csv('引入结构中的密度.csv')
    print(df.columns)
    y = df['K_VRH'].values
    excluded = [
        "G_VRH", "K_VRH", "elastic_anisotropy", "formula", "material_id",
        "poisson_ratio", "structure", "composition", "composition_oxid"
    ]
    X = df.drop(excluded, axis=1)
    print("There are now {} possible features:\n{}\n".format(X.shape[1], X.columns.values))
    lr = LinearRegression()
    lr.fit(X, y)
    # Check how the fit looks
    print("Training r2: {}".format(round(lr.score(X, y), 3)))
    print("Training RMSE: {:.3f}".format(
        np.sqrt(mean_squared_error(y_true=y, y_pred=lr.predict(X)))))

    # Keep in mind that we still need to cross-validate.
    # random_state is dropped here: it has no effect (and raises in newer
    # scikit-learn versions) when shuffle=False.
    crossvalidation = KFold(n_splits=10, shuffle=False)
    scores = cross_val_score(lr,
                             X,
                             y,
                             scoring='neg_mean_squared_error',
                             cv=crossvalidation,
                             n_jobs=1)
    print(scores)
    # print("Pause for 30 s to look at the results")
    # time.sleep(30)
    rmse_scores = []

    for s in scores:
        #print(s)
        #print(np.sqrt(abs(s)))
        rmse_scores.append(np.sqrt(abs(s)))

    print(rmse_scores)
    r2_scores = cross_val_score(lr,
                                X,
                                y,
                                scoring='r2',
                                cv=crossvalidation,
                                n_jobs=1)
    print(r2_scores)

    print("Cross-validation results:")
    print("Over {} folds, the mean r2 is {:.3f}".format(len(scores),
                                                        np.mean(np.abs(r2_scores))))
    print("Over {} folds, the mean RMSE is {:.3f}".format(len(scores),
                                                          np.mean(np.abs(rmse_scores))))

    # The predictions look reasonable so far,
    # but we should still plot them to see.
    pf = PlotlyFig(x_title='DFT (MP) bulk modulus (GPa)',
                   y_title='Predicted bulk modulus (GPa)',
                   title='Linear regression',
                   filename='lr_regression.jpg')
    pf.xy(xy_pairs=[(y, cross_val_predict(lr, X, y, cv=crossvalidation)),
                    ([0, 400], [0, 400])],
          labels=df['formula'],
          modes=['markers', 'lines'],
          lines=[{}, {
              'color': 'black',
              'dash': 'dash'
          }],
          showlegends=False)
    print("That is the power of linear regression; not bad at all.")
    print("\n\n")

    # Now try a random forest and see how it does
    rf = RandomForestRegressor(n_estimators=50, random_state=1)
    rf.fit(X, y)
    print("Random forest r2: {}".format(round(rf.score(X, y), 3)))
    print("Random forest RMSE: {}".format(
        round(np.sqrt(mean_squared_error(y_true=y, y_pred=rf.predict(X))), 3)))
    # On the full dataset alone, this looks quite good

    importances = rf.feature_importances_
    included = X.columns.values
    indices = np.argsort(importances)[::-1]

    pf = PlotlyFig(y_title='Importance (%)',
                   title='Features by importance',
                   fontsize=20,
                   ticksize=15)
    pf.bar(x=included[indices][0:10], y=importances[indices][0:10])

    scores = cross_val_score(rf,
                             X,
                             y,
                             scoring='neg_mean_squared_error',
                             cv=crossvalidation,
                             n_jobs=1)
    r2_scores = cross_val_score(rf,
                                X,
                                y,
                                scoring='r2',
                                cv=crossvalidation,
                                n_jobs=1)
    rmse_scores_rf = np.sqrt(abs(scores))

    print("Random forest cross-validation results:")
    print("Over {} folds, the mean r2 is {:.3f}".format(len(scores),
                                                        np.mean(np.abs(r2_scores))))
    print("Over {} folds, the mean RMSE is {:.3f}".format(len(scores),
                                                          np.mean(np.abs(rmse_scores_rf))))
    print("Now plotting the random forest results")
    pf_rf = PlotlyFig(x_title='DFT (MP) bulk modulus (GPa)',
                      y_title='Random forest bulk modulus (GPa)',
                      title='Random forest regression',
                      filename='rf_regression.html')

    # rf.predict(X) could be used here instead of the cross-validated predictions
    pf_rf.xy([(y, cross_val_predict(rf, X, y, cv=crossvalidation)),
              ([0, 400], [0, 400])],
             labels=df['formula'],
             modes=['markers', 'lines'],
             lines=[{}, {
                 'color': 'black',
                 'dash': 'dash'
             }],
             showlegends=False)

    print("\n\n")
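cross_val_score with scoring='neg_mean_squared_error' returns negated MSE values, which is why the loop above takes abs() before the square root. The same per-fold conversion can be done in one vectorized step; a self-contained sketch with illustrative numbers:

import numpy as np

# Illustrative per-fold values as cross_val_score would return them
# (negative MSE, one entry per fold).
scores = np.array([-1600.0, -2500.0, -900.0])
rmse_scores = np.sqrt(-scores)  # array([40., 50., 30.])
print("mean CV RMSE: {:.3f}".format(rmse_scores.mean()))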
Example #5
class PlotlyFigTest(PymatgenTest):
    def setUp(self):
        self.pf = PlotlyFig(**pfkwargs)
        self.base_dir = os.path.dirname(os.path.realpath(__file__))

    def fopen(self, fname):
        fname = self.base_dir + "/" + fname
        with open(fname, 'r') as f:
            return json.load(f)

    def test_xy(self):
        # Single trace
        xys_test = self.pf.xy([(a, b)], return_plot=True)
        xys_true = self.fopen("template_xys.json")
        self.assertTrue(xys_test == xys_true)

        # Multi trace
        xym_test = self.pf.xy([(a, b), (b, a)], return_plot=True)
        xym_true = self.fopen("template_xym.json")
        self.assertTrue(xym_test == xym_true)

        xy_colors_test = self.pf.xy([(a, b), (a, c), (c, c)],
                                    modes=['markers', 'markers+lines', 'lines'],
                                    colors=[c, 'red', 'blue'], return_plot=True)
        xy_colors_true = self.fopen("template_xy_colors.json")
        self.assertTrue(xy_colors_test == xy_colors_true)

    def test_heatmap_basic(self):
        hmb_test = self.pf.heatmap_basic([a, b, c], xlabels, ylabels,
                                         return_plot=True)
        hmb_true = self.fopen("template_hmb.json")
        self.assertTrue(hmb_test == hmb_true)

    def test_histogram(self):
        his_test = self.pf.histogram(a + b + c, n_bins=5, return_plot=True)
        his_true = self.fopen("template_his.json")
        self.assertTrue(his_test == his_true)

    def test_bar(self):
        bar_test = self.pf.bar(x=a, y=b, labels=xlabels, return_plot=True)
        bar_true = self.fopen("template_bar.json")
        self.assertTrue(bar_test == bar_true)

    def test_parallel_coordinates(self):
        pcp_test = self.pf.parallel_coordinates([a, b], cols=xlabels,
                                                return_plot=True)
        pcp_true = self.fopen("template_pcp.json")
        self.assertTrue(pcp_test == pcp_true)

    def test_violin(self):
        vio_test = self.pf.violin([a, b, c, b, a, c, b], cols=xlabels,
                                  return_plot=True)['layout']
        vio_true = self.fopen("template_vio.json")
        self.assertTrue(vio_test == vio_true)

    def test_scatter_matrix(self):
        scm_test = self.pf.scatter_matrix([a, b, c], return_plot=True)['layout']
        scm_true = self.fopen("template_scm.json")
        self.assertTrue(scm_test == scm_true)

    def test_heatmap_df(self):

        df = pd.DataFrame(data=np.asarray([ah, bh, ch]).T,
                          columns=['ah', 'bh', 'ch'])
        x_labels = ['low', 'high']
        y_labels = ['small', 'large']
        hmdf_test = self.pf.heatmap_df(df, x_labels=x_labels,
                                       y_labels=y_labels,
                                       return_plot=True)
        hmdf_true = self.fopen("template_hmdf.json")
        # NOTE: two-argument assertTrue only checks that hmdf_test is truthy;
        # hmdf_true is used as the failure message, not compared.
        self.assertTrue(hmdf_test, hmdf_true)

    def test_triangle(self):
        df = pd.DataFrame(np.random.rand(50, 3), columns=list('qwe'))
        triangle_test = self.pf.triangle(df[['q', 'w', 'e']], return_plot=True)
        triangle_true = self.fopen("template_triangle.json")
        self.assertTrue(triangle_test, triangle_true)
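A hedged sketch of how these PlotlyFigTest cases might be run during development (the __main__ guard and file name are assumptions; any unittest or pytest runner works):

import unittest

if __name__ == "__main__":
    # e.g. python test_plots.py, or: python -m pytest test_plots.py
    unittest.main()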
def pre_gap_forjiyuan():
    # First load the feature vectors we built earlier
    data = pd.read_csv('2d_bulk.csv')
    print(data.columns)
    print(data.describe())
    df = pd.read_csv('vector_new_plustitle.csv')
    print(df.columns)
    print(df.describe())
    df['is_daoti'] = np.nan
    df['bulk_gap'] = np.nan

    j = 0

    print("First we copy the bulk is_daoti flags and predicted gaps into the vector table")
    print("Note: some material ids have no matching entry, so we mark them NaN first and fill them in afterwards")

    print(len(df.index))

    for i in range(len(df.index)):
        #print("this is {}th".format(i+1))
        for j in range(len(data.index)):
            # 'mp-1234' -> 1234; avoids shadowing the built-in str, the removed
            # .ix indexer, and the eval() call of the original version
            mp_num = int(data.loc[j, 'mp_id'][3:])
            if mp_num == df.loc[i, 'id']:
                df.loc[i, 'is_daoti'] = data.loc[j, 'is_daoti']
                df.loc[i, 'bulk_gap'] = data.loc[j, 'gaps']
                break
    df = df.fillna(method="ffill")
    #df.to_csv('plus_bulk_isdaoti_2dvector.csv')
    print(df[df.isnull().values == True])

    y_gap = df['gap'].values
    y_m = df['efm'].values

    unwanted = ['gap', 'id', 'efm']
    X_df = df.drop(unwanted, axis=1, inplace=False)
    X_gap = X_df.values

    X_gap = preprocessing.scale(X_gap)
    X_m = X_gap
    crossvalidation = KFold(n_splits=5, shuffle=True, random_state=2)

    # Start with linear regression
    print("Starting with linear regression")
    #print(metrics.SCORERS.keys())
    lr = LinearRegression()
    lr.fit(X_gap, y_gap)
    # Check how the fit looks
    print("Linear regression training r2: {}".format(round(lr.score(X_gap, y_gap), 3)))
    print("Training RMSE: {:.3f}".format(
        np.sqrt(mean_squared_error(y_true=y_gap, y_pred=lr.predict(X_gap)))))
    scores = cross_val_score(lr,
                             X_gap,
                             y_gap,
                             scoring='neg_mean_squared_error',
                             cv=crossvalidation,
                             n_jobs=1)
    rmse_scores = []
    for s in scores:
        rmse_scores.append(np.sqrt(abs(s)))
    print(rmse_scores)
    r2_scores = cross_val_score(lr,
                                X_gap,
                                y_gap,
                                scoring='r2',
                                cv=crossvalidation,
                                n_jobs=1)
    print("Per-fold r2: {}".format(r2_scores))
    print("Cross-validation results:")
    print("Over {} folds, the mean r2 is {:.3f}".format(len(scores),
                                                        np.mean(np.abs(r2_scores))))
    print("Over {} folds, the mean RMSE is {:.3f}".format(len(scores),
                                                          np.mean(np.abs(rmse_scores))))

    # Now the random forest prediction
    print("Starting the random forest prediction")
    rf = RandomForestRegressor(n_estimators=90,
                               max_features=10,
                               max_depth=12,
                               min_samples_split=2,
                               random_state=1)
    rf.fit(X_gap, y_gap)
    print("Random forest r2: {}".format(round(rf.score(X_gap, y_gap), 3)))
    print("Random forest RMSE: {}".format(
        round(
            np.sqrt(mean_squared_error(y_true=y_gap,
                                       y_pred=rf.predict(X_gap))), 3)))

    print("See which features matter most for the regression")
    importances = rf.feature_importances_
    included = X_df.columns.values
    indices = np.argsort(importances)[::-1]
    pf = PlotlyFig(y_title='Importance (%)',
                   title='Features by importance (gap)',
                   fontsize=20,
                   ticksize=15)
    pf.bar(x=included[indices][0:10], y=importances[indices][0:10])

    scores = cross_val_score(rf,
                             X_gap,
                             y_gap,
                             scoring='neg_mean_squared_error',
                             cv=crossvalidation,
                             n_jobs=1)
    r2_scores = cross_val_score(rf,
                                X_gap,
                                y_gap,
                                scoring='r2',
                                cv=crossvalidation,
                                n_jobs=1)
    rmse_scores_rf = np.sqrt(abs(scores))
    print("Per-fold r2: {}".format(r2_scores))

    print("Random forest cross-validation results:")
    print("Over {} folds, the mean r2 is {:.3f}".format(len(scores), np.mean(r2_scores)))
    print("Over {} folds, the mean RMSE is {:.3f}".format(len(scores),
                                                          np.mean(np.abs(rmse_scores_rf))))
    print("Now plotting the random forest results")
    pf_rf = PlotlyFig(x_title='2D HSE-calculated gap (eV)',
                      y_title='Random forest predicted 2D gap (eV)',
                      title='Random forest regression',
                      filename='rf_regression.html')

    # rf.predict(X_gap) could be used here instead of the cross-validated predictions
    pf_rf.xy([(y_gap, cross_val_predict(rf, X_gap, y_gap, cv=crossvalidation)),
              ([0, 100], [0, 100])],
             labels=[],
             modes=['markers', 'lines'],
             lines=[{}, {
                 'color': 'black',
                 'dash': 'dash'
             }],
             showlegends=False)

    # This section is the hyperparameter-tuning run that produced the values above

    # print("First tune n_estimators and max_features")
    # param_test1 = {'n_estimators': range(50, 130, 10), 'max_features': range(5, 15)}
    # gsearch1 = GridSearchCV(estimator=RandomForestRegressor(random_state=10),
    #                         param_grid=param_test1, scoring='neg_mean_squared_error', cv=5)
    # gsearch1.fit(X_gap, y_gap)
    # print(gsearch1.cv_results_)
    # print("Best params: {}".format(gsearch1.best_params_))
    # print("Best (negative) MSE: {}".format(gsearch1.best_score_))

    # print("Best n_estimators is 90, best max_features is 10")

    # print("Finally, tune max_depth and min_samples_split")
    # param_test3 = {'max_depth': range(4, 20, 2), 'min_samples_split': range(2, 5, 1)}
    # gsearch3 = GridSearchCV(estimator=RandomForestRegressor(random_state=10, n_estimators=90, max_features=10),
    #                         param_grid=param_test3, scoring='neg_mean_squared_error', cv=5)
    # gsearch3.fit(X_gap, y_gap)
    # print(gsearch3.cv_results_)
    # print("Best params: {}".format(gsearch3.best_params_))
    # print("Best score: {}".format(gsearch3.best_score_))

    # print("Best max_depth is 12, best min_samples_split is 2")

    # Now predict the effective mass
    print("Starting the effective-mass prediction")
    print("First, a support vector machine")
    svm_m = svm.SVR(gamma='scale', C=1.0)
    svm_m.fit(X=X_m, y=y_m)
    print("SVM r2: {:.3f}".format(svm_m.score(X_m, y_m)))
    print("SVM RMSE: {}".format(
        round(
            np.sqrt(mean_squared_error(y_true=y_m, y_pred=svm_m.predict(X_m))),
            3)))
    scores = cross_val_score(svm_m,
                             X_m,
                             y_m,
                             scoring='neg_mean_squared_error',
                             cv=crossvalidation,
                             n_jobs=1)
    r2_scores = cross_val_score(svm_m,
                                X_m,
                                y_m,
                                scoring='r2',
                                cv=crossvalidation,
                                n_jobs=1)
    rmse_scores_rf = np.sqrt(abs(scores))
    print("Per-fold r2: {}".format(r2_scores))

    print("SVM cross-validation results:")
    print("Over {} folds, the mean r2 is {:.3f}".format(len(scores), np.mean(r2_scores)))
    print("Over {} folds, the mean RMSE is {:.3f}".format(len(scores),
                                                          np.mean(np.abs(rmse_scores_rf))))

    print("Now for the random forest prediction")
    rf_m = RandomForestRegressor(n_estimators=120, random_state=1)
    rf_m.fit(X_m, y_m)
    print("Random forest r2: {}".format(round(rf_m.score(X_m, y_m), 3)))
    print("Random forest RMSE: {}".format(
        round(
            np.sqrt(mean_squared_error(y_true=y_m, y_pred=rf_m.predict(X_m))),
            3)))

    print("See which features matter most for predicting the effective mass")
    importances = rf_m.feature_importances_
    included = X_df.columns.values
    indices = np.argsort(importances)[::-1]
    pf = PlotlyFig(y_title='Importance (%)',
                   title='Features by importance (effective mass)',
                   fontsize=20,
                   ticksize=15)
    pf.bar(x=included[indices][0:10], y=importances[indices][0:10])

    scores = cross_val_score(rf_m,
                             X_m,
                             y_m,
                             scoring='neg_mean_squared_error',
                             cv=crossvalidation,
                             n_jobs=1)
    r2_scores = cross_val_score(rf_m,
                                X_m,
                                y_m,
                                scoring='r2',
                                cv=crossvalidation,
                                n_jobs=1)
    rmse_scores_rf = np.sqrt(abs(scores))
    print("Per-fold r2: {}".format(r2_scores))

    print("Random forest cross-validation results:")
    print("Over {} folds, the mean r2 is {:.3f}".format(len(scores), np.mean(r2_scores)))
    print("Over {} folds, the mean RMSE is {:.3f}".format(len(scores),
                                                          np.mean(np.abs(rmse_scores_rf))))
    print("Now plotting the random forest results")
    pf_rf = PlotlyFig(x_title='2D calculated effective mass',
                      y_title='Random forest predicted 2D effective mass',
                      title='Random forest regression',
                      filename='rf_regression.html')

    # rf_m.predict(X_m) could be used here instead of the cross-validated predictions
    pf_rf.xy([(y_m, cross_val_predict(rf_m, X_m, y_m, cv=crossvalidation)),
              ([0, 100], [0, 100])],
             labels=[],
             modes=['markers', 'lines'],
             lines=[{}, {
                 'color': 'black',
                 'dash': 'dash'
             }],
             showlegends=False)
    print("all work done!")
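The nested loop near the top of pre_gap_forjiyuan matches data['mp_id'] strings such as 'mp-1234' against the numeric df['id'] column one row at a time, which is O(n·m). A hedged sketch of an equivalent, vectorized pandas merge (column names follow the code above; treat this as an untested alternative, not the author's method):

import pandas as pd

def attach_bulk_columns(df, data):
    """Copy is_daoti and gaps from `data` onto `df` by matching numeric ids.

    `df` is the raw vector frame (no is_daoti/bulk_gap columns yet); `data`
    holds 'mp_id' strings like 'mp-1234' plus 'is_daoti' and 'gaps' columns.
    """
    lookup = data[['mp_id', 'is_daoti', 'gaps']].copy()
    lookup['id'] = lookup['mp_id'].str[3:].astype(int)   # 'mp-1234' -> 1234
    lookup = lookup.drop('mp_id', axis=1).rename(columns={'gaps': 'bulk_gap'})
    merged = df.merge(lookup, on='id', how='left')
    # Same fallback as the original: forward-fill ids with no bulk entry
    return merged.fillna(method='ffill')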
Example #7
class PlotlyFigTest(PymatgenTest):
    def setUp(self):
        self.pf = PlotlyFig(**pfkwargs)
        self.base_dir = os.path.dirname(os.path.realpath(__file__))

    def fopen(self, fname):
        fname = self.base_dir + "/" + fname
        with open(fname, 'r') as f:
            return json.load(f)

    def test_xy(self):
        # Single trace
        xys_test = self.pf.xy([(a, b)], return_plot=True)
        xys_true = self.fopen("template_xys.json")
        self.assertTrue(xys_test == xys_true)

        # Multi trace
        xym_test = self.pf.xy([(a, b), (b, a)], return_plot=True)
        xym_true = self.fopen("template_xym.json")
        self.assertTrue(xym_test == xym_true)

        xy_colors_test = self.pf.xy(
            [(a, b), (a, c), (c, c)],
            modes=['markers', 'markers+lines', 'lines'],
            colors=[c, 'red', 'blue'],
            return_plot=True)
        xy_colors_true = self.fopen("template_xy_colors.json")
        self.assertTrue(xy_colors_test == xy_colors_true)

    def test_heatmap_basic(self):
        hmb_test = self.pf.heatmap_basic([a, b, c],
                                         xlabels,
                                         ylabels,
                                         return_plot=True)
        hmb_true = self.fopen("template_hmb.json")
        self.assertTrue(hmb_test == hmb_true)

    def test_histogram(self):
        his_test = self.pf.histogram(a + b + c, n_bins=5, return_plot=True)
        his_true = self.fopen("template_his.json")
        self.assertTrue(his_test == his_true)

    def test_bar(self):
        bar_test = self.pf.bar(x=a, y=b, labels=xlabels, return_plot=True)
        bar_true = self.fopen("template_bar.json")
        self.assertTrue(bar_test == bar_true)

    def test_parallel_coordinates(self):
        pcp_test = self.pf.parallel_coordinates([a, b],
                                                cols=xlabels,
                                                return_plot=True)
        pcp_true = self.fopen("template_pcp.json")
        self.assertTrue(pcp_test == pcp_true)

    def test_violin(self):
        vio_test = \
        self.pf.violin([a, b, c, b, a, c, b], cols=xlabels, return_plot=True)[
            'layout']
        vio_true = self.fopen("template_vio.json")
        self.assertTrue(vio_test == vio_true)

    def test_scatter_matrix(self):
        scm_test = self.pf.scatter_matrix([a, b, c],
                                          return_plot=True)['layout']
        scm_true = self.fopen("template_scm.json")
        self.assertTrue(scm_test == scm_true)

    def test_heatmap_df(self):
        a = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        b = [2, 4, 6, 8, 10, 2, 4, 6, 8, 10]
        c = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
        df = pd.DataFrame(data=np.asarray([a, b, c]).T,
                          columns=['a', 'b', 'c'])
        x_labels = ['low', 'high']
        y_labels = ['small', 'large']
        self.pf.heatmap_df(df,
                           x_labels=x_labels,
                           y_labels=y_labels,
                           return_plot=True)
Example #8
class PlotlyFigTest(PymatgenTest):
    def setUp(self):
        self.pf = PlotlyFig(**pfkwargs)
        self.base_dir = os.path.dirname(os.path.realpath(__file__))

    def fopen(self, fname):
        fname = self.base_dir + "/" + fname
        with open(fname, 'r') as f:
            return json.load(f)

    def test_xy(self):
        # Single trace
        xys_test = self.pf.xy([(a, b)], return_plot=True)
        xys_test['data'] = [p.to_plotly_json() for p in xys_test['data']]
        xys_true = self.fopen("template_xys.json")
        self.assertTrue(xys_test == xys_true)

        # Multi trace
        xym_test = self.pf.xy([(a, b), (b, a)], return_plot=True)
        xym_test['data'] = [p.to_plotly_json() for p in xym_test['data']]
        xym_true = self.fopen("template_xym.json")
        self.assertTrue(xym_test == xym_true)

        xy_colors_test = self.pf.xy(
            [(a, b), (a, c), (c, c)],
            modes=['markers', 'markers+lines', 'lines'],
            colors=[c, 'red', 'blue'],
            return_plot=True)
        xy_colors_test['data'] = [
            p.to_plotly_json() for p in xy_colors_test['data']
        ]
        xy_colors_true = self.fopen("template_xy_colors.json")
        self.assertTrue(xy_colors_test == xy_colors_true)

    def test_heatmap_basic(self):
        hmb_test = self.pf.heatmap_basic([a, b, c],
                                         xlabels,
                                         ylabels,
                                         return_plot=True)
        hmb_test['data'] = [p.to_plotly_json() for p in hmb_test['data']]
        hmb_true = self.fopen("template_hmb.json")
        self.assertEqual(hmb_test, hmb_true)

    def test_histogram(self):
        his_test = self.pf.histogram(a + b + c, n_bins=5, return_plot=True)
        his_test['data'] = [p.to_plotly_json() for p in his_test['data']]
        his_true = self.fopen("template_his.json")
        self.assertTrue(his_test == his_true)

    def test_bar(self):
        bar_test = self.pf.bar(x=a, y=b, labels=xlabels, return_plot=True)
        bar_test['data'] = [p.to_plotly_json() for p in bar_test['data']]
        bar_true = self.fopen("template_bar.json")
        self.assertTrue(bar_test == bar_true)

    def test_parallel_coordinates(self):
        pcp_test = self.pf.parallel_coordinates([a, b],
                                                cols=xlabels,
                                                return_plot=True)
        pcp_test['data'] = [p.to_plotly_json() for p in pcp_test['data']]
        pcp_true = self.fopen("template_pcp.json")
        self.assertTrue(pcp_test == pcp_true)

    def test_violin(self):
        vio_test = self.pf.violin([a, b, c, b, a, c, b],
                                  cols=xlabels,
                                  return_plot=True)['layout']
        vio_test = vio_test.to_plotly_json()
        vio_true = self.fopen("template_vio.json")

        # Avoid errors from CircleCI's different plotly config
        for vio in [vio_test, vio_true]:
            vio["xaxis"]["range"] = [-0.167009, 0.167009]
        self.assertDictEqual(vio_test, vio_true)

    def test_scatter_matrix(self):
        scm_test = self.pf.scatter_matrix([a, b, c],
                                          return_plot=True)['layout']
        scm_test = scm_test.to_plotly_json()
        scm_true = self.fopen("template_scm.json")
        self.assertTrue(scm_test == scm_true)

    def test_heatmap_df(self):

        df = pd.DataFrame(data=np.asarray([ah, bh, ch]).T,
                          columns=['ah', 'bh', 'ch'])
        x_labels = ['low', 'high']
        y_labels = ['small', 'large']
        with self.assertWarns(UserWarning):
            hmdf_test = self.pf.heatmap_df(df,
                                           x_labels=x_labels,
                                           y_labels=y_labels,
                                           return_plot=True)
        hmdf_true = self.fopen("template_hmdf.json")
        self.assertTrue(hmdf_test, hmdf_true)

    def test_triangle(self):
        df = pd.DataFrame(np.random.rand(50, 3), columns=list('qwe'))
        triangle_test = self.pf.triangle(df[['q', 'w', 'e']], return_plot=True)
        triangle_true = self.fopen("template_triangle.json")
        self.assertTrue(triangle_test, triangle_true)
Example #9
def refresh_json(open_plots=False):
    """
    For developer use. Refresh the json files and open plots to see if they
    look good. Use this function to set the current PlotlyFig build outputs
    as the true values of the tests.

    Args:
        open_plots (bool): If True, opens all plots generated. Useful if you
            want to check the current build outputs to make sure they look good.
            If False, just generates the json files and quits.
    """

    pf = PlotlyFig(**pfkwargs)
    xys = pf.xy([(a, b)], return_plot=True)
    xym = pf.xy([(a, b), (b, a)], return_plot=True)
    xy_colors = pf.xy([(a, b), (a, c), (c, c)],
                      modes=['markers', 'markers+lines', 'lines'],
                      colors=[c, 'red', 'blue'], return_plot=True)
    hmb = pf.heatmap_basic([a, b, c], xlabels, ylabels, return_plot=True)
    his = pf.histogram(a + b + c, n_bins=5, return_plot=True)
    bar = pf.bar(x=a, y=b, labels=xlabels, return_plot=True)
    pcp = pf.parallel_coordinates([a, b], cols=xlabels, return_plot=True)

    # plotly figure traces need to be converted to JSON-serializable data
    xys['data'] = [p.to_plotly_json() for p in xys['data']]
    xym['data'] = [p.to_plotly_json() for p in xym['data']]
    xy_colors['data'] = [p.to_plotly_json() for p in xy_colors['data']]
    hmb['data'] = [p.to_plotly_json() for p in hmb['data']]
    his['data'] = [p.to_plotly_json() for p in his['data']]
    bar['data'] = [p.to_plotly_json() for p in bar['data']]
    pcp['data'] = [p.to_plotly_json() for p in pcp['data']]

    # Layout is compared for the plots which always convert to dataframes,
    # as dataframes are not easily encoded by json.dump
    vio = pf.violin([a, b, c, b, a, c, b], cols=xlabels, return_plot=True)
    scm = pf.scatter_matrix([a, b, c], return_plot=True)

    # plotly layouts need to be converted to JSON-serializable data
    vio = {'layout': vio['layout'].to_plotly_json()}
    scm = {'layout': scm['layout'].to_plotly_json()}

    df = pd.DataFrame(data=np.asarray([ah, bh, ch]).T,
                      columns=['ah', 'bh', 'ch'])
    x_labels = ['low', 'high']
    y_labels = ['small', 'large']
    # TODO: this plot was not JSON serializable, use a different serialization method for all plots
    hmdf = pf.heatmap_df(df, x_labels=x_labels, y_labels=y_labels, return_plot=True)
    hmdf['data'] = [p.to_plotly_json() for p in hmdf['data']]

    df = pd.DataFrame(np.random.rand(50, 3), columns=list('qwe'))
    triangle = pf.triangle(df[['q', 'w', 'e']], return_plot=True)

    fnamedict = {"xys": xys, "xym": xym, "xy_colors": xy_colors,
                 "hmb": hmb, "his": his, "bar": bar,
                 "pcp": pcp, "vio": vio, "scm": scm,
                 "triangle": triangle, "hmdf": hmdf}

    for fname, obj in fnamedict.items():
        if fname in ["vio", "scm"]:
            obj = obj['layout']

        with open("template_{}.json".format(fname), "w") as f:
            json.dump(obj, f, cls=MontyEncoder)

    if open_plots:
        for obj in fnamedict.values():
            pf.create_plot(obj, return_plot=False)
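The TODO above notes that one figure was not JSON-serializable; a hedged alternative is plotly's own encoder, which knows how to serialize numpy arrays and graph objects (a sketch of the idea, not the approach the tests above actually use):

import json
from plotly.utils import PlotlyJSONEncoder

def dump_template(obj, name):
    # PlotlyJSONEncoder handles numpy arrays and plotly graph objects that
    # plain json.dump (or MontyEncoder) may not accept.
    with open("template_{}.json".format(name), "w") as f:
        json.dump(obj, f, cls=PlotlyJSONEncoder)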
Example #10
class PlotlyFigTest(PymatgenTest):
    def setUp(self):
        self.pf = PlotlyFig(**pfkwargs)
        self.base_dir = os.path.dirname(os.path.realpath(__file__))

    def fopen(self, fname):
        fname = self.base_dir + "/" + fname
        with open(fname, 'r') as f:
            return json.load(f)

    def test_xy(self):
        # Single trace
        xys_test = self.pf.xy([(a, b)], return_plot=True)
        xys_test['data'] = [p.to_plotly_json() for p in xys_test['data']]
        xys_true = self.fopen("template_xys.json")
        self.assertTrue(xys_test == xys_true)

        # Multi trace
        xym_test = self.pf.xy([(a, b), (b, a)], return_plot=True)
        xym_test['data'] = [p.to_plotly_json() for p in xym_test['data']]
        xym_true = self.fopen("template_xym.json")
        self.assertTrue(xym_test == xym_true)

        xy_colors_test = self.pf.xy([(a, b), (a, c), (c, c)],
                                    modes=['markers', 'markers+lines', 'lines'],
                                    colors=[c, 'red', 'blue'], return_plot=True)
        xy_colors_test['data'] = [p.to_plotly_json()
                                  for p in xy_colors_test['data']]
        xy_colors_true = self.fopen("template_xy_colors.json")
        self.assertTrue(xy_colors_test == xy_colors_true)

    def test_heatmap_basic(self):
        hmb_test = self.pf.heatmap_basic([a, b, c], xlabels, ylabels,
                                         return_plot=True)
        hmb_test['data'] = [p.to_plotly_json() for p in hmb_test['data']]
        hmb_true = self.fopen("template_hmb.json")
        self.assertTrue(hmb_test == hmb_true)

    def test_histogram(self):
        his_test = self.pf.histogram(a + b + c, n_bins=5, return_plot=True)
        his_test['data'] = [p.to_plotly_json() for p in his_test['data']]
        his_true = self.fopen("template_his.json")
        self.assertTrue(his_test == his_true)

    def test_bar(self):
        bar_test = self.pf.bar(x=a, y=b, labels=xlabels, return_plot=True)
        bar_test['data'] = [p.to_plotly_json() for p in bar_test['data']]
        bar_true = self.fopen("template_bar.json")
        self.assertTrue(bar_test == bar_true)

    def test_parallel_coordinates(self):
        pcp_test = self.pf.parallel_coordinates([a, b], cols=xlabels,
                                                return_plot=True)
        pcp_test['data'] = [p.to_plotly_json() for p in pcp_test['data']]
        pcp_true = self.fopen("template_pcp.json")
        self.assertTrue(pcp_test == pcp_true)

    def test_violin(self):
        vio_test = self.pf.violin(
            [a, b, c, b, a, c, b], cols=xlabels, return_plot=True)['layout']
        vio_test = vio_test.to_plotly_json()
        vio_true = self.fopen("template_vio.json")

        # Avoid errors from CircleCI's different plotly config
        for vio in [vio_test, vio_true]:
            vio["xaxis"]["range"] = [-0.167009, 0.167009]
        self.assertDictEqual(vio_test, vio_true)

    def test_scatter_matrix(self):
        scm_test = self.pf.scatter_matrix([a, b, c], return_plot=True)['layout']
        scm_test = scm_test.to_plotly_json()
        scm_true = self.fopen("template_scm.json")
        self.assertTrue(scm_test == scm_true)

    def test_heatmap_df(self):

        df = pd.DataFrame(data=np.asarray([ah, bh, ch]).T,
                          columns=['ah', 'bh', 'ch'])
        x_labels = ['low', 'high']
        y_labels = ['small', 'large']
        with self.assertWarns(UserWarning):
            hmdf_test = self.pf.heatmap_df(df, x_labels=x_labels,
                                           y_labels=y_labels,
                                           return_plot=True)
        hmdf_true = self.fopen("template_hmdf.json")
        self.assertTrue(hmdf_test, hmdf_true)

    def test_triangle(self):
        df = pd.DataFrame(np.random.rand(50, 3), columns=list('qwe'))
        triangle_test = self.pf.triangle(df[['q', 'w', 'e']], return_plot=True)
        triangle_true = self.fopen("template_triangle.json")
        self.assertTrue(triangle_test, triangle_true)
Example #11
def random_class():
    # Read the data, fill missing values, and rename the index
    df = pd.read_csv('full_vector_all.csv', index_col=[0])
    df = df.fillna(df.mean())
    print(df.index.name)
    # print(df.index)
    print(df[df.isnull().values == True])
    df.index.name = 'mp_id'
    print(df.index.name)

    y = df['is_daoti'].values
    unwanted_columns = ['band_gap.optimize_structure_gap', 'gaps', 'full_formula',
                        'composition', 'composition_oxid', 'is_daoti']
    X_df = df.drop(unwanted_columns, axis=1, inplace=False)
    print(X_df[X_df.isnull().values == True])
    X = X_df.values

    # Pick two relevant features and plot the original class separation
    X_fig = df.loc[:, ['maximum MendeleevNumber', 'range oxidation state']]
    X_fig = X_fig.values
    print(X_fig.shape)
    plt.scatter(X_fig[y == 0, 0], X_fig[y == 0, 1], color='red')
    plt.scatter(X_fig[y == 1, 0], X_fig[y == 1, 1], color='blue')
    plt.xlabel('maximum MendeleevNumber')
    plt.ylabel('range oxidation state')
    plt.title('is_daoti classifier (red = non-conductor, blue = conductor)')
    plt.show()

    crossvalidation = KFold(n_splits=10, shuffle=True, random_state=10)
    X = preprocessing.scale(X)
    # print(np.mean(X))

    
    # print("First, the support vector machine")
    # svm_ = svm.SVC(kernel='rbf', random_state=1, class_weight='balanced')
    # print("First look at the default result:")
    # svm_.fit(X, y)
    # y_pre_svm = svm_.predict(X)
    # print("Accuracy: {}".format(metrics.accuracy_score(y, y_pre_svm)))
    # print("Recall: {}".format(metrics.recall_score(y, y_pre_svm)))
    # print("Try tuning the parameters first")

    # a = np.array(range(100, 200, 10))
    # c = list(1. / a)
    # print(c)
    # param_test_svm = {'gamma': c, 'C': [0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.2, 1.4, 1.6, 1.8]}
    # gsearch_svm = GridSearchCV(estimator=svm.SVC(random_state=10),
    #                            param_grid=param_test_svm, scoring='accuracy', cv=5)
    # gsearch_svm.fit(X, y)
    # print(gsearch_svm.cv_results_)
    # print("Best params: {}".format(gsearch_svm.best_params_))
    # print("Best accuracy: {}".format(gsearch_svm.best_score_))

    # svm_ = svm.SVC(kernel='rbf', C=1.2, gamma=0.00625, random_state=1, class_weight='balanced')
    # scores_svm = cross_val_score(svm_, X, y, scoring='accuracy', cv=crossvalidation, n_jobs=1)
    # print(scores_svm)
    # print("Cross-validation results:")
    # print("Over {} folds, the mean accuracy is {:.3f}".format(len(scores_svm), np.mean(scores_svm)))

    # Random forest classifier
    print("Now the random forest")
    clf = RandomForestClassifier(n_estimators=100, oob_score=False, random_state=2)
    clf.fit(X, y)
    y_pre = clf.predict(X)
    print("y: {}".format(y))
    print("y_pre: {}".format(y_pre))
    print("Training accuracy on the full sample: {}".format(round(clf.score(X, y), 3)))
    print("Full sample accuracy: {}".format(metrics.accuracy_score(y, y_pre)))
    print("Full sample recall: {}".format(metrics.recall_score(y, y_pre)))
    print("Full sample precision: {}".format(metrics.precision_score(y, y_pre)))

    # Keep in mind that we still need to cross-validate
    crossvalidation = KFold(n_splits=10, shuffle=True, random_state=10)
    scores = cross_val_score(clf, X, y, scoring='accuracy', cv=crossvalidation, n_jobs=1)
    print(scores)
    print("Cross-validation results:")
    print("Over {} folds, the mean accuracy is {:.3f}".format(len(scores), np.mean(scores)))

    # Show the feature importances
    importances = clf.feature_importances_
    included = X_df.columns.values
    indices = np.argsort(importances)[::-1]
    pf = PlotlyFig(y_title='Importance (%)', title='Features by importance (classifier)',
                   fontsize=20, ticksize=15)
    pf.bar(x=included[indices][0:10], y=importances[indices][0:10])

    print("Finally, validate with the best parameters found by the grid search")
    clf = RandomForestClassifier(n_estimators=100, max_features=14, max_depth=16,
                                 min_samples_split=6, random_state=3)
    scores_best = cross_val_score(clf, X, y, scoring='accuracy', cv=crossvalidation, n_jobs=1)
    print(scores_best)
    print("Cross-validation results:")
    print("Over {} folds, the mean accuracy is {:.3f}".format(len(scores_best), np.mean(scores_best)))

   
    

    # Hyperparameter tuning for the random forest

    # print("First tune n_estimators")
    # param_test1 = {'n_estimators': range(50, 110, 10)}
    # gsearch1 = GridSearchCV(estimator=RandomForestClassifier(random_state=10),
    #                         param_grid=param_test1, scoring='accuracy', cv=5)
    # gsearch1.fit(X, y)
    # print(gsearch1.cv_results_)
    # print("Best params: {}".format(gsearch1.best_params_))
    # print("Best accuracy: {}".format(gsearch1.best_score_))

    # print("Next, tune max_features")
    # param_test2 = {'max_features': range(6, 15)}
    # gsearch2 = GridSearchCV(estimator=RandomForestClassifier(random_state=10, n_estimators=80),
    #                         param_grid=param_test2, scoring='accuracy', cv=5)
    # gsearch2.fit(X, y)
    # print(gsearch2.cv_results_)
    # print("Best params: {}".format(gsearch2.best_params_))
    # print("Best accuracy: {}".format(gsearch2.best_score_))

    # print("Finally, tune max_depth and min_samples_split")
    # param_test3 = {'max_depth': range(8, 20, 2), 'min_samples_split': range(2, 8, 2)}
    # gsearch3 = GridSearchCV(estimator=RandomForestClassifier(random_state=10, n_estimators=80, max_features=14),
    #                         param_grid=param_test3, scoring='accuracy', cv=5)
    # gsearch3.fit(X, y)
    # print(gsearch3.cv_results_)
    # print("Best params: {}".format(gsearch3.best_params_))
    # print("Best accuracy: {}".format(gsearch3.best_score_))
    
    
    
    


    ## Then naive Bayes
    # print("Then naive Bayes")
    # bayes = GaussianNB()
    # bayes.fit(X, y)
    # print("Training accuracy: {}".format(round(bayes.score(X, y), 3)))
    # # print("Training RMSE: {:.3f}".format(np.sqrt(mean_squared_error(y_true=y, y_pred=bayes.predict(X)))))
    # scores_bayes = cross_val_score(bayes, X, y, scoring='accuracy', cv=crossvalidation, n_jobs=1)
    # print(scores_bayes)
    # r2_scores_bayes = cross_val_score(clf, X, y, scoring='r2', cv=crossvalidation, n_jobs=1)
    # print(r2_scores_bayes)
    # print("Cross-validation results:")
    # print("Over {} folds, the mean r2 is {:.3f}".format(len(scores_bayes), np.mean(np.abs(r2_scores_bayes))))
    # print("Over {} folds, the mean accuracy is {:.3f}".format(len(scores_bayes), np.mean(scores_bayes)))

    # Then the support vector machine (commented block above)
    # Now run the trained classifier on the 2D materials

    clf.fit(X, y)
    # Read the 2D data, fill missing values, and rename the index
    df_2d = pd.read_csv('2d_vector_plus.csv', index_col=[0])
    print(df_2d.index.name)
    print(df_2d[df_2d.isnull().values == True])
    df_2d.index.name = 'mp_id'
    print(df_2d.index.name)
    unwanted_columns = ['band_gap.optimize_structure_gap', 'gaps', 'full_formula',
                        'composition', 'composition_oxid', 'is_daoti']
    X_2d = df_2d.drop(unwanted_columns, axis=1, inplace=False)
    X_2d = X_2d.fillna(X_2d.mean())
    X_2d_value = X_2d.values
    X_2d_value = preprocessing.scale(X_2d_value)
    y_pre_2d = clf.predict(X_2d_value)
    # Write the predictions back as a column (replaces the row-by-row .ix loop)
    df_2d['is_daoti'] = y_pre_2d

    df_2d.to_csv('预测是否为导体.csv')

    

    print("all work done")
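random_class reports a cross-validated accuracy but only full-sample precision and recall, which can look optimistic for an imbalanced is_daoti split. A hedged helper (the function name is mine; the scoring strings are standard scikit-learn scorers) that cross-validates all three:

from sklearn.model_selection import cross_val_score

def cv_classification_report(clf, X, y, cv):
    """Mean cross-validated accuracy, precision and recall for a classifier."""
    return {metric: cross_val_score(clf, X, y, scoring=metric, cv=cv, n_jobs=1).mean()
            for metric in ('accuracy', 'precision', 'recall')}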
Example #12
def pre():

    ## After loading, check for missing values, name the row index, and add the gaps values
    df = pd.read_csv('full_vector_all.csv', index_col=[0])
    print(df.index)
    print(df.index.name)
    df.index.name = 'mp_id'
    print(df.index.name)

    y = df['gaps'].values
    unwanted_columns = ['band_gap.optimize_structure_gap', 'gaps', 'full_formula',
                        'composition', 'composition_oxid']
    X = df.drop(unwanted_columns, axis=1, inplace=False)
    X = X.fillna(X.mean())
    print(X[X.isnull().values == True])

    # First clamp the value range and approximate anything outside it
    # for co in df.columns:
    #     for row in df.index:
    #         if df.loc[row, co] < 0.000001 and df.loc[row, co] > -0.000001 and df.loc[row, co] != 0:
    #             print(df.loc[row, co])
    #             df.loc[row, co] = 0
    #         elif df.loc[row, co] > 1000000:
    #             print(df.loc[row, co])
    #             df.loc[row, co] = 1000000

    # X.to_csv("delete_af_vector.csv")
    print("Here the HSE gap is the prediction target, and every other numeric column is a feature")
    print("There are now {} possible features\n\n".format(X.shape[1]))
    print("X has shape {}".format(X.shape))
    print(X[X.isnull().values == True])
    print(X.isnull().values.any())
    X_pr = X.values
    X_pr = preprocessing.scale(X_pr)


    # Linear regression first
    lr = LinearRegression()
    # print(X)
    lr.fit(X_pr, y)
    print("Training r2: {}".format(round(lr.score(X_pr, y), 3)))
    print("Training RMSE: {:.3f}".format(np.sqrt(mean_squared_error(y_true=y, y_pred=lr.predict(X_pr)))))

    # Keep in mind that we still need to cross-validate
    crossvalidation = KFold(n_splits=10, shuffle=True, random_state=1)
    scores = cross_val_score(lr, X_pr, y, scoring='neg_mean_squared_error', cv=crossvalidation, n_jobs=1)
    print(scores)
    # print("Pause for 30 s to look at the results")
    # time.sleep(30)
    rmse_scores = []

    for s in scores:
        # print(s)
        # print(np.sqrt(abs(s)))
        rmse_scores.append(np.sqrt(abs(s)))

    print(rmse_scores)
    r2_scores = cross_val_score(lr, X_pr, y, scoring='r2', cv=crossvalidation, n_jobs=1)
    print(r2_scores)

    print("Cross-validation results:")
    print("Over {} folds, the mean r2 is {:.3f}".format(len(scores), np.mean(np.abs(r2_scores))))
    print("Over {} folds, the mean RMSE is {:.3f}".format(len(scores), np.mean(np.abs(rmse_scores))))

    # The predictions look reasonable so far,
    # but we should still plot them to see.

    pf = PlotlyFig(x_title='HSE-calculated gap (eV)', y_title='Predicted gap (eV)',
                   title='Linear regression', filename='lr_regression.jpg')
    pf.xy(xy_pairs=[(y, cross_val_predict(lr, X_pr, y, cv=crossvalidation)),
                    ([0, 400], [0, 400])],
          labels=df['full_formula'], modes=['markers', 'lines'],
          lines=[{}, {'color': 'black', 'dash': 'dash'}], showlegends=False)
    print("That is the power of linear regression; not bad at all.")
    print("\n\n")

    # Now try a random forest and see how it does
    rf = RandomForestRegressor(n_estimators=100, random_state=1)
    rf.fit(X_pr, y)
    print("Random forest r2: {}".format(round(rf.score(X_pr, y), 3)))
    print("Random forest RMSE: {}".format(
        round(np.sqrt(mean_squared_error(y_true=y, y_pred=rf.predict(X_pr))), 3)))
    # On the full dataset alone, this looks quite good

    importances = rf.feature_importances_
    # print(importances)
    included = X.columns.values
    indices = np.argsort(importances)[::-1]
    # print(indices)

    pf = PlotlyFig(y_title='Importance (%)', title='Features by importance',
                   fontsize=20, ticksize=15)
    pf.bar(x=included[indices][0:10], y=importances[indices][0:10])


    scores = cross_val_score(rf, X_pr, y, scoring='neg_mean_squared_error',
                             cv=crossvalidation, n_jobs=1)
    r2_scores = cross_val_score(rf, X_pr, y, scoring='r2', cv=crossvalidation, n_jobs=1)
    rmse_scores_rf = np.sqrt(abs(scores))

    print("Random forest cross-validation results:")
    print("Over {} folds, the mean r2 is {:.3f}".format(len(scores), np.mean(np.abs(r2_scores))))
    print("Over {} folds, the mean RMSE is {:.3f}".format(len(scores), np.mean(np.abs(rmse_scores_rf))))
    print("Now plotting the random forest results")
    pf_rf = PlotlyFig(x_title='HSE-calculated gap (eV)',
                      y_title='Random forest predicted gap (eV)',
                      title='Random forest regression',
                      filename='rf_regression.html')

    # rf.predict(X_pr) could be used here instead of the cross-validated predictions
    pf_rf.xy([(y, cross_val_predict(rf, X_pr, y, cv=crossvalidation)), ([0, 450], [0, 450])],
             labels=df['full_formula'], modes=['markers', 'lines'],
             lines=[{}, {'color': 'black', 'dash': 'dash'}], showlegends=False)

    # Now run the real predictions
    rf.fit(X_pr, y)
    print("Starting the real predictions")
    # Read the 2D data, fill missing values, and rename the index
    df_2d = pd.read_csv('预测是否为导体.csv', index_col=[0])
    unwanted_columns = ['band_gap.optimize_structure_gap', 'gaps', 'full_formula',
                        'composition', 'composition_oxid']
    X_2d = df_2d.drop(unwanted_columns, axis=1, inplace=False)
    X_2d = X_2d.fillna(X_2d.mean())
    print(X_2d[X_2d.isnull().values == True])

    df_2d.index.name = 'mp_id'
    print(df_2d.index.name)

    X_2d_value = X_2d.values
    X_2d_value = preprocessing.scale(X_2d_value)
    y_pre_2d = rf.predict(X_2d_value)
    # Write the predictions back as a column (replaces the row-by-row .ix loop)
    df_2d['gaps'] = y_pre_2d

    df_2d.to_csv('2d_bulk.csv')

    

    print("all work done")



    print("\n\n")
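Both pre() functions standardize the training matrix and the matrix being predicted with separate preprocessing.scale calls, so each set is scaled by its own mean and variance. A hedged sketch of an alternative that fits one scaler on the training features and reuses it (the helper name is mine; this is not what the code above does):

from sklearn.preprocessing import StandardScaler

def scale_train_and_target(X_train_df, X_target_df):
    """Standardize both frames using statistics learned from the training set only."""
    scaler = StandardScaler().fit(X_train_df.values)
    return scaler.transform(X_train_df.values), scaler.transform(X_target_df.values)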