Beispiel #1
0
def plot_simple_xy():
    """
    Draw the smallest possible xy plot: only the title and the output filename
    are passed to PlotlyFig, every other setting is left at its default.
    Returns:
        None. The figure is produced by PlotlyFig.xy; per the original
            documentation it is popped in the default browser in plotly's
            "offline" mode.
    """
    x_values = [1, 2, 3]
    y_values = [4, 5, 6]
    figure = PlotlyFig(title="Basic Example", filename="basic.html")
    figure.xy((x_values, y_values))
Beispiel #2
0
def plot_simple_xy():
    """
    Bare-bones xy example that relies on PlotlyFig defaults for everything
    except the figure title and the output filename.
    Returns:
        None. PlotlyFig.xy renders the figure; according to the original
            documentation it opens in the default browser ("offline" mode).
    """
    # a single trace through (1, 4), (2, 5) and (3, 6)
    single_trace = ([1, 2, 3], [4, 5, 6])
    basic_fig = PlotlyFig(title="Basic Example", filename="basic.html")
    basic_fig.xy(single_trace)
Beispiel #3
0
def plot_expt_compt_band_gaps(citrine_api_key, limit=0):
    """
    Compare experimental band gaps pulled from Citrine (w/o dataset
        limitations) with DFT computed band gaps (data from
        materialsproject.org) in an xy scatter plot, then plot the residuals.
        To compare the right values, the computed band gap of each chemical
        formula is taken from the entry with the lowest energy above hull
        (the most stable structure).
    Args:
        citrine_api_key (str): Your Citrine API key for getting data. Don't have
            a Citrine account? Visit https://citrine.io/
        limit (int): limit the number of entries (0 means no limit)
    Returns:
        None. Two figures are drawn through PlotlyFig (filenames 'band_gaps'
            and 'band_gap_residuals'); per the original documentation they are
            popped in the default browser in plotly's "offline" mode.
    """

    # experimental band gaps from Citrine, with friendlier column names
    retriever = CitrineDataRetrieval(api_key=citrine_api_key)
    expt_columns = ['chemicalFormula', 'Band gap']
    raw_expt = retriever.get_dataframe(prop='band gap',
                                       data_type='experimental',
                                       show_columns=expt_columns,
                                       max_results=limit)
    df_ct = raw_expt.rename(
        columns={'chemicalFormula': 'Formula', 'Band gap': 'Expt. gap'})
    # 'In1p1' is removed because "p1" is not recognized by Composition; rows
    # with nulls are removed because null band gaps break the residual plot
    df_ct = df_ct[df_ct['Formula'] != 'In1p1'].dropna()

    def to_reduced_formula(formula):
        return Composition(formula).get_reduced_formula_and_factor()[0]

    df_ct['Formula'] = df_ct['Formula'].transform(to_reduced_formula)

    # computational band gaps from the Materials Project, restricted to the
    # formulas that have an experimental value
    mp_retriever = MPDataRetrieval()
    formula_filter = {'pretty_formula': {'$in': list(df_ct['Formula'].values)}}
    mp_properties = ['pretty_formula', 'material_id', 'band_gap',
                     'e_above_hull']
    df = mp_retriever.get_dataframe(criteria=formula_filter,
                                    properties=mp_properties,
                                    index_mpid=False)
    df = df.rename(columns={'pretty_formula': 'Formula',
                            'band_gap': 'MP computed gap',
                            'material_id': 'mpid'})

    # keep, per formula, only the row with the smallest energy above hull
    most_stable_rows = df.groupby("Formula")["e_above_hull"].idxmin()
    df_mp = df.loc[most_stable_rows]
    merged = df_ct.merge(df_mp, on='Formula')
    df_final = merged.drop('e_above_hull', axis=1).set_index('mpid')

    pf = PlotlyFig(df_final,
                   x_title='Experimental band gap (eV)',
                   y_title='Computed Band Gap (eV)',
                   filename='band_gaps')

    # computed vs. experimental band gap, plus a dashed black y = x guide line
    scatter_pair = ('Expt. gap', 'MP computed gap')
    parity_pair = ([0, 12], [0, 12])
    pf.xy([scatter_pair, parity_pair],
          lines=[{}, {'color': 'black', 'dash': 'dash'}],
          labels=df_final.index,
          modes=['markers', 'lines'],
          names=['Computed vs. expt.', 'Expt. gap'])

    # residual (computed minus experimental) against the experimental gap
    expt_gap = df_final['Expt. gap'].astype(float)
    residuals = df_final['MP computed gap'] - expt_gap
    pf.set_arguments(x_title='Experimental band gap (eV)',
                     y_title='Residual (Computed - Expt.) Band Gap (eV)',
                     filename='band_gap_residuals')
    pf.xy(('Expt. gap', residuals), labels=df_final.index)
Beispiel #4
0
def refresh_json(open_plots=False):
    """
    For developer use. Refresh the json files and open plots to see if they
    look good. Use this function to set the current PlotlyFig build outputs
    as the true values of the tests.

    One ``template_<name>.json`` file is written to the current working
    directory for every plot generated here.

    Args:
        open_plots (bool): If True, opens all plots generated. Useful if you
            want to check the current build outputs to make sure they look good.
            If False, just generates the json files and quits.
    """

    pf = PlotlyFig(**pfkwargs)
    xys = pf.xy([(a, b)], return_plot=True)
    xym = pf.xy([(a, b), (b, a)], return_plot=True)
    xy_colors = pf.xy([(a, b), (a, c), (c, c)],
                      modes=['markers', 'markers+lines', 'lines'],
                      colors=[c, 'red', 'blue'], return_plot=True)
    hmb = pf.heatmap_basic([a, b, c], xlabels, ylabels, return_plot=True)
    his = pf.histogram(a + b + c, n_bins=5, return_plot=True)
    bar = pf.bar(x=a, y=b, labels=xlabels, return_plot=True)
    pcp = pf.parallel_coordinates([a, b], cols=xlabels, return_plot=True)

    # Layout is compared for the plots which always convert to dataframes,
    # as dataframes are not easily encoded by json.dump
    vio = pf.violin([a, b, c, b, a, c, b], cols=xlabels, return_plot=True)
    scm = pf.scatter_matrix([a, b, c], return_plot=True)

    df = pd.DataFrame(data=np.asarray([ah, bh, ch]).T,
                      columns=['ah', 'bh', 'ch'])
    x_labels = ['low', 'high']
    y_labels = ['small', 'large']
    # TODO: this plot was not JSON serializable, use a different serialization method for all plots
    hmdf = pf.heatmap_df(df, x_labels=x_labels, y_labels=y_labels, return_plot=True)

    # NOTE(review): the triangle input is unseeded random data, so this
    # template differs on every refresh.
    df = pd.DataFrame(np.random.rand(50, 3), columns=list('qwe'))
    triangle = pf.triangle(df[['q', 'w', 'e']], return_plot=True)

    fnamedict = {"xys": xys, "xym": xym, "xy_colors": xy_colors,
                 "hmb": hmb, "his": his, "bar": bar,
                 "pcp": pcp, "vio": vio, "scm": scm,
                 'triangle': triangle,
                 'hmdf': hmdf
                 }

    # Templates for which only the layout is stored. They are selected by key
    # rather than by ``obj in [vio, scm]``: list membership falls back to a
    # deep ``==`` between whole figure objects, which is costly and can
    # misfire (or raise) depending on what the figures contain.
    layout_only = ("vio", "scm")

    for fname, obj in fnamedict.items():
        if fname in layout_only:
            obj = obj['layout']

        with open("template_{}.json".format(fname), "w") as f:
            json.dump(obj, f, cls=MontyEncoder)

    if open_plots:
        # the full figures (not the layout-only extracts) are displayed
        for obj in fnamedict.values():
            pf.create_plot(obj, return_plot=False)
def refresh_json(open_plots=False):
    """
    For developer use. Refresh the json files and open plots to see if they
    look good. Use this function to set the current PlotlyFig build outputs
    as the true values of the tests.

    One ``template_<name>.json`` file is written to the current working
    directory for every plot generated here.

    Args:
        open_plots (bool): If True, opens all plots generated. Useful if you
            want to check the current build outputs to make sure they look good.
            If False, just generates the json files and quits.
    """

    pf = PlotlyFig(**pfkwargs)
    xys = pf.xy([(a, b)], return_plot=True)
    xym = pf.xy([(a, b), (b, a)], return_plot=True)
    xy_colors = pf.xy([(a, b), (a, c), (c, c)],
                      modes=['markers', 'markers+lines', 'lines'],
                      colors=[c, 'red', 'blue'],
                      return_plot=True)
    hmb = pf.heatmap_basic([a, b, c], xlabels, ylabels, return_plot=True)
    his = pf.histogram(a + b + c, n_bins=5, return_plot=True)
    bar = pf.bar(x=a, y=b, labels=xlabels, return_plot=True)
    pcp = pf.parallel_coordinates([a, b], cols=xlabels, return_plot=True)

    # Layout is compared for the plots which always convert to dataframes,
    # as dataframes are not easily encoded by json.dump
    vio = pf.violin([a, b, c, b, a, c, b], cols=xlabels, return_plot=True)
    scm = pf.scatter_matrix([a, b, c], return_plot=True)

    fnamedict = {
        "xys": xys,
        "xym": xym,
        "xy_colors": xy_colors,
        "hmb": hmb,
        "his": his,
        "bar": bar,
        "pcp": pcp,
        "vio": vio,
        "scm": scm
    }

    # Templates for which only the layout is stored. They are selected by key
    # rather than by ``obj in [vio, scm]``: list membership falls back to a
    # deep ``==`` between whole figure objects, which is costly and can
    # misfire (or raise) depending on what the figures contain.
    layout_only = ("vio", "scm")

    for fname, obj in fnamedict.items():
        if fname in layout_only:
            obj = obj['layout']

        with open("template_{}.json".format(fname), "w") as f:
            json.dump(obj, f)

    if open_plots:
        # the full figures (not the layout-only extracts) are displayed
        for obj in fnamedict.values():
            pf.create_plot(obj, return_plot=False)
Beispiel #6
0
def plot_bulk_shear_moduli():
    """
    Scatter the Voigt-Reuss-Hill (VRH) average bulk modulus (y axis) against
        the VRH shear modulus (x axis) for the elastic tensor dataset. Markers
        are colored by Poisson ratio, which separates materials with different
        bulk/shear modulus ratios; the x axis is limited to 0-300.
    Returns:
        None. PlotlyFig.xy renders the figure; per the original documentation
            it is popped in the default browser in plotly's "offline" mode.
    """
    elastic_df = load_elastic_tensor()
    figure = PlotlyFig(elastic_df,
                       x_title='Shear Modulus (GPa)',
                       y_title='Bulk Modulus (GPa)',
                       filename='bulk_shear_moduli.jpeg')
    shear_vs_bulk = ('G_VRH', 'K_VRH')
    x_window = {'x': (0, 300)}
    figure.xy(shear_vs_bulk,
              labels='material_id',
              colors='poisson_ratio',
              colorscale='Picnic',
              limits=x_window)
Beispiel #7
0
def plot_thermoelectrics(citrine_api_key, limit=0):
    """
    Scatter plot of Seebeck coefficient against electrical resistivity for the
        thermoelectric materials listed in
        http://www.mrl.ucsb.edu:8080/datamine/thermoelectric.jsp
        The data is fetched with the Citrine data retrieval tools (dataset id
        150557 on Citrine). Marker size encodes zT and marker color encodes
        thermal conductivity.
    Args:
        citrine_api_key (str): Your Citrine API key for getting data. Don't have
            a Citrine account? Visit https://citrine.io/
        limit (int): limit the number of entries (0 means no limit)
    Returns:
        None. PlotlyFig.xy renders the figure; per the original documentation
            it is popped in the default browser in plotly's "offline" mode.
    """
    numeric_columns = [
        'Electrical resistivity', 'Seebeck coefficient',
        'Thermal conductivity', 'Thermoelectric figure of merit (zT)'
    ]
    retriever = CitrineDataRetrieval(api_key=citrine_api_key)
    query = {
        'data_type': 'experimental',
        'data_set_id': 150557,
        'max_results': limit
    }
    df_te = retriever.get_dataframe(criteria=query,
                                    properties=['Seebeck coefficient'],
                                    secondary_fields=True)
    df_te[numeric_columns] = df_te[numeric_columns].astype(float)

    # keep resistivities strictly between 5e-4 and 0.1 ...
    resistivity = df_te['Electrical resistivity']
    df_te = df_te[(resistivity > 5e-4) & (resistivity < 0.1)]
    # ... and Seebeck coefficients whose magnitude is below 500
    seebeck_magnitude = abs(df_te['Seebeck coefficient'])
    df_te = df_te[seebeck_magnitude < 500]
    df_te = df_te.rename(
        columns={'Thermoelectric figure of merit (zT)': 'zT'})

    print(df_te.head())
    figure = PlotlyFig(df_te,
                       x_scale='log',
                       fontfamily='Times New Roman',
                       hovercolor='white',
                       x_title='Electrical Resistivity (cm/S)',
                       y_title='Seebeck Coefficient (uV/K)',
                       colorbar_title='Thermal Conductivity (W/m.K)',
                       filename='thermoelectrics.html')
    hover_columns = ['chemicalFormula', 'Preparation method', 'Crystallinity']
    figure.xy(('Electrical resistivity', 'Seebeck coefficient'),
              labels=hover_columns,
              sizes='zT',
              colors='Thermal conductivity',
              color_range=[0, 5])
Beispiel #8
0
def plot_bulk_shear_moduli():
    """
    Basic xy scatter of the Voigt-Reuss-Hill (VRH) average bulk modulus versus
        shear modulus for the elastic tensor dataset. Coloring the markers by
        Poisson ratio distinguishes materials with different bulk/shear
        modulus ratios.
    Returns:
        None. PlotlyFig.xy renders the figure; per the original documentation
            it is popped in the default browser in plotly's "offline" mode.
    """
    elastic_data = load_elastic_tensor()
    axis_titles = {'x_title': 'Shear Modulus (GPa)',
                   'y_title': 'Bulk Modulus (GPa)'}
    moduli_fig = PlotlyFig(elastic_data,
                           filename='bulk_shear_moduli',
                           **axis_titles)
    # x: shear modulus column, y: bulk modulus column
    moduli_fig.xy(('G_VRH', 'K_VRH'), labels='material_id',
                  colors='poisson_ratio', colorscale='Picnic')
Beispiel #9
0
def plot_thermoelectrics(citrine_api_key, limit=0):
    """
    Plot thermoelectric material properties (Seebeck coefficient versus
        electrical resistivity) using the data available in
        http://www.mrl.ucsb.edu:8080/datamine/thermoelectric.jsp
        The data is extracted via Citrine data retrieval tools. The dataset
        id on Citrine is 150557. zT sets the marker size and thermal
        conductivity sets the marker color.
    Args:
        citrine_api_key (str): Your Citrine API key for getting data. Don't have
            a Citrine account? Visit https://citrine.io/
        limit (int): limit the number of entries (0 means no limit)
    Returns:
        None. PlotlyFig.xy renders the figure; per the original documentation
            it is popped in the default browser in plotly's "offline" mode.
    """
    rho_col = 'Electrical resistivity'
    seebeck_col = 'Seebeck coefficient'
    kappa_col = 'Thermal conductivity'
    zt_col = 'Thermoelectric figure of merit (zT)'
    cols = [rho_col, seebeck_col, kappa_col, zt_col]

    cdr = CitrineDataRetrieval(api_key=citrine_api_key)
    df_te = cdr.get_dataframe(
        criteria={'data_type': 'experimental',
                  'data_set_id': 150557,
                  'max_results': limit},
        properties=[seebeck_col],
        secondary_fields=True)
    df_te[cols] = df_te[cols].astype(float)

    # resistivity window (exclusive bounds), then Seebeck magnitude cut
    in_rho_window = (df_te[rho_col] > 5e-4) & (df_te[rho_col] < 0.1)
    df_te = df_te[in_rho_window]
    small_seebeck = abs(df_te[seebeck_col]) < 500
    df_te = df_te[small_seebeck].rename(columns={zt_col: 'zT'})

    print(df_te.head())
    fig_settings = dict(x_scale='log',
                        fontfamily='Times New Roman',
                        hovercolor='white',
                        x_title='Electrical Resistivity (cm/S)',
                        y_title='Seebeck Coefficient (uV/K)',
                        colorbar_title='Thermal Conductivity (W/m.K)',
                        filename='thermoelectrics.html')
    pf = PlotlyFig(df_te, **fig_settings)
    pf.xy((rho_col, seebeck_col),
          labels=['chemicalFormula', 'Preparation method', 'Crystallinity'],
          sizes='zT',
          colors=kappa_col,
          color_range=[0, 5])
Beispiel #10
0
class PlotlyFigTest(PymatgenTest):
    """
    Regression tests for PlotlyFig: each test builds a plot with
    ``return_plot=True`` and compares it with a ``template_*.json`` file
    stored in the same directory as this module.
    """

    def setUp(self):
        self.pf = PlotlyFig(**pfkwargs)
        self.base_dir = os.path.dirname(os.path.realpath(__file__))

    def fopen(self, fname):
        """
        Load a JSON template.

        Args:
            fname (str): file name, relative to this module's directory.
        Returns:
            the decoded JSON content of the file.
        """
        with open(os.path.join(self.base_dir, fname), 'r') as f:
            return json.load(f)

    def test_xy(self):
        # Single trace
        xys_test = self.pf.xy([(a, b)], return_plot=True)
        xys_true = self.fopen("template_xys.json")
        # assertEqual (rather than assertTrue(x == y)) reports both operands
        # when the comparison fails
        self.assertEqual(xys_test, xys_true)

        # Multi trace
        xym_test = self.pf.xy([(a, b), (b, a)], return_plot=True)
        xym_true = self.fopen("template_xym.json")
        self.assertEqual(xym_test, xym_true)

        # Multi trace with per-trace modes and colors
        xy_colors_test = self.pf.xy(
            [(a, b), (a, c), (c, c)],
            modes=['markers', 'markers+lines', 'lines'],
            colors=[c, 'red', 'blue'], return_plot=True)
        xy_colors_true = self.fopen("template_xy_colors.json")
        self.assertEqual(xy_colors_test, xy_colors_true)

    def test_heatmap_basic(self):
        hmb_test = self.pf.heatmap_basic([a, b, c], xlabels, ylabels,
                                         return_plot=True)
        hmb_true = self.fopen("template_hmb.json")
        self.assertEqual(hmb_test, hmb_true)

    def test_histogram(self):
        his_test = self.pf.histogram(a + b + c, n_bins=5, return_plot=True)
        his_true = self.fopen("template_his.json")
        self.assertEqual(his_test, his_true)

    def test_bar(self):
        bar_test = self.pf.bar(x=a, y=b, labels=xlabels, return_plot=True)
        bar_true = self.fopen("template_bar.json")
        self.assertEqual(bar_test, bar_true)

    def test_parallel_coordinates(self):
        pcp_test = self.pf.parallel_coordinates([a, b], cols=xlabels,
                                                return_plot=True)
        pcp_true = self.fopen("template_pcp.json")
        self.assertEqual(pcp_test, pcp_true)

    def test_violin(self):
        # only the layout is compared for this plot
        vio_plot = self.pf.violin([a, b, c, b, a, c, b], cols=xlabels,
                                  return_plot=True)
        vio_test = vio_plot['layout']
        vio_true = self.fopen("template_vio.json")
        self.assertEqual(vio_test, vio_true)

    def test_scatter_matrix(self):
        # only the layout is compared for this plot
        scm_test = self.pf.scatter_matrix([a, b, c], return_plot=True)['layout']
        scm_true = self.fopen("template_scm.json")
        self.assertEqual(scm_test, scm_true)

    def test_heatmap_df(self):
        df = pd.DataFrame(data=np.asarray([ah, bh, ch]).T,
                          columns=['ah', 'bh', 'ch'])
        x_labels = ['low', 'high']
        y_labels = ['small', 'large']
        hmdf_test = self.pf.heatmap_df(df, x_labels=x_labels,
                                       y_labels=y_labels,
                                       return_plot=True)
        # The template is still loaded so that a missing/corrupt file fails.
        self.fopen("template_hmdf.json")
        # NOTE(review): this used to be assertTrue(hmdf_test, hmdf_true),
        # where the template is only the failure *message*; the plot was never
        # compared with it. The check that actually ran (plot is truthy) is
        # kept as-is -- confirm the template round-trips through JSON before
        # tightening this to assertEqual.
        self.assertTrue(hmdf_test)

    def test_trianlge(self):
        # NOTE(review): method name is misspelled ("trianlge"); kept so that
        # existing test selectors keep working.
        df = pd.DataFrame(np.random.rand(50, 3), columns=list('qwe'))
        triangle_test = self.pf.triangle(df[['q', 'w', 'e']], return_plot=True)
        # The template is still loaded so that a missing/corrupt file fails.
        self.fopen("template_triangle.json")
        # The input is unseeded random data, so the plot cannot equal a stored
        # template. This used to be assertTrue(triangle_test, triangle_true),
        # which only checks truthiness; that check is now explicit.
        self.assertTrue(triangle_test)
Beispiel #11
0

def get_mp_bandgap(formula):
    """
    Return the band gap of the lowest-energy Materials Project entry for a
    chemical composition.

    Args:
        formula (str): chemical formula, parsed with Composition.
    Returns:
        the 'band_gap' field of the entry with the smallest
            'energy_per_atom', or None when the query returns no entries.
    """
    # The MP database is queried with the integer form of the formula.
    integer_formula = Composition(formula).get_integer_formula_and_factor()[0]
    entries = mpr.get_data(integer_formula)
    if not entries:
        return None
    # lowest energy per atom is taken as the stable state
    most_stable = min(entries, key=lambda entry: entry['energy_per_atom'])
    return most_stable['band_gap']


# Look up the computed band gap of every formula in the dataframe.
df['Computed band gap'] = df['chemicalFormula'].apply(get_mp_bandgap)

from matminer.figrecipes.plot import PlotlyFig

# Experimental vs. computed gap as markers, plus a dashed black y = x guide
# line from 0 to 10.
scatter_pair = ('Experimental band gap', 'Computed band gap')
parity_pair = ([0, 10], [0, 10])
parity_style = {'color': 'black', 'dash': 'dash'}

pf = PlotlyFig(df,
               x_title='Experimental band gap (ev)',
               y_title='Computed band gap (ev)',
               mode='notebook',
               fontsize=20,
               ticksize=15)
pf.xy([scatter_pair, parity_pair],
      modes=['markers', 'lines'],
      lines=[{}, parity_style],
      labels='chemicalFormula',
      showlegends=False)
df.head()
Beispiel #12
0
def pre():
    """
    Train band-gap regressors on 'full_vector_all.csv', evaluate them, and
    write predictions for a second dataset to '2d_bulk.csv'.

    Steps, in order:
        1. Load the training table, name its index 'mp_id', take the 'gaps'
           column as target and every column except the gap/formula/
           composition columns as features (NaNs replaced by the column mean,
           then standardized with preprocessing.scale).
        2. Fit a linear regression, print training R2/RMSE and 10-fold
           cross-validation scores, and plot cross-validated predictions.
        3. Do the same with a random forest, plus a bar chart of the ten most
           important features.
        4. Refit the random forest on the whole training set, predict the
           gaps of the second CSV file and save that table with the
           predictions in its 'gaps' column.

    Returns:
        None. Results are printed, plotted through PlotlyFig and written to
            '2d_bulk.csv'.
    """
    unwanted_columns = ['band_gap.optimize_structure_gap', 'gaps',
                        'full_formula', 'composition', 'composition_oxid']

    # After loading: name the row index, extract the target and check the
    # features for missing values.
    df = pd.read_csv('full_vector_all.csv', index_col=[0])
    print(df.index)
    print(df.index.name)
    df.index.name = 'mp_id'
    print(df.index.name)

    y = df['gaps'].values
    X = df.drop(unwanted_columns, axis=1, inplace=False)
    X = X.fillna(X.mean())
    print(X[X.isnull().values])

    # (A clipping step for extremely small/large values was sketched here in
    # the original but was disabled.)

    print("这里我们规定hse作为预测项,然后就是其他所有的数字项都是features")
    print("现在有{}个可能的特征\n\n".format(X.shape[1]))
    print("X 的维度是{}".format(X.shape))
    print(X[X.isnull().values])
    print(X.isnull().values.any())
    X_pr = preprocessing.scale(X.values)

    # Linear regression first.
    lr = LinearRegression()
    lr.fit(X_pr, y)
    print("训练的r2是:{}".format(round(lr.score(X_pr, y), 3)))
    train_mse = mean_squared_error(y_true=y, y_pred=lr.predict(X_pr))
    print("训练后的RMSE是{:.3f}".format(np.sqrt(train_mse)))

    # Training scores are not enough: cross-validate as well.
    crossvalidation = KFold(n_splits=10, shuffle=True, random_state=1)
    scores = cross_val_score(lr, X_pr, y, scoring='neg_mean_squared_error',
                             cv=crossvalidation, n_jobs=1)
    print(scores)
    # scores are negated MSEs, hence abs() before the square root
    rmse_scores = [np.sqrt(abs(s)) for s in scores]
    print(rmse_scores)
    r2_scores = cross_val_score(lr, X_pr, y, scoring='r2',
                                cv=crossvalidation, n_jobs=1)
    print(r2_scores)

    # NOTE(review): the mean is taken over |r2|, which would hide negative
    # R2 folds -- confirm this is intended.
    print("现在展示交叉验证的结果")
    print("经过{}次交叉验证,r2的平均值是{:.3f}".format(len(scores), np.mean(np.abs(r2_scores))))
    print("经过{}次交叉验证,rmse的平均值是{:.3f}".format(len(scores), np.mean(np.abs(rmse_scores))))

    # The numbers look fine; plot predicted vs. target to inspect visually.
    pf = PlotlyFig(x_title='HSE calculate gap(ev)',
                   y_title='Predicated gap(ev)',
                   title='Linear regression',
                   filename='lr_regression.jpg')
    pf.xy(xy_pairs=[(y, cross_val_predict(lr, X_pr, y, cv=crossvalidation)),
                    ([0, 400], [0, 400])],
          labels=df['full_formula'],
          modes=['markers', 'lines'],
          lines=[{}, {'color': 'black', 'dash': 'dash'}],
          showlegends=False)
    print("这就是线性回归的威力,感觉还是不错的")
    print("\n\n")

    # Now try a random forest.
    rf = RandomForestRegressor(n_estimators=100, random_state=1)
    rf.fit(X_pr, y)
    print("随机森林的r2是:{}".format(round(rf.score(X_pr, y), 3)))
    rf_train_mse = mean_squared_error(y_true=y, y_pred=rf.predict(X_pr))
    print("随机森林的是RMSE是:{}".format(round(np.sqrt(rf_train_mse), 3)))

    # Ten most important features, most important first.
    importances = rf.feature_importances_
    included = X.columns.values
    indices = np.argsort(importances)[::-1]

    pf = PlotlyFig(y_title='importance(%)', title='Feature by importances',
                   fontsize=20, ticksize=15)
    pf.bar(x=included[indices][0:10], y=importances[indices][0:10])

    scores = cross_val_score(rf, X_pr, y, scoring='neg_mean_squared_error',
                             cv=crossvalidation, n_jobs=1)
    r2_scores = cross_val_score(rf, X_pr, y, scoring='r2',
                                cv=crossvalidation, n_jobs=1)
    rmse_scores_rf = np.sqrt(abs(scores))

    print("对于随机森林,现在展示交叉验证的结果")
    print("经过{}次交叉验证,r2的平均值是{:.3f}".format(len(scores), np.mean(np.abs(r2_scores))))
    print("经过{}次交叉验证,rmse的平均值是{:.3f}".format(len(scores), np.mean(np.abs(rmse_scores_rf))))
    print("请看随机森林的结果展示")
    pf_rf = PlotlyFig(x_title='HSE calculate gap(ev)',
                      y_title='Random forest predicated gap(ev)',
                      title='Random forest regression',
                      filename='rf_regression.html')

    # rf.predict(X_pr) could be used here instead of the cross-validated
    # predictions.
    pf_rf.xy([(y, cross_val_predict(rf, X_pr, y, cv=crossvalidation)),
              ([0, 450], [0, 450])],
             labels=df['full_formula'], modes=['markers', 'lines'],
             lines=[{}, {'color': 'black', 'dash': 'dash'}],
             showlegends=False)

    # Actual prediction on the second dataset.
    rf.fit(X_pr, y)
    print("正式开始进行预测")
    # Read the data, fill missing values and rename the index.
    df_2d = pd.read_csv('预测是否为导体.csv', index_col=[0])
    X_2d = df_2d.drop(unwanted_columns, axis=1, inplace=False)
    X_2d = X_2d.fillna(X_2d.mean())
    print(X_2d[X_2d.isnull().values])

    df_2d.index.name = 'mp_id'
    print(df_2d.index.name)

    # NOTE(review): the prediction features are standardized with their own
    # mean/variance, not with the statistics of the training set the model
    # was fitted on -- confirm this is intended.
    X_2d_value = preprocessing.scale(X_2d.values)
    y_pre_2d = rf.predict(X_2d_value)
    # One prediction per row, in row order. Assigned as a whole column:
    # DataFrame.ix (used here originally, row by row) no longer exists in
    # current pandas.
    df_2d['gaps'] = y_pre_2d

    df_2d.to_csv('2d_bulk.csv')

    print("all work done")

    print("\n\n")
Beispiel #13
0
# In[16]:


from matminer.figrecipes.plot import PlotlyFig
from sklearn.model_selection import cross_val_predict

# Parity plot: target values against cross-validated linear-regression
# predictions, with a dashed black y = x guide line from 0 to 12.
pf = PlotlyFig(
    x_title='Fracture Toughness (Mpa m^(1/2))',
    y_title='Predicted Fracture Toughness (Mpa m^(1/2))',
    title='Linear regression',
    mode='notebook',
    filename="lr_regression.html",
)

cv_predictions = cross_val_predict(lr, X, y, cv=crossvalidation)
pf.xy(
    xy_pairs=[(y, cv_predictions), ([0, 12], [0, 12])],
    labels=df_ft_1['Formula'],
    modes=['markers', 'lines'],
    lines=[{}, {'color': 'black', 'dash': 'dash'}],
    showlegends=False,
)


# ## random forest model

# In[17]:


from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=50, random_state=1)

# Fit on the full dataset and report the (training) R2, rounded to 3 digits.
rf.fit(X, y)
training_r2 = round(rf.score(X, y), 3)
print('training R2 = ' + str(training_r2))
class PlotlyFigTest(PymatgenTest):
    """
    Regression tests for PlotlyFig: each test builds a plot with
    ``return_plot=True`` and compares it with a ``template_*.json`` file
    stored in the same directory as this module.
    """

    def setUp(self):
        self.pf = PlotlyFig(**pfkwargs)
        self.base_dir = os.path.dirname(os.path.realpath(__file__))

    def fopen(self, fname):
        """
        Load a JSON template.

        Args:
            fname (str): file name, relative to this module's directory.
        Returns:
            the decoded JSON content of the file.
        """
        with open(os.path.join(self.base_dir, fname), 'r') as f:
            return json.load(f)

    def test_xy(self):
        # Single trace
        xys_test = self.pf.xy([(a, b)], return_plot=True)
        xys_true = self.fopen("template_xys.json")
        # assertEqual (rather than assertTrue(x == y)) reports both operands
        # when the comparison fails
        self.assertEqual(xys_test, xys_true)

        # Multi trace
        xym_test = self.pf.xy([(a, b), (b, a)], return_plot=True)
        xym_true = self.fopen("template_xym.json")
        self.assertEqual(xym_test, xym_true)

        # Multi trace with per-trace modes and colors
        xy_colors_test = self.pf.xy(
            [(a, b), (a, c), (c, c)],
            modes=['markers', 'markers+lines', 'lines'],
            colors=[c, 'red', 'blue'],
            return_plot=True)
        xy_colors_true = self.fopen("template_xy_colors.json")
        self.assertEqual(xy_colors_test, xy_colors_true)

    def test_heatmap_basic(self):
        hmb_test = self.pf.heatmap_basic([a, b, c],
                                         xlabels,
                                         ylabels,
                                         return_plot=True)
        hmb_true = self.fopen("template_hmb.json")
        self.assertEqual(hmb_test, hmb_true)

    def test_histogram(self):
        his_test = self.pf.histogram(a + b + c, n_bins=5, return_plot=True)
        his_true = self.fopen("template_his.json")
        self.assertEqual(his_test, his_true)

    def test_bar(self):
        bar_test = self.pf.bar(x=a, y=b, labels=xlabels, return_plot=True)
        bar_true = self.fopen("template_bar.json")
        self.assertEqual(bar_test, bar_true)

    def test_parallel_coordinates(self):
        pcp_test = self.pf.parallel_coordinates([a, b],
                                                cols=xlabels,
                                                return_plot=True)
        pcp_true = self.fopen("template_pcp.json")
        self.assertEqual(pcp_test, pcp_true)

    def test_violin(self):
        # only the layout is compared for this plot
        vio_plot = self.pf.violin([a, b, c, b, a, c, b], cols=xlabels,
                                  return_plot=True)
        vio_test = vio_plot['layout']
        vio_true = self.fopen("template_vio.json")
        self.assertEqual(vio_test, vio_true)

    def test_scatter_matrix(self):
        # only the layout is compared for this plot
        scm_test = self.pf.scatter_matrix([a, b, c],
                                          return_plot=True)['layout']
        scm_true = self.fopen("template_scm.json")
        self.assertEqual(scm_test, scm_true)

    def test_heatmap_df(self):
        # Smoke test: no template comparison is made here; the test passes as
        # long as heatmap_df does not raise. The local a/b/c deliberately
        # shadow the module-level fixtures of the same names.
        a = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        b = [2, 4, 6, 8, 10, 2, 4, 6, 8, 10]
        c = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
        df = pd.DataFrame(data=np.asarray([a, b, c]).T,
                          columns=['a', 'b', 'c'])
        x_labels = ['low', 'high']
        y_labels = ['small', 'large']
        self.pf.heatmap_df(df,
                           x_labels=x_labels,
                           y_labels=y_labels,
                           return_plot=True)
Beispiel #15
0
class PlotlyFigTest(PymatgenTest):
    """
    Regression tests for PlotlyFig: each test builds a plot with
    ``return_plot=True``, converts its plotly objects to plain JSON-style
    data and compares the result with a ``template_*.json`` file stored in
    the same directory as this module.
    """

    def setUp(self):
        self.pf = PlotlyFig(**pfkwargs)
        self.base_dir = os.path.dirname(os.path.realpath(__file__))

    def fopen(self, fname):
        """
        Load a JSON template.

        Args:
            fname (str): file name, relative to this module's directory.
        Returns:
            the decoded JSON content of the file.
        """
        with open(os.path.join(self.base_dir, fname), 'r') as f:
            return json.load(f)

    @staticmethod
    def _jsonable(plot):
        """
        Replace every trace in ``plot['data']`` by its ``to_plotly_json()``
        form (in place) so the plot can be compared with loaded JSON.

        Returns:
            the same ``plot`` object, for convenience.
        """
        plot['data'] = [p.to_plotly_json() for p in plot['data']]
        return plot

    def test_xy(self):
        # Single trace
        xys_test = self._jsonable(self.pf.xy([(a, b)], return_plot=True))
        xys_true = self.fopen("template_xys.json")
        # assertEqual (rather than assertTrue(x == y)) reports both operands
        # when the comparison fails
        self.assertEqual(xys_test, xys_true)

        # Multi trace
        xym_test = self._jsonable(
            self.pf.xy([(a, b), (b, a)], return_plot=True))
        xym_true = self.fopen("template_xym.json")
        self.assertEqual(xym_test, xym_true)

        # Multi trace with per-trace modes and colors
        xy_colors_test = self._jsonable(self.pf.xy(
            [(a, b), (a, c), (c, c)],
            modes=['markers', 'markers+lines', 'lines'],
            colors=[c, 'red', 'blue'],
            return_plot=True))
        xy_colors_true = self.fopen("template_xy_colors.json")
        self.assertEqual(xy_colors_test, xy_colors_true)

    def test_heatmap_basic(self):
        hmb_test = self._jsonable(self.pf.heatmap_basic([a, b, c],
                                                        xlabels,
                                                        ylabels,
                                                        return_plot=True))
        hmb_true = self.fopen("template_hmb.json")
        self.assertEqual(hmb_test, hmb_true)

    def test_histogram(self):
        his_test = self._jsonable(
            self.pf.histogram(a + b + c, n_bins=5, return_plot=True))
        his_true = self.fopen("template_his.json")
        self.assertEqual(his_test, his_true)

    def test_bar(self):
        bar_test = self._jsonable(
            self.pf.bar(x=a, y=b, labels=xlabels, return_plot=True))
        bar_true = self.fopen("template_bar.json")
        self.assertEqual(bar_test, bar_true)

    def test_parallel_coordinates(self):
        pcp_test = self._jsonable(
            self.pf.parallel_coordinates([a, b],
                                         cols=xlabels,
                                         return_plot=True))
        pcp_true = self.fopen("template_pcp.json")
        self.assertEqual(pcp_test, pcp_true)

    def test_violin(self):
        # only the layout is compared for this plot
        vio_test = self.pf.violin([a, b, c, b, a, c, b],
                                  cols=xlabels,
                                  return_plot=True)['layout']
        vio_test = vio_test.to_plotly_json()
        vio_true = self.fopen("template_vio.json")

        # Avoid errors from CircleCI's different plotly config
        for vio in [vio_test, vio_true]:
            vio["xaxis"]["range"] = [-0.167009, 0.167009]
        self.assertDictEqual(vio_test, vio_true)

    def test_scatter_matrix(self):
        # only the layout is compared for this plot
        scm_test = self.pf.scatter_matrix([a, b, c],
                                          return_plot=True)['layout']
        scm_test = scm_test.to_plotly_json()
        scm_true = self.fopen("template_scm.json")
        self.assertEqual(scm_test, scm_true)

    def test_heatmap_df(self):

        df = pd.DataFrame(data=np.asarray([ah, bh, ch]).T,
                          columns=['ah', 'bh', 'ch'])
        x_labels = ['low', 'high']
        y_labels = ['small', 'large']
        # heatmap_df is expected to emit a UserWarning for this input
        with self.assertWarns(UserWarning):
            hmdf_test = self.pf.heatmap_df(df,
                                           x_labels=x_labels,
                                           y_labels=y_labels,
                                           return_plot=True)
        # The template is still loaded so that a missing/corrupt file fails.
        self.fopen("template_hmdf.json")
        # NOTE(review): this used to be assertTrue(hmdf_test, hmdf_true),
        # where the template is only the failure *message*; the plot was never
        # compared with it. The check that actually ran (plot is truthy) is
        # kept as-is -- confirm the template round-trips through JSON before
        # tightening this to assertEqual.
        self.assertTrue(hmdf_test)

    def test_triangle(self):
        df = pd.DataFrame(np.random.rand(50, 3), columns=list('qwe'))
        triangle_test = self.pf.triangle(df[['q', 'w', 'e']], return_plot=True)
        # The template is still loaded so that a missing/corrupt file fails.
        self.fopen("template_triangle.json")
        # The input is unseeded random data, so the plot cannot equal a stored
        # template. This used to be assertTrue(triangle_test, triangle_true),
        # which only checks truthiness; that check is now explicit.
        self.assertTrue(triangle_test)
Beispiel #16
0
def refresh_json(open_plots=False):
    """
    Developer helper that regenerates the ``template_*.json`` files.

    Every plot is built with the current PlotlyFig and dumped to
    ``template_<name>.json`` in the current working directory, which makes
    the current build output the reference written to those files.

    Args:
        open_plots (bool): If True, every generated plot is also opened so
            that it can be inspected visually. If False, only the json files
            are written.
    """

    pf = PlotlyFig(**pfkwargs)
    xys = pf.xy([(a, b)], return_plot=True)
    xym = pf.xy([(a, b), (b, a)], return_plot=True)
    xy_colors = pf.xy([(a, b), (a, c), (c, c)],
                      modes=['markers', 'markers+lines', 'lines'],
                      colors=[c, 'red', 'blue'], return_plot=True)
    hmb = pf.heatmap_basic([a, b, c], xlabels, ylabels, return_plot=True)
    his = pf.histogram(a + b + c, n_bins=5, return_plot=True)
    bar = pf.bar(x=a, y=b, labels=xlabels, return_plot=True)
    pcp = pf.parallel_coordinates([a, b], cols=xlabels, return_plot=True)

    # The plotly traces have to be turned into json-compatible data
    for figure in (xys, xym, xy_colors, hmb, his, bar, pcp):
        figure['data'] = [trace.to_plotly_json() for trace in figure['data']]

    # For the plots that always convert their input to dataframes only the
    # layout is stored, as dataframes are not easily encoded by json.dump
    vio = pf.violin([a, b, c, b, a, c, b], cols=xlabels, return_plot=True)
    scm = pf.scatter_matrix([a, b, c], return_plot=True)

    # The plotly layout has to be turned into json-compatible data as well
    vio = {'layout': vio['layout'].to_plotly_json()}
    scm = {'layout': scm['layout'].to_plotly_json()}

    heatmap_frame = pd.DataFrame(data=np.asarray([ah, bh, ch]).T,
                                 columns=['ah', 'bh', 'ch'])
    # TODO: this plot was not JSON serializable, use a different
    # serialization method for all plots
    hmdf = pf.heatmap_df(heatmap_frame, x_labels=['low', 'high'],
                         y_labels=['small', 'large'], return_plot=True)
    hmdf['data'] = [trace.to_plotly_json() for trace in hmdf['data']]

    triangle_frame = pd.DataFrame(np.random.rand(50, 3), columns=list('qwe'))
    triangle = pf.triangle(triangle_frame[['q', 'w', 'e']], return_plot=True)

    fnamedict = {
        "xys": xys,
        "xym": xym,
        "xy_colors": xy_colors,
        "hmb": hmb,
        "his": his,
        "bar": bar,
        "pcp": pcp,
        "vio": vio,
        "scm": scm,
        'triangle': triangle,
        'hmdf': hmdf,
    }

    for fname, obj in fnamedict.items():
        # Only the bare layout is written for the layout-only plots
        payload = obj['layout'] if fname in ("vio", "scm") else obj
        with open("template_{}.json".format(fname), "w") as f:
            json.dump(payload, f, cls=MontyEncoder)

    if not open_plots:
        return
    for obj in fnamedict.values():
        pf.create_plot(obj, return_plot=False)
Beispiel #17
0
class PlotlyFigTest(PymatgenTest):
    """Regression tests comparing PlotlyFig output with stored JSON templates.

    Each test builds a figure with ``return_plot=True``, converts the plotly
    objects to plain data and compares the result with the matching
    ``template_*.json`` file located next to this test module.
    """

    def setUp(self):
        """Create a fresh PlotlyFig and remember the directory of this file."""
        self.pf = PlotlyFig(**pfkwargs)
        self.base_dir = os.path.dirname(os.path.realpath(__file__))

    def fopen(self, fname):
        """Return the parsed JSON content of ``fname`` in the test directory."""
        with open(os.path.join(self.base_dir, fname), 'r') as f:
            return json.load(f)

    # NOTE: assertEqual is used instead of assertTrue(x == y) throughout. The
    # pass/fail condition is the same, but a failure reports the differing
    # entries instead of "False is not true".

    def test_xy(self):
        """Compare single-trace, multi-trace and coloured xy plots."""
        # Single trace
        xys_test = self.pf.xy([(a, b)], return_plot=True)
        xys_test['data'] = [p.to_plotly_json() for p in xys_test['data']]
        xys_true = self.fopen("template_xys.json")
        self.assertEqual(xys_test, xys_true)

        # Multi trace
        xym_test = self.pf.xy([(a, b), (b, a)], return_plot=True)
        xym_test['data'] = [p.to_plotly_json() for p in xym_test['data']]
        xym_true = self.fopen("template_xym.json")
        self.assertEqual(xym_test, xym_true)

        # Multi trace with per-trace modes and colors
        xy_colors_test = self.pf.xy(
            [(a, b), (a, c), (c, c)],
            modes=['markers', 'markers+lines', 'lines'],
            colors=[c, 'red', 'blue'], return_plot=True)
        xy_colors_test['data'] = [p.to_plotly_json()
                                  for p in xy_colors_test['data']]
        xy_colors_true = self.fopen("template_xy_colors.json")
        self.assertEqual(xy_colors_test, xy_colors_true)

    def test_heatmap_basic(self):
        """Compare a basic heatmap with the stored template."""
        hmb_test = self.pf.heatmap_basic([a, b, c], xlabels, ylabels,
                                         return_plot=True)
        hmb_test['data'] = [p.to_plotly_json() for p in hmb_test['data']]
        hmb_true = self.fopen("template_hmb.json")
        self.assertEqual(hmb_test, hmb_true)

    def test_histogram(self):
        """Compare a 5-bin histogram with the stored template."""
        his_test = self.pf.histogram(a + b + c, n_bins=5, return_plot=True)
        his_test['data'] = [p.to_plotly_json() for p in his_test['data']]
        his_true = self.fopen("template_his.json")
        self.assertEqual(his_test, his_true)

    def test_bar(self):
        """Compare a bar plot with the stored template."""
        bar_test = self.pf.bar(x=a, y=b, labels=xlabels, return_plot=True)
        bar_test['data'] = [p.to_plotly_json() for p in bar_test['data']]
        bar_true = self.fopen("template_bar.json")
        self.assertEqual(bar_test, bar_true)

    def test_parallel_coordinates(self):
        """Compare a parallel-coordinates plot with the stored template."""
        pcp_test = self.pf.parallel_coordinates([a, b], cols=xlabels,
                                                return_plot=True)
        pcp_test['data'] = [p.to_plotly_json() for p in pcp_test['data']]
        pcp_true = self.fopen("template_pcp.json")
        self.assertEqual(pcp_test, pcp_true)

    def test_violin(self):
        """Compare the layout of a violin plot with the stored template."""
        vio_test = self.pf.violin(
            [a, b, c, b, a, c, b], cols=xlabels, return_plot=True)['layout']
        vio_test = vio_test.to_plotly_json()
        vio_true = self.fopen("template_vio.json")

        # Avoid errors from CircleCI's different plotly config
        for vio in [vio_test, vio_true]:
            vio["xaxis"]["range"] = [-0.167009, 0.167009]
        self.assertDictEqual(vio_test, vio_true)

    def test_scatter_matrix(self):
        """Compare the layout of a scatter matrix with the stored template."""
        scm_test = self.pf.scatter_matrix([a, b, c], return_plot=True)['layout']
        scm_test = scm_test.to_plotly_json()
        scm_true = self.fopen("template_scm.json")
        self.assertEqual(scm_test, scm_true)

    def test_heatmap_df(self):
        """Check that heatmap_df warns and returns a non-empty figure."""
        df = pd.DataFrame(data=np.asarray([ah, bh, ch]).T,
                          columns=['ah', 'bh', 'ch'])
        x_labels = ['low', 'high']
        y_labels = ['small', 'large']
        with self.assertWarns(UserWarning):
            hmdf_test = self.pf.heatmap_df(df, x_labels=x_labels,
                                           y_labels=y_labels,
                                           return_plot=True)
        # Still load the template so that a missing/malformed file fails.
        self.fopen("template_hmdf.json")
        # NOTE(review): this used to read assertTrue(hmdf_test, hmdf_true).
        # The second argument of assertTrue is the failure message, so the
        # template was never compared; only truthiness was (and is) checked.
        # TODO: compare against the template once it is confirmed that the
        # figure can be converted to the same JSON form as the stored file.
        self.assertTrue(hmdf_test, "heatmap_df returned an empty figure")

    def test_triangle(self):
        """Check that triangle returns a non-empty figure for random data."""
        df = pd.DataFrame(np.random.rand(50, 3), columns=list('qwe'))
        triangle_test = self.pf.triangle(df[['q', 'w', 'e']], return_plot=True)
        # Still load the template so that a missing/malformed file fails.
        self.fopen("template_triangle.json")
        # NOTE(review): this used to read assertTrue(triangle_test,
        # triangle_true), which never compared the two. An exact comparison
        # cannot hold here anyway because the input is unseeded random data.
        self.assertTrue(triangle_test, "triangle returned an empty figure")
def pre():
    """Fit and cross-validate two regressors for the ``K_VRH`` column.

    Reads '引入结构中的密度.csv' from the working directory, uses ``K_VRH`` as
    the target and every column left after dropping ``excluded`` as features.
    A linear regression and a random forest are fitted on the full data and
    evaluated with 10-fold cross validation; the scores are printed and the
    predictions are plotted with PlotlyFig, together with a bar plot of the
    ten largest random-forest feature importances.

    Returns:
        None.
    """
    print("既然有了这么多数据,我们需要考虑好谁是输入,谁是输出")
    print("这里我们规定K-var作为预测项,然后就是其他所有的数字项都是features")
    df = pd.read_csv('引入结构中的密度.csv')
    print(df.columns)
    y = df['K_VRH'].values
    excluded = [
        "G_VRH", "K_VRH", "elastic_anisotropy", "formula", "material_id",
        "poisson_ratio", "structure", "composition", "composition_oxid"
    ]
    X = df.drop(excluded, axis=1)
    # The format string previously had a single placeholder, so the feature
    # names passed as second argument were silently dropped.
    print("现在有{}个可能的特征:\n\n{}".format(X.shape[1], X.columns.values))
    lr = LinearRegression()
    lr.fit(X, y)
    # Scores on the training data
    print("训练的r2是:{}".format(round(lr.score(X, y), 3)))
    print("训练后的RMSE是{:.3f}".format(
        np.sqrt(mean_squared_error(y_true=y, y_pred=lr.predict(X)))))

    # Cross validation. random_state is not passed: it has no effect when
    # shuffle=False and recent scikit-learn raises ValueError for that
    # combination. The folds are the same contiguous splits as before.
    crossvalidation = KFold(n_splits=10, shuffle=False)
    scores = cross_val_score(lr,
                             X,
                             y,
                             scoring='neg_mean_squared_error',
                             cv=crossvalidation,
                             n_jobs=1)
    print(scores)
    # The scorer returns negated MSE values, hence abs() before the root
    rmse_scores = [np.sqrt(abs(s)) for s in scores]

    print(rmse_scores)
    r2_scores = cross_val_score(lr,
                                X,
                                y,
                                scoring='r2',
                                cv=crossvalidation,
                                n_jobs=1)
    print(r2_scores)

    print("现在展示交叉验证的结果")
    # R2 is averaged as-is: taking abs() would turn a negative (worse than
    # baseline) fold score into a positive contribution.
    print("经过{}次交叉验证,r2的平均值是{:.3f}".format(len(scores),
                                           np.mean(r2_scores)))
    print("经过{}次交叉验证,rmse的平均值是{:.3f}".format(len(scores),
                                             np.mean(np.abs(rmse_scores))))

    # Plot cross-validated predictions against the reference values, with a
    # dashed y = x line as guide
    pf = PlotlyFig(x_title='DFT (MP) bulk modules(Gpa)',
                   y_title='Predicated bulk modules(Gpa)',
                   title='Linear regression',
                   filename='lr_regression.jpg')
    pf.xy(xy_pairs=[(y, cross_val_predict(lr, X, y, cv=crossvalidation)),
                    ([0, 400], [0, 400])],
          labels=df['formula'],
          modes=['markers', 'lines'],
          lines=[{}, {
              'color': 'black',
              'dash': 'dash'
          }],
          showlegends=False)
    print("这就是线性回归的威力,感觉还是不错的")
    print("\n\n")

    # Same procedure with a random forest
    rf = RandomForestRegressor(n_estimators=50, random_state=1)
    rf.fit(X, y)
    print("随机森林的r2是:{}".format(round(rf.score(X, y), 3)))
    print("随机森林的是RMSE是:{}".format(
        round(np.sqrt(mean_squared_error(y_true=y, y_pred=rf.predict(X))), 3)))

    # Bar plot of the ten most important features, most important first
    importances = rf.feature_importances_
    included = X.columns.values
    indices = np.argsort(importances)[::-1]

    pf = PlotlyFig(y_title='importance(%)',
                   title='Feature by importances',
                   fontsize=20,
                   ticksize=15)
    pf.bar(x=included[indices][0:10], y=importances[indices][0:10])

    scores = cross_val_score(rf,
                             X,
                             y,
                             scoring='neg_mean_squared_error',
                             cv=crossvalidation,
                             n_jobs=1)
    r2_scores = cross_val_score(rf,
                                X,
                                y,
                                scoring='r2',
                                cv=crossvalidation,
                                n_jobs=1)
    rmse_scores_rf = np.sqrt(abs(scores))

    print("对于随机森林,现在展示交叉验证的结果")
    print("经过{}次交叉验证,r2的平均值是{:.3f}".format(len(scores),
                                           np.mean(r2_scores)))
    print("经过{}次交叉验证,rmse的平均值是{:.3f}".format(len(scores),
                                             np.mean(np.abs(rmse_scores_rf))))
    print("请看随机森林的结果展示")
    pf_rf = PlotlyFig(x_title='DFT (MP) bulk modulus (GPa)',
                      y_title='Random forest bulk modulus (GPa)',
                      title='Random forest regression',
                      filename='rf_regression.html')

    # rf.predict(X) could be used here instead of the cross-validated
    # predictions
    pf_rf.xy([(y, cross_val_predict(rf, X, y, cv=crossvalidation)),
              ([0, 400], [0, 400])],
             labels=df['formula'],
             modes=['markers', 'lines'],
             lines=[{}, {
                 'color': 'black',
                 'dash': 'dash'
             }],
             showlegends=False)

    print("\n\n")
def pre_gap_forjiyuan():
    """Predict the ``gap`` and ``efm`` columns of the 2D feature table.

    All input comes from files in the working directory:

    1. '2d_bulk.csv' and 'vector_new_plustitle.csv' are loaded; the bulk
       ``is_daoti`` and ``gaps`` values are attached to every feature row
       whose ``id`` equals the numeric part of a bulk ``mp_id`` (first match
       wins). Rows without a match are forward-filled.
    2. The features are standardised. For the ``gap`` target a linear
       regression and a random forest are fitted and evaluated with shuffled
       5-fold cross validation.
    3. For the ``efm`` target the same is done with an SVR and a random
       forest.

    Scores are printed and the plots are produced with PlotlyFig.

    Returns:
        None.
    """
    # Load the bulk data and the previously built feature vectors
    data = pd.read_csv('2d_bulk.csv')
    print(data.columns)
    print(data.describe())
    df = pd.read_csv('vector_new_plustitle.csv')
    print(df.columns)
    print(df.describe())

    print("首先我们需要把之前bulk的is_daoti和预测的gaps放到vector中")
    print("有个问题,某些材料id没有对应信息,只能采取填充法,先规定nan,然后用平均值填充")

    print(len(df.index))

    # Build a lookup from the numeric part of mp_id (text after the first
    # three characters) to the bulk values. setdefault keeps the first row
    # for every id, like the early `break` of the former nested loop.
    # int() replaces eval() on file content, and no DataFrame.ix (removed
    # from pandas) is needed any more.
    bulk_by_id = {}
    for mp_id, is_daoti, bulk_gap in zip(data['mp_id'], data['is_daoti'],
                                         data['gaps']):
        bulk_by_id.setdefault(int(mp_id[3:]), (is_daoti, bulk_gap))
    unmatched = (np.nan, np.nan)
    matches = [bulk_by_id.get(material_id, unmatched)
               for material_id in df['id']]
    df['is_daoti'] = [is_daoti for is_daoti, _ in matches]
    df['bulk_gap'] = [bulk_gap for _, bulk_gap in matches]

    # NOTE: the message printed above talks about filling with the mean, but
    # the values are forward-filled from the previous row.
    df = df.ffill()
    # Show the rows that still contain missing values after the fill
    print(df[df.isnull().values == True])

    y_gap = df['gap'].values
    y_m = df['efm'].values

    unwanted = ['gap', 'id', 'efm']
    X_df = df.drop(unwanted, axis=1, inplace=False)
    X_gap = X_df.values

    X_gap = preprocessing.scale(X_gap)
    # Both targets share the same standardised feature matrix
    X_m = X_gap
    crossvalidation = KFold(n_splits=5, shuffle=True, random_state=2)

    # Linear regression on the gap
    print("首先进行线性回归")
    lr = LinearRegression()
    lr.fit(X_gap, y_gap)
    # Scores on the training data
    print("线性训练的r2是:{}".format(round(lr.score(X_gap, y_gap), 3)))
    print("训练后的RMSE是{:.3f}".format(
        np.sqrt(mean_squared_error(y_true=y_gap, y_pred=lr.predict(X_gap)))))
    scores = cross_val_score(lr,
                             X_gap,
                             y_gap,
                             scoring='neg_mean_squared_error',
                             cv=crossvalidation,
                             n_jobs=1)
    # The scorer returns negated MSE values, hence abs() before the root
    rmse_scores = [np.sqrt(abs(s)) for s in scores]
    print(rmse_scores)
    r2_scores = cross_val_score(lr,
                                X_gap,
                                y_gap,
                                scoring='r2',
                                cv=crossvalidation,
                                n_jobs=1)
    print("相关系数是:{}".format(r2_scores))
    print("现在展示交叉验证的结果")
    # R2 is averaged as-is, consistent with the other models below; abs()
    # would hide negative fold scores.
    print("经过{}次交叉验证,r2的平均值是{:.3f}".format(len(scores),
                                           np.mean(r2_scores)))
    print("经过{}次交叉验证,rmse的平均值是{:.3f}".format(len(scores),
                                             np.mean(np.abs(rmse_scores))))

    # Random forest on the gap
    print("开始进行随机森林的预测")
    rf = RandomForestRegressor(n_estimators=90,
                               max_features=10,
                               max_depth=12,
                               min_samples_split=2,
                               random_state=1)
    rf.fit(X_gap, y_gap)
    print("随机森林的r2是:{}".format(round(rf.score(X_gap, y_gap), 3)))
    print("随机森林的是RMSE是:{}".format(
        round(
            np.sqrt(mean_squared_error(y_true=y_gap,
                                       y_pred=rf.predict(X_gap))), 3)))

    # Bar plot of the ten most important features, most important first
    print("看一下回归效果和什么关系更加密切")
    importances = rf.feature_importances_
    included = X_df.columns.values
    indices = np.argsort(importances)[::-1]
    pf = PlotlyFig(y_title='importance(%)',
                   title='Feature by importances gap',
                   fontsize=20,
                   ticksize=15)
    pf.bar(x=included[indices][0:10], y=importances[indices][0:10])

    scores = cross_val_score(rf,
                             X_gap,
                             y_gap,
                             scoring='neg_mean_squared_error',
                             cv=crossvalidation,
                             n_jobs=1)
    r2_scores = cross_val_score(rf,
                                X_gap,
                                y_gap,
                                scoring='r2',
                                cv=crossvalidation,
                                n_jobs=1)
    rmse_scores_rf = np.sqrt(abs(scores))
    print("r2 分别是:{}".format(r2_scores))

    print("对于随机森林,现在展示交叉验证的结果")
    print("经过{}次交叉验证,r2的平均值是{:.3f}".format(len(scores), np.mean(r2_scores)))
    print("经过{}次交叉验证,rmse的平均值是{:.3f}".format(len(scores),
                                             np.mean(np.abs(rmse_scores_rf))))
    print("请看随机森林的结果展示")
    pf_rf = PlotlyFig(x_title='2d HSE calculate gap(ev)',
                      y_title='Random forest predicated 2d gap(ev)',
                      title='Random forest regression',
                      filename='rf_regression.html')

    # rf.predict(X_gap) could be used here instead of the cross-validated
    # predictions
    pf_rf.xy([(y_gap, cross_val_predict(rf, X_gap, y_gap, cv=crossvalidation)),
              ([0, 100], [0, 100])],
             labels=[],
             modes=['markers', 'lines'],
             lines=[{}, {
                 'color': 'black',
                 'dash': 'dash'
             }],
             showlegends=False)

    # Record of the hyper-parameter tuning behind the random forest above
    # (grid search, scoring='neg_mean_squared_error', cv=5):
    #   n_estimators in range(50, 130, 10), max_features in range(5, 15)
    #       -> n_estimators=90, max_features=10
    #   max_depth in range(4, 20, 2), min_samples_split in range(2, 5, 1)
    #       -> max_depth=12, min_samples_split=2

    # Effective mass: support vector regression
    print("开始进行有效质量预测")
    print("首先,支持向量机")
    svm_m = svm.SVR(gamma='scale', C=1.0)
    svm_m.fit(X=X_m, y=y_m)
    print("支持向量机的r2是{:.3f}".format(svm_m.score(X_m, y_m)))
    print("支持向量机的是RMSE是:{}".format(
        round(
            np.sqrt(mean_squared_error(y_true=y_m, y_pred=svm_m.predict(X_m))),
            3)))
    scores = cross_val_score(svm_m,
                             X_m,
                             y_m,
                             scoring='neg_mean_squared_error',
                             cv=crossvalidation,
                             n_jobs=1)
    r2_scores = cross_val_score(svm_m,
                                X_m,
                                y_m,
                                scoring='r2',
                                cv=crossvalidation,
                                n_jobs=1)
    rmse_scores_rf = np.sqrt(abs(scores))
    print("r2 分别是:{}".format(r2_scores))

    print("对于支持向量机,现在展示交叉验证的结果")
    print("经过{}次交叉验证,r2的平均值是{:.3f}".format(len(scores), np.mean(r2_scores)))
    print("经过{}次交叉验证,rmse的平均值是{:.3f}".format(len(scores),
                                             np.mean(np.abs(rmse_scores_rf))))

    # Effective mass: random forest
    print("没错,开始进行随机森林的预测")
    rf_m = RandomForestRegressor(n_estimators=120, random_state=1)
    rf_m.fit(X_m, y_m)
    print("随机森林的r2是:{}".format(round(rf_m.score(X_m, y_m), 3)))
    print("随机森林的是RMSE是:{}".format(
        round(
            np.sqrt(mean_squared_error(y_true=y_m, y_pred=rf_m.predict(X_m))),
            3)))

    # Bar plot of the ten most important features, most important first
    print("看一下预测有效质量效果和什么关系更加密切")
    importances = rf_m.feature_importances_
    included = X_df.columns.values
    indices = np.argsort(importances)[::-1]
    pf = PlotlyFig(y_title='importance(%)',
                   title='Feature by importances efm',
                   fontsize=20,
                   ticksize=15)
    pf.bar(x=included[indices][0:10], y=importances[indices][0:10])

    scores = cross_val_score(rf_m,
                             X_m,
                             y_m,
                             scoring='neg_mean_squared_error',
                             cv=crossvalidation,
                             n_jobs=1)
    r2_scores = cross_val_score(rf_m,
                                X_m,
                                y_m,
                                scoring='r2',
                                cv=crossvalidation,
                                n_jobs=1)
    rmse_scores_rf = np.sqrt(abs(scores))
    print("r2 分别是:{}".format(r2_scores))

    print("对于随机森林,现在展示交叉验证的结果")
    print("经过{}次交叉验证,r2的平均值是{:.3f}".format(len(scores), np.mean(r2_scores)))
    print("经过{}次交叉验证,rmse的平均值是{:.3f}".format(len(scores),
                                             np.mean(np.abs(rmse_scores_rf))))
    print("请看随机森林的结果展示")
    # NOTE(review): same filename as the gap plot above — confirm whether the
    # second plot is meant to replace the first file.
    pf_rf = PlotlyFig(x_title='2d efm calculate efm(ev)',
                      y_title='Random forest predicated 2d efm',
                      title='Random forest regression',
                      filename='rf_regression.html')

    # rf_m.predict(X_m) could be used here instead of the cross-validated
    # predictions
    pf_rf.xy([(y_m, cross_val_predict(rf_m, X_m, y_m, cv=crossvalidation)),
              ([0, 100], [0, 100])],
             labels=[],
             modes=['markers', 'lines'],
             lines=[{}, {
                 'color': 'black',
                 'dash': 'dash'
             }],
             showlegends=False)
    print("all work done!")
Beispiel #20
0
# Printing Validation Results
# R2 is averaged as-is: taking abs() first would count a negative (worse
# than baseline) fold score as a positive contribution to the mean.
print('Cross-validation results:')
print('Folds: %i, mean R2: %.3f' % (len(scores), np.mean(r2_scores)))
print('Folds: %i, mean RMSE: %.3f' %
      (len(scores), np.mean(np.abs(rmse_scores))))

# In[10]:

# Importing libraries for plotting
from matminer.figrecipes.plot import PlotlyFig
from sklearn.model_selection import cross_val_predict

# In[12]:

# Creates and plots experimental vs. predicted enthalpies; the second trace
# is a dashed black y = x reference line from 0 to 200.
pf = PlotlyFig(x_title='Experimental Enthalpy',
               y_title='Predicted Enthalpy',
               title='Random Forest Regressor',
               mode='notebook',
               filename="rf_regression.html")

pf.xy(xy_pairs=[(y, cross_val_predict(rf, X, y, cv=crossvalidation)),
                ([0, 200], [0, 200])],
      labels=df['comp'],
      modes=['markers', 'lines'],
      lines=[{}, {
          'color': 'black',
          'dash': 'dash'
      }],
      showlegends=False)
Beispiel #21
0
def create_plots(x_title, y_title, tp,
                 file_suffix, fontsize, ticksize, path, margins, fontfamily,
                 plot_data, mode='offline', names=None, labels=None,
                 x_label_short='', y_label_short=None, xy_modes='markers',
                 y_axis_type='linear', title=None, empty_markers=True,
                 **kwargs):
    """
    A wrapper function with args mostly consistent with
    matminer.figrecipes.plot.PlotlyFig

    Every trace gets its own marker symbol; with more traces than available
    symbols the symbols are reused cyclically.

    Args:
        x_title (str): label of the x-axis
        y_title (str): label of the y-axis
        tp (str): "n" or "p"; only used to build the default title when
            ``title`` is None
        file_suffix (str): small suffix for filename (NOT a file format)
        fontsize (int): passed on to PlotlyFig
        ticksize (int): passed on to PlotlyFig
        path (str): root folder where the plot will be saved.
        margins (float or [float]): figrecipe PlotlyFig margins
        fontfamily (str): passed on to PlotlyFig
        plot_data ([(x_data, y_data) tuples]): the actual data to be plotted
        mode (str): plot mode. "offline" and "static" recommended. "static"
            would automatically set the file format to .png
        names ([str]): names of the traces
        labels ([str]): the labels of the scatter points
        x_label_short (str): used for distinguishing filenames
        y_label_short (str): used for distinguishing filenames; defaults to
            ``y_title``
        xy_modes (str): mode of the xy scatter plots: "markers", "lines+markers"
        y_axis_type (str): e.g. "log" for logscale
        title (str): the title of the plot appearing at the top
        empty_markers (bool): whether the markers are empty (filled if False)
        **kwargs: other keyword arguments of matminer.figrecipes.plot.PlotlyFig
                for example, for setting plotly credential when mode=="static"

    Returns:
        None. The plot is produced through PlotlyFig.xy.

    Raises:
        KeyError: if ``title`` is None and ``tp`` is neither "n" nor "p".
    """
    from matminer.figrecipes.plot import PlotlyFig
    plot_data = list(plot_data)
    # Symbol codes 0-43 are used as-is for filled markers and shifted by 100
    # when empty markers are requested.
    symbol_shift = 100 if empty_markers else 0
    marker_symbols = [code + symbol_shift for code in range(44)]
    tp_title = {"n": "conduction band(s)", "p": "valence band(s)"}
    if title is None:
        title = '{} for {}'.format(y_title, tp_title[tp])
    if y_label_short is None:
        y_label_short = y_title
    if not x_label_short:
        filename = os.path.join(path, "{}_{}".format(
            y_label_short, file_suffix))
    else:
        filename = os.path.join(path, "{}_{}_{}".format(
            y_label_short, x_label_short, file_suffix))
    if mode == "static" and not filename.endswith(".png"):
        filename += ".png"
    # Cycle through the symbols so that more than len(marker_symbols) traces
    # no longer raise IndexError.
    markers = [{'symbol': marker_symbols[i % len(marker_symbols)],
                'line': {'width': 2, 'color': 'black'}}
               for i in range(len(plot_data))]
    pf = PlotlyFig(x_title=x_title, y_title=y_title, y_scale=y_axis_type,
                   title=title, fontsize=fontsize,
                   mode=mode, filename=filename, ticksize=ticksize,
                   margins=margins, fontfamily=fontfamily, **kwargs)
    pf.xy(plot_data, names=names, labels=labels, modes=xy_modes,
          marker_scale=1.1, markers=markers)
# In[6]:

# Difference between test_mean and train_mean for the first ten entries
for i in range(10):
    print(test_mean[i] - train_mean[i])

# In[7]:

# Evaluation with a prediction plot: ideally every point lies on the dashed
# line, so the distance to it shows how far a prediction is from the
# labelled value.

from matminer.figrecipes.plot import PlotlyFig
from sklearn.model_selection import cross_val_predict
crossvalidation = KFold(n_splits=10, shuffle=True, random_state=1)
pf = PlotlyFig(
    x_title='bulk modulus(GPa)',
    y_title='Predicted bulk modulus(GPa)',
    title='Ridge regression',
    mode='notebook',
    filename="Ridge_regression.html",
)
pf.xy(
    xy_pairs=[
        (y, cross_val_predict(sgd, x, y, cv=crossvalidation)),
        ([40, 300], [40, 300]),
    ],
    modes=['markers', 'lines'],
    lines=[{}, {'color': 'black', 'dash': 'dash'}],
    showlegends=False,
)

# In[ ]:
Beispiel #23
0
def plot_expt_compt_band_gaps(citrine_api_key, limit=0):
    """
    Pull experimental band gaps from Citrine (w/o dataset limitations) and
        compare them with the DFT computed band gaps (data from
        materialsproject.org) in an xy scatter plot, followed by a plot of
        the residuals. To compare the right values, the computed band gap is
        taken from the entry with the lowest energy above hull (the most
        stable structure) for each chemical formula.
    Args:
        citrine_api_key (str): Your Citrine API key for getting data. Don't have
            a Citrine account? Visit https://citrine.io/
        limit (int): limit the number of entries (0 means no limit)
    Returns:
        plotly plots in "offline" mode popped in the default browser.
    """

    # pull experimental band gaps from Citrine; `limit` is forwarded as the
    # 'max_results' criterion
    cdr = CitrineDataRetrieval(api_key=citrine_api_key)
    cols = ['chemicalFormula', 'Band gap']
    df_ct = cdr.get_dataframe(criteria={'data_type':'experimental',
                                        'max_results':limit},
                              secondary_fields=True,
                              properties=['Band gap'])
    # keep only the formula and band gap columns, under friendlier names
    df_ct = df_ct[cols].rename(columns={'chemicalFormula': 'Formula',
                                        'Band gap': 'Expt. gap'})
    df_ct = df_ct[df_ct['Formula'] != 'In1p1'] # p1 not recognized in Composition
    df_ct = df_ct.dropna() # null band gaps cause problem when plotting residuals
    # replace every formula by its reduced formula so that it can be matched
    # against 'pretty_formula' below
    df_ct['Formula'] = df_ct['Formula'].transform(
        lambda x: Composition(x).get_reduced_formula_and_factor()[0])

    # pull computational band gaps from the Materials Project for exactly the
    # formulas found above
    df = MPDataRetrieval().get_dataframe(
        criteria={'pretty_formula': {'$in': list(df_ct['Formula'].values)}},
        properties=['pretty_formula', 'material_id', 'band_gap', 'e_above_hull'],
        index_mpid=False).rename(
        columns={'pretty_formula': 'Formula', 'band_gap': 'MP computed gap',
                 'material_id': 'mpid'})


    # pick the most stable structure: for every formula keep the row with the
    # smallest e_above_hull
    df_mp = df.loc[df.groupby("Formula")["e_above_hull"].idxmin()]
    # join experimental and computed values on the formula and index by mpid
    df_final = df_ct.merge(df_mp, on='Formula').drop(
                                    'e_above_hull', axis=1).set_index('mpid')
    pf = PlotlyFig(df_final, x_title='Experimental band gap (eV)',
                   y_title='Computed Band Gap (eV)',
                   filename='band_gaps')

    # computed vs. experimental band gap; the second pair draws a dashed
    # black line from (0, 0) to (12, 12)
    pf.xy([
        ('Expt. gap', 'MP computed gap'),
        ([0, 12], [0, 12])
    ],
        lines=[{}, {'color': 'black', 'dash': 'dash'}],
        labels=['Formula', df_final.index],
        modes=['markers', 'lines'],
        names=['Computed vs. expt.', 'Expt. gap'])

    # residual: computed minus experimental gap (the experimental column is
    # cast to float first)
    residuals = df_final['MP computed gap']-df_final['Expt. gap'].astype(float)
    # reuse the same figure object with new axis titles and output filename
    pf.set_arguments(x_title='Experimental band gap (eV)',
                    y_title='Residual (Computed - Expt.) Band Gap (eV)',
                    filename='band_gap_residuals')
    pf.xy(('Expt. gap', residuals),
          labels = ['Formula', df_final.index])