Example #1
def train_split():

    name='full_vector_all.csv'
    df=pd.read_csv(name,index_col = [0])   
    
    y=df['is_daoti'].values
    unwanted_columns=['band_gap.optimize_structure_gap','gaps','full_formula','composition','composition_oxid','is_daoti']
    X_df=df.drop(unwanted_columns,axis=1,inplace=False)
    print(X_df[X_df.isnull().values==True]) 
    X=X_df.values
      
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=1)
    print("\n")
    print("我们看下这些分开的不同测试集合训练集")   
    
    rf_reg=RandomForestClassifier(n_estimators=100,random_state=1)
    rf_reg.fit(X_train,y_train)
    #Now check performance after the train/test split
    print("Training set accuracy: {:.3f}".format(rf_reg.score(X_train,y_train)))
    print("Training set RMSE: {:.3f}".format(np.sqrt(mean_squared_error(y_true=y_train,y_pred=rf_reg.predict(X_train)))))
    print("Test set accuracy: {:.3f}".format(rf_reg.score(X_test,y_test)))
    print("Test set RMSE: {:.3f}".format(np.sqrt(mean_squared_error(y_true=y_test,y_pred=rf_reg.predict(X_test)))))

    #Then plot the residuals
    pf_rf=PlotlyFig(x_title='is_daoti prediction residual',y_title='Probability',title='Random forest classification residuals',filename='rf_regression_residuals.html')
    hist_plot=pf_rf.histogram(data=[y_train-rf_reg.predict(X_train),y_test-rf_reg.predict(X_test)],histnorm='probability',colors=['blue','red'],return_plot=True)
    hist_plot['data'][0]['name']='train'
    hist_plot['data'][1]['name']='test'
    pf_rf.create_plot(hist_plot)

    print('\n\n')
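The snippets in this listing are scraped without their import blocks. A minimal sketch of the imports Example #1 appears to assume (module origins inferred from the calls above, not shown in the source):

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from matminer.figrecipes.plot import PlotlyFig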
Example #2
def advanced_histogram():
    """
    This is a work in progress
    """

    df = load_elastic_tensor()
    pf = PlotlyFig(df, title="Various Histograms")
    pf.histogram(cols=['G_Reuss', 'G_VRH', 'G_Voigt'], bins={'size': 10})
Example #3
def plot_simple_xy():
    """
    Very simple xy plot with all default settings.
    Returns:
        plotly plot in "offline" mode popped in the default browser.
    """
    pf = PlotlyFig(title="Basic Example", filename="basic.html")
    pf.xy(([1, 2, 3], [4, 5, 6]))
Example #4
def advanced_histogram():
    """
    This is a work in progress
    """

    df = load_elastic_tensor()
    pf = PlotlyFig(df, title="Various Histograms")
    pf.histogram(cols=['G_Reuss', 'G_VRH', 'G_Voigt'], bins={'size': 10})
Example #5
def plot_simple_xy():
    """
    Very simple xy plot with all default settings.
    Returns:
        plotly plot in "offline" mode popped in the default browser.
    """
    pf = PlotlyFig(title="Basic Example", filename="basic.html")
    pf.xy(([1, 2, 3], [4, 5, 6]))
Example #6
def advanced_histogram():
    """
    This is a work in progress
    """

    # the G_Reuss/G_VRH/G_Voigt columns belong to the elastic tensor dataset
    df = load_dataset("elastic_tensor_2015")
    pf = PlotlyFig(df, title="Various Histograms")
    pf.histogram(cols=['G_Reuss', 'G_VRH', 'G_Voigt'], bins={'size': 10})
Example #7
def basic_histogram():
    """
    Here we plot a basic histogram showing the distribution of band gaps
    in the matminer dielectric constant dataset, originally taken from Petousis
    et al., 2017.
    """
    df = load_dielectric_constant()
    pf = PlotlyFig(title="Distribution of Band Gaps in the Dielectric Constant "
                         "Dataset",
                   x_title="Band Gap (eV)",
                   hoverinfo='y')
    pf.histogram(df['band_gap'])
Example #8
def basic_histogram():
    """
    Here we plot a basic histogram showing the distribution of band gaps
    in the matminer dielectric constant dataset, originally taken from Petousis
    et al., 2017.
    """
    df = load_dielectric_constant()
    pf = PlotlyFig(
        title="Distribution of Band Gaps in the Dielectric Constant "
        "Dataset",
        x_title="Band Gap (eV)",
        hoverinfo='y')
    pf.histogram(df['band_gap'])
Example #9
def train_split():

    df = pd.read_csv('引入结构中的密度.csv')
    y = df['K_VRH'].values
    excluded = [
        "G_VRH", "K_VRH", "elastic_anisotropy", "formula", "material_id",
        "poisson_ratio", "structure", "composition", "composition_oxid"
    ]
    X = df.drop(excluded, axis=1)
    X['formula'] = df['formula']
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=1)
    print("\n")
    print("我们看下这些分开的不同测试集合训练集")
    print("x训练集是\n{}".format(X_train))
    #time.sleep(30)
    train_formula = X_train['formula']
    X_train = X_train.drop('formula', axis=1)
    test_formula = X_test['formula']
    X_test = X_test.drop('formula', axis=1)
    rf_reg = RandomForestRegressor(n_estimators=100, random_state=1)
    rf_reg.fit(X_train, y_train)
    #Now check performance after the train/test split
    print("Training set r2: {:.3f}".format(rf_reg.score(X_train, y_train)))
    print("Training set RMSE: {:.3f}".format(
        np.sqrt(
            mean_squared_error(y_true=y_train,
                               y_pred=rf_reg.predict(X_train)))))
    print("Test set r2: {:.3f}".format(rf_reg.score(X_test, y_test)))
    print("Test set RMSE: {:.3f}".format(
        np.sqrt(
            mean_squared_error(y_true=y_test, y_pred=rf_reg.predict(X_test)))))

    #Then plot the residuals
    pf_rf = PlotlyFig(x_title='Bulk modulus prediction residual (GPa)',
                      y_title='Probability',
                      title='Random forest regression residuals',
                      filename='rf_regression_residuals.html')
    hist_plot = pf_rf.histogram(data=[
        y_train - rf_reg.predict(X_train), y_test - rf_reg.predict(X_test)
    ],
                                histnorm='probability',
                                colors=['blue', 'red'],
                                return_plot=True)
    hist_plot['data'][0]['name'] = 'train'
    hist_plot['data'][1]['name'] = 'test'
    pf_rf.create_plot(hist_plot)

    print('\n\n')
Example #10
def plot_bulk_shear_moduli():
    """
    Very basic example of an xy scatter plot of the Voigt-Reuss-Hill (VRH)
        average bulk vs. shear modulus. The Poisson ratio is used as the marker
        color to distinguish materials with different bulk/shear modulus ratios.
    Returns:
        plotly plot in "offline" mode popped in the default browser.
    """
    df = load_elastic_tensor()
    pf = PlotlyFig(df,
                   y_title='Bulk Modulus (GPa)',
                   x_title='Shear Modulus (GPa)',
                   filename='bulk_shear_moduli.jpeg')
    pf.xy(('G_VRH', 'K_VRH'), labels='material_id', colors='poisson_ratio',
          colorscale='Picnic', limits={'x': (0, 300)})
Example #11
def plot_expt_compt_band_gaps(citrine_api_key, limit=0):
    """
    Pulls experimental band gaps from Citrine (w/o dataset limitations) and
        compares them with DFT-computed band gaps (data from
        materialsproject.org) in an xy scatter plot. To compare the right
        values, we pick the computed band gap of the chemical formula with the
        lowest energy above hull (the most stable structure).
    Args:
        citrine_api_key (str): Your Citrine API key for getting data. Don't have
            a Citrine account? Visit https://citrine.io/
        limit (int): limit the number of entries (0 means no limit)
    Returns:
        plotly plots in "offline" mode popped in the default browser.
    """

    # pull experimental band gaps from Citrine
    cdr = CitrineDataRetrieval(api_key=citrine_api_key)
    cols = ['chemicalFormula', 'Band gap']
    df_ct = cdr.get_dataframe(prop='band gap', data_type='experimental',
                              show_columns=cols, max_results=limit).rename(
        columns={'chemicalFormula': 'Formula', 'Band gap': 'Expt. gap'})
    df_ct = df_ct[df_ct['Formula'] != 'In1p1'] # p1 not recognized in Composition
    df_ct = df_ct.dropna() # null band gaps cause problem when plotting residuals
    df_ct['Formula'] = df_ct['Formula'].transform(
        lambda x: Composition(x).get_reduced_formula_and_factor()[0])

    # pull computational band gaps from the Materials Project
    df = MPDataRetrieval().get_dataframe(
        criteria={'pretty_formula': {'$in': list(df_ct['Formula'].values)}},
        properties=['pretty_formula', 'material_id', 'band_gap', 'e_above_hull'],
        index_mpid=False).rename(
        columns={'pretty_formula': 'Formula', 'band_gap': 'MP computed gap',
                 'material_id': 'mpid'})


    # pick the most stable structure
    df_mp = df.loc[df.groupby("Formula")["e_above_hull"].idxmin()]
    df_final = df_ct.merge(df_mp, on='Formula').drop(
                                    'e_above_hull', axis=1).set_index('mpid')
    pf = PlotlyFig(df_final, x_title='Experimental band gap (eV)',
                   y_title='Computed Band Gap (eV)',
                   filename='band_gaps')

    # computed vs. experimental band gap:
    pf.xy([
        ('Expt. gap', 'MP computed gap'),
        ([0, 12], [0, 12])
    ],
        lines=[{}, {'color': 'black', 'dash': 'dash'}],
        labels=df_final.index, modes=['markers', 'lines'],
        names=['Computed vs. expt.', 'Expt. gap'])

    # residual:
    residuals = df_final['MP computed gap']-df_final['Expt. gap'].astype(float)
    pf.set_arguments(x_title='Experimental band gap (eV)',
                    y_title='Residual (Computed - Expt.) Band Gap (eV)',
                    filename='band_gap_residuals')
    pf.xy(('Expt. gap', residuals), labels = df_final.index)
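The "pick the most stable structure" step above hinges on groupby(...)["e_above_hull"].idxmin(); here is a small self-contained sketch of the same pattern on made-up data (the rows and values are illustrative only):

import pandas as pd

toy = pd.DataFrame({'Formula': ['GaAs', 'GaAs', 'Si'],
                    'mpid': ['mp-1', 'mp-2', 'mp-3'],
                    'e_above_hull': [0.05, 0.0, 0.0]})
# idxmin() gives the row label of the lowest energy above hull per formula,
# and .loc keeps exactly those rows
most_stable = toy.loc[toy.groupby('Formula')['e_above_hull'].idxmin()]
print(most_stable)  # keeps mp-2 for GaAs and mp-3 for Si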
Example #12
def plot_thermoelectrics(citrine_api_key, limit=0):
    """
    Scatter plot of the properties of thermoelectric materials based on the data
        available in http://www.mrl.ucsb.edu:8080/datamine/thermoelectric.jsp
        The data is extracted via Citrine data retrieval tools. The dataset
        id on Citrine is 150557
    Args:
        citrine_api_key (str): Your Citrine API key for getting data. Don't have
            a Citrine account? Visit https://citrine.io/
        limit (int): limit the number of entries (0 means no limit)
    Returns:
        plotly plot in "offline" mode popped in the default browser.
    """
    cdr = CitrineDataRetrieval(api_key=citrine_api_key)
    cols = [
        'Electrical resistivity', 'Seebeck coefficient',
        'Thermal conductivity', 'Thermoelectric figure of merit (zT)'
    ]
    df_te = cdr.get_dataframe(
        criteria={
            'data_type': 'experimental',
            'data_set_id': 150557,
            'max_results': limit
        },
        properties=['Seebeck coefficient'],
        secondary_fields=True,
    )
    df_te[cols] = df_te[cols].astype(float)
    df_te = df_te[(df_te['Electrical resistivity'] > 5e-4) & \
                  (df_te['Electrical resistivity'] < 0.1)]
    df_te = df_te[abs(df_te['Seebeck coefficient']) < 500].rename(
        columns={'Thermoelectric figure of merit (zT)': 'zT'})

    print(df_te.head())
    pf = PlotlyFig(df_te,
                   x_scale='log',
                   fontfamily='Times New Roman',
                   hovercolor='white',
                   x_title='Electrical Resistivity (cm/S)',
                   y_title='Seebeck Coefficient (uV/K)',
                   colorbar_title='Thermal Conductivity (W/m.K)',
                   filename='thermoelectrics.html')
    pf.xy(('Electrical resistivity', 'Seebeck coefficient'),
          labels=['chemicalFormula', 'Preparation method', 'Crystallinity'],
          sizes='zT',
          colors='Thermal conductivity',
          color_range=[0, 5])
Example #13
def plot_bulk_shear_moduli():
    """
    Very basic example of an xy scatter plot of the Voigt-Reuss-Hill (VRH)
        average bulk vs. shear modulus. The Poisson ratio is used as the marker
        color to distinguish materials with different bulk/shear modulus ratios.
    Returns:
        plotly plot in "offline" mode popped in the default browser.
    """
    df = load_elastic_tensor()
    pf = PlotlyFig(df,
                   y_title='Bulk Modulus (GPa)',
                   x_title='Shear Modulus (GPa)',
                   filename='bulk_shear_moduli')
    pf.xy(('G_VRH', 'K_VRH'),
          labels='material_id',
          colors='poisson_ratio',
          colorscale='Picnic')
Example #14
def plot_thermoelectrics(citrine_api_key, limit=0):
    """
    Scatter plot of the properties of thermoelectric materials based on the data
        available in http://www.mrl.ucsb.edu:8080/datamine/thermoelectric.jsp
        The data is extracted via Citrine data retrieval tools. The dataset
        id on Citrine is 150557
    Args:
        citrine_api_key (str): Your Citrine API key for getting data. Don't have
            a Citrine account? Visit https://citrine.io/
        limit (int): limit the number of entries (0 means no limit)
    Returns:
        plotly plot in "offline" mode popped in the default browser.
    """
    cdr = CitrineDataRetrieval(api_key=citrine_api_key)
    cols = ['Electrical resistivity', 'Seebeck coefficient',
            'Thermal conductivity', 'Thermoelectric figure of merit (zT)']
    df_te = cdr.get_dataframe(criteria={'data_type': 'experimental',
                                        'data_set_id': 150557,
                                        'max_results': limit},
                              properties=['Seebeck coefficient'],
                              secondary_fields=True,
                              )
    df_te[cols] = df_te[cols].astype(float)
    df_te = df_te[(df_te['Electrical resistivity'] > 5e-4) & \
                  (df_te['Electrical resistivity'] < 0.1)]
    df_te = df_te[abs(df_te['Seebeck coefficient']) < 500].rename(
                columns={'Thermoelectric figure of merit (zT)': 'zT'})

    print(df_te.head())
    pf = PlotlyFig(df_te,
                   x_scale='log',
                   fontfamily='Times New Roman',
                   hovercolor='white',
                   x_title='Electrical Resistivity (cm/S)',
                   y_title='Seebeck Coefficient (uV/K)',
                   colorbar_title='Thermal Conductivity (W/m.K)',
                   filename='thermoelectrics.html')
    pf.xy(('Electrical resistivity', 'Seebeck coefficient'),
          labels=['chemicalFormula', 'Preparation method', 'Crystallinity'],
          sizes='zT',
          colors='Thermal conductivity',
          color_range=[0, 5])
Example #15
def refresh_json(open_plots=False):
    """
    For developer use. Refresh the json files and open plots to see if they
    look good. Use this function to set the current PlotlyFig build outputs
    as the true values of the tests.

    Args:
        open_plots (bool): If True, opens all plots generated. Useful if you
            want to check the current build outputs to make sure they look good.
            If False, just generates the json files and quits.
    """

    pf = PlotlyFig(**pfkwargs)
    xys = pf.xy([(a, b)], return_plot=True)
    xym = pf.xy([(a, b), (b, a)], return_plot=True)
    xy_colors = pf.xy([(a, b), (a, c), (c, c)],
                    modes=['markers', 'markers+lines', 'lines'],
                    colors=[c, 'red', 'blue'], return_plot=True)
    hmb = pf.heatmap_basic([a, b, c], xlabels, ylabels, return_plot=True)
    his = pf.histogram(a + b + c, n_bins=5, return_plot=True)
    bar = pf.bar(x=a, y=b, labels=xlabels, return_plot=True)
    pcp = pf.parallel_coordinates([a, b], cols=xlabels, return_plot=True)

    # Layout is compared for the plots which always convert to dataframes,
    # as dataframes are not easily encoded by json.dump
    vio = pf.violin([a, b, c, b, a, c, b], cols=xlabels, return_plot=True)
    scm = pf.scatter_matrix([a, b, c], return_plot=True)

    df = pd.DataFrame(data=np.asarray([ah, bh, ch]).T,
                      columns=['ah', 'bh', 'ch'])
    x_labels = ['low', 'high']
    y_labels = ['small', 'large']
    # TODO: this plot was not JSON serializable, use a different serialization method for all plots
    hmdf = pf.heatmap_df(df, x_labels=x_labels, y_labels=y_labels, return_plot=True)

    df = pd.DataFrame(np.random.rand(50, 3), columns=list('qwe'))
    triangle = pf.triangle(df[['q', 'w', 'e']], return_plot=True)

    fnamedict = {"xys": xys, "xym": xym, "xy_colors":xy_colors,
                 "hmb": hmb, "his": his, "bar": bar,
                 "pcp": pcp, "vio": vio, "scm": scm,
                 'triangle': triangle,
                 'hmdf': hmdf
                 }

    for fname, obj in fnamedict.items():
        if fname in ["vio", "scm"]:
            obj = obj['layout']

        with open("template_{}.json".format(fname), "w") as f:
            json.dump(obj, f, cls=MontyEncoder)

    if open_plots:
        for obj in fnamedict.values():
            pf.create_plot(obj, return_plot=False)
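The json.dump calls above rely on MontyEncoder because the plot dicts contain numpy objects that the stock JSON encoder rejects; a short sketch of that behaviour in isolation:

import json
import numpy as np
from monty.json import MontyEncoder

payload = {"x": np.arange(3), "y": np.float64(1.5)}
# json.dumps(payload) alone raises TypeError; MontyEncoder serializes the
# numpy array and scalar to plain lists/floats
print(json.dumps(payload, cls=MontyEncoder))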
Example #16

def get_mp_bandgap(formula):
    # Given a chemical formula, return the band gap of the most stable entry
    # The MP database lookup needs an integer chemical formula
    reduced_formula = Composition(formula).get_integer_formula_and_factor()[0]
    struct_list = mpr.get_data(reduced_formula)
    if struct_list:
        return sorted(struct_list,
                      key=lambda e: e['energy_per_atom'])[0]['band_gap']


df['Computed band gap'] = df['chemicalFormula'].apply(get_mp_bandgap)

from matminer.figrecipes.plot import PlotlyFig

pf = PlotlyFig(df,
               x_title='Experimental band gap (eV)',
               y_title='Computed band gap (eV)',
               mode='notebook',
               fontsize=20,
               ticksize=15)
pf.xy([('Experimental band gap', 'Computed band gap'), ([0, 10], [0, 10])],
      modes=['markers', 'lines'],
      lines=[{}, {
          'color': 'black',
          'dash': 'dash'
      }],
      labels='chemicalFormula',
      showlegends=False)
df.head()
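This fragment uses mpr and df without defining them. A sketch of the setup it presumably relies on; the API key is a placeholder, and df is assumed to be a dataframe with 'chemicalFormula' and 'Experimental band gap' columns (e.g. pulled from Citrine as in Example #11):

from pymatgen.core import Composition
from pymatgen.ext.matproj import MPRester

mpr = MPRester("YOUR_API_KEY")  # placeholder key from materialsproject.org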
Example #17
def create_plots(x_title, y_title, tp,
                 file_suffix, fontsize, ticksize, path, margins, fontfamily,
                 plot_data, mode='offline', names=None, labels=None,
                 x_label_short='', y_label_short=None, xy_modes='markers',
                 y_axis_type='linear', title=None, empty_markers=True,
                 **kwargs):
    """
    A wrapper function with args mostly consistent with
    matminer.figrecipes.plot.PlotlyFig

    Args:
        x_title (str): label of the x-axis
        y_title (str): label of the y-axis
        tp (str): "n" or "p"
        file_suffix (str): small suffix for filename (NOT a file format)
        fontsize (int):
        ticksize (int):
        path (str): root folder where the plot will be saved.
        margins (float or [float]): figrecipe PlotlyFig margins
        fontfamily (str):
        plot_data ([(x_data, y_data) tuples]): the actual data to be plotted
        mode (str): plot mode. "offline" and "static" recommended. "static"
            would automatically set the file format to .png
        names ([str]): names of the traces
        labels ([str]): the labels of the scatter points
        x_label_short (str): used for distinguishing filenames
        y_label_short (str):  used for distinguishing filenames
        xy_modes (str): mode of the xy scatter plots: "markers", "lines+markers"
        y_axis_type (str): e.g. "log" for logscale
        title (str): the title of the plot appearing at the top
        empty_markers (bool): whether the markers are empty (filled if False)
        **kwargs: other keyword arguments of matminer.figrecipes.plot.PlotlyFig
                for example, for setting plotly credential when mode=="static"

    Returns:
        None; the plot is generated according to mode and filename.

    """
    from matminer.figrecipes.plot import PlotlyFig
    plot_data = list(plot_data)
    marker_symbols = range(44)
    if empty_markers:
        marker_symbols = [i+100 for i in marker_symbols]
    tp_title = {"n": "conduction band(s)", "p": "valence band(s)"}
    if title is None:
        title = '{} for {}'.format(y_title, tp_title[tp])
    if y_label_short is None:
        y_label_short = y_title
    if not x_label_short:
        filename = os.path.join(path, "{}_{}".format(
            y_label_short, file_suffix))
    else:
        filename = os.path.join(path, "{}_{}_{}".format(
            y_label_short, x_label_short, file_suffix))
    if mode == "static":
        if not filename.endswith(".png"):
            filename += ".png"
    pf = PlotlyFig(x_title=x_title, y_title=y_title, y_scale=y_axis_type,
                   title=title, fontsize=fontsize,
                   mode=mode, filename=filename, ticksize=ticksize,
                   margins=margins, fontfamily=fontfamily, **kwargs)
    pf.xy(plot_data, names=names, labels=labels, modes=xy_modes,
          marker_scale=1.1, markers=[{'symbol': marker_symbols[i],
                                      'line': {'width': 2, 'color': 'black'}}
                                     for i, _ in enumerate(plot_data)])
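A hypothetical call showing how plot_data, names, and the filename-related arguments fit together; every value below is made up for illustration, and y_label_short is set only to keep the output filename simple:

create_plots(x_title='Carrier concentration (cm-3)',
             y_title='Seebeck coefficient (uV/K)',
             tp='n',
             file_suffix='demo',
             fontsize=18,
             ticksize=14,
             path='.',
             margins=100,
             fontfamily='Arial',
             plot_data=[([1e18, 1e19, 1e20], [250.0, 180.0, 90.0])],
             names=['sample A'],
             y_label_short='seebeck')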
Example #18
class PlotlyFigTest(PymatgenTest):
    def setUp(self):
        self.pf = PlotlyFig(**pfkwargs)
        self.base_dir = os.path.dirname(os.path.realpath(__file__))

    def fopen(self, fname):
        fname = self.base_dir + "/" + fname
        with open(fname, 'r') as f:
            return json.load(f)

    def test_xy(self):
        # Single trace
        xys_test = self.pf.xy([(a, b)], return_plot=True)
        xys_test['data'] = [p.to_plotly_json() for p in xys_test['data']]
        xys_true = self.fopen("template_xys.json")
        self.assertTrue(xys_test == xys_true)

        # Multi trace
        xym_test = self.pf.xy([(a, b), (b, a)], return_plot=True)
        xym_test['data'] = [p.to_plotly_json() for p in xym_test['data']]
        xym_true = self.fopen("template_xym.json")
        self.assertTrue(xym_test == xym_true)

        xy_colors_test = self.pf.xy([(a, b), (a, c), (c, c)],
                      modes=['markers', 'markers+lines', 'lines'],
                      colors=[c, 'red', 'blue'], return_plot=True)
        xy_colors_test['data'] = [p.to_plotly_json() for p in xy_colors_test['data']]
        xy_colors_true = self.fopen("template_xy_colors.json")
        self.assertTrue(xy_colors_test == xy_colors_true)

    def test_heatmap_basic(self):
        hmb_test = self.pf.heatmap_basic([a, b, c], xlabels, ylabels,
                                         return_plot=True)
        hmb_test['data'] = [p.to_plotly_json() for p in hmb_test['data']]
        hmb_true = self.fopen("template_hmb.json")
        self.assertTrue(hmb_test == hmb_true)

    def test_histogram(self):
        his_test = self.pf.histogram(a + b + c, n_bins=5, return_plot=True)
        his_test['data'] = [p.to_plotly_json() for p in his_test['data']]
        his_true = self.fopen("template_his.json")
        self.assertTrue(his_test == his_true)

    def test_bar(self):
        bar_test = self.pf.bar(x=a, y=b, labels=xlabels, return_plot=True)
        bar_test['data'] = [p.to_plotly_json() for p in bar_test['data']]
        bar_true = self.fopen("template_bar.json")
        self.assertTrue(bar_test == bar_true)

    def test_parallel_coordinates(self):
        pcp_test = self.pf.parallel_coordinates([a, b], cols=xlabels,
                                                return_plot=True)
        pcp_test['data'] = [p.to_plotly_json() for p in pcp_test['data']]
        pcp_true = self.fopen("template_pcp.json")
        self.assertTrue(pcp_test == pcp_true)

    def test_violin(self):
        vio_test = self.pf.violin(
            [a, b, c, b, a, c, b], cols=xlabels, return_plot=True)['layout']
        vio_test = vio_test.to_plotly_json()
        vio_true = self.fopen("template_vio.json")

        # Avoid errors from CircleCI's different plotly config
        for vio in [vio_test, vio_true]:
            vio["xaxis"]["range"] = [-0.167009, 0.167009]
        self.assertDictEqual(vio_test, vio_true)

    def test_scatter_matrix(self):
        scm_test = self.pf.scatter_matrix([a, b, c], return_plot=True)['layout']
        scm_test = scm_test.to_plotly_json()
        scm_true = self.fopen("template_scm.json")
        self.assertTrue(scm_test == scm_true)

    def test_heatmap_df(self):

        df = pd.DataFrame(data=np.asarray([ah, bh, ch]).T,
                          columns=['ah', 'bh', 'ch'])
        x_labels = ['low', 'high']
        y_labels = ['small', 'large']
        with self.assertWarns(UserWarning):
            hmdf_test = self.pf.heatmap_df(df, x_labels=x_labels,
                                           y_labels=y_labels,
                                           return_plot=True)
        hmdf_true = self.fopen("template_hmdf.json")
        self.assertTrue(hmdf_test, hmdf_true)

    def test_triangle(self):
        df = pd.DataFrame(np.random.rand(50, 3), columns=list('qwe'))
        triangle_test = self.pf.triangle(df[['q', 'w', 'e']], return_plot=True)
        triangle_true = self.fopen("template_triangle.json")
        self.assertTrue(triangle_test, triangle_true)
Example #19
class PlotlyFigTest(PymatgenTest):
    def setUp(self):
        self.pf = PlotlyFig(**pfkwargs)
        self.base_dir = os.path.dirname(os.path.realpath(__file__))

    def fopen(self, fname):
        fname = self.base_dir + "/" + fname
        with open(fname, 'r') as f:
            return json.load(f)

    def test_xy(self):
        # Single trace
        xys_test = self.pf.xy([(a, b)], return_plot=True)
        xys_test['data'] = [p.to_plotly_json() for p in xys_test['data']]
        xys_true = self.fopen("template_xys.json")
        self.assertTrue(xys_test == xys_true)

        # Multi trace
        xym_test = self.pf.xy([(a, b), (b, a)], return_plot=True)
        xym_test['data'] = [p.to_plotly_json() for p in xym_test['data']]
        xym_true = self.fopen("template_xym.json")
        self.assertTrue(xym_test == xym_true)

        xy_colors_test = self.pf.xy(
            [(a, b), (a, c), (c, c)],
            modes=['markers', 'markers+lines', 'lines'],
            colors=[c, 'red', 'blue'],
            return_plot=True)
        xy_colors_test['data'] = [
            p.to_plotly_json() for p in xy_colors_test['data']
        ]
        xy_colors_true = self.fopen("template_xy_colors.json")
        self.assertTrue(xy_colors_test == xy_colors_true)

    def test_heatmap_basic(self):
        hmb_test = self.pf.heatmap_basic([a, b, c],
                                         xlabels,
                                         ylabels,
                                         return_plot=True)
        hmb_test['data'] = [p.to_plotly_json() for p in hmb_test['data']]
        hmb_true = self.fopen("template_hmb.json")
        self.assertEqual(hmb_test, hmb_true)

    def test_histogram(self):
        his_test = self.pf.histogram(a + b + c, n_bins=5, return_plot=True)
        his_test['data'] = [p.to_plotly_json() for p in his_test['data']]
        his_true = self.fopen("template_his.json")
        self.assertTrue(his_test == his_true)

    def test_bar(self):
        bar_test = self.pf.bar(x=a, y=b, labels=xlabels, return_plot=True)
        bar_test['data'] = [p.to_plotly_json() for p in bar_test['data']]
        bar_true = self.fopen("template_bar.json")
        self.assertTrue(bar_test == bar_true)

    def test_parallel_coordinates(self):
        pcp_test = self.pf.parallel_coordinates([a, b],
                                                cols=xlabels,
                                                return_plot=True)
        pcp_test['data'] = [p.to_plotly_json() for p in pcp_test['data']]
        pcp_true = self.fopen("template_pcp.json")
        self.assertTrue(pcp_test == pcp_true)

    def test_violin(self):
        vio_test = self.pf.violin([a, b, c, b, a, c, b],
                                  cols=xlabels,
                                  return_plot=True)['layout']
        vio_test = vio_test.to_plotly_json()
        vio_true = self.fopen("template_vio.json")

        # Avoid errors from CircleCI's different plotly config
        for vio in [vio_test, vio_true]:
            vio["xaxis"]["range"] = [-0.167009, 0.167009]
        self.assertDictEqual(vio_test, vio_true)

    def test_scatter_matrix(self):
        scm_test = self.pf.scatter_matrix([a, b, c],
                                          return_plot=True)['layout']
        scm_test = scm_test.to_plotly_json()
        scm_true = self.fopen("template_scm.json")
        self.assertTrue(scm_test == scm_true)

    def test_heatmap_df(self):

        df = pd.DataFrame(data=np.asarray([ah, bh, ch]).T,
                          columns=['ah', 'bh', 'ch'])
        x_labels = ['low', 'high']
        y_labels = ['small', 'large']
        with self.assertWarns(UserWarning):
            hmdf_test = self.pf.heatmap_df(df,
                                           x_labels=x_labels,
                                           y_labels=y_labels,
                                           return_plot=True)
        hmdf_true = self.fopen("template_hmdf.json")
        self.assertTrue(hmdf_test, hmdf_true)

    def test_triangle(self):
        df = pd.DataFrame(np.random.rand(50, 3), columns=list('qwe'))
        triangle_test = self.pf.triangle(df[['q', 'w', 'e']], return_plot=True)
        triangle_true = self.fopen("template_triangle.json")
        self.assertTrue(triangle_test, triangle_true)
Example #20
def random_class():
    X_df=pd.DataFrame()
   
    df=pd.read_csv('full_vector_all.csv',index_col = [0])
    df=df.fillna(df.mean())
    print(df.index.name) 
    #print(df.index)    
    print(df[df.isnull().values==True])        
    df.index.name='mp_id'
    print(df.index.name)
    #Read the data, fill missing values, rename the index

    y=df['is_daoti'].values
    unwanted_columns=['band_gap.optimize_structure_gap','gaps','full_formula','composition','composition_oxid','is_daoti']
    X_df=df.drop(unwanted_columns,axis=1,inplace=False)
    print(X_df[X_df.isnull().values==True]) 
    X=X_df.values
    
    #Select two relevant features and plot the original class separation
    X_fig=df.loc[:,['maximum MendeleevNumber','range oxidation state']]
    X_fig=X_fig.values
    print(X_fig.shape)    
    plt.scatter(X_fig[y==0,0],X_fig[y==0,1],color='red')
    plt.scatter(X_fig[y==1,0],X_fig[y==1,1],color='blue')
    plt.xlabel('maximum MendeleevNumber')
    plt.ylabel('range oxidation state')
    plt.title('is_daoti classifier (red: non-conductor, blue: conductor)')
    plt.show()

    
    crossvalidation=KFold(n_splits=10,shuffle=True,random_state=10)
    X=preprocessing.scale(X)
    #print(np.mean(X))

    
    #print("首先是支持向量机")
    #svm_=svm.SVC(kernel='rbf',random_state=1,class_weight='balanced')
    #print("先看一下默认的结果:")
    #svm_.fit(X,y)
    #y_pre_svm=svm_.predict(X)
    #print("准确度是{}".format(metrics.accuracy_score(y,y_pre_svm)))
    #print("召回率是{}".format(metrics.recall_score(y,y_pre_svm)))
    #print("试着先进行调参")

   
    #a=np.array(range(100,200,10))
    #c=list(1./a)
    #print(c)
    #param_test_svm={'gamma':c,'C':[0.5,0.6,0.7,0.8,0.9,1,1.2,1.4,1.6,1.8]}
    #gsearch_svm=GridSearchCV(estimator=svm.SVC(random_state=10),
    #                      param_grid=param_test_svm,scoring='accuracy',cv=5)
    #gsearch_svm.fit(X,y)
    #print(gsearch_svm.cv_results_)
    #print("最好的参数是{}".format(gsearch_svm.best_params_))
    #print("最好的准确率是{}".format(gsearch_svm.best_score_))
    
   
    #svm_=svm.SVC(kernel='rbf',C=1.2,gamma=0.00625,random_state=1,class_weight='balanced')
    #scores_svm=cross_val_score(svm_,X,y,scoring='accuracy',cv=crossvalidation,n_jobs=1)
    #print(scores_svm)      
    #print("现在展示交叉验证的结果")    
    #print("经过{}次交叉验证,准确度的平均值是{:.3f}".format(len(scores_svm),np.mean(scores_svm)))

    #First, the random forest classifier
    print("Now for the random forest")
    clf=RandomForestClassifier(n_estimators=100,oob_score=False,random_state=2)
    clf.fit(X,y)
    y_pre=clf.predict(X)
    print("y: {}".format(y))
    print("y_pre: {}".format(y_pre))
    print("Accuracy when trained on all samples: {}".format(round(clf.score(X,y),3)))
    print("All samples, accuracy: {}".format(metrics.accuracy_score(y,y_pre)))
    print("All samples, recall: {}".format(metrics.recall_score(y,y_pre)))
    print("All samples, precision: {}".format(metrics.precision_score(y,y_pre)))

     
    #Note that we still need cross-validation
    crossvalidation=KFold(n_splits=10,shuffle=True,random_state=10)
    scores=cross_val_score(clf,X,y,scoring='accuracy',cv=crossvalidation,n_jobs=1)
    print(scores)    
    print("Cross-validation results:")    
    print("After {} folds, mean accuracy: {:.3f}".format(len(scores),np.mean(scores)))

    #Show the feature importances
    importances=clf.feature_importances_    
    included=X_df.columns.values
    indices=np.argsort(importances)[::-1]
    pf=PlotlyFig(y_title='importance(%)',title='Features by importance (classifier)',fontsize=20,ticksize=15)
    pf.bar(x=included[indices][0:10],y=importances[indices][0:10])

    print("最后,用最好的参数进行验证效果")     
    clf=RandomForestClassifier(n_estimators=100,max_features=14,max_depth=16,min_samples_split=6,random_state=3)
    scores_best=cross_val_score(clf,X,y,scoring='accuracy',cv=crossvalidation,n_jobs=1)
    print(scores)    
    print("现在展示交叉验证的结果")   
    print("经过{}次交叉验证,准确性的平均值是{:.3f}".format(len(scores),np.mean(scores)))

   
    

    #Hyperparameter tuning for the random forest

    #print("First, tune n_estimators")
    #param_test1={'n_estimators':range(50,110,10)}
    #gsearch1=GridSearchCV(estimator=RandomForestClassifier(random_state=10),
    #                      param_grid=param_test1,scoring='accuracy',cv=5)
    #gsearch1.fit(X,y)
    #print(gsearch1.cv_results_)
    #print("Best parameters: {}".format(gsearch1.best_params_))
    #print("Best accuracy: {}".format(gsearch1.best_score_))

    #print("Next, tune max_features")
    #param_test2={'max_features':range(6,15)}
    #gsearch2=GridSearchCV(estimator=RandomForestClassifier(random_state=10,n_estimators=80),
    #                      param_grid=param_test2,scoring='accuracy',cv=5)
    #gsearch2.fit(X,y)
    #print(gsearch2.cv_results_)
    #print("Best parameters: {}".format(gsearch2.best_params_))
    #print("Best accuracy: {}".format(gsearch2.best_score_))

    #print("Finally, tune max_depth and min_samples_split")
    #param_test3={'max_depth':range(8,20,2),'min_samples_split':range(2,8,2)}
    #gsearch3=GridSearchCV(estimator=RandomForestClassifier(random_state=10,n_estimators=80,max_features=14),
    #                      param_grid=param_test3,scoring='accuracy',cv=5)
    #gsearch3.fit(X,y)
    #print(gsearch3.cv_results_)
    #print("Best parameters: {}".format(gsearch3.best_params_))
    #print("Best accuracy: {}".format(gsearch3.best_score_))
    
    
    
    


    ##Then naive Bayes
    #print("Then naive Bayes")
    #bayes=GaussianNB()
    #bayes.fit(X,y)
    #print("Training accuracy: {}".format(round(bayes.score(X,y),3)))
    ##print("Training RMSE: {:.3f}".format(np.sqrt(mean_squared_error(y_true=y,y_pred=bayes.predict(X)))))    
    #scores_bayes=cross_val_score(bayes,X,y,scoring='accuracy',cv=crossvalidation,n_jobs=1)
    #print(scores_bayes)  
    #r2_scores_bayes=cross_val_score(clf,X,y,scoring='r2',cv=crossvalidation,n_jobs=1)
    #print(r2_scores_bayes)
    #print("Cross-validation results:")
    #print("After {} folds, mean r2: {:.3f}".format(len(scores_bayes),np.mean(np.abs(r2_scores_bayes))))
    #print("After {} folds, mean rmse: {:.3f}".format(len(scores_bayes),np.mean(scores_bayes)))

    #Then the support vector machine
    #Now actually predict the classification of the bulk materials

    clf.fit(X,y)  
    X_2d=pd.DataFrame()
    df_2d=pd.read_csv('2d_vector_plus.csv',index_col = [0])    
    print(df_2d.index.name)       
    print(df_2d[df_2d.isnull().values==True])        
    df_2d.index.name='mp_id'
    print(df_2d.index.name)
    #Read the data, fill missing values, rename the index
    unwanted_columns=['band_gap.optimize_structure_gap','gaps','full_formula','composition','composition_oxid','is_daoti']
    X_2d=df_2d.drop(unwanted_columns,axis=1,inplace=False)
    X_2d=X_2d.fillna(X_2d.mean())
    X_2d_value=X_2d.values
    X_2d_value=preprocessing.scale(X_2d_value)
    y_pre_2d=clf.predict(X_2d_value)
    #assign the predictions directly (DataFrame.ix is deprecated)
    df_2d['is_daoti']=y_pre_2d

    df_2d.to_csv('预测是否为导体.csv')

    

    print("all work done")
Example #21
# Printing Validation Results
print('Cross-validation results:')
print('Folds: %i, mean R2: %.3f' % (len(scores), np.mean(np.abs(r2_scores))))
print('Folds: %i, mean RMSE: %.3f' %
      (len(scores), np.mean(np.abs(rmse_scores))))

# In[10]:

# Importing libraries for plotting
from matminer.figrecipes.plot import PlotlyFig
from sklearn.model_selection import cross_val_predict

# In[12]:

# Creates and plots experimental vs. predicted enthalpies
pf = PlotlyFig(x_title='Experimental Enthalpy',
               y_title='Predicted Enthalpy',
               title='Random Forest Regressor',
               mode='notebook',
               filename="rf_regression.html")

pf.xy(xy_pairs=[(y, cross_val_predict(rf, X, y, cv=crossvalidation)),
                ([0, 200], [0, 200])],
      labels=df['comp'],
      modes=['markers', 'lines'],
      lines=[{}, {
          'color': 'black',
          'dash': 'dash'
      }],
      showlegends=False)
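This notebook fragment references rf, X, y, crossvalidation and the score arrays without defining them. A minimal sketch of the definitions it presumably assumes, with placeholder data so the sketch runs on its own (the real X, y would come from a featurized formation-enthalpy dataframe df with a 'comp' column):

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score

X = np.random.rand(100, 5)       # placeholder feature matrix
y = 200 * np.random.rand(100)    # placeholder enthalpy targets
rf = RandomForestRegressor(n_estimators=100, random_state=1)
crossvalidation = KFold(n_splits=10, shuffle=True, random_state=1)
scores = cross_val_score(rf, X, y, scoring='neg_mean_squared_error',
                         cv=crossvalidation, n_jobs=1)
r2_scores = cross_val_score(rf, X, y, scoring='r2',
                            cv=crossvalidation, n_jobs=1)
rmse_scores = [np.sqrt(abs(s)) for s in scores]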
Example #22
def refresh_json(open_plots=False):
    """
    For developer use. Refresh the json files and open plots to see if they
    look good. Use this function to set the current PlotlyFig build outputs
    as the true values of the tests.

    Args:
        open_plots (bool): If True, opens all plots generated. Useful if you
            want to check the current build outputs to make sure they look good.
            If False, just generates the json files and quits.
    """

    pf = PlotlyFig(**pfkwargs)
    xys = pf.xy([(a, b)], return_plot=True)
    xym = pf.xy([(a, b), (b, a)], return_plot=True)
    xy_colors = pf.xy([(a, b), (a, c), (c, c)],
                    modes=['markers', 'markers+lines', 'lines'],
                    colors=[c, 'red', 'blue'], return_plot=True)
    hmb = pf.heatmap_basic([a, b, c], xlabels, ylabels, return_plot=True)
    his = pf.histogram(a + b + c, n_bins=5, return_plot=True)
    bar = pf.bar(x=a, y=b, labels=xlabels, return_plot=True)
    pcp = pf.parallel_coordinates([a, b], cols=xlabels, return_plot=True)

    # plotly figures need to be converted to JSON-serializable data
    xys['data'] = [p.to_plotly_json() for p in xys['data']]
    xym['data'] = [p.to_plotly_json() for p in xym['data']]
    xy_colors['data'] = [p.to_plotly_json() for p in xy_colors['data']]
    hmb['data'] = [p.to_plotly_json() for p in hmb['data']]
    his['data'] = [p.to_plotly_json() for p in his['data']]
    bar['data'] = [p.to_plotly_json() for p in bar['data']]
    pcp['data'] = [p.to_plotly_json() for p in pcp['data']]

    # Layout is compared for the plots which always convert to dataframes,
    # as dataframes are not easily encoded by json.dump
    vio = pf.violin([a, b, c, b, a, c, b], cols=xlabels, return_plot=True)
    scm = pf.scatter_matrix([a, b, c], return_plot=True)

    # plotly layouts need to be converted to JSON-serializable data
    vio = {'layout': vio['layout'].to_plotly_json()}
    scm = {'layout': scm['layout'].to_plotly_json()}

    df = pd.DataFrame(data=np.asarray([ah, bh, ch]).T,
                      columns=['ah', 'bh', 'ch'])
    x_labels = ['low', 'high']
    y_labels = ['small', 'large']
    # TODO: this plot was not JSON serializable, use a different serialization method for all plots
    hmdf = pf.heatmap_df(df, x_labels=x_labels, y_labels=y_labels, return_plot=True)
    hmdf['data'] = [p.to_plotly_json() for p in hmdf['data']]

    df = pd.DataFrame(np.random.rand(50, 3), columns=list('qwe'))
    triangle = pf.triangle(df[['q', 'w', 'e']], return_plot=True)

    fnamedict = {"xys": xys, "xym": xym, "xy_colors":xy_colors,
                 "hmb": hmb, "his": his, "bar": bar,
                 "pcp": pcp, "vio": vio, "scm": scm,
                 'triangle': triangle,
                 'hmdf': hmdf
                 }

    for fname, obj in fnamedict.items():
        if fname in ["vio", "scm"]:
            obj = obj['layout']

        with open("template_{}.json".format(fname), "w") as f:
            json.dump(obj, f, cls=MontyEncoder)

    if open_plots:
        for obj in fnamedict.values():
            pf.create_plot(obj, return_plot=False)
Example #23
class PlotlyFigTest(PymatgenTest):
    def setUp(self):
        self.pf = PlotlyFig(**pfkwargs)
        self.base_dir = os.path.dirname(os.path.realpath(__file__))

    def fopen(self, fname):
        fname = self.base_dir + "/" + fname
        with open(fname, 'r') as f:
            return json.load(f)

    def test_xy(self):
        # Single trace
        xys_test = self.pf.xy([(a, b)], return_plot=True)
        xys_true = self.fopen("template_xys.json")
        self.assertTrue(xys_test == xys_true)

        # Multi trace
        xym_test = self.pf.xy([(a, b), (b, a)], return_plot=True)
        xym_true = self.fopen("template_xym.json")
        self.assertTrue(xym_test == xym_true)

        xy_colors_test = self.pf.xy(
            [(a, b), (a, c), (c, c)],
            modes=['markers', 'markers+lines', 'lines'],
            colors=[c, 'red', 'blue'],
            return_plot=True)
        xy_colors_true = self.fopen("template_xy_colors.json")
        self.assertTrue(xy_colors_test == xy_colors_true)

    def test_heatmap_basic(self):
        hmb_test = self.pf.heatmap_basic([a, b, c],
                                         xlabels,
                                         ylabels,
                                         return_plot=True)
        hmb_true = self.fopen("template_hmb.json")
        self.assertTrue(hmb_test == hmb_true)

    def test_histogram(self):
        his_test = self.pf.histogram(a + b + c, n_bins=5, return_plot=True)
        his_true = self.fopen("template_his.json")
        self.assertTrue(his_test == his_true)

    def test_bar(self):
        bar_test = self.pf.bar(x=a, y=b, labels=xlabels, return_plot=True)
        bar_true = self.fopen("template_bar.json")
        self.assertTrue(bar_test == bar_true)

    def test_parallel_coordinates(self):
        pcp_test = self.pf.parallel_coordinates([a, b],
                                                cols=xlabels,
                                                return_plot=True)
        pcp_true = self.fopen("template_pcp.json")
        self.assertTrue(pcp_test == pcp_true)

    def test_violin(self):
        vio_test = \
        self.pf.violin([a, b, c, b, a, c, b], cols=xlabels, return_plot=True)[
            'layout']
        vio_true = self.fopen("template_vio.json")
        self.assertTrue(vio_test == vio_true)

    def test_scatter_matrix(self):
        scm_test = self.pf.scatter_matrix([a, b, c],
                                          return_plot=True)['layout']
        scm_true = self.fopen("template_scm.json")
        self.assertTrue(scm_test == scm_true)

    def test_heatmap_df(self):
        a = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        b = [2, 4, 6, 8, 10, 2, 4, 6, 8, 10]
        c = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
        df = pd.DataFrame(data=np.asarray([a, b, c]).T,
                          columns=['a', 'b', 'c'])
        x_labels = ['low', 'high']
        y_labels = ['small', 'large']
        self.pf.heatmap_df(df,
                           x_labels=x_labels,
                           y_labels=y_labels,
                           return_plot=True)
Example #24
def refresh_json(open_plots=False):
    """
    For developer use. Refresh the json files and open plots to see if they
    look good. Use this function to set the current PlotlyFig build outputs
    as the true values of the tests.

    Args:
        open_plots (bool): If True, opens all plots generated. Useful if you
            want to check the current build outputs to make sure they look good.
            If False, just generates the json files and quits.
    """

    pf = PlotlyFig(**pfkwargs)
    xys = pf.xy([(a, b)], return_plot=True)
    xym = pf.xy([(a, b), (b, a)], return_plot=True)
    xy_colors = pf.xy([(a, b), (a, c), (c, c)],
                      modes=['markers', 'markers+lines', 'lines'],
                      colors=[c, 'red', 'blue'],
                      return_plot=True)
    hmb = pf.heatmap_basic([a, b, c], xlabels, ylabels, return_plot=True)
    his = pf.histogram(a + b + c, n_bins=5, return_plot=True)
    bar = pf.bar(x=a, y=b, labels=xlabels, return_plot=True)
    pcp = pf.parallel_coordinates([a, b], cols=xlabels, return_plot=True)

    # Layout is compared for the plots which always convert to dataframes,
    # as dataframes are not easily encoded by json.dump
    vio = pf.violin([a, b, c, b, a, c, b], cols=xlabels, return_plot=True)
    scm = pf.scatter_matrix([a, b, c], return_plot=True)

    fnamedict = {
        "xys": xys,
        "xym": xym,
        "xy_colors": xy_colors,
        "hmb": hmb,
        "his": his,
        "bar": bar,
        "pcp": pcp,
        "vio": vio,
        "scm": scm
    }

    for fname, obj in fnamedict.items():
        if fname in ["vio", "scm"]:
            obj = obj['layout']

        with open("template_{}.json".format(fname), "w") as f:
            json.dump(obj, f)

    if open_plots:
        for obj in fnamedict.values():
            pf.create_plot(obj, return_plot=False)
Example #25
class PlotlyFigTest(PymatgenTest):
    def setUp(self):
        self.pf = PlotlyFig(**pfkwargs)
        self.base_dir = os.path.dirname(os.path.realpath(__file__))

    def fopen(self, fname):
        fname = self.base_dir + "/" + fname
        with open(fname, 'r') as f:
            return json.load(f)

    def test_xy(self):
        # Single trace
        xys_test = self.pf.xy([(a, b)], return_plot=True)
        xys_true = self.fopen("template_xys.json")
        self.assertTrue(xys_test == xys_true)

        # Multi trace
        xym_test = self.pf.xy([(a, b), (b, a)], return_plot=True)
        xym_true = self.fopen("template_xym.json")
        self.assertTrue(xym_test == xym_true)

        xy_colors_test = self.pf.xy([(a, b), (a, c), (c, c)],
                      modes=['markers', 'markers+lines', 'lines'],
                      colors=[c, 'red', 'blue'], return_plot=True)
        xy_colors_true = self.fopen("template_xy_colors.json")
        self.assertTrue(xy_colors_test == xy_colors_true)

    def test_heatmap_basic(self):
        hmb_test = self.pf.heatmap_basic([a, b, c], xlabels, ylabels,
                                         return_plot=True)
        hmb_true = self.fopen("template_hmb.json")
        self.assertTrue(hmb_test == hmb_true)

    def test_histogram(self):
        his_test = self.pf.histogram(a + b + c, n_bins=5, return_plot=True)
        his_true = self.fopen("template_his.json")
        self.assertTrue(his_test == his_true)

    def test_bar(self):
        bar_test = self.pf.bar(x=a, y=b, labels=xlabels, return_plot=True)
        bar_true = self.fopen("template_bar.json")
        self.assertTrue(bar_test == bar_true)

    def test_parallel_coordinates(self):
        pcp_test = self.pf.parallel_coordinates([a, b], cols=xlabels,
                                                return_plot=True)
        pcp_true = self.fopen("template_pcp.json")
        self.assertTrue(pcp_test == pcp_true)

    def test_violin(self):
        vio_test = \
        self.pf.violin([a, b, c, b, a, c, b], cols=xlabels, return_plot=True)[
            'layout']
        vio_true = self.fopen("template_vio.json")
        self.assertTrue(vio_test == vio_true)

    def test_scatter_matrix(self):
        scm_test = self.pf.scatter_matrix([a, b, c], return_plot=True)['layout']
        scm_true = self.fopen("template_scm.json")
        self.assertTrue(scm_test == scm_true)

    def test_heatmap_df(self):

        df = pd.DataFrame(data=np.asarray([ah, bh, ch]).T,
                          columns=['ah', 'bh', 'ch'])
        x_labels = ['low', 'high']
        y_labels = ['small', 'large']
        hmdf_test = self.pf.heatmap_df(df, x_labels=x_labels,
                                       y_labels=y_labels,
                                       return_plot=True)
        hmdf_true = self.fopen("template_hmdf.json")
        self.assertTrue(hmdf_test, hmdf_true)

    def test_triangle(self):
        df = pd.DataFrame(np.random.rand(50, 3), columns=list('qwe'))
        triangle_test = self.pf.triangle(df[['q', 'w', 'e']], return_plot=True)
        triangle_true = self.fopen("template_triangle.json")
        self.assertTrue(triangle_test, triangle_true)
Example #26
 def setUp(self):
     self.pf = PlotlyFig(**pfkwargs)
     self.base_dir = os.path.dirname(os.path.realpath(__file__))
Example #27
r2_scores = cross_val_score(lr, X, y, scoring='r2', cv=crossvalidation, n_jobs=1)

print('Cross-validation results:')
print('Folds: %i, mean R2: %.3f' % (len(scores), np.mean(np.abs(r2_scores))))
print('Folds: %i, mean RMSE: %.3f' % (len(scores), np.mean(np.abs(rmse_scores))))


# In[16]:


from matminer.figrecipes.plot import PlotlyFig
from sklearn.model_selection import cross_val_predict

pf = PlotlyFig(x_title='Fracture Toughness (MPa m^(1/2))',
               y_title='Predicted Fracture Toughness (MPa m^(1/2))',
               title='Linear regression',
               mode='notebook',
               filename="lr_regression.html")

pf.xy(xy_pairs=[(y, cross_val_predict(lr, X, y, cv=crossvalidation)), ([0, 12], [0, 12])], 
      labels=df_ft_1['Formula'], 
      modes=['markers', 'lines'],
      lines=[{}, {'color': 'black', 'dash': 'dash'}], 
      showlegends=False
     )


# ## random forest model

# In[17]:
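The "# ## random forest model" header announces a cell that is cut off here. A sketch of what such a cell plausibly contains, mirroring the random-forest evaluation used in Examples #28 and #29; X, y and crossvalidation are assumed to come from the earlier cells of the same notebook:

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

rf = RandomForestRegressor(n_estimators=100, random_state=1)
rf.fit(X, y)
scores_rf = cross_val_score(rf, X, y, scoring='neg_mean_squared_error',
                            cv=crossvalidation, n_jobs=1)
r2_scores_rf = cross_val_score(rf, X, y, scoring='r2',
                               cv=crossvalidation, n_jobs=1)
print('Folds: %i, mean R2: %.3f'
      % (len(scores_rf), np.mean(np.abs(r2_scores_rf))))
print('Folds: %i, mean RMSE: %.3f'
      % (len(scores_rf), np.mean(np.sqrt(np.abs(scores_rf)))))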
Example #28
def pre():

    ##After loading, check for missing values, name the row index, and add the gap values
    X=pd.DataFrame()
    df=pd.read_csv('full_vector_all.csv',index_col = [0])     
    print(df.index)     
    print(df.index.name)    
    df.index.name='mp_id'
    print(df.index.name)
   
    y=df['gaps'].values
    unwanted_columns=['band_gap.optimize_structure_gap','gaps','full_formula','composition','composition_oxid']
    X=df.drop(unwanted_columns,axis=1,inplace=False)
    X=X.fillna(X.mean())
    print(X[X.isnull().values==True])   

    #First clamp the value range and approximate out-of-range entries
    #for co in df.columns:
    #    for row in df.index:            
    #        if(df.loc[row,co]<0.000001 and df.loc[row,co]>-0.000001 and df.loc[row,co]!=0):
    #            print(df.loc[row,co])
    #            df.loc[row,co]=0
    #        elif(df.loc[row,co]>1000000):
    #            print(df.loc[row,co])
    #            df.loc[row,co]=1000000  
     
    #X.to_csv("delete_af_vector.csv")           
    print("We use the HSE gap as the target and every other numeric column as a feature")    
    print("There are now {} candidate features\n\n".format(X.shape[1]))
    print("The shape of X is {}".format(X.shape))   
    print(X[X.isnull().values==True])   
    print(X.isnull().values.any())
    X_pr = X.values
    X_pr=preprocessing.scale(X_pr)


    #First, linear regression
    lr=LinearRegression()    
    #print(X)
    lr.fit(X_pr,y)   
    print("Training r2: {}".format(round(lr.score(X_pr,y),3)))
    print("Training RMSE: {:.3f}".format(np.sqrt(mean_squared_error(y_true=y,y_pred=lr.predict(X_pr)))))

    #Note that we still need cross-validation
    crossvalidation=KFold(n_splits=10,shuffle=True,random_state=1)
    scores=cross_val_score(lr,X_pr,y,scoring='neg_mean_squared_error',cv=crossvalidation,n_jobs=1)
    print(scores)
   # print("Pause 30 s to inspect the results")
    #time.sleep(30)
    rmse_scores=[]
    
    for s in scores:
        #print(s)
        #print(np.sqrt(abs(s)))
        rmse_scores.append(np.sqrt(abs(s)))        

    print(rmse_scores)
    r2_scores=cross_val_score(lr,X_pr,y,scoring='r2',cv=crossvalidation,n_jobs=1)
    print(r2_scores)

    print("现在展示交叉验证的结果")
    print("经过{}次交叉验证,r2的平均值是{:.3f}".format(len(scores),np.mean(np.abs(r2_scores))))
    print("经过{}次交叉验证,rmse的平均值是{:.3f}".format(len(scores),np.mean(np.abs(rmse_scores))))

    #到这里我们发现预测的结果是不错的
    #但是我们还是需要画图看一下效果

    pf=PlotlyFig(x_title='HSE calculate gap(ev)',y_title='Predicated gap(ev)',title='Linear regression',filename='lr_regression.jpg')
    pf.xy(xy_pairs=[(y,cross_val_predict(lr,X_pr,y,cv=crossvalidation)),([0,400],[0,400])],labels=df['full_formula'],modes=['markers','lines'],lines=[{},{'color':'black','dash':'dash'}],showlegends=False)
    print("这就是线性回归的威力,感觉还是不错的")
    print("\n\n")

    #现在我们尝试使用随机森林来看一下结果如何
    rf=RandomForestRegressor(n_estimators=100,random_state=1)
    rf.fit(X_pr,y)
    print("随机森林的r2是:{}".format(round(rf.score(X_pr,y),3)))
    print("随机森林的是RMSE是:{}".format(round(np.sqrt(mean_squared_error(y_true=y,y_pred=rf.predict(X_pr))),3)))
    #单看整个数据集上效果还是不错的

    importances=rf.feature_importances_
    #print(importances)
    included=X.columns.values
    indices=np.argsort(importances)[::-1]
    #print(indices)

    pf=PlotlyFig(y_title='importance(%)',title='Features by importance',fontsize=20,ticksize=15)
    pf.bar(x=included[indices][0:10],y=importances[indices][0:10])


    scores=cross_val_score(rf,X_pr,y,scoring='neg_mean_squared_error',cv=crossvalidation,n_jobs=1)
    r2_scores=cross_val_score(rf,X_pr,y,scoring='r2',cv=crossvalidation,n_jobs=1)
    rmse_scores_rf=np.sqrt(abs(scores))
    
    print("对于随机森林,现在展示交叉验证的结果")
    print("经过{}次交叉验证,r2的平均值是{:.3f}".format(len(scores),np.mean(np.abs(r2_scores))))
    print("经过{}次交叉验证,rmse的平均值是{:.3f}".format(len(scores),np.mean(np.abs(rmse_scores_rf))))
    print("请看随机森林的结果展示")
    pf_rf = PlotlyFig(x_title='HSE calculate gap(ev)',y_title='Random forest predicated gap(ev)',title='Random forest regression',filename='rf_regression.html')
   
   #这里可以用rf.predict(X)来代替交叉验证误差的预测项
    pf_rf.xy([(y, cross_val_predict(rf, X_pr, y, cv=crossvalidation)), ([0, 450], [0, 450])], 
      labels=df['full_formula'], modes=['markers', 'lines'],
      lines=[{}, {'color': 'black', 'dash': 'dash'}], showlegends=False)



    #Now run the actual prediction
    rf.fit(X_pr,y)
    print("Starting the prediction")
    df_2d=pd.read_csv('预测是否为导体.csv',index_col = [0])  
    unwanted_columns=['band_gap.optimize_structure_gap','gaps','full_formula','composition','composition_oxid']
    X_2d=df_2d.drop(unwanted_columns,axis=1,inplace=False)
    X_2d=X_2d.fillna(X_2d.mean())   
    print(X_2d[X_2d.isnull().values==True]) 
    
    df_2d.index.name='mp_id'
    print(df_2d.index.name)
    #Read the data, fill missing values, rename the index
    
    X_2d_value=X_2d.values
    X_2d_value=preprocessing.scale(X_2d_value)
    y_pre_2d=rf.predict(X_2d_value)
    #assign the predictions directly (DataFrame.ix is deprecated)
    df_2d['gaps']=y_pre_2d
    
    df_2d.to_csv('2d_bulk.csv')

    

    print("all work done")



    print("\n\n") 
Example #29
def pre():
    print("既然有了这么多数据,我们需要考虑好谁是输入,谁是输出")
    print("这里我们规定K-var作为预测项,然后就是其他所有的数字项都是features")
    df = pd.read_csv('引入结构中的密度.csv')
    print(df.columns)
    y = df['K_VRH'].values
    excluded = [
        "G_VRH", "K_VRH", "elastic_anisotropy", "formula", "material_id",
        "poisson_ratio", "structure", "composition", "composition_oxid"
    ]
    X = df.drop(excluded, axis=1)
    print("现在有{}个可能的特征:\n\n".format(X.shape[1], X.columns.values))
    lr = LinearRegression()
    lr.fit(X, y)
    #See how well we did
    print("Training r2: {}".format(round(lr.score(X, y), 3)))
    print("Training RMSE: {:.3f}".format(
        np.sqrt(mean_squared_error(y_true=y, y_pred=lr.predict(X)))))

    #Note that we still need cross-validation
    # random_state only applies when shuffle=True, so it is dropped here
    crossvalidation = KFold(n_splits=10, shuffle=False)
    scores = cross_val_score(lr,
                             X,
                             y,
                             scoring='neg_mean_squared_error',
                             cv=crossvalidation,
                             n_jobs=1)
    print(scores)
    # print("暂停30s休息看看结果")
    #time.sleep(30)
    rmse_scores = []

    for s in scores:
        #print(s)
        #print(np.sqrt(abs(s)))
        rmse_scores.append(np.sqrt(abs(s)))

    print(rmse_scores)
    r2_scores = cross_val_score(lr,
                                X,
                                y,
                                scoring='r2',
                                cv=crossvalidation,
                                n_jobs=1)
    print(r2_scores)

    print("现在展示交叉验证的结果")
    print("经过{}次交叉验证,r2的平均值是{:.3f}".format(len(scores),
                                           np.mean(np.abs(r2_scores))))
    print("经过{}次交叉验证,rmse的平均值是{:.3f}".format(len(scores),
                                             np.mean(np.abs(rmse_scores))))

    #到这里我们发现预测的结果是不错的
    #但是我们还是需要画图看一下效果
    pf = PlotlyFig(x_title='DFT (MP) bulk modules(Gpa)',
                   y_title='Predicated bulk modules(Gpa)',
                   title='Linear regression',
                   filename='lr_regression.jpg')
    pf.xy(xy_pairs=[(y, cross_val_predict(lr, X, y, cv=crossvalidation)),
                    ([0, 400], [0, 400])],
          labels=df['formula'],
          modes=['markers', 'lines'],
          lines=[{}, {
              'color': 'black',
              'dash': 'dash'
          }],
          showlegends=False)
    print("这就是线性回归的威力,感觉还是不错的")
    print("\n\n")

    #Now try a random forest and see how it does
    rf = RandomForestRegressor(n_estimators=50, random_state=1)
    rf.fit(X, y)
    print("Random forest r2: {}".format(round(rf.score(X, y), 3)))
    print("Random forest RMSE: {}".format(
        round(np.sqrt(mean_squared_error(y_true=y, y_pred=rf.predict(X))), 3)))
    #On the full training set this looks good

    importances = rf.feature_importances_
    included = X.columns.values
    indices = np.argsort(importances)[::-1]

    pf = PlotlyFig(y_title='Importance',
                   title='Feature importances',
                   fontsize=20,
                   ticksize=15)
    pf.bar(x=included[indices][0:10], y=importances[indices][0:10])

    scores = cross_val_score(rf,
                             X,
                             y,
                             scoring='neg_mean_squared_error',
                             cv=crossvalidation,
                             n_jobs=1)
    r2_scores = cross_val_score(rf,
                                X,
                                y,
                                scoring='r2',
                                cv=crossvalidation,
                                n_jobs=1)
    rmse_scores_rf = np.sqrt(abs(scores))

    print("对于随机森林,现在展示交叉验证的结果")
    print("经过{}次交叉验证,r2的平均值是{:.3f}".format(len(scores),
                                           np.mean(np.abs(r2_scores))))
    print("经过{}次交叉验证,rmse的平均值是{:.3f}".format(len(scores),
                                             np.mean(np.abs(rmse_scores_rf))))
    print("请看随机森林的结果展示")
    pf_rf = PlotlyFig(x_title='DFT (MP) bulk modulus (GPa)',
                      y_title='Random forest bulk modulus (GPa)',
                      title='Random forest regression',
                      filename='rf_regression.html')

    # rf.predict(X) could be used here instead of the cross-validated predictions
    pf_rf.xy([(y, cross_val_predict(rf, X, y, cv=crossvalidation)),
              ([0, 400], [0, 400])],
             labels=df['formula'],
             modes=['markers', 'lines'],
             lines=[{}, {
                 'color': 'black',
                 'dash': 'dash'
             }],
             showlegends=False)

    print("\n\n")
print(r2_scores)
"""
[24.203757209180235, 30.412087252477434, 23.272840526605627, 23.28069316986555,
 21.720553965141224, 18.69205713590509, 21.47183327803488, 21.53727751980417,
 17.069404335219055, 17.367188969254546]
[0.89543795 0.85131643 0.88792746 0.90262337 0.88328682 0.93601341
 0.90234611 0.92147782 0.94717506 0.94403039]
"""

from matminer.figrecipes.plot import PlotlyFig
from sklearn.model_selection import cross_val_predict

pf = PlotlyFig(
    x_title='DFT (MP) bulk modulus (GPa)',
    y_title='Predicted bulk modulus (GPa)',
    title='Linear Regression',
    mode='offline',
    filename='lr_regression.html'
)
pf.xy(
    xy_pairs=[
        (y, cross_val_predict(lr, X, y, cv=crossvalidation)),
        ([0, 400], [0, 400])
    ],
    labels=df['formula'],
    modes=['markers', 'lines'],
    lines=[{}, {'color': 'black', 'dash': 'dash'}],
    showlegends=False
)

scores = cross_val_score(lr, X, y,
                         scoring='neg_mean_squared_error',
                         cv=crossvalidation,
                         n_jobs=1)
rmse_scores = [np.sqrt(abs(s)) for s in scores]
r2_scores = cross_val_score(lr, X, y, scoring='r2',
                            cv=crossvalidation,
                            n_jobs=1)
print('Cross-validation results:')
print('Folds: %i, mean R2: %.3f' % (len(scores), np.mean(np.abs(r2_scores))))
print('Folds: %i, mean RMSE: %.3f' % (len(scores), np.mean(np.abs(rmse_scores))))


from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=50,
                           random_state=1)
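
# The snippet appears to stop here; a minimal continuation sketch (not the
# original code), reusing the X, y and crossvalidation defined above:
#   rf.fit(X, y)
#   print('random forest training r2: %.3f' % rf.score(X, y))
#   scores = cross_val_score(rf, X, y, scoring='neg_mean_squared_error',
#                            cv=crossvalidation, n_jobs=1)
#   print('mean CV RMSE: %.3f' % np.mean(np.sqrt(np.abs(scores))))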
def pre_gap_forjiyuan():
    # first load the feature vectors we built earlier
    data = pd.read_csv('2d_bulk.csv')
    print(data.columns)
    print(data.describe())
    df = pd.read_csv('vector_new_plustitle.csv')
    print(df.columns)
    print(df.describe())
    df['is_daoti'] = np.nan
    df['bulk_gap'] = np.nan

    j = 0

    print("首先我们需要把之前bulk的is_daoti和预测的gaps放到vector中")
    print("有个问题,某些材料id没有对应信息,只能采取填充法,先规定nan,然后用平均值填充")

    print(len(df.index))

    for i in range(len(df.index)):
        for j in range(len(data.index)):
            mpid_num = int(data.loc[j, 'mp_id'][3:])  # strip the 'mp-' prefix
            if mpid_num == df.loc[i, 'id']:
                df.loc[i, 'is_daoti'] = data.loc[j, 'is_daoti']
                df.loc[i, 'bulk_gap'] = data.loc[j, 'gaps']
                break
    df = df.fillna(method="ffill")
    #df.to_csv('plus_bulk_isdaoti_2dvector.csv')
    print(df[df.isnull().values == True])
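
    # a vectorized alternative to the matching loop above (a sketch, not part of
    # the original script; assumes the same column names used here):
    #   data['id'] = data['mp_id'].str[3:].astype(int)
    #   df = df.drop(columns=['is_daoti', 'bulk_gap']).merge(
    #       data[['id', 'is_daoti', 'gaps']].rename(columns={'gaps': 'bulk_gap'}),
    #       on='id', how='left').fillna(method='ffill')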

    y_gap = df['gap'].values
    y_m = df['efm'].values

    unwanted = ['gap', 'id', 'efm']
    X_df = df.drop(unwanted, axis=1, inplace=False)
    X_gap = X_df.values

    X_gap = preprocessing.scale(X_gap)
    X_m = X_gap
    crossvalidation = KFold(n_splits=5, shuffle=True, random_state=2)
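
    # note (not in the original script): scaling the full matrix before
    # cross-validation leaks test-fold statistics into the training folds; a
    # leak-free sketch would wrap scaler and model in a Pipeline, e.g.
    #   from sklearn.pipeline import make_pipeline
    #   from sklearn.preprocessing import StandardScaler
    #   model = make_pipeline(StandardScaler(), LinearRegression())
    #   cross_val_score(model, X_df.values, y_gap, cv=crossvalidation)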

    # start with linear regression
    print("Start with linear regression")
    #print(metrics.SCORERS.keys())
    lr = LinearRegression()
    lr.fit(X_gap, y_gap)
    # check the fit on the training data
    print("linear regression training r2: {}".format(round(lr.score(X_gap, y_gap), 3)))
    print("training RMSE: {:.3f}".format(
        np.sqrt(mean_squared_error(y_true=y_gap, y_pred=lr.predict(X_gap)))))
    scores = cross_val_score(lr,
                             X_gap,
                             y_gap,
                             scoring='neg_mean_squared_error',
                             cv=crossvalidation,
                             n_jobs=1)
    rmse_scores = [np.sqrt(abs(s)) for s in scores]
    print(rmse_scores)
    r2_scores = cross_val_score(lr,
                                X_gap,
                                y_gap,
                                scoring='r2',
                                cv=crossvalidation,
                                n_jobs=1)
    print("相关系数是:{}".format(r2_scores))
    print("现在展示交叉验证的结果")
    print("经过{}次交叉验证,r2的平均值是{:.3f}".format(len(scores),
                                           np.mean(np.abs(r2_scores))))
    print("经过{}次交叉验证,rmse的平均值是{:.3f}".format(len(scores),
                                             np.mean(np.abs(rmse_scores))))

    # next, random forest regression
    print("Now run the random forest prediction")
    rf = RandomForestRegressor(n_estimators=90,
                               max_features=10,
                               max_depth=12,
                               min_samples_split=2,
                               random_state=1)
    rf.fit(X_gap, y_gap)
    print("随机森林的r2是:{}".format(round(rf.score(X_gap, y_gap), 3)))
    print("随机森林的是RMSE是:{}".format(
        round(
            np.sqrt(mean_squared_error(y_true=y_gap,
                                       y_pred=rf.predict(X_gap))), 3)))

    print("看一下回归效果和什么关系更加密切")
    importances = rf.feature_importances_
    included = X_df.columns.values
    indices = np.argsort(importances)[::-1]
    pf = PlotlyFig(y_title='Importance',
                   title='Feature importances (gap)',
                   fontsize=20,
                   ticksize=15)
    pf.bar(x=included[indices][0:10], y=importances[indices][0:10])

    scores = cross_val_score(rf,
                             X_gap,
                             y_gap,
                             scoring='neg_mean_squared_error',
                             cv=crossvalidation,
                             n_jobs=1)
    r2_scores = cross_val_score(rf,
                                X_gap,
                                y_gap,
                                scoring='r2',
                                cv=crossvalidation,
                                n_jobs=1)
    rmse_scores_rf = np.sqrt(abs(scores))
    print("r2 分别是:{}".format(r2_scores))

    print("对于随机森林,现在展示交叉验证的结果")
    print("经过{}次交叉验证,r2的平均值是{:.3f}".format(len(scores), np.mean(r2_scores)))
    print("经过{}次交叉验证,rmse的平均值是{:.3f}".format(len(scores),
                                             np.mean(np.abs(rmse_scores_rf))))
    print("请看随机森林的结果展示")
    pf_rf = PlotlyFig(x_title='2D HSE-calculated gap (eV)',
                      y_title='Random forest predicted 2D gap (eV)',
                      title='Random forest regression',
                      filename='rf_regression.html')

    # rf.predict(X_gap) could be used here instead of the cross-validated predictions
    pf_rf.xy([(y_gap, cross_val_predict(rf, X_gap, y_gap, cv=crossvalidation)),
              ([0, 100], [0, 100])],
             labels=[],
             modes=['markers', 'lines'],
             lines=[{}, {
                 'color': 'black',
                 'dash': 'dash'
             }],
             showlegends=False)

    # this block documents the hyper-parameter tuning that was run once and then commented out

    #print("first, tune n_estimators and max_features")
    #param_test1={'n_estimators':range(50,130,10),'max_features':range(5,15)}
    #gsearch1=GridSearchCV(estimator=RandomForestRegressor(random_state=10),
    #                      param_grid=param_test1,scoring='neg_mean_squared_error',cv=5)
    #gsearch1.fit(X_gap,y_gap)
    #print(gsearch1.cv_results_)
    #print("best parameters: {}".format(gsearch1.best_params_))
    #print("best (negative) mean squared error: {}".format(gsearch1.best_score_))

    #print("the best n_estimators is 90 and the best max_features is 10")

    #print("finally, tune max_depth and min_samples_split")
    #param_test3={'max_depth':range(4,20,2),'min_samples_split':range(2,5,1)}
    #gsearch3=GridSearchCV(estimator=RandomForestRegressor(random_state=10,n_estimators=90,max_features=10),
    #                      param_grid=param_test3,scoring='neg_mean_squared_error',cv=5)
    #gsearch3.fit(X_gap,y_gap)
    #print(gsearch3.cv_results_)
    #print("best parameters: {}".format(gsearch3.best_params_))
    #print("best score: {}".format(gsearch3.best_score_))

    #print("the best max_depth is 12 and the best min_samples_split is 2")

    # now predict the effective mass
    print("Now predict the effective mass")
    print("First, a support vector machine")
    svm_m = svm.SVR(gamma='scale', C=1.0)
    svm_m.fit(X=X_m, y=y_m)
    print("支持向量机的r2是{:.3f}".format(svm_m.score(X_m, y_m)))
    print("支持向量机的是RMSE是:{}".format(
        round(
            np.sqrt(mean_squared_error(y_true=y_m, y_pred=svm_m.predict(X_m))),
            3)))
    scores = cross_val_score(svm_m,
                             X_m,
                             y_m,
                             scoring='neg_mean_squared_error',
                             cv=crossvalidation,
                             n_jobs=1)
    r2_scores = cross_val_score(svm_m,
                                X_m,
                                y_m,
                                scoring='r2',
                                cv=crossvalidation,
                                n_jobs=1)
    rmse_scores_rf = np.sqrt(abs(scores))
    print("r2 分别是:{}".format(r2_scores))

    print("对于支持向量机,现在展示交叉验证的结果")
    print("经过{}次交叉验证,r2的平均值是{:.3f}".format(len(scores), np.mean(r2_scores)))
    print("经过{}次交叉验证,rmse的平均值是{:.3f}".format(len(scores),
                                             np.mean(np.abs(rmse_scores_rf))))

    print("没错,开始进行随机森林的预测")
    rf_m = RandomForestRegressor(n_estimators=120, random_state=1)
    rf_m.fit(X_m, y_m)
    print("随机森林的r2是:{}".format(round(rf_m.score(X_m, y_m), 3)))
    print("随机森林的是RMSE是:{}".format(
        round(
            np.sqrt(mean_squared_error(y_true=y_m, y_pred=rf_m.predict(X_m))),
            3)))

    print("看一下预测有效质量效果和什么关系更加密切")
    importances = rf_m.feature_importances_
    included = X_df.columns.values
    indices = np.argsort(importances)[::-1]
    pf = PlotlyFig(y_title='Importance',
                   title='Feature importances (effective mass)',
                   fontsize=20,
                   ticksize=15)
    pf.bar(x=included[indices][0:10], y=importances[indices][0:10])

    scores = cross_val_score(rf_m,
                             X_m,
                             y_m,
                             scoring='neg_mean_squared_error',
                             cv=crossvalidation,
                             n_jobs=1)
    r2_scores = cross_val_score(rf_m,
                                X_m,
                                y_m,
                                scoring='r2',
                                cv=crossvalidation,
                                n_jobs=1)
    rmse_scores_rf = np.sqrt(abs(scores))
    print("r2 分别是:{}".format(r2_scores))

    print("对于随机森林,现在展示交叉验证的结果")
    print("经过{}次交叉验证,r2的平均值是{:.3f}".format(len(scores), np.mean(r2_scores)))
    print("经过{}次交叉验证,rmse的平均值是{:.3f}".format(len(scores),
                                             np.mean(np.abs(rmse_scores_rf))))
    print("请看随机森林的结果展示")
    pf_rf = PlotlyFig(x_title='2D calculated effective mass',
                      y_title='Random forest predicted 2D effective mass',
                      title='Random forest regression',
                      filename='rf_regression.html')

    # rf_m.predict(X_m) could be used here instead of the cross-validated predictions
    pf_rf.xy([(y_m, cross_val_predict(rf_m, X_m, y_m, cv=crossvalidation)),
              ([0, 100], [0, 100])],
             labels=[],
             modes=['markers', 'lines'],
             lines=[{}, {
                 'color': 'black',
                 'dash': 'dash'
             }],
             showlegends=False)
    print("all work done!")
# In[6]:

for i in range(0, 10):
    print(test_mean[i] - train_mean[i])
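
# `test_mean` and `train_mean` are assumed to be per-fold (or per-step) mean
# scores computed in an earlier notebook cell that is not shown here; the loop
# above prints the test/train gap for the first ten entries.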

# In[7]:

# evaluation using prediction graph
# optimally all points should lie on the line
# this will show how much the prediction deviates from the labelled value

from matminer.figrecipes.plot import PlotlyFig
from sklearn.model_selection import cross_val_predict
crossvalidation = KFold(n_splits=10, shuffle=True, random_state=1)
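
# `sgd`, `x` and `y` are assumed to be defined in an earlier notebook cell (the
# plot title suggests a ridge-type linear model); a hypothetical sketch:
#   from sklearn.linear_model import SGDRegressor
#   sgd = SGDRegressor(penalty='l2', random_state=1).fit(x, y)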
pf = PlotlyFig(x_title='bulk modulus(GPa)',
               y_title='Predicted bulk modulus(GPa)',
               title='Ridge regression',
               mode='notebook',
               filename="Ridge_regression.html")
pf.xy(xy_pairs=[(y, cross_val_predict(sgd, x, y, cv=crossvalidation)),
                ([40, 300], [40, 300])],
      modes=['markers', 'lines'],
      lines=[{}, {
          'color': 'black',
          'dash': 'dash'
      }],
      showlegends=False)

# In[ ]:
Ejemplo n.º 35
0
def plot_expt_compt_band_gaps(citrine_api_key, limit=0):
    """
    Pulls experimental band gaps from Citrine (w/o dataset limitations) and
        evaluates them against the DFT-computed band gaps (data from
        materialsproject.org) in an xy scatter plot. To compare like with like,
        for each chemical formula we pick the computed band gap of the structure
        with the lowest energy above hull (the most stable structure).
    Args:
        citrine_api_key (str): Your Citrine API key for getting data. Don't have
            a Citrine account? Visit https://citrine.io/
        limit (int): limit the number of entries (0 means no limit)
    Returns:
        plotly plots in "offline" mode popped in the default browser.
    """

    # pull experimental band gaps from Citrine
    cdr = CitrineDataRetrieval(api_key=citrine_api_key)
    cols = ['chemicalFormula', 'Band gap']
    df_ct = cdr.get_dataframe(criteria={'data_type':'experimental',
                                        'max_results':limit},
                              secondary_fields=True,
                              properties=['Band gap'])
    df_ct = df_ct[cols].rename(columns={'chemicalFormula': 'Formula',
                                        'Band gap': 'Expt. gap'})
    df_ct = df_ct[df_ct['Formula'] != 'In1p1'] # p1 not recognized in Composition
    df_ct = df_ct.dropna() # null band gaps cause problem when plotting residuals
    df_ct['Formula'] = df_ct['Formula'].transform(
        lambda x: Composition(x).get_reduced_formula_and_factor()[0])

    # pull computational band gaps from the Materials Project
    df = MPDataRetrieval().get_dataframe(
        criteria={'pretty_formula': {'$in': list(df_ct['Formula'].values)}},
        properties=['pretty_formula', 'material_id', 'band_gap', 'e_above_hull'],
        index_mpid=False).rename(
        columns={'pretty_formula': 'Formula', 'band_gap': 'MP computed gap',
                 'material_id': 'mpid'})


    # pick the most stable structure
    df_mp = df.loc[df.groupby("Formula")["e_above_hull"].idxmin()]
    df_final = df_ct.merge(df_mp, on='Formula').drop(
                                    'e_above_hull', axis=1).set_index('mpid')
    pf = PlotlyFig(df_final, x_title='Experimental band gap (eV)',
                   y_title='Computed Band Gap (eV)',
                   filename='band_gaps')

    # computed vs. experimental band gap:
    pf.xy([
        ('Expt. gap', 'MP computed gap'),
        ([0, 12], [0, 12])
    ],
        lines=[{}, {'color': 'black', 'dash': 'dash'}],
        labels=['Formula', df_final.index],
        modes=['markers', 'lines'],
        names=['Computed vs. expt.', 'Expt. gap'])

    # residual:
    residuals = df_final['MP computed gap']-df_final['Expt. gap'].astype(float)
    pf.set_arguments(x_title='Experimental band gap (eV)',
                    y_title='Residual (Computed - Expt.) Band Gap (eV)',
                    filename='band_gap_residuals')
    pf.xy(('Expt. gap', residuals),
          labels = ['Formula', df_final.index])