def gen_all_lin_model_inp():
    """Generate and store the linearised full 2nd order Scheffe model
    inputs derived from the space filling experimental design.

    Does nothing when the target database already holds a result.
    """
    tag = 'All_Lin_Full_Model_List'

    db = access_db(1, True)

    # A non-empty database means this was generated on an earlier run.
    if db:
        debug('Already generated all linearised full model inputs')
        return

    sv_db = access_db(0, True)
    db.insert({tag: all_full_model_lin(sv_db)})
def get_data_req_to_score_model():
    """Gather everything score_models_per_data_type needs up front so it
    is computed once rather than recalculated per call.

    Returns the single-values db, an (unfitted) regression model, the
    full linearised model inputs and the list of all model codes.
    """
    Q = Query()

    # Collect the model codes ('mc') from each per-size database
    # (models with 1 through 28 terms).
    all_model_codes = []
    for number_of_terms in range(1, 29):
        db = access_db('All_Poss_Mod_{}_Terms'.format(number_of_terms),
                       False)
        all_model_codes.extend(extractnames(db.search(Q.mc.exists()), 'mc'))

    sv_db = access_db(0, True)
    model = LinearRegression(fit_intercept=False)
    all_full_models = get_all_lin_model_inp()
    return sv_db, model, all_full_models, all_model_codes
    def gen_and_score_mod(self, column):
        """Generate every candidate model for *column* and store only the
        best cross-validation score per model size.

        Models are enumerated on the fly, so no big tinydb of all
        possible models is stored.

        Fixes vs. previous version: the inner loop no longer shadows the
        outer loop variable ``i``, and ``top_mcode`` is reset per size so
        a stale code from the previous size can never be stored.
        """
        Y = self.Ys[column].dropna().values
        sn_Y = self.Ys[column].dropna().index
        my_cv = ShuffleSplit(len(Y), n_iter=3, test_size=0.333, random_state=0)

        equip, d_type = column.split(' ')

        top_db = access_db('Top_score_results_' + equip + '_' + d_type, False)

        for n in range(2):
            number_of_terms = n + 1

            # Skip sizes whose top result is already stored.
            if top_db.contains(self.Q.n_terms == number_of_terms):
                continue

            terms_key = gen_terms_key()

            # Generate all possible models of this size.
            top_score = -10000.0
            top_mcode = None  # reset so no stale code leaks across sizes
            for mcode in combinations(range(28), number_of_terms):
                # A higher-order term (index >= 7) is only valid when both
                # of its constituent terms are also in the model.
                invalid = False
                for term in mcode:
                    if term >= 7:
                        key_1 = terms_key[term][0]
                        key_2 = terms_key[term][1]
                        if key_1 not in mcode or key_2 not in mcode:
                            invalid = True
                            break

                if invalid:
                    continue

                # Generate X for this model and score it against Y.
                X = gen_X(sn_Y, self.all_full_input, mcode)
                scores = cross_val_score(self.model_obj, X, Y, cv=my_cv)
                score = mean(scores)

                # >= keeps the original tie behaviour (later model wins).
                if score >= top_score:
                    top_score = score
                    top_mcode = list(mcode)

            entry = {
                'equipment_name': equip,
                'data_type': d_type,
                'n_terms': number_of_terms,
                'top_score': top_score,
                'top_mcode': top_mcode
            }

            top_db.insert(entry)
def score_models(column):
    """Score all candidate models for *column*, storing only the best
    cross-validation score per model size; the per-size model lists are
    read from pickled files, so no big tinydb's are used.

    Fixes vs. previous version: the inner loop no longer shadows the
    outer loop variable ``i``; ``top_mcode`` is reset per size so an
    empty model list cannot store a stale code from the previous size;
    the pickle file is closed even if loading raises.
    """
    Ys = get_Ys()
    all_full_input = get_all_lin_model_inp()
    model_obj = LinearRegression(fit_intercept=False)

    Y = Ys[column].dropna().values
    sn_Y = Ys[column].dropna().index
    my_cv = ShuffleSplit(len(Y), n_iter=3, test_size=0.333, random_state=0)

    equip, d_type = column.split(' ')

    top_db = access_db('Top_score_results_' + equip + '_' + d_type, False)

    for n in range(28):
        number_of_terms = n + 1

        # NOTE(review): Q is presumably a module-level tinydb Query —
        # confirm, it is not defined in this function.
        if top_db.contains(Q.n_terms == number_of_terms):
            continue

        f_name = 'All_Poss_Mod_{}_Terms'.format(number_of_terms)
        f_obj = access_file(f_name, write=False)
        try:
            mcodes = cPickle.load(f_obj)
        finally:
            f_obj.close()

        top_score = -10000.0
        top_mcode = None  # reset so no stale code leaks across sizes
        for mcode in mcodes:
            # Generate X for this model and score it against Y.
            X = gen_X(sn_Y, all_full_input, mcode)
            scores = cross_val_score(model_obj, X, Y, cv=my_cv)
            score = mean(scores)

            # >= keeps the original tie behaviour (later model wins).
            if score >= top_score:
                top_score = score
                top_mcode = list(mcode)

        entry = {
            'equipment_name': equip,
            'data_type': d_type,
            'n_terms': number_of_terms,
            'top_score': top_score,
            'top_mcode': top_mcode
        }

        top_db.insert(entry)
def min_max_df(const_list, full=False):
    """Return a DataFrame of per-column 'max' and 'min' of the
    measurements, restricted to *const_list* unless *full* is True."""
    sv_db = access_db(0, True)
    msrmnts = get_msrmnts(sv_db, Q)

    for_opt = msrmnts if full else msrmnts[const_list]

    df = concat([for_opt.max(), for_opt.min()], axis=1)
    df.columns = ['max', 'min']
    return df
def get_mod_info():
    """Return the selected-model table indexed by
    '<equipment_name> <data_type>', with the statistics columns dropped."""
    mod_db = access_db(3, True)

    mod_df = DataFrame(mod_db.all())
    mod_df['name'] = mod_df.equipment_name + ' ' + mod_df.data_type
    mod_df = mod_df.set_index('name')

    unused_cols = ['data_type', 'equipment_name', 'p_vals',
                   'r_sqrd', 'select_score', 't_vals']
    return mod_df.drop(unused_cols, axis=1)
def preprocessing():
    """Run every equipment handler that loads raw data into the single
    values database."""
    sv_db = access_db(0, True)

    # Instantiate all handlers first, then feed them the database.
    handler_classes = (equipment.Rheomix, equipment.Thermomat,
                       equipment.Colour, equipment.LOI,
                       equipment.MCC, equipment.ConeCal,
                       equipment.Tensile, equipment.MassFrac)
    equip_list = [cls() for cls in handler_classes]

    for e in equip_list:
        e.raw_data_to_db(sv_db)
# Beispiel #8
# 0
def pca_X(impute=False, exclude_inp=True):
    """Build the standardised data matrix for PCA.

    Parameters
    ----------
    impute : bool
        When True, missing values are replaced by the column mean;
        otherwise rows containing missing values are dropped.
    exclude_inp : bool
        When True only the measurements are used; otherwise the
        ingredient compositions are included as extra columns.

    Returns
    -------
    tuple
        (X_std, use): the standardised matrix and the DataFrame of the
        data it was built from.

    Fix vs. previous version: the compositions pivot and the concat were
    computed unconditionally even though the default exclude_inp=True
    path discards them; they are now only built when needed.
    """
    sv_db = access_db(0, True)

    Q = Query()

    measurements = get_msrmnts(sv_db, Q)

    if exclude_inp:
        use = measurements
    else:
        # Extract compositions from the db with pandas and pivot them
        # into one '<data_type> <ingredient>' column per sample number.
        compositions = DataFrame(sv_db.search(Q.ingredient.exists()))
        compositions[
            'name'] = compositions.data_type + ' ' + compositions.ingredient
        compositions = compositions[['name', 'sample_number',
                                     'value']].pivot(index='sample_number',
                                                     columns='name',
                                                     values='value')
        use = concat([compositions, measurements], axis=1)

    # Database has missing values: either exclude incomplete rows
    # or (below) replace missing values by the column mean.
    if not impute:
        use = use.dropna()

    X = use.values.tolist()

    if impute:
        imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
        imp.fit(X)
        X = imp.transform(X)

    X_std = StandardScaler().fit_transform(X)

    #      X    ,df of data used
    return X_std, use
def get_Ys(do_pca=False):
    """Return the response DataFrame for fitting.

    With do_pca=True the PCA component scores are returned; otherwise
    each measurement column is rescaled to the range [-1, 1].
    """
    sv_db = access_db(0, True)
    measurements = get_msrmnts(sv_db, Q)

    if do_pca:
        X, df = pca_X()
        my_pca = PCA(n_components=0.99)
        my_pca.fit(X)

        X_trans = my_pca.transform(X)
        names = ['PCA Comp_' + str(c + 1)
                 for c in range(my_pca.n_components_)]
        return DataFrame(X_trans, index=list(df.index), columns=names)

    # Shift to zero, normalise to [0, 1], then stretch to [-1, 1].
    shifted = measurements - measurements.min()
    scaled = shifted / shifted.max()
    return scaled * 2 - 1
def get_all_lin_model_inp():
    """Read the stored linearised full model inputs back out of the
    All_Lin_Full_Model_Inputs database."""
    db = access_db(1, True)
    return db.all()[0]['All_Lin_Full_Model_List']
def get_select_models():
    """Select the 'best' model from the stored top models at each number
    of model terms, fit it, and store its parameters and statistics.

    Selection rule: the model with the fewest terms whose score is
    within about 5 % of the maximum score.

    Fix vs. previous version: when max_score == 0 the limit equals the
    maximum and no score is strictly greater, which left select_score
    unbound (NameError); we now fall back to max_score itself.
    """
    model_select_db = access_db(3, True)

    Ys = get_Ys()
    all_full_input = get_all_lin_model_inp()

    for column in Ys.columns:
        equip, d_type = column.split(' ')

        top_db = access_db('Top_score_results_' + equip + '_' + d_type, False)

        df = DataFrame(top_db.all())

        scores = list(df['top_score'].values)
        mcodes = list(df['top_mcode'].values)

        max_score = max(scores)

        # Select model with least number of terms where prediction improves
        # no more than 5 % at max prediction.
        lim = max_score - (abs(max_score * 5 / 105))

        # First score above the limit; fall back to max_score so that
        # max_score == 0 (lim == max_score) cannot leave this unbound.
        select_score = next((s for s in scores if s > lim), max_score)
        select_model = mcodes[scores.index(select_score)]

        # Refit the selected model to get its parameters and statistics.
        Y = Ys[column].dropna().values
        sn_Y = Ys[column].dropna().index
        X = gen_X(sn_Y, all_full_input, select_model)

        params, conf_int, r_sqrd, p_vals, t_vals = model_stats(X, Y)

        fields = {
            'select_score': select_score,
            'select_mcode': select_model,
            'model_params': list(params),
            'r_sqrd': r_sqrd,
            'p_vals': list(p_vals),
            't_vals': list(t_vals)
        }

        my_Q = ((Q.equipment_name == equip) & (Q.data_type == d_type))

        if model_select_db.contains(my_Q):
            model_select_db.update(fields, my_Q)
        else:
            entry = dict(fields, equipment_name=equip, data_type=d_type)
            model_select_db.insert(entry)