def main(mode='train'):
    train_path = 'data/train.csv'
    test_path = 'data/test.csv'

    data_train = data_loader.preprocess_data(train_path)
    data_test = data_loader.preprocess_data(test_path)
    data_X, data_Y = get_label_data(data_train)

    if mode == 'train':
        train_X, train_Y, test_X, test_Y = data_train_test_split(
            data_X, data_Y)
        print("Model in traning........")
        clf = train_random_forest_classifier(train_X=train_X,
                                             train_Y=train_Y,
                                             test_X=test_X,
                                             test_Y=test_Y)
        save_model(clf, 'model/rft-model.pkl')
        del clf

    clf = load_model('model/rft-model.pkl')
    print('\nValidation of model(kfold)')
    validate_model_kfold(clf, data_X, data_Y)

    print("Write predicted value to disk")
    generate_submission(clf, data_test=data_test)
def main():
    train_path = 'data/train.csv'
    test_path = 'data/test.csv'

    train_X, train_Y = data_loader.preprocess_data(train_path,
                                                   data_mode='train')
    data_test, _ = data_loader.preprocess_data(test_path, data_mode='test')

    train_X, train_Y, test_X, test_Y = data_train_test_split(train_X, train_Y)
    model = train_xgbooster(train_X=train_X,
                            train_Y=train_Y,
                            test_X=test_X,
                            test_Y=test_Y)

    save_model(model, 'model/xgb-model.pkl')
    generate_submission(model, data_test)
def find_my_chances(gpa, gmat, age, race, university, major, gender):

    # create list of strings to trigger the applicant profile parsing
    gpa_str = "{} GPA".format(gpa)
    gmat_str = "{} GMAT".format(gmat)
    demo_str = "{a} year old {r} {g}".format(a=age, r=race, g=gender)
    school_info = "Degree in {m} at {uni} (University)".format(m=major,
                                                               uni=university)

    app_profile = [gpa_str, gmat_str, demo_str, school_info]
    odds = ""
    for school in TARGET_LABELS:
        odds += "{}: 0.0\n".format(school)
    ap = ApplicantProfile(app_profile, odds)

    d = {}
    d["GMAT"] = ap.gmat_score
    d["GPA"] = ap.gpa
    d["UNIVERSITY"] = ap.uni
    d["MAJOR"] = ap.major
    d["JOBTITLE"] = ap.job_title
    d["GENDER"] = ap.gender
    d["RACE"] = ap.race
    d["AGE"] = ap.age
    d["INTERNATIONAL"] = ap.international
    d["ODDS"] = ap.odds.encode('utf-8').strip()

    df = pd.DataFrame(d, index=[0])

    schooldata_dict, mycolnames = preprocess_data(df)

    print("\n {d}".format(d=d))
    for school, indf in schooldata_dict.items():

        # if missing any columns from training set, add them w/ dummy vals
        for col in colnames:
            if col not in indf['features'].columns:
                indf['features'][col] = 0.0

        features_df = indf['features'][colnames]

        # print(features_df)

        df2predictfrom = features_df.values
        df2predictfrom = np.delete(df2predictfrom, 0, axis=1)

        try:

            chance = MODELS[school]['model'].predict(df2predictfrom)

        except KeyError as ke:
            print("No model for {}".format(school))
            continue
        try:
            pass
            #print("Coefficients: {}".format(MODELS[school].coef_))
        except AttributeError as ae:
            continue

        if school in ['Harvard', 'Wharton', 'Stanford', 'Booth']:
            print("{s} odds: {c}".format(s=school, c=chance))
Example #4
    def loading(self,
                load_total,
                sys_ticks,
                data_file='data/loads_model.xls',
                filter_col='BMS编号'):
        '''Load the load data and merge it into load_pre.
        Defaults to load.xls.
        '''
        if self.obj is None:
            fp.output_msg(
                "The load data has not been loaded because the load model has not been created!"
            )
        # Read the data file
        self.load_data = fp.read_load_file(self.load_type, data_file)
        if data_file == 'data/loads_model.xls':
            data_d = False
        else:
            data_d = True
        # Preprocess the data
        self.load_data = dp.preprocess_data(self.load_data,
                                            self.sys_settings,
                                            data_d,
                                            col_sel=filter_col,
                                            row_sel='010101030613001F')
        self.loads_on(load_total, sys_ticks)
Example #5
    def prepare_df(self, preproc_type='yeo-johnson'):
        files = os.listdir('.')

        if self.data_csv is None:
            # data_folder, clean_data_folder, flow_val_file, transform = True, flows = 'mean', feature_columns = None
            clean_data_folder = '_'.join([self.data_folder, 'transformed'])
            samples_data = preprocess_data(self.data_folder,
                                           clean_data_folder,
                                           self.flow_values,
                                           preproc_type,
                                           transform=True)
            success, df = get_data_and_labels(clean_data_folder)
            if success:
                self.data_csv = df
                self.x_data_cols = [
                    col for col in self.data_csv.columns
                    if col not in ['Flow', 'SampleName', 'FlowClass']
                ]
                return True
            else:
                return False
        else:
            # make sure the flow class labels correspond

            return True
def convert_and_process():
    # convert_ecgs()
    convert_xmls()
    data_x, data_y, fnames = dgen.get_data(
        return_fnames=True, location=cfg.converted_data_location)
    processed_data_x = dprep.preprocess_data(data_x)

    dprep.save_data(processed_data_x, data_y, cfg.processed_data_location,
                    fnames)
    save_pulse_data()
Example #7
    def __init__(self, args):
        self.training_iters = args.training_iters
        self.display_steps = args.display_steps

        processed_data = preprocess_data(image_path=args.img_path,
                                         caption_path=args.caption_path,
                                         sample_size=args.sample_size,
                                         size=args.size,
                                         num_channels=args.num_channels)
        self.train, self.train_captions, self.vocab_size = processed_data.get_data()

        self.x_caption = tf.placeholder(tf.float32,
                                        shape=[None, self.vocab_size],
                                        name='x_caption')
        self.x_inp = tf.placeholder(tf.float32,
                                    shape=[1, args.size[0], args.size[1], args.num_channels],
                                    name='x_input')
        self.y = tf.placeholder(tf.float32,
                                shape=[None, self.vocab_size],
                                name='y_image')
        mod = model(self.vocab_size, args.bridge_size, self.x_caption,
                    self.x_inp, self.y, args.size)
        self.cost, self.optimizer, self.accuracy = mod.full_model(learning_rate=0.0001)
def look_at_and_pre_process_data(data, rawdata, variables):
    # Now plot the input data.
    show_data = raw_input("Show the raw-data? (1 = YES): ")
    if show_data == '1':
        pl.plot_data(data, variables)

    # Preprocess the data (mean-centering, normalization).
    text_1 = "Pre-process the data (ENTER = normalization AND mean centering, "
    text_2 = "1 = JUST mean centering, 0 = None): "
    these_processes = raw_input(text_1 + text_2)
    data, rawdata = dp.preprocess_data(these_processes, data, rawdata)

    # "Enhance" certain variables to put all their influence into one component.
    text = "Enhance variables? As integers: Variable_1, Variable_2, ... ; ENTER = none): "
    enhance_these = raw_input(text)
    data, rawdata = dp.boost_variables(enhance_these, data, rawdata)
Example #9
def load_data_tensors(data_file, num_examples=None):
    """ Read data from a CSV file, and convert into lookup tensors pointing to tokens in the text. """
    raw_data = pd.read_csv(data_file)
    if num_examples is not None: 
        raw_data = raw_data.head(num_examples)
    # extract the slot information into separate columns
    data_columns = build_slot_columns(raw_data)
    # add the slot columns into the dataframe
    data = pd.concat([raw_data, data_columns], axis=1)
    data = preprocess_data(data)
    new_mr = reconstruct_mr(data, data_columns.columns)
    data['new_mr'] = new_mr
    data['new_mr'] = data['new_mr'].apply(add_space_to_punctuation)
    input_tensor, mr_word2idx, mr_idx2word = tokenize(data['new_mr'])
    target_tensor, ref_word2idx, ref_idx2word = tokenize(data['ref'])
    return input_tensor, target_tensor, ref_word2idx, ref_idx2word, mr_word2idx, mr_idx2word
Example #10
from data_preprocessing import preprocess_data
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model

sequence_length = 30
buffer_size = 1024
batch_size = 32

training_dataset, test_dataset, data_std, data_mean, n_state_labels, n_county_labels = preprocess_data(
    sequence_length=sequence_length,
    buffer_size=buffer_size,
    batch_size=batch_size)


def build_model():

    state_emb_input = Input(shape=(1, ))
    county_emb_input = Input(shape=(1, ))
    data_input = Input(shape=(sequence_length, 3))

    state_emb = Embedding(input_dim=n_state_labels,
                          output_dim=8,
                          input_length=1)(state_emb_input)
    county_emb = Embedding(input_dim=n_county_labels,
                           output_dim=8,
                           input_length=1)(county_emb_input)
    embedding = Concatenate()([state_emb, county_emb])
    embedding = Flatten()(embedding)
    embedding = Dense(sequence_length, activation='linear')(embedding)
    embedding = Reshape((sequence_length, 1))(embedding)
    concat = Concatenate()([data_input, embedding])
Example #11
            item_category_map[item].add(category)
        count = cur.execute(
            "select `id` from `book` where `id` not in("
            "select distinct `book_id` from `read_record`) "
            "and `id` not in(select distinct `book_id` from `buy_record`)")
        result = cur.fetchmany(count)
        all_book_not_been_read = []
        for row in result:
            all_book_not_been_read.append(row[0])
        cur.close()
        conn.commit()
        conn.close()
    except Exception, e:
        print Exception, ':', e

    preprocessed_dataset = data_preprocessing.preprocess_data(dataset)
    mp = MostPopular(preprocessed_dataset)
    mp.calc_item_popularity()
    mp_by_ratio = MPByCategoryRatio(preprocessed_dataset, item_category_map)
    mp_by_ratio.calc_item_popularity()
    user_cf = HieraKmeansUserCF(preprocessed_dataset, item_category_map,
                                n_sample, max_iter)
    user_cf.calc_user_sim()

    mutex.acquire()
    PREPROCESSED_DATASET = preprocessed_dataset
    ALL_BOOK_NOT_BEEN_READ = all_book_not_been_read
    MP = mp
    MP_BY_RATIO = mp_by_ratio
    USER_CF = user_cf
    mutex.release()
Example #12
#%%
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler
from sklearn.decomposition import PCA
from data_preprocessing import preprocess_data

#%%
seasons = np.arange(1998,2018,1)
tourney, regseason, sub = preprocess_data(seasons)

# tourney columns:
# - unusable:
#   'Season', 'DayNum', 'T1ID', 'T2ID', 'T1PtsF', 'T2PtsF', 'T1PtsA', 'T2PtsA', 'T2Result', 'WDeltaRatio'
# - usable but meh: 
#   'NumOT', 'T1Games', 'T2Games', 'GamesDiff', 'GamesRatio', 'RatingRatio'
# - usable:
#   'T1Seed', 'T2Seed', 'SeedDiff', 'SeedRatio', 'T1Loc',
#   'T1Rating', 'T1PtsFor', 'T1PtsAgainst', 'T1PtsDelta', 'T1WRatio', 'T1WDelta', 'T1WPyt', 'T1WWRatio', 
#   'T2Rating', 'T2PtsFor', 'T2PtsAgainst', 'T2PtsDelta', 'T2WRatio', 'T2WDelta', 'T2WPyt', 'T2WWRatio', 
#   'RatingDiff', 'PtsForDiff','PtsForRatio', 'PtsAgainstDiff', 'PtsAgainstRatio', 'PtsDeltaDiff', 'PtsDeltaRatio',
#   'WRatioDiff', 'WRatioRatio', 'WDeltaDiff', 'WPytDiff', 'WPytRatio',
#   'WWRatioDiff', 'WWRatioRatio'
# - label:
#   'T1Result'


#%%
tourney.plot(x='SeedDiff', y='T1Result', kind='scatter')
plt.show()
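A minimal sketch, assuming the columns listed in the comment above exist in `tourney`, of how a few "usable" features and the 'T1Result' label could be split out for modelling (the column subset and split parameters are illustrative):

#%%
# Sketch only: pick some "usable" columns from the notes above as features and
# 'T1Result' as the label; the subset and the split settings are assumptions.
from sklearn.model_selection import train_test_split

feature_cols = ['T1Seed', 'T2Seed', 'SeedDiff', 'SeedRatio',
                'T1Rating', 'T2Rating', 'RatingDiff']
X = tourney[feature_cols].values
y = tourney['T1Result'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=0)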
combined_df = input_data_df.append(other_data_df)

combined_df.reset_index(inplace=True)

# catboost_data_dict = preprocess_data_4_catboost(data_df=input_data_df)

# for school, catboostpool in catboost_data_dict.items():

# 	predicted_labels = catboost_pred(catboostpool)

# 	labels = catboostpool.get_label()

# 	display_metrics("Catboost for {}".format(school),predicted_labels,labels)

school_data_dict, colnames = preprocess_data(data_df=combined_df,
                                             output_path=OUT_FILE_PATH)
print(colnames)
MODELS = {}
# would use iteritems, but what if i want to port to python 3.5
for school, feature_label_d in school_data_dict.items():

    features = feature_label_d['features'].values
    labels = feature_label_d['labels'].values

    print("Number of Samples for {}: {}\n".format(school, features.shape[0]))

    # drop indices from the model
    features = np.delete(features, 0, axis=1)

    # test model against train data. we are using ALL of the data for training.
    # Not splitting for cross validation because the dataset for each school is TINY
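    # Sketch only (not the original code): following the comment above, fit a
    # classifier on ALL samples for this school and score it on the same data.
    # The classifier choice is an assumption; storing it under
    # MODELS[school]['model'] matches how predictions are looked up elsewhere.
    from sklearn.linear_model import LogisticRegression

    clf = LogisticRegression(max_iter=1000)
    clf.fit(features, np.ravel(labels))
    MODELS[school] = {'model': clf}
    print("Training accuracy for {}: {:.3f}".format(
        school, clf.score(features, np.ravel(labels))))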
Example #14
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

import data_preprocessing as dp

TRAIN_DATA_PATH = './data/train.csv'
TEST_DATA_PATH = './data/test.csv'

x_train, y_train = dp.preprocess_data(TRAIN_DATA_PATH)
x_test, y_test = dp.preprocess_data(TEST_DATA_PATH)


regressor = LinearRegression()
model = regressor.fit(x_train, y_train)

y_pred = regressor.predict(x_test)

print ('Score: %.2f' % model.score(x_test, y_test))
print ('Variance Score: %.2f' % r2_score(y_test, y_pred))
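A short illustrative addition (not part of the original snippet) that uses the mean_squared_error import from above:

# Illustrative only: report the test-set MSE alongside the scores printed above.
print ('MSE: %.2f' % mean_squared_error(y_test, y_pred))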
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from tabulate import tabulate
import data_preprocessing as preprocessed_data
import pandas as pd
import sys
import os
if '__file__' not in globals():
    sys.path.append(
        os.getcwd() +
        '/Machine_Learning_A-Z_Mine/Part 7 - Natural Language Processing/Section 36 - Natural Language Processing'
    )

# %% codecell
# Preprocess the data before algorithm is applied
x_train, x_test, y_train, y_test = preprocessed_data.preprocess_data()

# %% codecell
# Create Presentation Table Structure
columns = ['Accuracy', 'Precision', 'Sensitivity', 'F1 Score']
index = []
data = []


def add_to_table(cm, algorithm, data, index):
    index.append(algorithm)
    tn, fp, fn, tp = cm.ravel()
    accuracy = round((tp + tn) / (tp + tn + fp + fn), 3)
    precision = round(tp / (tp + fp), 3)
    sensitivity = round(tp / (tp + fn), 3)
    f1_score = round(2.0 * precision * sensitivity / (precision + sensitivity),
                     3)
    data.append([accuracy, precision, sensitivity, f1_score])
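A minimal usage sketch, assuming the train/test split produced above; confusion_matrix and the classifier settings are assumptions, not part of the original snippet:

# Sketch only: fit one of the imported classifiers, build a confusion matrix,
# and pass it to add_to_table, then render the table with tabulate.
from sklearn.metrics import confusion_matrix

rf = RandomForestClassifier(n_estimators=100, random_state=0)
rf.fit(x_train, y_train)
cm = confusion_matrix(y_test, rf.predict(x_test))
add_to_table(cm, 'Random Forest', data, index)
print(tabulate(data, headers=columns, showindex=index))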
Example #16
def main():
    # data = preprocess_bad_file('result.tsv')
    # save_preprocessed_file(data, 'result.cropped.tsv')
    data = read_data('result.cropped.tsv')
    data = preprocess_data(data)