Example no. 1
class Keras(BaseEstimator):
    def __init__(self, build_function, multi_class=False, keras_params=None):
        if not callable(build_function):
            raise ValueError('Model construction function must be callable.')

        self.multi_class = multi_class
        self.build_function = build_function
        if keras_params is None:
            keras_params = {}

        self.keras_params = keras_params

    def fit(self, X, y):
        if self.multi_class:
            self.n_classes_ = len(set(y))
        else:
            self.n_classes_ = 1

        build_callable = lambda: self.build_function(X.shape[1], self.n_classes_)
        keras_params = copy(self.keras_params)
        keras_params['build_fn'] = build_callable

        self.classifier_ = KerasClassifier(**keras_params)
        self.classifier_.fit(X, y)
        return self

    def predict(self, X):
        return self.classifier_.predict(X)


def test_keras_classifier():
    model = Sequential()
    model.add(Dense(input_dim, input_shape=(input_dim,)))
    model.add(Activation('relu'))
    model.add(Dense(nb_class))
    model.add(Activation('softmax'))

    sklearn_clf = KerasClassifier(model, optimizer=optim, loss=loss,
                                  train_batch_size=batch_size,
                                  test_batch_size=batch_size,
                                  nb_epoch=nb_epoch)
    sklearn_clf.fit(X_train, y_train)
    sklearn_clf.score(X_test, y_test)
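
The wrapper above calls build_function(X.shape[1], self.n_classes_), so any build function has to accept an input dimension and a class count and return a compiled Keras model. A minimal sketch of such a function and of how the wrapper might be driven, assuming the Keras class above is importable; the layer width and the random data are illustrative only, not part of the original example:

import numpy as np
from keras.models import Sequential
from keras.layers import Dense


def build_binary_model(input_dim, n_classes):
    # n_classes is 1 in the binary case handled by the wrapper above
    model = Sequential()
    model.add(Dense(16, input_dim=input_dim, activation='relu'))
    model.add(Dense(n_classes, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


# illustrative usage on random data
X = np.random.rand(100, 4)
y = np.random.randint(0, 2, size=100)
clf = Keras(build_binary_model, keras_params={'epochs': 5, 'batch_size': 16})
clf.fit(X, y)
print(clf.predict(X)[:10])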
Example no. 3
    def NN(self, report=False):
        """Neutral Network.

        Args:
            report: whether print out the model analysis report.
        Returns:
            One layer neutral network model."""
        from keras.models import Sequential
        from keras.layers import Dense
        from keras.wrappers.scikit_learn import KerasClassifier

        def baseline_model():
            model = Sequential()
            model.add(Dense(8, input_dim=len(self.features), activation='relu'))
            model.add(Dense(1, activation='sigmoid'))
            model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
            return model    

        self.nn = KerasClassifier(build_fn=baseline_model, epochs=5, batch_size=5, verbose=1)
        self.nn.fit(self.train[self.features], self.train[self.target])

        if report:
            from Report import Report
            rpt = Report(self.nn, self.train, self.valid, self.target, self.features)
            rpt.ALL()

        return self.nn
Example no. 4
    def init_model(self):
        '''
        init model
        '''
        train_params = {"nb_epoch": 10, "batch_size": 10}
        self.dic_params.update(train_params)
        self.model = KerasClassifier(build_fn=self.create_model_func, **self.kargs["create_model"]["params"])
#        self.model = KerasClassifier(build_fn=self.create_model_func)
        self.model.set_params(**self.dic_params)
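
create_model_func is not shown in this excerpt; KerasClassifier expects it to be a callable returning a compiled Keras model, while nb_epoch and batch_size are then routed to fit through set_params above. A minimal sketch under that assumption (the layer sizes and the input_dim default are placeholders, not values from the original class):

from keras.models import Sequential
from keras.layers import Dense


def create_model_func(input_dim=20):
    # input_dim=20 is a placeholder; in the class above it would come from
    # self.kargs["create_model"]["params"]
    model = Sequential()
    model.add(Dense(32, input_dim=input_dim, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model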
Example no. 6
def main():
    code_dir = '/home/john/git/kaggle/OttoGroup/'
    data_dir = '/home/john/data/otto/'
    training_file = 'train.csv'

    os.chdir(code_dir)
    np.random.seed(1337)

    print('Starting script...')

    print('Loading data...')
    X, labels = load_training_data(data_dir, training_file)

    print('Pre-processing...')
    scaler = create_scaler(X)
    X = apply_scaler(X, scaler)
    y, y_onehot, encoder = preprocess_labels(labels)
    num_features = X.shape[1]
    num_classes = y_onehot.shape[1]
    print('Features = ' + str(num_features))
    print('Classes = ' + str(num_classes))

    print('Building model...')
    model = define_model(num_features, num_classes)
    print('Complete.')

    print('Training model...')
    wrapper = KerasClassifier(model)
    wrapper.fit(X, y_onehot, nb_epoch=20)
    print('Complete.')

    print('Training score = ' + str(wrapper.score(X, y_onehot)))

    preds = wrapper.predict(X)
    print('Predictions shape = ' + str(preds.shape))

    proba = wrapper.predict_proba(X)
    print('Probabilities shape = ' + str(proba.shape))

    print('Building ensemble...')
    ensemble = BaggingClassifier(wrapper, n_estimators=3, max_samples=1.0, max_features=1.0)
    print('Complete.')

    print('Training ensemble...')
    ensemble.fit(X, y)
    print('Complete.')

    print('Ensemble score = ' + str(ensemble.score(X, y)))

    print('Script complete.')
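
define_model, load_training_data and the other helpers are defined elsewhere in the original script. As a rough guide, a builder compatible with the one-hot labels used above could look like the following sketch (the hidden layer width is an assumption):

from keras.models import Sequential
from keras.layers import Dense


def define_model(num_features, num_classes):
    # feed-forward softmax classifier for one-hot encoded targets
    model = Sequential()
    model.add(Dense(128, input_dim=num_features, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model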
Example no. 7
seed = 7
np.random.seed(seed)
 
# Function to create model, required for KerasClassifier
def create_model():
    # create model
    model = Sequential()
    model.add(Dense(12, input_dim=34, init='uniform', activation='relu'))
    model.add(Dense(8, init='uniform', activation='relu'))
    model.add(Dense(1, init='uniform', activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
 
# create model
model = KerasClassifier(build_fn=create_model, nb_epoch=20, batch_size=32)
# evaluate using 10-fold cross validation
# kfold = KFold(n=len(features_train), n_folds=10, shuffle=True, random_state=seed)
# results = cross_val_score(model, features_train.values, labels_train.values, cv=kfold)
# print "Cross validation results:", (results.mean()*100), (results.std()*100)
model.fit(features_train.values, labels_train.values)

print "Model building complete:",round((time()-t0)/60,3),"m"

# print len(np.unique(train.user_id)), len(np.unique(test.user_id))

# features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(features_train, labels_train, test_size=0.60)

# # neigh = neighbors.KNeighborsClassifier(weights='distance', n_jobs=-1).fit(train[features], train['hotel_cluster'])
# forest = ensemble.RandomForestClassifier(n_estimators=10, n_jobs=-1).fit(train[features], train['hotel_cluster'])
# # bayes = naive_bayes.GaussianNB().fit(train[features], train['hotel_cluster'])
Example no. 8
def build_classifieres():
    classifier = Sequential()
    classifier.add(
        Dense(units=12,
              kernel_initializer='uniform',
              activation='relu',
              input_dim=11))
    classifier.add(
        Dense(units=12, kernel_initializer='uniform', activation='relu'))
    classifier.add(
        Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))
    classifier.compile(optimizer='adam',
                       loss='binary_crossentropy',
                       metrics=['accuracy'])
    return classifier


classifieres = KerasClassifier(build_fn=build_classifieres,
                               batch_size=10,
                               epochs=2)
accuracies = cross_val_score(estimator=classifieres,
                             X=X_train,
                             y=y_train,
                             cv=10,
                             n_jobs=-1)

mean = accuracies.mean()
variance = accuracies.std()

print(mean)
print(variance)

# Dropout regularization to reduce overfitting, if needed
from keras.wrappers.scikit_learn import KerasClassifier
        Dense(
            units=neurons,
            activation=activation,
            kernel_initializer=kernel_initializer,
        ))
    rede_neural.add(Dropout(0.2))
    rede_neural.add(Dense(units=1, activation='sigmoid'))

    rede_neural.compile(optimizer=optimizer,
                        loss=loss,
                        metrics=['binary_accuracy'])

    return rede_neural


rede_neural = KerasClassifier(build_fn=criar_rede)

parametros = {
    'batch_size': [5, 10, 20, 30, 40],
    'epochs': [50, 100, 500, 1000],
    'optimizer': ['adam', 'sgd'],
    'loss': ['binary_crossentropy', 'hinge'],
    'kernel_initializer': ['random_uniform', 'normal'],
    'activation': ['relu', 'tanh'],
    'neurons': [8, 16, 24, 32, 64]
}

grid_search = GridSearchCV(estimator=rede_neural,
                           param_grid=parametros,
                           scoring='accuracy',
                           cv=10)
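
The grid search above is only constructed in this excerpt. A short sketch of the usual follow-up, reusing the X_train and y_train arrays from the first half of the example (this step is an assumption, not part of the original code):

grid_search = grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
print(grid_search.best_score_)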
Example no. 10
def create_model():
    # create model
    model = Sequential()
    model.add(Dropout(0.4, input_shape=(60, )))
    model.add(
        Dense(60, init='normal', activation='relu', W_constraint=maxnorm(3)))
    model.add(Dense(1, init='normal', activation='sigmoid'))
    # Compile model
    sgd = SGD(lr=0.1, momentum=0.9, decay=0.0, nesterov=False)
    model.compile(loss='binary_crossentropy',
                  optimizer=sgd,
                  metrics=['accuracy'])
    return model


numpy.random.seed(seed)
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp',
                   KerasClassifier(build_fn=create_model,
                                   nb_epoch=100,
                                   batch_size=10,
                                   verbose=0)))
pipeline = Pipeline(estimators)
kfold = StratifiedKFold(y=encoded_Y,
                        n_folds=10,
                        shuffle=True,
                        random_state=seed)
results = cross_val_score(pipeline, X, encoded_Y, cv=kfold)
print("Nom")
print("Visible: %.2f%% (%.2f%%)" % (results.mean() * 100, results.std() * 100))
Example no. 11
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model


seed = 2016

earlyStopping = callbacks.EarlyStopping(monitor='val_loss',
                                        patience=1,
                                        verbose=1,
                                        mode='auto')

model = KerasClassifier(build_fn=create_model,
                        epochs=40,
                        batch_size=1024,
                        verbose=1)

from sklearn import model_selection
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold

# x_train = np.array(X)
#y_train = np.array(y)

train_stacker = [[0.0 for s in range(1)] for k in range(0, (x_train.shape[0]))]

cv_scores = []
oof_preds = []
a = [0 for x in range(2345796)]
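
The excerpt stops after allocating train_stacker, cv_scores and oof_preds. A minimal sketch of the kind of out-of-fold loop these structures are usually filled by, assuming x_train and y_train are NumPy arrays; the fold count and the loop itself are assumptions, not part of the original:

kf = KFold(n_splits=5, shuffle=True, random_state=seed)
for train_idx, val_idx in kf.split(x_train):
    model.fit(x_train[train_idx], y_train[train_idx])
    # probability of the positive class for the held-out fold
    val_proba = model.predict_proba(x_train[val_idx])[:, 1]
    for i, idx in enumerate(val_idx):
        train_stacker[idx][0] = val_proba[i]
    cv_scores.append(log_loss(y_train[val_idx], val_proba))
    oof_preds.append(val_proba)

print('mean CV log loss:', sum(cv_scores) / len(cv_scores))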
Example no. 12
def processData(df):
    unique_degrees = df.degree.unique()
    degree_level, filtered = Filter_degree(df)
    stream_filtered, unsettled_streams = Filter_streams(filtered)
    df['stream_processed'] = stream_filtered
    df['degree_processed'] = degree_level
#######################################################################################################################
    majors_classification_df = pd.read_csv("College_Majors_Classification.csv")
    print(majors_classification_df.shape)
    majors_classification_df = majors_classification_df.dropna()
    majors_classification_df = majors_classification_df.applymap(lambda s: s.lower().strip())
    majors_mapping = {}
    idx_majors_mapping = {}
    for major in majors_classification_df.Major_Category.unique():
        majors_mapping.update({major: []})
    for index, row in majors_classification_df.iterrows():
        majors_mapping[row['Major_Category']].append(row['Major'])
        idx_majors_mapping.update({row['Major']: row['Major_Category']})
    df['major'] = df['major'].fillna('None')

#######################################################################################################################
    unsettled_titles = []
    settled_titles = []

    known_majors_vectors = []
    for key, item in idx_majors_mapping.items():
        nlp_token = nlp(key)
        if (nlp_token.vector_norm):
            known_majors_vectors.append((nlp_token.vector, item))

    created_mapping = {}

    for val in df.major.unique():
        nlp_token = nlp(val)
        if not nlp_token.vector_norm:
            unsettled_titles.append(val)
            continue
        closest_major = find_closest(nlp_token.vector, known_majors_vectors)
        created_mapping.update({val: closest_major})

    prettify_create_mapping = {}
    for major in majors_classification_df.Major_Category.unique():
        prettify_create_mapping.update({major: []})

    for key, val in created_mapping.items():
        prettify_create_mapping[val].append(key)

    # pd.DataFrame.from_dict(prettify_create_mapping, orient='index').to_csv('majors_mapping.csv')
#######################################################################################################################
    r = pd.read_csv('majors_mapping.csv')
    r = r.set_index('Unnamed: 0')
    prettify_create_mapping = {}
    for i in r.index:
        prettify_create_mapping[i] = []
        for enum, j in enumerate(r.columns):
            if enum == 1:
                continue
            val = r.loc[str(i), j]
            if not pd.isnull(val):
                prettify_create_mapping[i].append(val)
    majors_mapping = prettify_create_mapping
#######################################################################################################################
    majors_processed = []

    for major_raw in df.major:
        major_ = "None"
        if major_raw == "None":
            majors_processed.append("None")
            continue
        for major_group, val in majors_mapping.items():
            if major_raw in val:
                major_ = major_group
                break
        majors_processed.append(major_)

    df['majors_processed'] = majors_processed
#######################################################################################################################

    a = pd.read_csv("industries_classification.csv")
    a = a.drop(["Sector", "Industry Group", "Industry", "Sub-Industry"], axis=1)
    a = a.rename({"Unnamed: 1": "Sector", "Unnamed: 3": "Industry Group", "Unnamed: 5": "Industry",
                  "Unnamed: 7": "Sub-Industry"}, axis=1)
    a = a.dropna(how='all')
    a = a.fillna("None")

    SP_classification = []
    column_used = "Industry Group"
    sector = a.iloc[0][column_used]
    for index, row in a.iterrows():
        subindustry = row['Sub-Industry']
        if (row[column_used] != "None"):
            sector = row[column_used]
        SP_classification.append([sector, subindustry])

    for tuple_ in SP_classification:
        if tuple_[0] == "(cont’d)":
            tuple_[0] = "Consumer Discretionary"
        if tuple_[0] == "Discretionary":
            tuple_[0] = "Consumer Discretionary"
        if tuple_[0] == "Consumer":
            tuple_[0] = "Consumer Discretionary"

        tuple_[0] = tuple_[0].replace(" (cont’d)", "")
        tuple_[0] = tuple_[0].replace("\n", " ")
        tuple_[0] = tuple_[0].split(" (")[0]
        tuple_[1] = tuple_[1].replace("\n", " ")
        tuple_[1] = tuple_[1].replace(" & ", " and ")
        tuple_[1] = tuple_[1].split(" -- ")[0]
        tuple_[1] = tuple_[1].split("(")[0]
        tuple_[1] = tuple_[1].replace("REITs", "Real Estate Investment Trusts")
        tuple_[1] = tuple_[1].replace("-", " ")
#######################################################################################################################
    sectors = []
    for tuple_ in SP_classification:
        sectors.append(tuple_[0])
    unique_sectors = np.unique(sectors)

    sector_mapping = {}
    for i, sector in enumerate(unique_sectors):
        sector_mapping.update({sector: i + 1})
#######################################################################################################################
    SP_vectors = []
    for tuple_ in SP_classification:
        idx = sector_mapping[tuple_[0]]
        SP_vectors.append(np.append(nlp(tuple_[1]).vector, idx))
#######################################################################################################################
    data = pd.DataFrame(SP_vectors)
    X = data.loc[:, :299]
    y = data.loc[:, 300]
    from sklearn.preprocessing import LabelEncoder
    from keras.utils import np_utils
    encoder = LabelEncoder()
    encoder.fit(y)
    encoded_Y = encoder.transform(y)
    dummy_y = np_utils.to_categorical(encoded_Y, num_classes=len(unique_sectors))
    data.head()

    from keras.wrappers.scikit_learn import KerasClassifier
    estimator = KerasClassifier(build_fn=baseline_model, epochs=200, batch_size=5, verbose=0)
    from keras.models import Sequential
    from sklearn.model_selection import train_test_split
    from keras.layers import Dense
    data = np.concatenate([X, dummy_y], axis=1)
    X_train, X_test, y_train, y_test = train_test_split(data[:, :300], data[:, 300:], test_size=0.33, random_state=42)
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_test)
    y_pred = np_utils.to_categorical(y_pred, num_classes=len(unique_sectors))
    print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, y_pred.shape)
########################################################################################################################

    from sklearn.metrics import classification_report
    df['industry'] = df['industry'].fillna('None')
    vectors_unique = []
    words_unique = []
    industries_unique = df.industry.unique()
    industries_filtered = []
    for val in industries_unique:
        val = val.replace("&", "")
        val = val.replace("/", " ")
        if val != "None" and val.find(':') == -1 and val.find("]"):
            industries_filtered.append(val)

    for val in industries_filtered:
        nlp_token = nlp(val)
        if (nlp_token.has_vector):
            vectors_unique.append(nlp_token.vector)
            words_unique.append(val)
    vectors_unique = np.array(vectors_unique)
    y_pred = estimator.predict(vectors_unique)

    predicted_sectors = {}

    for key, value in sector_mapping.items():
        predicted_sectors[key] = []

    for a, b in zip(words_unique, y_pred):
        predicted_sectors[unique_sectors[b]].append(a)
    # pd.DataFrame.from_dict(predicted_sectors, orient='index').to_csv("/content/predicted_industries_classification.csv")

    industry_processed = []

    for industry_raw in df.industry:
        industry_ = "None"
        industry_raw = clean_string(industry_raw)
        for key, value in predicted_sectors.items():
            if industry_raw in value:
                industry_ = key
                break
        if (industry_ == "None" and industry_raw != "None"):
            print(industry_raw)
        industry_processed.append(industry_)

    df['industry_processed'] = industry_processed
########################################################################################################################
    workers_classification = pd.read_csv(
        'https://docs.google.com/spreadsheets/d/e/2PACX-1vSXxWGKiCFw6QRXV09znbdHmd5HqCGgzl8o8qGndrft2U9I9fyNz94rblr69YLQkqhDiTkGrwGH6M4R/pub?gid=2085110439&single=true&output=csv')
    workers_classification.columns = ['none', 'level', 'title']
    workers_classification = workers_classification.drop(['none'], axis=1)
    workers_classification = workers_classification.fillna(method='ffill')
    workers_classification = workers_classification.applymap(lambda s: s.lower().strip())
    workers_mapping = {}
    for x in workers_classification.level.unique():
        workers_mapping[x] = []
    for index, row in workers_classification.iterrows():
        workers_mapping[row['level']].append((row['title']).lower())
    levels = list(workers_mapping.keys())
    data = []
    counter = 0
    for level, titles in workers_mapping.items():
        for title in titles:
            if nlp(title).has_vector:
                data.append(np.append(nlp(title).vector, counter))
        counter += 1
    data = pd.DataFrame(data)

    workers_mapping = {}
    for x in workers_classification.level.unique():
        workers_mapping[x] = []
    for index, row in workers_classification.iterrows():
        workers_mapping[row['level']].append((row['title']).lower())
    idx_workers_mapping = {}
    idx_vectors_mapping = {}

    for key, value in workers_mapping.items():
        for item in value:
            idx_workers_mapping[item] = key
            idx_vectors_mapping[item] = (nlp(item))
########################################################################################################################
    import re
    from tqdm import tqdm_notebook

    switches = {
        "sr": "senior",
        "asst": "assistant",
        " & ": " ",
        "-": "",
        "engg": "engineer",
        "ceo": "chief executive officer",
        "cto": "chief technical officer",
        "@": " ",
        "dy": "deputy",
        "/": " ",
        '"': "",
    }

    levels_split = []
    unsettled_titles = []
    specialisation = []
    already_done_ = {}
    df = df.sort_values(by=['job_title'])
    df['job_title'] = df['job_title'].fillna('None')
    df['level_raw'] = df['level_raw'].fillna('None')

    for title_raw, level_raw in tqdm_notebook(zip(df.job_title, df.level_raw), total=len(df.job_title)):
        selected_word = ""
        if title_raw == "None" and level_raw == "None":
            level_ = "None"
        else:
            if title_raw == "None":
                level_ = level_raw
            else:
                try:
                    level_, selected_word = already_done_[(title_raw, level_raw)]
                except:
                    title = title_raw
                    for original, new in switches.items():
                        title = title.replace(original, new)
                        title = title.strip()

                    level_ = "None"
                    for level in ['tm', 'mm', 'lm', 'fm', 'worker', 'others']:
                        for level_granular in workers_mapping[level]:
                            if title.find(level_granular) > -1:
                                level_ = level
                                selected_word = level_granular
                                break
                        if selected_word != "":
                            break

                    title_token = nlp(title)
                    if level_ == "None" and title_token.vector_norm:
                        for level_granular, level in idx_workers_mapping.items():
                            token = idx_vectors_mapping[level_granular]
                            if token.vector_norm and title_token.similarity(token) > 0.9:
                                level_ = level
                                selected_word = level_granular
                                break

                    if level_ == "None" and title != "None":
                        unsettled_titles.append(title)
                        level_ = "undefined"

                    already_done_.update({(title_raw, level_raw): (level_, selected_word)})

        left_part = title_raw.replace(selected_word, "")
        if left_part == "":
            left_part = "None"

        levels_split.append(level_)
        specialisation.append(left_part)
        # print(title_raw,"|", level_raw,"|", level_, "|", left_part)
    # pd.DataFrame(unsettled_titles).to_csv('unsettled_titles.csv')

    print(len(levels_split))
    print(len(unsettled_titles))
    print(len(specialisation))
    df['levels_processed'] = levels_split
    df['specialisation'] = specialisation
########################################################################################################################
    # spec_classification = pd.read_csv("/content/drive/My Drive/Revamp/Data/specialisations_classification.csv")
    spec_classification = pd.read_csv(
        'https://docs.google.com/spreadsheets/d/e/2PACX-1vSXxWGKiCFw6QRXV09znbdHmd5HqCGgzl8o8qGndrft2U9I9fyNz94rblr69YLQkqhDiTkGrwGH6M4R/pub?gid=0&single=true&output=csv')
    spec_classification.columns = ['specialisation', 'categories']
    spec_classification['specialisation'] = spec_classification['specialisation'].fillna(method='ffill')
    spec_classification = spec_classification.applymap(lambda s: s.lower().strip())
    spec_classification.head()

    specialisation_mapping = {}

    spec_classification = spec_classification.applymap(lambda s: s.lower() if type(s) == str else s)

    for spec in spec_classification['specialisation'].unique():
        specialisation_mapping[spec] = []

    for index, row in spec_classification.iterrows():
        specialisation_mapping[row['specialisation']].append(row['categories'])
    idx_specialisation_mapping = {}
    idx_tokens_mapping = {}
    idx_vectors_mapping = {}
    for key, value in specialisation_mapping.items():
        for item in value:
            idx_specialisation_mapping[item] = key
            idx_tokens_mapping[item] = nlp(item)
            idx_vectors_mapping[item] = idx_tokens_mapping[item].vector

    processed_specialisations = []
    specialiasation_cleaned = []
    granular_specialisations = []
    already_done_ = {}
    count = 0
    df['job_title'] = df['job_title'].fillna('None')
    choices = []
    for key, value in idx_specialisation_mapping.items():
        choices.append(key)

    from numpy import dot
    from numpy.linalg import norm

    def cos_sim(a, b):
        return dot(a, b) / (norm(a) * norm(b))

    from tqdm import tqdm_notebook
    # for specialisation_raw in tqdm_notebook(df.job_title.unique(), total=len(df.job_title.unique())):
    for specialisation_raw in tqdm_notebook(df.job_title, total=len(df.job_title)):
        count += 1
        # print(specialisation_raw)
        specialisation = specialisation_raw
        specialisation = specialisation.replace('"', "")
        specialisation = specialisation.replace("'", "")
        specialisation = specialisation.replace('-', " ")
        specialisation = specialisation.replace('*', " ")

        specialisation = specialisation.replace('(', "")
        specialisation = specialisation.replace('/', "")
        specialisation = specialisation.replace('|', "")
        specialisation = specialisation.replace('\\', "")
        specialisation = specialisation.replace(')', "")
        specialisation = specialisation.strip()

        if specialisation_raw == "None":
            processed_specialisation = "None"
            specialisation = "None"
            granular_specialisation = "None"

        else:
            if specialisation_raw in already_done_.keys():
                granular_specialisation = already_done_[specialisation_raw][0]
                processed_specialisation = already_done_[specialisation_raw][1]
            else:
                processed_specialisation = "undefined"
                granular_specialisation = "undefined"
                spec_vector = nlp(specialisation).vector
                best_token = "None"
                max_sim = 0
                max_len = 0
                for spec_ in idx_specialisation_mapping.keys():
                    if (len(spec_) > 7) and specialisation.find(spec_) > -1:
                        if len(spec_) > max_len:
                            best_token = spec_
                            max_len = max(max_len, len(best_token))
                    elif len(spec_) > 3 and specialisation.find(spec_) > -1:
                        if (cos_sim(spec_vector, idx_vectors_mapping[spec_]) > 0.5):
                            best_token = spec_
                            break
                        else:
                            # print("Close touch here ->", specialisation, "->", spec_, "-- failed")
                            pass

                if best_token == "None":
                    for spec_ in idx_specialisation_mapping.keys():
                        current_sim = cos_sim(spec_vector, idx_vectors_mapping[spec_])
                        if (current_sim > max_sim):
                            max_sim = current_sim
                            best_token = spec_

                # print(max_sim, best_token)
                if (max_sim < 0.50 and best_token == "None"):
                    processed_specialisation = "undefined"
                    granular_specialisation = "undefined"
                else:
                    processed_specialisation = idx_specialisation_mapping[best_token]
                    granular_specialisation = best_token

                already_done_.update({specialisation_raw: (granular_specialisation, processed_specialisation)})
                if (count % 1 == 0):
                    # print(specialisation, "|\t|", granular_specialisation, "|\t|", processed_specialisation, "|\t|", max_sim)
                    pass

        processed_specialisations.append(processed_specialisation)
        granular_specialisations.append(granular_specialisation)
        specialiasation_cleaned.append(specialisation)

    df['specialisations_processed'] = processed_specialisations
    df['granular_specialisations'] = granular_specialisations
    df['job_title_cleaned'] = specialiasation_cleaned

    level_classification = pd.read_csv(
        "https://docs.google.com/spreadsheets/d/e/2PACX-1vSXxWGKiCFw6QRXV09znbdHmd5HqCGgzl8o8qGndrft2U9I9fyNz94rblr69YLQkqhDiTkGrwGH6M4R/pub?gid=2085110439&single=true&output=csv")
    level_classification['Categories'] = level_classification['Categories'].fillna(method='ffill')
    level_classification = level_classification.applymap(lambda s: s.lower() if type(s) == str else s)
    level_classification = level_classification.drop('Levels', axis=1)
    level_classification.columns = ['level', 'granular_specialisation']
    level_mapping = {}
    for index, row in level_classification.iterrows():
        level_mapping.update({row['granular_specialisation']: row['level']})
    print(level_mapping)
    levels_processed = []
    unsettled = []
    for spec in df.granular_specialisations:
        if spec == "None":
            levels_processed.append("None")
        elif spec in level_mapping.keys():
            levels_processed.append(level_mapping[(spec)])
        else:
            levels_processed.append("undefined")
            unsettled.append(spec)
    df['levels_processed'] = levels_processed
    # df.to_csv('features_created_.csv')
    return df
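
baseline_model is used as build_fn inside processData but is not defined in this excerpt. A minimal sketch of a builder that would fit the 300-dimensional spaCy vectors and one-hot sector targets built above; the hidden layer width and the default class count are placeholders:

from keras.models import Sequential
from keras.layers import Dense


def baseline_model(n_features=300, n_classes=11):
    # n_classes should match len(unique_sectors) computed in processData;
    # 11 is only a placeholder default
    model = Sequential()
    model.add(Dense(64, input_dim=n_features, activation='relu'))
    model.add(Dense(n_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model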
Example no. 13
def grid_search(X_train_, X_test_, y_train_, y_test_):
    from sklearn.model_selection import GridSearchCV
    from sklearn.preprocessing import StandardScaler
    from keras.wrappers.scikit_learn import KerasClassifier

    scaler = StandardScaler()
    X_transform = scaler.fit_transform(X_train_)

    #parameters_svm = {'kernel': ('linear',  'poly', 'rbf'), 'C': [1, 10, 100, 1e5]}
    parameters_svm = {'kernel': ['linear'], 'C': [1]}

    #parameters_logistic = {'solver': ('liblinear', 'saga'), 'C': [1, 10, 100, 1e5], 'max_iter': [1000, 2000, 3000]}
    parameters_logistic = {
        'solver': ['liblinear'],
        'C': [1],
        'max_iter': [1000]
    }

    #parameters_decisiontree = {'criterion': ('entropy', 'gini'), 'max_depth': [10, 21, 42]}
    parameters_decisiontree = {'criterion': ['entropy'], 'max_depth': [10]}

    #parameters_kneighbors = {'n_neighbors': (10, 15, 21, 27), 'p': (1, 2, 3)}
    parameters_kneighbors = {'n_neighbors': [10], 'p': [1]}

    #parameters_randomforest = {'n_estimators': (25,50,100,150), 'criterion': ('entropy', 'gini'), 'max_depth': [10, 21, 42]}

    parameters_randomforest = {
        'n_estimators': [25],
        'criterion': ['entropy'],
        'max_depth': [10]
    }
    #parameters_nn = {'dropout_rate': (0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9), 'neurons': (88, 128, 168, 208, 248),
    #                           'batch_size': [20, 40, 60, 80], 'epochs': [10, 50, 100]}

    parameters_nn = {
        'dropout_rate': [0.2],
        'neurons': [88],
        'batch_size': [20],
        'epochs': [10]
    }

    parameters_list = [
        parameters_nn, parameters_decisiontree, parameters_svm,
        parameters_kneighbors, parameters_randomforest, parameters_logistic
    ]

    model5 = LogisticRegression(random_state=0, multi_class='auto')
    model1 = DecisionTreeClassifier(random_state=0)
    model2 = SVC(tol=1e-3, random_state=0, gamma="scale", verbose=True)
    # model2 = SVC(kernel='linear')
    model3 = KNeighborsClassifier(metric='minkowski', algorithm='auto')
    model4 = RandomForestClassifier(random_state=1, n_jobs=2)
    model = KerasClassifier(build_fn=nn_model,
                            input_dim=np.shape(X_train_)[1],
                            verbose=0)
    model_list = [model, model1, model2, model3, model4, model5]
    model_name_list = [
        'NeuralNetwork', 'DecisionTree', 'SVM', 'KNeighbors', 'RandomForest',
        'LogisticRegression'
    ]

    for name, mod, parameter in zip(model_name_list, model_list,
                                    parameters_list):
        clf = GridSearchCV(mod, parameter, cv=5, scoring='balanced_accuracy')
        clf.fit(X_transform, y_train_)
        print(clf.cv_results_.keys())
        with open(name + 'parameter.csv', 'w') as f:
            w = csv.writer(f)
            w.writerow(clf.cv_results_.keys())
            for i in range(len(clf.cv_results_['mean_fit_time'])):
                row = []
                for ele_key in clf.cv_results_.keys():
                    print(type(clf.cv_results_[ele_key]))
                    if (isinstance(clf.cv_results_[ele_key],
                                   np.ma.core.MaskedArray)):
                        row.append((clf.cv_results_[ele_key].data)[i])
                    else:
                        row.append(clf.cv_results_[ele_key][i])
                w.writerow(row)
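
nn_model, the build_fn of the Keras estimator above, is not included in the excerpt. Its keyword arguments should line up with parameters_nn and the input_dim passed to KerasClassifier; a sketch under that assumption (the sigmoid output and binary loss are guesses, since the target is not shown):

from keras.models import Sequential
from keras.layers import Dense, Dropout


def nn_model(input_dim, neurons=88, dropout_rate=0.2):
    # neurons and dropout_rate are the hyperparameters tuned via parameters_nn
    model = Sequential()
    model.add(Dense(neurons, input_dim=input_dim, activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model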
Example no. 14
    model.add(Dense(512, init='normal', activation='relu'))
    model.add(Dense(9, init='normal', activation="softmax"))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
encoder.fit(train_y)
encoded_y = encoder.transform(train_y)

dummy_y = np_utils.to_categorical(encoded_y)
print(dummy_y.shape)

estimator = KerasClassifier(build_fn=baseline_model, nb_epoch=10, batch_size=64)
estimator.fit(sentence_vectors[0:3321], dummy_y, validation_split=0.05)

y_pred = estimator.predict_proba(sentence_vectors[3321:])

""" Submission """
submission = pd.DataFrame(y_pred)
submission['id'] = test_index
submission.columns = ['class1', 'class2', 'class3', 'class4', 'class5', 'class6', 'class7', 'class8', 'class9', 'id']
submission.to_csv("submission_keras_classify.csv",index=False)





   
# baseline
def create_baseline():
    # create model
    model = Sequential()
    model.add(Dense(60, input_dim=60, init='normal', activation='relu'))
    model.add(Dense(30, init='normal', activation='relu'))
    model.add(Dense(1, init='normal', activation='sigmoid'))
    # Compile model
    sgd = SGD(lr=0.01, momentum=0.8, decay=0.0, nesterov=False)
    model.compile(loss='binary_crossentropy',
                  optimizer=sgd,
                  metrics=['accuracy'])
    return model


numpy.random.seed(seed)
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp',
                   KerasClassifier(build_fn=create_baseline,
                                   nb_epoch=300,
                                   batch_size=16,
                                   verbose=0)))
pipeline = Pipeline(estimators)
kfold = StratifiedKFold(y=encoded_Y,
                        n_folds=10,
                        shuffle=True,
                        random_state=seed)
results = cross_val_score(pipeline, X, encoded_Y, cv=kfold)
print("Accuracy: %.2f%% (%.2f%%)" %
      (results.mean() * 100, results.std() * 100))
Example no. 16
def train_wrapper(filename):  # filename='measurements.mat'
    global dataset
    global seed
    global model
    global ss
    global mX
    global nX
    global mY
    global nY

    data = loadmat(filename)  # this is a dict.
    keys = list(data.keys())[3:]  # skip the first three entries (MAT-file metadata)
    values = list(data.values())[3:]

    dataset = pd.DataFrame()
    dataset = dataset.reindex(columns=keys)  # create an empty dataframe

    for ii in np.arange(len(values)):
        v_ = np.array(values[ii])
        dataset[keys[ii]] = pd.Series(
            v_.flatten())  # cannot add the data to this empty df.

    dataset['y'] = 1 * (dataset['BLER'] <= 0.19)  # H-ARQ target.
    dataset = dataset[['RSRP', 'TBSINR_1', 'rank', 'y']]
    dataset.dropna(inplace=True, axis=0)
    if os.path.exists('dataset.csv'):
        dataset.to_csv('dataset.csv', index=False, mode='a',
                       header=False)  # append
    else:
        dataset.to_csv('dataset.csv', index=False)

    #print(dataset.head())

    # Perform a split 30-70
    train, test = train_test_split(dataset, test_size=0.30, random_state=seed)

    X_train = train.drop('y', axis=1)
    X_test = test.drop('y', axis=1)

    y_train = train['y'].values
    y_test = test['y'].values

    mX, nX = X_train.shape
    mY = y_train.shape
    nY = 1

    ss = MinMaxScaler(feature_range=(0, 1))

    # Scale the variables
    X_train_sc = ss.fit_transform(X_train)
    X_test_sc = ss.transform(X_test)

    model = KerasClassifier(build_fn=create_mlp,
                            verbose=0,
                            epochs=10,
                            batch_size=8)

    # The hyperparameters
    width_dims = [3, 5, 10]
    n_hiddens = [3, 5]  # the depth of hidden layers

    hyperparameters = dict(width=width_dims, depth=n_hiddens)
    class_weights = class_weight.compute_class_weight('balanced',
                                                      np.unique(y_train),
                                                      y_train)

    grid = GridSearchCV(estimator=model,
                        param_grid=hyperparameters,
                        n_jobs=1,
                        cv=3)
    gpu_available = tf.test.is_gpu_available()
    if (gpu_available == False):
        print('WARNING: No GPU available.  Will continue with CPU.')

    with tf.device('/gpu:0'):
        grid_result = grid.fit(X_train_sc, y_train, class_weight=class_weights)

    # This is the best model
    best_model_mlp = grid_result.best_params_
    print(best_model_mlp)

    model = grid_result.best_estimator_
    mlp = model

    y_pred = mlp.predict(X_test_sc)
    y_score = mlp.predict_proba(X_test_sc)

    mu = accuracy_score(y_test, y_pred)
    # Compute ROC curve and ROC area
    try:
        roc_auc = roc_auc_score(y_test, y_score[:, 1])
    except:
        print('WARNING: ROC was not computed.  Returning NaN')
        roc_auc = np.nan

    print('ROC for training is: {}'.format(roc_auc))
    print('Misclassification error for training is: {:.3f}'.format(1 - mu))

    return [roc_auc, 1 - mu]  # model is valid
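
create_mlp is referenced but not defined here; the hyperparameter grid suggests it takes width (units per hidden layer) and depth (number of hidden layers). A minimal sketch under those assumptions, using the three input features selected above:

from keras.models import Sequential
from keras.layers import Dense


def create_mlp(width=5, depth=3):
    # three input features (RSRP, TBSINR_1, rank), binary H-ARQ target
    model = Sequential()
    model.add(Dense(width, input_dim=3, activation='relu'))
    for _ in range(depth - 1):
        model.add(Dense(width, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model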
Example no. 17
embedding_matrix_frWac = embd_bin(
    200, 'embeddings/frWac_non_lem_no_postag_no_phrase_200_cbow_cut0.bin')

#https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.fr.vec
wiki_dir = 'embeddings/wiki.fr/wiki.fr.vec'

embedding_matrix_wiki = embd(embedding_dim2, wiki_dir)

num_epochs = 50
batch_size = 102
##
ppl1 = Pipeline([("MODEL_WV",
                  KerasClassifier(MODEL_WV,
                                  epochs=num_epochs + 50,
                                  batch_size=batch_size,
                                  verbose=1,
                                  shuffle=False))])

ppl2 = Pipeline([("MODEL_wiki",
                  KerasClassifier(MODEL_wiki,
                                  epochs=num_epochs + 50,
                                  batch_size=batch_size,
                                  verbose=1,
                                  shuffle=False))])

ppl3 = Pipeline([("model_cv_wv",
                  KerasClassifier(model_cv_wv,
                                  epochs=num_epochs + 30,
                                  batch_size=batch_size,
                                  verbose=1,
                                  shuffle=False))])

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold

#****Basic split of Data****###
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42,shuffle=True,stratify=y)

#kfold = KFold(n_splits=5, shuffle=True)

#***Stratified Split of the Data
skf = StratifiedKFold(n_splits=5, random_state=None, shuffle=True)

for train_index, test_index in skf.split(X, y):
    print("Train:", train_index, "Validation:", val_index)
    X_train, X_test = X[train_index], X[val_index]
    y_train, y_test = y[train_index], y[val_index]

estimator = KerasClassifier(build_fn=rock_classifier,
                            epochs=20,
                            batch_size=1,
                            verbose=1)

results = cross_val_score(estimator, X_train, y_train, cv=skf)

print("Baseline: %.2f%% (%.2f%%)" %
      (results.mean() * 100, results.std() * 100))

filename = 'results_400_skf.sav'
joblib.dump(results, filename)
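
rock_classifier is not included in the excerpt. A sketch of a compatible build function, assuming a multi-class target; the feature and class counts below are placeholders, not values from the original data:

from keras.models import Sequential
from keras.layers import Dense


def rock_classifier(n_features=400, n_classes=5):
    # placeholder sizes; the real values depend on the feature matrix X
    model = Sequential()
    model.add(Dense(64, input_dim=n_features, activation='relu'))
    model.add(Dense(n_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model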
Example no. 19
              kernel_initializer="random_uniform"))
    classifier.add(PReLU(input_shape=(6, )))  #Parametric RELU
    classifier.add(Dropout(rate=0.1))

    classifier.add(
        Dense(units=1,
              activation="sigmoid",
              kernel_initializer="random_uniform"))
    classifier.compile(optimizer=optimizer,
                       loss="binary_crossentropy",
                       metrics=['accuracy'])
    return classifier


# Grid Search for finding the best hyper params
model = KerasClassifier(build_fn=classifier_fn, epochs=10, batch_size=25)
params = {
    'epochs': [10, 25, 100],
    'batch_size': [10, 100],
    'optimizer': ['adam', 'rmsprop', 'SGD'],
    'dp1': [0.12, 0.25]
}
gridSearch = GridSearchCV(estimator=model,
                          param_grid=params,
                          scoring='accuracy',
                          cv=3)
gs = gridSearch.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)
#####################################
# Set callback functions to early stop training and save the best model so far
# create model
def baseline_model():
    model = Sequential()
    model.add(Dense(100, input_shape=(10249,)))
    model.add(Activation('relu'))
    model.add(Dropout(0.7))   
    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


# In[43]:

estimator = KerasClassifier(build_fn=baseline_model, nb_epoch=nb_epoch, batch_size=batch_size, verbose=1)
estimator.fit(X_train_mat, y_train_cat)


# In[44]:

predictions = estimator.predict(X_test_mat)
print(set(predictions))
print(encoder.inverse_transform(predictions))


# In[45]:

print 'macro f1:', f1_score(encoded_Y_test, predictions, average='macro')

Example no. 21
        pool_length=model.output_shape[1]))
    model.add(Flatten())
    model.add(Dense(hidden_dims))
    model.add(Dropout(0.5))
    model.add(Activation('relu'))
    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))
    model.compile(loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'])
    print(model.summary())
    return model

# Build the model
model = KerasClassifier(
    build_fn=build_model, 
    nb_epoch=nb_epoch, 
    batch_size=batch_size)

# Load the test data
data = json.load(open("./newstext/data-mini.json"))
X = data["X"]
Y = data["Y"]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y)
Y_train = np_utils.to_categorical(Y_train, nb_classes)
print(len(X_train),len(Y_train))

# Train
model.fit(X_train, Y_train, verbose=1)

y = model.predict(X_test)
print(y)
Example no. 22

def train_and_evaluate_model(model, features, labels):
    # Test, Train, Valid Split
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.10,random_state=832289)
    X_train, X_valid, y_train, y_valid = train_test_split(X_train,y_train,test_size=0.25,random_state=832289)
    # Fit the model
    history = model.fit(X_train, y_train,batch_size=batch_size, nb_epoch=nb_epoch,verbose=1, validation_data=(X_valid, y_valid))
    # evaluate the model
    scores = model.evaluate(X_test, y_test, verbose=0)
    print("Test Accuracy : %s: %.2f%%" % (model.metrics_names[1], scores[1]*100))


# # Find Best Dropout rate using GridCV Search

model = KerasClassifier(build_fn=create_model, nb_epoch=10, batch_size=5, verbose=1)
dropout_rate = [0.0, 0.2, 0.5]
param_grid = dict(dropout_rate=dropout_rate)
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
grid_result = grid.fit(features, labels)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
for params, mean_score, scores in grid_result.grid_scores_:
    print("%f (%f) with: %r" % (scores.mean(), scores.std(), params))


# # Training the model using the dropouts , discovered by GridCV Search

best_params = grid_result.best_params_
dropout_keep_prob = best_params['dropout_rate']

Example no. 23
metric = 'accuracy'
tuned_parameters = {
    'epochs': [50],
    'batch_size': [16, 32, 64],
    'conv_layers': [2, 3],
    'filters': [16, 32],
    'kernel_size': [3, 5],
    'units': [256, 512, 1024],
    'dropout_rate': [0.3, 0.5],
    'optimizer': ['adam'],
    'init_mode': ['glorot_uniform'],
}

print('> Grid search:')
print(' - Tuning hyper-parameters for \'{}\' metric\n'.format(metric))
grid_search = GridSearchCV(KerasClassifier(build_fn=build_model, verbose=0),
                           tuned_parameters,
                           cv=3,
                           n_jobs=-1,
                           verbose=2)

print(' - ', end='')
grid_search.fit(X,
                y,
                callbacks=[
                    tf.keras.callbacks.EarlyStopping(monitor='loss',
                                                     patience=5,
                                                     min_delta=0.01,
                                                     restore_best_weights=True)
                ])
ch1 = np.append(buf,zo,axis=1)
print('ch: ',str(1))
print(ch1)
print(ch1.shape)

# Separate to Input | Output
X = ch1[:,0:32].astype(float)
Y = ch1[:,32]

# Larger model
def create_larger():
    # Create model (neural network)
    model = Sequential()
    model.add(Dense(32, input_dim=32, activation='relu'))
    model.add(Dense(16, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

print('Estimating...')
# KerasClassifier
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasClassifier(build_fn=create_larger, epochs=100, batch_size=5, verbose=1)))
pipeline = Pipeline(estimators)
kfold = StratifiedKFold(n_splits=10, shuffle=True)
results = cross_val_score(pipeline, X, Y, cv=kfold)
print("Result: acc:%.2f%% stdev:(%.2f%%)" % (results.mean()*100, results.std()*100))
Example no. 25
def get_model_from_name(model_name, training_params=None):

    # For Keras
    epochs = 250
    if os.environ.get('is_test_suite',
                      0) == 'True' and model_name[:12] == 'DeepLearning':
        print(
            'Heard that this is the test suite. Limiting number of epochs, which will increase training speed dramatically at the expense of model accuracy'
        )
        epochs = 30

    all_model_params = {
        'LogisticRegression': {
            'n_jobs': -2
        },
        'RandomForestClassifier': {
            'n_jobs': -2
        },
        'ExtraTreesClassifier': {
            'n_jobs': -1
        },
        'AdaBoostClassifier': {
            'n_estimators': 10
        },
        'SGDClassifier': {
            'n_jobs': -1
        },
        'Perceptron': {
            'n_jobs': -1
        },
        'LinearSVC': {
            'dual': False
        },
        'LinearRegression': {
            'n_jobs': -2
        },
        'RandomForestRegressor': {
            'n_jobs': -2
        },
        'LinearSVR': {
            'dual': False,
            'loss': 'squared_epsilon_insensitive'
        },
        'ExtraTreesRegressor': {
            'n_jobs': -1
        },
        'MiniBatchKMeans': {
            'n_clusters': 8
        },
        'GradientBoostingRegressor': {
            'presort': False,
            'learning_rate': 0.05,
            'warm_start': True
        },
        'GradientBoostingClassifier': {
            'presort': False,
            'learning_rate': 0.05,
            'warm_start': True
        },
        'SGDRegressor': {
            'shuffle': False
        },
        'PassiveAggressiveRegressor': {
            'shuffle': False
        },
        'AdaBoostRegressor': {
            'n_estimators': 10
        },
        'XGBRegressor': {
            'nthread': -1,
            'n_estimators': 200
        },
        'XGBClassifier': {
            'nthread': -1,
            'n_estimators': 200
        },
        'LGBMRegressor': {
            'n_estimators': 2000,
            'learning_rate': 0.05,
            'num_leaves': 8,
            'lambda_l2': 0.001
        },
        'LGBMClassifier': {
            'n_estimators': 2000,
            'learning_rate': 0.05,
            'num_leaves': 8,
            'lambda_l2': 0.001
        },
        'DeepLearningRegressor': {
            'epochs': epochs,
            'batch_size': 50,
            'verbose': 2
        },
        'DeepLearningClassifier': {
            'epochs': epochs,
            'batch_size': 50,
            'verbose': 2
        },
        'CatBoostRegressor': {},
        'CatBoostClassifier': {}
    }

    model_params = all_model_params.get(model_name, None)
    if model_params is None:
        model_params = {}

    if training_params is not None:
        print('Now using the model training_params that you passed in:')
        print(training_params)
        # Overwrite our stock params with what the user passes in (i.e., if the user wants 10,000 trees, we will let them do it)
        model_params.update(training_params)
        print(
            'After overwriting our defaults with your values, here are the final params that will be used to initialize the model:'
        )
        print(model_params)

    model_map = {
        # Classifiers
        'LogisticRegression': LogisticRegression(),
        'RandomForestClassifier': RandomForestClassifier(),
        'RidgeClassifier': RidgeClassifier(),
        'GradientBoostingClassifier': GradientBoostingClassifier(),
        'ExtraTreesClassifier': ExtraTreesClassifier(),
        'AdaBoostClassifier': AdaBoostClassifier(),
        'SGDClassifier': SGDClassifier(),
        'Perceptron': Perceptron(),
        'PassiveAggressiveClassifier': PassiveAggressiveClassifier(),
        'LinearSVC': LinearSVC(),

        # Regressors
        'LinearRegression': LinearRegression(),
        'RandomForestRegressor': RandomForestRegressor(),
        'Ridge': Ridge(),
        'LinearSVR': LinearSVR(),
        'ExtraTreesRegressor': ExtraTreesRegressor(),
        'AdaBoostRegressor': AdaBoostRegressor(),
        'RANSACRegressor': RANSACRegressor(),
        'GradientBoostingRegressor': GradientBoostingRegressor(),
        'Lasso': Lasso(),
        'ElasticNet': ElasticNet(),
        'LassoLars': LassoLars(),
        'OrthogonalMatchingPursuit': OrthogonalMatchingPursuit(),
        'BayesianRidge': BayesianRidge(),
        'ARDRegression': ARDRegression(),
        'SGDRegressor': SGDRegressor(),
        'PassiveAggressiveRegressor': PassiveAggressiveRegressor(),

        # Clustering
        'MiniBatchKMeans': MiniBatchKMeans()
    }

    if xgb_installed:
        model_map['XGBClassifier'] = XGBClassifier()
        model_map['XGBRegressor'] = XGBRegressor()

    if lgb_installed:
        model_map['LGBMRegressor'] = LGBMRegressor()
        model_map['LGBMClassifier'] = LGBMClassifier()

    if catboost_installed:
        model_map['CatBoostRegressor'] = CatBoostRegressor(
            calc_feature_importance=True)
        model_map['CatBoostClassifier'] = CatBoostClassifier(
            calc_feature_importance=True)

    if keras_installed:
        model_map['DeepLearningClassifier'] = KerasClassifier(
            build_fn=make_deep_learning_classifier)
        model_map['DeepLearningRegressor'] = KerasRegressor(
            build_fn=make_deep_learning_model)

    try:
        model_without_params = model_map[model_name]
    except KeyError as e:
        print(
            'It appears you are trying to use a library that is not available when we try to import it, or using a value for model_names that we do not recognize'
        )
        raise (e)
    model_with_params = model_without_params.set_params(**model_params)

    return model_with_params
Example no. 26
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model


seed = 7

earlyStopping = callbacks.EarlyStopping(monitor='loss',
                                        patience=1,
                                        verbose=0,
                                        mode='auto')

model = KerasClassifier(build_fn=create_model,
                        epochs=90,
                        batch_size=50,
                        verbose=1)

# evaluate using 10-fold cross validation
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
results = model_selection.cross_val_score(
    model,
    x_train,
    y_train,
    cv=kfold,
    scoring='neg_log_loss',
    fit_params={'callbacks': [earlyStopping]})

print(results.mean())

model.fit(x_train, y_train, callbacks=[earlyStopping])
Example no. 27
	model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

	print ("done remodel")

	return model


print ("")
print ("====================================")
print ("START")
print ("====================================")
print ("")


#train the model and create it as an estimator
estimator = KerasClassifier(build_fn=baseline_model, epochs=200, batch_size=5, verbose=0)

#the estimator must be fitted so it can be saved as a model
estimator.fit(X,Y)

print ("")
print ("====================================")
print("TESTING")
print ("====================================")
print ("")

#testing with k-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)

results = cross_val_score(estimator, X, dummy_y, cv=kfold)
print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))
Example no. 28
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test , y_pred)


#predict a new observation
newObserv = np.array([[0.0,0,600,1,40,3,60000,2,1,1,50000]])
newObserv = sc_x.transform(newObserv)
newPredection = classifier.predict(newObserv)
newPredection = (newPredection>0.5)
'''

#using k-fold cross-validation
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score

classifier = KerasClassifier(build_fn=build_classifier , batch_size = 10 , epochs = 100 )
accuracies = cross_val_score(estimator = classifier, X=x_train , y= y_train, cv=10 , n_jobs=-1 )

mean = accuracies.mean()
variance = accuracies.std()


#ANN tuning
""" from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV


classifier = KerasClassifier(build_fn=build_classifier , batch_size = 10 , epochs = 100 )
accuracies = cross_val_score(estimator = classifier, X=x_train , y= y_train, cv=10 , n_jobs=-1 )

mean = accuracies.mean()
"""
    model.add(Dropout(0.5))
    model.add(
        Dense(len(df_train_new_arr[0]) * 3, init='uniform', activation='tanh'))
    model.add(Dropout(0.6))
    model.add(
        Dense(len(df_train_new_arr[0]) * 1, init='uniform', activation='tanh'))
    model.add(Dropout(0.5))
    model.add(Dense(len(target[0]), init='uniform', activation='sigmoid'))
    model.compile(loss='categorical_crossentropy',
                  optimizer=rms,
                  metrics=['accuracy'])
    return model


estimator = KerasClassifier(build_fn=keras_model,
                            nb_epoch=49,
                            batch_size=500,
                            verbose=1)
np.random.seed(123)
estimator.fit(df_train_new_arr_normed,
              target,
              verbose=1,
              validation_split=0.3,
              show_accuracy=True)
predictions = estimator.predict_proba(df_sub_arr_normed)
pred = pd.DataFrame(data=predictions,
                    columns=[x for x in products if x not in drop_targets])
pred['ncodpers'] = df_sub['ncodpers']

# Removing items already present in May-16
pred_T = pd.melt(pred, id_vars="ncodpers", var_name="Var", value_name="Val")
Example no. 30
#use sklearn.grid_search instead if your sklearn version does not provide GridSearchCV in model_selection


def build_classifier(optimizer):
    classifier = Sequential()
    classifier.add(
        Dense(output_dim=6, init='uniform', activation='relu', input_dim=11))
    classifier.add(Dense(output_dim=6, init='uniform', activation='relu'))
    classifier.add(Dense(output_dim=1, init='uniform', activation='sigmoid'))
    classifier.compile(optimizer=optimizer,
                       loss='binary_crossentropy',
                       metrics=['accuracy'])
    return classifier


classifier = KerasClassifier(build_fn=build_classifier)
#batch_size and nb_epoch are not set here because they will be tuned by GridSearchCV
#use GridSearchCV to find the batch size (and other settings) that give the best accuracy

parameters = {
    'batch_size': [10, 20, 50, 100, 250, 500],
    'nb_epoch': [100, 500],
    'optimizer': ['adam', 'rmsprop']
}

grid_search = GridSearchCV(estimator=classifier,
                           param_grid=parameters,
                           scoring='neg_mean_squared_error',
                           cv=10)
grid_search = grid_search.fit(X_train, Y_train)
best_parameters = grid_search.best_params_
Esempio n. 31
0
class Baseline(object):
    """Provide general machine learning models as baseline."""
    def __init__(self, train, valid, target, features, impute=True):
        super(Baseline, self).__init__()
        self.target = target
        self.features = features

        self.train = train
        self.valid = valid
        
        if impute:
            import pandas as pd
            from sklearn.preprocessing import Imputer

            self.train_prep = pd.DataFrame(Imputer(strategy='mean').fit_transform(self.train), columns=self.train.columns)
            self.valid_prep = pd.DataFrame(Imputer(strategy='mean').fit_transform(self.valid), columns=self.valid.columns)
        else:
            self.train_prep = self.train
            self.valid_prep = self.valid          

    def LR(self, report=False):
        """Logistic Regression.

        Args:
            report: whether to print out the model analysis report.
        Returns:
            Logistic regression model."""
        from sklearn.linear_model import LogisticRegression

        self.lr = LogisticRegression(n_jobs=-1)
        self.lr.fit(self.train_prep[self.features], self.train_prep[self.target])

        if report:
            from Report import Report
            rpt = Report(self.lr, self.train_prep, self.valid_prep, self.target, self.features)
            rpt.ALL()

        return self.lr
    
    def RF(self, report=False):
        """Random Forest.

        Args:
            report: whether to print out the model analysis report.
        Returns:
            Decision tree model generated from Random Forest."""
        from sklearn.ensemble import RandomForestClassifier

        self.rf = RandomForestClassifier(n_estimators=1000, 
                                        max_features='sqrt',
                                        max_depth=10,
                                        random_state=0, 
                                        n_jobs=-1)
        self.rf.fit(self.train_prep[self.features], self.train_prep[self.target])

        if report:
            from Report import Report
            rpt = Report(self.rf, self.train_prep, self.valid_prep, self.target, self.features)
            rpt.ALL()

        return self.rf

    def GBDT(self, report=False):
        """Gradient Boosting Decision Tree.

        Args:
            report: whether to print out the model analysis report.
        Returns:
            Decision tree model generated from Gradient Boosting Decision Tree."""
        from xgboost.sklearn import XGBClassifier

        self.gbdt = XGBClassifier(objective='binary:logistic',
                                  booster='gbtree',
                                  learning_rate=0.01,
                                  n_estimators=5000,
                                  max_depth=3,
                                  subsample=0.75,
                                  colsample_bytree=0.75,
                                  n_jobs=4,
                                  random_state=2018)

        self.gbdt.fit(self.train_prep[self.features], self.train_prep[self.target])
        
        if report:
            from Report import Report
            rpt = Report(self.gbdt, self.train, self.valid, self.target, self.features)
            rpt.ALL()

        return self.gbdt

    def NN(self, report=False):
        """Neutral Network.

        Args:
            report: whether print out the model analysis report.
        Returns:
            One layer neutral network model."""
        from keras.models import Sequential
        from keras.layers import Dense
        from keras.wrappers.scikit_learn import KerasClassifier

        def baseline_model():
            model = Sequential()
            model.add(Dense(8, input_dim=len(self.features), activation='relu'))
            model.add(Dense(1, activation='sigmoid'))
            model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
            return model    

        self.nn = KerasClassifier(build_fn=baseline_model, epochs=5, batch_size=5, verbose=1)
        self.nn.fit(self.train[self.features], self.train[self.target])

        if report:
            from Report import Report
            rpt = Report(self.nn, self.train, self.valid, self.target, self.features)
            rpt.ALL()

        return self.nn
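A minimal usage sketch for the Baseline class above; the DataFrames, the target column name and the feature list are assumptions:

# assumed: train_df and valid_df are pandas DataFrames that both contain the 'label' column
baseline = Baseline(train_df, valid_df, target='label', features=['f1', 'f2', 'f3'])
lr_model = baseline.LR()               # logistic regression baseline
nn_model = baseline.NN(report=False)   # one-layer neural network baseline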
Esempio n. 32
0
class Baseline(object):
    """Provide general machine learning models as baseline."""
    def __init__(self, train, valid, target, features, impute=True):
        super(Baseline, self).__init__()
        self.target = target
        self.features = features

        self.train = train
        self.valid = valid
        
        if impute:
            import pandas as pd
            from sklearn.preprocessing import Imputer

            self.train_prep = pd.DataFrame(Imputer(strategy='mean').fit_transform(self.train), columns=self.train.columns)
            self.valid_prep = pd.DataFrame(Imputer(strategy='mean').fit_transform(self.valid), columns=self.valid.columns)
        else:
            self.train_prep = self.train
            self.valid_prep = self.valid          

    def LR(self, report=False):
        """Logistic Regression.

        Args:
            report: whether to print out the model analysis report.
        Returns:
            Logistic regression model."""
        from sklearn.linear_model import LogisticRegression

        self.lr = LogisticRegression(n_jobs=-1)
        self.lr.fit(self.train_prep[self.features], self.train_prep[self.target])

        if report:
            from Report import Report
            rpt = Report(self.lr, self.train_prep, self.valid_prep, self.target, self.features)
            rpt.ALL()

        return self.lr
    
    def RF(self, report=False):
        """Random Forest.

        Args:
            report: whether to print out the model analysis report.
        Returns:
            Decision tree model generated from Random Forest."""
        from sklearn.ensemble import RandomForestClassifier

        self.rf = RandomForestClassifier(n_estimators=1000, 
                                        max_features='sqrt',
                                        max_depth=10,
                                        random_state=0, 
                                        n_jobs=-1)
        self.rf.fit(self.train_prep[self.features], self.train_prep[self.target])

        if report:
            from Report import Report
            rpt = Report(self.rf, self.train_prep, self.valid_prep, self.target, self.features)
            rpt.ALL()

        return self.rf

    def GBDT(self, report=False):
        """Gradient Boosting Decision Tree.

        Args:
            report: whether to print out the model analysis report.
        Returns:
            Decision tree model generated from Gradient Boosting Decision Tree."""
        import lightgbm as lgb
        from sklearn.model_selection import train_test_split

        train, test = train_test_split(self.train, test_size=0.2, random_state=0)

        lgb_train = lgb.Dataset(train[self.features], train[self.target], free_raw_data=False)
        lgb_valid = lgb.Dataset(test[self.features], test[self.target], reference=lgb_train, free_raw_data=False)
        
        params = {
            'boosting_type': 'gbdt',
            'objective': 'binary',
            'metric': 'auc',
            'num_leaves': 64,
            'learning_rate': 0.01,
            'feature_fraction': 0.75,
            'bagging_fraction': 0.75,
            'bagging_freq': 5,
            'verbose': 0
        }

        self.gbdt = lgb.train(params,
                        lgb_train,
                        num_boost_round=10000,
                        valid_sets=[lgb_valid],
                        early_stopping_rounds=200,
                        verbose_eval=100)
        if report:
            from Report import Report
            rpt = Report(self.gbdt, self.train, self.valid, self.target, self.features)
            rpt.ALL()

        return self.gbdt

    def NN(self, report=False):
        """Neutral Network.

        Args:
            report: whether print out the model analysis report.
        Returns:
            One layer neutral network model."""
        from keras.models import Sequential
        from keras.layers import Dense
        from keras.wrappers.scikit_learn import KerasClassifier

        def baseline_model():
            model = Sequential()
            model.add(Dense(8, input_dim=len(self.features), activation='relu'))
            model.add(Dense(1, activation='sigmoid'))
            model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
            return model    

        self.nn = KerasClassifier(build_fn=baseline_model, epochs=5, batch_size=5, verbose=1)
        self.nn.fit(self.train[self.features], self.train[self.target])

        if report:
            from Report import Report
            rpt = Report(self.nn, self.train, self.valid, self.target, self.features)
            rpt.ALL()

        return self.nn
Esempio n. 33
0
"""
Now it is time to evaluate this model using stratified cross validation in the
scikit-learn framework.

To use Keras models with scikit-learn, we must use the KerasClassifier wrapper.
This class takes a function that creates and returns our neural network model.
It also takes arguments that it will pass along to the call to fit() such as
the number of epochs and the batch size.
We pass the number of training epochs to the KerasClassifier, again using
reasonable default values. Verbose output is also turned off given that the
model will be created 10 times for the 10-fold cross validation being
performed.
"""
# Rescale our data
# evaluate baseline model with standardized dataset
estimator = KerasClassifier(build_fn=create_baseline, epochs=100, batch_size=5, verbose=0)


"""
We are going to use scikit-learn to evaluate the model using stratified k-fold
cross validation. This is a resampling technique that will provide an estimate
of the performance of the model. It does this by splitting the data into
k-parts, training the model on all parts except one which is held out as a test
set to evaluate the performance of the model. This process is repeated k-times
and the average score across all constructed models is used as a robust
estimate of performance. It is stratified, meaning that it will look at the
output values and attempt to balance the number of instances that belong to
each class in the k-splits of the data.
"""
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(estimator, X, encoded_Y, cv=kfold)
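The comments above mention evaluating the baseline on a standardized dataset, but no scaling actually appears in this excerpt. A hedged sketch of the usual pattern, wrapping the scaler and the Keras estimator in a scikit-learn Pipeline so standardization is fitted separately within each fold:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

estimators = [('standardize', StandardScaler()),
              ('mlp', KerasClassifier(build_fn=create_baseline, epochs=100, batch_size=5, verbose=0))]
pipeline = Pipeline(estimators)
results = cross_val_score(pipeline, X, encoded_Y, cv=kfold)
print("Standardized: %.2f%% (%.2f%%)" % (results.mean() * 100, results.std() * 100))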
Esempio n. 34
0
    le_target = LabelEncoder().fit(train[target])
    y = le_target.transform(train[target])

    train = train.drop([target, 'image', 'filename'], axis=1)

    combined_features = Pipeline([
        ('pca',
         Pipeline([
             ('scaler', StandardScaler()),
             ('pca', PCA(n_components=input_dim)),
         ])),
    ])

    X = combined_features.fit_transform(train.as_matrix())

    model = KerasClassifier(build_fn=create_mlp)

    splitter = ShuffleSplit(n_splits=5, test_size=0.1, random_state=0)
    cv_splits = cv_split_generator(X=X, y=y, splitter=splitter)

    scores = []
    hist = {}
    for i, X_train, X_val, y_train, y_val in cv_splits:
        X = combined_features.fit_transform(train.as_matrix())
        results = model.fit(X_train,
                            y_train,
                            nb_epoch=250,
                            batch_size=128,
                            validation_split=0.1,
                            verbose=1)
base_nns = load_all_nns(n_nns)
print('Loaded %d models' % len(base_nns))

# fit stacked model using the ensemble
model = fit_stacked_model(base_nns, X_val, y_val)

# evaluate model on test set
ypred = stacked_prediction(base_nns, model, X_val)
acc = accuracy_score(y_val, ypred)


def get_model():
    return load_model('base_nn.h5')


stk_nn = KerasClassifier(build_fn=get_model)

classif = [stk_nn]

kf = model_selection.StratifiedKFold(n_splits=5)

for i, ensem in enumerate(classif):
    cvscore = model_selection.cross_val_score(ensem,
                                              X_train,
                                              y_train,
                                              cv=kf,
                                              scoring='accuracy')
    print("Stacked Ensemble Model %0.0f" % i)
    print("Train (CV) Acc: %0.2f (+/- %0.2f)" %
          (cvscore.mean(), cvscore.std()))
    ensem.fit(X_train, y_train)
Esempio n. 36
0
# return the best three results
def top_n(matrix_prob, label_map):
	ans = []
	for line in matrix_prob:
		rank = [label_map[item[0]] for item in sorted(enumerate(line), key=lambda v:v[1], reverse=True)]
		ans.append(rank[:3])
	return ans
# basic neural network model
def basic_model():
	model = Sequential()
	model.add(Dense(output_dim=500, input_dim=100, activation='relu'))
	model.add(Dropout(0.2))
	model.add(Dense(output_dim=42, input_dim=500, activation='softmax'))
	model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
	return model

if __name__ == '__main__':
	X = pd.read_csv('./data/triple_train_x_mean.txt', header=None, encoding='utf-8')
	Y = pd.read_csv('./data/triple_train_y.txt', header=None, encoding='utf-8')
	X_test = pd.read_csv('./data/triple_test_x_mean.txt', header=None, encoding='utf-8')
	matrix_y = np_utils.to_categorical(Y,42)
	# KerasClassifier analysis
	classifier = KerasClassifier(build_fn=basic_model, nb_epoch=10, batch_size=500)
	classifier.fit(X, Y)

	pred_prob = classifier.predict_proba(X_test)

	with open('./model/task2_label_space.txt', encoding='utf-8') as flabel:
		label_map = flabel.read().split()
	pd.DataFrame(top_n(pred_prob, label_map)).to_csv('./data/task2_ans_int_index.txt', index=None, header=None, encoding='utf-8')
Esempio n. 37
0
    model.add(BatchNormalization(axis=-1,
                                 input_shape=(X.shape[1], X.shape[2])))
    model.add(CuDNNLSTM(256, return_sequences=True))
    model.add(CuDNNLSTM(256, return_sequences=True))
    model.add(CuDNNLSTM(256, return_sequences=True))
    model.add(Flatten())
    model.add(Dropout(0.4))
    model.add(Dense(8, activation='softmax'))  # units must match the number of classes

    # model compilation
    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])
    return model


# create the model
model = create_model()
print(model.summary())

# create model
model = KerasClassifier(build_fn=create_model,
                        epochs=200,
                        batch_size=16,
                        verbose=1)

# evaluate using 10-fold cross validation
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=123)
results = cross_val_score(model, X, y, cv=kfold)
print(results.mean())
Esempio n. 38
0
    classifier.add(Dense(output_dim=1, init='uniform', activation='sigmoid'))

    #Compiling the ANN
    classifier.compile(optimizer='adam',
                       loss='binary_crossentropy',
                       metrics=['accuracy'])
    #Root mean square propagation
    #classifier.compile(optimizer = 'rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
    #Fitting the ANN to the training set
    #classifier.fit(X_train, y_train, batch_size = 10, nb_epoch = 100)

    return classifier


neural_network = KerasClassifier(build_fn=create_network,
                                 epochs=10,
                                 batch_size=5,
                                 verbose=0)

#Create k-fold cross-validation
from sklearn.model_selection import KFold, cross_val_score
kf = KFold(n_splits=10, shuffle=True, random_state=1)
score = cross_val_score(neural_network, X_train, y_train, cv=kf).mean()
print(score)
"""
#Predict test set result
y_pred = classifier.predict(X_test)
#y_pred = (y_pred > 0.5)
y_pred = [ 1 if x > 0.5 else 0 for x in y_pred]


#Making confusion Matrix
Esempio n. 39
0
    # Add fully connected layer with a ReLU activation function
    network.add(layers.Dense(units=16, activation="relu"))

    # Add fully connected layer with a sigmoid activation function
    network.add(layers.Dense(units=1, activation="sigmoid"))

    # Compile neural network
    network.compile(loss="binary_crossentropy", # Cross-entropy
                    optimizer=optimizer, # Optimizer
                    metrics=["accuracy"]) # Accuracy performance metric

    # Return compiled network
    return network

# Wrap Keras model so it can be used by scikit-learn
neural_network = KerasClassifier(build_fn=create_network, verbose=0)

# Create hyperparameter space
epochs = [5, 10]
batches = [5, 10, 100]
optimizers = ["rmsprop", "adam"]

# Create hyperparameter options
hyperparameters = dict(optimizer=optimizers, epochs=epochs, batch_size=batches)

# Create grid search
grid = GridSearchCV(estimator=neural_network, param_grid=hyperparameters)

# Fit grid search
grid_result = grid.fit(features, target)
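A short follow-up sketch showing how the outcome of the grid search above is usually inspected; only standard GridSearchCV attributes are used:

# View the best hyperparameters found during the grid search
print("Best score: %f" % grid_result.best_score_)
print("Best parameters: %s" % grid_result.best_params_)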
Esempio n. 40
0
        LRscore[j] = np.mean(cross_val_score(logisticModel,train_data,label_data,cv=5))
        j = j+1
    print(c)
    print(LRscore)
    # plt.plot(c,LRscore,'bx-')
    # plt.xlabel('penalty')
    # plt.ylabel('validation score')
    # plt.title('LR Model selection')
    # plt.show()
    # #logisticModel = LogisticRegression(penalty='l2')
    # #scores[1] = cross_val_score(logisticModel,train_data,label_data,cv=5)
    #
    #test model 3 : Neural network
    #NNModel = MLPClassifier(solver='adam', alpha=1e-5,hidden_layer_sizes=(5000,100), random_state=1,max_iter=500)
    tbCallback = TensorBoard(log_dir='./Graph', histogram_freq=0, write_graph=True, write_images=True)
    NNModel = KerasClassifier(build_fn=create_model,epochs=1200, batch_size=150,verbose=0)
    cv = ShuffleSplit(n_splits=1, test_size=0.3, random_state=0)
    #NNscore = cross_val_score(NNModel,train_data,label_data,fit_params={'callbacks': [tbCallback]},cv=cv)
    NNModel.fit(train_data,label_data)
    prediction = NNModel.predict(test_data)
    prediction = np.array(prediction)
    print(prediction)
    np.savetxt("prediction.csv", prediction, delimiter=",")
    #print('MLPClassifier validation score : ',NNscore)


    #test model 4 : SVM
    # c = [1]
    # SVMscore = np.zeros(len(c))
    # j = 0
    # for i in c:
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=42)
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)
tbCallBack = keras.callbacks.TensorBoard(log_dir='./Graph',
                                         histogram_freq=0,
                                         write_graph=True,
                                         write_images=True)
batch_size = 32
model.fit(X_train,
          Y_train,
          epochs=7,
          batch_size=batch_size,
          verbose=2,
          callbacks=[tbCallBack])
score, acc = model.evaluate(X_test, Y_test, verbose=2, batch_size=batch_size)
print(score)
print(acc)

model = KerasClassifier(build_fn=createmodel, verbose=0)
batch_size = [32, 64]
epochs = [10, 2]
param_grid = dict(batch_size=batch_size, epochs=epochs)
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=1)
grid_result = grid.fit(X_train, Y_train)
# summarize results
print("Best: %f using %s" %
      (grid_result.best_score_, grid_result.best_params_))
"""
Now it is time to evaluate this model using stratified cross validation in the
scikit-learn framework.

To use Keras models with scikit-learn, we must use the KerasClassifier wrapper.
This class takes a function that creates and returns our neural network model.
It also takes arguments that it will pass along to the call to fit() such as
the number of epochs and the batch size.
We pass the number of training epochs to the KerasClassifier, again using
reasonable default values. Verbose output is also turned off given that the
model will be created 10 times for the 10-fold cross validation being
performed.
"""
# Rescale our data
# evaluate baseline model with standardized dataset
estimator = KerasClassifier(build_fn=create_baseline, epochs=100, batch_size=5, verbose=0)


"""
We are going to use scikit-learn to evaluate the model using stratified k-fold
cross validation. This is a resampling technique that will provide an estimate
of the performance of the model. It does this by splitting the data into
k-parts, training the model on all parts except one which is held out as a test
set to evaluate the performance of the model. This process is repeated k-times
and the average score across all constructed models is used as a robust
estimate of performance. It is stratified, meaning that it will look at the
output values and attempt to balance the number of instances that belong to
each class in the k-splits of the data.
"""
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(estimator, X, encoded_Y, cv=kfold)
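The explanation above describes averaging the score across all constructed models; a one-line sketch of how that estimate is typically reported:

print("Baseline: %.2f%% (%.2f%%)" % (results.mean() * 100, results.std() * 100))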
Esempio n. 43
0
    def train_KerasBinaryClassifier(self, X_train, y_train, noOfepochs):

        # Use Tenserflow backend
        sess = tf.Session()
        K.set_session(sess)

        def custom_activation(x):
            return (1 / np.sqrt(self.h_size)) * tf.cos(x / 0.02)

        get_custom_objects().update(
            {'custom_activation': Activation(custom_activation)})

        def model():
            # Optionally load a pretrained supervised NN model (the block below is commented out)
            # print("Loading the Pretrained Supervised NN Model..... ")
            from keras.models import load_model
            from keras.models import model_from_json

            # # Model reconstruction from JSON file
            # with open('../models/supervisedBC/model_architecture.json', 'r') as f:
            #     best_model = model_from_json(f.read())

            # # Load weights into the new model
            # best_model.load_weights('../models/supervisedBC/model_weights.h5')
            # best_model.compile(
            #     optimizer='rmsprop',
            #     loss='binary_crossentropy',
            #     metrics=['accuracy'])

            model = Sequential()
            model.add(Dense(128, input_dim=X_train.shape[1]))
            model.add(Activation(custom_activation))
            model.add(Dense(64, activation='linear'))
            model.add(Dense(1))
            model.compile(optimizer='rmsprop',
                          loss='binary_crossentropy',
                          metrics=['accuracy'])

            # ## Copy the weights from one model to another model
            # model.set_weights(best_model.get_weights())

            return model

        # early_stopping = callbacks.EarlyStopping(
        #     monitor='val_loss', patience=1, verbose=0, mode='auto')
        # print("Removed Early stopping......")
        pipe = pipeline.Pipeline([('rescale', preprocessing.StandardScaler()),
                                  ('nn',
                                   KerasClassifier(build_fn=model,
                                                   epochs=noOfepochs,
                                                   batch_size=128,
                                                   verbose=0,
                                                   validation_split=0.2))])

        #    callbacks=[early_stopping]
        pipe.fit(X_train, y_train)

        model_step = pipe.steps.pop(-1)[1]
        joblib.dump(pipe, os.path.join(self.directory, 'pipeline.pkl'))
        # print("Trained Model is Saved at relative path inside PROJECT_DIR ",
        #   self.directory)
        models.save_model(model_step.model,
                          os.path.join(self.directory, 'model.h5'))
        return
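The method above stores the preprocessing pipeline and the Keras network as two separate artifacts; a hedged sketch of how they might be loaded back and rejoined at prediction time (model_dir and X_new are hypothetical names):

import os
import joblib  # assumed available; older scikit-learn exposed it as sklearn.externals.joblib
from keras import models

model_dir = 'path/to/saved/model'                                      # assumed: the directory used above
pipe = joblib.load(os.path.join(model_dir, 'pipeline.pkl'))            # scaler only; the NN step was popped before dumping
keras_model = models.load_model(os.path.join(model_dir, 'model.h5'))   # may need custom_objects for the custom activation
X_scaled = pipe.transform(X_new)                                       # X_new: assumed array of new samples
predictions = keras_model.predict(X_scaled)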

def create_model():
    model = Sequential()
    # stacked recurrent layers need return_sequences=True so the next RNN receives a sequence
    model.add(SimpleRNN(X_train.shape[1], input_dim=X_train.shape[1], return_sequences=True))
    model.add(Activation('relu'))
    model.add(SimpleRNN(20000, return_sequences=True))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(SimpleRNN(nb_classes))
    model.add(Activation('softmax'))
    model.compile(loss=loss, optimizer=optim, metrics=['accuracy'])
    return model


classifier = KerasClassifier(build_fn=create_model, nb_epoch=nb_epoch, batch_size=batch_size)
history = classifier.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch)
Y_pred = classifier.predict(X_test, batch_size=batch_size)

print(classification_report(y_true=Y_test, y_pred=Y_pred))

plt.figure()
plt.plot(history.history['acc'])
plt.title('Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Training', 'Test'], loc='upper left')
plt.savefig("data/acc.png")

# summarize history for loss
plt.figure()
Esempio n. 45
0
    model.add(Dropout(rate=0.1))
    model.add(
        Dense(units=5,
              activation='sigmoid',
              kernel_initializer='glorot_uniform'))
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model


# Cross Validation
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
cv_classifier = KerasClassifier(build_fn=build_model,
                                batch_size=25,
                                nb_epoch=1000)
accuracies = cross_val_score(estimator=cv_classifier,
                             X=X_train,
                             y=y_train,
                             cv=10)

accuracySum = 0
for accuracy in accuracies:
    accuracySum += accuracy

print(accuracySum / accuracies.size)

# -- End of Cross Validation -- #

classifier = build_model()
Esempio n. 46
0
class BaseKerasSklearnModel(base_model.BaseModel):
    '''
    base Keras model wrapped with scikit-learn's KerasClassifier interface
    '''
##    def __init__(self, data_file, delimiter, lst_x_keys, lst_y_keys, log_filename=DEFAULT_LOG_FILENAME, model_path=DEFAULT_MODEL_PATH, create_model_func=create_model_demo):
##        '''
##        init
##        '''
##        import framework.tools.log as log
##        loger = log.init_log(log_filename)
##        self.load_data(data_file, delimiter, lst_x_keys, lst_y_keys)
##        self.model_path = model_path
##        self.create_model_func=create_model_func

    def __init__(self, **kargs):
        '''
        init
        '''
        import framework.tools.log as log
        self.kargs = kargs
        log_filename = self.kargs["basic_params"]["log_filename"]
        model_path = self.kargs["basic_params"]["model_path"]
        self.load_data_func = self.kargs["load_data"]["method"]
        self.create_model_func = self.kargs["create_model"]["method"]
        loger = log.init_log(log_filename)
        (self.dataset, self.X, self.Y, self.X_evaluation, self.Y_evaluation) = self.load_data_func(**self.kargs["load_data"]["params"])
        self.model_path = model_path
        self.dic_params = {}
 

    def load_data(self, data_file, delimiter, lst_x_keys, lst_y_keys):
        '''
        load data
        '''
        # Load the dataset
        self.dataset = numpy.loadtxt(data_file, delimiter=",") 
        self.X = self.dataset[:, lst_x_keys] 
        self.Y = self.dataset[:, lst_y_keys]
    
    def init_callbacks(self):
        '''
        init all callbacks
        '''
        os.system("mkdir -p %s" % (self.model_path))
        checkpoint_callback = ModelCheckpoint(self.model_path + '/weights.{epoch:02d}-{acc:.2f}.hdf5', \
                monitor='acc', save_best_only=False)
        history_callback = LossHistory()
        callbacks_list = [checkpoint_callback, history_callback]
        self.dic_params["callbacks"] = callbacks_list

    def init_model(self):
        '''
        init model
        '''
        train_params = {"nb_epoch": 10, "batch_size": 10}
        self.dic_params.update(train_params)
        self.model = KerasClassifier(build_fn=self.create_model_func, **self.kargs["create_model"]["params"])
#        self.model = KerasClassifier(build_fn=self.create_model_func)
        self.model.set_params(**self.dic_params)
    
    def train_model(self):
        '''
        train model
        '''
        X = self.X
        Y = self.Y
        X_evaluation = self.X_evaluation
        Y_evaluation = self.Y_evaluation
        seed = 7
        numpy.random.seed(seed)  # fix the random seed for reproducibility
        
        history = self.model.fit(X, Y)
        scores = self.model.score(X, Y)
#history_callback = self.dic_params["callbacks"][1]
#        print dir(history_callback)
#        logging.info(str(history_callback.losses))
        logging.info("final : %.2f%%" % (scores * 100))
        logging.info(str(history.history))
    
    def process(self):
        '''
        process
        '''
        self.init_callbacks()
        self.init_model()
        self.train_model()
# one hot encoding
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)


#baseline model
def create_baseline():
    #create model
    #start with the same number of neurons in the hidden layer as there are inputs, as a starting point
    model = Sequential()
    model.add(Dense(60, input_dim=60, init='normal', activation='relu'))
    model.add(Dense(1, init='normal', activation='sigmoid'))
    #compile model
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model


estimator = KerasClassifier(build_fn=create_baseline,
                            nb_epoch=100,
                            batch_size=5,
                            verbose=0)
kfold = StratifiedKFold(y=encoded_Y,
                        n_folds=10,
                        shuffle=True,
                        random_state=seed)
results = cross_val_score(estimator, X, encoded_Y, cv=kfold)
print("Results: %.2f%% (%.2f%%)" % (results.mean() * 100, results.std() * 100))
Esempio n. 48
0
Y_test = np_utils.to_categorical(y_test, nb_classes)[:max_test_samples]

#############################
# scikit-learn wrapper test #
#############################
print('Beginning scikit-learn wrapper test')

print('Defining model')
model = Sequential()
model.add(Dense(784, 50))
model.add(Activation('relu'))
model.add(Dense(50, 10))
model.add(Activation('softmax'))

print('Creating wrapper')
classifier = KerasClassifier(model)

print('Fitting model')
classifier.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch)

print('Testing score function')
score = classifier.score(X_train, Y_train)
print('Score: ', score)

print('Testing predict function')
preds = classifier.predict(X_test)
print('Preds.shape: ', preds.shape)

print('Testing predict proba function')
proba = classifier.predict_proba(X_test)
print('Proba.shape: ', proba.shape)
Esempio n. 49
0
def modeling(conn, sentences, lib, dz):
#def modeling(conn, df, lib, dz):
  
    #pts = pd.read_sql("SELECT DISTINCT SUBJECT_ID from UFM", conn)
    #pts =list(set(pts.SUBJECT_ID))
    #pool = []
    #for d in dz:
    #    pool += d.pos + d.neg
    np.random.seed(7)
    decay = .0002
    data = []; train = []; test = []
    keys = [k[1] for k in lib]
    
    admits = pd.read_sql("SELECT * from admissions", conn)
    
    for itr in range(0,5):
        print ("Sess: {0}".format(itr))
        for d in dz:
            neg = random.sample(d[1], len(d[0]))
            temp = d[0] + neg
            random.shuffle(temp)
            t1, t2 = cross_validation.train_test_split(temp, test_size = .2)
            train +=t1; test +=t2
                    
        #X stands for raw indexes of feature input; V stands for raw feature input
        #W stands for word vectors from feature input trained by Word2Vec
        X_train = []; t_train = []; W_train = []; Y_train = []
        X_test = []; t_test = []; W_test = []; Y_test = []
        V_train = []; V_test = []
    
        count=0
        for t in train:
            print (count)
            count+=1

            corpus = [[s[2], s[3]] for s in sentences if  (s[0] == t[0]) and (pd.to_datetime(admits[admits['HADM_ID']==s[1]].ADMITTIME.values[0]) <= t[1])]
            #order subject by time of entry for each sentence (admission)
            corpus = sorted(corpus, key = lambda x: x[1])
            #transpose into nx2xd from 2xnxd
            #this way, corpus[0] refers to words and corpus[1] refers to times
            corpus = list(map(list, zip(*corpus)))                  
            x_train = list(chain.from_iterable(corpus[0]))
            t_stamps = list(chain.from_iterable(corpus[1]))
            x = np.array(list(map(lambda x: keys.index(x), x_train)))
     
            #configure each timestamp to reflect time elapsed from first time entry
            #calculate time decay from initial event
            temp = t_stamps[0]
            t_stamps = [ii-temp for ii in t_stamps]
                
            #append
            X_train.append(x)
            V_train.append(np.array(x_train))
            t_train.append(np.array(t_stamps))
            Y_train.append(t[3])
                
        print ("X_train made.")

        count = 0
        for t in test:
            print (count)
            count+=1
                
            corpus = [[s[2], s[3]] for s in sentences if  (s[0] == t[0]) and (pd.to_datetime(admits[admits['HADM_ID']==s[1]].ADMITTIME.values[0]) <= t[1])]
                
            corpus = sorted(corpus, key = lambda x: x[1])
            corpus = list(map(list, zip(*corpus)))                  
            x_test = list(chain.from_iterable(corpus[0]))
            t_stamps = list(chain.from_iterable(corpus[1]))
            temp = t_stamps[0]
            t_stamps = [ii-temp for ii in t_stamps]
            x = np.array(list(map(lambda x: keys.index(x), x_test)))
            
            X_test.append(x)
            V_test.append(np.array(x_test))
            t_test.append(np.array(t_stamps))
            Y_test.append(t[3])            
                           
        #training normal LSTM and CNN-LSTM          
        top_words = [9444]
        max_review_length = [1000]
        embedding_length = [300]          
        X_train = sequence.pad_sequences(X_train, maxlen=max_review_length[0])
        X_test = sequence.pad_sequences(X_test, maxlen=max_review_length[0])


        #build model using KerasClassifier and Gridsearch
        cnn = KerasClassifier(build_fn=cnn_train, verbose=1)
        lstm = KerasClassifier(build_fn=lstm_train, verbose=1)
        d_cnn = KerasClassifier(build_fn=d_cnn_train, verbose = 1)
        d_lstm = KerasClassifier(build_fn=d_lstm_train, verbose = 1)
        # define the grid search parameters

        batch_size = [32, 64, 128]
        epochs = [20, 50, 100, 200]
        optimizer = ['SGD', 'RMSprop', 'Adam']
        learn_rate = (10.0**np.arange(-4,-1)).tolist()
        momentum = np.arange(.5,.9,.1).tolist()
        neurons = [50, 100, 200]
        dropout_W = [.1, .2, .5]
        dropout_U = [.1, .2, .5]
        W_regularizer = [l1(.0001), l1(.001), l1(.01), l2(.0001), l2(.001), l2(.01), None]
        U_regularizer = [l1(.0001), l1(.001), l1(.01), l2(.0001), l2(.001), l2(.01), None]
        init_mode = ['uniform', 'normal', 'zero']
        #activation = ['softmax', 'softplus', 'softsign', 'relu', 'tanh', 'sigmoid', 'hard_sigmoid', 'linear']
        param_grid = dict(top_words=top_words, max_length = max_review_length, embedding_length = embedding_length, batch_size=batch_size, nb_epoch=epochs, optimizer = optimizer, learn_rate = learn_rate, momentum = momentum, neurons = neurons, dropout_W = dropout_W, dropout_U = dropout_U, W_regularizer = W_regularizer, U_regularizer = U_regularizer, init_mode = init_mode)
        d_param_grid = dict(input_shape = [(max_review_length[0], embedding_length[0])], batch_size=batch_size, nb_epoch=epochs, optimizer = optimizer, learn_rate = learn_rate, momentum = momentum, neurons = neurons, dropout_W = dropout_W, dropout_U = dropout_U, W_regularizer = W_regularizer, U_regularizer = U_regularizer, init_mode = init_mode)
        lr_params = {'C':(10.0**np.arange(-4,4)).tolist(), 'penalty':('l1','l2')}
        sv_params = {'C':(10.0**np.arange(-4,4)).tolist(), 'kernel':('linear', 'poly', 'rbf', 'sigmoid')}
        rf_params = {'criterion': ['gini', 'entropy']}

        #setup GridSearch w/ cross validation
        cnn_grid = GridSearchCV(estimator=cnn, param_grid=param_grid, scoring = 'roc_auc', cv = 5, n_jobs=-1)
        lstm_grid = GridSearchCV(estimator=lstm, param_grid=param_grid, scoring = 'roc_auc', cv = 5, n_jobs=-1)
        d_cnn_grid = GridSearchCV(estimator=d_cnn, param_grid=d_param_grid, scoring = 'roc_auc', cv = 5, n_jobs=-1)
        d_lstm_grid = GridSearchCV(estimator=d_lstm, param_grid=d_param_grid, scoring = 'roc_auc', cv = 5, n_jobs=-1)
        classics = GridSearchCV(estimator = (LR, SVM, RF), param_grid = (lr_params, sv_params, rf_params), scoring = 'roc_auc', cv = 5, n_jobs = -1)
        #lr_grid = GridSearchCV(estimator = LR, param_grid = lr_params, scoring = 'roc_auc', cv = 5, n_jobs = -1)
        #sv_grid = GridSearchCV(estimator = SVM, param_grid = sv_params, scoring = 'roc_auc', cv = 5, n_jobs = -1)
        #rf_grid = GridSearchCV(estimator = RF, param_grid = rf_params, scoring = 'roc_auc', cv = 5, n_jobs = -1)

        # Fit the model
        cnn_result = cnn_grid.fit(X_train, Y_train)
        lstm_result = lstm_grid.fit(X_train, Y_train) 
        d_cnn_result = d_cnn_grid.fit(decay(x=np.array(V_train), t_stamps =t_train, embedding_length=embedding_length[0], max_review_length=max_review_length[0])[0], Y_train)
        d_lstm_result = d_lstm_grid.fit(decay(x=np.array(V_train), t_stamps =t_train, embedding_length=embedding_length[0], max_review_length=max_review_length[0])[0], Y_train) 
        classics_result = classics.fit(decay(x=V_train, t_stamps =t_train, embedding_length=embedding_length[0], max_review_length=max_review_length[0])[1], Y_train)       
        #lr_result = lr_grid.fit(decay(x=V_train, t_stamps =t_train, embedding_length=embedding_length, max_review_length=max_review_length)[1], Y_train)
        #sv_result = sv_grid.fit(decay(x=V_train, t_stamps =t_train, embedding_length=embedding_length, max_review_length=max_review_length)[1], Y_train)
        #rf_result = rf_grid.fit(decay(x=V_train, t_stamps =t_train, embedding_length=embedding_length, max_review_length=max_review_length)[1], Y_train)        
        
        #grid_search results:
        print("CNN Best: %f using %s" % (cnn_result.best_score_, cnn_result.best_params_))
        means = cnn_result.cv_results_['mean_test_score']
        stds = cnn_result.cv_results_['std_test_score']
        params = cnn_result.cv_results_['params']
        for mean, stdev, param in zip(means, stds, params):
            print("%f (%f) with: %r" % (mean, stdev, params))
        
        print("LSTM Best: %f using %s" % (lstm_result.best_score_, lstm_result.best_params_))
        means = lstm_result.cv_results_['mean_test_score']
        stds = lstm_result.cv_results_['std_test_score']
        params = lstm_result.cv_results_['params']
        for mean, stdev, param in zip(means, stds, params):
            print("%f (%f) with: %r" % (mean, stdev, params))
        
        print("Decay CNN Best: %f using %s" % (d_cnn_result.best_score_, d_cnn_result.best_params_))
        means = d_cnn_result.cv_results_['mean_test_score']
        stds = d_cnn_result.cv_results_['std_test_score']
        params = d_cnn_result.cv_results_['params']
        for mean, stdev, param in zip(means, stds, params):
            print("%f (%f) with: %r" % (mean, stdev, params))        
            
        print("Decay LSTM Best: %f using %s" % (d_lstm_result.best_score_, d_lstm_result.best_params_))
        means = d_lstm_result.cv_results_['mean_test_score']
        stds = d_lstm_result.cv_results_['std_test_score']
        params = d_lstm_result.cv_results_['params']
        for mean, stdev, param in zip(means, stds, params):
            print("%f (%f) with: %r" % (mean, stdev, params))        
            
        print("Best of Classics: %f using %s, %s" % (classics_result.best_score_, classics_result.best_estimator_, classics_result.best_params_))    
        means = classics_result.cv_results_['mean_test_score']
        stds = classics_result.cv_results_['std_test_score']
        params = classics_result.cv_results_['params']
        for mean, stdev, param in zip(means, stds, params):
            print("%f (%f) with: %r" % (mean, stdev, params))        
        
        #KFold = 5
        #kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)
        #cvscores = []
        #for training, testing in kfold.split(X_train, Y_train):     
            # Fit the model
            #model.fit(X[training], Y[training], nb_epoch=150, batch_size=10, verbose=0)
            # evaluate the model
            #scores = model.evaluate(X[testing], Y[testing], verbose=0)
            #print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
            #cvscores.append(scores[1] * 100)
        #print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

 ######TESTING#######
        cnn = cnn_train(top_words = top_words, max_length = max_review_length, embedding_length=embedding_length)
        lstm = lstm_train(top_words = top_words, max_length = max_review_length, embedding_length=embedding_length)
            
        cnn.fit(X_train, Y_train, validation_split = .2, nb_epoch=100, batch_size=128, shuffle = True, verbose=1)
        lstm.fit(X_train, Y_train, validation_split = .2, nb_epoch=100, batch_size=128, shuffle = True, verbose=1)

        #testing
        predictions_lstm = lstm.predict_classes(X_test)
        predictions_cnn = cnn.predict_classes(X_test)

        acc = accuracy_score(Y_test, predictions_lstm)
        f1 = f1_score (Y_test, predictions_lstm)
        auc = roc_auc_score (Y_test, predictions_lstm)
        scores_lstm = [("Accuracy", acc) , ("F1 Score", f1) , ("AUC Score",auc)]

        acc = accuracy_score(Y_test, predictions_cnn)
        f1 = f1_score (Y_test, predictions_cnn)
        auc = roc_auc_score (Y_test, predictions_cnn)
        scores_cnn = [("Accuracy", acc) , ("F1 Score", f1) , ("AUC Score",auc)]

        print ("LSTM DATA: ")
        for s in scores_lstm:
            print("%s: %.2f" %(s[0], s[1]), end = " ")
        print ("")
        print ("CNN DATA: ")
        for s in scores_cnn:
            print("%s: %.2f" %(s[0], s[1]), end = " ")        
        
        
        data.append((scores_lstm, scores_cnn))  # collect this iteration's scores (assumed intent)
            
    return data
Esempio n. 50
0
        optimizer='adam',
        metrics=['accuracy'])
    return model

# Load the data --- (2)
data = json.load(open("./newstext/data-mini.json"))
#data = json.load(open("./newstext/data.json"))
X = data["X"] # テキストを表すデータ
Y = data["Y"] # カテゴリデータ
# 最大単語数を指定
max_words = len(X[0])

# Training --- (3)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y)
Y_train = np_utils.to_categorical(Y_train, nb_classes)
print(len(X_train),len(Y_train))
model = KerasClassifier(
    build_fn=build_model, 
    nb_epoch=nb_epoch, 
    batch_size=batch_size)
model.fit(X_train, Y_train)

# Prediction --- (4)
y = model.predict(X_test)
ac_score = metrics.accuracy_score(Y_test, y)
cl_report = metrics.classification_report(Y_test, y)
print("正解率=", ac_score)
print("レポート=\n", cl_report)


Esempio n. 51
0
	# output layer
	model.add(Dense(1))
	model.add(BatchNormalization())
	model.add(Dropout(0.5))
	model.add(Activation('sigmoid'))
	model.compile(loss='binary_crossentropy', optimizer="adam", metrics=['accuracy'])
	return model

#thank god for wrappers

def nn_model():
	return KerasClassifier(build_fn=create_baseline, nb_epoch=20, batch_size=50, verbose = 1)


model = KerasClassifier(build_fn=create_baseline, nb_epoch=10, batch_size=80, verbose = 0)

model.fit(X_train, y_train, nb_epoch=7, batch_size=300, validation_split=0.1, show_accuracy=True)
scores = cross_validation.cross_val_score(model, X, y, cv = 5, scoring = "accuracy", n_jobs = -1, verbose = 1)
model.fit(X_train, y_train, verbose=2)

y_pred = model.predict(X_test)
'''
print y_pred
print y_test

print mean_squared_error(y_test, y_pred)


'''
#scores = roc_auc_score(y_test,y_pred)