def decision_tree(instruction,
                  dataset=None,
                  preprocess=True,
                  ca_threshold=None,
                  text=None,
                  test_size=0.2,
                  drop=None):
    logger("Reading in dataset....")

    dataReader = DataReader(dataset)
    data = dataReader.data_generator()

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    data, y, remove, full_pipeline = initial_preprocesser(
        data, instruction, preprocess, ca_threshold, text, test_size=test_size)
    logger("->", "Target Column Found: {}".format(remove))

    X_train = data['train']
    y_train = y['train']
    X_test = data['test']
    y_test = y['test']

    # classification_column = get_similar_column(getLabelwithInstruction(instruction), data)

    # Needed to make a custom label encoder due to train test split changes
    # Can still be inverse transformed, just a bit of extra work
    y_vals = np.unique(pd.concat([y['train'], y['test']], axis=0))
    label_mappings = {val: i for i, val in enumerate(y_vals)}

    # Custom label encoder due to train test split
    y_train = y_train.apply(lambda x: label_mappings[x]).values
    y_test = y_test.apply(lambda x: label_mappings[x]).values
    num_classes = len(y_vals)

    # fitting and storing
    logger("Fitting Decision Tree...")

    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(X_train, y_train)

    score = accuracy_score(clf.predict(X_test), y_test)
    logger("->", "Score found on testing set: {}".format(score))
    logger("Stored model under 'decision_tree' key")

    clearLog()

    return {
        'id': generate_id(),
        "model": clf,
        "target": remove,
        "accuracy_score": score,
        "preprocesser": full_pipeline,
        "interpeter": label_mappings,
        "cross_val_score": cross_val_score(clf, X_train, y_train, cv=3)
    }
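
# Editor's illustrative sketch (not library code): the dict-based label
# mapping built above is invertible, as the comment notes. A self-contained
# round trip on toy labels:
def _demo_label_mapping():
    import numpy as np
    import pandas as pd
    y_series = pd.Series(["cat", "dog", "cat", "bird"])
    y_vals = np.unique(y_series)
    label_mappings = {val: i for i, val in enumerate(y_vals)}   # label -> int
    encoded = y_series.apply(lambda x: label_mappings[x]).values
    inverse_mappings = {i: val for val, i in label_mappings.items()}
    assert [inverse_mappings[i] for i in encoded] == list(y_series)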
def dimensionality_RF(instruction, dataset, target="", y="", n_features=10):

    global counter

    dataReader = DataReader(dataset)

    if target == "":
        data = dataReader.data_generator()
        data.fillna(0, inplace=True)
        remove = get_similar_column(get_value_instruction(instruction), data)
        data, y, target, full_pipeline = initial_preprocesser(
            data, instruction, True, 0.2, [], 0.2, random_state=49)

        le = preprocessing.LabelEncoder()
        X_train = data['train']
        y_train = y['train']
        X_test = data['test']
        y_test = y['test']

        # fit once on all labels so train/test encodings agree; re-fitting on
        # the test split would assign inconsistent integer codes
        le.fit(pd.concat([y_train, y_test], axis=0))
        y_train = le.transform(y_train)
        y_test = le.transform(y_test)

    first_classifier = tree.DecisionTreeClassifier()
    first_classifier.fit(X_train, y_train)

    first_classifier_acc = accuracy_score(
        first_classifier.predict(X_test), y_test)

    accuracy_scores = [first_classifier_acc]
    columns = []
    datas = []
    datas.append(dataset)
    columns.append([])

    # the outer product() index was unused and merely repeated identical work;
    # a single sweep over candidate feature counts suffices
    for x in range(4, len(X_train.columns)):
        feature_model = RandomForestRegressor(random_state=1, max_depth=x)
        feature_model.fit(X_train, y_train)

        importances = feature_model.feature_importances_
        indices = np.argsort(importances)[-x:]
        columns.append(X_train.columns[indices])

        X_temp_train = X_train[X_train.columns[indices]]
        X_temp_test = X_test[X_train.columns[indices]]

        val = pd.DataFrame(np.r_[X_temp_train, X_temp_test])
        val[target] = np.r_[y_train, y_test]
        datas.append(val)

        vr = tree.DecisionTreeClassifier()
        vr.fit(X_temp_train, y_train)

        accuracy_scores.append(accuracy_score(vr.predict(X_temp_test), y_test))

    the_index = accuracy_scores.index(max(accuracy_scores))
    print(accuracy_scores)
    return datas[the_index], accuracy_scores[0], max(
        accuracy_scores), list(columns[the_index])
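
# Editor's illustrative sketch (not library code): the core of the
# importance-based selection in dimensionality_RF, reduced to one sweep step.
# All names here are local to the demo.
def _demo_rf_feature_selection(k=3):
    import numpy as np
    import pandas as pd
    from sklearn.ensemble import RandomForestRegressor
    rng = np.random.default_rng(0)
    X = pd.DataFrame(rng.normal(size=(100, 6)),
                     columns=["f{}".format(i) for i in range(6)])
    y = 2 * X["f0"] + X["f1"] - X["f2"] + rng.normal(size=100)
    forest = RandomForestRegressor(random_state=1).fit(X, y)
    # argsort ascending, so the last k indices are the most important features
    top_k = X.columns[np.argsort(forest.feature_importances_)[-k:]]
    return X[top_k]   # reduced design matrix, as in the loop above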
def dimensionality_KPCA(instruction, dataset, target="", y=""):
    '''
    function to reduce dimensionality in dataset via kernel principal component analysis
    :param instruction: command sent to client instance in written query.
    :param dataset: data instantiated in client instance passed to the algorithm
    :param target: column name of response variable/feature
    :param y: dictionary of train/test data values associated with response variable/feature
    '''

    pca = KernelPCA(kernel='rbf')

    dataReader = DataReader(dataset)
    dataset = dataReader.data_generator()

    data, y, target, full_pipeline = initial_preprocesser(dataset,
                                                          instruction,
                                                          True,
                                                          0.2, [],
                                                          0.2,
                                                          random_state=49)

    X_train = data['train']
    X_test = data['test']

    y_train = y['train']
    y_test = y['test']

    X_train_mod = pca.fit_transform(X_train)
    X_test_mod = pca.transform(X_test)

    clf = tree.DecisionTreeClassifier()
    clf_mod = tree.DecisionTreeClassifier()

    clf.fit(X_train, y_train)
    clf_mod.fit(X_train_mod, y_train)

    acc = []
    acc.append(accuracy_score(clf_mod.predict(X_test_mod), y_test))

    for i, j in product(range(3, 10), ["entropy", "gini"]):
        model = tree.DecisionTreeClassifier(criterion=j, max_depth=i)
        model = model.fit(X_train_mod, y_train)
        acc.append(accuracy_score(model.predict(X_test_mod), y_test))
    del i, j

    data_modified = pd.concat(
        [pd.DataFrame(X_train_mod),
         pd.DataFrame(X_test_mod)], axis=0)

    y_combined = np.r_[y_train, y_test]
    data_modified[target] = y_combined
    # data_modified.to_csv("./data/housingPCA.csv")

    return data_modified, accuracy_score(
        clf.predict(X_test),
        y_test), max(acc), (len(dataset.columns) - len(data_modified.columns))
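
# Editor's illustrative sketch (not library code): KernelPCA is fit on the
# training split only and then applied to the test split, mirroring the
# fit_transform/transform pair above so no test information leaks into the fit.
def _demo_kernel_pca():
    import numpy as np
    from sklearn.decomposition import KernelPCA
    from sklearn.model_selection import train_test_split
    X = np.random.RandomState(0).normal(size=(120, 5))
    X_train, X_test = train_test_split(X, test_size=0.2, random_state=49)
    kpca = KernelPCA(kernel='rbf')
    X_train_mod = kpca.fit_transform(X_train)
    X_test_mod = kpca.transform(X_test)
    return X_train_mod.shape, X_test_mod.shape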
def nearest_neighbors(instruction=None,
                      dataset=None,
                      ca_threshold=None,
                      preprocess=True,
                      drop=None,
                      min_neighbors=3,
                      max_neighbors=10,
                      text=None):
    logger("Reading in dataset....")
    # Reads in dataset
    # data = pd.read_csv(self.dataset)
    dataReader = DataReader(dataset)
    data = dataReader.data_generator()
    if drop is not None:
        data.drop(drop, axis=1, inplace=True)
    data, y, remove, full_pipeline = initial_preprocesser(
        data, instruction, preprocess, ca_threshold, text)
    logger("->", "Target Column Found: {}".format(remove))
    X_train = data['train']
    y_train = y['train']
    X_test = data['test']
    y_test = y['test']
    # classification_column = get_similar_column(getLabelwithInstruction(instruction), data)
    # encodes the labels as consecutive integers
    y_vals = np.unique(pd.concat([y['train'], y['test']], axis=0))
    num_classes = len(y_vals)
    label_mappings = {val: i for i, val in enumerate(y_vals)}
    y_train = y_train.apply(lambda x: label_mappings[x]).values
    y_test = y_test.apply(lambda x: label_mappings[x]).values
    models = []
    scores = []
    logger("Fitting Nearest Neighbor...")
    logger("Identifying optimal number of neighbors...")
    # Tries all neighbor possibilities, based on either defaults or user
    # specified values
    for x in range(min_neighbors, max_neighbors):
        knn = KNeighborsClassifier(n_neighbors=x)
        knn.fit(X_train, y_train)
        models.append(knn)
        scores.append(accuracy_score(knn.predict(X_test), y_test))
    logger("Stored model under 'nearest_neighbors' key")
    knn = models[scores.index(min(scores))]
    return {
        'id': generate_id(),
        "model": knn,
        "accuracy_score": scores.index(min(scores)),
        "preprocesser": full_pipeline,
        "interpreter": label_mappings,
        "target": remove,
        "cross_val_score": cross_val_score(knn, X_train, y_train, cv=3)
    }
    clearLog()
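
# Editor's illustrative sketch (not library code): the neighbor sweep above on
# a real dataset, keeping the highest-accuracy model. Note that
# range(min_neighbors, max_neighbors) excludes max_neighbors itself.
def _demo_knn_sweep(min_neighbors=3, max_neighbors=10):
    from sklearn.datasets import load_iris
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier
    X, y = load_iris(return_X_y=True)
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=49)
    scored = []
    for k in range(min_neighbors, max_neighbors):
        knn = KNeighborsClassifier(n_neighbors=k).fit(X_tr, y_tr)
        scored.append((accuracy_score(y_te, knn.predict(X_te)), knn))
    return max(scored, key=lambda t: t[0])[1]   # best-scoring model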
def dimensionality_PCA(instruction, dataset, ca_threshold=None):

    global counter

    pca = PCA(0.92)

    dataReader = DataReader(dataset)
    dataset = dataReader.data_generator()

    data, y, target, full_pipeline = initial_preprocesser(dataset,
                                                          instruction,
                                                          True,
                                                          ca_threshold, [],
                                                          0.2,
                                                          random_state=49)

    X_train = data['train']
    X_test = data['test']

    y_train = y['train']
    y_test = y['test']

    X_train_mod = pca.fit_transform(X_train)
    X_test_mod = pca.transform(X_test)

    clf = tree.DecisionTreeClassifier()
    clf_mod = tree.DecisionTreeClassifier()

    clf.fit(X_train, y_train)
    clf_mod.fit(X_train_mod, y_train)

    acc = []
    acc.append(accuracy_score(clf_mod.predict(X_test_mod), y_test))

    for i, j in product(range(3, 10), ["entropy", "gini"]):
        model = tree.DecisionTreeClassifier(criterion=j, max_depth=i)
        model = model.fit(X_train_mod, y_train)
        acc.append(accuracy_score(model.predict(X_test_mod), y_test))
    del i, j

    data_modified = pd.concat(
        [pd.DataFrame(X_train_mod),
         pd.DataFrame(X_test_mod)], axis=0)

    y_combined = np.r_[y_train, y_test]
    data_modified[target] = y_combined
    # data_modified.to_csv("./data/housingPCA.csv")

    return data_modified, accuracy_score(
        clf.predict(X_test),
        y_test), max(acc), (len(dataset.columns) - len(data_modified.columns))
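
# Editor's illustrative sketch (not library code): PCA(0.92) keeps the fewest
# components whose cumulative explained variance reaches 92%, which is why the
# reduced frame above can have far fewer columns than the original dataset.
def _demo_pca_variance():
    import numpy as np
    from sklearn.decomposition import PCA
    X = np.random.RandomState(0).normal(size=(200, 10))
    pca = PCA(0.92)
    X_mod = pca.fit_transform(X)
    assert pca.explained_variance_ratio_.sum() >= 0.92
    return X_mod.shape[1]   # number of retained components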
def train_xgboost(instruction,
                  dataset=None,
                  learning_rate=0.1,
                  n_estimators=1000,
                  ca_threshold=None,
                  max_depth=6,
                  min_child_weight=1,
                  gamma=0,
                  subsample=0.8,
                  colsample_bytree=0.8,
                  objective='binary:logistic',
                  random_state=27,
                  test_size=0.2,
                  text=[],
                  preprocess=True,
                  verbosity=0,
                  drop=None):
    '''
    function to train an XGBoost algorithm
    :param many params: used to hyperparametrize the function.
    :return a dictionary object with all of the information for the algorithm.
    '''

    logger("Reading in dataset")

    dataReader = DataReader(dataset)
    data = dataReader.data_generator()

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    logger("Preprocessing data")
    data, y, target, full_pipeline = initial_preprocesser(
        data,
        instruction,
        preprocess,
        ca_threshold,
        text,
        test_size=test_size,
        random_state=random_state)
    logger("->", "Target column found: {}".format(target))

    X_train = data['train']
    y_train = y['train']
    X_test = data['test']
    y_test = y['test']

    # classification_column = get_similar_column(getLabelwithInstruction(instruction), data)
    # Needed to make a custom label encoder due to train test split changes
    # Can still be inverse transformed, just a bit of extra work
    y_vals = np.unique(pd.concat([y['train'], y['test']], axis=0))
    num_classes = len(y_vals)

    if num_classes > 2:
        objective = 'multi:softmax'

    label_mappings = sklearn.preprocessing.LabelEncoder()
    label_mappings.fit(y_vals)

    y_train = label_mappings.transform(y_train)
    y_test = label_mappings.transform(y_test)

    # Fitting XGBoost and storing in the model dictionary
    logger("Fitting XGBoost")
    clf = XGBClassifier(learning_rate=learning_rate,
                        n_estimators=n_estimators,
                        max_depth=max_depth,
                        min_child_weight=min_child_weight,
                        gamma=gamma,
                        subsample=subsample,
                        colsample_bytree=colsample_bytree,
                        objective=objective,
                        verbosity=verbosity,
                        random_state=random_state)
    clf.fit(X_train, y_train)

    score = accuracy_score(clf.predict(X_test), y_test)

    logger("->", "Accuracy found on testing set: {}".format(score))

    logger('->', "Stored model under 'xgboost' key")
    clearLog()

    return {
        'id': generate_id(),
        "model": clf,
        "target": target,
        'num_classes': num_classes,
        "accuracy": {
            'cross_val_score': cross_val_score(clf, X_train, y_train),
            'accuracy_score': score
        },
        "accuracy_score": score,
        "preprocesser": full_pipeline,
        "interpreter": label_mappings,
        'test_data': {
            'X': X_test,
            'y': y_test
        }
    }
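
# Editor's illustrative sketch (not library code): the binary-vs-multiclass
# objective switch used above, in isolation. Assumes labels are already
# encoded as consecutive integers, as train_xgboost does with its LabelEncoder,
# and that the xgboost package is installed.
def _demo_xgboost_objective(X_train, y_train, num_classes):
    from xgboost import XGBClassifier
    objective = 'multi:softmax' if num_classes > 2 else 'binary:logistic'
    clf = XGBClassifier(objective=objective, n_estimators=50, max_depth=3)
    return clf.fit(X_train, y_train)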
def decision_tree(instruction,
                  dataset=None,
                  preprocess=True,
                  ca_threshold=None,
                  text=[],
                  test_size=0.2,
                  drop=None,
                  criterion='gini',
                  splitter='best',
                  max_depth=None,
                  min_samples_split=2,
                  min_samples_leaf=1,
                  min_weight_fraction_leaf=0.0,
                  max_leaf_nodes=None,
                  min_impurity_decrease=0.0,
                  ccp_alpha=0.0):
    '''
    function to train a decision tree algorithm.
    :param many params: used to hyperparametrize the function.
    :return a dictionary object with all of the information for the algorithm.
    '''
    logger("Reading in dataset")

    dataReader = DataReader(dataset)
    data = dataReader.data_generator()
    logger("Preprocessing data")
    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    data, y, remove, full_pipeline = initial_preprocesser(
        data, instruction, preprocess, ca_threshold, text, test_size=test_size)
    logger("->", "Target column found: {}".format(remove))

    X_train = data['train']
    y_train = y['train']
    X_test = data['test']
    y_test = y['test']

    # classification_column = get_similar_column(getLabelwithInstruction(instruction), data)

    # Needed to make a custom label encoder due to train test split changes
    # Can still be inverse transformed, just a bit of extra work
    y_vals = np.unique(pd.concat([y['train'], y['test']], axis=0))
    label_mappings = sklearn.preprocessing.LabelEncoder()
    label_mappings.fit(y_vals)

    y_train = label_mappings.transform(y_train)
    y_test = label_mappings.transform(y_test)

    logger("Labels being mapped to appropriate classes")
    num_classes = len(y_vals)

    # fitting and storing
    logger("Fitting Decision Tree")

    clf = tree.DecisionTreeClassifier(
        criterion=criterion,
        splitter=splitter,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        ccp_alpha=ccp_alpha)
    clf = clf.fit(X_train, y_train)

    score = accuracy_score(clf.predict(X_test), y_test)
    logger("->", "Score found on testing set: {}".format(score))
    logger("Stored model under 'decision_tree' key")
    clearLog()

    return {
        'id': generate_id(),
        "model": clf,
        "target": remove,
        'num_classes': num_classes,
        "accuracy": {
            'cross_val_score': cross_val_score(clf, X_train, y_train, cv=3),
            'accuracy_score': score
        },
        "accuracy_score": score,
        "preprocesser": full_pipeline,
        "interpreter": label_mappings,
        'test_data': {
            'X': X_test,
            'y': y_test
        }
    }
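
# Editor's illustrative sketch (not library code): one principled way to pick
# the ccp_alpha parameter exposed above, using sklearn's cost-complexity
# pruning path on the training split.
def _demo_ccp_alpha(X_train, y_train):
    from sklearn import tree
    path = tree.DecisionTreeClassifier().cost_complexity_pruning_path(
        X_train, y_train)
    # each successive alpha produces a smaller (more heavily pruned) tree
    return [tree.DecisionTreeClassifier(ccp_alpha=a).fit(X_train, y_train)
            for a in path.ccp_alphas]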
def nearest_neighbors(instruction=None,
                      dataset=None,
                      ca_threshold=None,
                      preprocess=True,
                      drop=None,
                      min_neighbors=3,
                      max_neighbors=10,
                      leaf_size=30,
                      p=2,
                      test_size=0.2,
                      random_state=49,
                      algorithm='auto',
                      text=[]):
    '''
    function to train a nearest neighbor algorithm
    :param many params: used to hyperparametrize the function.
    :return a dictionary object with all of the information for the algorithm.
    '''

    logger("Reading in dataset")
    # Reads in dataset
    # data = pd.read_csv(self.dataset)
    dataReader = DataReader(dataset)
    data = dataReader.data_generator()
    if drop is not None:
        data.drop(drop, axis=1, inplace=True)
    logger("Preprocessing data")
    data, y, remove, full_pipeline = initial_preprocesser(
        data,
        instruction,
        preprocess,
        ca_threshold,
        text,
        test_size=test_size,
        random_state=random_state)
    logger("->", "Target column found: {}".format(remove))
    X_train = data['train']
    y_train = y['train']
    X_test = data['test']
    y_test = y['test']
    # classification_column = get_similar_column(getLabelwithInstruction(instruction), data)
    # encodes the labels as consecutive integers
    y_vals = np.unique(pd.concat([y['train'], y['test']], axis=0))
    num_classes = len(y_vals)
    label_mappings = sklearn.preprocessing.LabelEncoder()
    label_mappings.fit(y_vals)

    y_train = label_mappings.transform(y_train)
    y_test = label_mappings.transform(y_test)
    logger("Labels being mapped to appropriate classes")
    models = []
    scores = []
    logger("Fitting nearest neighbors model")
    logger("Identifying optimal number of neighbors")
    # Tries all neighbor possibilities, based on either defaults or user
    # specified values
    num_neighbors = []
    for x in range(min_neighbors, max_neighbors):
        knn = KNeighborsClassifier(n_neighbors=x,
                                   leaf_size=leaf_size,
                                   p=p,
                                   algorithm=algorithm)
        knn.fit(X_train, y_train)
        models.append(knn)
        scores.append(accuracy_score(knn.predict(X_test), y_test))
        num_neighbors.append(x)

    logger(
        "->", "Optimal number of neighbors found: {}".format(
            num_neighbors[scores.index(max(scores))]))
    logger("->", "Accuracy found on testing set: {}".format(max(scores)))
    logger("Stored model under 'nearest_neighbors' key")
    # store the best-scoring model (highest accuracy), matching the logs above
    knn = models[scores.index(max(scores))]
    clearLog()
    return {
        'id': generate_id(),
        "model": knn,
        'num_classes': num_classes,
        "accuracy": {
            'accuracy_score': max(scores),
            'cross_val_score': cross_val_score(knn, X_train, y_train, cv=3)
        },
        "preprocesser": full_pipeline,
        "interpreter": label_mappings,
        'test_data': {
            'X': X_test,
            'y': y_test
        },
        "target": remove
    }
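
# Editor's illustrative sketch (not library code): choosing k by mean
# cross-validated accuracy rather than a single train/test split, a more
# stable alternative to the sweep above.
def _demo_knn_cv(X_train, y_train, min_neighbors=3, max_neighbors=10):
    from sklearn.model_selection import cross_val_score
    from sklearn.neighbors import KNeighborsClassifier
    means = {k: cross_val_score(KNeighborsClassifier(n_neighbors=k),
                                X_train, y_train, cv=3).mean()
             for k in range(min_neighbors, max_neighbors)}
    return max(means, key=means.get)   # k with the highest mean CV accuracy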
def train_svm(instruction,
              dataset=None,
              test_size=0.2,
              kernel='linear',
              text=[],
              preprocess=True,
              ca_threshold=None,
              drop=None,
              cross_val_size=0.3,
              degree=3,
              gamma='scale',
              coef0=0.0,
              max_iter=-1,
              random_state=49):
    '''
    function to train a support vector machine classification algorithm
    :param many params: used to hyperparametrize the function.
    :return a dictionary object with all of the information for the algorithm.
    '''

    logger("Reading in dataset")

    dataReader = DataReader(dataset)
    data = dataReader.data_generator()

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    logger("Preprocessing data")
    data, y, target, full_pipeline = initial_preprocesser(
        data,
        instruction,
        preprocess,
        ca_threshold,
        text,
        test_size=test_size,
        random_state=random_state)
    logger("->", "Target column found: {}".format(target))

    X_train = data['train']
    y_train = y['train']
    X_test = data['test']
    y_test = y['test']

    # classification_column = get_similar_column(getLabelwithInstruction(instruction), data)
    # Needed to make a custom label encoder due to train test split changes
    # Can still be inverse transformed, just a bit of extra work
    y_vals = np.unique(pd.concat([y['train'], y['test']], axis=0))
    num_classes = len(y_vals)
    label_mappings = sklearn.preprocessing.LabelEncoder()
    label_mappings.fit(y_vals)

    y_train = label_mappings.transform(y_train)
    y_test = label_mappings.transform(y_test)

    # Fitting to SVM and storing in the model dictionary
    logger("Fitting Support Vector Machine")
    clf = svm.SVC(kernel=kernel,
                  degree=degree,
                  gamma=gamma,
                  coef0=coef0,
                  max_iter=max_iter,
                  random_state=random_state)
    clf.fit(X_train, y_train)

    score = accuracy_score(clf.predict(X_test), y_test)

    logger("->", "Accuracy found on testing set: {}".format(score))

    logger('->', "Stored model under 'svm' key")
    clearLog()
    return {
        'id': generate_id(),
        "model": clf,
        'num_classes': num_classes,
        "accuracy": {
            'cross_val_score': cross_val_score(clf, X_train, y_train),
            'accuracy_score': score
        },
        "target": target,
        "preprocesser": full_pipeline,
        "interpreter": label_mappings,
        'test_data': {
            'X': X_test,
            'y': y_test
        }
    }
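
# Editor's illustrative sketch (not library code): comparing the kernel
# choices train_svm exposes; degree only matters for 'poly', and gamma for
# 'rbf', 'poly', and 'sigmoid'.
def _demo_svm_kernels(X_train, y_train, X_test, y_test):
    from sklearn import svm
    from sklearn.metrics import accuracy_score
    scores = {}
    for kernel in ('linear', 'poly', 'rbf'):
        clf = svm.SVC(kernel=kernel, degree=3, gamma='scale')
        clf.fit(X_train, y_train)
        scores[kernel] = accuracy_score(y_te st := y_test, clf.predict(X_test)) if False else accuracy_score(y_test, clf.predict(X_test))
    return scores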
def regression_ann(instruction,
                   callback=False,
                   ca_threshold=None,
                   text=[],
                   dataset=None,
                   drop=None,
                   preprocess=True,
                   test_size=0.2,
                   random_state=49,
                   epochs=50,
                   generate_plots=True,
                   callback_mode='min',
                   maximizer="val_loss",
                   save_model=False,
                   save_path=os.getcwd()):
    '''
    Body of the regression function used that is called in the neural network query
    if the data is numerical.
    :param many parameters: used to preprocess, tune, plot generation, and parameterizing the neural network trained.
    :return dictionary that holds all the information for the finished model.
    '''

    logger("Reading in dataset")

    dataReader = DataReader(dataset)
    data = dataReader.data_generator()
    # data = pd.read_csv(self.dataset)

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)
    data, y, target, full_pipeline = initial_preprocesser(
        data,
        instruction,
        preprocess,
        ca_threshold,
        text,
        test_size=test_size,
        random_state=random_state)
    logger("->", "Target column found: {}".format(target))

    X_train = data['train']
    X_test = data['test']

    # Target scaling
    target_scaler = StandardScaler()

    y_train = target_scaler.fit_transform(np.array(y['train']).reshape(-1, 1))
    y_test = target_scaler.transform(np.array(y['test']).reshape(-1, 1))

    logger("Establishing callback function")

    models = []
    losses = []
    model_data = []

    # callback function to store lowest loss value
    es = EarlyStopping(monitor=maximizer,
                       mode=callback_mode,
                       verbose=0,
                       patience=5)

    callback_value = None
    if callback is not False:
        callback_value = [es]

    i = 0

    # get the first 3 layer model
    model = get_keras_model_reg(data, i)

    logger("Training initial model")
    history = model.fit(X_train,
                        y_train,
                        epochs=epochs,
                        validation_data=(X_test, y_test),
                        callbacks=callback_value,
                        verbose=0)
    models.append(history)
    model_data.append(model)

    col_name = [[
        "Initial number of layers ", "| Training Loss ", "| Test Loss "
    ]]
    col_width = max(len(word) for row in col_name for word in row) + 2
    for row in col_name:
        print((" " * 2 * counter) + "| " +
              ("".join(word.ljust(col_width) for word in row)) + " |")
    values = []
    values.append(str(len(model.layers)))
    values.append("| " + str(history.history['loss'][-1]))
    values.append("| " + str(history.history['val_loss'][-1]))
    datax = []
    datax.append(values)
    for row in datax:
        print((" " * 2 * counter) + "| " +
              ("".join(word.ljust(col_width) for word in row)) + " |")

    losses.append(history.history[maximizer][-1])

    # keeps running model and fit functions until the validation loss stops
    # decreasing
    logger("Testing number of layers")
    col_name = [["Current number of layers", "| Training Loss", "| Test Loss"]]
    col_width = max(len(word) for row in col_name for word in row) + 2
    for row in col_name:
        print((" " * 2 * counter) + "| " +
              ("".join(word.ljust(col_width) for word in row)) + " |")
    datax = []
    #while all(x > y for x, y in zip(losses, losses[1:])):
    while len(losses) <= 2 or losses[-1] < losses[-2]:
        model = get_keras_model_reg(data, i)
        history = model.fit(X_train,
                            y_train,
                            callbacks=callback_value,
                            epochs=epochs,
                            validation_data=(X_test, y_test),
                            verbose=0)
        model_data.append(model)
        models.append(history)

        values = []
        datax = []
        values.append(str(len(model.layers)))
        values.append("| " + str(history.history['loss'][-1]))
        values.append("| " + str(history.history['val_loss'][-1]))
        datax.append(values)
        for row in datax:
            print((" " * 2 * counter) + "| " +
                  ("".join(word.ljust(col_width) for word in row)) + " |")
        del values, datax
        losses.append(history.history[maximizer][-1])
        i += 1
    # print((" " * 2 * counter)+ tabulate(datax, headers=col_name, tablefmt='orgtbl'))
    final_model = model_data[losses.index(min(losses))]
    final_hist = models[losses.index(min(losses))]
    print("")
    logger('->',
           "Best number of layers found: " + str(len(final_model.layers)))

    logger('->', "Training Loss: " + str(final_hist.history['loss'][-1]))
    logger('->', "Test Loss: " + str(final_hist.history['val_loss'][-1]))

    # calls function to generate plots in plot generation
    plots = {}
    if generate_plots:
        init_plots, plot_names = generate_regression_plots(
            models[-1], data, y)
        for x in range(len(plot_names)):
            plots[str(plot_names[x])] = init_plots[x]

    if save_model:
        # save_model is a boolean flag; the destination is save_path
        save(final_model, save_path)
    # stores values in the client object models dictionary field
    print("")
    logger("Stored model under 'regression_ANN' key")
    clearLog()
    return {
        'id': generate_id(),
        'model': final_model,
        "target": target,
        "num_classes": 1,
        "plots": plots,
        "preprocesser": full_pipeline,
        "interpreter": target_scaler,
        'test_data': {
            'X': X_test,
            'y': y_test
        },
        'losses': {
            'training_loss': final_hist.history['loss'],
            'val_loss': final_hist.history['val_loss']
        }
    }
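
# Editor's illustrative sketch (not library code): regression_ann scales the
# target with StandardScaler, so the model predicts in scaled units; the
# returned 'interpreter' (the fitted scaler) maps predictions back. The
# 'result' dict below stands for a hypothetical return value from above.
def _demo_regression_inverse(result, new_X):
    preds_scaled = result['model'].predict(new_X)
    # inverse_transform restores the original target units
    return result['interpreter'].inverse_transform(preds_scaled.reshape(-1, 1))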
def classification_ann(instruction,
                       callback=False,
                       dataset=None,
                       text=[],
                       ca_threshold=None,
                       preprocess=True,
                       callback_mode='min',
                       drop=None,
                       random_state=49,
                       test_size=0.2,
                       epochs=50,
                       generate_plots=True,
                       maximizer="val_accuracy",
                       save_model=False,
                       save_path=os.getcwd()):
    '''
    Body of the classification function used that is called in the neural network query
    if the data is categorical.
    :param many parameters: used to preprocess, tune, plot generation, and parameterizing the neural network trained.
    :return dictionary that holds all the information for the finished model.
    '''
    logger("Reading in dataset")

    dataReader = DataReader(dataset)
    data = dataReader.data_generator()

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    data, y, remove, full_pipeline = initial_preprocesser(
        data,
        instruction,
        preprocess,
        ca_threshold,
        text,
        test_size=test_size,
        random_state=random_state)
    logger("->", "Target column found: {}".format(remove))

    # Needed to make a custom label encoder due to train test split changes
    # Can still be inverse transformed, just a bit of extra work
    y = pd.concat([y['train'], y['test']], axis=0)

    num_classes = len(np.unique(y))

    X_train = data['train']
    X_test = data['test']

    # initialized to None so the 'interpreter' key below does not raise a
    # NameError when the target is binary
    one_hot_encoder = None
    if num_classes > 2:
        # ANN needs target one hot encoded for multiclass classification
        one_hot_encoder = OneHotEncoder()
        y = pd.DataFrame(one_hot_encoder.fit_transform(
            np.reshape(y.values, (-1, 1))).toarray(),
                         columns=one_hot_encoder.get_feature_names())

    y_train = y.iloc[:len(X_train)]
    y_test = y.iloc[len(X_train):]

    models = []
    losses = []
    accuracies = []
    model_data = []

    logger("Establishing callback function")

    # early stopping callback
    es = EarlyStopping(monitor=maximizer, mode='max', verbose=0, patience=5)

    callback_value = None
    if callback is not False:
        callback_value = [es]

    i = 0
    model = get_keras_model_class(data, i, num_classes)
    logger("Training initial model")

    history = model.fit(X_train,
                        y_train,
                        callbacks=callback_value,
                        epochs=epochs,
                        validation_data=(X_test, y_test),
                        verbose=0)

    model_data.append(model)
    models.append(history)
    col_name = [[
        "Initial number of layers ", "| Training Accuracy ", "| Test Accuracy "
    ]]
    col_width = max(len(word) for row in col_name for word in row) + 2
    for row in col_name:
        print((" " * 2 * counter) + "| " +
              ("".join(word.ljust(col_width) for word in row)) + " |")
    values = []
    values.append(str(len(model.layers)))
    values.append("| " + str(history.history['accuracy'][
        len(history.history['val_accuracy']) - 1]))
    values.append("| " + str(history.history['val_accuracy'][
        len(history.history['val_accuracy']) - 1]))
    datax = []
    datax.append(values)
    for row in datax:
        print((" " * 2 * counter) + "| " +
              ("".join(word.ljust(col_width) for word in row)) + " |")
    # print((" " * 2 * counter)+ tabulate(datax, headers=col_name, tablefmt='orgtbl'))
    losses.append(history.history[maximizer][-1])
    accuracies.append(history.history['val_accuracy'][-1])
    # keeps adding layers until the validation accuracy stops improving

    logger("Testing number of layers")
    col_name = [[
        "Current number of layers", "| Training Accuracy", "| Test Accuracy"
    ]]
    col_width = max(len(word) for row in col_name for word in row) + 2

    for row in col_name:
        print((" " * 2 * counter) + "| " +
              ("".join(word.ljust(col_width) for word in row)) + " |")
    datax = []
    #while all(x < y for x, y in zip(accuracies, accuracies[1:])):
    while len(accuracies) <= 2 or accuracies[-1] > accuracies[-2]:
        model = get_keras_model_class(data, i, num_classes)
        history = model.fit(X_train,
                            y_train,
                            callbacks=callback_value,
                            epochs=epochs,
                            validation_data=(X_test, y_test),
                            verbose=0)

        values = []
        datax = []
        values.append(str(len(model.layers)))
        values.append("| " + str(history.history['accuracy'][
            len(history.history['accuracy']) - 1]))
        values.append("| " + str(history.history['val_accuracy'][
            len(history.history['val_accuracy']) - 1]))
        datax.append(values)
        for row in datax:
            print((" " * 2 * counter) + "| " +
                  ("".join(word.ljust(col_width) for word in row)) + " |")
        del values, datax
        losses.append(history.history[maximizer][-1])
        accuracies.append(history.history['val_accuracy'][-1])
        models.append(history)
        model_data.append(model)

        i += 1
    # print((" " * 2 * counter)+ tabulate(datax, headers=col_name, tablefmt='orgtbl'))
    # del values, datax

    final_model = model_data[accuracies.index(max(accuracies))]
    final_hist = models[accuracies.index(max(accuracies))]

    print("")
    logger('->',
           "Best number of layers found: " + str(len(final_model.layers)))
    logger('->', "Training Accuracy: " + str(final_hist.history['accuracy'][-1]))
    logger('->', "Test Accuracy: " + str(final_hist.history['val_accuracy'][-1]))

    # generates appropriate classification plots by feeding all information
    plots = {}
    if generate_plots:
        plots = generate_classification_plots(models[-1], data, y,
                                              model, X_test, y_test)

    if save_model:
        # save_model is a boolean flag; the destination is save_path
        save(final_model, save_path)

    print("")
    logger("Stored model under 'classification_ANN' key")
    clearLog()
    # stores the values and plots into the object dictionary
    return {
        'id': generate_id(),
        "model": final_model,
        'num_classes': num_classes,
        "plots": plots,
        "target": remove,
        "preprocesser": full_pipeline,
        "interpreter": one_hot_encoder,
        'test_data': {
            'X': X_test,
            'y': y_test
        },
        'losses': {
            'training_loss': final_hist.history['loss'],
            'val_loss': final_hist.history['val_loss']
        },
        'accuracy': {
            'training_accuracy': final_hist.history['accuracy'],
            'validation_accuracy': final_hist.history['val_accuracy']
        }
    }
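
# Editor's illustrative sketch (not library code): when num_classes > 2 the
# target is one-hot encoded, so softmax outputs must be argmax-ed and passed
# through the encoder's inverse to recover class labels. The 'result' dict
# below stands for a hypothetical return value from above.
def _demo_classification_inverse(result, new_X):
    import numpy as np
    probs = result['model'].predict(new_X)          # (n, num_classes) softmax
    one_hot = np.eye(probs.shape[1])[np.argmax(probs, axis=1)]
    return result['interpreter'].inverse_transform(one_hot)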
def dimensionality_ICA(instruction, dataset, target="", y=""):

    global counter

    dataReader = DataReader(dataset)

    if target == "":
        data = dataReader.data_generator()
        data.fillna(0, inplace=True)
        remove = get_similar_column(get_value_instruction(instruction), data)

        data, y, target, full_pipeline = initial_preprocesser(
            data, instruction, True, 0.2, [], 0.2, random_state=49)

        X_train = data['train']
        X_test = data['test']

        y_train = y['train']
        y_test = y['test']


    ica = FastICA(n_components=len(X_train.columns))
    X_train_mod = ica.fit_transform(X_train)
    # transform (not fit_transform) so the test split uses the components fit on train
    X_test_mod = ica.transform(X_test)


    clf = tree.DecisionTreeClassifier()
    clf.fit(X_train, y_train)

    clf_mod = tree.DecisionTreeClassifier()
    clf_mod.fit(X_train_mod, y_train)
    acc = []
    sets = []
    acc.append(accuracy_score(
        clf_mod.predict(X_test_mod), y_test))

    frame = pd.concat([pd.DataFrame(X_train_mod), pd.DataFrame(X_test_mod)], axis=0)
    frame[target] = np.r_[y_train, y_test]
    sets.append(frame)

    for i in range(2, len(X_train.columns)):
        ica = FastICA(n_components=i)
        X_train_mod = ica.fit_transform(X_train)
        X_test_mod = ica.transform(X_test)

        frame = pd.concat([pd.DataFrame(X_train_mod), pd.DataFrame(X_test_mod)], axis=0)
        frame[target] = np.r_[y_train, y_test]
        sets.append(frame)

        clf_mod = tree.DecisionTreeClassifier()
        clf_mod.fit(X_train_mod, y_train)

        acc.append(accuracy_score(
            clf_mod.predict(X_test_mod), y_test))

    del i

    data_modified = sets[acc.index(max(acc))]
    score = max(acc)


    return data_modified, score, ((len(
            X_train.columns) + 1) - len(data_modified.columns))
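
# Editor's illustrative sketch (not library code): like PCA, FastICA should be
# fit on the training split and only applied to the test split; re-fitting on
# the test data (as the original loop did) yields inconsistent components.
def _demo_fast_ica(X_train, X_test, n_components=3):
    from sklearn.decomposition import FastICA
    ica = FastICA(n_components=n_components, random_state=0)
    return ica.fit_transform(X_train), ica.transform(X_test)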


def dimensionality_PCA(instruction, dataset, ca_threshold=None):
    '''
    function to reduce dimensionality in dataset via principal component analysis method
    :param instruction: command sent to client instance in written query.
    :param dataset: data instantiated in client instance passed to the algorithm
    :param ca_threshold: percentage of dataset to be preprocessed using morphological component analysis
    '''

    global counter

    pca = PCA(0.92)

    data, y, target, full_pipeline = initial_preprocesser(
        dataset, instruction, ca_threshold=ca_threshold, preprocess=True)

    X_train = data['train']
    X_test = data['test']

    y_train = y['train']
    y_test = y['test']

    X_train_mod = pca.fit_transform(X_train)
    X_test_mod = pca.transform(X_test)

    clf = tree.DecisionTreeClassifier()
    clf_mod = tree.DecisionTreeClassifier()

    clf.fit(X_train, y_train)
    clf_mod.fit(X_train_mod, y_train)

    # the original listing was truncated here; the remainder mirrors the
    # earlier dimensionality_PCA definition above: sweep tree depth/criterion
    # on the reduced data, then return the best frame and scores
    acc = []
    acc.append(accuracy_score(clf_mod.predict(X_test_mod), y_test))

    for i, j in product(range(3, 10), ["entropy", "gini"]):
        model = tree.DecisionTreeClassifier(criterion=j, max_depth=i)
        model = model.fit(X_train_mod, y_train)
        acc.append(accuracy_score(model.predict(X_test_mod), y_test))
    del i, j

    data_modified = pd.concat(
        [pd.DataFrame(X_train_mod),
         pd.DataFrame(X_test_mod)], axis=0)

    data_modified[target] = np.r_[y_train, y_test]

    return data_modified, accuracy_score(
        clf.predict(X_test),
        y_test), max(acc), ((len(X_train.columns) + 1) - len(data_modified.columns))
def train_svm(instruction,
              dataset=None,
              test_size=0.2,
              kernel='linear',
              text=None,
              preprocess=True,
              ca_threshold=None,
              drop=None,
              cross_val_size=0.3):

    logger("Reading in dataset....")
    # reads dataset and fills n/a values with zeroes
    #data = pd.read_csv(self.dataset)

    dataReader = DataReader(dataset)
    data = dataReader.data_generator()

    if drop is not None:
        data.drop(drop, axis=1, inplace=True)

    data, y, target, full_pipeline = initial_preprocesser(
        data, instruction, preprocess, ca_threshold, text)
    logger("->", "Target Column Found: {}".format(target))

    X_train = data['train']
    y_train = y['train']
    X_test = data['test']
    y_test = y['test']

    # classification_column = get_similar_column(getLabelwithInstruction(instruction), data)
    # Needed to make a custom label encoder due to train test split changes
    # Can still be inverse transformed, just a bit of extra work
    y_vals = np.unique(pd.concat([y['train'], y['test']], axis=0))
    num_classes = len(y_vals)
    label_mappings = {val: i for i, val in enumerate(y_vals)}

    y_train = y_train.apply(lambda x: label_mappings[x]).values
    y_test = y_test.apply(lambda x: label_mappings[x]).values

    # Fitting to SVM and storing in the model dictionary
    logger("Fitting Support Vector Machine...")
    clf = svm.SVC(kernel=kernel)
    clf.fit(X_train, y_train)

    score = accuracy_score(clf.predict(X_test), y_test)

    logger("->", "Accuracy found on testing set: {}".format(score))

    logger('->', "Stored model under 'svm' key")
    clearLog()
    return {
        'id': generate_id(),
        "model": clf,
        "accuracy_score": score,
        "target": target,
        "preprocesser": full_pipeline,
        "interpreter": label_mappings,
        "cross_val_score": cross_val_score(clf, X_train, y_train)
    }