Esempio n. 1
0
 def train(self, num_classifiers=50):
     """Train one Naive Bayes classifier per bagged dataset.

     ``num_classifiers``, ``self.examples`` and ``self.targets`` are handed
     to ``DataHandler.create_bagged_datasets``. For every dataset it yields,
     a ``NaiveBayes`` is built from the dataset's first two elements,
     trained, and appended to ``self.nb_classifiers``.
     """
     for bag in DataHandler.create_bagged_datasets(
             num_classifiers, self.examples, self.targets):
         classifier = NaiveBayes(bag[0], bag[1])
         classifier.train()
         self.nb_classifiers.append(classifier)
Esempio n. 2
0
def main():
    Dataset = namedtuple('Dataset', ['inputs', 'labels'])

    # Reading in data. You do not need to touch this.
    with open("train-images-idx3-ubyte.gz",
              'rb') as f1, open("train-labels-idx1-ubyte.gz", 'rb') as f2:
        buf1 = gzip.GzipFile(fileobj=f1).read(16 + 60000 * 28 * 28)
        buf2 = gzip.GzipFile(fileobj=f2).read(8 + 60000)
        inputs = np.frombuffer(buf1, dtype='uint8',
                               offset=16).reshape(60000, 28 * 28)
        inputs = np.where(inputs > 99, 1, 0)
        labels = np.frombuffer(buf2, dtype='uint8', offset=8)
        data_train = Dataset(inputs, labels)
    with open("t10k-images-idx3-ubyte.gz",
              'rb') as f1, open("t10k-labels-idx1-ubyte.gz", 'rb') as f2:
        buf1 = gzip.GzipFile(fileobj=f1).read(16 + 10000 * 28 * 28)
        buf2 = gzip.GzipFile(fileobj=f2).read(8 + 10000)
        inputs = np.frombuffer(buf1, dtype='uint8',
                               offset=16).reshape(10000, 28 * 28)
        inputs = np.where(inputs > 99, 1, 0)
        labels = np.frombuffer(buf2, dtype='uint8', offset=8)
        data_test = Dataset(inputs, labels)

    # run naive bayes
    model = NaiveBayes(10)
    model.train(data_train)
    print("{:.1f}%".format(model.accuracy(data_test) * 100))
Esempio n. 3
0
def build_train_model(p, data, train_data):
    """Build a Naive Bayes model and train it on ``train_data``.

    The attributes are the values of ``p['var_map']`` whose key is not
    ``'out'``; ``p['out']`` is passed to the helpers as the output/label
    name. Before the training instances are added, the model is fed padding
    instances generated from the distinct attribute and label values found
    in ``data``.

    Returns:
        The model after ``train()`` has been called on it.
    """
    attrs = []
    for key, name in p['var_map'].items():
        if key != 'out':
            attrs.append(name)
    train_insts = get_instances(train_data, attrs, p['out'])

    model = NaiveBayes.NaiveBayes()

    # Distinct values seen in the full data set, per attribute and for the
    # label.
    attr_values = {}
    for attr in attrs:
        attr_values[attr] = list(set(get_attr_values(data, attr)))
    label_values = list(set(get_attr_values(data, p['out'])))

    # Padding rows go in first (a hack so every instance is seen at least
    # once), then the real training rows.
    pad_insts = get_padding_instances(attr_values, label_values)
    for batch in (pad_insts, train_insts):
        for row in batch:
            model.add_instances(row)

    model.train()
    return model
Esempio n. 4
0
def comparing_models(X_train, X_test, y_train, y_test):
    """Run each model routine, in a fixed order, on the same split.

    Every routine receives ``(X_train, X_test, y_train, y_test)``; their
    return values are discarded.
    """
    split = (X_train, X_test, y_train, y_test)
    AdaBoost(*split)
    Logistic_Regression(*split)
    NaiveBayes(*split)
    XGBoost(*split)
    RandomForest(*split)
    SVM(*split)
    NeuralNetwork(*split)
Esempio n. 5
0
def naive_bayes_accuracy(p, data):
    """Train a Naive Bayes model on a 90% split and score the remainder.

    Instances are built from ``data`` using the values of ``p['var_map']``
    as attributes and ``p['out']`` as the output/label name, then split with
    ``train_ratio=0.9``. The model is first fed padding instances generated
    from the distinct attribute and label values in ``data``, then the
    training instances.

    Prints the total instance count, the test instance count and the number
    of scored rows.

    Returns:
        The mean, over the test instances, of the value that
        ``model.predict(row)`` holds under the row's own ``row['label']``.

    Raises:
        ZeroDivisionError: If the test split is empty.
    """
    # NOTE(review): every value of var_map is used here, including one
    # stored under the key 'out' if present — confirm that is intended.
    attrs = p['var_map'].values()

    instances = get_instances(data, attrs, p['out'])
    print(len(instances))

    train_insts, test_insts = split_train_test(instances, train_ratio=0.9)

    # BUILD Model
    model = NaiveBayes.NaiveBayes()

    # PAD MODEL Instances
    attr_values = {
        attr: list(set(get_attr_values(data, attr)))
        for attr in attrs
    }
    label_values = list(set(get_attr_values(data, p['out'])))

    # Hacky way to ensure all instances are seen at least once.
    for row in get_padding_instances(attr_values, label_values):
        model.add_instances(row)

    # TRAIN Model
    for row in train_insts:
        model.add_instances(row)

    model.train()

    print(len(test_insts))

    # EVAL ACCURACY: keep only the prediction value for each row's true label.
    target_confs = [model.predict(row)[row['label']] for row in test_insts]

    print('n rows', len(target_confs))
    return sum(target_confs) / len(target_confs)
Esempio n. 6
0
 def _initialize_models(self, data_generator):
     """Create the models to be trained, keyed by display name.

     The Naive Bayes model is configured from ``data_generator.std_X``,
     ``data_generator.m0s`` and ``data_generator.m1s``; the kNN model
     receives ``self.n_folds``. The other models take no arguments.

     Returns a dict mapping each display name to its new model instance.
     """
     models = {}
     models["Linear Regression"] = LinearRegression()
     models["Logistic Regression"] = LogisticRegression()
     models["Quadratic Regression"] = QuadraticRegression()
     models["Naive Bayes'"] = NaiveBayes(
         std_X=data_generator.std_X,
         m0=data_generator.m0s,
         m1=data_generator.m1s,
     )
     models["kNN CV"] = kNNCV(n_folds=self.n_folds)
     return models
Esempio n. 7
0
def main():
    """Train a Naive Bayes model on the credit data and report on it.

    Seeds NumPy's random generator with 0, loads the splits via
    ``get_credit()``, trains ``NaiveBayes(2)`` on the training split, then
    prints train accuracy, validation accuracy (labelled "Test accuracy:")
    and the model's fairness report on the validation split.
    """
    np.random.seed(0)

    # get_credit() returns seven values; two of them are unused here.
    X_train, X_val, y_train, y_val, _x_sex, _x_age, x_sex_age = get_credit()

    classifier = NaiveBayes(2)
    classifier.train(X_train, y_train)

    rule = "------------------------------------------------------------"

    reports = (
        ("Train accuracy:", X_train, y_train),
        ("Test accuracy:", X_val, y_val),
    )
    for heading, features, targets in reports:
        print(rule)
        print(heading)
        print(classifier.accuracy(features, targets))

    print(rule)
    print("Fairness measures:")
    classifier.print_fairness(X_val, y_val, x_sex_age)
Esempio n. 8
0
def get_model(model_type: str, model_config: Dict, w2v: torch.Tensor, vocab_list: List, model_name: str) -> nn.Module:
    """Instantiate a model together with its training configuration.

    Args:
        model_type: One of ``"nb"``, ``"lr"``, ``"ff"``, ``"cnn"`` or
            ``"bertff"``.
        model_config: Configuration passed to the model constructor.
        w2v: Passed to the ``"ff"`` and ``"cnn"`` model constructors.
        vocab_list: Passed to the ``"bertff"`` model constructor.
        model_name: File name under ``./models/`` of a saved model to load
            in place of the freshly built one; an empty string skips
            loading.

    Returns:
        A ``(model, train_config)`` tuple. Note that the signature's return
        annotation only names the model.

    Raises:
        ValueError: If ``model_type`` is not supported.
    """
    # Defaults shared by all model types; the branches below override them.
    train_config = {
                    "num_epochs": 30,
                    "lr_rate": 2e-5,
                    "log_step": 100,
                    "l2norm": False,
                    "l2factor": 3.,
                    "lambda": 0.01,
                   }

    if model_type == "nb":
        model = NaiveBayes(model_config)
    elif model_type == "lr":
        model = LogisticRegression(model_config)
        train_config["lr_rate"] = 2e-3
    elif model_type == "ff":
        model = feedforwardNN(model_config, w2v)
        train_config["num_epochs"] = 50
        train_config["lr_rate"] = 2e-4
    elif model_type == "cnn":
        model = convolutionalNN(model_config, w2v)
        train_config["num_epochs"] = 30
        train_config["lr_rate"] = 2e-4
        train_config["l2norm"] = False
    elif model_type == "bertff":
        model = BERTfeedforward(model_config, vocab_list)
        train_config["num_epochs"] = 30
        train_config["lr_rate"] = 1e-5
    else:
        raise ValueError("Model type is not supported.")

    # Load a saved model when a name is given. Truthiness replaces the old
    # `is not ""`, which compared object identity rather than value.
    if model_name:
        # NOTE: torch.load unpickles the file; only load trusted checkpoints.
        model = torch.load("./models/"+model_name)

    return model, train_config
Esempio n. 9
0

if __name__ == "__main__":
    # One loader per configured data file.
    load_data = [LoadData(data_file) for data_file in config.data_files]

    # Concatenate the training split of every loader.
    train_data = []
    train_label = []
    for loader in load_data:
        train_x, train_y = loader.getTrainData()
        train_data += train_x
        train_label += train_y

    nb_model_nb = NaiveBayes(train_data, train_label)
    nb_model_svm = SVM(train_data, train_label)

    # Save Naive Bayes Model
    with open(config.naive_bayes_path, 'wb') as nb_pickle:
        pickle.dump(nb_model_nb, nb_pickle)

    # Save SVM Model (previously the Naive Bayes model was written here).
    with open(config.SVM_path, 'wb') as svm_pickle:
        pickle.dump(nb_model_svm, svm_pickle)

    valid_data = []
    valid_label = []
    for cpt in range(len(load_data)):
Esempio n. 10
0
from models import KNN, NaiveBayes, DecisionTree
from data import Data

# Load the training and testing splits by name.
training_data = Data('skinTraining')
testing_data = Data('skinTesting')

# Build each classifier on the same pair of splits.
knn = KNN(training_data, testing_data)
bayes = NaiveBayes(training_data, testing_data)
tree = DecisionTree(training_data, testing_data)

knnResult = knn.evaluate()
bayesResult = bayes.evaluate()
treeResult = tree.evaluate()

# Output order is KNN, decision tree, Naive Bayes. A single pre-formatted
# argument prints the same text under both Python 2 and Python 3.
print("%s %s %s" % (knnResult, treeResult, bayesResult))
Esempio n. 11
0
            goldLabels.append(label)
            s1 = " ".join(leaves(t1))
            s2 = " ".join(leaves(t2))
            modelPredict = model.predict(s1, s2)
            predictions.append(modelPredict)
            count += 1

    accuracy = accuracy_score(predictions, goldLabels)
    print "Accuracy on SICK %s set: %f" % (dataSet, accuracy)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="arguments for CioEntails system")
    parser.add_argument("--model",
                        type=str,
                        default="baseline",
                        choices=["baseline", "keyword", "NB"],
                        help="Name of model to use for system")
    args = parser.parse_args()

    if args.model == "baseline":
        model = Baseline("cosineSimilarity", ["keyword_overlap"])
    elif args.model == "keyword":
        model = Keyword("cosineSimilarity", ["keyword_overlap"])
    elif args.model == "NB":
        model = NaiveBayes("cosineSimilarity", ["keyword_overlap"])
    else:
        # Guards against `choices` and the branches above drifting apart;
        # without it an unknown name left `model` unbound.
        parser.error("unsupported model: %s" % args.model)

    start = time.time()
    evaluateModel(model, args.model, sick_dev_reader)
    # A single pre-formatted argument prints the same text under both
    # Python 2 and Python 3.
    print("Evaluation done in %f seconds" % (time.time() - start))
Esempio n. 12
0
from models import NaiveBayes

import numpy as np
import pandas as pd
from dfply import *

target = NaiveBayes()

# NOTE: allow_pickle=True unpickles objects stored in the file; only load
# trusted data. The result is presumably a {link: true_label} dict, since it
# is passed to DataFrame.from_dict below — confirm against the data file.
verification_data = np.load('./data/verification_data.npy', allow_pickle=True).tolist()
df = (
    pd.DataFrame.from_dict(verification_data, orient='index')
    .reset_index()
    .rename(columns={'index': 'link', 0: 'true_label'})
)

# Classify every link in the verification set.
verification_link = list(df['link'])
predict_caterogy = [target.classify(link) for link in verification_link]

predict_data = dict(zip(verification_link, predict_caterogy))
predict_data = (
    pd.DataFrame.from_dict(predict_data, orient='index')
    .reset_index()
    .rename(columns={'index': 'link', 0: 'pre_label'})
)
verification = pd.merge(df, predict_data, on='link')

# flg is 1 where the predicted label matches the true label, else 0.
verification = verification >> mutate(flg = if_else(X.true_label == X.pre_label, 1, 0))
# Fraction of matches over the actual number of rows, rather than a
# hard-coded 160.
print(verification['flg'].mean())