def train(self, num_classifiers=50):
    # Bagging: train one Naive Bayes classifier per bootstrap sample.
    bagged_datasets = DataHandler.create_bagged_datasets(
        num_classifiers, self.examples, self.targets)
    for bagged_dataset in bagged_datasets:
        naive_bayes = NaiveBayes(bagged_dataset[0], bagged_dataset[1])
        naive_bayes.train()
        self.nb_classifiers.append(naive_bayes)
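# The ensemble above is only trained; a matching prediction step would
# aggregate the base classifiers by majority vote. A minimal method sketch
# for the same ensemble class, assuming each NaiveBayes exposes a
# single-example predict() (not shown in the original):
from collections import Counter

def predict(self, example):
    # Each bagged classifier votes; the most common label wins.
    votes = [nb.predict(example) for nb in self.nb_classifiers]
    return Counter(votes).most_common(1)[0][0]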
def main():
    Dataset = namedtuple('Dataset', ['inputs', 'labels'])

    # Reading in data. You do not need to touch this.
    with open("train-images-idx3-ubyte.gz", 'rb') as f1, open("train-labels-idx1-ubyte.gz", 'rb') as f2:
        buf1 = gzip.GzipFile(fileobj=f1).read(16 + 60000 * 28 * 28)
        buf2 = gzip.GzipFile(fileobj=f2).read(8 + 60000)
        inputs = np.frombuffer(buf1, dtype='uint8', offset=16).reshape(60000, 28 * 28)
        inputs = np.where(inputs > 99, 1, 0)
        labels = np.frombuffer(buf2, dtype='uint8', offset=8)
        data_train = Dataset(inputs, labels)

    with open("t10k-images-idx3-ubyte.gz", 'rb') as f1, open("t10k-labels-idx1-ubyte.gz", 'rb') as f2:
        buf1 = gzip.GzipFile(fileobj=f1).read(16 + 10000 * 28 * 28)
        buf2 = gzip.GzipFile(fileobj=f2).read(8 + 10000)
        inputs = np.frombuffer(buf1, dtype='uint8', offset=16).reshape(10000, 28 * 28)
        inputs = np.where(inputs > 99, 1, 0)
        labels = np.frombuffer(buf2, dtype='uint8', offset=8)
        data_test = Dataset(inputs, labels)

    # run naive bayes
    model = NaiveBayes(10)
    model.train(data_train)
    print("{:.1f}%".format(model.accuracy(data_test) * 100))
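# The NaiveBayes class itself is not shown above. A minimal Bernoulli Naive
# Bayes sketch that matches the NaiveBayes(10) / train(dataset) /
# accuracy(dataset) interface used by main() -- the class body here is an
# assumption, not the original implementation:
import numpy as np

class BernoulliNaiveBayes:
    def __init__(self, n_classes):
        self.n_classes = n_classes

    def train(self, data):
        n, d = data.inputs.shape
        self.priors = np.zeros(self.n_classes)
        self.likelihoods = np.zeros((self.n_classes, d))
        for c in range(self.n_classes):
            members = data.inputs[data.labels == c]
            self.priors[c] = len(members) / n
            # Add-one smoothing keeps unseen pixel/class pairs off zero.
            self.likelihoods[c] = (members.sum(axis=0) + 1) / (len(members) + 2)

    def accuracy(self, data):
        # log P(c) + sum_j log P(x_j | c) for binarized pixels x_j in {0, 1}.
        log_joint = (np.log(self.priors)
                     + data.inputs @ np.log(self.likelihoods).T
                     + (1 - data.inputs) @ np.log(1 - self.likelihoods).T)
        return np.mean(log_joint.argmax(axis=1) == data.labels)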
def build_train_model(p, data, train_data):
    attrs = [v for k, v in p['var_map'].items() if k != 'out']
    train_insts = get_instances(train_data, attrs, p['out'])
    model = NaiveBayes.NaiveBayes()

    # PAD MODEL Instances
    attr_values = {
        attr: list(set(get_attr_values(data, attr)))
        for attr in attrs
    }
    label_values = list(set(get_attr_values(data, p['out'])))
    pad_insts = get_padding_instances(attr_values, label_values)

    # hacky way to ensure all insts are seen at least once
    # TRAIN padding instances
    for row in pad_insts:
        model.add_instances(row)

    # TRAIN Model
    for row in train_insts:
        model.add_instances(row)
    model.train()
    return model
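# get_padding_instances is not defined in these snippets. A plausible sketch
# (an assumption, not the original helper): emit one synthetic instance per
# combination of attribute values and label, so every (attribute=value, label)
# count starts at one -- effectively add-one (Laplace) smoothing. The full
# product can be large; Naive Bayes only needs each pair seen once, so the
# real helper may enumerate fewer rows.
import itertools

def get_padding_instances(attr_values, label_values):
    attrs = sorted(attr_values)
    for combo in itertools.product(*(attr_values[a] for a in attrs)):
        for label in label_values:
            inst = dict(zip(attrs, combo))
            inst['label'] = label
            yield inst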
def comparing_models(X_train, X_test, y_train, y_test):
    AdaBoost(X_train, X_test, y_train, y_test)
    Logistic_Regression(X_train, X_test, y_train, y_test)
    NaiveBayes(X_train, X_test, y_train, y_test)
    XGBoost(X_train, X_test, y_train, y_test)
    RandomForest(X_train, X_test, y_train, y_test)
    SVM(X_train, X_test, y_train, y_test)
    NeuralNetwork(X_train, X_test, y_train, y_test)
def naive_bayes_accuracy(p, data):
    attrs = p['var_map'].values()
    instances = get_instances(data, attrs, p['out'])
    print(len(instances))
    train_insts, test_insts = split_train_test(instances, train_ratio=0.9)

    # BUILD Model
    model = NaiveBayes.NaiveBayes()

    # PAD MODEL Instances
    attr_values = {
        attr: list(set(get_attr_values(data, attr)))
        for attr in attrs
    }
    label_values = list(set(get_attr_values(data, p['out'])))
    pad_insts = get_padding_instances(attr_values, label_values)
    # hacky way to ensure all insts are seen at least once
    for row in pad_insts:
        model.add_instances(row)

    # TRAIN Model
    for row in train_insts:
        model.add_instances(row)
    model.train()
    print(len(test_insts))

    # EVAL ACCURACY
    accs = []
    target_confs = []
    preds = {}
    for row in test_insts:
        pred = model.predict(row)
        for attr in pred:
            if attr not in preds:
                preds[attr] = []
            preds[attr] += [pred[attr]]
        target_confs += [pred[row['label']]]
    # Per-label average confidences (computed but not returned).
    avg_confs = {
        attr: sum(confs) / len(confs)
        for attr, confs in preds.items()
    }
    print('n rows', len(target_confs))
    # Returns the mean confidence assigned to the true label, not 0/1 accuracy.
    return sum(target_confs) / len(target_confs)
def _initialize_models(self, data_generator):
    """Initializes models prior to training."""
    models = {
        "Linear Regression": LinearRegression(),
        "Logistic Regression": LogisticRegression(),
        "Quadratic Regression": QuadraticRegression(),
        "Naive Bayes": NaiveBayes(std_X=data_generator.std_X,
                                  m0=data_generator.m0s,
                                  m1=data_generator.m1s),
        "kNN CV": kNNCV(n_folds=self.n_folds)
    }
    return models
def main():
    np.random.seed(0)
    X_train, X_val, y_train, y_val, x_sex, x_age, x_sex_age = get_credit()

    model = NaiveBayes(2)
    model.train(X_train, y_train)

    print("------------------------------------------------------------")
    print("Train accuracy:")
    print(model.accuracy(X_train, y_train))
    print("------------------------------------------------------------")
    print("Test accuracy:")
    print(model.accuracy(X_val, y_val))
    print("------------------------------------------------------------")
    print("Fairness measures:")
    model.print_fairness(X_val, y_val, x_sex_age)
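# print_fairness is opaque in this snippet. One measure it could plausibly
# report is demographic parity: the positive-prediction rate per protected
# group. A sketch (the group encoding is an assumption, and this is not
# necessarily what the original method computes):
import numpy as np

def demographic_parity(predictions, groups):
    # Fraction of positive predictions within each protected group.
    return {g: predictions[groups == g].mean() for g in np.unique(groups)}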
def get_model(model_type: str, model_config: Dict, w2v: torch.Tensor,
              vocab_list: List, model_name: str) -> Tuple[nn.Module, Dict]:
    # Instantiate model and configuration
    train_config = {
        "num_epochs": 30,
        "lr_rate": 2e-5,
        "log_step": 100,
        "l2norm": False,
        "l2factor": 3.,
        "lambda": 0.01,
    }
    if model_type == "nb":
        model = NaiveBayes(model_config)
    elif model_type == "lr":
        model = LogisticRegression(model_config)
        train_config["lr_rate"] = 2e-3
    elif model_type == "ff":
        model = feedforwardNN(model_config, w2v)
        train_config["num_epochs"] = 50
        train_config["lr_rate"] = 2e-4
    elif model_type == "cnn":
        model = convolutionalNN(model_config, w2v)
        train_config["num_epochs"] = 30
        train_config["lr_rate"] = 2e-4
        train_config["l2norm"] = False
    elif model_type == "bertff":
        model = BERTfeedforward(model_config, vocab_list)
        train_config["num_epochs"] = 30
        train_config["lr_rate"] = 1e-5
    else:
        raise ValueError("Model type is not supported.")

    # Load a saved checkpoint when a model name is given.
    if model_name != "":
        model = torch.load("./models/" + model_name)
    return model, train_config
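# Hypothetical usage sketch: an empty model_name instantiates a fresh model
# rather than loading a checkpoint from ./models/.
model, train_config = get_model("cnn", model_config, w2v, vocab_list, "")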
if __name__ == "__main__":
    train_data = []
    train_label = []
    load_data = []
    for file in config.data_files:
        load_data.append(LoadData(file))
    for cpt in range(len(load_data)):
        train_x, train_y = load_data[cpt].getTrainData()
        train_data += train_x
        train_label += train_y

    nb_model_nb = NaiveBayes(train_data, train_label)
    nb_model_svm = SVM(train_data, train_label)

    # Save Naive Bayes Model
    nb_pickle = open(config.naive_bayes_path, 'wb')
    pickle.dump(nb_model_nb, nb_pickle)
    nb_pickle.close()

    # Save SVM Model
    svm_pickle = open(config.SVM_path, 'wb')
    pickle.dump(nb_model_svm, svm_pickle)
    svm_pickle.close()

    valid_data = []
    valid_label = []
    for cpt in range(len(load_data)):
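# A minimal sketch of restoring the pickled models for later inference
# (paths come from the same config module as above):
with open(config.naive_bayes_path, 'rb') as f:
    nb_model = pickle.load(f)
with open(config.SVM_path, 'rb') as f:
    svm_model = pickle.load(f)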
from models import KNN, NaiveBayes, DecisionTree
from data import Data

training_data = Data('skinTraining')
testing_data = Data('skinTesting')

knn = KNN(training_data, testing_data)
bayes = NaiveBayes(training_data, testing_data)
tree = DecisionTree(training_data, testing_data)

knnResult = knn.evaluate()
bayesResult = bayes.evaluate()
treeResult = tree.evaluate()

print(knnResult, treeResult, bayesResult)
        goldLabels.append(label)
        s1 = " ".join(leaves(t1))
        s2 = " ".join(leaves(t2))
        modelPredict = model.predict(s1, s2)
        predictions.append(modelPredict)
        count += 1
    accuracy = accuracy_score(predictions, goldLabels)
    print("Accuracy on SICK %s set: %f" % (dataSet, accuracy))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="arguments for CioEntails system")
    parser.add_argument("--model", type=str, default="baseline",
                        help="Name of model to use for system")
    args = parser.parse_args()

    if args.model == "baseline":
        model = Baseline("cosineSimilarity", ["keyword_overlap"])
    elif args.model == "keyword":
        model = Keyword("cosineSimilarity", ["keyword_overlap"])
    elif args.model == "NB":
        model = NaiveBayes("cosineSimilarity", ["keyword_overlap"])

    start = time.time()
    evaluateModel(model, args.model, sick_dev_reader)
    print("Evaluation done in %f seconds" % (time.time() - start))
from models import NaiveBayes
import numpy as np
import pandas as pd
from dfply import *

target = NaiveBayes()
verification_data = np.load('./data/verification_data.npy',
                            allow_pickle=True).tolist()
df = (pd.DataFrame.from_dict(verification_data, orient='index')
        .reset_index()
        .rename(columns={'index': 'link', 0: 'true_label'}))

verification_link = []
predict_category = []
for link in df['link']:
    category = target.classify(link)
    verification_link.append(link)
    predict_category.append(category)

predict_data = dict(zip(verification_link, predict_category))
predict_data = (pd.DataFrame.from_dict(predict_data, orient='index')
                  .reset_index()
                  .rename(columns={'index': 'link', 0: 'pre_label'}))

verification = pd.merge(df, predict_data, on='link')
verification = verification >> mutate(flg=if_else(X.true_label == X.pre_label, 1, 0))
# Fraction of correct predictions (the verification set is assumed to hold 160 links).
print(verification['flg'].sum() / 160)