def construct_data(model_name, x, y, bins=20):
    """Assemble per-feature statistics of (x, y) for a model's dataset.

    Returns a dict with the raw data/targets, per-feature histograms,
    discretized category ratios, surrogate confidence (when the model is a
    SurrogateMixin) and the model score — or None when the dataset backing
    the model cannot be found.
    """
    model = get_model(model_name)
    data_name = get_model_data(model_name)
    try:
        data = get_dataset(data_name, split=True, verbose=0, discrete=True)
    except LookupError:
        print("Cannot find data with name {}".format(data_name))
        return None

    hists = data2histogram(x, bins, data['ranges'])
    ratios = get_category_ratios(x, data['discretizer'], data['categories'])
    # Fidelity is only defined for surrogate models; others get no confidence.
    confidence = model.fidelity(x) if isinstance(model, SurrogateMixin) else None
    return {
        'data': x,
        'target': y,
        'hists': hists,
        'ratios': ratios,
        'confidence': confidence,
        'score': model.score(y, model.predict(x)),
    }
def train_svm(name='oversample', dataset='pima', C=1., sample=True, **kwargs):
    """Train an SVC on the pima dataset, optionally re-sampling first.

    Re-sampling targets hand-picked feature ranges (Glucose / Age / BMI).
    Returns (accuracy, loss, auc) from the wrapped classifier's test run.
    """
    from sklearn.svm import SVC
    data = get_dataset(dataset, split=True, discrete=False, one_hot=True)
    train_x, train_y, test_x, test_y, feature_names = \
        data['train_x'], data['train_y'], data['test_x'], data['test_y'], data['feature_names']

    _, counts = np.unique(train_y, return_counts=True)
    print('before sample: [{}]'.format('/'.join([str(c) for c in counts])))
    # Feature-value windows to over-sample within (names -> [lo, hi]).
    sample_filters = {'Glucose': [105, 121], 'Age': [31.5, 64.4], 'Body Mass Index': [25.7, 100]}
    filters = {feature_names.index(key): value for key, value in sample_filters.items()}
    print("over sampling training data")
    if sample:
        train_x, train_y = re_sampling(train_x, train_y, filters, rate=1)
        print("#data after over sampling:", len(train_y))
        _, counts = np.unique(train_y, return_counts=True)
        print('after sample: [{}]'.format('/'.join([str(c) for c in counts])))

    one_hot_encoder, is_categorical = data['one_hot_encoder'], data['is_categorical']
    model_name = '-'.join([dataset, 'svm'] + [name])
    classifier = SVC(C=C, probability=True, **kwargs)
    nn = SKClassifier(classifier, name=model_name, standardize=True,
                      one_hot_encoder=one_hot_encoder)
    nn.train(train_x, train_y)
    nn.evaluate(train_x, train_y, stage='train')
    acc, loss, auc = nn.test(test_x, test_y)
    nn.save()
    return acc, loss, auc
def cv_nn(dataset, neurons=(20, 20), max_iter=1000):
    """Cross-validate an MLP over several alpha values and print score stats.

    For each alpha, runs 3-fold shuffle-split CV five times on the
    re-sampled training split and prints mean/std/min/max of the scores.
    """
    from sklearn.model_selection import cross_validate, ShuffleSplit
    from sklearn.neural_network import MLPClassifier

    n_repeats = 5
    data = get_dataset(dataset, split=True, discrete=False, one_hot=True)
    train_x, train_y, test_x, test_y, feature_names = \
        data['train_x'], data['train_y'], data['test_x'], data['test_y'], data['feature_names']
    train_x, train_y = do_re_sample(train_x, train_y, feature_names)

    splitter = ShuffleSplit(n_splits=3, test_size=0.3, random_state=0)
    for alpha in [0.1, 0.5, 1.0]:
        clf = MLPClassifier(neurons, alpha=alpha, max_iter=max_iter, tol=1e-5)
        scores = []
        for _ in range(n_repeats):
            fold_scores = cross_validate(clf, train_x, train_y, cv=splitter)
            scores.extend(fold_scores['test_score'].tolist())
        print('alpha {}:'.format(alpha))
        print('score: {}, std: {}, min: {}, max: {}\n'.format(
            np.mean(scores), np.std(scores), np.min(scores), np.max(scores)))
def model_metric(model_name, data):
    """Return jsonified confusion matrix and AUC for a model on one split.

    `data` must be 'train' or 'test'; returns None when the model file is
    missing; raises ValueError for any other `data` value.
    """
    try:
        model = get_model(model_name)
    except FileNotFoundError:
        return None
    if data not in ('train', 'test'):
        raise ValueError("Unknown data {}".format(data))
    dataset = get_dataset(get_model_data(model_name), split=True)
    x = dataset[data + '_x']
    y = dataset[data + '_y']
    conf_mat = confusion_matrix(y, model.predict(x))
    # Per-class AUC (average=None keeps one score per label).
    auc = auc_score(y, model.predict_prob(x), average=None)
    return jsonify({'confusionMatrix': conf_mat, 'auc': auc})
def get_stream(model_name, data_type, conditional=True, bins=20, filters=None):
    """Compute and jsonify stream data for a model over the chosen split."""
    model = get_model(model_name)
    dataset = get_dataset(get_model_data(model_name), split=True)
    x, y = get_model_x_y(model_name, data_type, filters)
    streams = compute_streams(model, x, y, dataset['ranges'],
                              dataset['categories'], conditional, bins)
    return jsonify(streams)
def train_nn(name='nn', dataset='abalone2', neurons=(20, ), alpha=0.01, **kwargs):
    """Over-sample the abalone2 training split and fit an sklearn MLP.

    Over-sampling targets hand-picked shell/shucked-weight windows.
    Returns (accuracy, loss, auc) from the wrapped classifier's test run.
    """
    from sklearn.neural_network import MLPClassifier, MLPRegressor
    data = get_dataset(dataset, split=True, discrete=False, one_hot=True)
    train_x, train_y, test_x, test_y, feature_names = \
        data['train_x'], data['train_y'], data['test_x'], data['test_y'], data['feature_names']

    _, counts = np.unique(train_y, return_counts=True)
    print('before sample: [{}]'.format('/'.join([str(c) for c in counts])))
    # abalone2-specific windows (feature name -> [lo, hi]).
    # (wine_quality_red alternative: {'alcohol': [10.5, 11.7]})
    sample_filters = {'shell weight': [0.249, 0.432], 'shucked weight': [0.337, 0.483]}
    filters = {feature_names.index(key): value for key, value in sample_filters.items()}
    print("over sampling training data")
    train_x, train_y = over_sampling(train_x, train_y, filters, rate=2)
    print("#data after over sampling:", len(train_y))
    _, counts = np.unique(train_y, return_counts=True)
    print('after sample: [{}]'.format('/'.join([str(c) for c in counts])))

    one_hot_encoder, is_categorical = data['one_hot_encoder'], data['is_categorical']
    model_name = '-'.join([dataset, name] + [str(neuron) for neuron in neurons] + ['oversample'])
    classifier = MLPClassifier(hidden_layer_sizes=neurons, max_iter=5000,
                               alpha=alpha, **kwargs)
    nn = SKClassifier(classifier, name=model_name, standardize=True,
                      one_hot_encoder=one_hot_encoder)
    nn.train(train_x, train_y)
    nn.evaluate(train_x, train_y, stage='train')
    acc, loss, auc = nn.test(test_x, test_y)
    nn.save()
    return acc, loss, auc
def train_svm(name='svm', dataset='wine', C=1.0, problem='classification', **kwargs):
    """Train the project `SVM` wrapper on *dataset*; return (model, accuracy).

    NOTE(review): `rebalance` is neither a parameter nor a local here —
    presumably a module-level flag defined elsewhere in this file; confirm it
    exists, otherwise this function raises NameError at runtime.
    """
    data = get_dataset(dataset, split=True, discrete=False, one_hot=True)
    train_x, train_y, test_x, test_y, feature_names = \
        data['train_x'], data['train_y'], data['test_x'], data['test_y'], data['feature_names']
    if rebalance:
        print("balancing training data")
        train_x, train_y = sample_balance(train_x, train_y)
        print("#data after balancing:", len(train_y))
    one_hot_encoder, is_categorical = data['one_hot_encoder'], data['is_categorical']
    model_name = '-'.join([dataset, name])
    svm = SVM(name=model_name, problem=problem, C=C, one_hot_encoder=one_hot_encoder, **kwargs)
    svm.train(train_x, train_y)
    svm.evaluate(train_x, train_y, stage='train')
    # Only accuracy is returned to the caller; loss and auc are discarded.
    acc, loss, auc = svm.test(test_x, test_y)
    return svm, acc
def train_nn(name='nn', dataset='wine', neurons=(20,), alpha=0.01, problem='classification', **kwargs):
    """Train the project `NeuralNet` wrapper on *dataset*; return (model, accuracy).

    NOTE(review): `rebalance` is neither a parameter nor a local here —
    presumably a module-level flag defined elsewhere in this file; confirm it
    exists, otherwise this function raises NameError at runtime.
    """
    data = get_dataset(dataset, split=True, discrete=False, one_hot=True)
    train_x, train_y, test_x, test_y, feature_names = \
        data['train_x'], data['train_y'], data['test_x'], data['test_y'], data['feature_names']
    if rebalance:
        print("balancing training data")
        train_x, train_y = sample_balance(train_x, train_y)
        print("#data after balancing:", len(train_y))
    one_hot_encoder, is_categorical = data['one_hot_encoder'], data['is_categorical']
    # Model name encodes dataset, base name, and the layer widths.
    model_name = '-'.join([dataset, name] + [str(neuron) for neuron in neurons])
    nn = NeuralNet(name=model_name, problem=problem, neurons=neurons, max_iter=5000,
                   alpha=alpha, one_hot_encoder=one_hot_encoder, **kwargs)
    nn.train(train_x, train_y)
    nn.evaluate(train_x, train_y, stage='train')
    # Only accuracy is returned to the caller; loss and auc are discarded.
    acc, loss, auc = nn.test(test_x, test_y)
    return nn, acc
def get_model_x_y(model_name, data_type='train', filters=None):
    """Fetch the (x, y) data for a model's split, optionally filtered.

    Args:
        model_name: registered model identifier.
        data_type: one of 'train', 'test', 'sample train', 'sample test'.
        filters: optional filters forwarded to `filter_data`.

    Returns:
        The result of `filter_data(...)`, or None when the backing dataset
        cannot be found.

    Raises:
        ValueError: if `data_type` is not one of the recognized values.
    """
    data_name = get_model_data(model_name)
    model = get_model(model_name)
    try:
        data = get_dataset(data_name, split=True, verbose=0, discrete=True)
    except LookupError:
        print("Cannot find data with name {}".format(data_name))
        return None
    if data_type in ('train', 'test'):
        x = data[data_type + '_x']
        y = data[data_type + '_y']
    elif data_type in ('sample train', 'sample test'):
        # Bug fix: the original condition was
        # `data_type == 'sample train' or 'sample test'`, which is always
        # truthy, so unknown data_types silently fell into this branch and
        # the ValueError below was unreachable.
        x, y = get_surrogate_data(model, data_type)
    else:
        raise ValueError("Unknown data_type {}".format(data_type))
    return filter_data(data['is_categorical'], x, y, filters)
def train_nn(name='nn', dataset='wine', neurons=(20,), alpha=0.01, **kwargs):
    """Train an sklearn MLP (via SKClassifier) on *dataset* and save it.

    NOTE(review): `rebalance` is neither a parameter nor a local here —
    presumably a module-level flag defined elsewhere in this file; confirm it
    exists, otherwise this function raises NameError at runtime.
    """
    from sklearn.neural_network import MLPClassifier
    data = get_dataset(dataset, split=True, discrete=False, one_hot=True)
    train_x, train_y, test_x, test_y, feature_names = \
        data['train_x'], data['train_y'], data['test_x'], data['test_y'], data['feature_names']
    if rebalance:
        print("balancing training data")
        train_x, train_y = sample_balance(train_x, train_y)
        print("#data after balancing:", len(train_y))
    one_hot_encoder, is_categorical = data['one_hot_encoder'], data['is_categorical']
    # Model name encodes dataset, base name, and the layer widths.
    model_name = '-'.join([dataset, name] + [str(neuron) for neuron in neurons])
    model = MLPClassifier(hidden_layer_sizes=neurons, max_iter=5000, alpha=alpha, **kwargs)
    nn = SKClassifier(model, name=model_name, standardize=True, one_hot_encoder=one_hot_encoder)
    nn.train(train_x, train_y)
    nn.evaluate(train_x, train_y, stage='train')
    nn.test(test_x, test_y)
    nn.save()
def train_svm(name='svm', dataset='wine', C=1.0, problem='classification', **kwargs):
    """Train an sklearn SVC (via SKClassifier) on *dataset* and save it.

    NOTE(review): `rebalance` is neither a parameter nor a local here —
    presumably a module-level flag defined elsewhere in this file; confirm it
    exists, otherwise this function raises NameError at runtime. Also note
    the `problem` parameter is accepted but never used in this body.
    """
    from sklearn.svm import SVC
    data = get_dataset(dataset, split=True, discrete=False, one_hot=True)
    train_x, train_y, test_x, test_y, feature_names = \
        data['train_x'], data['train_y'], data['test_x'], data['test_y'], data['feature_names']
    if rebalance:
        print("balancing training data")
        train_x, train_y = sample_balance(train_x, train_y)
        print("#data after balancing:", len(train_y))
    one_hot_encoder, is_categorical = data['one_hot_encoder'], data['is_categorical']
    model_name = '-'.join([dataset, name])
    # probability=True so predict_prob / AUC computations work downstream.
    model = SVC(C=C, probability=True, **kwargs)
    svm = SKClassifier(model, name=model_name, one_hot_encoder=one_hot_encoder)
    svm.train(train_x, train_y)
    svm.evaluate(train_x, train_y, stage='train')
    svm.test(test_x, test_y)
    svm.save()
def train_surrogate(model_file, is_global=True, sampling_rate=5., surrogate='rule',
                    rule_maxlen=2, min_support=0.01, eta=1, iters=50000, _lambda=30,
                    alpha=1):
    """Fit a rule-list or tree surrogate to an existing trained model.

    Loads the model from *model_file*, infers the dataset from the model
    name prefix, builds a RuleSurrogate or TreeSurrogate, fits it on
    sampled instances, describes and saves it, then tests it.

    When `is_global` is False, only a single fixed training instance
    (row 19) is used — presumably a local-explanation debugging mode;
    confirm the hard-coded index against the caller.
    """
    is_rule = surrogate == 'rule'
    model = load_model(model_file)
    # Dataset name convention: model names are '<dataset>-<rest>'.
    dataset = model.name.split('-')[0]
    # Rule surrogates need discretized, non-one-hot data; trees the opposite.
    data = get_dataset(dataset, split=True, discrete=is_rule, one_hot=is_rule)
    train_x, train_y, test_x, test_y, feature_names, is_categorical = \
        data['train_x'], data['train_y'], data['test_x'], data['test_y'], data['feature_names'], data['is_categorical']
    # print(feature_names)
    print("Original model:")
    model.test(test_x, test_y)
    print("Surrogate model:")
    model_name = surrogate + '-surrogate-' + model.name
    if surrogate == 'rule':
        surrogate_model = RuleSurrogate(name=model_name, discretizer=data['discretizer'],
                                        rule_minlen=1, rule_maxlen=rule_maxlen,
                                        min_support=min_support, _lambda=_lambda,
                                        nchain=30, eta=eta, iters=iters, alpha=alpha)
    elif surrogate == 'tree':
        surrogate_model = TreeSurrogate(name=model_name, max_depth=None, min_samples_leaf=0.01)
    else:
        raise ValueError("Unknown surrogate type {}".format(surrogate))
    # NOTE(review): argument order here differs from the other
    # train_surrogate in this file, which calls get_constraints(is_categorical, ranges).
    constraints = get_constraints(train_x, is_categorical)
    # sigmas = [0] * train_x.shape[1]
    # print(sigmas)
    if is_global:
        instances = train_x
    else:
        instances = train_x[19:20, :]
    # print('train_y:')
    # print(train_y)
    # print('target_y')
    # print(model.predict(instances))
    if isinstance(surrogate_model, RuleSurrogate):
        surrogate_model.surrogate(model, instances, constraints, sampling_rate,
                                  rediscretize=True)
    else:
        surrogate_model.surrogate(model, instances, constraints, sampling_rate)
    # surrogate_model.evaluate(train_x, train_y)
    surrogate_model.describe(feature_names=feature_names)
    surrogate_model.save()
    # surrogate_model.self_test()
    if is_global:
        surrogate_model.test(test_x, test_y)
    else:
        surrogate_model.test(train_x[19:20, :], train_y[19:20])
def train_tree(name='tree', dataset='wine', max_depth=None, min_samples_leaf=0.005, **kwargs):
    """Train the project `Tree` model on *dataset*, export it to JSON, save it.

    NOTE(review): `rebalance` is neither a parameter nor a local here —
    presumably a module-level flag defined elsewhere in this file; confirm it
    exists, otherwise this function raises NameError at runtime.
    """
    data = get_dataset(dataset, split=True, discrete=False, one_hot=True)
    train_x, train_y, test_x, test_y, feature_names, one_hot_encoder = \
        data['train_x'], data['train_y'], data['test_x'], data['test_y'], data['feature_names'], data['one_hot_encoder']
    if rebalance:
        print("balancing training data")
        train_x, train_y = sample_balance(train_x, train_y)
        print("#data after balancing:", len(train_y))
    model_name = '-'.join([dataset, name])
    tree = Tree(name=model_name, max_depth=max_depth, min_samples_leaf=min_samples_leaf,
                one_hot_encoder=one_hot_encoder, **kwargs)
    tree.train(train_x, train_y)
    tree.evaluate(train_x, train_y, stage='train')
    tree.test(test_x, test_y)
    tree.describe()
    # Export a JSON description alongside the pickled model.
    tree.export(get_path('models', '{}.json'.format(model_name)))
    tree.save()
def train_nn(name='sample', dataset='pima', neurons=(20,), alpha=0.01, sample=True, **kwargs):
    """Optionally re-sample the pima training split, then fit an sklearn MLP.

    Returns (accuracy, loss, auc) from the wrapped classifier's test run.
    """
    from sklearn.neural_network import MLPClassifier, MLPRegressor
    data = get_dataset(dataset, split=True, discrete=False, one_hot=True)
    train_x, train_y, test_x, test_y, feature_names = \
        data['train_x'], data['train_y'], data['test_x'], data['test_y'], data['feature_names']
    if sample:
        train_x, train_y = do_re_sample(train_x, train_y, feature_names)

    one_hot_encoder, is_categorical = data['one_hot_encoder'], data['is_categorical']
    model_name = '-'.join([dataset, 'nn'] + [str(width) for width in neurons] + [name])
    classifier = MLPClassifier(hidden_layer_sizes=neurons, max_iter=5000,
                               alpha=alpha, **kwargs)
    nn = SKClassifier(classifier, name=model_name, standardize=True,
                      one_hot_encoder=one_hot_encoder)
    nn.train(train_x, train_y)
    nn.evaluate(train_x, train_y, stage='train')
    acc, loss, auc = nn.test(test_x, test_y)
    nn.save()
    return acc, loss, auc
def train_rule(name='rule', dataset='breast_cancer', rule_max_len=2, **kwargs):
    """Train a Bayesian rule list on a discretized dataset and save it.

    NOTE(review): `rebalance` is neither a parameter nor a local here —
    presumably a module-level flag defined elsewhere in this file; confirm it
    exists, otherwise this function raises NameError at runtime.
    """
    data = get_dataset(dataset, split=True, discrete=True)
    train_x, train_y, test_x, test_y, feature_names = \
        data['train_x'], data['train_y'], data['test_x'], data['test_y'], data['feature_names']
    from iml.models.rule_model import RuleList
    if rebalance:
        print("balancing training data")
        train_x, train_y = sample_balance(train_x, train_y)
        print("#data after balancing:", len(train_y))
    # print(train_x.shape, train_x.dtype)
    discretizer = data['discretizer']
    model_name = '-'.join([dataset, name])
    brl = RuleList(name=model_name, rule_maxlen=rule_max_len, discretizer=discretizer, **kwargs)
    brl.train(train_x, train_y)
    brl.evaluate(train_x, train_y, stage='train')
    # print(brl.infer(test_x))
    brl.test(test_x, test_y)
    brl.describe(feature_names=feature_names)
    brl.save()
def model_meta(model_name):
    """Return metadata for the dataset behind *model_name*.

    The dict holds feature/label names, categorical flags, categories,
    value ranges and serialized discretizers; returns None when the
    dataset cannot be found.
    """
    data_name = get_model_data(model_name)
    try:
        data = get_dataset(data_name, split=True, verbose=0, discrete=True)
    except LookupError:
        print("Cannot find data with name {}".format(data_name))
        return None
    # 'ranges' and 'categories' are optional keys; fall back to None.
    return {
        'featureNames': data['feature_names'],
        'labelNames': data['target_names'],
        'isCategorical': data['is_categorical'],
        'categories': data['categories'] if 'categories' in data else None,
        'ranges': data['ranges'] if 'ranges' in data else None,
        'discretizers': discretizer2json(data['discretizer']),
    }
def train_surrogate(model_file, sampling_rate=5., sample=True, rule_maxlen=2,
                    min_support=0.01, eta=1, iters=50000, _lambda=30):
    """Fit a global rule-list surrogate to an existing trained model.

    Loads the model from *model_file*, infers the dataset from the model
    name prefix, optionally re-samples the training data, fits a
    RuleSurrogate over all training instances, then describes, saves,
    and tests it.
    """
    model = load_model(model_file)
    # Dataset name convention: model names are '<dataset>-<rest>'.
    dataset = model.name.split('-')[0]
    data = get_dataset(dataset, split=True, discrete=True, one_hot=False)
    train_x, train_y, test_x, test_y, feature_names, is_categorical = \
        data['train_x'], data['train_y'], data['test_x'], data['test_y'], data['feature_names'], data['is_categorical']
    ranges = data['ranges']
    if sample:
        train_x, train_y = do_re_sample(train_x, train_y, feature_names)
    # print(feature_names)
    print("Original model:")
    model.test(test_x, test_y)
    print("Surrogate model:")
    model_name = 'rule-surrogate-' + model.name
    surrogate_model = RuleSurrogate(name=model_name, discretizer=data['discretizer'],
                                    rule_minlen=1, rule_maxlen=rule_maxlen,
                                    min_support=min_support, _lambda=_lambda,
                                    nchain=30, eta=eta, iters=iters)
    # NOTE(review): argument order here differs from the other
    # train_surrogate in this file, which calls get_constraints(train_x, is_categorical).
    constraints = get_constraints(is_categorical, ranges)
    # sigmas = [0] * train_x.shape[1]
    # print(sigmas)
    instances = train_x
    # print('train_y:')
    # print(train_y)
    # print('target_y')
    # print(model.predict(instances))
    if isinstance(surrogate_model, RuleSurrogate):
        surrogate_model.surrogate(model, instances, constraints, sampling_rate,
                                  rediscretize=True)
    else:
        surrogate_model.surrogate(model, instances, constraints, sampling_rate)
    # surrogate_model.evaluate(train_x, train_y)
    surrogate_model.describe(feature_names=feature_names)
    surrogate_model.save()
    # surrogate_model.self_test()
    surrogate_model.test(test_x, test_y)