def spnClassificationSPNFit(X, Y, alpha=0.001, min_slices=80):
    # Train one Gaussian SPN per class; mixture weights are optimized with cvxpy below.
    classes = numpy.unique(Y)
    spns = []
    trainll = numpy.zeros((X.shape[0], classes.shape[0]))
    ws = []
    for j in range(classes.shape[0]):
        idx = Y == classes[j]
        ws.append(float(numpy.sum(idx)) / X.shape[0])
        data_train_class = X[idx, :]
        spn = LearnSPN(cache=memory,
                       alpha=alpha,
                       min_instances_slice=min_slices,
                       cluster_prep_method=None,
                       families="gaussian").fit_structure(data_train_class)
        spns.append(spn)
        trainll[idx, j] = spn.eval(data_train_class, individual=True)

    # Maximize the training log-likelihood of the class mixture over the weights (cvxpy).
    x = Variable(len(classes))
    constraints = [sum_entries(x) == 1, x > 0]
    A = numpy.exp(trainll)
    objective = Maximize(sum_entries(log(A * x)))
    prob = Problem(objective, constraints)
    prob.solve()
    # print("Optimal value", prob.solve())
    # ws = sum(x.value.tolist(), [])
    print(ws)
    return {'classes': classes, 'spns': spns, 'weights': ws}
def fit(self, X, y):
    # Check that X and y have correct shape
    X, y = check_X_y(X, y)
    # Store the classes seen during fit
    self.classes_ = unique_labels(y)
    print(y.shape, numpy.unique(y))
    print(self.classes_)
    # 0/0
    self.X_ = X
    self.y_ = y
    # Return the classifier
    # classes = numpy.unique(Y)
    self.spns_ = []
    self.ws_ = []
    trainll = numpy.zeros((X.shape[0], self.classes_.shape[0]))
    for j in range(self.classes_.shape[0]):
        idx = y == self.classes_[j]
        # self.ws_.append(float(numpy.sum(idx)) / X.shape[0])
        data_train_class = X[idx, :]
        spn = LearnSPN(alpha=self.alpha,
                       min_instances_slice=self.min_instances_slice,
                       cluster_prep_method="sqrt",
                       families=self.families,
                       cache=memory).fit_structure(data_train_class)
        self.spns_.append(spn)
        trainll[idx, j] = spn.eval(data_train_class, individual=True)
    # self.ws_ = self.ws_ / numpy.sum(self.ws_)

    x = Variable(self.classes_.shape[0])
    constraints = [sum_entries(x) == 1, x > 0]
    A = numpy.exp(trainll)
    objective = Maximize(sum_entries(log(A * x)))
    prob = Problem(objective, constraints)
    prob.solve()
    self.ws_ = sum(x.value.tolist(), [])
    # print("Optimal w", self.ws_)
    return self
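# A minimal usage sketch for the sklearn-style estimator above. X_train / y_train are
# placeholders; the constructor arguments alpha and min_instances_slice are assumed to
# exist because the evaluation script further below instantiates SPNClassifier with them.
clf = SPNClassifier(alpha=0.001, min_instances_slice=50)
clf = clf.fit(X_train, y_train)
print(clf.classes_)   # classes seen during fit
print(clf.ws_)        # optimized mixture weights, one per class
print(len(clf.spns_))  # one SPN per class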
def pspnperplexity(train, test, min_slices, ind_test_method, row_cluster_method):
    c1 = Chrono().start()
    spn = LearnSPN(alpha=0.001,
                   min_instances_slice=min_slices,
                   cluster_prep_method="sqrt",
                   ind_test_method=ind_test_method,
                   row_cluster_method=row_cluster_method).fit_structure(train)
    c1.end()
    time = c1.elapsed()
    pwb, perplexity, words, logl = spn.perplexity(test)
    print("SPN ll=%s %.3f per-word bound, %.1f perplexity estimate based on a held-out corpus of %i documents with %i words"
          % (logl, pwb, perplexity, test.shape[0], words))
    return perplexity, logl, time, spn.size()
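# A minimal usage sketch: synthetic document-term counts with an 80/20 split. The split,
# the Poisson counts, and the parameter choices are illustrative assumptions, not values
# from the original experiments; "subsample" and "KMeans" are methods used elsewhere in
# this codebase.
counts = numpy.random.poisson(1.0, size=(1000, 50))
train_counts, test_counts = counts[:800, :], counts[800:, :]
perplexity, logl, elapsed, model_size = pspnperplexity(train_counts, test_counts,
                                                       min_slices=100,
                                                       ind_test_method="subsample",
                                                       row_cluster_method="KMeans")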
def spnClassificationGeneralFit(X, Y, maxClasses, alpha=0.001, min_slices=500):
    # need to convert Y into one-hot encoding as there is no multinomial till now
    # Y = getOneHotEncoding(Y, maxClasses)
    print('X shape : ', X.shape)
    print('Y shape : ', Y.shape)
    families = ['gaussian'] * X.shape[1] + ['binomial'] * Y.shape[1]
    data_train_class = numpy.c_[X, Y]
    spn = LearnSPN(cache=memory,
                   row_cluster_method="RandomPartition",
                   ind_test_method="subsample",
                   alpha=alpha,
                   min_features_slice=30,
                   min_instances_slice=min_slices,
                   cluster_prep_method=None,
                   families=families).fit_structure(data_train_class)
    return spn
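# The commented-out call above refers to a getOneHotEncoding helper that is not shown
# here. A minimal sketch of what such a helper could look like (hypothetical, not the
# original implementation): map integer labels 0..maxClasses-1 to one-hot rows so each
# label column can be modeled with a binomial leaf.
def getOneHotEncoding(Y, maxClasses):
    onehot = numpy.zeros((Y.shape[0], maxClasses))
    onehot[numpy.arange(Y.shape[0]), Y.astype(int)] = 1
    return onehot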
def spnlearn(data, alpha, min_slices=30, cluster_prep_method=None):
    numpy_rand_gen = numpy.random.RandomState(1337)
    print("learnspn")
    spn = LearnSPN(min_instances_slice=min_slices,
                   row_cluster_method="KMeans",
                   n_cluster_splits=2,
                   # g_factor=5 * 10.0 ** -17,
                   # g_factor=0.5,
                   alpha=alpha,
                   n_iters=2000,
                   n_restarts=4,
                   rand_gen=numpy_rand_gen,
                   cluster_prep_method=cluster_prep_method).fit_structure(data=data)
    return spn
def spnClassificationNBFit(X, Y, alpha=0.001, min_slices=80):
    classes = numpy.unique(Y)
    spns = []
    # trainll = numpy.zeros((X.shape[0], classes.shape[0]))
    ws = []
    for j in range(classes.shape[0]):
        idx = Y == classes[j]
        ws.append(float(numpy.sum(idx)) / X.shape[0])
        data_train_class = X[idx, :]
        spn = LearnSPN(cache=memory,
                       alpha=alpha,
                       min_instances_slice=min_slices,
                       cluster_prep_method=None,
                       families="gaussian").fit_structure(data_train_class)
        spns.append(spn)
        # trainll[idx, j] = spn.eval(data_train_class, individual=True)
    return {'classes': classes, 'spns': spns, 'weights': ws}
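# A possible prediction routine for the model dictionary returned above (a sketch with a
# hypothetical name, not part of the original code): pick the class maximizing
# log p(x | c) + log w_c, assuming spn.eval(X, individual=True) returns per-row
# log-likelihoods as it is used elsewhere in this file.
def spnClassificationNBPredict(model, X):
    logjoint = numpy.zeros((X.shape[0], len(model['classes'])))
    for j, spn in enumerate(model['spns']):
        logjoint[:, j] = numpy.asarray(spn.eval(X, individual=True)).reshape(-1) \
            + numpy.log(model['weights'][j])
    return model['classes'][numpy.argmax(logjoint, axis=1)]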
    return pdn.getLogLikelihood(test)


for dsname, data, featureNames in [datasets.getCommunitiesAndCrimes()]:
    # for dsname, data, featureNames in [datasets.getNips(), datasets.getSynthetic(), datasets.getMSNBCclicks(), datasets.getCommunitiesAndCrimes()]:
    printlocal(dsname)
    printlocal(featureNames)
    printlocal(len(featureNames))
    printlocal(data.shape)
    stats = Stats(name=dsname)
    for train, test, i in kfolded(data, 5):
        spn = LearnSPN(alpha=0.001,
                       min_instances_slice=80,
                       cluster_prep_method="sqrt",
                       cache=memory).fit_structure(train)
        printlocal("done")
        stats.addConfig("PSPN", spn.config)
        # stats.add("SPN Pois", Stats.LOG_LIKELIHOOD, llspn(spn, test))
        printlocal("LL")
        stats.add("PSPN", Stats.MODEL_SIZE, spn.size())
        printlocal("model size")
        prediction = spnComputeLambdas(spn, test)
        printlocal("model spnComputeLambdas")
        # prediction2 = spnComputeLambdasCuda(spn, test)
        prediction2 = spnComputeLambdas2(spn, test)
        printlocal("model spnComputeLambdas2")
        stats.add("PSPN", Stats.ABS_ERROR, abs_error(test, prediction))
        stats.add("PSPN", Stats.SQUARED_ERROR, squared_error(test, prediction))
out_log.write(preamble)
out_log.flush()

#
# looping over all parameters combinations
for g_factor in g_factors:
    for cluster_penalty in cluster_penalties:
        for min_inst_slice in min_inst_slices:
            #
            # Creating the structure learner
            learner = LearnSPN(g_factor=g_factor,
                               min_instances_slice=min_inst_slice,
                               # alpha=alpha,
                               row_cluster_method=args.cluster_method,
                               cluster_penalty=cluster_penalty,
                               n_cluster_splits=args.n_row_clusters,
                               n_iters=args.n_iters,
                               n_restarts=args.n_restarts,
                               sklearn_args=sklearn_args,
                               cltree_leaves=cltree_leaves,
                               rand_gen=numpy_rand_gen)

            learn_start_t = perf_counter()
            #
            # build an spn on the training set
            spn = learner.fit_structure(data=train, feature_sizes=features)
            # spn = learner.fit_structure_bagging(data=train,
            #                                     feature_sizes=features,
            #                                     n_components=10)
out_log.write(preamble)
out_log.flush()

#
# looping over all parameters combinations
for g_factor in g_factors:
    for cluster_penalty in cluster_penalties:
        for min_inst_slice in min_inst_slices:
            #
            # Creating the structure learner
            learner = LearnSPN(g_factor=g_factor,
                               min_instances_slice=min_inst_slice,
                               # alpha=alpha,
                               row_cluster_method=args.cluster_method,
                               cluster_penalty=cluster_penalty,
                               n_cluster_splits=args.n_row_clusters,
                               n_iters=args.n_iters,
                               n_restarts=args.n_restarts,
                               sklearn_args=sklearn_args,
                               cltree_leaves=cltree_leaves,
                               rand_gen=numpy_rand_gen)

            learn_start_t = perf_counter()
            #
            # build an spn on the training set (bagged variant)
            # spn = learner.fit_structure(data=train,
            #                             feature_sizes=features)
            spn = learner.fit_structure_bagging(data=train,
                                                feature_sizes=features,
                                                n_components=10)
fold_models = []
fold_params = defaultdict(dict)

for i, (train, valid, test) in enumerate(fold_splits):
    #
    # fixing the seed
    rand_gen = numpy.random.RandomState(seed)

    #
    # Creating the structure learner
    learner = LearnSPN(g_factor=g_factor,
                       min_instances_slice=min_inst_slice,
                       alpha=alphas[0],
                       row_cluster_method=args.cluster_method,
                       cluster_penalty=cluster_penalty,
                       n_cluster_splits=args.n_row_clusters,
                       n_iters=args.n_iters,
                       n_restarts=args.n_restarts,
                       sklearn_args=sklearn_args,
                       cltree_leaves=cltree_leaves,
                       kde_leaves=kde_leaves,
                       rand_gen=rand_gen)

    learn_start_t = perf_counter()
    #
    # build an spn on the training set
    spn = learner.fit_structure(data=train, feature_sizes=features)
    learn_end_t = perf_counter()
    l_time = learn_end_t - learn_start_t
    logging.info('Structure learned in {} secs'.format(l_time))
    fold_models.append(spn)
import json
import time
from glob import glob

from natsort.natsort import natsorted
from natsort.ns_enum import ns
import numpy

from algo.learnspn import LearnSPN

result = json.load(open('gnspnoutfile4.json'))
oldres = json.dumps(result)

for fname in natsorted(glob("datasets/simdata*.csv"), alg=ns.IGNORECASE):
    print(fname)
    name = "%s_%s" % (fname.split("_")[1], fname.split("_")[2])
    idx = int(fname.split("_")[3]) - 1
    data = numpy.loadtxt(fname, dtype=float, delimiter=",", skiprows=1)
    for alpha in ["0.001", "0.0001", "0.00001"]:
        t0 = time.time()
        spn = LearnSPN(alpha=float(alpha),
                       min_instances_slice=data.shape[0] - 1,
                       cluster_first=False).fit_structure(data)
        ptime = (time.time() - t0)
        result[name]["glmptest_%s" % (alpha)][idx][0] = ptime
        # print(spn.to_text(list(map(lambda x: "V" + str(x), range(2, 200000)))))

print(oldres)
print(json.dumps(result))

with open('gnspnoutfile4_withtime.txt', 'w') as outfile:
    json.dump(result, outfile)
def evalspnComplete(labels, data, dsname, writer, alpha, min_instances_slice=50):
    cvfolds = StratifiedKFold(labels, n_folds=10, random_state=123)
    classes = list(set(labels))
    evalresults = OrderedDict()
    for train_index, test_index in cvfolds:
        train_data = data[train_index, ]
        train_labels = labels[train_index]
        test_data = data[test_index, ]
        test_labels = labels[test_index]

        # clfsvc = GridSearchCV(estimator=svm.SVC(kernel='linear', probability=True),
        #                       param_grid=dict(C=numpy.logspace(-10, 0, 10)), n_jobs=50, cv=5)
        clfsvc = GridSearchCV(estimator=svm.SVC(kernel='linear', probability=True),
                              param_grid={'C': [10 ** 3, 10 ** 2, 10 ** 1, 10 ** 0,
                                                10 ** -1, 10 ** -2, 10 ** -3]},
                              n_jobs=50, cv=5)
        start = time.time()
        evalModel(clfsvc, test_data, test_labels, train_data, train_labels, "SVM raw", evalresults)
        evalresults.setdefault("SVM time in secs \t\t", []).append((time.time() - start))

        clspn = SPNClassifier(alpha=alpha, min_instances_slice=min_instances_slice)
        start = time.time()
        evalModel(clspn, test_data, test_labels, train_data, train_labels, "SPN NB raw", evalresults)
        evalresults.setdefault("SPN time in secs \t\t", []).append((time.time() - start))

        # clflr = LogisticRegression(solver='lbfgs')
        # start = time.time()
        # evalModel(clflr, test_data, test_labels, train_data, train_labels, "LR NB raw", evalresults)
        # evalresults.setdefault("SPN time in secs \t\t", []).append((time.time() - start))

        continue

        # NOTE: everything below is skipped by the continue above.
        evals_train = numpy.zeros((train_data.shape[0], 0))
        evals_test = numpy.zeros((test_data.shape[0], 0))
        grads_train = numpy.zeros((train_data.shape[0], 0))
        grads_test = numpy.zeros((test_data.shape[0], 0))
        activations_train = numpy.zeros((train_data.shape[0], 0))
        activations_test = numpy.zeros((test_data.shape[0], 0))
        # model = ClassificationNBFit(train_data, train_labels)
        timespn = 0
        for c in classes:
            # break
            idx = train_labels == c
            print(idx)
            data_train_class = train_data[idx, :]
            start = time.time()
            spn = LearnSPN(alpha=alpha,
                           min_instances_slice=min_instances_slice,
                           cluster_prep_method="sqrt",
                           cache=memory).fit_structure(data_train_class)
            print(alpha, min_instances_slice)
            # spn = spnlearn(data_train_class, alpha, min_slices=min_slices, cluster_prep_method="sqrt", family="poisson")
            timespn += (time.time() - start)
            # continue
            evalperclass = numpy.asarray(spn.eval(train_data, individual=True)).reshape((train_data.shape[0], 1))
            print(evalperclass.shape)
            print(evalperclass)
            gradsperclass = spn.gradients(train_data)
            activationperclass = spn.activations(train_data)
            print(evals_train.shape)
            evals_train = numpy.append(evals_train, evalperclass, axis=1)
            print(evals_train)
            grads_train = numpy.hstack((grads_train, gradsperclass))
            activations_train = numpy.hstack((activations_train, activationperclass))
            evals_test = numpy.hstack((evals_test,
                                       numpy.asarray(spn.eval(test_data, individual=True)).reshape((test_data.shape[0], 1))))
            grads_test = numpy.hstack((grads_test, spn.gradients(test_data)))
            activations_test = numpy.hstack((activations_test, spn.activations(test_data)))
        print("loop done")
        evalresults.setdefault("SPN time in secs \t\t", []).append(timespn)
        evalModel(clflr, evals_test, test_labels, evals_train, train_labels, "SPN per class ll -> LR", evalresults)
        # evalModel(clfsvc, grads_test, test_labels, grads_train, train_labels, "SPN per class gradients -> SVM", evalresults)
        # evalModel(clfsvc, activations_test, test_labels, activations_train, train_labels, "SPN per class activations -> SVM", evalresults)
    writer.write(json.dumps(evalresults))
    writer.write("\n")
    for key, value in evalresults.items():
        writer.write("%s: %0.6f (+/- %0.6f) \n" % (key, mean(value), stdev(value) * 2))
    writer.write("\n")
def learn_model(self, cltree_leaves, args, comp, bgg):
    # set parameters for learning AC (cltree_leaves=True) and AL (cltree_leaves=False)
    print('-------MODELS CONSTRUCTION-----------')
    verbose = 1
    n_row_clusters = 2
    cluster_method = 'GMM'
    seed = 1337
    n_iters = 100
    n_restarts = 4
    cluster_penalties = [1.0]
    sklearn_Args = None

    if not args:
        g_factors = [5, 10, 15]
        min_inst_slices = [10, 50, 100]
        alphas = [0.1, 0.5, 1.0, 2.0]
    else:
        g_factors = [args[0]]
        min_inst_slices = [args[1]]
        alphas = [args[2]]

    # setting verbosity level
    if verbose == 1:
        logging.basicConfig(level=logging.INFO)
    elif verbose == 2:
        logging.basicConfig(level=logging.DEBUG)
    # logging.info("Starting with arguments:\n")

    # parse optional sklearn arguments of the form "[k1=v1,k2=v2]"
    if sklearn_Args is not None:
        sklearn_key_value_pairs = sklearn_Args.translate({
            ord('['): '',
            ord(']'): ''
        }).split(',')
        sklearn_args = {
            key.strip(): value.strip()
            for key, value in
            [pair.strip().split('=') for pair in sklearn_key_value_pairs]
        }
    else:
        sklearn_args = {}
    # logging.info(sklearn_args)

    # initing the random generators
    MAX_RAND_SEED = 99999999  # sys.maxsize
    rand_gen = random.Random(seed)
    numpy_rand_gen = numpy.random.RandomState(seed)

    #
    # elaborating the dataset
    #
    dataset_name = self.dataset
    # logging.info('Loading datasets: %s', dataset_name)
    train = self.train
    n_instances = train.shape[0]

    #
    # estimating the frequencies for the features
    # logging.info('')
    freqs, features = dataset.data_2_freqs(train)

    best_train_avg_ll = NEG_INF
    best_state = {}
    best_test_lls = None
    index = 0
    spns = []

    for g_factor in g_factors:
        for cluster_penalty in cluster_penalties:
            for min_inst_slice in min_inst_slices:
                print('model')
                # Creating the structure learner
                learner = LearnSPN(
                    g_factor=g_factor,
                    min_instances_slice=min_inst_slice,
                    # alpha=alpha,
                    row_cluster_method=cluster_method,
                    cluster_penalty=cluster_penalty,
                    n_cluster_splits=n_row_clusters,
                    n_iters=n_iters,
                    n_restarts=n_restarts,
                    sklearn_args=sklearn_args,
                    cltree_leaves=cltree_leaves,
                    rand_gen=numpy_rand_gen)

                learn_start_t = perf_counter()
                # build an spn on the training set
                if bgg:
                    spn = learner.fit_structure_bagging(data=train,
                                                        feature_sizes=features,
                                                        n_components=comp)
                else:
                    spn = learner.fit_structure(data=train,
                                                feature_sizes=features)
                learn_end_t = perf_counter()

                n_edges = spn.n_edges()
                n_levels = spn.n_layers()
                n_weights = spn.n_weights()
                n_leaves = spn.n_leaves()

                #
                # smoothing can be done after the spn has been built
                for alpha in alphas:
                    # logging.info('Smoothing leaves with alpha = %f', alpha)
                    spn.smooth_leaves(alpha)
                    spns.append(spn)

                    # Compute LL on training set
                    # logging.info('Evaluating on training set')
                    train_ll = 0.0
                    for instance in train:
                        (pred_ll, ) = spn.eval(instance)
                        train_ll += pred_ll
                    train_avg_ll = train_ll / train.shape[0]

                    # updating best stats according to train ll
                    if train_avg_ll > best_train_avg_ll:
                        best_train_avg_ll = train_avg_ll
                        best_state['alpha'] = alpha
                        best_state['min_inst_slice'] = min_inst_slice
                        best_state['g_factor'] = g_factor
                        best_state['cluster_penalty'] = cluster_penalty
                        best_state['train_ll'] = train_avg_ll
                        best_state['index'] = index
                        best_state['name'] = self.dataset

                    # writing to file a line for the grid
                    # stats = stats_format([g_factor,
                    #                       cluster_penalty,
                    #                       min_inst_slice,
                    #                       alpha,
                    #                       n_edges, n_levels,
                    #                       n_weights, n_leaves,
                    #                       train_avg_ll],
                    #                      '\t',
                    #                      digits=5)
                    index = index + 1

    best_spn = spns[best_state['index']]
    # logging.info('Grid search ended.')
    # logging.info('Best params:\n\t%s', best_state)
    return best_spn, best_state['g_factor'], \
        best_state['min_inst_slice'], best_state['alpha']
# fixing the seed
rand_gen = numpy.random.RandomState(seed)

stats_dict = {}

#
# Creating the structure learner
learner = LearnSPN(
    g_factor=g_factor,
    min_instances_slice=min_inst_slice,
    alpha=alpha,
    row_cluster_method=args.cluster_method,
    cluster_penalty=cluster_penalty,
    n_cluster_splits=args.n_row_clusters,
    n_iters=args.n_iters,
    n_restarts=args.n_restarts,
    sklearn_args=sklearn_args,
    cltree_leaves=cltree_leaves,
    kde_leaves=kde_leaves,
    rand_gen=rand_gen,
    features_split_method=args.features_split_method,
    entropy_threshold=entropy_threshold,
    adaptive_entropy=adaptive_entropy,
    percentage_rand_features=percentage_rand_features,
    percentage_instances=percentage_instances)

learn_start_t = perf_counter()
#
# build an spn on the training set
spn = learner.fit_structure(data=train,
                            feature_sizes=features,
                            learn_stats=stats_dict)
best_state_mix = {}

#
# looping over all parameters combinations
for g_factor in g_factors:
    for cluster_penalty in cluster_penalties:
        for min_inst_slice in min_inst_slices:
            #
            # Creating the structure learner
            learner = LearnSPN(
                g_factor=g_factor,
                min_instances_slice=min_inst_slice,
                # alpha=alpha,
                row_cluster_method=args.cluster_method,
                cluster_penalty=cluster_penalty,
                n_cluster_splits=args.n_row_clusters,
                n_iters=args.n_iters,
                n_restarts=args.n_restarts,
                sklearn_args=sklearn_args,
                cltree_leaves=cltree_leaves,
                rand_gen=numpy_rand_gen)

            #
            # learning a mixture
            spns = \
                learner.fit_mixture_bootstrap(train,
                                              n_mix_components=n_mix,
                                              bootstrap_samples_ids=bootstrap_ids,
                                              feature_sizes=features,
                                              perc=perc,
from joblib.memory import Memory
import numpy

from algo.learnspn import LearnSPN

if __name__ == '__main__':
    memory = Memory(cachedir="/tmp", verbose=0, compress=9)

    # data = numpy.loadtxt("data/breast_cancer/wdbc.data", delimiter=",")
    # data = data[:, 1:]

    features_data = numpy.loadtxt("data/food/train/_preLogits.csv", delimiter=",")
    labels_data = numpy.loadtxt("data/food/train/_groundtruth.csv", delimiter=",").astype(int)
    data = numpy.c_[features_data, labels_data]
    print(data.shape)
    print(data[1, :])

    fams = ["gaussian"] * features_data.shape[1] + ["binomial"] * labels_data.shape[1]

    spn = LearnSPN(cache=memory,
                   alpha=0.001,
                   min_instances_slice=200,
                   cluster_prep_method=None,
                   families=fams).fit_structure(data)

    print(spn.to_tensorflow(["V" + str(i) for i in range(data.shape[1])], data))
def learn(denseCorpus):
    return LearnSPN(alpha=0.001,
                    min_instances_slice=100,
                    cluster_prep_method="sqrt",
                    ind_test_method="subsample",
                    sub_sample_rows=2000).fit_structure(denseCorpus)
def plotJointProb(filename, data, datarange):
    print(filename)
    print(data.shape)
    spn = LearnSPN(alpha=0.001, min_instances_slice=30, cache=memory).fit_structure(data)

    matplotlib.rcParams.update({'font.size': 16})
    pcm = cm.Blues
    f1 = 0
    f2 = 1
    x = data[:, f1]
    y = data[:, f2]
    amin, amax = datarange[0], datarange[1]
    bins = numpy.asarray(list(range(amin, amax)))

    def getPxy(spn, f1, f2, xbins, ybins):
        import locale
        locale.setlocale(locale.LC_NUMERIC, 'C')
        Pxy = spn.getJointDist(f1, f2)
        jointDensity = numpy.zeros((max(xbins) + 1, max(ybins) + 1))
        for x in xbins:
            for y in ybins:
                jointDensity[x, y] = Pxy(x, y)
        return jointDensity

    plt.clf()
    fig = plt.figure(figsize=(7, 7))

    # [left, bottom, width, height]
    xyHist = plt.axes([0.3, 0.3, 0.5, 0.5])
    cax = xyHist.imshow(getPxy(spn, f1, f2, bins, bins),
                        extent=[amin, amax, amin, amax],
                        interpolation='nearest',
                        origin='lower',
                        cmap=pcm)
    xyHist.set_xlim(amin, amax)
    xyHist.set_ylim(amin, amax)
    if amax > 20:
        xyHist.xaxis.set_major_locator(plticker.MultipleLocator(base=10))
        xyHist.yaxis.set_major_locator(plticker.MultipleLocator(base=10))
    xyHist.yaxis.grid(True, which='major', linestyle='-', color='darkgray')
    xyHist.xaxis.grid(True, which='major', linestyle='-', color='darkgray')

    xyHistcolor = plt.axes([0.82, 0.3, 0.03, 0.5])
    plt.colorbar(cax, cax=xyHistcolor)

    font = fm.FontProperties(size=32)
    # cax.yaxis.get_label().set_fontproperties(font)
    # cax.xaxis.get_label().set_fontproperties(font)

    xHist = plt.axes([0.05, 0.3, 0.15, 0.5])
    xHist.xaxis.set_major_formatter(NullFormatter())  # probs
    xHist.yaxis.set_major_formatter(NullFormatter())  # counts
    xHist.hist(x, bins=bins, orientation='horizontal', color='darkblue')
    xHist.invert_xaxis()
    xHist.set_ylim(amin, amax)

    yHist = plt.axes([0.3, 0.05, 0.5, 0.15])
    yHist.yaxis.set_major_formatter(NullFormatter())  # probs
    yHist.xaxis.set_major_formatter(NullFormatter())  # counts
    yHist.hist(y, bins=bins, color='darkblue')
    yHist.invert_yaxis()
    yHist.set_xlim(amin, amax)

    for elem in [xyHist, xHist, yHist]:
        elem.yaxis.grid(True, which='major', linestyle='-', color='darkgray')
        elem.xaxis.grid(True, which='major', linestyle='-', color='darkgray')

    plt.savefig(os.path.dirname(os.path.abspath(__file__)) + "/" + filename,
                bbox_inches='tight', dpi=600)
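# A minimal usage sketch for the plotting helper above. The synthetic Poisson counts,
# the output file name, and the value range are illustrative assumptions, not inputs
# from the original experiments.
example_counts = numpy.random.poisson(5, size=(500, 2))
plotJointProb("joint_example.png", example_counts, (0, 20))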
    adj = numpy.zeros((nF, nF))
    for i in range(nF):
        for j in range(i + 1, nF):
            print(i, j)
            adj[i, j] = adj[j, i] = spn.computeDistance(words[i], words[j], words, True)
    return adj


dsname, data, words = getNips()

spn = LearnSPN(alpha=0.001,
               min_instances_slice=100,
               cluster_prep_method="sqrt",
               cache=memory).fit_structure(data)

adjc = getMIAdjc(spn, words)
# adjc = getDistAdjc(spn, words)
adjc = numpy.log(adjc)
print(adjc)
print(numpy.any(adjc > 0.8))


def show_graph_with_labels(fname, adjacency_matrix, mylabels):
    def make_label_dict(labels):
        l = {}