def test1(data, features):
    data = data[:, 1:20]
    features = features[0:data.shape[1]]
    arcs, mixt = getArchetypes(data, 3)

    nrfolds = 10

    # NOTE: `dsname` is not a parameter here; it is assumed to be set at
    # module level before this function runs.
    stats = Stats(name=dsname)

    for train, test, i in kfolded(mixt, nrfolds):
        c = Chrono().start()
        spn = SPN.LearnStructure(train, featureTypes=["continuous"] * train.shape[1],
                                 row_split_method=Splitting.KmeansRows(),
                                 col_split_method=Splitting.RDCTest(threshold=0.3),
                                 # spn = SPN.LearnStructure(data, featureNames=["X1"], domains=domains,
                                 #                          families=families,
                                 #                          row_split_method=Splitting.KmeansRows(),
                                 #                          col_split_method=Splitting.RDCTest(),
                                 min_instances_slice=100)
        c.end()

        spn.root.validate()
        ll = numpy.mean(spn.root.eval(test))
        print(ll)

        stats.add("HSPN", Stats.LOG_LIKELIHOOD, ll)
        stats.add("HSPN", Stats.TIME, c.elapsed())

    stats.save("stats_" + dsname + ".json")

    print(arcs)
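# A minimal sketch of the (train, test, fold_index) triples that `kfolded`
# appears to yield above, written with sklearn as a stand-in. This is an
# illustrative assumption for readers, not the project's own implementation.
def kfolded_sketch(X, nrfolds):
    from sklearn.model_selection import KFold
    for i, (train_idx, test_idx) in enumerate(KFold(n_splits=nrfolds).split(X)):
        yield X[train_idx], X[test_idx], i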
def rbo(l1, l2, p=0.98):
    """
    Computes the Rank-Biased Overlap (RBO) score of two ranked lists.

    l1 -- ranked list
    l2 -- ranked list
    p  -- persistence parameter: higher values give more weight to deeper ranks
    """
    if not l1:
        l1 = []
    if not l2:
        l2 = []

    sl, ll = sorted([(len(l1), l1), (len(l2), l2)])
    s, S = sl  # length and contents of the shorter list
    l, L = ll  # length and contents of the longer list
    if s == 0:
        return 0

    # Calculate the overlaps at ranks 1 through l
    # (the length of the longer of the two lists)
    ss = set([])  # elements seen in the shorter list up to depth i
    ls = set([])  # elements seen in the longer list up to depth i
    x_d = {0: 0}
    sum1 = 0.0
    for i in range(l):
        x = L[i]
        y = S[i] if i < s else None
        d = i + 1

        # if the two elements are identical, we don't need to add
        # them to either set
        if x == y:
            x_d[d] = x_d[d - 1] + 1.0
        # otherwise add the items to their respective sets
        # and compute the overlap
        else:
            ls.add(x)
            if y is not None:
                ss.add(y)
            x_d[d] = x_d[d - 1] + (1.0 if x in ss else 0.0) + (1.0 if y in ls else 0.0)

        # accumulate the weighted average overlap
        sum1 += x_d[d] / d * pow(p, d)

    sum2 = 0.0
    for i in range(l - s):
        d = s + i + 1
        sum2 += x_d[d] * (d - s) / (d * s) * pow(p, d)

    sum3 = ((x_d[l] - x_d[s]) / l + x_d[s] / s) * pow(p, l)

    # Equation 32 of Webber et al., "A Similarity Measure for Indefinite
    # Rankings" (2010): extrapolated RBO
    rbo_ext = (1 - p) / p * (sum1 + sum2) + sum3
    return rbo_ext
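# A quick sanity check for rbo on hypothetical toy rankings: identical lists
# score exactly 1.0, disjoint lists score 0.0, and a single swap near the
# tail lands just below 1.0 because deeper ranks carry less weight.
def rbo_sanity_check():
    assert abs(rbo(list("abcde"), list("abcde")) - 1.0) < 1e-12
    assert rbo(list("abcde"), list("fghij")) == 0.0
    assert 0.9 < rbo(list("abcde"), list("abced")) < 1.0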
def getHydrochemLL():
    dsname, data, features = getHydrochem()
    print(data)
    print(data.shape)

    featureTypes = ["continuous"] * data.shape[1]
    domains = [[0, 1]] * data.shape[1]
    print(domains)
    families = ['piecewise'] * data.shape[1]
    # families = ['histogram'] * data.shape[1]

    # @memory.cache
    def learn(data, families, mininst, alpha, th):
        spn = SPN.LearnStructure(data, featureTypes=featureTypes,
                                 row_split_method=Splitting.Gower(),
                                 col_split_method=Splitting.RDCTest(threshold=th),
                                 domains=domains, alpha=alpha, families=families,
                                 # spn = SPN.LearnStructure(data, featureNames=["X1"], domains=domains,
                                 #                          families=families,
                                 #                          row_split_method=Splitting.KmeansRows(),
                                 #                          col_split_method=Splitting.RDCTest(),
                                 min_instances_slice=mininst)
        return spn

    stats = Stats(name=dsname)
    alll = []
    for train, test, i in kfolded(data, 5):
        dirichlet_alphas = dirichlet.mle(train, method='meanprecision', maxiter=100000)
        ll = scipy.stats.dirichlet.logpdf(numpy.transpose(test), alpha=dirichlet_alphas)
        stats.add("DIRICHLET", Stats.LOG_LIKELIHOOD, numpy.sum(ll))
        stats.add("DIRICHLET", Stats.MEAN_LOG_LIKELIHOOD, numpy.mean(ll))

        spn = learn(train, families, 10, 0.1, 0.1)
        ll = spn.root.eval(test)
        alll.append(numpy.mean(ll))
        stats.add("SPN", Stats.LOG_LIKELIHOOD, numpy.sum(ll))
        stats.add("SPN", Stats.MEAN_LOG_LIKELIHOOD, numpy.mean(ll))

    print(numpy.mean(alll))
    stats.save("results/hydrochems/" + dsname + ".json")
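# An isolated sketch of the Dirichlet baseline used above, run on synthetic
# data so it can be checked independently. The alpha vector and sample size
# are made up for illustration; `dirichlet.mle` is the same third-party
# `dirichlet` package the experiments call.
def dirichlet_baseline_sketch():
    true_alpha = numpy.array([2.0, 5.0, 3.0])
    samples = numpy.random.dirichlet(true_alpha, size=1000)  # rows sum to 1
    alpha_hat = dirichlet.mle(samples, method='meanprecision', maxiter=100000)
    # scipy.stats.dirichlet expects one composition per column, hence the transpose
    ll = scipy.stats.dirichlet.logpdf(numpy.transpose(samples), alpha=alpha_hat)
    print(alpha_hat, numpy.mean(ll))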
def computeNIPS(dimensions):
    dsname, data, features = getNips()

    # `n_topics` was renamed `n_components` in newer scikit-learn releases
    lda = LatentDirichletAllocation(n_topics=dimensions, max_iter=50,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)
    lda.fit(data)
    mixt = lda.transform(data)

    def normalize(data):
        # nudge exact 0s and 1s into the open interval (0, 1), then
        # renormalize each row so it sums to 1 again
        mixtd = data
        mixtd[mixtd == 1] = mixtd[mixtd == 1] - 0.0000001
        mixtd[mixtd == 0] = mixtd[mixtd == 0] + 0.0000001
        mixtnorm = numpy.sum(mixtd, axis=1)
        mixtd = numpy.divide(mixtd, mixtnorm[:, None])
        return mixtd + 0.0

    mixt = normalize(mixt)
    print(data.shape)

    featureTypes = ["continuous"] * mixt.shape[1]
    domains = [[0, 1]] * mixt.shape[1]
    print(domains)

    @memory.cache
    def learn(data):
        spn = SPN.LearnStructure(data, featureTypes=featureTypes,
                                 row_split_method=Splitting.Gower(),
                                 col_split_method=Splitting.RDCTest(threshold=0.3),
                                 domains=domains, alpha=0.1,
                                 families=['histogram'] * data.shape[1],
                                 # spn = SPN.LearnStructure(data, featureNames=["X1"], domains=domains,
                                 #                          families=families,
                                 #                          row_split_method=Splitting.KmeansRows(),
                                 #                          col_split_method=Splitting.RDCTest(),
                                 min_instances_slice=100)
        return spn

    stats = Stats(name=dsname)
    for train, test, i in kfolded(mixt, 10):
        print(i)

        # dirichlet_alphas = getDirichlet(train)
        dirichlet_alphas = dirichlet.mle(train, method='meanprecision', maxiter=1000000)
        print("dirichlet done")
        # scipy.stats.dirichlet expects one composition per column, hence the transpose
        ll = scipy.stats.dirichlet.logpdf(numpy.transpose(test), alpha=dirichlet_alphas)
        stats.add("DIRICHLET", Stats.LOG_LIKELIHOOD, numpy.sum(ll))
        stats.add("DIRICHLET", Stats.MEAN_LOG_LIKELIHOOD, numpy.mean(ll))

        spn = learn(train)
        print("spn done")
        ll = spn.root.eval(test)
        print(stats.add("SPN", Stats.LOG_LIKELIHOOD, numpy.sum(ll)))
        print(stats.add("SPN", Stats.MEAN_LOG_LIKELIHOOD, numpy.mean(ll)))

    stats.save("results/nips/" + dsname + "-" + str(dimensions) + ".json")
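# Hypothetical driver for the NIPS experiment: sweep a few topic counts and
# write one stats file per setting. The dimension list is illustrative, not
# taken from the original experiments.
def runNIPS():
    for dims in [3, 5, 10, 20]:
        computeNIPS(dims)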
def getAirPollution(dimensions):
    dsname, data, features = getAirQualityUCITimeless()

    # -200 marks missing values in the UCI Air Quality data:
    # first drop columns with 2000 or more missing entries,
    # then drop any rows that still contain missing entries
    idxmissing = data == -200
    data = data[:, numpy.sum(idxmissing, 0) < 2000]
    idxmissing = data == -200
    data = data[numpy.sum(idxmissing, 1) == 0, :]
    idxmissing = data == -200
    print(data.shape)

    _, mixt = getArchetypes(data, dimensions)
    if mixt is None:
        print("no archetypes", dimensions)
        # 0/0
        return

    def normalize(data):
        # nudge exact 0s and 1s into the open interval (0, 1), then
        # renormalize each row so it sums to 1 again
        mixtd = data
        mixtd[mixtd == 1] = mixtd[mixtd == 1] - 0.0000001
        mixtd[mixtd == 0] = mixtd[mixtd == 0] + 0.0000001
        mixtnorm = numpy.sum(mixtd, axis=1)
        mixtd = numpy.divide(mixtd, mixtnorm[:, None])
        return mixtd + 0.0

    mixt = normalize(mixt)
    print(data.shape)

    featureTypes = ["continuous"] * mixt.shape[1]
    domains = [[0, 1]] * mixt.shape[1]
    print(domains)

    @memory.cache
    def learn(data):
        spn = SPN.LearnStructure(data, featureTypes=featureTypes,
                                 row_split_method=Splitting.Gower(),
                                 col_split_method=Splitting.RDCTest(threshold=0.3),
                                 domains=domains, alpha=0.1,
                                 families=['histogram'] * data.shape[1],
                                 # spn = SPN.LearnStructure(data, featureNames=["X1"], domains=domains,
                                 #                          families=families,
                                 #                          row_split_method=Splitting.KmeansRows(),
                                 #                          col_split_method=Splitting.RDCTest(),
                                 # min_instances_slice=int(data.shape[0]*0.01))
                                 min_instances_slice=200)
        return spn

    stats = Stats(name=dsname)
    for train, test, i in kfolded(mixt, 10):
        print(i)

        # dirichlet_alphas = getDirichlet(train)
        dirichlet_alphas = dirichlet.mle(train, method='meanprecision', maxiter=1000000)
        print("dirichlet done")
        ll = scipy.stats.dirichlet.logpdf(numpy.transpose(test), alpha=dirichlet_alphas)
        stats.add("DIRICHLET", Stats.LOG_LIKELIHOOD, numpy.sum(ll))
        stats.add("DIRICHLET", Stats.MEAN_LOG_LIKELIHOOD, numpy.mean(ll))

        spn = learn(train)
        print("spn done")
        ll = spn.root.eval(test)
        print(stats.add("SPN", Stats.LOG_LIKELIHOOD, numpy.sum(ll)))
        print(stats.add("SPN", Stats.MEAN_LOG_LIKELIHOOD, numpy.mean(ll)))

    stats.save("results/airpollution/" + dsname + "-" + str(dimensions) + ".json")
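# Hypothetical driver for the air-pollution experiment. getAirPollution
# returns early (without writing stats) when archetypal analysis fails for
# a given dimensionality, so the sweep simply continues to the next value.
def runAirPollution():
    for dims in range(2, 11):  # illustrative range
        getAirPollution(dims)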