def estimate_density(self, training_data, validation_data=None):
    """Fit an MSPN on the training data.

    Collects per-feature type/name metadata from ``training_data.features``,
    chooses the row-splitting strategy from ``self.learner_args`` and learns
    the SPN structure into ``self.spn``.  ``validation_data`` is accepted for
    interface compatibility but never used.

    Raises:
        NotImplementedError: if ``learner_args['row_split']`` names an
            unsupported row-splitting method.
    """
    feature_types = []
    feature_names = []
    for feat, str_type in training_data.features:
        feature_types.append(str_type)
        feature_names.append(feat.symbol_name())

    if 'row_split' in self.learner_args:
        if self.learner_args['row_split'] == 'gower':
            row_split_method = Splitting.Gower(n_clusters=2)
        elif self.learner_args['row_split'] == 'rdc-kmeans':
            row_split_method = Splitting.KmeansRDCRows(n_clusters=2, k=20, OHE=1)
        else:
            raise NotImplementedError()
    else:
        row_split_method = Splitting.KmeansRDCRows(n_clusters=2, k=20, OHE=1)
    col_split_method = Splitting.RDCTest(threshold=0.1, OHE=1, linear=1)

    rand_seed = self.learner_args['seed']
    # Keyword args meant for the MSPN learner itself; strip the keys this
    # method consumes directly.
    mspnargs = {
        k: v for k, v in self.learner_args.items()
        if k not in ('seed', 'leaf', 'row_split')
    }
    # families=None: let the MSPN implementation infer leaf families itself.
    # (The original code built a per-feature family list here and then
    # unconditionally discarded it, so the computation was removed.)
    self.spn = SPN.LearnStructure(asarray(training_data.data),
                                  feature_types,
                                  families=None,
                                  featureNames=feature_names,
                                  rand_seed=rand_seed,
                                  row_split_method=row_split_method,
                                  col_split_method=col_split_method,
                                  **mspnargs)
def learn(data):
    """Learn an SPN over *data* with histogram leaves for every column.

    NOTE(review): this relies on module-level ``featureTypes``, ``domains``
    and ``min_instances_slice`` being defined before it is called — confirm
    against the surrounding script before reusing it elsewhere.
    """
    # Commented-out experimental call removed; see version control history.
    spn = SPN.LearnStructure(
        data,
        featureTypes=featureTypes,
        row_split_method=Splitting.KmeansRDCRows(),
        col_split_method=Splitting.RDCTest(threshold=0.3),
        domains=domains,
        alpha=0.1,
        families=['histogram'] * data.shape[1],
        min_instances_slice=min_instances_slice)
    return spn
def learn(data, featureTypes, families, domains, min_instances_slice, alpha=0.1):
    """Learn an SPN structure over *data* using the supplied per-feature
    types, leaf families and domains, with RDC-based row/column splitting."""
    return SPN.LearnStructure(
        data,
        alpha=alpha,
        featureTypes=featureTypes,
        families=families,
        domains=domains,
        min_instances_slice=min_instances_slice,
        row_split_method=Splitting.KmeansRDCRows(),
        col_split_method=Splitting.RDCTest(threshold=0.3),
    )
def learn(data, featureTypes, families, domains, feature_names,
          min_instances_slice, prior_weight=0.0):
    """Learn an SPN structure over *data*, forwarding all per-feature
    metadata and the prior weight to ``SPN.LearnStructure``."""
    structure_kwargs = dict(
        prior_weight=prior_weight,
        featureTypes=featureTypes,
        families=families,
        domains=domains,
        featureNames=feature_names,
        min_instances_slice=min_instances_slice,
        row_split_method=Splitting.KmeansRDCRows(),
        col_split_method=Splitting.RDCTest(threshold=0.3),
    )
    return SPN.LearnStructure(data, **structure_kwargs)
col_split_method=Splitting.IndependenceTest(0.001)) marg = pspn.marginalize([0, 1, 2, 3]) print(marg.toEquation()) print(marg) 0 / 0 mspn = learn(train, featureTypes=["discrete"] * data.shape[1], families=["isotonic"] * data.shape[1], domains=domains, feature_names=words, min_instances_slice=200, row_split_method=Splitting.KmeansRDCRows(), col_split_method=Splitting.RDCTest(threshold=0.1, OHE=False)) #print(pspn) # print(mspn) print("sum LL pspn", numpy.sum(pspn.root.eval(test))) print("sum LL mspn", numpy.sum(mspn.root.eval(test))) print("mean LL pspn", numpy.mean(pspn.root.eval(test))) print("mean LL mspn", numpy.mean(mspn.root.eval(test))) 0 / 0 def getmiforfeature(input): spn, i, j = input
def learn_spn(dataset="data/iris", precision=25, independence=0.1, header=0,
              date=None, isotonic=False, histogram=True, types=False):
    """Load a CSV dataset, preprocess it and learn an SPN over it.

    Args:
        dataset: path to the CSV file (also stored as the SPN's name).
        precision: ``min_instances_slice`` passed to the structure learner.
        independence: RDC threshold for the column (independence) split.
        header: header-row index for pandas; any other value generates
            ``X_0 .. X_{n-1}`` feature names.
        date: columns to parse as dates (forwarded to ``pandas.read_csv``).
        isotonic: prefer 'isotonic' leaf families over 'piecewise'.
        histogram: prefer 'histogram' leaves for categorical features.
        types: if True, read per-column type tags ('cat'/'con'/'dis') from
            the file's second row instead of inferring from pandas dtypes.

    Returns:
        ``(spn, data_dictionary)``: the learned SPN and a metadata dict with
        one entry per feature plus the number of rows.
    """

    def _categorical_family():
        # Leaf family for categorical features: histogram wins over isotonic.
        if histogram:
            return 'histogram'
        return 'isotonic' if isotonic else 'piecewise'

    def _numeric_family():
        # Leaf family for continuous/discrete/unknown features.
        return 'isotonic' if isotonic else 'piecewise'

    # When types=True, row 2 of the file holds type tags, not data.
    skiprows = [1] if types else []
    df = pd.read_csv(dataset, delimiter=",", header=header, parse_dates=date,
                     skiprows=skiprows)
    df = df.dropna(axis=0, how='any')
    if header == 0:
        featureNames = df.columns.values.tolist()
    else:
        featureNames = ["X_{}".format(i) for i in range(len(df.columns))]
    dtypes = df.dtypes

    featureTypes = []
    families = []
    if types:
        # Read the declared types from the file's second row, then close the
        # file before processing (the original held it open for the loop).
        with open(dataset, 'r') as csvfile:
            csvreader = csv.reader(csvfile, delimiter=',', quotechar='|')
            next(csvreader)               # skip header row
            declared_types = next(csvreader)
        for featureType in declared_types:
            print(featureType)
            if featureType == 'cat':
                featureTypes.append('categorical')
                families.append(_categorical_family())
            elif featureType == 'con':
                featureTypes.append('continuous')
                families.append(_numeric_family())
            elif featureType == 'dis':
                featureTypes.append('discrete')
                families.append(_numeric_family())
            else:
                featureTypes.append('unknown')
                families.append(_numeric_family())
    else:
        for dtype in dtypes:
            # BUG FIX: the original compared dtype.kind (a one-char str)
            # against np.dtype('i'), which only worked through numpy's
            # reflected __eq__; compare kind characters directly.
            if dtype.kind == 'O':
                featureTypes.append('categorical')
                families.append(_categorical_family())
            elif dtype.kind == 'f':
                featureTypes.append('continuous')
                families.append(_numeric_family())
            elif dtype.kind == 'i':
                featureTypes.append('discrete')
                families.append(_numeric_family())
            else:
                featureTypes.append('unknown')
                families.append(_numeric_family())

    data_dictionary = {
        'features': [{"name": name, "family": family, "type": typ,
                      'pandas_type': dtypes[i]}
                     for i, (name, family, typ) in enumerate(
                         zip(featureNames, families, featureTypes))],
        'num_entries': len(df)
    }

    for col_idx, name in enumerate(df.columns):  # renamed from 'id' (builtin)
        if featureTypes[col_idx] == 'categorical':
            lb = LabelEncoder()
            data_dictionary['features'][col_idx]["encoder"] = lb
            df[name] = df[name].astype('category')
            df[name] = lb.fit_transform(df[name])
            data_dictionary['features'][col_idx]["values"] = lb.transform(
                lb.classes_)
        if dtypes[col_idx].kind == 'M':
            # Datetimes become fractional days since the column minimum.
            df[name] = (df[name] - df[name].min()) / np.timedelta64(1, 'D')

    data = np.array(df)
    spn = SPN.LearnStructure(
        data,
        featureTypes=featureTypes,
        featureNames=featureNames,
        min_instances_slice=precision,
        families=families,
        row_split_method=Splitting.KmeansRDCRows(),
        col_split_method=Splitting.RDCTest(threshold=independence))
    spn.name = dataset
    return spn, data_dictionary