def estimate_density(self, training_data, validation_data=None):
        """Fit an MSPN on the training data.

        Parameters
        ----------
        training_data : object exposing `.features` (iterable of
            ``(feat, str_type)`` pairs, where `feat` has `symbol_name()`)
            and `.data` (array-like instance matrix).
        validation_data : never used; accepted only for interface
            compatibility with other estimators.

        Side effect: stores the learned structure in ``self.spn``.

        Raises
        ------
        KeyError
            If ``self.learner_args`` has no 'seed' entry.
        NotImplementedError
            For an unrecognized 'row_split' value.
        """
        feature_types = []
        feature_names = []
        for feat, str_type in training_data.features:
            feature_types.append(str_type)
            feature_names.append(feat.symbol_name())

        # Row-splitting strategy; defaults to RDC + k-means when the
        # 'row_split' option is absent.
        row_split = self.learner_args.get('row_split', 'rdc-kmeans')
        if row_split == 'gower':
            row_split_method = Splitting.Gower(n_clusters=2)
        elif row_split == 'rdc-kmeans':
            row_split_method = Splitting.KmeansRDCRows(n_clusters=2,
                                                       k=20,
                                                       OHE=1)
        else:
            raise NotImplementedError(
                "Unsupported row_split: {}".format(row_split))

        col_split_method = Splitting.RDCTest(threshold=0.1, OHE=1, linear=1)

        rand_seed = self.learner_args['seed']
        # Forward the remaining learner_args verbatim; the keys consumed
        # here (and the ignored 'leaf' option) are stripped out.
        mspnargs = {
            k: v
            for k, v in self.learner_args.items()
            if k not in ['seed', 'leaf', 'row_split']
        }

        # Leaf families are deliberately left for the MSPN learner to infer:
        # the former per-feature family list was computed and then discarded
        # ("let MSPNs sort this out"), so that dead computation was removed.
        families = None
        self.spn = SPN.LearnStructure(asarray(training_data.data),
                                      feature_types,
                                      families=families,
                                      featureNames=feature_names,
                                      rand_seed=rand_seed,
                                      row_split_method=row_split_method,
                                      col_split_method=col_split_method,
                                      **mspnargs)
# Example #2
# 0
 def learn(data):
     """Learn an SPN over *data* with histogram leaf families.

     NOTE(review): relies on free variables from an enclosing scope
     (featureTypes, domains, min_instances_slice) -- confirm they are
     defined wherever this snippet is used. The odd one-space indentation
     suggests this fragment was pasted from elsewhere.
     """
     spn = SPN.LearnStructure(
         data,
         featureTypes=featureTypes,
         row_split_method=Splitting.KmeansRDCRows(),
         col_split_method=Splitting.RDCTest(threshold=0.3),
         domains=domains,
         alpha=0.1,
         families=['histogram'] * data.shape[1],
         # spn = SPN.LearnStructure(data, featureNames=["X1"], domains =
         # domains, families=families, row_split_method=Splitting.KmeansRows(),
         # col_split_method=Splitting.RDCTest(),
         min_instances_slice=min_instances_slice)
     return spn
# Example #3
# 0
def learn(data,
          featureTypes,
          families,
          domains,
          min_instances_slice,
          alpha=0.1):
    """Learn an SPN structure over *data*.

    Rows are clustered with RDC + k-means and columns are split with an
    RDC independence test (threshold 0.3); all remaining options are
    forwarded to the structure learner unchanged.
    """
    splitter_rows = Splitting.KmeansRDCRows()
    splitter_cols = Splitting.RDCTest(threshold=0.3)
    return SPN.LearnStructure(data,
                              alpha=alpha,
                              featureTypes=featureTypes,
                              families=families,
                              domains=domains,
                              row_split_method=splitter_rows,
                              col_split_method=splitter_cols,
                              min_instances_slice=min_instances_slice)
# Example #4
# 0
def learn(data,
          featureTypes,
          families,
          domains,
          feature_names,
          min_instances_slice,
          prior_weight=0.0):
    """Learn an SPN structure over *data* with an optional prior weight.

    Rows are clustered with RDC + k-means and columns are split with an
    RDC independence test (threshold 0.3). `feature_names` is forwarded
    as the learner's `featureNames` argument.
    """
    return SPN.LearnStructure(
        data,
        prior_weight=prior_weight,
        featureTypes=featureTypes,
        row_split_method=Splitting.KmeansRDCRows(),
        col_split_method=Splitting.RDCTest(threshold=0.3),
        domains=domains,
        families=families,
        featureNames=feature_names,
        min_instances_slice=min_instances_slice)
# NOTE(review): the next line is an orphaned fragment of a truncated call
# left behind by the scrape; commented out so the file parses.
#              col_split_method=Splitting.IndependenceTest(0.001))

# Marginalize pspn onto the first four features and print the result.
# NOTE(review): `pspn` is not defined anywhere visible in this file --
# presumably created in an earlier, missing part of the script.
marg = pspn.marginalize([0, 1, 2, 3])

print(marg.toEquation())
print(marg)

# Deliberate ZeroDivisionError: halts the script here. Remove to continue.
0 / 0

# Learn an MSPN on the training split and compare test log-likelihoods
# against pspn.
# NOTE(review): `train`, `data`, `domains`, `words`, `test`, `pspn` and
# `numpy` are not defined in this file. Also, this call passes
# row/col split methods to learn(), which none of the learn() variants
# visible in this file accept -- confirm which definition is in scope.
mspn = learn(train,
             featureTypes=["discrete"] * data.shape[1],
             families=["isotonic"] * data.shape[1],
             domains=domains,
             feature_names=words,
             min_instances_slice=200,
             row_split_method=Splitting.KmeansRDCRows(),
             col_split_method=Splitting.RDCTest(threshold=0.1, OHE=False))

#print(pspn)
# print(mspn)

print("sum LL pspn", numpy.sum(pspn.root.eval(test)))
print("sum LL mspn", numpy.sum(mspn.root.eval(test)))
print("mean LL pspn", numpy.mean(pspn.root.eval(test)))
print("mean LL mspn", numpy.mean(mspn.root.eval(test)))

# Deliberate ZeroDivisionError: halts the script here. Remove to continue.
0 / 0


def getmiforfeature(input):
    # Unpacks a (spn, i, j) work tuple -- presumably one unit of a pooled
    # pairwise mutual-information computation. NOTE(review): the rest of
    # this function was truncated by the scrape boundary below; the builtin
    # name `input` is shadowed by the parameter.
    spn, i, j = input
# Example #6
# 0
def learn_spn(dataset="data/iris",
              precision=25,
              independence=0.1,
              header=0,
              date=None,
              isotonic=False,
              histogram=True,
              types=False):
    """Load a CSV dataset, infer per-feature types/families, learn an SPN.

    Parameters
    ----------
    dataset : path to a CSV file.
    precision : `min_instances_slice` passed to the structure learner.
    independence : RDC independence-test threshold for column splits.
    header : header row index (pandas `header` argument).
    date : columns to parse as dates (pandas `parse_dates` argument).
    isotonic : prefer 'isotonic' over 'piecewise' leaf families.
    histogram : use 'histogram' families for categorical features.
    types : if True, the second CSV row declares feature types
        ('cat'/'con'/'dis') and is skipped when loading the data frame.

    Returns
    -------
    (spn, data_dictionary) -- the learned SPN (with `spn.name` set to the
    dataset path) and a description of each feature: name, family, type,
    pandas dtype, plus the LabelEncoder and encoded values for
    categorical features.
    """
    skiprows = [1] if types else []
    df = pd.read_csv(dataset,
                     delimiter=",",
                     header=header,
                     parse_dates=date,
                     skiprows=skiprows)
    df = df.dropna(axis=0, how='any')
    # With a header row keep the CSV column names, otherwise synthesize X_i.
    featureNames = df.columns.values.tolist() if header == 0 else [
        "X_{}".format(i) for i in range(len(df.columns))
    ]

    dtypes = df.dtypes

    # Family used for every non-categorical feature.
    non_cat_family = 'isotonic' if isotonic else 'piecewise'

    def categorical_family():
        # Family used for categorical features: histogram wins over isotonic.
        if histogram:
            return 'histogram'
        return 'isotonic' if isotonic else 'piecewise'

    if types:
        # Feature types come from the (skipped) second CSV row.
        featureTypes = []
        families = []
        with open(dataset, 'r') as csvfile:
            csvreader = csv.reader(csvfile, delimiter=',', quotechar='|')
            next(csvreader)           # header row
            _types = next(csvreader)  # declared type row
        for featureType in _types:
            print(featureType)
            if featureType == 'cat':
                featureTypes.append('categorical')
                families.append(categorical_family())
            elif featureType == 'con':
                featureTypes.append('continuous')
                families.append(non_cat_family)
            elif featureType == 'dis':
                featureTypes.append('discrete')
                families.append(non_cat_family)
            else:
                featureTypes.append('unknown')
                families.append(non_cat_family)

    def to_featureTypes(types):
        # Map pandas dtypes to SPN feature types and leaf families.
        featureTypes = []
        families = []
        for featureType in types:
            if featureType.kind == 'O':
                featureTypes.append('categorical')
                families.append(categorical_family())
            elif featureType.kind == 'f':
                featureTypes.append('continuous')
                families.append(non_cat_family)
            elif featureType.kind == 'i':
                # BUGFIX: was `== np.dtype('i')`; dtype.kind is a one-char
                # string, so compare against 'i' directly (consistent with
                # the 'O' and 'f' branches above). Unsigned ('u') dtypes
                # still fall through to 'unknown', as before.
                featureTypes.append('discrete')
                families.append(non_cat_family)
            else:
                featureTypes.append('unknown')
                families.append(non_cat_family)
        return featureTypes, families

    if not types:
        featureTypes, families = to_featureTypes(dtypes)

    data_dictionary = {
        'features': [{
            "name": name,
            "family": family,
            "type": typ,
            # .iloc: dtypes is indexed by column name, so positional access
            # must be explicit (integer fallback is deprecated in pandas).
            'pandas_type': dtypes.iloc[i]
        } for i, (name, family, typ) in enumerate(
            zip(featureNames, families, featureTypes))],
        'num_entries': len(df)
    }

    for col, name in enumerate(df.columns):
        if featureTypes[col] == 'categorical':
            # Encode categorical values as integers; keep the encoder so
            # results can be mapped back to the original labels.
            lb = LabelEncoder()
            data_dictionary['features'][col]["encoder"] = lb
            df[name] = df[name].astype('category')
            df[name] = lb.fit_transform(df[name])
            data_dictionary['features'][col]["values"] = lb.transform(
                lb.classes_)
        if dtypes.iloc[col].kind == 'M':
            # Convert datetimes to fractional days since the column minimum.
            df[name] = (df[name] - df[name].min()) / np.timedelta64(1, 'D')

    data = np.array(df)

    spn = SPN.LearnStructure(
        data,
        featureTypes=featureTypes,
        featureNames=featureNames,
        min_instances_slice=precision,
        families=families,
        row_split_method=Splitting.KmeansRDCRows(),
        col_split_method=Splitting.RDCTest(threshold=independence))

    spn.name = dataset
    return spn, data_dictionary