# Beispiel #1
# 0
 def get_insert_dist(self, n_features, initial_seq):
     """Build the emission distribution for an insert state.

     Integer-valued features get a discrete distribution estimated over all
     feature indices; otherwise a multivariate Gaussian is fitted to the
     initial sequence.
     """
     first = initial_seq[0]
     # Check the Python int case first so np.issubdtype only sees numpy scalars.
     if isinstance(first, int) or np.issubdtype(first, np.integer):
         # Discrete case: equal weight across every feature index.
         return DiscreteDistribution.from_samples(range(n_features))
     # Continuous case: fit a Gaussian to the observed initial sequence.
     samples = np.array(initial_seq)
     return MultivariateGaussianDistribution.from_samples(samples)
# Beispiel #2
# 0
def train_model(data: np.ndarray,
                clusters: int = 5,
                init_nodes: list = None) -> BayesianNetwork:
    """Train a BayesianNetwork over *data* augmented with one latent variable.

    Parameters
    ----------
    data : np.ndarray
        Observed samples, one row per sample.
    clusters : int
        Number of KMeans clusters used to bootstrap the hidden variable.
    init_nodes : list, optional
        Initial nodes passed through to the structure-learning routine.

    Returns
    -------
    BayesianNetwork
        A baked network fitted on data plus the learned hidden column.
    """
    # Cluster the initial data in order to fill in a hidden variable based
    # on the distribution of clusters.
    kmeans = KMeans(n_clusters=clusters, random_state=0).fit(data)
    hidden_dist = DiscreteDistribution.from_samples(kmeans.labels_)
    hidden_var = np.array(hidden_dist.sample(data.shape[0]))
    new_data = np.column_stack((data, hidden_var))
    latent = new_data.shape[1] - 1  # index of the appended hidden column

    # Train the network structure on data, treating the appended column as latent.
    # NOTE(review): the original also created a throwaway BayesNet() here that
    # was immediately overwritten by hc_rr's result; that dead assignment is removed.
    bn = hc_rr(new_data, latent=latent, init_nodes=init_nodes)
    structure = tuple(tuple(bn.F[rv]['parents']) for rv in sorted(bn.nodes()))
    bn = BayesianNetwork.from_structure(new_data, structure)
    bn.bake()

    # Learn the hidden variable: blank it out, impute with predict, then refit.
    hidden_var = np.full(data.shape[0], np.nan)
    new_data = np.column_stack((data, hidden_var))
    bn.predict(new_data)
    bn.fit(new_data)
    bn.bake()
    return bn
# Beispiel #3
# 0
 def get_match_dist(self, index, n_features, initial_seq):
     """Build the emission distribution for the match state at *index*.

     Integer features get a discrete distribution over all feature indices.
     Continuous features get a Gaussian fitted to the initial sequence plus
     INITIAL_EMPHASIS tiled copies of *index* — presumably to anchor the
     match state on its own position (confirm against caller).
     """
     if isinstance(initial_seq[index], int):
         # Discrete case: equal weight across every feature index.
         return DiscreteDistribution.from_samples(range(n_features))
     emphasis = np.tile(index, (INITIAL_EMPHASIS, 1))
     samples = np.concatenate((emphasis, np.array(initial_seq)))
     return MultivariateGaussianDistribution.from_samples(samples)
 def worker(node: Type[BaseNode]) -> DiscreteParams:
     """Estimate discrete CPD parameters for *node* from the enclosing `data` frame.

     Returns a dict with 'cprob' (the conditional probabilities) and 'vals'
     (the sorted string values the node can take).
     """
     parents = node.disc_parents + node.cont_parents
     # Marginal distribution of the node's own column; both branches need
     # the sorted list of its possible values.
     marginal = DiscreteDistribution.from_samples(data[node.name].values)
     vals = sorted(str(v) for v in marginal.parameters[0].keys())
     if not parents:
         # Root node: the marginal itself is the CPD, ordered by value.
         cprob = list(dict(sorted(marginal.items())).values())
     else:
         table = ConditionalProbabilityTable.from_samples(
             data[parents + [node.name]].values)
         rows = table.parameters[0]
         cprob = dict()
         # Rows come grouped: len(vals) consecutive rows share one parent
         # combination; the last element of each row is its probability.
         for start in range(0, len(rows), len(vals)):
             probs = [rows[j][-1] for j in range(start, start + len(vals))]
             combination = [str(x) for x in rows[start][0:len(parents)]]
             cprob[str(combination)] = probs
     return {"cprob": cprob, 'vals': vals}
def test_discrete():
    """Exercise DiscreteDistribution end-to-end: log_probability, fit,
    weighted fit, summarize/from_summaries (plain and with inertia),
    freeze, from_samples (with pseudocounts), and JSON/pickle round-trips.
    Expected values are exact constants pinned by the library's behavior."""
    # Uniform distribution over the four DNA bases.
    d = DiscreteDistribution({'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25})

    assert_equal(d.log_probability('C'), -1.3862943611198906)
    assert_equal(d.log_probability('A'), d.log_probability('C'))
    assert_equal(d.log_probability('G'), d.log_probability('T'))
    # Unseen symbols (lowercase 'a') have zero probability -> -inf log prob.
    assert_equal(d.log_probability('a'), float('-inf'))

    # Refit to observed symbol frequencies.
    seq = "ACGTACGTTGCATGCACGCGCTCTCGCGC"
    d.fit(list(seq))

    assert_equal(d.log_probability('C'), -0.9694005571881036)
    assert_equal(d.log_probability('A'), -1.9810014688665833)
    assert_equal(d.log_probability('T'), -1.575536360758419)

    # Weighted fit: 'A' has weight 0, so its probability drops to zero.
    seq = "ACGTGTG"
    d.fit(list(seq), weights=[0., 1., 2., 3., 4., 5., 6.])

    assert_equal(d.log_probability('A'), float('-inf'))
    assert_equal(d.log_probability('C'), -3.044522437723423)
    assert_equal(d.log_probability('G'), -0.5596157879354228)

    # Incremental summarize + from_summaries over the same weighted data
    # must reproduce the one-shot weighted fit above.
    d.summarize(list("ACG"), weights=[0., 1., 2.])
    d.summarize(list("TGT"), weights=[3., 4., 5.])
    d.summarize(list("G"), weights=[6.])
    d.from_summaries()

    assert_equal(d.log_probability('A'), float('-inf'))
    assert_equal(round(d.log_probability('C'), 4), -3.0445)
    assert_equal(round(d.log_probability('G'), 4), -0.5596)

    # Inertia blends old parameters with the new MLE (data here is 50/50 A/B):
    # 0.75 * old + 0.25 * new.
    d = DiscreteDistribution({'A': 0.0, 'B': 1.0})
    d.summarize(list("ABABABAB"))
    d.summarize(list("ABAB"))
    d.summarize(list("BABABABABABABABABA"))
    d.from_summaries(inertia=0.75)
    assert_equal(d.parameters[0], {'A': 0.125, 'B': 0.875})

    # Same data with inertia=0.5 -> 0.5 * old + 0.5 * new.
    d = DiscreteDistribution({'A': 0.0, 'B': 1.0})
    d.summarize(list("ABABABAB"))
    d.summarize(list("ABAB"))
    d.summarize(list("BABABABABABABABABA"))
    d.from_summaries(inertia=0.5)
    assert_equal(d.parameters[0], {'A': 0.25, 'B': 0.75})

    # A frozen distribution must ignore further fitting.
    d.freeze()
    d.fit(list('ABAABBAAAAAAAAAAAAAAAAAA'))
    assert_equal(d.parameters[0], {'A': 0.25, 'B': 0.75})

    # from_samples: plain MLE frequencies.
    d = DiscreteDistribution.from_samples(['A', 'B', 'A', 'A'])
    assert_equal(d.parameters[0], {'A': 0.75, 'B': 0.25})

    # Pseudocounts smooth the estimate toward uniform: (3+0.5)/5, (1+0.5)/5.
    d = DiscreteDistribution.from_samples(['A', 'B', 'A', 'A'],
                                          pseudocount=0.5)
    assert_equal(d.parameters[0], {'A': 0.70, 'B': 0.30})

    # Larger pseudocount -> closer to uniform: (3+6)/16, (1+6)/16.
    d = DiscreteDistribution.from_samples(['A', 'B', 'A', 'A'], pseudocount=6)
    assert_equal(d.parameters[0], {'A': 0.5625, 'B': 0.4375})

    # JSON round-trip preserves type and parameters.
    e = Distribution.from_json(d.to_json())
    assert_equal(e.name, "DiscreteDistribution")
    assert_equal(e.parameters[0], {'A': 0.5625, 'B': 0.4375})

    # Pickle round-trip preserves type and parameters.
    f = pickle.loads(pickle.dumps(e))
    assert_equal(f.name, "DiscreteDistribution")
    assert_equal(f.parameters[0], {'A': 0.5625, 'B': 0.4375})
def test_discrete_robust_json_serialization():
	"""Round-trip a DiscreteDistribution through the robust `from_json` helper
	and verify both the type name and the smoothed parameters survive."""
	original = DiscreteDistribution.from_samples(['A', 'B', 'A', 'A'], pseudocount=6)

	restored = from_json(original.to_json())
	assert_equal(restored.name, "DiscreteDistribution")
	assert_equal(restored.parameters[0], {'A': 0.5625, 'B': 0.4375})
def test_discrete():
	"""Exercise DiscreteDistribution end-to-end: log_probability, fit,
	weighted fit, summarize/from_summaries (plain and with inertia),
	freeze, from_samples (with pseudocounts), and JSON/pickle round-trips.
	Expected values are exact constants pinned by the library's behavior."""
	# Uniform distribution over the four DNA bases.
	d = DiscreteDistribution({'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25})

	assert_equal(d.log_probability('C'), -1.3862943611198906)
	assert_equal(d.log_probability('A'), d.log_probability('C'))
	assert_equal(d.log_probability('G'), d.log_probability('T'))
	# Unseen symbols (lowercase 'a') have zero probability -> -inf log prob.
	assert_equal(d.log_probability('a'), float('-inf'))

	# Refit to observed symbol frequencies.
	seq = "ACGTACGTTGCATGCACGCGCTCTCGCGC"
	d.fit(list(seq))

	assert_equal(d.log_probability('C'), -0.9694005571881036)
	assert_equal(d.log_probability('A'), -1.9810014688665833)
	assert_equal(d.log_probability('T'), -1.575536360758419)

	# Weighted fit: 'A' has weight 0, so its probability drops to zero.
	seq = "ACGTGTG"
	d.fit(list(seq), weights=[0., 1., 2., 3., 4., 5., 6.])

	assert_equal(d.log_probability('A'), float('-inf'))
	assert_equal(d.log_probability('C'), -3.044522437723423)
	assert_equal(d.log_probability('G'), -0.5596157879354228)

	# Incremental summarize + from_summaries over the same weighted data
	# must reproduce the one-shot weighted fit above.
	d.summarize(list("ACG"), weights=[0., 1., 2.])
	d.summarize(list("TGT"), weights=[3., 4., 5.])
	d.summarize(list("G"), weights=[6.])
	d.from_summaries()

	assert_equal(d.log_probability('A'), float('-inf'))
	assert_equal(round(d.log_probability('C'), 4), -3.0445)
	assert_equal(round(d.log_probability('G'), 4), -0.5596)

	# Inertia blends old parameters with the new MLE (data here is 50/50 A/B):
	# 0.75 * old + 0.25 * new.
	d = DiscreteDistribution({'A': 0.0, 'B': 1.0})
	d.summarize(list("ABABABAB"))
	d.summarize(list("ABAB"))
	d.summarize(list("BABABABABABABABABA"))
	d.from_summaries(inertia=0.75)
	assert_equal(d.parameters[0], {'A': 0.125, 'B': 0.875})

	# Same data with inertia=0.5 -> 0.5 * old + 0.5 * new.
	d = DiscreteDistribution({'A': 0.0, 'B': 1.0})
	d.summarize(list("ABABABAB"))
	d.summarize(list("ABAB"))
	d.summarize(list("BABABABABABABABABA"))
	d.from_summaries(inertia=0.5)
	assert_equal(d.parameters[0], {'A': 0.25, 'B': 0.75})

	# A frozen distribution must ignore further fitting.
	d.freeze()
	d.fit(list('ABAABBAAAAAAAAAAAAAAAAAA'))
	assert_equal(d.parameters[0], {'A': 0.25, 'B': 0.75})

	# from_samples: plain MLE frequencies.
	d = DiscreteDistribution.from_samples(['A', 'B', 'A', 'A'])
	assert_equal(d.parameters[0], {'A': 0.75, 'B': 0.25})

	# Pseudocounts smooth the estimate toward uniform: (3+0.5)/5, (1+0.5)/5.
	d = DiscreteDistribution.from_samples(['A', 'B', 'A', 'A'], pseudocount=0.5)
	assert_equal(d.parameters[0], {'A': 0.70, 'B': 0.30})

	# Larger pseudocount -> closer to uniform: (3+6)/16, (1+6)/16.
	d = DiscreteDistribution.from_samples(['A', 'B', 'A', 'A'], pseudocount=6)
	assert_equal(d.parameters[0], {'A': 0.5625, 'B': 0.4375})

	# JSON round-trip preserves type and parameters.
	e = Distribution.from_json(d.to_json())
	assert_equal(e.name, "DiscreteDistribution")
	assert_equal(e.parameters[0], {'A': 0.5625, 'B': 0.4375})

	# Pickle round-trip preserves type and parameters.
	f = pickle.loads(pickle.dumps(e))
	assert_equal(f.name, "DiscreteDistribution")
	assert_equal(f.parameters[0], {'A': 0.5625, 'B': 0.4375})