Example #1
	def test_multinomial_sampler_stats(self):
		'''
		This tests that the sampler really does produce results whose
		statistics match those requested by the counts vector
		'''
		# Seed numpy's random function to make the test reproducible
		np.random.seed(1)

		# Make a sampler with probabilities proportional to counts
		counts = range(1,6)
		sampler = Categorical(counts)

		# Draw one hundred thousand samples, then total up the fraction of
		# each outcome observed
		counter = Counter(sampler.sample((100000,)))
		total = float(sum(counter.values()))
		found_normalized = [
			counter[i] / total for i in range(len(counts))
		]

		# Make a list of the expected fractions by which each outcome
		# should be observed, in the limit of infinitely many samples
		total_in_expected = float(sum(counts))
		expected_normalized = [
			c / total_in_expected for c in counts
		]

		# Check if each outcome was observed with a fraction that is within
		# 0.005 of the expected fraction
		close = [
			abs(f - e) < 0.005
			for f,e in zip(found_normalized, expected_normalized)
		]
		self.assertTrue(all(close))
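
A standalone sketch of the same statistical check, assuming only that
Categorical draws integers in range(len(counts)) with probability
proportional to counts; np.random.choice stands in for the sampler here.

import numpy as np
from collections import Counter

np.random.seed(1)
counts = range(1, 6)
expected = np.array(counts, dtype=float)
expected /= expected.sum()

# Draw 100,000 samples and measure each outcome's empirical frequency
samples = np.random.choice(len(expected), size=100000, p=expected)
counter = Counter(samples)
total = float(sum(counter.values()))
found = [counter[i] / total for i in range(len(expected))]

# Deviations should sit well inside the test's 0.005 tolerance
print([round(abs(f - e), 4) for f, e in zip(found, expected)])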
Example #2
    def test_multinomial_sampler(self):
        counts = range(1, 6)
        sampler = Categorical(counts)

        # Test asking for a single sample (where no shape tuple supplied)
        single_sample = sampler.sample()
        self.assertTrue(type(single_sample) is np.int64)

        # Test asking for an array of samples (by passing a shape tuple)
        shape = (2, 3, 5)
        array_sample = sampler.sample(shape)
        self.assertTrue(type(array_sample) is np.ndarray)
        self.assertTrue(array_sample.shape == shape)
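
For reference, a minimal sketch of the sample() interface these assertions
exercise; MiniCategorical is an assumed stand-in built on np.random.choice,
not the library's actual implementation.

import numpy as np

class MiniCategorical:
    def __init__(self, counts):
        probs = np.array(list(counts), dtype=float)
        self.probs = probs / probs.sum()

    def sample(self, shape=None):
        if shape is None:
            # With size omitted, np.random.choice returns a scalar
            return np.random.choice(len(self.probs), p=self.probs)
        return np.random.choice(len(self.probs), size=shape, p=self.probs)

sampler = MiniCategorical(range(1, 6))
print(type(sampler.sample()))           # a numpy integer scalar
print(sampler.sample((2, 3, 5)).shape)  # (2, 3, 5)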
Example #3
    def get_alias_edge(self, src, dst):
        '''
        Get the alias edge setup lists for a given edge.
        '''
        G = self.G
        p = self.p
        q = self.q

        unnormalized_probs = []
        for dst_nbr in sorted(G.neighbors(dst)):
            if dst_nbr == src:
                # Walking back to the previous node: bias the weight by 1/p
                unnormalized_probs.append(G[dst][dst_nbr]['weight'] / p)
            elif G.has_edge(dst_nbr, src):
                # Neighbor is one hop from src: keep the weight (BFS-like)
                unnormalized_probs.append(G[dst][dst_nbr]['weight'])
            else:
                # Neighbor is farther from src: bias the weight by 1/q (DFS-like)
                unnormalized_probs.append(G[dst][dst_nbr]['weight'] / q)
        unnormalized_probs = [abs(u) for u in unnormalized_probs]

        norm_const = sum(unnormalized_probs)
        normalized_probs = [
            float(u_prob) / norm_const for u_prob in unnormalized_probs
        ]

        # An empty probability list (dst has no neighbors) short-circuits
        # and is returned as-is; otherwise wrap it in a Categorical sampler
        return normalized_probs and Categorical(normalized_probs)
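
The p/q branching above is node2vec's second-order walk bias: p controls the
chance of stepping back to the previous node, while q trades off staying near
src (BFS-like) against moving away (DFS-like). A hedged usage sketch on a tiny
weighted graph, written as a standalone function; the graph, weights, and
parameter values are made up for illustration.

import networkx as nx

def edge_probs(G, src, dst, p, q):
    probs = []
    for nbr in sorted(G.neighbors(dst)):
        w = abs(G[dst][nbr]['weight'])
        if nbr == src:
            probs.append(w / p)   # step back to the previous node
        elif G.has_edge(nbr, src):
            probs.append(w)       # stay one hop from src
        else:
            probs.append(w / q)   # move farther from src
    total = sum(probs)
    return [x / total for x in probs]

G = nx.Graph()
G.add_weighted_edges_from(
    [('a', 'b', 1.0), ('b', 'c', 2.0), ('a', 'c', 1.0), ('c', 'd', 1.0)])
print(edge_probs(G, 'a', 'b', p=2.0, q=0.5))  # [0.2, 0.8]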
Example #4
    def preprocessorMain(self):
        self.removeTargetColumn()
        while True:
            print("\nTasks (Preprocessing)\n")
            for task in self.tasks:
                print(task)
            while True:
                try:
                    choice = int(
                        input("\nWhat do you want to do? [enter -1 to exit]:  "))
                except ValueError:
                    print("Integer Value required. Try again.....")
                    continue
                break
            if choice == -1:
                exit()
            elif choice == 1:
                DataDescription(self.data).describe()
            elif choice == 2:
                self.data = Imputation(self.data).imputer()
            elif choice == 3:
                self.data = Categorical(self.data).categoricalMain()
            elif choice == 4:
                self.data = FeatureScaling(self.data).scaling()
            elif choice == 5:
                Download(self.data).download()
            else:
                print("\nWrong choice!! Try again...")
Example #5
	def choose_token(self, idx, length):
		'''
		Randomly choose a token according to the kernel supplied
		in the constructor.  Note that when sampling the context near
		the beginning of a sentence, the left part of the context window
		will be truncated.  Similarly, sampling context near the end of
		a sentence leads to truncation of the right part of the context
		window.  Short sentences lead to truncation on both sides.

		To ensure that samples are returned within the possibly truncated
		window, two values define the actual extent of the context to be
		sampled:

		`idx`: index of the query word within the context.  E.g. if the
			valid context is constrained to a sentence, and the query word
			is the 3rd token in the sentence, idx should be 2 (because
			of 0-based indexing)

		`length`: length of the context.  E.g. if the context is
			constrained to a sentence, and the sentence is 7 tokens long,
			length should be 7.
		'''

		# If the token is near the edges of the context, then the
		# sampling kernel will be truncated (we can't sample before the
		# first word in the sentence, or after the last word).
		# Determine the slice indices that define the truncated kernel.
		negative_idx = length - idx
		start = max(0, self.K - idx)
		stop = min(2*self.K, self.K + negative_idx - 1)

		# We make a separate multinomial sampler for each different
		# truncation of the kernel, because they each define a different
		# set of sampling probabilities.  If we don't have a sampler for
		# this particular kernel shape, make one.
		if (start, stop) not in self.samplers:

			trunc_probabilities = self.kernel[start:stop]
			self.samplers[start,stop] = (
				Categorical(trunc_probabilities)
			)

		# Sample from the multinomial sampler for the context of this shape
		outcome_idx = self.samplers[start,stop].sample()

		# Map this into the +/- indexing relative to the query word
		relative_idx = self.indices[outcome_idx + start]

		# And then map this into absolute indexing
		result_idx = relative_idx + idx

		return result_idx
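
A worked sketch of the truncation arithmetic above, assuming K = 2 (a kernel
with 2*K entries, one per context offset):

K = 2

def kernel_slice(idx, length):
    negative_idx = length - idx
    start = max(0, K - idx)
    stop = min(2 * K, K + negative_idx - 1)
    return start, stop

print(kernel_slice(0, 7))  # (2, 4): left half truncated at sentence start
print(kernel_slice(3, 7))  # (0, 4): the full kernel fits
print(kernel_slice(6, 7))  # (0, 2): right half truncated at sentence end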
Example #6
        def assign_probablistic_sampler_per_node(node):
            unnormalized_probs = []
            norm_const = 0
            for nbr in sorted(G.neighbors(node)):
                abs_prob = abs(G[node][nbr]['weight'])
                unnormalized_probs.append(abs_prob)
                norm_const += abs_prob

            normalized_probs = [
                float(u_prob) / norm_const for u_prob in unnormalized_probs
            ]

            # alias_nodes[node] holds a sampler (built with the alias method)
            # over the random-walk transition distribution of node's
            # neighbors; the probability list is ordered by sorted neighbor.
            alias_nodes[node] = normalized_probs and Categorical(
                normalized_probs)
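
The comment above says the sampler uses the alias method; a minimal standalone
sketch of that technique (Walker's alias method), independent of the library,
for a distribution that sums to 1:

import random

def build_alias(probs):
    # O(n) setup: split the mass into n columns of equal total
    n = len(probs)
    scaled = [p * n for p in probs]
    accept = [1.0] * n
    alias = [0] * n
    small = [i for i, s in enumerate(scaled) if s < 1.0]
    large = [i for i, s in enumerate(scaled) if s >= 1.0]
    while small and large:
        s, l = small.pop(), large.pop()
        accept[s] = scaled[s]
        alias[s] = l
        scaled[l] -= 1.0 - scaled[s]
        (small if scaled[l] < 1.0 else large).append(l)
    return accept, alias

def alias_sample(accept, alias):
    # O(1) draw: pick a column uniformly, then accept it or take its alias
    i = random.randrange(len(accept))
    return i if random.random() < accept[i] else alias[i]

accept, alias = build_alias([0.25, 0.75])
draws = [alias_sample(accept, alias) for _ in range(10000)]
print(draws.count(1) / 10000.0)  # close to 0.75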