def reducer(self, n, vars):
    """Merge the per-mapper reservoirs into one weighted global sample.

    Each mapper emits a single ``(count, samples)`` pair encoded as a JSON
    string.  The reducer weights every mapper's reservoir by its share of
    the total record count and yields a proportional number of samples
    from each reservoir.

    :param n: the constant key all mappers emitted under; unused.
    :param vars: iterable of JSON strings, each decoding to
        ``[count, samples]``.
    """
    MRJob.set_status(self, "=============> reducer called")
    samples_from_mappers = []
    counts_from_mappers = []
    # First read all the counts from the different mappers so we know the
    # total number of items and can give each mapper's sample set its
    # appropriate weight.
    total_counts_from_mappers = 0
    for x in vars:
        # Renamed from ``input`` to avoid shadowing the builtin.
        decoded = json.loads(x)
        total_counts_from_mappers += decoded[0]
        counts_from_mappers.append(decoded[0])
        samples_from_mappers.append(decoded[1])
    # Based on how many records each mapper saw, pull a proportional
    # number of samples from that mapper's reservoir.  zip() replaces the
    # original hand-maintained index variable.
    for count, sample_set in zip(counts_from_mappers, samples_from_mappers):
        weight = count * 1.0 / total_counts_from_mappers
        number_of_needed_samples = int(round(weight * self.options.sample_size))
        for _ in range(number_of_needed_samples):
            yield 1, sample_set.pop()
def test_counters_and_status(self):
    """Counters accumulate across calls; statuses keep emission order."""
    job = MRJob().sandbox()

    job.increment_counter('Foo', 'Bar')
    job.set_status('Initializing qux gradients...')
    job.increment_counter('Foo', 'Bar')
    job.increment_counter('Foo', 'Baz', 20)
    job.set_status('Sorting metasyntactic variables...')

    expected = {
        'counters': {'Foo': {'Bar': 2, 'Baz': 20}},
        'statuses': ['Initializing qux gradients...',
                     'Sorting metasyntactic variables...'],
        'other': [],
    }
    parsed_stderr = parse_mr_job_stderr(job.stderr.getvalue())
    self.assertEqual(parsed_stderr, expected)
def test_counters_and_status(self):
    """Counters accumulate, statuses keep order, parse_counters() agrees."""
    job = MRJob().sandbox()

    job.increment_counter('Foo', 'Bar')
    job.set_status('Initializing qux gradients...')
    job.increment_counter('Foo', 'Bar')
    job.increment_counter('Foo', 'Baz', 20)
    job.set_status('Sorting metasyntactic variables...')

    expected = {
        'counters': {'Foo': {'Bar': 2, 'Baz': 20}},
        'statuses': ['Initializing qux gradients...',
                     'Sorting metasyntactic variables...'],
        'other': [],
    }
    parsed = parse_mr_job_stderr(job.stderr.getvalue())
    self.assertEqual(parsed, expected)

    # make sure parse_counters() works
    self.assertEqual(job.parse_counters(), parsed['counters'])
def reducer(self, n, vars): MRJob.set_status(self, "=============> reducer called") print "reducer:", vars samples_from_mappers = [] counts_from_mappers = [] # First read all the counts from different mappers fo we know the total number of items and we can give # each of the sets coming from different mappers their appropriate weight total_counts_from_mappers = 0 for x in vars: input = json.loads(x) total_counts_from_mappers += input[0] counts_from_mappers.append(input[0]) samples_from_mappers.append(input[1]) # Now based on the number of samples in each mapper we need to select appropriate number of samples form # samples_from_mappers i = 0 fileOut=open(os.path.join(PROJECT_ROOT , 'output.txt'),"w") for sample_set in samples_from_mappers: weight = counts_from_mappers[i] * 1.0 / total_counts_from_mappers number_of_needed_samples = int(round(weight * self.options.sample_size)) for j in range(number_of_needed_samples): fileOut.write(str(sample_set.pop()) + '\n') i += 1 fileOut.close() if False: yield 1,2
def mapper_final(self):
    """Ship this mapper's record count and reservoir as one JSON payload."""
    MRJob.set_status(self, "=============> mapper final called")
    payload = json.dumps([self.count, self.samples])
    yield 1, payload
def reducer(self, n, vars):
    """Merge the per-mapper reservoirs into one weighted global sample.

    Each mapper emits a single ``(count, samples)`` pair encoded as a JSON
    string.  The reducer weights every mapper's reservoir by its share of
    the total record count and yields a proportional number of samples
    from each reservoir.

    :param n: the constant key all mappers emitted under; unused.
    :param vars: iterable of JSON strings, each decoding to
        ``[count, samples]``.
    """
    MRJob.set_status(self, "=============> reducer called")
    samples_from_mappers = []
    counts_from_mappers = []
    # First read all the counts from the different mappers so we know the
    # total number of items and can give each mapper's sample set its
    # appropriate weight.
    total_counts_from_mappers = 0
    for x in vars:
        # Renamed from ``input`` to avoid shadowing the builtin.
        decoded = json.loads(x)
        total_counts_from_mappers += decoded[0]
        counts_from_mappers.append(decoded[0])
        samples_from_mappers.append(decoded[1])
    # Based on how many records each mapper saw, pull a proportional
    # number of samples from that mapper's reservoir.  zip() replaces the
    # original hand-maintained index variable.
    for count, sample_set in zip(counts_from_mappers, samples_from_mappers):
        weight = count * 1.0 / total_counts_from_mappers
        number_of_needed_samples = int(
            round(weight * self.options.sample_size))
        for _ in range(number_of_needed_samples):
            yield 1, sample_set.pop()
def mapper_final(self):
    """Emit the total record count plus the sampled lines, JSON-encoded."""
    MRJob.set_status(self, "=============> mapper final called")
    encoded = json.dumps([self.count, self.samples])
    yield 1, encoded
def reducer_final(self):
    """Emit each stratum's label with its sample count and samples."""
    MRJob.set_status(self, "=============> reducer final called")
    # self.output maps stratum label -> list of sampled records.
    for label, stratum_samples in self.output.items():
        yield label, (len(stratum_samples), stratum_samples)
def mapper_final(self): MRJob.set_status(self, "=============> mapper final called") for label in self.strata: stratum = self.strata[label] number_of_samples = int( len(stratum) * self.options.sampling_rate ) if not stratum: # stratum should not be empty pass else:
def mapper(self, key, line):
    """Reservoir-sample input lines (Vitter's Algorithm R).

    Maintains a uniform random sample of at most ``sample_size`` lines in
    ``self.samples`` no matter how many lines this mapper sees.  Emits
    nothing here; mapper_final ships the reservoir downstream.

    :param key: input key; unused.
    :param line: one input record.
    """
    MRJob.set_status(self, "=============> mapper called")
    self.count += 1
    if len(self.samples) < self.options.sample_size:
        # Reservoir not yet full: keep every line.
        self.samples.append(line)
    else:
        # Replace a uniformly chosen existing sample with probability
        # sample_size / count.  The original used ``<=`` (letting the
        # reservoir grow to sample_size + 1 entries) and drew the
        # replacement index with randint(0, sample_size) — randint's
        # upper bound is inclusive — an off-by-one on both counts.
        index = random.randint(0, self.count - 1)
        if index < self.options.sample_size:
            self.samples[index] = line
def mapper(self, key, line):
    """Reservoir-sample input lines (Vitter's Algorithm R).

    Maintains a uniform random sample of at most ``sample_size`` lines in
    ``self.samples`` no matter how many lines this mapper sees.  Emits
    nothing here; mapper_final ships the reservoir downstream.

    :param key: input key; unused.
    :param line: one input record.
    """
    MRJob.set_status(self, "=============> mapper called")
    self.count += 1
    if len(self.samples) < self.options.sample_size:
        # Reservoir not yet full: keep every line.
        self.samples.append(line)
    else:
        # Replace a uniformly chosen existing sample with probability
        # sample_size / count.  The original used ``<=`` (letting the
        # reservoir grow to sample_size + 1 entries) and drew the
        # replacement index with randint(0, sample_size) — randint's
        # upper bound is inclusive — an off-by-one on both counts.
        index = random.randint(0, self.count - 1)
        if index < self.options.sample_size:
            self.samples[index] = line
def test_counters_and_status(self):
    """Counters accumulate across calls; statuses keep emission order."""
    job = MRJob().sandbox()

    job.increment_counter("Foo", "Bar")
    job.set_status("Initializing qux gradients...")
    job.increment_counter("Foo", "Bar")
    job.increment_counter("Foo", "Baz", 20)
    job.set_status("Sorting metasyntactic variables...")

    expected = {
        "counters": {"Foo": {"Bar": 2, "Baz": 20}},
        "statuses": [
            "Initializing qux gradients...",
            "Sorting metasyntactic variables...",
        ],
        "other": [],
    }
    self.assertEqual(parse_mr_job_stderr(job.stderr.getvalue()), expected)
def test_unicode_set_status(self):
    """set_status() must accept non-ASCII text without raising."""
    job = MRJob().sandbox()
    job.set_status(u'💩')  # shouldn't raise an exception
def test_unicode_set_status(self):
    """A status message containing an emoji is accepted without error."""
    sandboxed_job = MRJob().sandbox()
    # shouldn't raise an exception
    sandboxed_job.set_status(u'💩')
def mapper_init(self):
    """Log that this mapper is starting up via a Hadoop status update."""
    status_message = "=============> mapper init called"
    MRJob.set_status(self, status_message)