def reducer(self, n, vars):
    """Merge the mappers' samples into one proportionally weighted sample.

    Every value in ``vars`` is a JSON string that decodes to a sequence
    whose first item is the number of records a mapper saw and whose second
    item is the list of samples that mapper kept.  Each mapper contributes a
    share of ``self.options.sample_size`` proportional to its record count.

    Yields ``(1, sample)`` pairs.  Nothing is yielded when the mappers
    report no records at all.
    """
    MRJob.set_status(self, "=============>  reducer called")

    # Read every mapper's record count first: the total is needed before any
    # mapper's share of the final sample can be computed.
    counts_from_mappers = []
    samples_from_mappers = []
    for encoded in vars:
        record = json.loads(encoded)
        counts_from_mappers.append(record[0])
        samples_from_mappers.append(record[1])

    total_count = sum(counts_from_mappers)
    if not total_count:
        # No records anywhere: there is nothing to weight by, and dividing
        # by the total below would raise ZeroDivisionError.
        return

    sample_size = self.options.sample_size
    for count, sample_set in zip(counts_from_mappers, samples_from_mappers):
        weight = count * 1.0 / total_count
        quota = int(round(weight * sample_size))

        # Never pop more samples than this mapper actually supplied.
        for _ in range(min(quota, len(sample_set))):
            yield 1, sample_set.pop()
Exemple #2
0
    def test_counters_and_status(self):
        # Counters and statuses emitted by a sandboxed job must be
        # recoverable by parsing what the job wrote to stderr.
        job = MRJob().sandbox()

        job.increment_counter('Foo', 'Bar')
        job.set_status('Initializing qux gradients...')
        job.increment_counter('Foo', 'Bar')
        job.increment_counter('Foo', 'Baz', 20)
        job.set_status('Sorting metasyntactic variables...')

        expected = {
            'counters': {'Foo': {'Bar': 2, 'Baz': 20}},
            'statuses': [
                'Initializing qux gradients...',
                'Sorting metasyntactic variables...',
            ],
            'other': [],
        }

        actual = parse_mr_job_stderr(job.stderr.getvalue())
        self.assertEqual(actual, expected)
Exemple #3
0
    def test_counters_and_status(self):
        # Counters and statuses emitted by a sandboxed job must be
        # recoverable by parsing what the job wrote to stderr.
        job = MRJob().sandbox()

        job.increment_counter('Foo', 'Bar')
        job.set_status('Initializing qux gradients...')
        job.increment_counter('Foo', 'Bar')
        job.increment_counter('Foo', 'Baz', 20)
        job.set_status('Sorting metasyntactic variables...')

        expected = {
            'counters': {'Foo': {'Bar': 2, 'Baz': 20}},
            'statuses': [
                'Initializing qux gradients...',
                'Sorting metasyntactic variables...',
            ],
            'other': [],
        }

        actual = parse_mr_job_stderr(job.stderr.getvalue())
        self.assertEqual(actual, expected)

        # The job's own parse_counters() must agree with the parsed stderr.
        self.assertEqual(job.parse_counters(), actual['counters'])
Exemple #4
0
    def reducer(self, n, vars):
        MRJob.set_status(self, "=============>  reducer called")

        print "reducer:", vars
        samples_from_mappers = []
        counts_from_mappers = []

        # First read all the counts from different mappers fo we know the total number of items and we can give
        # each of the sets coming from different mappers their appropriate weight
        total_counts_from_mappers = 0

        for x in vars:
            input = json.loads(x)
            total_counts_from_mappers += input[0]

            counts_from_mappers.append(input[0])
            samples_from_mappers.append(input[1])

        # Now based on the number of samples in each mapper we need to select appropriate number of samples form
        # samples_from_mappers
        i = 0

        fileOut=open(os.path.join(PROJECT_ROOT , 'output.txt'),"w")

        for sample_set in samples_from_mappers:
            weight = counts_from_mappers[i] * 1.0 / total_counts_from_mappers
            number_of_needed_samples = int(round(weight * self.options.sample_size))

            for j in range(number_of_needed_samples):
                fileOut.write(str(sample_set.pop()) + '\n')


            i += 1
        fileOut.close()
        if False: yield 1,2
Exemple #5
0
    def mapper_final(self):
        """Emit this mapper's record count and samples as one JSON string.

        Yields a single ``(1, json_text)`` pair where ``json_text`` encodes
        ``[self.count, self.samples]``.
        """
        MRJob.set_status(self, "=============>  mapper final called")

        yield 1, json.dumps([self.count, self.samples])
    def reducer(self, n, vars):
        """Merge the mappers' samples into one proportionally weighted sample.

        Every value in ``vars`` is a JSON string that decodes to a sequence
        whose first item is the number of records a mapper saw and whose
        second item is the list of samples that mapper kept.  Each mapper
        contributes a share of ``self.options.sample_size`` proportional to
        its record count.

        Yields ``(1, sample)`` pairs.  Nothing is yielded when the mappers
        report no records at all.
        """
        MRJob.set_status(self, "=============>  reducer called")

        # Read every mapper's record count first: the total is needed before
        # any mapper's share of the final sample can be computed.
        counts_from_mappers = []
        samples_from_mappers = []
        for encoded in vars:
            record = json.loads(encoded)
            counts_from_mappers.append(record[0])
            samples_from_mappers.append(record[1])

        total_count = sum(counts_from_mappers)
        if not total_count:
            # No records anywhere: there is nothing to weight by, and
            # dividing by the total below would raise ZeroDivisionError.
            return

        sample_size = self.options.sample_size
        for count, sample_set in zip(counts_from_mappers,
                                     samples_from_mappers):
            weight = count * 1.0 / total_count
            quota = int(round(weight * sample_size))

            # Never pop more samples than this mapper actually supplied.
            for _ in range(min(quota, len(sample_set))):
                yield 1, sample_set.pop()
    def mapper_final(self):
        """Emit this mapper's record count and samples as one JSON string.

        Yields a single ``(1, json_text)`` pair where ``json_text`` encodes
        ``[self.count, self.samples]``.
        """
        MRJob.set_status(self, "=============>  mapper final called")

        payload = json.dumps([self.count, self.samples])
        yield 1, payload
Exemple #8
0
    def reducer_final(self):
        """Emit every label in ``self.output`` with its samples and their count.

        Yields ``(label, (number_of_samples, samples))`` for each label.
        """
        MRJob.set_status(self, "=============>  reducer final called")

        collected = self.output
        for key in collected:
            members = collected[key]
            size = len(members)
            yield key, (size, members)
Exemple #9
0
    def mapper_final(self):
        # Reports a status message, then walks every label in self.strata.
        # NOTE(review): this excerpt is cut off -- the `pass` below is
        # dedented and the `else:` branch has no body, so what happens to
        # each non-empty stratum is not visible here.

        MRJob.set_status(self, "=============>  mapper final called")

        for label in self.strata:
            stratum = self.strata[label]
            # Number of samples for this stratum: its size scaled by the
            # sampling_rate option, truncated to an integer.
            number_of_samples = int( len(stratum) * self.options.sampling_rate )

            if not stratum: # stratum should not be empty
        pass
            else:
    def mapper(self, key, line):
        """Feed one input line into this mapper's reservoir sample.

        ``self.samples`` holds at most ``self.options.sample_size`` lines.
        While the reservoir is not yet full, the line is appended.  After
        that, the line replaces a uniformly chosen slot with probability
        ``sample_size / self.count``, where ``self.count`` is the number of
        lines seen so far including this one.

        ``key`` is unused.  Nothing is yielded by this method.
        """
        MRJob.set_status(self, "=============>  mapper called")

        sample_size = self.options.sample_size

        self.count += 1
        if len(self.samples) < sample_size:
            # Strict "<" keeps the reservoir at exactly sample_size entries.
            self.samples.append(line)
        elif random.random() < (sample_size * 1.0) / self.count:
            # randrange excludes its upper bound, so the index always falls
            # inside the sample_size-long reservoir.
            self.samples[random.randrange(sample_size)] = line
    def mapper(self, key, line):
        """Feed one input line into this mapper's reservoir sample.

        ``self.samples`` holds at most ``self.options.sample_size`` lines.
        While the reservoir is not yet full, the line is appended.  After
        that, the line replaces a uniformly chosen slot with probability
        ``sample_size / self.count``, where ``self.count`` is the number of
        lines seen so far including this one.

        ``key`` is unused.  Nothing is yielded by this method.
        """
        MRJob.set_status(self, "=============>  mapper called")

        sample_size = self.options.sample_size

        self.count += 1
        if len(self.samples) < sample_size:
            # Strict "<" keeps the reservoir at exactly sample_size entries.
            self.samples.append(line)
        elif random.random() < (sample_size * 1.0) / self.count:
            # randrange excludes its upper bound, so the index always falls
            # inside the sample_size-long reservoir.
            self.samples[random.randrange(sample_size)] = line
Exemple #12
0
    def test_counters_and_status(self):
        # Counters and statuses emitted by a sandboxed job must be
        # recoverable by parsing what the job wrote to stderr.
        job = MRJob().sandbox()

        job.increment_counter("Foo", "Bar")
        job.set_status("Initializing qux gradients...")
        job.increment_counter("Foo", "Bar")
        job.increment_counter("Foo", "Baz", 20)
        job.set_status("Sorting metasyntactic variables...")

        expected_counters = {"Foo": {"Bar": 2, "Baz": 20}}
        expected_statuses = [
            "Initializing qux gradients...",
            "Sorting metasyntactic variables...",
        ]
        expected = {
            "counters": expected_counters,
            "statuses": expected_statuses,
            "other": [],
        }

        actual = parse_mr_job_stderr(job.stderr.getvalue())
        self.assertEqual(actual, expected)
Exemple #13
0
 def test_unicode_set_status(self):
     # Passing a non-ASCII unicode status must not raise; the test has no
     # assertion beyond the call completing.
     sandboxed_job = MRJob().sandbox()
     sandboxed_job.set_status(u'💩')
Exemple #14
0
 def test_unicode_set_status(self):
     # Passing a non-ASCII unicode status must not raise; the test has no
     # assertion beyond the call completing.
     MRJob().sandbox().set_status(u'💩')
Exemple #15
0
 def mapper_init(self):
     """Set the job status message announcing that mapper init was called."""
     MRJob.set_status(self, "=============>  mapper init called")