Example #1
0
    def _run_wordcount_it(self, **opts):
        """Drive the wordcount example as an integration test.

        Builds a unique output location, attaches success verifiers, merges
        any caller-supplied pipeline options (which take precedence), registers
        output cleanup, and invokes the wordcount pipeline's main entry point.
        """
        test_pipeline = TestPipeline(is_integration_test=True)

        # Millisecond timestamp keeps concurrent runs from sharing an output dir.
        timestamp_ms = int(time.time() * 1000)
        output = '/'.join(
            [test_pipeline.get_option('output'), str(timestamp_ms), 'results'])

        # Optional polling delay for the checksum verifier.
        raw_sleep_secs = test_pipeline.get_option('sleep_secs')
        sleep_secs = None if raw_sleep_secs is None else int(raw_sleep_secs)

        # Verify both the final pipeline state and the output checksum.
        verifier = all_of(
            PipelineStateMatcher(),
            FileChecksumMatcher(output + '*-of-*', self.DEFAULT_CHECKSUM,
                                sleep_secs))
        extra_opts = {'output': output, 'on_success_matcher': verifier}
        extra_opts.update(opts)

        # Remove pipeline output even if the assertions fail.
        self.addCleanup(delete_files, [output + '*'])

        # Options come from --test-pipeline-options; run the pipeline main.
        wordcount.run(test_pipeline.get_full_options_as_args(**extra_opts))
Example #2
0
 def test_basics(self):
   """Run wordcount over the sample text and verify the word frequencies."""
   temp_path = self.create_temp_file(self.SAMPLE_TEXT)
   # Expected frequencies computed directly from the sample text.
   expected_words = collections.Counter(
       re.findall(r'[\w\']+', self.SAMPLE_TEXT, re.UNICODE))
   wordcount.run(['--input=%s*' % temp_path, '--output=%s.result' % temp_path],
                 save_main_session=False)
   # Parse result file and compare.
   results = []
   with open_shards(temp_path + '.result-*-of-*') as result_file:
     for line in result_file:
       parsed = re.search(r'(\S+): ([0-9]+)', line)
       if parsed:
         results.append((parsed.group(1), int(parsed.group(2))))
   self.assertEqual(sorted(results), sorted(expected_words.items()))
  def test_wordcount_it(self):
    """Integration test: run wordcount and verify state plus output checksum."""
    test_pipeline = TestPipeline(is_integration_test=True)

    # Job-name-scoped output path keeps runs isolated from each other.
    output = '/'.join([
        test_pipeline.get_option('output'),
        test_pipeline.get_option('job_name'),
        'results',
    ])
    # Verify both the final pipeline state and the output checksum.
    on_success = all_of(
        PipelineStateMatcher(),
        FileChecksumMatcher(output + '*-of-*', self.DEFAULT_CHECKSUM))
    extra_opts = {'output': output, 'on_success_matcher': on_success}

    # Pipeline options come from --test-pipeline-options; invoke the main.
    wordcount.run(test_pipeline.get_full_options_as_args(**extra_opts))
 def test_basics(self):
   """Run wordcount over the sample text and verify the word frequencies."""
   temp_path = self.create_temp_file(self.SAMPLE_TEXT)
   expected_words = collections.defaultdict(int)
   for word in re.findall(r'\w+', self.SAMPLE_TEXT):
     expected_words[word] += 1
   wordcount.run([
       '--input=%s*' % temp_path,
       '--output=%s.result' % temp_path])
   # Parse result file and compare.
   results = []
   with open(temp_path + '.result-00000-of-00001') as result_file:
     for line in result_file:
       match = re.search(r'([a-z]+): ([0-9]+)', line)
       if match is not None:
         results.append((match.group(1), int(match.group(2))))
   # dict.iteritems() was removed in Python 3; items() works on 2 and 3 alike.
   self.assertEqual(sorted(results), sorted(expected_words.items()))
Example #5
0
 def test_basics(self):
     """Run wordcount over the sample text and verify the word frequencies."""
     temp_path = self.create_temp_file(self.SAMPLE_TEXT)
     expected_words = collections.defaultdict(int)
     for word in re.findall(r'\w+', self.SAMPLE_TEXT):
         expected_words[word] += 1
     wordcount.run([
         '--input=%s*' % temp_path,
         '--output=%s.result' % temp_path])
     # Parse result file and compare.
     results = []
     with open(temp_path + '.result-00000-of-00001') as result_file:
         for line in result_file:
             match = re.search(r'([a-z]+): ([0-9]+)', line)
             if match is not None:
                 results.append((match.group(1), int(match.group(2))))
     # dict.iteritems() was removed in Python 3; items() works on 2 and 3 alike.
     self.assertEqual(sorted(results), sorted(expected_words.items()))
 def test_basics(self):
   """Run wordcount over the sample text and verify the word frequencies."""
   temp_path = self.create_temp_file(self.SAMPLE_TEXT)
   expected_words = collections.defaultdict(int)
   for word in re.findall(r'[\w\']+', self.SAMPLE_TEXT, re.UNICODE):
     # Count str keys, not word.encode('utf-8'): on Python 3 bytes keys would
     # never compare equal to the str tokens parsed from the result file below,
     # so sorted()/assertEqual would fail with a bytes-vs-str mismatch.
     expected_words[word] += 1
   wordcount.run([
       '--input=%s*' % temp_path,
       '--output=%s.result' % temp_path])
   # Parse result file and compare.
   results = []
   with open_shards(temp_path + '.result-*-of-*') as result_file:
     for line in result_file:
       match = re.search(r'(\S+): ([0-9]+)', line, re.UNICODE)
       if match is not None:
         results.append((match.group(1), int(match.group(2))))
   self.assertEqual(sorted(results), sorted(expected_words.items()))
Example #7
0
    def test_wordcount_it(self):
        """Integration test: run wordcount and verify state plus checksum."""
        test_pipeline = TestPipeline(is_integration_test=True)

        # Job-name-scoped output path keeps runs isolated from each other.
        output = '/'.join([test_pipeline.get_option('output'),
                           test_pipeline.get_option('job_name'),
                           'results'])
        # Verify both the final pipeline state and the output checksum.
        on_success = all_of(
            PipelineStateMatcher(),
            FileChecksumMatcher(output + '*-of-*', self.DEFAULT_CHECKSUM))
        extra_opts = {'output': output, 'on_success_matcher': on_success}

        # Pipeline options come from --test-pipeline-options; invoke the main.
        wordcount.run(test_pipeline.get_full_options_as_args(**extra_opts))
  def test_wordcount_it(self):
    """Integration test: run wordcount, verify state and output checksum,
    cleaning up the timestamped output directory afterwards."""
    test_pipeline = TestPipeline(is_integration_test=True)

    # Timestamped output directory so reruns never collide.
    output = '/'.join([
        test_pipeline.get_option('output'),
        str(int(time.time())),
        'results',
    ])
    # Optional polling delay for the checksum verifier.
    raw_sleep_secs = test_pipeline.get_option('sleep_secs')
    if raw_sleep_secs is None:
      sleep_secs = None
    else:
      sleep_secs = int(raw_sleep_secs)
    pipeline_verifiers = [
        PipelineStateMatcher(),
        FileChecksumMatcher(output + '*-of-*',
                            self.DEFAULT_CHECKSUM,
                            sleep_secs),
    ]
    extra_opts = {
        'output': output,
        'on_success_matcher': all_of(*pipeline_verifiers),
    }

    # Remove pipeline output regardless of the test outcome.
    self.addCleanup(delete_files, [output + '*'])

    # Build args from --test-pipeline-options and call the pipeline main.
    wordcount.run(test_pipeline.get_full_options_as_args(**extra_opts))
def run_wordcount_example():
    """Run the wordcount example pipeline with its default options.

    Thin wrapper around ``wordcount.run()``; all pipeline options fall back
    to the defaults defined by the wordcount module itself.
    """
    wordcount.run()