Example #1
0
  def run(self):
    """Run the safety-report map/reduce over every input file of this quarter.

    Each target returned by ``self.input()`` is globbed for SGML and XML
    files, files whose path contains 'test' (case-insensitive) are skipped,
    and the remaining paths are passed through
    ``xml_to_json.ExtractSafetyReportsMapper`` and
    ``xml_to_json.MergeSafetyReportsReducer`` into ``self.output().path``
    using 16 shards.  The counts returned by the map/reduce are then summed
    per timestamp and printed to stdout in sorted order.

    Raises:
      AssertionError: if no SGML or XML file is found under any input target.
    """
    # AERS_SGML_2007q4.ZIP has files in sqml, hence the [gq] alternative.
    # Both patterns are loop-invariant, so they are built once up front.
    sgml_path = '/s[gq]ml/*.SGM'
    xml_path = '/[Xx][Mm][Ll]/*.xml'

    filenames = []
    # Named `target` rather than `input` so the builtin is not shadowed.
    for target in self.input():
      logging.info('Checking for inputs in: %s', target.path)
      filenames.extend(glob.glob(target.path + sgml_path))
      filenames.extend(glob.glob(target.path + xml_path))

    # Raised explicitly instead of via `assert` so the guard is not stripped
    # when Python runs with -O; the exception type and message are unchanged.
    if not filenames:
      raise AssertionError(
        'No files to process for quarter? %s' % self.quarter)

    input_shards = []
    for filename in filenames:
      if 'test' in filename.lower():
        continue
      logging.info('Adding input file to pool: %s', filename)
      input_shards.append(filename)

    report_counts = parallel.mapreduce(
      parallel.Collection.from_list(input_shards),
      xml_to_json.ExtractSafetyReportsMapper(),
      xml_to_json.MergeSafetyReportsReducer(),
      self.output().path,
      num_shards=16)

    # NOTE(review): assumes every element of report_counts is a dict-like
    # mapping of timestamp -> count; confirm against the reducer's output.
    combined_counts = collections.defaultdict(int)
    for rc in report_counts:
      for timestamp, count in rc.items():
        combined_counts[timestamp] += count

    # Single-argument print calls produce the same text on Python 2 and 3.
    print('----REPORT COUNTS----')
    for timestamp, count in sorted(combined_counts.items()):
      print('>>  %s %s' % (timestamp, count))
Example #2
0
    def run(self):
        logging.info('Pipelining...')
        # AERS_SGML_2007q4.ZIP has files in sqml
        sgml_path = '/AERS_SGML_*/s[gq]ml/*.SGM'
        xml_path = '/FAERS_XML*/[Xx][Mm][Ll]/*.xml'
        filenames = glob.glob(self.input().path + sgml_path)
        filenames.extend(glob.glob(self.input().path + xml_path))

        input_shards = []
        for filename in filenames:
            if 'test' in filename.lower():
                continue
            logging.info('Adding input file to pool: %s', filename)
            input_shards.append(filename)

        report_counts = parallel.mapreduce(
            parallel.Collection.from_list(input_shards),
            xml_to_json.ExtractSafetyReportsMapper(),
            xml_to_json.MergeSafetyReportsReducer(),
            self.output().path, 10)

        combined_counts = collections.defaultdict(int)
        for rc in report_counts:
            for timestamp, count in rc.iteritems():
                combined_counts[timestamp] += count

        print '----REPORT COUNTS----'
        for timestamp, count in sorted(combined_counts.items()):
            print '>> ', timestamp, count