def run(self):
    # AERS_SGML_2007q4.ZIP has files in sqml, so the character class
    # 's[gq]ml' matches both the correct and the misspelled directory name.
    filenames = []
    for input in self.input():
        sgml_path = '/s[gq]ml/*.SGM'
        xml_path = '/[Xx][Mm][Ll]/*.xml'
        logging.info('Checking for inputs in: %s', input.path)
        filenames.extend(glob.glob(input.path + sgml_path))
        filenames.extend(glob.glob(input.path + xml_path))

    assert len(filenames) > 0, 'No files to process for quarter? %s' % self.quarter

    # FDA-provided test files are not real safety reports; skip them.
    input_shards = []
    for filename in filenames:
        if 'test' in filename.lower():
            continue
        logging.info('Adding input file to pool: %s', filename)
        input_shards.append(filename)

    # Extract safety reports from each SGML/XML file and merge them into
    # 16 output shards at self.output().path, collecting per-timestamp
    # report counts from the reducers.
    report_counts = parallel.mapreduce(
        parallel.Collection.from_list(input_shards),
        xml_to_json.ExtractSafetyReportsMapper(),
        xml_to_json.MergeSafetyReportsReducer(),
        self.output().path,
        num_shards=16)

    # Sum the per-shard counts by timestamp for a quick sanity report.
    combined_counts = collections.defaultdict(int)
    for rc in report_counts:
        for timestamp, count in rc.iteritems():
            combined_counts[timestamp] += count

    print '----REPORT COUNTS----'
    for timestamp, count in sorted(combined_counts.items()):
        print '>> ', timestamp, count
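
# Aside: a minimal, self-contained illustration of why the glob pattern uses
# 's[gq]ml'. glob matches names via fnmatch, which treats '[gq]' as "exactly
# one of g or q", so a single pattern covers both the normal 'sgml' directory
# and the misspelled 'sqml' directory found in AERS_SGML_2007q4.ZIP. The
# file names below are invented for illustration.
import fnmatch

for name in ['AERS_SGML_2007q4/sgml/ADR07Q4.SGM',
             'AERS_SGML_2007q4/sqml/ADR07Q4.SGM',
             'AERS_SGML_2007q4/test/TEST.SGM']:
    print('%s -> %s' % (name, fnmatch.fnmatch(name, '*/s[gq]ml/*.SGM')))
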
def run(self):
    logging.info('Pipelining...')
    # AERS_SGML_2007q4.ZIP has files in sqml, so 's[gq]ml' matches both
    # the correct and the misspelled directory name.
    sgml_path = '/AERS_SGML_*/s[gq]ml/*.SGM'
    xml_path = '/FAERS_XML*/[Xx][Mm][Ll]/*.xml'
    filenames = glob.glob(self.input().path + sgml_path)
    filenames.extend(glob.glob(self.input().path + xml_path))

    # FDA-provided test files are not real safety reports; skip them.
    input_shards = []
    for filename in filenames:
        if 'test' in filename.lower():
            continue
        logging.info('Adding input file to pool: %s', filename)
        input_shards.append(filename)

    report_counts = parallel.mapreduce(
        parallel.Collection.from_list(input_shards),
        xml_to_json.ExtractSafetyReportsMapper(),
        xml_to_json.MergeSafetyReportsReducer(),
        self.output().path,
        num_shards=10)

    # Sum the per-shard counts by timestamp for a quick sanity report.
    combined_counts = collections.defaultdict(int)
    for rc in report_counts:
        for timestamp, count in rc.iteritems():
            combined_counts[timestamp] += count

    print '----REPORT COUNTS----'
    for timestamp, count in sorted(combined_counts.items()):
        print '>> ', timestamp, count
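
# Aside: a minimal sketch of the count-merging step above, runnable on its
# own. Each reducer shard is assumed to return a dict of
# {timestamp: report_count}; summing into a defaultdict(int) is what the
# combined_counts loop does. The shard values here are invented.
import collections

report_counts = [
    {'20071001': 120, '20071101': 95},
    {'20071001': 30, '20071201': 44},
]

combined_counts = collections.defaultdict(int)
for rc in report_counts:
    for timestamp, count in rc.items():
        combined_counts[timestamp] += count

for timestamp, count in sorted(combined_counts.items()):
    print('%s %s' % (timestamp, count))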