def run(self):
    # Read the entire packaging DB in memory for speed.
    package_db = parallel.ShardedDB.open(self.input()[0].path).as_dict()
    parallel.mapreduce(
        parallel.Collection.from_sharded(self.input()[1].path),
        mapper=ProductAndPackagingMergingMapper(package_db=package_db),
        reducer=parallel.IdentityReducer(),
        output_prefix=self.output().path,
        num_shards=1)

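# Every task in this file pairs a custom mapper with parallel.IdentityReducer.
# Below is a minimal sketch of what such a mapper might look like, assuming
# the openfda-style interface in which mappers extend parallel.Mapper,
# implement map(self, key, value, output), and emit pairs via
# output.add(key, value). PackageJoinMapper, the 'packaging' field, and the
# assumption that each value is a JSON-style dict are illustrative only.
from openfda import parallel  # assumed import path for the parallel module


class PackageJoinMapper(parallel.Mapper):
    def __init__(self, package_db):
        # The lookup dict is built once in run() (ShardedDB -> dict) and
        # handed to every map worker through the constructor.
        self.package_db = package_db

    def map(self, key, value, output):
        # Join each record with its packaging data (if any), then re-emit
        # it under the same key.
        value['packaging'] = self.package_db.get(key, [])
        output.add(key, value)
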
def run(self):
    harmonized_db = parallel.ShardedDB.open(self.input()[0].path).as_dict()
    json_glob = glob.glob(self.input()[1].path + '/*.json')
    parallel.mapreduce(
        parallel.Collection.from_glob(json_glob, parallel.JSONLineInput()),
        mapper=MaudeAnnotationMapper(harmonized_db=harmonized_db),
        reducer=parallel.IdentityReducer(),
        output_prefix=self.output().path,
        num_shards=10,
        map_workers=5)

def run(self):
    input_dir = self.input().path
    for xml_filename in glob.glob('%(input_dir)s/*.xml' % locals()):
        parallel.mapreduce(
            input_collection=parallel.Collection.from_glob(
                xml_filename, parallel.XMLDictInput),
            mapper=XML2JSONMapper(),
            reducer=parallel.IdentityReducer(),
            output_prefix=self.output().path,
            num_shards=1,
            map_workers=1)

def run(self):
    input_db = self.input()[0].path
    harmonized_file = self.input()[1].path
    parallel.mapreduce(
        parallel.Collection.from_sharded(input_db),
        mapper=annotate.AnnotateMapper(harmonized_file),
        reducer=parallel.IdentityReducer(),
        output_prefix=self.output().path,
        num_shards=4,
        map_workers=1)

def run(self):
    input_db = self.input()[0].path
    harmonized_file = self.input()[1].path
    parallel.mapreduce(
        parallel.Collection.from_sharded(input_db),
        mapper=AnnotateMapper(harmonized_file),
        reducer=parallel.IdentityReducer(),
        output_prefix=self.output().path,
        # TODO: improve the code to avoid having to limit number of shards
        # to one.
        num_shards=1)

def run(self):
    input_dir = dirname(self.input().path)
    for csv_filename in glob.glob('%(input_dir)s/clean-*.csv' % locals()):
        parallel.mapreduce(
            parallel.Collection.from_glob(
                csv_filename, parallel.CSVDictLineInput()),
            mapper=CSV2JSONMapper(),
            reducer=parallel.IdentityReducer(),
            output_prefix=self.output().path,
            num_shards=1,
            map_workers=8)

def run(self):
    csv2json_db = parallel.ShardedDB.open(self.input()[1].path).as_dict()
    parallel.mapreduce(
        parallel.Collection.from_glob(
            self.input()[0].path, parallel.CSVDictLineInput()),
        mapper=RecallDownloaderAndMapper(csv2json_db=csv2json_db),
        reducer=parallel.IdentityReducer(),
        output_prefix=self.output().path,
        # Do not hit fda.gov too hard here.
        map_workers=10,
        num_shards=10)

def run(self):
    with open(join(EXTRACTED_DIR, 'MarketingStatus_Lookup.txt')) as fin:
        rows = (line.split('\t') for line in fin)
        doc_lookup = {row[0]: row[1] for row in rows}
    parallel.mapreduce(
        parallel.Collection.from_glob(
            join(self.input().path, 'TE.txt'),
            parallel.CSVDictLineInput(delimiter='\t')),
        mapper=TE2JSONMapper(doc_lookup=doc_lookup),
        reducer=parallel.IdentityReducer(),
        output_prefix=self.output().path)

def run(self):
    input_dir = self.input().path
    input_shards = glob.glob(input_dir + '/*.xml')
    parallel.mapreduce(
        parallel.Collection.from_list(input_shards),
        mapper=XML2JSONMapper(),
        reducer=parallel.IdentityReducer(),
        output_prefix=self.output().path,
        num_shards=len(input_shards))

def run(self):
    common.shell_cmd('mkdir -p %s', dirname(self.output().path))
    input_files = glob.glob(self.input().path + '/*.txt')
    parallel.mapreduce(
        parallel.Collection.from_glob(
            input_files,
            parallel.CSVDictLineInput(delimiter='|',
                                      quoting=csv.QUOTE_NONE,
                                      escapechar='\\')),
        mapper=ClassificationMapper(),
        reducer=parallel.IdentityReducer(),
        output_prefix=self.output().path)

def _run(self):
    json_dir = self.input()['data'].path
    input_glob = glob.glob(json_dir + '/*.json')
    for file_name in input_glob:
        logging.info('Running file %s', file_name)
        parallel.mapreduce(
            parallel.Collection.from_glob(file_name, parallel.JSONLineInput()),
            mapper=index_util.ReloadJSONMapper(
                config.es_host(), self.index_name, 'maude'),
            reducer=parallel.IdentityReducer(),
            output_format=parallel.NullOutput(),
            output_prefix='/tmp/loadjson.' + self.index_name)

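# The loaders above pass output_format=parallel.NullOutput() because the
# mapper's side effect (indexing documents into Elasticsearch) is the real
# product; output_prefix only names scratch space. Here is a sketch of that
# pattern under the same assumed map(self, key, value, output) interface.
# EsIndexMapper and its constructor arguments are hypothetical, and the
# elasticsearch-py client is an assumed dependency.
from elasticsearch import Elasticsearch  # assumed client library

from openfda import parallel  # assumed import path


class EsIndexMapper(parallel.Mapper):
    def __init__(self, es_host, index_name):
        self.es_host = es_host
        self.index_name = index_name
        self._es = None

    def map(self, key, value, output):
        # Lazily build one client per worker process rather than per record.
        if self._es is None:
            self._es = Elasticsearch(self.es_host)
        self._es.index(index=self.index_name, id=key, body=value)
        # Deliberately no output.add(): the index write is the result, so
        # NullOutput discards an empty stream.
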
def test_identity(self):
    os.system('rm -rf /tmp/test-identity*')
    source_files = ['/tmp/test-identity-%d' % i for i in range(10)]
    for f in source_files:
        os.system('touch "%s"' % f)
    source = parallel.Collection(source_files, parallel.FilenameInput)
    parallel.mapreduce(source,
                       parallel.IdentityMapper(),
                       parallel.IdentityReducer(),
                       '/tmp/test-identity',
                       2)
    results = sorted(list(parallel.ShardedDB.open('/tmp/test-identity/')))
    for i in range(10):
        key, value = results[i]
        assert key == '/tmp/test-identity-%d' % i, results[i]
        assert value == ''

def run(self):
    input_shards = []
    input_dir = self.input().path
    for subdir, dirs, files in os.walk(input_dir):
        for file in files:
            if file.endswith('.xml'):
                if file not in NULLIFIED:
                    input_shards.append(os.path.join(subdir, file))
                else:
                    logging.info('Skipping a nullified case: ' + file)
    parallel.mapreduce(
        parallel.Collection.from_list(input_shards),
        mapper=XML2JSONMapper(),
        reducer=parallel.IdentityReducer(),
        output_prefix=self.output().path)

def run(self):
    applications_db = self.input()[0].path
    products_db = self.input()[1].path
    applications_docs_db = self.input()[2].path
    submissions_db = self.input()[3].path
    submissions_property_type_db = self.input()[4].path
    marketing_status = self.input()[5].path
    te_db = self.input()[6].path
    parallel.mapreduce(
        parallel.Collection.from_sharded(applications_db),
        mapper=MergeAllMapper(applications_db, products_db,
                              applications_docs_db, submissions_db,
                              submissions_property_type_db, marketing_status,
                              te_db),
        reducer=parallel.IdentityReducer(),
        output_prefix=self.output().path,
        map_workers=1,
        # TODO: improve the code to avoid having to limit number of shards
        # to one.
        num_shards=1)

def _run(self):
    json_dir = self.input()['data'].path
    mapper = LoadJSONMapper(
        config.es_host(),
        index_name=self.index_name,
        type_name=self.type_name,
        docid_key=self.docid_key,
        incremental=self.use_checksum)
    parallel.mapreduce(
        parallel.Collection.from_sharded(json_dir),
        mapper=mapper,
        reducer=parallel.IdentityReducer(),
        output_format=parallel.NullOutput(),
        map_workers=self.load_json_workers,
        output_prefix=config.tmp_dir('%s/load-json' % self.index_name))
    # Update the metadata index.
    elasticsearch_requests.update_process_datetime(
        config.es_client(), self.index_name,
        arrow.utcnow().format('YYYY-MM-DD'))

def run(self):
    import csv
    import sys

    # Raise the csv field size limit as high as the platform allows:
    # decrease the maxInt value by a factor of 10 as long as the
    # OverflowError occurs.
    maxInt = sys.maxsize
    decrement = True
    while decrement:
        decrement = False
        try:
            csv.field_size_limit(maxInt)
        except OverflowError:
            maxInt = int(maxInt / 10)
            decrement = True

    parallel.mapreduce(
        parallel.Collection.from_glob(
            self.input().path, parallel.CSVDictLineInput(delimiter='\t')),
        mapper=NDC2JSONMapper(),
        reducer=parallel.IdentityReducer(),
        output_prefix=self.output().path,
        num_shards=1)

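# The limit-raising loop above is a common idiom: on platforms where the
# csv module stores the limit in a C long narrower than sys.maxsize,
# csv.field_size_limit(sys.maxsize) raises OverflowError, so the value is
# backed off by factors of 10 until the call succeeds. Factored out as a
# standalone helper it might read as follows; this is a sketch, and
# raise_csv_field_size_limit is a hypothetical name.
import csv
import sys


def raise_csv_field_size_limit():
    # Start at the largest Python int the platform reports and back off
    # until the csv module accepts the value.
    max_int = sys.maxsize
    while True:
        try:
            csv.field_size_limit(max_int)
            return max_int
        except OverflowError:
            max_int = int(max_int / 10)
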
def run(self):
    parallel.mapreduce(
        parallel.Collection.from_glob(
            self.input().path, parallel.CSVDictLineInput()),
        mapper=NSDE2JSONMapper(),
        reducer=parallel.IdentityReducer(),
        output_prefix=self.output().path)

def run(self):
    mapreduce(
        Collection.from_sharded(self.input().path),
        mapper=Harmonized2OpenFDAMapper(),
        reducer=parallel.IdentityReducer(),
        output_prefix=self.output().path,
        num_shards=1)

def run(self):
    parallel.mapreduce(
        parallel.Collection.from_glob(self.batch, parallel.LineInput),
        mapper=SPL2JSONMapper(),
        reducer=parallel.IdentityReducer(),
        output_prefix=self.output().path)

def run(self):
    parallel.mapreduce(
        parallel.Collection.from_glob(
            self.input().path, parallel.JSONLineInputUnicode()),
        mapper=SubstanceData2JSONMapper(),
        reducer=parallel.IdentityReducer(),
        output_prefix=self.output().path)

def run(self):
    parallel.mapreduce(
        parallel.Collection.from_sharded(self.input()[1].path),
        mapper=UpcMapper(spl_s3_dir=SPL_S3_DIR),
        reducer=parallel.IdentityReducer(),
        output_prefix=self.output().path)

def run(self):
    parallel.mapreduce(
        parallel.Collection.from_sharded(self.input().path),
        mapper=CurrentSPLMapper(),
        reducer=parallel.IdentityReducer(),
        output_prefix=self.output().path)

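# All of these tasks write sharded key/value output, and downstream code
# reads it back with parallel.ShardedDB, as several snippets above do.
# A minimal sketch of the two read patterns, assuming the same parallel
# module; the path is illustrative.
from openfda import parallel  # assumed import path

# Stream every (key, value) pair across all shards. Global ordering is not
# guaranteed: the test above sorts the pairs before asserting on them.
for key, value in parallel.ShardedDB.open('/tmp/test-identity/'):
    print(key, value)

# Or materialize the whole DB as a dict when it fits in memory, as the
# packaging and harmonization tasks above do before passing it to a mapper.
db = parallel.ShardedDB.open('/tmp/test-identity/').as_dict()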