def run(self):
  parallel.mapreduce(
    parallel.Collection.from_glob(
      join(self.input().path, 'Products.txt'),
      parallel.CSVDictLineInput(delimiter='\t')),
    mapper=Product2JSONMapper(),
    reducer=parallel.IdentityReducer(),
    output_prefix=self.output().path)
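These run() methods all follow the same shape: build a parallel.Collection from the input files, then hand a mapper and reducer to parallel.mapreduce. For orientation, here is a minimal sketch of what a mapper such as Product2JSONMapper plausibly looks like; the map(key, value, output) signature and the output.add call are assumptions about the openfda.parallel API, not its confirmed interface.

class ProductMapperSketch(parallel.Mapper):
  def map(self, key, value, output):
    # 'value' arrives as a dict of one row from Products.txt, courtesy of
    # CSVDictLineInput; a real mapper would rename and clean fields here.
    output.add(key, value)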
def run(self):
  parallel.mapreduce(
    parallel.Collection.from_glob(
      self.input().path,
      parallel.CSVDictLineInput(delimiter='|')),
    mapper=RXNorm2JSONMapper(),
    reducer=RXNormReducer(),
    output_prefix=self.output().path,
    num_shards=10)
def run(self):
  parallel.mapreduce(
    parallel.Collection.from_glob(
      self.input().path,
      parallel.CSVDictLineInput(delimiter='\t')),
    mapper=NDCProduct2JSONMapper(),
    reducer=parallel.IdentityReducer(),
    output_prefix=self.output().path,
    num_shards=1)
def run(self):
  input_dir = self.input().path
  output_dir = self.output().path
  common.shell_cmd('mkdir -p %s', dirname(output_dir))

  # Some input files ship without a header row; supply field names for them.
  NEEDS_HEADERS = {
    'estabtypes.txt': ['establishment_type_id', 'description']
  }

  inputs = []
  for input_file in glob.glob(input_dir + '/*.txt'):
    # Skip files handled by a separate remapping step (REMAPPED_FILES is
    # defined outside this snippet).
    if basename(input_file) in REMAPPED_FILES:
      continue
    header_key = basename(input_file)
    fieldnames = NEEDS_HEADERS.get(header_key, None)
    inputs.append(
      parallel.Collection.from_glob(
        input_file,
        parallel.CSVDictLineInput(delimiter='|',
                                  fieldnames=fieldnames,
                                  quoting=csv.QUOTE_NONE,
                                  escapechar='\\')))

  parallel.mapreduce(
    inputs=inputs,
    mapper=TXT2JSONMapper(),
    reducer=parallel.IdentityReducer(),
    output_prefix=self.output().path)
def run(self):
  parallel.mapreduce(
    parallel.Collection.from_glob(
      join(self.input().path, 'SubmissionPropertyType.txt'),
      parallel.CSVDictLineInput(delimiter='\t')),
    mapper=SubmissionPropertyType2JSONMapper(),
    reducer=parallel.ListReducer(),
    output_prefix=self.output().path)
def run(self):
  parallel.mapreduce(
    parallel.Collection.from_glob(
      glob.glob(join(self.input().path, '*.csv')),
      parallel.CSVDictLineInput(delimiter=',', quoting=csv.QUOTE_MINIMAL)),
    mapper=CSV2JSONMapper(),
    reducer=CSV2JSONReducer(),
    output_prefix=self.output().path)
def run(self):
  input_files = glob.glob(self.requires().output().path + '/*.csv')
  parallel.mapreduce(
    parallel.Collection.from_glob(input_files, parallel.CSVDictLineInput()),
    mapper=SerologyCSV2JSONMapper(),
    reducer=parallel.IdentityReducer(),
    output_prefix=self.output().path,
    num_shards=1)
def run(self):
  file_name = join(self.input().path, CAERS_FILE)
  parallel.mapreduce(
    parallel.Collection.from_glob(
      file_name,
      parallel.CSVDictLineInput(delimiter=',', quoting=csv.QUOTE_MINIMAL)),
    mapper=CSV2JSONMapper(),
    reducer=CSV2JSONReducer(),
    output_prefix=self.output().path,
    num_shards=10)
def run(self):
  common.shell_cmd('mkdir -p %s', dirname(self.output().path))
  input_files = glob.glob(self.input().path + '/*.txt')
  parallel.mapreduce(
    parallel.Collection.from_glob(
      input_files,
      parallel.CSVDictLineInput(delimiter='|', strip_str='\0')),
    mapper=PMAMapper(),
    reducer=parallel.IdentityReducer(),
    output_prefix=self.output().path)
def run(self):
  input_dir = dirname(self.input().path)
  for csv_filename in glob.glob(join(input_dir, 'clean-*.csv')):
    parallel.mapreduce(
      parallel.Collection.from_glob(csv_filename, parallel.CSVDictLineInput()),
      mapper=CSV2JSONMapper(),
      reducer=parallel.IdentityReducer(),
      output_prefix=self.output().path,
      num_shards=1,
      map_workers=8)
def run(self):
  # Open the previously built sharded DB as an in-memory dict so the mapper
  # can join downloaded recall data against the CSV-derived records.
  csv2json_db = parallel.ShardedDB.open(self.input()[1].path).as_dict()
  parallel.mapreduce(
    parallel.Collection.from_glob(self.input()[0].path,
                                  parallel.CSVDictLineInput()),
    mapper=RecallDownloaderAndMapper(csv2json_db=csv2json_db),
    reducer=parallel.IdentityReducer(),
    output_prefix=self.output().path,
    map_workers=10,  # Do not hit fda.gov too hard here.
    num_shards=10)
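The csv2json_db dict built above is handed to the mapper so each downloaded recall can be merged with its CSV-derived record. A hedged sketch of that join, assuming the same map(key, value, output) interface as earlier; the 'recall_number' join key is purely illustrative, not the real mapper's key:

class RecallJoinSketch(parallel.Mapper):
  def __init__(self, csv2json_db):
    self.csv2json_db = csv2json_db

  def map(self, key, value, output):
    # Hypothetical join key; RecallDownloaderAndMapper's actual logic
    # (including the fda.gov download step) is not shown in this snippet.
    match = self.csv2json_db.get(value.get('recall_number'))
    if match:
      value.update(match)
    output.add(key, value)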
def run(self):
  common.shell_cmd('mkdir -p %s', dirname(self.output().path))
  input_files = glob.glob(self.input().path + '/*.txt')
  parallel.mapreduce(
    parallel.Collection.from_glob(
      input_files,
      parallel.CSVDictLineInput(delimiter='|',
                                quoting=csv.QUOTE_NONE,
                                escapechar='\\')),
    mapper=ClassificationMapper(),
    reducer=parallel.IdentityReducer(),
    output_prefix=self.output().path)
def run(self):
  # Build a lookup from document-type id to its description.
  with open(join(EXTRACTED_DIR, 'ApplicationsDocsType_Lookup.txt')) as fin:
    rows = (line.split('\t') for line in fin)
    doc_lookup = {row[0]: row[1].rstrip() for row in rows}

  parallel.mapreduce(
    parallel.Collection.from_glob(
      join(self.input().path, 'ApplicationDocs.txt'),
      parallel.CSVDictLineInput(delimiter='\t')),
    mapper=ApplicationsDocs2JSONMapper(doc_lookup=doc_lookup),
    reducer=parallel.ListReducer(),
    output_prefix=self.output().path)
def run(self):
  with open(join(EXTRACTED_DIR, 'MarketingStatus_Lookup.txt')) as fin:
    rows = (line.split('\t') for line in fin)
    doc_lookup = {row[0]: row[1] for row in rows}

  parallel.mapreduce(
    parallel.Collection.from_glob(
      join(self.input().path, 'TE.txt'),
      parallel.CSVDictLineInput(delimiter='\t')),
    mapper=TE2JSONMapper(doc_lookup=doc_lookup),
    reducer=parallel.IdentityReducer(),
    output_prefix=self.output().path)
def run(self):
  import csv
  import sys

  # Some NDC rows contain very large fields, so raise csv's field size limit
  # to the largest value the platform accepts: decrease maxInt by a factor
  # of 10 as long as OverflowError occurs.
  maxInt = sys.maxsize
  decrement = True
  while decrement:
    decrement = False
    try:
      csv.field_size_limit(maxInt)
    except OverflowError:
      maxInt = int(maxInt / 10)
      decrement = True

  parallel.mapreduce(
    parallel.Collection.from_glob(
      self.input().path,
      parallel.CSVDictLineInput(delimiter='\t')),
    mapper=NDC2JSONMapper(),
    reducer=parallel.IdentityReducer(),
    output_prefix=self.output().path,
    num_shards=1)
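The maxInt loop above is a standard workaround for the csv module's C-long field size limit. The same logic as a self-contained helper, as a sketch of how it could be factored out (not how the project actually factors it):

import csv
import sys

def raise_csv_field_size_limit():
  """Raise csv.field_size_limit to the largest value the platform accepts."""
  limit = sys.maxsize
  while True:
    try:
      csv.field_size_limit(limit)
      return limit
    except OverflowError:
      # Too large for the underlying C long; back off by a factor of 10.
      limit = int(limit / 10)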
def run(self):
  parallel.mapreduce(
    parallel.Collection.from_glob(
      self.input().path, parallel.CSVDictLineInput()),
    mapper=NSDE2JSONMapper(),
    reducer=parallel.IdentityReducer(),
    output_prefix=self.output().path)
def mapreduce_inputs(self):
  input_files = glob.glob(self.input_dir + '/*.csv')
  return parallel.Collection.from_glob(input_files, parallel.CSVDictLineInput())
def mapreduce_inputs(self):
  input_files = glob.glob(dirname(self.requires().output().path) + '/*.csv')
  return parallel.Collection.from_glob(input_files, parallel.CSVDictLineInput())
def mapreduce_inputs(self):
  input_files = glob.glob(self.input().path + '/*.txt')
  return parallel.Collection.from_glob(
    input_files, parallel.CSVDictLineInput(delimiter='|', strip_str='\0'))
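Most tasks above use parallel.IdentityReducer, with parallel.ListReducer appearing where multiple rows share a key (SubmissionPropertyType, ApplicationDocs). A rough sketch of the difference, assuming a reduce(key, values, output) interface with an output.put call; this is an assumption about openfda.parallel, not its actual source:

class IdentityReducerSketch(parallel.Reducer):
  def reduce(self, key, values, output):
    # Pass each value through unchanged.
    for value in values:
      output.put(key, value)

class ListReducerSketch(parallel.Reducer):
  def reduce(self, key, values, output):
    # Collapse all values sharing a key into one list-valued record.
    output.put(key, list(values))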