def run(self):
    """Mapreduce the tab-delimited Products.txt file into JSON records."""
    products_path = join(self.input().path, 'Products.txt')
    line_input = parallel.CSVDictLineInput(delimiter='\t')
    collection = parallel.Collection.from_glob(products_path, line_input)
    parallel.mapreduce(
        collection,
        mapper=Product2JSONMapper(),
        reducer=parallel.IdentityReducer(),
        output_prefix=self.output().path)
Exemple #2
0
 def run(self):
     """Mapreduce the pipe-delimited RXNorm input into 10 JSON shards."""
     reader = parallel.CSVDictLineInput(delimiter='|')
     source = parallel.Collection.from_glob(self.input().path, reader)
     parallel.mapreduce(
         source,
         mapper=RXNorm2JSONMapper(),
         reducer=RXNormReducer(),
         output_prefix=self.output().path,
         num_shards=10)
Exemple #3
0
 def run(self):
     """Mapreduce the tab-delimited NDC product input into one JSON shard."""
     reader = parallel.CSVDictLineInput(delimiter='\t')
     source = parallel.Collection.from_glob(self.input().path, reader)
     parallel.mapreduce(
         source,
         mapper=NDCProduct2JSONMapper(),
         reducer=parallel.IdentityReducer(),
         output_prefix=self.output().path,
         num_shards=1)
Exemple #4
0
    def run(self):
        """Mapreduce every pipe-delimited *.txt input file into JSON.

        Files listed in REMAPPED_FILES are handled elsewhere and skipped
        here; files named in NEEDS_HEADERS ship without a header row, so
        their field names are supplied explicitly.
        """
        input_dir = self.input().path
        output_dir = self.output().path
        common.shell_cmd('mkdir -p %s', dirname(output_dir))

        # Header-less inputs mapped to the column names they should get.
        NEEDS_HEADERS = {
            'estabtypes.txt': ['establishment_type_id', 'description']
        }

        collections = []
        for txt_file in glob.glob(input_dir + '/*.txt'):
            file_name = basename(txt_file)
            if file_name in REMAPPED_FILES:
                continue
            line_input = parallel.CSVDictLineInput(
                delimiter='|',
                fieldnames=NEEDS_HEADERS.get(file_name),
                quoting=csv.QUOTE_NONE,
                escapechar='\\')
            collections.append(
                parallel.Collection.from_glob(txt_file, line_input))

        parallel.mapreduce(inputs=collections,
                           mapper=TXT2JSONMapper(),
                           reducer=parallel.IdentityReducer(),
                           output_prefix=output_dir)
 def run(self):
     """Mapreduce SubmissionPropertyType.txt (tab-delimited) into JSON lists."""
     source_file = join(self.input().path, 'SubmissionPropertyType.txt')
     collection = parallel.Collection.from_glob(
         source_file, parallel.CSVDictLineInput(delimiter='\t'))
     parallel.mapreduce(
         collection,
         mapper=SubmissionPropertyType2JSONMapper(),
         reducer=parallel.ListReducer(),
         output_prefix=self.output().path)
Exemple #6
0
 def run(self):
     """Convert every CSV file under the input directory into JSON."""
     csv_files = glob.glob(join(self.input().path, '*.csv'))
     reader = parallel.CSVDictLineInput(delimiter=',',
                                        quoting=csv.QUOTE_MINIMAL)
     parallel.mapreduce(
         parallel.Collection.from_glob(csv_files, reader),
         mapper=CSV2JSONMapper(),
         reducer=CSV2JSONReducer(),
         output_prefix=self.output().path)
Exemple #7
0
 def run(self):
     """Mapreduce the upstream task's CSV outputs into one JSON shard."""
     csv_files = glob.glob(self.requires().output().path + '/*.csv')
     collection = parallel.Collection.from_glob(
         csv_files, parallel.CSVDictLineInput())
     parallel.mapreduce(
         collection,
         mapper=SerologyCSV2JSONMapper(),
         reducer=parallel.IdentityReducer(),
         output_prefix=self.output().path,
         num_shards=1)
Exemple #8
0
 def run(self):
     """Mapreduce the CAERS CSV file into 10 JSON shards."""
     caers_path = join(self.input().path, CAERS_FILE)
     reader = parallel.CSVDictLineInput(delimiter=',',
                                        quoting=csv.QUOTE_MINIMAL)
     parallel.mapreduce(
         parallel.Collection.from_glob(caers_path, reader),
         mapper=CSV2JSONMapper(),
         reducer=CSV2JSONReducer(),
         output_prefix=self.output().path,
         num_shards=10)
Exemple #9
0
 def run(self):
     """Mapreduce the pipe-delimited PMA *.txt inputs into JSON."""
     common.shell_cmd('mkdir -p %s', dirname(self.output().path))
     txt_files = glob.glob(self.input().path + '/*.txt')
     # strip_str='\0' removes NUL bytes that appear in the raw feed.
     reader = parallel.CSVDictLineInput(delimiter='|', strip_str='\0')
     parallel.mapreduce(
         parallel.Collection.from_glob(txt_files, reader),
         mapper=PMAMapper(),
         reducer=parallel.IdentityReducer(),
         output_prefix=self.output().path)
Exemple #10
0
 def run(self):
     """Run one mapreduce per clean-*.csv file beside the input target."""
     input_dir = dirname(self.input().path)
     for clean_csv in glob.glob(input_dir + '/clean-*.csv'):
         collection = parallel.Collection.from_glob(
             clean_csv, parallel.CSVDictLineInput())
         parallel.mapreduce(
             collection,
             mapper=CSV2JSONMapper(),
             reducer=parallel.IdentityReducer(),
             output_prefix=self.output().path,
             num_shards=1,
             map_workers=8)
Exemple #11
0
 def run(self):
     """Download recall data for each CSV row, joining against the
     csv2json sharded DB built by the upstream task."""
     csv2json_db = parallel.ShardedDB.open(self.input()[1].path).as_dict()
     collection = parallel.Collection.from_glob(
         self.input()[0].path, parallel.CSVDictLineInput())
     # Modest worker/shard counts: do not hit fda.gov too hard here.
     parallel.mapreduce(
         collection,
         mapper=RecallDownloaderAndMapper(csv2json_db=csv2json_db),
         reducer=parallel.IdentityReducer(),
         output_prefix=self.output().path,
         map_workers=10,
         num_shards=10)
Exemple #12
0
 def run(self):
     """Mapreduce the pipe-delimited classification *.txt inputs into JSON."""
     common.shell_cmd('mkdir -p %s', dirname(self.output().path))
     txt_files = glob.glob(self.input().path + '/*.txt')
     reader = parallel.CSVDictLineInput(delimiter='|',
                                        quoting=csv.QUOTE_NONE,
                                        escapechar='\\')
     parallel.mapreduce(
         parallel.Collection.from_glob(txt_files, reader),
         mapper=ClassificationMapper(),
         reducer=parallel.IdentityReducer(),
         output_prefix=self.output().path)
Exemple #13
0
  def run(self):
    """Mapreduce ApplicationDocs.txt into JSON, joining in the document
    type names read from the lookup file."""
    lookup_path = join(EXTRACTED_DIR, 'ApplicationsDocsType_Lookup.txt')
    with open(lookup_path) as fin:
      doc_lookup = {}
      for line in fin:
        fields = line.split('\t')
        # fields[1] is the last column, so strip its line terminator.
        doc_lookup[fields[0]] = fields[1].rstrip()

    parallel.mapreduce(
      parallel.Collection.from_glob(
        join(self.input().path, 'ApplicationDocs.txt'),
        parallel.CSVDictLineInput(delimiter='\t')),
      mapper=ApplicationsDocs2JSONMapper(doc_lookup=doc_lookup),
      reducer=parallel.ListReducer(),
      output_prefix=self.output().path)
    def run(self):
        """Mapreduce TE.txt into JSON, joining in marketing-status names.

        Builds a lookup dict from MarketingStatus_Lookup.txt (tab-
        separated: id, description) and hands it to the mapper.
        """
        with open(join(EXTRACTED_DIR, 'MarketingStatus_Lookup.txt')) as fin:
            rows = (line.split('\t') for line in fin)
            # BUG FIX: row[1] is the last tab-separated field on the line
            # and therefore still carries the trailing newline; without
            # .rstrip() every lookup value ends in '\n'. The analogous
            # ApplicationsDocs lookup in this file already strips it.
            doc_lookup = {row[0]: row[1].rstrip() for row in rows}

        parallel.mapreduce(parallel.Collection.from_glob(
            join(self.input().path, 'TE.txt'),
            parallel.CSVDictLineInput(delimiter='\t')),
                           mapper=TE2JSONMapper(doc_lookup=doc_lookup),
                           reducer=parallel.IdentityReducer(),
                           output_prefix=self.output().path)
Exemple #15
0
    def run(self):
        """Raise csv's field size limit as high as the platform allows,
        then mapreduce the tab-delimited input into a single JSON shard."""
        import sys
        import csv

        # csv.field_size_limit raises OverflowError for values that do
        # not fit a C long on some platforms; back off by factors of 10
        # until a value is accepted.
        limit = sys.maxsize
        while True:
            try:
                csv.field_size_limit(limit)
                break
            except OverflowError:
                limit = int(limit / 10)

        parallel.mapreduce(parallel.Collection.from_glob(
            self.input().path, parallel.CSVDictLineInput(delimiter='\t')),
                           mapper=NDC2JSONMapper(),
                           reducer=parallel.IdentityReducer(),
                           output_prefix=self.output().path,
                           num_shards=1)
Exemple #16
0
 def run(self):
     """Mapreduce the NSDE CSV input into JSON records."""
     collection = parallel.Collection.from_glob(
         self.input().path, parallel.CSVDictLineInput())
     parallel.mapreduce(
         collection,
         mapper=NSDE2JSONMapper(),
         reducer=parallel.IdentityReducer(),
         output_prefix=self.output().path)
Exemple #17
0
 def mapreduce_inputs(self):
     """Return a collection over every CSV file in input_dir."""
     pattern = self.input_dir + '/*.csv'
     return parallel.Collection.from_glob(glob.glob(pattern),
                                          parallel.CSVDictLineInput())
Exemple #18
0
 def mapreduce_inputs(self):
   """Return a collection over the CSVs next to the upstream task's output."""
   csv_dir = dirname(self.requires().output().path)
   csv_files = glob.glob(csv_dir + '/*.csv')
   return parallel.Collection.from_glob(csv_files,
                                        parallel.CSVDictLineInput())
Exemple #19
0
 def mapreduce_inputs(self):
     """Return a collection over the pipe-delimited *.txt input files.

     strip_str='\\0' drops NUL bytes embedded in the raw feed lines.
     """
     txt_files = glob.glob(self.input().path + '/*.txt')
     reader = parallel.CSVDictLineInput(delimiter='|', strip_str='\0')
     return parallel.Collection.from_glob(txt_files, reader)