def run(self):
  parallel.mapreduce(
    parallel.Collection.from_glob(
      join(self.input().path, 'Products.txt'),
      parallel.CSVDictLineInput(delimiter='\t')),
    mapper=Product2JSONMapper(),
    reducer=parallel.IdentityReducer(),
    output_prefix=self.output().path)
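These run() methods all follow the same shape: build a parallel.Collection from the input files, then hand a mapper and reducer to parallel.mapreduce. For orientation, here is a minimal sketch of what a mapper such as Product2JSONMapper plausibly looks like; the map(key, value, output) signature and the output.add call are assumptions about the openfda.parallel API, not its confirmed interface.

class ProductMapperSketch(parallel.Mapper):
  def map(self, key, value, output):
    # 'value' arrives as a dict of one row from Products.txt, courtesy of
    # CSVDictLineInput; a real mapper would rename and clean fields here.
    output.add(key, value)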
def run(self):
  parallel.mapreduce(
    parallel.Collection.from_glob(
      self.input().path,
      parallel.CSVDictLineInput(delimiter='|')),
    mapper=RXNorm2JSONMapper(),
    reducer=RXNormReducer(),
    output_prefix=self.output().path,
    num_shards=10)
def run(self):
  parallel.mapreduce(
    parallel.Collection.from_glob(
      self.input().path,
      parallel.CSVDictLineInput(delimiter='\t')),
    mapper=NDCProduct2JSONMapper(),
    reducer=parallel.IdentityReducer(),
    output_prefix=self.output().path,
    num_shards=1)
def run(self):
  input_dir = self.input().path
  output_dir = self.output().path
  common.shell_cmd('mkdir -p %s', dirname(output_dir))

  # Some input files ship without a header row; supply field names for them.
  NEEDS_HEADERS = {
    'estabtypes.txt': ['establishment_type_id', 'description']
  }

  inputs = []
  for input_file in glob.glob(input_dir + '/*.txt'):
    # Skip files handled by a separate remapping step (REMAPPED_FILES is
    # defined outside this snippet).
    if basename(input_file) in REMAPPED_FILES:
      continue
    header_key = basename(input_file)
    fieldnames = NEEDS_HEADERS.get(header_key, None)
    inputs.append(
      parallel.Collection.from_glob(
        input_file,
        parallel.CSVDictLineInput(delimiter='|',
                                  fieldnames=fieldnames,
                                  quoting=csv.QUOTE_NONE,
                                  escapechar='\\')))

  parallel.mapreduce(
    inputs=inputs,
    mapper=TXT2JSONMapper(),
    reducer=parallel.IdentityReducer(),
    output_prefix=self.output().path)
def run(self):
  parallel.mapreduce(
    parallel.Collection.from_glob(
      join(self.input().path, 'SubmissionPropertyType.txt'),
      parallel.CSVDictLineInput(delimiter='\t')),
    mapper=SubmissionPropertyType2JSONMapper(),
    reducer=parallel.ListReducer(),
    output_prefix=self.output().path)
def run(self):
  parallel.mapreduce(
    parallel.Collection.from_glob(
      glob.glob(join(self.input().path, '*.csv')),
      parallel.CSVDictLineInput(delimiter=',', quoting=csv.QUOTE_MINIMAL)),
    mapper=CSV2JSONMapper(),
    reducer=CSV2JSONReducer(),
    output_prefix=self.output().path)
def run(self):
  input_files = glob.glob(self.requires().output().path + '/*.csv')
  parallel.mapreduce(
    parallel.Collection.from_glob(input_files, parallel.CSVDictLineInput()),
    mapper=SerologyCSV2JSONMapper(),
    reducer=parallel.IdentityReducer(),
    output_prefix=self.output().path,
    num_shards=1)
def run(self):
  file_name = join(self.input().path, CAERS_FILE)
  parallel.mapreduce(
    parallel.Collection.from_glob(
      file_name,
      parallel.CSVDictLineInput(delimiter=',', quoting=csv.QUOTE_MINIMAL)),
    mapper=CSV2JSONMapper(),
    reducer=CSV2JSONReducer(),
    output_prefix=self.output().path,
    num_shards=10)
def run(self):
  common.shell_cmd('mkdir -p %s', dirname(self.output().path))
  input_files = glob.glob(self.input().path + '/*.txt')
  parallel.mapreduce(
    parallel.Collection.from_glob(
      input_files,
      parallel.CSVDictLineInput(delimiter='|', strip_str='\0')),
    mapper=PMAMapper(),
    reducer=parallel.IdentityReducer(),
    output_prefix=self.output().path)
def run(self):
  input_dir = dirname(self.input().path)
  for csv_filename in glob.glob(join(input_dir, 'clean-*.csv')):
    parallel.mapreduce(
      parallel.Collection.from_glob(csv_filename, parallel.CSVDictLineInput()),
      mapper=CSV2JSONMapper(),
      reducer=parallel.IdentityReducer(),
      output_prefix=self.output().path,
      num_shards=1,
      map_workers=8)
def run(self):
  # Open the previously built sharded DB as an in-memory dict so the mapper
  # can join downloaded recall data against the CSV-derived records.
  csv2json_db = parallel.ShardedDB.open(self.input()[1].path).as_dict()
  parallel.mapreduce(
    parallel.Collection.from_glob(self.input()[0].path,
                                  parallel.CSVDictLineInput()),
    mapper=RecallDownloaderAndMapper(csv2json_db=csv2json_db),
    reducer=parallel.IdentityReducer(),
    output_prefix=self.output().path,
    map_workers=10,  # Do not hit fda.gov too hard here.
    num_shards=10)
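The csv2json_db dict built above is handed to the mapper so each downloaded recall can be merged with its CSV-derived record. A hedged sketch of that join, assuming the same map(key, value, output) interface as earlier; the 'recall_number' join key is purely illustrative, not the real mapper's key:

class RecallJoinSketch(parallel.Mapper):
  def __init__(self, csv2json_db):
    self.csv2json_db = csv2json_db

  def map(self, key, value, output):
    # Hypothetical join key; RecallDownloaderAndMapper's actual logic
    # (including the fda.gov download step) is not shown in this snippet.
    match = self.csv2json_db.get(value.get('recall_number'))
    if match:
      value.update(match)
    output.add(key, value)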
def run(self):
  common.shell_cmd('mkdir -p %s', dirname(self.output().path))
  input_files = glob.glob(self.input().path + '/*.txt')
  parallel.mapreduce(
    parallel.Collection.from_glob(
      input_files,
      parallel.CSVDictLineInput(delimiter='|',
                                quoting=csv.QUOTE_NONE,
                                escapechar='\\')),
    mapper=ClassificationMapper(),
    reducer=parallel.IdentityReducer(),
    output_prefix=self.output().path)
def run(self):
  # Build a lookup from document-type id to its description.
  with open(join(EXTRACTED_DIR, 'ApplicationsDocsType_Lookup.txt')) as fin:
    rows = (line.split('\t') for line in fin)
    doc_lookup = {row[0]: row[1].rstrip() for row in rows}

  parallel.mapreduce(
    parallel.Collection.from_glob(
      join(self.input().path, 'ApplicationDocs.txt'),
      parallel.CSVDictLineInput(delimiter='\t')),
    mapper=ApplicationsDocs2JSONMapper(doc_lookup=doc_lookup),
    reducer=parallel.ListReducer(),
    output_prefix=self.output().path)
def run(self):
  with open(join(EXTRACTED_DIR, 'MarketingStatus_Lookup.txt')) as fin:
    rows = (line.split('\t') for line in fin)
    doc_lookup = {row[0]: row[1] for row in rows}

  parallel.mapreduce(
    parallel.Collection.from_glob(
      join(self.input().path, 'TE.txt'),
      parallel.CSVDictLineInput(delimiter='\t')),
    mapper=TE2JSONMapper(doc_lookup=doc_lookup),
    reducer=parallel.IdentityReducer(),
    output_prefix=self.output().path)
def run(self):
  import csv
  import sys

  # Some NDC rows contain very large fields, so raise csv's field size limit
  # to the largest value the platform accepts: decrease maxInt by a factor
  # of 10 as long as OverflowError occurs.
  maxInt = sys.maxsize
  decrement = True
  while decrement:
    decrement = False
    try:
      csv.field_size_limit(maxInt)
    except OverflowError:
      maxInt = int(maxInt / 10)
      decrement = True

  parallel.mapreduce(
    parallel.Collection.from_glob(
      self.input().path,
      parallel.CSVDictLineInput(delimiter='\t')),
    mapper=NDC2JSONMapper(),
    reducer=parallel.IdentityReducer(),
    output_prefix=self.output().path,
    num_shards=1)
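The maxInt loop above is a standard workaround for the csv module's C-long field size limit. The same logic as a self-contained helper, as a sketch of how it could be factored out (not how the project actually factors it):

import csv
import sys

def raise_csv_field_size_limit():
  """Raise csv.field_size_limit to the largest value the platform accepts."""
  limit = sys.maxsize
  while True:
    try:
      csv.field_size_limit(limit)
      return limit
    except OverflowError:
      # Too large for the underlying C long; back off by a factor of 10.
      limit = int(limit / 10)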
def run(self):
  parallel.mapreduce(
    parallel.Collection.from_glob(
      self.input().path, parallel.CSVDictLineInput()),
    mapper=NSDE2JSONMapper(),
    reducer=parallel.IdentityReducer(),
    output_prefix=self.output().path)
def mapreduce_inputs(self):
  input_files = glob.glob(self.input_dir + '/*.csv')
  return parallel.Collection.from_glob(input_files, parallel.CSVDictLineInput())
def mapreduce_inputs(self):
  input_files = glob.glob(dirname(self.requires().output().path) + '/*.csv')
  return parallel.Collection.from_glob(input_files, parallel.CSVDictLineInput())
def mapreduce_inputs(self):
  input_files = glob.glob(self.input().path + '/*.txt')
  return parallel.Collection.from_glob(
    input_files, parallel.CSVDictLineInput(delimiter='|', strip_str='\0'))
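Most tasks above use parallel.IdentityReducer, with parallel.ListReducer appearing where multiple rows share a key (SubmissionPropertyType, ApplicationDocs). A rough sketch of the difference, assuming a reduce(key, values, output) interface with an output.put call; this is an assumption about openfda.parallel, not its actual source:

class IdentityReducerSketch(parallel.Reducer):
  def reduce(self, key, values, output):
    # Pass each value through unchanged.
    for value in values:
      output.put(key, value)

class ListReducerSketch(parallel.Reducer):
  def reduce(self, key, values, output):
    # Collapse all values sharing a key into one list-valued record.
    output.put(key, list(values))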