Example #1
 def run(self):
   db_list = [s.path for s in self.input()]
   parallel.mapreduce(
     parallel.Collection.from_sharded_list(db_list),
     mapper=MergeUpdatesMapper(),
     reducer=MergeUpdatesReducer(),
     output_prefix=self.output().path)
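These snippets are openFDA-style Luigi tasks built around parallel.mapreduce. As a rough, non-authoritative sketch of the mapper/reducer shape such a call expects (the class names below are invented, and the output.add/output.put signatures are assumptions rather than the project's confirmed API):

 class ExampleMapper(parallel.Mapper):
   def map(self, key, value, output):
     # Re-key each input record; records sharing a key meet in one reduce call.
     output.add(key, value)

 class ExampleReducer(parallel.Reducer):
   def reduce(self, key, values, output):
     # Merge every value seen for a key into a single output record.
     merged = {}
     for value in values:
       merged.update(value)
     output.put(key, merged)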
Example #2
    def run(self):
        input_dir = self.input().path
        output_dir = self.output().path
        common.shell_cmd("mkdir -p %s", dirname(output_dir))

        NEEDS_HEADERS = {"estabtypes.txt": ["establishment_type_id", "description"]}

        inputs = []
        for input_file in glob.glob(input_dir + "/*.txt"):
            if basename(input_file) in REMAPPED_FILES:
                continue
            header_key = basename(input_file)
            fieldnames = NEEDS_HEADERS.get(header_key, None)
            inputs.append(
                parallel.Collection.from_glob(
                    input_file,
                    parallel.CSVDictLineInput(
                        delimiter="|", fieldnames=fieldnames, quoting=csv.QUOTE_NONE, escapechar="\\"
                    ),
                )
            )

        parallel.mapreduce(
            inputs=inputs, mapper=TXT2JSONMapper(), reducer=parallel.IdentityReducer(), output_prefix=self.output().path
        )
Example #3
  def run_mr(self,
             prefix, input_data,
             input_format=parallel.LineInput(),
             mapper=parallel.IdentityMapper(),
             reducer=parallel.IdentityReducer(),
             output_format=parallel.LevelDBOutput(),
             num_shards=5):
    os.system('rm -rf "%s"' % prefix)
    source = self.make_files(os.path.join(prefix, 'input'), input_data, input_format)
    output_prefix = os.path.join(prefix, 'output')

    parallel.mapreduce(source,
                       mapper=mapper,
                       reducer=reducer,
                       output_format=output_format,
                       output_prefix=output_prefix,
                       num_shards=num_shards)

    if isinstance(output_format, parallel.LevelDBOutput):
      return sorted(list(parallel.ShardedDB.open(output_prefix)))

    if isinstance(output_format, parallel.JSONOutput):
      return json.load(open(output_prefix))

    if isinstance(output_format, parallel.JSONLineOutput):
      result = []
      with open(output_prefix, 'r') as input_f:
        for line in input_f:
          result.append(json.loads(line))
      return result
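A hedged illustration of how this test helper might be invoked; the scratch prefix and input lines below are made up for the example rather than taken from the project:

   def test_run_mr_roundtrip(self):
     # Hypothetical usage: push two lines through the identity pipeline.
     results = self.run_mr(
         '/tmp/test-run-mr',            # illustrative scratch prefix
         input_data=['alpha', 'beta'],  # exact shape of input_data is assumed
         mapper=parallel.IdentityMapper(),
         reducer=parallel.IdentityReducer(),
         num_shards=2)
     # With the default LevelDBOutput, run_mr returns the sorted (key, value)
     # pairs read back from the sharded output database.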
Example #4
 def run(self):
   harmonized_file = self.input()[0].path
   parallel.mapreduce(
     parallel.Collection.from_sharded(self.input()[1].path),
     annotate.AnnotateMapper(harmonized_file),
     parallel.IdentityReducer(),
     self.output().path)
Example #5
  def _run(self):
    json_dir = self.input()['data'].path

    mapper = LoadJSONMapper(
      config.es_host(),
      index_name=self.index_name,
      type_name=self.type_name,
      docid_key=self.docid_key,
      incremental=self.use_checksum
    )

    parallel.mapreduce(
      parallel.Collection.from_sharded(json_dir),
      mapper=mapper,
      reducer=parallel.IdentityReducer(),
      output_format=parallel.NullOutput(),
      map_workers=self.load_json_workers,
      num_shards=1,
      output_prefix=config.tmp_dir('%s/load-json' % self.index_name)
    )

    # update metadata index
    elasticsearch_requests.update_process_datetime(
      config.es_client(), self.index_name, arrow.utcnow().format('YYYY-MM-DD')
    )

    # optimize index, if requested
    if self.optimize_index:
      optimize_index(self.index_name, wait_for_merge=False)
Example #6
  def run(self):
    # Since we only iterate over dates in the umbrella process, we need to
    # skip batch files that do not exist
    output_file = self.output().path
    if not os.path.exists(self.batch):
      common.shell_cmd('touch %s', output_file)
      return

    input_file = self.input()[1].path
    es = elasticsearch.Elasticsearch(self.es_host)
    index_util.start_index_transaction(es, 'druglabel', self.epoch)
    parallel.mapreduce(
      input_collection=parallel.Collection.from_sharded(input_file),
      mapper=index_util.LoadJSONMapper(self.es_host,
                                       'druglabel',
                                       'spl',
                                       self.epoch,
                                       docid_key='set_id',
                                       version_key='version'),
      reducer=parallel.NullReducer(),
      output_prefix='/tmp/loadjson.druglabel',
      num_shards=1,
      map_workers=1)
    index_util.commit_index_transaction(es, 'druglabel')
    common.shell_cmd('touch %s', output_file)
Example #7
 def run(self):
   parallel.mapreduce(
     parallel.Collection.from_sharded_list([batch.path for batch in self.input()]),
     mapper=SPLSetIDMapper(),
     reducer=parallel.ListReducer(),
     output_prefix=self.output().path,
     num_shards=16)
Example #8
  def run(self):
    files = glob.glob(self.input().path + '/*/*.txt')

    if self.loader_task == 'init':
      input_files = [f for f in files if not any(i for i in IGNORE_FILES if i in f)]
    else:
      input_files = [f for f in files if self.loader_task in f]

    # Load and cache device problem codes.
    problem_codes_reference = {}
    device_problem_codes = {}

    reader = csv.reader(open(DEVICE_PROBLEM_CODES_FILE), quoting=csv.QUOTE_NONE, delimiter='|')
    for idx, line in enumerate(reader):
      if len(line) > 1:
        problem_codes_reference[line[0]] = line[1].strip()

    reader = csv.reader(open(DEVICE_PROBLEMS_FILE), quoting=csv.QUOTE_NONE, delimiter='|')
    for idx, line in enumerate(reader):
      if len(line) > 1:
        device_problem_codes.setdefault(line[0], []).append(line[1])

    parallel.mapreduce(
      parallel.Collection.from_glob(
        input_files, parallel.CSVLineInput(quoting=csv.QUOTE_NONE, delimiter='|')),
      mapper=CSV2JSONMapper(problem_codes_reference=problem_codes_reference, device_problem_codes=device_problem_codes),
      reducer=CSV2JSONJoinReducer(),
      output_prefix=self.output().path)
Example #9
 def run(self):
   input_glob = glob.glob(SPL_S3_DIR + '/*/barcodes/otc-bars.json')
   parallel.mapreduce(
     parallel.Collection.from_sharded(self.input()[1].path),
     mapper=UpcMapper(),
     reducer=parallel.IdentityReducer(),
     output_prefix=self.output().path)
Example #10
 def run(self):
   parallel.mapreduce(
     glob.glob(self.input()[1].path + '*-of-*'),
     load_json_mapper,
     parallel.null_reducer,
     output_prefix=self.output().path,
     num_shards=1,
     map_workers=1)
Example #11
 def run(self):
   parallel.mapreduce(
     parallel.Collection.from_sharded(self.input()[1].path),
     LoadJSONMapper(),
     parallel.NullReducer(),
     output_prefix=self.output().path,
     num_shards=1,
     map_workers=1)
Example #12
 def run(self):
   harmonized_db = parallel.ShardedDB.open(self.input()[0].path).as_dict()
   db_list = [s.path for s in self.input()[1:]]
   parallel.mapreduce(
     parallel.Collection.from_sharded_list(db_list),
     mapper=MaudeAnnotationMapper(harmonized_db=harmonized_db),
     reducer=parallel.IdentityReducer(),
     output_prefix=self.output().path)
Example #13
 def run(self):
   parallel.mapreduce(
       parallel.Collection.from_glob(
         self.input().path, parallel.CSVDictLineInput(delimiter='|')),
       mapper=RXNorm2JSONMapper(),
       reducer=RXNormReducer(),
       output_prefix=self.output().path,
       num_shards=10)
Example #14
 def run(self):
   parallel.mapreduce(
       parallel.Collection.from_glob(
         self.input().path, parallel.CSVDictLineInput(delimiter='\t')),
       mapper=NDC2JSONMapper(),
       reducer=parallel.IdentityReducer(),
       output_prefix=self.output().path,
       num_shards=1)
Example #15
 def run(self):
   parallel.mapreduce(
       parallel.Collection.from_glob(
         self.input().path, parallel.JSONLineInput()),
       mapper=parallel.IdentityMapper(),
       reducer=parallel.IdentityReducer(),
       output_prefix=self.output().path,
       num_shards=1)
Example #16
 def run(self):
   db_list = [s.path for s in self.input()]
   mapreduce(
     Collection.from_sharded_list(db_list),
     mapper=JoinMapper(),
     reducer=PivotReducer(),
     output_prefix=self.output().path,
     num_shards=10)
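Unlike the other examples, this one calls mapreduce and Collection unqualified, which presumably relies on a direct import along these lines (an assumption; the module's actual import statements are not shown here):

   # Presumed import that makes the unqualified names available.
   from openfda.parallel import mapreduce, Collection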
Example #17
 def run(self):
   tables = ['owner_operator', 'contact_addresses', 'official_correspondent']
   join_keys = ['contact_id']
   parallel.mapreduce(
     parallel.Collection.from_sharded(self.input().path),
     mapper=JoinMapper(tables=tables, join_keys=join_keys),
     reducer=OwnerOperatorJoinReducer(),
     output_prefix=self.output().path,
     num_shards=10)
Example #18
 def run(self):
   file_name = join(self.input().path, CAERS_FILE)
   parallel.mapreduce(
     parallel.Collection.from_glob(file_name,
       parallel.CSVDictLineInput(delimiter=',', quoting=csv.QUOTE_MINIMAL)),
     mapper=CSV2JSONMapper(),
     reducer=CSV2JSONReducer(),
     output_prefix=self.output().path,
     num_shards=10)
Example #19
  def run(self):
    harmonized_db = parallel.ShardedDB.open(self.input()[0].path).as_dict()

    parallel.mapreduce(
      parallel.Collection.from_sharded(self.input()[1].path),
      mapper=PMAAnnotateMapper(harmonized_db=harmonized_db),
      reducer=parallel.IdentityReducer(),
      output_prefix=self.output().path,
      num_shards=10)
Example #20
 def run(self):
   common.shell_cmd('mkdir -p %s', dirname(self.output().path))
   input_files = glob.glob(self.input().path + '/*.txt')
   parallel.mapreduce(
     parallel.Collection.from_glob(
       input_files, parallel.CSVDictLineInput(delimiter='|', strip_str='\0')),
     mapper=PMAMapper(),
     reducer=parallel.IdentityReducer(),
     output_prefix=self.output().path)
Example #21
 def run(self):
   harmonized_file = self.input()[0].path
   parallel.mapreduce(
     glob.glob(self.input()[1].path + '*-of-*'),
     annotate.AnnotateMapper(harmonized_file),
     parallel.identity_reducer,
     self.output().path,
     num_shards=10,
     map_workers=2)
Example #22
 def run(self):
   tables = ['listing_estabtypes', 'estabtypes']
   join_keys = ['establishment_type_id']
   parallel.mapreduce(
     parallel.Collection.from_sharded(self.input().path),
     mapper=JoinMapper(tables=tables, join_keys=join_keys),
     reducer=JoinEstablishmentTypesReducer(),
     output_prefix=self.output().path,
     num_shards=10)
Example #23
 def run(self):
     file_name = join(self.input().path, CAERS_FILE)
     parallel.mapreduce(parallel.Collection.from_glob(
         file_name,
         parallel.CSVDictLineInput(delimiter=',',
                                   quoting=csv.QUOTE_MINIMAL)),
                        mapper=CSV2JSONMapper(),
                        reducer=CSV2JSONReducer(),
                        output_prefix=self.output().path,
                        num_shards=10)
Example #24
 def run(self):
     csv2json_db = parallel.ShardedDB.open(self.input()[1].path).as_dict()
     parallel.mapreduce(
         parallel.Collection.from_glob(self.input()[0].path,
                                       parallel.CSVDictLineInput()),
         mapper=RecallDownloaderAndMapper(csv2json_db=csv2json_db),
         reducer=parallel.IdentityReducer(),
         output_prefix=self.output().path,
         map_workers=10,
         num_shards=10)  # Do not hit fda.gov too hard here.
Example #25
 def run(self):
   common.shell_cmd('mkdir -p %s', join(BASE_DIR, 'tmp'))
   files = glob.glob(self.input().path + '/*/*.json')
   parallel.mapreduce(
     parallel.Collection.from_glob(files, parallel.JSONLineInput()),
     mapper=ParallelExportMapper(output_dir=self.output().path),
     reducer=parallel.NullReducer(),
     output_prefix=join(BASE_DIR, 'tmp'),
     output_format=parallel.NullOutput(),
     map_workers=10)
Example #26
  def run(self):
    input_db = self.input()[0].path
    harmonized_file = self.input()[1].path

    parallel.mapreduce(
      parallel.Collection.from_sharded(input_db),
      mapper=AnnotateMapper(harmonized_file),
      reducer=parallel.IdentityReducer(),
      output_prefix=self.output().path,
      num_shards=1) # TODO: improve the code to avoid having to limit number of shards to one
Example #27
  def run(self):
    input_db = self.input()[0].path
    harmonized_file = self.input()[1].path

    parallel.mapreduce(
      parallel.Collection.from_sharded(input_db),
      mapper=annotate.AnnotateMapper(harmonized_file),
      reducer=parallel.IdentityReducer(),
      output_prefix=self.output().path,
      num_shards=1)
Example #28
 def run(self):
   harmonized_db = parallel.ShardedDB.open(self.input()[0].path).as_dict()
   json_glob = glob.glob(self.input()[1].path + '/*.json')
   parallel.mapreduce(
     parallel.Collection.from_glob(json_glob, parallel.JSONLineInput()),
     mapper=MaudeAnnotationMapper(harmonized_db=harmonized_db),
     reducer=parallel.IdentityReducer(),
     output_prefix=self.output().path,
     num_shards=10,
     map_workers=5)
Example #29
    def run(self):
        input_db = self.input()[0].path
        harmonized_file = self.input()[1].path

        parallel.mapreduce(parallel.Collection.from_sharded(input_db),
                           mapper=annotate.AnnotateMapper(harmonized_file),
                           reducer=parallel.IdentityReducer(),
                           output_prefix=self.output().path,
                           num_shards=1,
                           map_workers=1)
Example #30
 def run(self):
     common.shell_cmd('mkdir -p %s', join(BASE_DIR, 'tmp'))
     files = glob.glob(self.input().path + '/*/*.json')
     parallel.mapreduce(
         parallel.Collection.from_glob(files, parallel.JSONLineInput()),
         mapper=ParallelExportMapper(output_dir=self.output().path),
         reducer=parallel.NullReducer(),
         output_prefix=join(BASE_DIR, 'tmp'),
         output_format=parallel.NullOutput(),
         map_workers=10)
Example #31
    def run(self):
        # Read the entire packaging DB in memory for speed.
        package_db = parallel.ShardedDB.open(self.input()[0].path).as_dict()

        parallel.mapreduce(
            parallel.Collection.from_sharded(self.input()[1].path),
            mapper=ProductAndPackagingMergingMapper(package_db=package_db),
            reducer=parallel.IdentityReducer(),
            output_prefix=self.output().path,
            num_shards=1)
Example #32
 def run(self):
     input_dir = dirname(self.input().path)
     for csv_filename in glob.glob('%(input_dir)s/clean-*.csv' % locals()):
         parallel.mapreduce(parallel.Collection.from_glob(
             csv_filename, parallel.CSVDictLineInput()),
                            mapper=CSV2JSONMapper(),
                            reducer=parallel.IdentityReducer(),
                            output_prefix=self.output().path,
                            num_shards=1,
                            map_workers=8)
Example #33
 def run(self):
     input_dir = self.input().path
     for xml_filename in glob.glob('%(input_dir)s/*.xml' % locals()):
         parallel.mapreduce(parallel.Collection.from_glob(
             xml_filename, parallel.XMLDictInput(depth=1)),
                            mapper=XML2JSONMapper(),
                            reducer=parallel.IdentityReducer(),
                            output_prefix=self.output().path,
                            num_shards=1,
                            map_workers=8)
Example #34
 def run(self):
     harmonized_db = parallel.ShardedDB.open(self.input()[0].path).as_dict()
     json_glob = glob.glob(self.input()[1].path + '/*.json')
     parallel.mapreduce(
         parallel.Collection.from_glob(json_glob, parallel.JSONLineInput()),
         mapper=MaudeAnnotationMapper(harmonized_db=harmonized_db),
         reducer=parallel.IdentityReducer(),
         output_prefix=self.output().path,
         num_shards=10,
         map_workers=5)
Example #35
 def run(self):
   common.shell_cmd('mkdir -p %s', dirname(self.output().path))
   input_files = glob.glob(self.input().path + '/*.txt')
   parallel.mapreduce(
     parallel.Collection.from_glob(
       input_files, parallel.CSVDictLineInput(delimiter='|',
                                              quoting=csv.QUOTE_NONE,
                                              escapechar='\\')),
     mapper=ClassificationMapper(),
     reducer=parallel.IdentityReducer(),
     output_prefix=self.output().path)
Example #36
  def run(self):
    with open(join(EXTRACTED_DIR, 'ApplicationsDocsType_Lookup.txt')) as fin:
      rows = (line.split('\t') for line in fin)
      doc_lookup = {row[0]: row[1].rstrip() for row in rows}

    parallel.mapreduce(
      parallel.Collection.from_glob(
        join(self.input().path, 'ApplicationDocs.txt'), parallel.CSVDictLineInput(delimiter='\t')),
      mapper=ApplicationsDocs2JSONMapper(doc_lookup=doc_lookup),
      reducer=parallel.ListReducer(),
      output_prefix=self.output().path)
Example #37
    def run(self):
        tables = ['intermediate_owner_operator', 'registration', 'us_agent']
        join_keys = ['reg_key']
        db_list = [s.path for s in self.input()]

        parallel.mapreduce(parallel.Collection.from_sharded_list(db_list),
                           mapper=JoinMapper(tables=tables,
                                             join_keys=join_keys),
                           reducer=RegistrationJoinReducer(),
                           output_prefix=self.output().path,
                           num_shards=10)
Example #38
 def run(self):
     tables = [
         'owner_operator', 'contact_addresses', 'official_correspondent'
     ]
     join_keys = ['contact_id']
     parallel.mapreduce(parallel.Collection.from_sharded(self.input().path),
                        mapper=JoinMapper(tables=tables,
                                          join_keys=join_keys),
                        reducer=OwnerOperatorJoinReducer(),
                        output_prefix=self.output().path,
                        num_shards=10)
Example #39
  def run(self):
    tables = ['intermediate_owner_operator', 'registration', 'us_agent']
    join_keys = ['reg_key']
    db_list = [s.path for s in self.input()]

    parallel.mapreduce(
      parallel.Collection.from_sharded_list(db_list),
      mapper=JoinMapper(tables=tables, join_keys=join_keys),
      reducer=RegistrationJoinReducer(),
      output_prefix=self.output().path,
      num_shards=10)
Example #40
    def run(self):
        input_shards = []
        input_dir = self.input().path
        for xml_filename in glob.glob(input_dir + '/*.xml'):
            input_shards.append(xml_filename)

        parallel.mapreduce(parallel.Collection.from_list(input_shards),
                           mapper=XML2JSONMapper(),
                           reducer=parallel.IdentityReducer(),
                           output_prefix=self.output().path,
                           num_shards=len(input_shards))
Example #41
 def run(self):
     common.shell_cmd('mkdir -p %s', dirname(self.output().path))
     input_files = glob.glob(self.input().path + '/*.txt')
     parallel.mapreduce(parallel.Collection.from_glob(
         input_files,
         parallel.CSVDictLineInput(delimiter='|',
                                   quoting=csv.QUOTE_NONE,
                                   escapechar='\\')),
                        mapper=ClassificationMapper(),
                        reducer=parallel.IdentityReducer(),
                        output_prefix=self.output().path)
Example #42
 def run(self):
   input_dir = self.input().path
   for xml_filename in glob.glob('%(input_dir)s/*.xml' % locals()):
     parallel.mapreduce(
       input_collection=parallel.Collection.from_glob(xml_filename,
                                                      parallel.XMLDictInput),
       mapper=XML2JSONMapper(),
       reducer=parallel.IdentityReducer(),
       output_prefix=self.output().path,
       num_shards=1,
       map_workers=1)
Example #43
    def run(self):
        with open(join(EXTRACTED_DIR, 'MarketingStatus_Lookup.txt')) as fin:
            rows = (line.split('\t') for line in fin)
            doc_lookup = {row[0]: row[1] for row in rows}

        parallel.mapreduce(parallel.Collection.from_glob(
            join(self.input().path, 'TE.txt'),
            parallel.CSVDictLineInput(delimiter='\t')),
                           mapper=TE2JSONMapper(doc_lookup=doc_lookup),
                           reducer=parallel.IdentityReducer(),
                           output_prefix=self.output().path)
Example #44
 def _run(self):
  json_dir = self.input()['data'].path
  input_glob = glob.glob(json_dir + '/*.json')
  for file_name in input_glob:
    logging.info('Running file %s', file_name)
    parallel.mapreduce(
      parallel.Collection.from_glob(file_name, parallel.JSONLineInput()),
      mapper=index_util.ReloadJSONMapper(config.es_host(), self.index_name, 'maude'),
      reducer=parallel.IdentityReducer(),
      output_format=parallel.NullOutput(),
      output_prefix='/tmp/loadjson.' + self.index_name)
Example #45
    def run(self):
        tables = ["remapped_registration_listing", "listing_pcd", "listing_proprietary_name"]
        join_keys = ["key_val"]

        parallel.mapreduce(
            parallel.Collection.from_sharded(self.input().path),
            mapper=JoinMapper(tables=tables, join_keys=join_keys),
            reducer=ListingJoinReducer(),
            output_prefix=self.output().path,
            num_shards=10,
        )
Example #46
    def run(self):
        tables = [
            'remapped_registration_listing', 'listing_pcd',
            'listing_proprietary_name'
        ]
        join_keys = ['key_val']

        parallel.mapreduce(parallel.Collection.from_sharded(self.input().path),
                           mapper=JoinMapper(tables=tables,
                                             join_keys=join_keys),
                           reducer=ListingJoinReducer(),
                           output_prefix=self.output().path,
                           num_shards=10)
Example #47
 def _run(self):
     json_dir = self.input()['data'].path
     input_glob = glob.glob(json_dir + '/*.json')
     for file_name in input_glob:
         logging.info('Running file %s', file_name)
         parallel.mapreduce(
             parallel.Collection.from_glob(file_name,
                                           parallel.JSONLineInput()),
             mapper=index_util.ReloadJSONMapper(config.es_host(),
                                                self.index_name, 'maude'),
             reducer=parallel.IdentityReducer(),
             output_format=parallel.NullOutput(),
             output_prefix='/tmp/loadjson.' + self.index_name)
Example #48
    def run(self):
        tables = [
            'intermediate_registration_listing',
            'intermediate_establishment_listing', 'intermediate_registration'
        ]
        join_keys = ['reg_key']
        db_list = [s.path for s in self.input()]

        parallel.mapreduce(parallel.Collection.from_sharded_list(db_list),
                           mapper=JoinMapper(tables=tables,
                                             join_keys=join_keys),
                           reducer=JoinAllReducer(),
                           output_prefix=self.output().path,
                           num_shards=10)
Example #49
  def run(self):
    files = glob.glob(self.input().path + '/*/*.txt')

    if self.loader_task == 'init':
      input_files = [f for f in files if not any(i for i in IGNORE_FILES if i in f)]
    else:
      input_files = [f for f in files if self.loader_task in f]

    parallel.mapreduce(
      parallel.Collection.from_glob(
        input_files, parallel.CSVLineInput(quoting=csv.QUOTE_NONE, delimiter='|')),
      mapper=CSV2JSONMapper(),
      reducer=CSV2JSONJoinReducer(),
      output_prefix=self.output().path)
Example #50
    def test_identity(self):
        os.system('rm -rf /tmp/test-identity*')
        source_files = ['/tmp/test-identity-%d' % i for i in range(10)]
        for f in source_files:
            os.system('touch "%s"' % f)

        source = parallel.Collection(source_files, parallel.FilenameInput)
        parallel.mapreduce(source, parallel.IdentityMapper(),
                           parallel.IdentityReducer(), '/tmp/test-identity', 2)

        results = sorted(list(parallel.ShardedDB.open('/tmp/test-identity/')))
        for i in range(10):
            key, value = results[i]
            assert key == '/tmp/test-identity-%d' % i, results[i]
            assert value == ''
Example #51
    def run(self):
        input_shards = []
        input_dir = self.input().path
        for subdir, dirs, files in os.walk(input_dir):
            for file in files:
                if file.endswith('.xml'):
                    if not file in NULLIFIED:
                        input_shards.append(os.path.join(subdir, file))
                    else:
                        logging.info("Skipping a nullified case: " + file)

        parallel.mapreduce(parallel.Collection.from_list(input_shards),
                           mapper=XML2JSONMapper(),
                           reducer=parallel.IdentityReducer(),
                           output_prefix=self.output().path)
Example #52
    def run(self):
        # AERS_SGML_2007q4.ZIP has files in sqml
        filenames = glob.glob(self.input().path + '/AERS_SGML_*/s[gq]ml/*.SGM')
        filenames.extend(glob.glob(self.input().path +
                                   '/FAERS_XML*/xml/*.xml'))

        input_shards = []
        for filename in filenames:
            if 'test' in filename.lower():
                continue
            logging.info('Adding input file to pool: %s', filename)
            input_shards.append(filename)

        parallel.mapreduce(input_shards, xml_to_json.extract_safety_reports,
                           xml_to_json.merge_safety_reports,
                           self.output().path, 10)
Example #53
    def test_sum(self):
        os.system('rm -rf /tmp/test-sum*')
        source_files = ['/tmp/test-sum-%d' % i for i in range(10)]
        for filename in source_files:
            with open(filename, 'w') as f:
                print('\n'.join(str(i) for i in range(100)), file=f)

        source = parallel.Collection(source_files, parallel.LineInput)
        parallel.mapreduce(source, parallel.IdentityMapper(),
                           parallel.SumReducer(), '/tmp/test-sum', 5)

        results = dict(parallel.ShardedDB.open('/tmp/test-sum/'))
        for i in range(100):
            assert str(i) in results, str(i)
            value = results[str(i)]
            self.assertEqual(value, str(i * 10.0))
Example #54
    def run(self):
        logging.info('Pipelining...')
        # AERS_SGML_2007q4.ZIP has files in sqml
        sgml_path = '/AERS_SGML_*/s[gq]ml/*.SGM'
        xml_path = '/FAERS_XML*/[Xx][Mm][Ll]/*.xml'
        filenames = glob.glob(self.input().path + sgml_path)
        filenames.extend(glob.glob(self.input().path + xml_path))

        input_shards = []
        for filename in filenames:
            if 'test' in filename.lower():
                continue
            logging.info('Adding input file to pool: %s', filename)
            input_shards.append(filename)

        report_counts = parallel.mapreduce(
            parallel.Collection.from_list(input_shards),
            xml_to_json.ExtractSafetyReportsMapper(),
            xml_to_json.MergeSafetyReportsReducer(),
            self.output().path, 10)

        combined_counts = collections.defaultdict(int)
        for rc in report_counts:
            for timestamp, count in rc.items():
                combined_counts[timestamp] += count

        print('----REPORT COUNTS----')
        for timestamp, count in sorted(combined_counts.items()):
            print('>> ', timestamp, count)
Example #55
  def run(self):
    # AERS_SGML_2007q4.ZIP has files in sqml

    filenames = []
    for input in self.input():
      sgml_path = '/s[gq]ml/*.SGM'
      xml_path = '/[Xx][Mm][Ll]/*.xml'
      logging.info('Checking for inputs in: %s', input.path)
      filenames.extend(glob.glob(input.path + sgml_path))
      filenames.extend(glob.glob(input.path + xml_path))

    assert len(filenames) > 0, 'No files to process for quarter? %s' % self.quarter

    input_shards = []
    for filename in filenames:
      if 'test' in filename.lower():
        continue
      logging.info('Adding input file to pool: %s', filename)
      input_shards.append(filename)

    report_counts = parallel.mapreduce(
      parallel.Collection.from_list(input_shards),
      xml_to_json.ExtractSafetyReportsMapper(),
      xml_to_json.MergeSafetyReportsReducer(),
      self.output().path,
      num_shards=16)

    combined_counts = collections.defaultdict(int)
    for rc in report_counts:
      for timestamp, count in rc.items():
        combined_counts[timestamp] += count

    print('----REPORT COUNTS----')
    for timestamp, count in sorted(combined_counts.items()):
      print('>> ', timestamp, count)
Example #56
  def run(self):
    applications_db = self.input()[0].path
    products_db = self.input()[1].path
    applications_docs_db = self.input()[2].path
    submissions_db = self.input()[3].path
    submissions_property_type_db = self.input()[4].path
    marketing_status = self.input()[5].path
    te_db = self.input()[6].path

    parallel.mapreduce(
      parallel.Collection.from_sharded(applications_db),
      mapper=MergeAllMapper(applications_db, products_db, applications_docs_db, submissions_db,
                            submissions_property_type_db, marketing_status, te_db),
      reducer=parallel.IdentityReducer(),
      output_prefix=self.output().path,
      map_workers=1,
      num_shards=1) # TODO: improve the code to avoid having to limit number of shards to one
Example #57
    def run(self):
        ndc_spl_id_index = {}
        ndc_db = self.input()[1].path
        logging.info('Joining data from NDC DB: %s', ndc_db)
        db = parallel.ShardedDB.open(ndc_db)
        db_iter = db.range_iter(None, None)

        # We want each SPL ID that is in the NDC file so that we always use the
        # same SPL file for both ID and SET_ID based joins.
        for (key, val) in db_iter:
            ndc_spl_id_index[val['id']] = True

        parallel.mapreduce(parallel.Collection.from_sharded_list(
            [batch.path for batch in self.input()[0]]),
                           mapper=SPLSetIDMapper(index_db=ndc_spl_id_index),
                           reducer=parallel.ListReducer(),
                           output_prefix=self.output().path,
                           num_shards=16)
Example #58
    def run(self):
        es = elasticsearch.Elasticsearch(self.es_host)
        index_util.start_index_transaction(es, 'drugevent', self.epoch)

        parallel.mapreduce(parallel.Collection.from_sharded(
            self.input()[1].path),
                           index_util.LoadJSONMapper(self.es_host,
                                                     'drugevent',
                                                     'safetyreport',
                                                     self.epoch,
                                                     docid_key='@case_number',
                                                     version_key='@version'),
                           parallel.NullReducer(),
                           output_prefix='/tmp/loadjson.drugevent',
                           num_shards=1,
                           map_workers=1)

        index_util.commit_index_transaction(es, 'drugevent')
Example #59
    def run(self):
        files = glob.glob(self.input().path + '/*/*.txt')
        device_problems = glob.glob(self.input().path +
                                    '/*/foidevproblem*.txt')
        patient_problems = glob.glob(self.input().path +
                                     '/*/patientproblemcode*.txt')

        if self.loader_task == 'init':
            input_files = [
                f for f in files if not any(i for i in IGNORE_FILES if i in f)
            ]
        else:
            input_files = [f for f in files if self.loader_task in f
                           ] + device_problems + patient_problems

        # Load and cache device problem codes.
        device_problem_codes_ref = {}
        reader = csv.reader(open(DEVICE_PROBLEM_CODES_FILE),
                            quoting=csv.QUOTE_NONE,
                            delimiter='|')
        for idx, line in enumerate(reader):
            if len(line) > 1:
                device_problem_codes_ref[line[0]] = line[1].strip()

        # Load and cache patient problem codes.
        patient_problem_codes_ref = {}
        reader = csv.reader(open(PATIENT_PROBLEM_CODES_FILE),
                            quoting=csv.QUOTE_NONE,
                            delimiter='|')
        for idx, line in enumerate(reader):
            if len(line) > 1:
                patient_problem_codes_ref[line[0]] = line[1].strip()

        parallel.mapreduce(
            parallel.Collection.from_glob(
                input_files,
                parallel.CSVSplitLineInput(quoting=csv.QUOTE_NONE,
                                           delimiter='|')),
            mapper=CSV2JSONMapper(
                device_problem_codes_ref=device_problem_codes_ref,
                patient_problem_codes_ref=patient_problem_codes_ref),
            reducer=CSV2JSONJoinReducer(),
            output_prefix=self.output().path)
Example #60
 def run(self):
     output_file = self.output().path
     input_file = self.input()[1].path
     es = elasticsearch.Elasticsearch(self.es_host)
     index_util.start_index_transaction(es, 'recall', self.epoch)
     parallel.mapreduce(
         input_collection=parallel.Collection.from_sharded(input_file),
         mapper=index_util.LoadJSONMapper(self.es_host,
                                          'recall',
                                          'enforcementreport',
                                          self.epoch,
                                          docid_key='@id',
                                          version_key='@version'),
         reducer=parallel.NullReducer(),
         output_prefix='/tmp/loadjson.recall',
         num_shards=1,
         map_workers=1)
     index_util.commit_index_transaction(es, 'recall')
     common.shell_cmd('touch %s', output_file)