Example #1
def run(self):
    # Convert each tab-delimited row of SubmissionPropertyType.txt to JSON.
    parallel.mapreduce(
        parallel.Collection.from_glob(
            join(self.input().path, 'SubmissionPropertyType.txt'),
            parallel.CSVDictLineInput(delimiter='\t')),
        mapper=SubmissionPropertyType2JSONMapper(),
        reducer=parallel.ListReducer(),
        output_prefix=self.output().path)
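The mapper class itself is not shown in this listing. As a minimal sketch, assuming the openFDA-style mapper interface in which map(key, value, output) is called once per parsed row, a mapper for this pipeline could have roughly this shape; the class body and the lower-casing transformation are illustrative, not the project's actual implementation.

# Sketch only: the real SubmissionPropertyType2JSONMapper is not shown above.
# Assumes `value` is one row of SubmissionPropertyType.txt parsed into a dict
# by parallel.CSVDictLineInput(delimiter='\t').
class SubmissionPropertyType2JSONMapperSketch:
    def map(self, key, value, output):
        # Lower-case the TSV column names to make JSON-friendly keys
        # (illustrative transformation only).
        output.add(key, {k.lower(): v for k, v in value.items()})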
Example #2
def run(self):
    # Read the upstream batches as a sharded collection and write 16
    # output shards.
    parallel.mapreduce(
        parallel.Collection.from_sharded_list(
            [batch.path for batch in self.input()]),
        mapper=SPLSetIDMapper(),
        reducer=parallel.ListReducer(),
        output_prefix=self.output().path,
        num_shards=16)
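For context, these run() methods appear to be luigi task methods: self.input() and self.output() are the targets wired up by the task's requires() and output(). A minimal, self-contained skeleton of that shape might look like the following; the task name and target path are made up for illustration.

import luigi

# Hypothetical task skeleton; the real tasks' requires()/output() are not
# shown in this listing.
class ExampleTask(luigi.Task):
    def output(self):
        # Made-up output path for illustration.
        return luigi.LocalTarget('./data/example-output.json')

    def run(self):
        # The snippets above read upstream targets via self.input() and
        # write under self.output().path.
        with self.output().open('w') as out:
            out.write('{}')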
Example #3
def run(self):
    # Build an in-memory lookup of document-type ID -> name from the
    # tab-delimited lookup table.
    with open(join(EXTRACTED_DIR, 'ApplicationsDocsType_Lookup.txt')) as fin:
        rows = (line.split('\t') for line in fin)
        doc_lookup = {row[0]: row[1].rstrip() for row in rows}

    parallel.mapreduce(
        parallel.Collection.from_glob(
            join(self.input().path, 'ApplicationDocs.txt'),
            parallel.CSVDictLineInput(delimiter='\t')),
        mapper=ApplicationsDocs2JSONMapper(doc_lookup=doc_lookup),
        reducer=parallel.ListReducer(),
        output_prefix=self.output().path)
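The lookup dict built in run() is injected into the mapper through its constructor. A sketch of what that might look like on the mapper side, assuming the same map(key, value, output) interface; the column name 'ApplicationDocsTypeID' is an assumption for illustration.

# Sketch only: the real ApplicationsDocs2JSONMapper is not shown above.
class ApplicationsDocs2JSONMapperSketch:
    def __init__(self, doc_lookup):
        # Map of document-type ID -> human-readable name, built in run().
        self.doc_lookup = doc_lookup

    def map(self, key, value, output):
        # Replace the numeric type code with its name from the lookup table
        # (column name 'ApplicationDocsTypeID' is assumed).
        type_id = value.get('ApplicationDocsTypeID', '')
        value['ApplicationDocsType'] = self.doc_lookup.get(type_id, 'UNKNOWN')
        output.add(key, value)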
Example #4
def run(self):
    ndc_spl_id_index = {}
    ndc_db = self.input()[1].path
    logging.info('Joining data from NDC DB: %s', ndc_db)
    db = parallel.ShardedDB.open(ndc_db)
    db_iter = db.range_iter(None, None)

    # We want each SPL ID that is in the NDC file so that we always use the
    # same SPL file for both ID and SET_ID based joins.
    for _key, val in db_iter:
        ndc_spl_id_index[val['id']] = True

    parallel.mapreduce(
        parallel.Collection.from_sharded_list(
            [batch.path for batch in self.input()[0]]),
        mapper=SPLSetIDMapper(index_db=ndc_spl_id_index),
        reducer=parallel.ListReducer(),
        output_prefix=self.output().path,
        num_shards=16)
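The index built from the NDC DB acts as a membership filter inside the mapper. A sketch of that keep-if-in-index pattern, assuming the same mapper interface; the record fields 'id' and 'set_id' are assumptions, and only the filtering intent comes from the comment in run() above.

# Sketch only: the real SPLSetIDMapper is not shown above.
class SPLSetIDMapperSketch:
    def __init__(self, index_db):
        # SPL IDs present in the NDC data, built in run().
        self.index_db = index_db

    def map(self, key, value, output):
        # Emit only SPL records whose ID appears in the NDC-derived index,
        # so ID- and SET_ID-based joins draw from the same SPL files.
        if value.get('id') in self.index_db:
            output.add(value.get('set_id', key), value)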