def _run(self):
  """Bulk-load sharded JSON output into Elasticsearch, then record run metadata.

  Reads the sharded JSON directory produced by the upstream 'data' task,
  maps each document into ES via LoadJSONMapper, stamps the metadata index
  with today's (UTC) date, and optionally optimizes the index.
  """
  shard_dir = self.input()['data'].path

  es_loader = LoadJSONMapper(
    config.es_host(),
    index_name=self.index_name,
    type_name=self.type_name,
    docid_key=self.docid_key,
    incremental=self.use_checksum
  )

  parallel.mapreduce(
    parallel.Collection.from_sharded(shard_dir),
    mapper=es_loader,
    reducer=parallel.IdentityReducer(),
    output_format=parallel.NullOutput(),
    map_workers=self.load_json_workers,
    num_shards=1,
    output_prefix=config.tmp_dir('%s/load-json' % self.index_name)
  )

  # Update the metadata index with the date of this load.
  elasticsearch_requests.update_process_datetime(
    config.es_client(), self.index_name,
    arrow.utcnow().format('YYYY-MM-DD')
  )

  # Optimize the index, if requested.
  if self.optimize_index:
    optimize_index(self.index_name, wait_for_merge=False)

  # Update the metadata index again. Trying to solve mystery of missing
  # "last_update_date" entries...
  elasticsearch_requests.update_process_datetime(
    config.es_client(), self.index_name,
    arrow.utcnow().format('YYYY-MM-DD')
  )
def _run(self):
  """Index the upstream task's sharded JSON into Elasticsearch.

  After the map/reduce load completes, the metadata index is stamped with
  today's (UTC) date; the index is then optimized when configured to do so.
  """
  input_dir = self.input()['data'].path

  json_mapper = LoadJSONMapper(
    config.es_host(),
    index_name=self.index_name,
    type_name=self.type_name,
    docid_key=self.docid_key,
    incremental=self.use_checksum
  )

  parallel.mapreduce(
    parallel.Collection.from_sharded(input_dir),
    mapper=json_mapper,
    reducer=parallel.IdentityReducer(),
    output_format=parallel.NullOutput(),
    map_workers=self.load_json_workers,
    num_shards=1,
    output_prefix=config.tmp_dir('%s/load-json' % self.index_name)
  )

  # Record the date of this load in the metadata index.
  elasticsearch_requests.update_process_datetime(
    config.es_client(), self.index_name,
    arrow.utcnow().format('YYYY-MM-DD')
  )

  # Optimize the index, if requested.
  if self.optimize_index:
    optimize_index(self.index_name, wait_for_merge=False)
def _run(self):
  """Load sharded JSON into Elasticsearch, stamp run/update metadata,
  refresh the index, and optionally optimize it.
  """
  shard_dir = self.input()['data'].path

  load_mapper = LoadJSONMapper(
    config.es_host(),
    index_name=self.index_name,
    type_name=self.type_name,
    docid_key=self.docid_key,
    incremental=self.use_checksum
  )

  parallel.mapreduce(
    parallel.Collection.from_sharded(shard_dir),
    mapper=load_mapper,
    reducer=parallel.IdentityReducer(),
    output_format=parallel.NullOutput(),
    map_workers=self.load_json_workers,
    num_shards=1,
    output_prefix=config.tmp_dir('%s/load-json' % self.index_name)
  )

  # `last_update_date` may be either a plain value or a zero-arg callable.
  if callable(self.last_update_date):
    update_date = self.last_update_date()
  else:
    update_date = self.last_update_date

  # Update the metadata index with run and data-update dates.
  elasticsearch_requests.update_process_datetime(
    config.es_client(), self.index_name,
    last_run_date=arrow.utcnow().format('YYYY-MM-DD'),
    last_update_date=update_date
  )

  # Refresh the index to make the documents visible to searches.
  refresh_index(self.index_name)

  # Optimize the index, if requested.
  if self.optimize_index:
    optimize_index(self.index_name, wait_for_merge=False)
def _run(self):
  """Load the upstream sharded JSON into Elasticsearch and record the
  load date in the metadata index.
  """
  source_dir = self.input()['data'].path

  doc_mapper = LoadJSONMapper(
    config.es_host(),
    index_name=self.index_name,
    type_name=self.type_name,
    docid_key=self.docid_key,
    incremental=self.use_checksum
  )

  parallel.mapreduce(
    parallel.Collection.from_sharded(source_dir),
    mapper=doc_mapper,
    reducer=parallel.IdentityReducer(),
    output_format=parallel.NullOutput(),
    map_workers=self.load_json_workers,
    output_prefix=config.tmp_dir('%s/load-json' % self.index_name)
  )

  # Record the date of this load in the metadata index.
  elasticsearch_requests.update_process_datetime(
    config.es_client(), self.index_name,
    arrow.utcnow().format('YYYY-MM-DD')
  )
import urlparse import arrow import simplejson as json from openfda.tasks import DependencyTriggeredTask from openfda import common, config, parallel, spl from openfda.annotation_table import unii_harmonization from openfda.spl import process_barcodes, extract RUN_DIR = dirname(dirname(os.path.abspath(__file__))) data_dir = config.data_dir('harmonization') BATCH_DATE = arrow.utcnow().ceil('week').format('YYYYMMDD') BASE_DIR = config.data_dir('harmonization/batches/%s' % BATCH_DATE) SPL_S3_DIR = config.data_dir('spl/s3_sync') TMP_DIR = config.tmp_dir() common.shell_cmd('mkdir -p %s', data_dir) common.shell_cmd('mkdir -p %s', BASE_DIR) common.shell_cmd('mkdir -p %s', TMP_DIR) SPL_SET_ID_INDEX = join(BASE_DIR, 'spl_index.db') DAILYMED_PREFIX = 'ftp://public.nlm.nih.gov/nlmdata/.dailymed/' PHARM_CLASS_DOWNLOAD = \ DAILYMED_PREFIX + 'pharmacologic_class_indexing_spl_files.zip' RXNORM_DOWNLOAD = \ DAILYMED_PREFIX + 'rxnorm_mappings.zip' NDC_DOWNLOAD_PAGE = \