def _run(self):
    """Bulk-load sharded JSON output into Elasticsearch, then stamp metadata.

    Runs a mapreduce over the sharded JSON produced by the upstream task,
    records the run date in the metadata index, optionally optimizes the
    index, and stamps the metadata index a second time (kept deliberately —
    see note below).
    """
    input_dir = self.input()['data'].path
    json_mapper = LoadJSONMapper(
        config.es_host(),
        index_name=self.index_name,
        type_name=self.type_name,
        docid_key=self.docid_key,
        incremental=self.use_checksum)

    parallel.mapreduce(
        parallel.Collection.from_sharded(input_dir),
        mapper=json_mapper,
        reducer=parallel.IdentityReducer(),
        output_format=parallel.NullOutput(),
        map_workers=self.load_json_workers,
        num_shards=1,
        output_prefix=config.tmp_dir('%s/load-json' % self.index_name))

    # Record the processing date in the metadata index.
    elasticsearch_requests.update_process_datetime(
        config.es_client(), self.index_name,
        arrow.utcnow().format('YYYY-MM-DD'))

    # Optimize the index if the task requested it.
    if self.optimize_index:
        optimize_index(self.index_name, wait_for_merge=False)

    # Stamp the metadata index a second time. Trying to solve the mystery of
    # missing "last_update_date" entries...
    elasticsearch_requests.update_process_datetime(
        config.es_client(), self.index_name,
        arrow.utcnow().format('YYYY-MM-DD'))
def _run(self):
    """Index a single JSON document under its date key and under 'current'.

    Loads the JSON payload produced by the upstream task, indexes it once
    keyed by the date encoded in the input path and once under the id
    'current', then records today's date in the metadata index.
    """
    input_path = self.input()['data'].path
    # Use a context manager so the file handle is closed deterministically;
    # the original json.load(open(...)) leaked the handle.
    with open(input_path) as json_file:
        json_data = json.load(json_file)

    # The date is encoded two directory levels above the data file —
    # presumably .../<date>/<subdir>/<file>; verify against the upstream task.
    date_str = basename(dirname(dirname(input_path)))
    index_dict = {
        'index': self.index_name,
        'doc_type': self.type_name,
        'id': date_str,
        'body': json_data
    }

    es_client = elasticsearch.Elasticsearch(config.es_host(), timeout=60)

    # We put the same document into the index twice.
    # Once with a key as its date
    # Once with a key of `current`
    # The API will serve the current key as its default response.
    # How we serve archive records is TBD, but the data is preserved this way.
    es_client.index(**index_dict)
    index_dict['id'] = 'current'
    es_client.index(**index_dict)

    elasticsearch_requests.update_process_datetime(
        config.es_client(), self.index_name,
        arrow.utcnow().format('YYYY-MM-DD'),
        arrow.utcnow().format('YYYY-MM-DD'))
def _run(self):
    """Load sharded JSON into Elasticsearch and update the metadata index.

    Runs the load as a mapreduce over the sharded upstream output, records
    today's date in the metadata index, and optionally optimizes the index.
    """
    shard_dir = self.input()['data'].path
    load_mapper = LoadJSONMapper(
        config.es_host(),
        index_name=self.index_name,
        type_name=self.type_name,
        docid_key=self.docid_key,
        incremental=self.use_checksum)

    parallel.mapreduce(
        parallel.Collection.from_sharded(shard_dir),
        mapper=load_mapper,
        reducer=parallel.IdentityReducer(),
        output_format=parallel.NullOutput(),
        map_workers=self.load_json_workers,
        num_shards=1,
        output_prefix=config.tmp_dir('%s/load-json' % self.index_name))

    # Record the processing date in the metadata index.
    elasticsearch_requests.update_process_datetime(
        config.es_client(), self.index_name,
        arrow.utcnow().format('YYYY-MM-DD'))

    # Optimize the index if the task requested it.
    if self.optimize_index:
        optimize_index(self.index_name, wait_for_merge=False)
def _run(self):
    """Load sharded JSON into Elasticsearch, stamp metadata, refresh, optimize.

    Runs the load as a mapreduce, records both the run date and the data's
    last-update date in the metadata index, refreshes the index so the new
    documents become searchable, and optionally optimizes it.
    """
    shard_dir = self.input()['data'].path
    load_mapper = LoadJSONMapper(
        config.es_host(),
        index_name=self.index_name,
        type_name=self.type_name,
        docid_key=self.docid_key,
        incremental=self.use_checksum)

    parallel.mapreduce(
        parallel.Collection.from_sharded(shard_dir),
        mapper=load_mapper,
        reducer=parallel.IdentityReducer(),
        output_format=parallel.NullOutput(),
        map_workers=self.load_json_workers,
        num_shards=1,
        output_prefix=config.tmp_dir('%s/load-json' % self.index_name))

    # last_update_date may be a plain value or a zero-arg callable; resolve it.
    if callable(self.last_update_date):
        update_date = self.last_update_date()
    else:
        update_date = self.last_update_date

    # Record both dates in the metadata index.
    elasticsearch_requests.update_process_datetime(
        config.es_client(), self.index_name,
        last_run_date=arrow.utcnow().format('YYYY-MM-DD'),
        last_update_date=update_date)

    # Refresh the index to make the documents visible to searches.
    refresh_index(self.index_name)

    # Optimize the index if the task requested it.
    if self.optimize_index:
        optimize_index(self.index_name, wait_for_merge=False)
def _run(self):
    """Load sharded JSON into Elasticsearch and record the run date.

    Runs the load as a mapreduce over the sharded upstream output, then
    records today's date in the metadata index.
    """
    shard_dir = self.input()['data'].path
    load_mapper = LoadJSONMapper(
        config.es_host(),
        index_name=self.index_name,
        type_name=self.type_name,
        docid_key=self.docid_key,
        incremental=self.use_checksum)

    parallel.mapreduce(
        parallel.Collection.from_sharded(shard_dir),
        mapper=load_mapper,
        reducer=parallel.IdentityReducer(),
        output_format=parallel.NullOutput(),
        map_workers=self.load_json_workers,
        output_prefix=config.tmp_dir('%s/load-json' % self.index_name))

    # Record the processing date in the metadata index.
    elasticsearch_requests.update_process_datetime(
        config.es_client(), self.index_name,
        arrow.utcnow().format('YYYY-MM-DD'))
def _run(self):
    """Index a single JSON document under its date key and under 'current'.

    Loads the JSON payload produced by the upstream task, indexes it once
    keyed by the date encoded in the input path and once under the id
    'current', then records today's date in the metadata index.
    """
    input_path = self.input()['data'].path
    # Use a context manager so the file handle is closed deterministically;
    # the original json.load(open(...)) leaked the handle.
    with open(input_path) as json_file:
        json_data = json.load(json_file)

    # The date is encoded two directory levels above the data file —
    # presumably .../<date>/<subdir>/<file>; verify against the upstream task.
    date_str = basename(dirname(dirname(input_path)))
    index_dict = {
        'index': self.index_name,
        'doc_type': self.type_name,
        'id': date_str,
        'body': json_data
    }

    es_client = elasticsearch.Elasticsearch(config.es_host())

    # We put the same document into the index twice.
    # Once with a key as its date
    # Once with a key of `current`
    # The API will serve the current key as its default response.
    # How we serve archive records is TBD, but the data is preserved this way.
    es_client.index(**index_dict)
    index_dict['id'] = 'current'
    es_client.index(**index_dict)

    elasticsearch_requests.update_process_datetime(
        config.es_client(), self.index_name,
        arrow.utcnow().format('YYYY-MM-DD'))