def _run(self):
  json_data = json.load(open(self.input()['data'].path))
  date_str = basename(dirname(dirname(self.input()['data'].path)))

  index_dict = {
    'index': self.index_name,
    'doc_type': self.type_name,
    'id': date_str,
    'body': json_data
  }

  es_client = elasticsearch.Elasticsearch(config.es_host(), timeout=60)

  # We put the same document into the index twice:
  # once keyed by its date and once keyed as `current`.
  # The API will serve the `current` key as its default response.
  # How we serve archive records is TBD, but the data is preserved this way.
  es_client.index(**index_dict)
  index_dict['id'] = 'current'
  es_client.index(**index_dict)

  elasticsearch_requests.update_process_datetime(
    config.es_client(), self.index_name,
    arrow.utcnow().format('YYYY-MM-DD'),
    arrow.utcnow().format('YYYY-MM-DD'))
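# A minimal sketch (not part of the pipeline above) of how a reader could pick up
# the default document written by _run(); it assumes an elasticsearch-py client
# that still accepts `doc_type`, and the index/type names are whatever the task
# was configured with.
def _get_current_doc(index_name, type_name):
  es_client = elasticsearch.Elasticsearch(config.es_host())
  # 'current' is the well-known id written alongside the date-keyed copy.
  return es_client.get(index=index_name, doc_type=type_name, id='current')['_source']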
def optimize_index(index_name, wait_for_merge=1):
  # The elasticsearch client throws a timeout error when waiting for a force
  # merge (formerly the optimize command) to finish, so we use requests instead.
  logging.info('Optimizing: %s (this may take a while)', index_name)
  resp = requests.post('http://%s/%s/_forcemerge?max_num_segments=1' %
                       (config.es_host(), index_name),
                       timeout=100000)
  resp.raise_for_status()
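# A hedged sketch, not part of the original module: one way to confirm the force
# merge took effect is to count segments per shard copy via the _segments API.
# The helper name and its use here are illustrative only.
def _segment_counts(index_name):
  resp = requests.get('http://%s/%s/_segments' % (config.es_host(), index_name),
                      timeout=60)
  resp.raise_for_status()
  shards = resp.json()['indices'][index_name]['shards']
  # Map each shard id to the number of segments held by each of its copies.
  return {shard_id: [len(copy['segments']) for copy in copies]
          for shard_id, copies in shards.items()}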
def _schema(self):
  return ResetElasticSearch(
    target_host=config.es_host(),
    target_index_name=self.index_name,
    target_type_name=self.type_name,
    target_mapping_file=self.mapping_file,
    delete_index=not self.use_checksum)
def _run(self):
  json_dir = self.input()['data'].path

  mapper = LoadJSONMapper(
    config.es_host(),
    index_name=self.index_name,
    type_name=self.type_name,
    docid_key=self.docid_key,
    incremental=self.use_checksum)

  parallel.mapreduce(
    parallel.Collection.from_sharded(json_dir),
    mapper=mapper,
    reducer=parallel.IdentityReducer(),
    output_format=parallel.NullOutput(),
    map_workers=self.load_json_workers,
    num_shards=1,
    output_prefix=config.tmp_dir('%s/load-json' % self.index_name))

  # Update the metadata index.
  elasticsearch_requests.update_process_datetime(
    config.es_client(), self.index_name,
    last_run_date=arrow.utcnow().format('YYYY-MM-DD'),
    last_update_date=self.last_update_date()
    if callable(self.last_update_date) else self.last_update_date)

  # Refresh the index to make the documents visible to searches.
  refresh_index(self.index_name)

  # Optimize the index, if requested.
  if self.optimize_index:
    optimize_index(self.index_name, wait_for_merge=False)
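# For orientation only (illustrative, not from the original code): the callable
# check above lets last_update_date be either a plain date string on the task or
# a method that computes one at run time, e.g.
#   last_update_date = '2019-06-30'
# or
#   def last_update_date(self):
#     return newest_file_date()   # hypothetical helper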
def _run(self):
  json_dir = self.input()['data'].path

  mapper = LoadJSONMapper(
    config.es_host(),
    index_name=self.index_name,
    type_name=self.type_name,
    docid_key=self.docid_key,
    incremental=self.use_checksum)

  parallel.mapreduce(
    parallel.Collection.from_sharded(json_dir),
    mapper=mapper,
    reducer=parallel.IdentityReducer(),
    output_format=parallel.NullOutput(),
    map_workers=self.load_json_workers,
    num_shards=1,
    output_prefix=config.tmp_dir('%s/load-json' % self.index_name))

  # Update the metadata index.
  elasticsearch_requests.update_process_datetime(
    config.es_client(), self.index_name,
    arrow.utcnow().format('YYYY-MM-DD'))

  # Optimize the index, if requested.
  if self.optimize_index:
    optimize_index(self.index_name, wait_for_merge=False)
def optimize_index(index_name, wait_for_merge=1):
  # The elasticsearch client throws a timeout error when waiting for an optimize
  # command to finish, so we use requests instead.
  logging.info('Optimizing: %s (this may take a while)', index_name)
  resp = requests.post(
    'http://%s/%s/_optimize?max_num_segments=1&wait_for_merge=%d' %
    (config.es_host(), index_name, wait_for_merge),
    timeout=10000)
  resp.raise_for_status()
def _run(self):
  json_dir = self.input()['data'].path

  mapper = LoadJSONMapper(
    config.es_host(),
    index_name=self.index_name,
    type_name=self.type_name,
    docid_key=self.docid_key,
    incremental=self.use_checksum)

  parallel.mapreduce(
    parallel.Collection.from_sharded(json_dir),
    mapper=mapper,
    reducer=parallel.IdentityReducer(),
    output_format=parallel.NullOutput(),
    map_workers=self.load_json_workers,
    num_shards=1,
    output_prefix=config.tmp_dir('%s/load-json' % self.index_name))

  # Update the metadata index.
  elasticsearch_requests.update_process_datetime(
    config.es_client(), self.index_name,
    arrow.utcnow().format('YYYY-MM-DD'))

  # Optimize the index, if requested.
  if self.optimize_index:
    optimize_index(self.index_name, wait_for_merge=False)

  # Update the metadata index again; trying to solve the mystery of missing
  # "last_update_date" entries...
  elasticsearch_requests.update_process_datetime(
    config.es_client(), self.index_name,
    arrow.utcnow().format('YYYY-MM-DD'))
def _run(self):
  json_dir = self.input()['data'].path

  input_glob = glob.glob(json_dir + '/*.json')
  for file_name in input_glob:
    logging.info('Running file %s', file_name)
    parallel.mapreduce(
      parallel.Collection.from_glob(file_name, parallel.JSONLineInput()),
      mapper=index_util.ReloadJSONMapper(config.es_host(), self.index_name, 'maude'),
      reducer=parallel.IdentityReducer(),
      output_format=parallel.NullOutput(),
      output_prefix='/tmp/loadjson.' + self.index_name)
def run(self):
  schema_file = self.get_schemafile()
  assert os.path.exists(
    schema_file
  ), 'No schema file available for index %s' % self.index_name

  es_client = elasticsearch.Elasticsearch(config.es_host())
  endpoints = self.get_endpoints()

  # Get all of the endpoints served by this index.
  # Create an `EndpointExport` object for each endpoint in order to export
  # each endpoint properly.
  #
  # Endpoint exports can be:
  #   date range based (quarterly output)
  #   filter based (index serves many endpoints)
  #   vanilla (endpoint is 1 to 1 with index and it is exported all at once)
  endpoint_batches = []
  for endpoint in endpoints:
    chunks = CUSTOM_CHUNKS.get(endpoint, DEFAULT_CHUNKS)
    if endpoint in RANGE_ENDPOINT_MAP:
      params = RANGE_ENDPOINT_MAP[endpoint]
      params['chunks'] = chunks
      endpoint_batches = _make_date_range_endpoint_batch(endpoint, params)
    elif endpoint in FILTERED_ENPOINT_MAP:
      params = FILTERED_ENPOINT_MAP[endpoint]
      query = EndpointExport.build_term_filter(**params)
      endpoint_batches.append(
        EndpointExport(endpoint, query=query, chunks=chunks))
    else:
      endpoint_batches.append(EndpointExport(endpoint, chunks=chunks))

  # Dump each of the `EndpointExport` objects in the list.
  for ep in endpoint_batches:
    # The output_dir will be the same for all outputs, once you factor out
    # the endpoint, so we can safely look at the first one only.
    output_dir = dirname(dirname(self.output()[0].path))
    endpoint_dir = join(output_dir, ep.endpoint[1:])
    index_util.dump_index(
      es_client,
      ep.index_name,
      ep.endpoint,
      join(endpoint_dir, ep.partition),
      cleaner=omit_internal_keys,
      query=ep.query,
      chunks=ep.chunks)
    common.shell_cmd('cp %s %s', schema_file, endpoint_dir)
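# For illustration only (the endpoint and partition names are hypothetical): with
# endpoint '/device/event' and partition '2016q1', the dump above lands in
#   <output_dir>/device/event/2016q1/
# and a copy of the endpoint's JSON schema file sits next to it in
#   <output_dir>/device/event/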
def map(self, key, value, output):
  es_client = elasticsearch.Elasticsearch(config.es_host(), timeout=120)
  ep = common.ObjectDict(value)

  schema_file = join(SCHEMA_DIR, ep.index_name + '_schema.json')
  endpoint_dir = join(self.output_dir, ep.endpoint[1:])
  target_dir = join(endpoint_dir, ep.partition)
  common.shell_cmd('mkdir -p %s', target_dir)

  index_util.dump_index(
    es_client,
    ep.index_name,
    ep.endpoint,
    target_dir,
    cleaner=omit_internal_keys,
    query=ep.query,
    chunks=ep.chunks)

  # Copy the current JSON schema to the zip location so that it is included
  # in the sync to s3.
  common.shell_cmd('cp %s %s', schema_file, endpoint_dir)
def map(self, key, value, output):
  es_client = elasticsearch.Elasticsearch(config.es_host())
  ep = common.ObjectDict(value)

  schema_file = join(SCHEMA_DIR, ep.index_name + '_schema.json')
  endpoint_dir = join(self.output_dir, ep.endpoint[1:])
  target_dir = join(endpoint_dir, ep.partition)
  common.shell_cmd('mkdir -p %s', target_dir)

  index_util.dump_index(
    es_client,
    ep.index_name,
    ep.endpoint,
    target_dir,
    cleaner=omit_internal_keys,
    query=ep.query,
    chunks=ep.chunks)

  # Copy the current JSON schema to the zip location so that it is included
  # in the sync to s3.
  common.shell_cmd('cp %s %s', schema_file, endpoint_dir)
def map(self, key, value, output):
  es_client = elasticsearch.Elasticsearch(config.es_host(), timeout=120)
  ep = common.ObjectDict(value)

  schema_file = join(SCHEMA_DIR, ep.index_name + '_schema.json')
  endpoint_dir = join(self.output_dir, ep.endpoint[1:])
  target_dir = join(endpoint_dir, ep.partition)
  common.shell_cmd('mkdir -p %s', target_dir)

  index_util.dump_index(
    es_client,
    ep.index_name,
    ep.endpoint,
    target_dir,
    cleaner=omit_internal_keys,
    query=ep.query,
    chunks=ep.chunks)

  # Copy the current JSON schema to the zip location so that it is included in
  # the sync to s3. flock is required to avoid a race condition when copying
  # the schema file.
  common.shell_cmd_quiet('flock --verbose %s cp %s %s',
                         schema_file, schema_file, endpoint_dir)
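# For context (a sketch, not from the original code): `flock FILE COMMAND` takes
# an exclusive lock on FILE before running COMMAND, so when several map workers
# write into the same endpoint_dir, only one `cp` of the schema file runs at a
# time, e.g.
#   flock --verbose foo_schema.json cp foo_schema.json <endpoint_dir>/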
def _run(self):
  json_dir = self.input()['data'].path

  mapper = LoadJSONMapper(
    config.es_host(),
    index_name=self.index_name,
    type_name=self.type_name,
    docid_key=self.docid_key,
    incremental=self.use_checksum)

  parallel.mapreduce(
    parallel.Collection.from_sharded(json_dir),
    mapper=mapper,
    reducer=parallel.IdentityReducer(),
    output_format=parallel.NullOutput(),
    map_workers=self.load_json_workers,
    output_prefix=config.tmp_dir('%s/load-json' % self.index_name))

  # Update the metadata index.
  elasticsearch_requests.update_process_datetime(
    config.es_client(), self.index_name,
    arrow.utcnow().format('YYYY-MM-DD'))
def _run(self):
  json_data = json.load(open(self.input()['data'].path))
  date_str = basename(dirname(dirname(self.input()['data'].path)))

  index_dict = {
    'index': self.index_name,
    'doc_type': self.type_name,
    'id': date_str,
    'body': json_data
  }

  es_client = elasticsearch.Elasticsearch(config.es_host())

  # We put the same document into the index twice:
  # once keyed by its date and once keyed as `current`.
  # The API will serve the `current` key as its default response.
  # How we serve archive records is TBD, but the data is preserved this way.
  es_client.index(**index_dict)
  index_dict['id'] = 'current'
  es_client.index(**index_dict)

  elasticsearch_requests.update_process_datetime(
    config.es_client(), self.index_name,
    arrow.utcnow().format('YYYY-MM-DD'))
def refresh_index(index_name):
  logging.info('Refreshing: %s', index_name)
  resp = requests.post('http://%s/%s/_refresh' % (config.es_host(), index_name),
                       timeout=100000)
  resp.raise_for_status()
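# Typical ordering, as a sketch (matching the _run() variants above): after a bulk
# load, refresh so the new documents become visible to search right away, then
# optionally force-merge to compact segments:
#   refresh_index(index_name)
#   optimize_index(index_name, wait_for_merge=False)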