Example 1
    def _run(self):
        input_path = self.input()['data'].path
        with open(input_path) as json_file:
            json_data = json.load(json_file)
        date_str = basename(dirname(dirname(input_path)))
        index_dict = {
            'index': self.index_name,
            'doc_type': self.type_name,
            'id': date_str,
            'body': json_data
        }

        es_client = elasticsearch.Elasticsearch(config.es_host(), timeout=60)

        # We put the same document into the index twice:
        #   once keyed by its date string,
        #   once keyed as `current`.
        # The API serves the `current` key as its default response.
        # How we serve archive records is TBD, but the data is preserved this way.
        es_client.index(**index_dict)
        index_dict['id'] = 'current'
        es_client.index(**index_dict)

        elasticsearch_requests.update_process_datetime(
            config.es_client(), self.index_name,
            arrow.utcnow().format('YYYY-MM-DD'),
            arrow.utcnow().format('YYYY-MM-DD'))
Example 2
def optimize_index(index_name, wait_for_merge=1):
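  # NOTE: `wait_for_merge` is not forwarded to the `_forcemerge` request below;
  # it appears to be kept only for compatibility with existing call sites.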
  # The elasticsearch client times out while waiting for a force-merge
  # (optimize) request to finish, so we issue the request via `requests` instead.
  logging.info('Optimizing: %s (this may take a while)', index_name)
  resp = requests.post('http://%s/%s/_forcemerge?max_num_segments=1' %
    (config.es_host(), index_name), timeout=100000)
  resp.raise_for_status()
Example 3
 def _schema(self):
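   # Reset the target index with the configured mapping; only delete the
   # existing index when checksum-based incremental loading is disabled.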
   return ResetElasticSearch(
       target_host=config.es_host(),
       target_index_name=self.index_name,
       target_type_name=self.type_name,
       target_mapping_file=self.mapping_file,
       delete_index=not self.use_checksum)
Example 4
 def _schema(self):
   return ResetElasticSearch(
       target_host=config.es_host(),
       target_index_name=self.index_name,
       target_type_name=self.type_name,
       target_mapping_file=self.mapping_file,
       delete_index=not self.use_checksum)
Example 5
    def _run(self):
        json_dir = self.input()['data'].path

        mapper = LoadJSONMapper(config.es_host(),
                                index_name=self.index_name,
                                type_name=self.type_name,
                                docid_key=self.docid_key,
                                incremental=self.use_checksum)

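        # Load each shard of the JSON output into Elasticsearch in parallel.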
        parallel.mapreduce(parallel.Collection.from_sharded(json_dir),
                           mapper=mapper,
                           reducer=parallel.IdentityReducer(),
                           output_format=parallel.NullOutput(),
                           map_workers=self.load_json_workers,
                           num_shards=1,
                           output_prefix=config.tmp_dir('%s/load-json' %
                                                        self.index_name))

        # update metadata index
        elasticsearch_requests.update_process_datetime(
            config.es_client(),
            self.index_name,
            last_run_date=arrow.utcnow().format('YYYY-MM-DD'),
            last_update_date=self.last_update_date()
            if callable(self.last_update_date) else self.last_update_date)

        # Refresh the index to make the documents visible to searches.
        refresh_index(self.index_name)

        # optimize index, if requested
        if self.optimize_index:
            optimize_index(self.index_name, wait_for_merge=False)
Example 6
  def _run(self):
    json_dir = self.input()['data'].path

    mapper = LoadJSONMapper(
      config.es_host(),
      index_name=self.index_name,
      type_name=self.type_name,
      docid_key=self.docid_key,
      incremental=self.use_checksum
    )

    parallel.mapreduce(
      parallel.Collection.from_sharded(json_dir),
      mapper=mapper,
      reducer=parallel.IdentityReducer(),
      output_format=parallel.NullOutput(),
      map_workers=self.load_json_workers,
      num_shards=1,
      output_prefix=config.tmp_dir('%s/load-json' % self.index_name)
    )

    # update metadata index
    elasticsearch_requests.update_process_datetime(
      config.es_client(), self.index_name, arrow.utcnow().format('YYYY-MM-DD')
    )

    # optimize index, if requested
    if self.optimize_index:
      optimize_index(self.index_name, wait_for_merge=False)
Example 7
def optimize_index(index_name, wait_for_merge=1):
  # elasticsearch client throws a timeout error when waiting for an optimize
  # command to finish, so we use requests instead
  logging.info('Optimizing: %s (this may take a while)', index_name)
  resp = requests.post('http://%s/%s/_optimize?max_num_segments=1&wait_for_merge=%d' %
    (config.es_host(), index_name, wait_for_merge), timeout=10000)
  resp.raise_for_status()
Example 8
    def _run(self):
        json_dir = self.input()['data'].path

        mapper = LoadJSONMapper(config.es_host(),
                                index_name=self.index_name,
                                type_name=self.type_name,
                                docid_key=self.docid_key,
                                incremental=self.use_checksum)

        parallel.mapreduce(parallel.Collection.from_sharded(json_dir),
                           mapper=mapper,
                           reducer=parallel.IdentityReducer(),
                           output_format=parallel.NullOutput(),
                           map_workers=self.load_json_workers,
                           num_shards=1,
                           output_prefix=config.tmp_dir('%s/load-json' %
                                                        self.index_name))

        # update metadata index
        elasticsearch_requests.update_process_datetime(
            config.es_client(), self.index_name,
            arrow.utcnow().format('YYYY-MM-DD'))

        # optimize index, if requested
        if self.optimize_index:
            optimize_index(self.index_name, wait_for_merge=False)

        # Update the metadata index again; trying to solve the mystery of
        # missing "last_update_date" entries...
        elasticsearch_requests.update_process_datetime(
            config.es_client(), self.index_name,
            arrow.utcnow().format('YYYY-MM-DD'))
Example 9
 def _run(self):
  json_dir = self.input()['data'].path
  input_glob = glob.glob(json_dir + '/*.json')
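  # Re-index every JSON file in the input directory, one file at a time.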
  for file_name in input_glob:
    logging.info('Running file %s', file_name)
    parallel.mapreduce(
      parallel.Collection.from_glob(file_name, parallel.JSONLineInput()),
      mapper=index_util.ReloadJSONMapper(config.es_host(), self.index_name, 'maude'),
      reducer=parallel.IdentityReducer(),
      output_format=parallel.NullOutput(),
      output_prefix='/tmp/loadjson.' + self.index_name)
Example 10
 def _run(self):
     json_dir = self.input()['data'].path
     input_glob = glob.glob(json_dir + '/*.json')
     for file_name in input_glob:
         logging.info('Running file %s', file_name)
         parallel.mapreduce(
             parallel.Collection.from_glob(file_name,
                                           parallel.JSONLineInput()),
             mapper=index_util.ReloadJSONMapper(config.es_host(),
                                                self.index_name, 'maude'),
             reducer=parallel.IdentityReducer(),
             output_format=parallel.NullOutput(),
             output_prefix='/tmp/loadjson.' + self.index_name)
Example 11
    def run(self):
        schema_file = self.get_schemafile()
        assert os.path.exists(
            schema_file
        ), 'No schema file available for index %s' % self.index_name

        es_client = elasticsearch.Elasticsearch(config.es_host())

        endpoints = self.get_endpoints()
        # Get all of the endpoints served by this index
        # Create an `EndpointExport` object for each endpoint in order to export
        # each endpoint properly.
        #
        # Endpoint exports can be:
        #   date range based (quarterly output)
        #   filter based (index serves many endpoints)
        #   vanilla (endpoint is 1 to 1 with index and it is exported all at once)
        endpoint_batches = []
        for endpoint in endpoints:
            chunks = CUSTOM_CHUNKS.get(endpoint, DEFAULT_CHUNKS)
            if endpoint in RANGE_ENDPOINT_MAP:
                params = RANGE_ENDPOINT_MAP[endpoint]
                params['chunks'] = chunks
                endpoint_batches = _make_date_range_endpoint_batch(
                    endpoint, params)
            elif endpoint in FILTERED_ENPOINT_MAP:
                params = FILTERED_ENPOINT_MAP[endpoint]
                query = EndpointExport.build_term_filter(**params)
                endpoint_batches.append(
                    EndpointExport(endpoint, query=query, chunks=chunks))
            else:
                endpoint_batches.append(EndpointExport(endpoint,
                                                       chunks=chunks))

        # Dump each of the `EndpointExport` objects in the list
        for ep in endpoint_batches:
            # The output_dir will be the same for all outputs, once you factor out
            # the endpoint, so we can safely look at the first one only.
            output_dir = dirname(dirname(self.output()[0].path))
            endpoint_dir = join(output_dir, ep.endpoint[1:])
            index_util.dump_index(es_client,
                                  ep.index_name,
                                  ep.endpoint,
                                  join(endpoint_dir, ep.partition),
                                  cleaner=omit_internal_keys,
                                  query=ep.query,
                                  chunks=ep.chunks)
            common.shell_cmd('cp %s %s', schema_file, endpoint_dir)
Example 12
 def map(self, key, value, output):
   es_client = elasticsearch.Elasticsearch(config.es_host(), timeout=120)
   ep = common.ObjectDict(value)
   schema_file = join(SCHEMA_DIR, ep.index_name + '_schema.json')
   endpoint_dir = join(self.output_dir, ep.endpoint[1:])
   target_dir = join(endpoint_dir, ep.partition)
   common.shell_cmd('mkdir -p %s', target_dir)
   index_util.dump_index(es_client,
                         ep.index_name,
                         ep.endpoint,
                         target_dir,
                         cleaner=omit_internal_keys,
                         query=ep.query,
                         chunks=ep.chunks)
   # Copy the current JSON schema to the zip location so that it is included
   # in the sync to s3
   common.shell_cmd('cp %s %s', schema_file, endpoint_dir)
Example 13
 def map(self, key, value, output):
     es_client = elasticsearch.Elasticsearch(config.es_host())
     ep = common.ObjectDict(value)
     schema_file = join(SCHEMA_DIR, ep.index_name + '_schema.json')
     endpoint_dir = join(self.output_dir, ep.endpoint[1:])
     target_dir = join(endpoint_dir, ep.partition)
     common.shell_cmd('mkdir -p %s', target_dir)
     index_util.dump_index(es_client,
                           ep.index_name,
                           ep.endpoint,
                           target_dir,
                           cleaner=omit_internal_keys,
                           query=ep.query,
                           chunks=ep.chunks)
     # Copy the current JSON schema to the zip location so that it is included
     # in the sync to s3
     common.shell_cmd('cp %s %s', schema_file, endpoint_dir)
Example 14
 def map(self, key, value, output):
     es_client = elasticsearch.Elasticsearch(config.es_host(), timeout=120)
     ep = common.ObjectDict(value)
     schema_file = join(SCHEMA_DIR, ep.index_name + '_schema.json')
     endpoint_dir = join(self.output_dir, ep.endpoint[1:])
     target_dir = join(endpoint_dir, ep.partition)
     common.shell_cmd('mkdir -p %s', target_dir)
     index_util.dump_index(es_client,
                           ep.index_name,
                           ep.endpoint,
                           target_dir,
                           cleaner=omit_internal_keys,
                           query=ep.query,
                           chunks=ep.chunks)
     # Copy the current JSON schema to the zip location so that it is included
     # in the sync to s3. flock is required to avoid a race condition when copying the schema file.
     common.shell_cmd_quiet('flock --verbose %s cp %s %s', schema_file,
                            schema_file, endpoint_dir)
Example 15
    def _run(self):
        json_dir = self.input()['data'].path

        mapper = LoadJSONMapper(config.es_host(),
                                index_name=self.index_name,
                                type_name=self.type_name,
                                docid_key=self.docid_key,
                                incremental=self.use_checksum)

        parallel.mapreduce(parallel.Collection.from_sharded(json_dir),
                           mapper=mapper,
                           reducer=parallel.IdentityReducer(),
                           output_format=parallel.NullOutput(),
                           map_workers=self.load_json_workers,
                           output_prefix=config.tmp_dir('%s/load-json' %
                                                        self.index_name))

        # update metadata index
        elasticsearch_requests.update_process_datetime(
            config.es_client(), self.index_name,
            arrow.utcnow().format('YYYY-MM-DD'))
Example 16
  def _run(self):
    input_path = self.input()['data'].path
    with open(input_path) as json_file:
      json_data = json.load(json_file)
    date_str = basename(dirname(dirname(input_path)))
    index_dict = {
      'index': self.index_name,
      'doc_type': self.type_name,
      'id': date_str,
      'body': json_data
    }

    es_client = elasticsearch.Elasticsearch(config.es_host())

    # We put the same document into the index twice:
    #   once keyed by its date string,
    #   once keyed as `current`.
    # The API serves the `current` key as its default response.
    # How we serve archive records is TBD, but the data is preserved this way.
    es_client.index(**index_dict)
    index_dict['id'] = 'current'
    es_client.index(**index_dict)

    elasticsearch_requests.update_process_datetime(
      config.es_client(), self.index_name, arrow.utcnow().format('YYYY-MM-DD')
    )
Example 17
def refresh_index(index_name):
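    # Refresh via the HTTP API so newly indexed documents become visible to search.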
    logging.info('Refreshing: %s', index_name)
    resp = requests.post('http://%s/%s/_refresh' %
                         (config.es_host(), index_name),
                         timeout=100000)
    resp.raise_for_status()