Example 1
    def _run(self):
        json_dir = self.input()['data'].path

        mapper = LoadJSONMapper(config.es_host(),
                                index_name=self.index_name,
                                type_name=self.type_name,
                                docid_key=self.docid_key,
                                incremental=self.use_checksum)

        parallel.mapreduce(parallel.Collection.from_sharded(json_dir),
                           mapper=mapper,
                           reducer=parallel.IdentityReducer(),
                           output_format=parallel.NullOutput(),
                           map_workers=self.load_json_workers,
                           num_shards=1,
                           output_prefix=config.tmp_dir('%s/load-json' %
                                                        self.index_name))

        # update metadata index
        elasticsearch_requests.update_process_datetime(
            config.es_client(), self.index_name,
            arrow.utcnow().format('YYYY-MM-DD'))

        # optimize index, if requested
        if self.optimize_index:
            optimize_index(self.index_name, wait_for_merge=False)

        # Update the metadata index again; trying to solve the mystery of
        # missing "last_update_date" entries...
        elasticsearch_requests.update_process_datetime(
            config.es_client(), self.index_name,
            arrow.utcnow().format('YYYY-MM-DD'))
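The `update_process_datetime` helper is external to all of these snippets. A minimal sketch of what it might do, assuming it records the run date in a small dedicated metadata index (the `openfdametadata` and `last_run_date` names below are assumptions, not taken from the source):

def update_process_datetime(es, index_name, timestamp):
    # Assumed behavior: store the run date in a metadata index, keyed by
    # the name of the index that was just loaded. The index and type
    # names here are hypothetical.
    es.index(index='openfdametadata',
             doc_type='last_run_date',
             id=index_name,
             body={'last_update_date': timestamp})

Examples 3 and 5 below pass a second date as well, so the helper's real signature evidently grew a separate `last_update_date` parameter at some point.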
Example 2
 def _run(self):
   logging.info('Optimizing %s', self.index_name)
   es = config.es_client()
   es.indices.optimize(
     index=self.index_name,
     max_num_segments=1
   )
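Note that `indices.optimize` only exists on older Elasticsearch versions: the endpoint was renamed to `forcemerge` in Elasticsearch 2.1 and the old name was later removed. Against a newer cluster and client, the equivalent body would be:

    es = config.es_client()
    # Same effect as optimize above: merge the index down to one segment.
    es.indices.forcemerge(index=self.index_name, max_num_segments=1)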
Example 3
    def _run(self):
        with open(self.input()['data'].path) as f:
            json_data = json.load(f)
        date_str = basename(dirname(dirname(self.input()['data'].path)))
        index_dict = {
            'index': self.index_name,
            'doc_type': self.type_name,
            'id': date_str,
            'body': json_data
        }

        es_client = elasticsearch.Elasticsearch(config.es_host(), timeout=60)

        # We put the same document into the index twice:
        # once keyed by its date, and once keyed `current`.
        # The API serves the `current` key as its default response.
        # How we serve archive records is TBD, but this preserves the data.
        es_client.index(**index_dict)
        index_dict['id'] = 'current'
        es_client.index(**index_dict)

        elasticsearch_requests.update_process_datetime(
            config.es_client(), self.index_name,
            arrow.utcnow().format('YYYY-MM-DD'),
            arrow.utcnow().format('YYYY-MM-DD'))
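The read side of this two-key pattern is not shown here. A minimal sketch of a consumer, assuming the same client setup as above (`index_name` and `type_name` stand in for the task's attributes):

import elasticsearch

es_client = elasticsearch.Elasticsearch(config.es_host(), timeout=60)

# No date needs to be known in advance: the fixed `current` id always
# points at the most recently indexed payload.
doc = es_client.get(index=index_name, doc_type=type_name, id='current')
json_data = doc['_source']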
Example 4
  def _run(self):
    json_dir = self.input()['data'].path

    mapper = LoadJSONMapper(
      config.es_host(),
      index_name=self.index_name,
      type_name=self.type_name,
      docid_key=self.docid_key,
      incremental=self.use_checksum
    )

    parallel.mapreduce(
      parallel.Collection.from_sharded(json_dir),
      mapper=mapper,
      reducer=parallel.IdentityReducer(),
      output_format=parallel.NullOutput(),
      map_workers=self.load_json_workers,
      num_shards=1,
      output_prefix=config.tmp_dir('%s/load-json' % self.index_name)
    )

    # update metadata index
    elasticsearch_requests.update_process_datetime(
      config.es_client(), self.index_name, arrow.utcnow().format('YYYY-MM-DD')
    )

    # optimize index, if requested
    if self.optimize_index:
      optimize_index(self.index_name, wait_for_merge=False)
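`optimize_index` is another helper the snippet does not define. Examples 2 and 8 suggest it reduces to a single-segment merge; a plausible sketch (forwarding `wait_for_merge` is an assumption based on the old optimize API's query parameters):

import logging

def optimize_index(index_name, wait_for_merge=False):
    # Inferred from Examples 2 and 8: merge the index down to a single
    # segment. Passing wait_for_merge through is an assumption; it was a
    # query parameter on the pre-2.1 optimize API.
    logging.info('Optimizing %s', index_name)
    es = config.es_client()
    es.indices.optimize(index=index_name,
                        max_num_segments=1,
                        wait_for_merge=wait_for_merge)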
Example 5
    def _run(self):
        json_dir = self.input()['data'].path

        mapper = LoadJSONMapper(config.es_host(),
                                index_name=self.index_name,
                                type_name=self.type_name,
                                docid_key=self.docid_key,
                                incremental=self.use_checksum)

        parallel.mapreduce(parallel.Collection.from_sharded(json_dir),
                           mapper=mapper,
                           reducer=parallel.IdentityReducer(),
                           output_format=parallel.NullOutput(),
                           map_workers=self.load_json_workers,
                           num_shards=1,
                           output_prefix=config.tmp_dir('%s/load-json' %
                                                        self.index_name))

        # update metadata index
        elasticsearch_requests.update_process_datetime(
            config.es_client(),
            self.index_name,
            last_run_date=arrow.utcnow().format('YYYY-MM-DD'),
            last_update_date=(self.last_update_date()
                              if callable(self.last_update_date)
                              else self.last_update_date))

        # Refresh the index to make the documents visible to searches.
        refresh_index(self.index_name)

        # optimize index, if requested
        if self.optimize_index:
            optimize_index(self.index_name, wait_for_merge=False)
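`refresh_index` is likewise external. Elasticsearch's refresh API is what makes freshly indexed documents visible to search, so the wrapper is most likely a one-liner (an assumption, not source code):

def refresh_index(index_name):
    # Assumed wrapper: force a refresh so the documents written by the
    # mapreduce above become searchable immediately.
    config.es_client().indices.refresh(index=index_name)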
Example 6
    def _run(self):
        json_dir = self.input()['data'].path

        mapper = LoadJSONMapper(config.es_host(),
                                index_name=self.index_name,
                                type_name=self.type_name,
                                docid_key=self.docid_key,
                                incremental=self.use_checksum)

        parallel.mapreduce(parallel.Collection.from_sharded(json_dir),
                           mapper=mapper,
                           reducer=parallel.IdentityReducer(),
                           output_format=parallel.NullOutput(),
                           map_workers=self.load_json_workers,
                           output_prefix=config.tmp_dir('%s/load-json' %
                                                        self.index_name))

        # update metadata index
        elasticsearch_requests.update_process_datetime(
            config.es_client(), self.index_name,
            arrow.utcnow().format('YYYY-MM-DD'))
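Throughout these examples, the `_run` convention and the `self.input()['data'].path` access pattern match Luigi's task API. A hypothetical skeleton of the surrounding class, inferred from the snippets (every name and default below is an assumption):

import luigi

class JSONShards(luigi.ExternalTask):
    # Hypothetical placeholder for the upstream task that produces the
    # sharded JSON directory the examples read.
    def output(self):
        return luigi.LocalTarget('/tmp/data/json')

class LoadJSON(luigi.Task):
    index_name = luigi.Parameter()
    type_name = luigi.Parameter()
    docid_key = luigi.Parameter(default='@id')
    use_checksum = luigi.BoolParameter(default=False)
    load_json_workers = luigi.IntParameter(default=4)

    def requires(self):
        return {'data': JSONShards()}   # provides self.input()['data']

    def run(self):
        self._run()                     # one of the bodies shown above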
Example 7
  def _run(self):
    with open(self.input()['data'].path) as f:
      json_data = json.load(f)
    date_str = basename(dirname(dirname(self.input()['data'].path)))
    index_dict = {
      'index': self.index_name,
      'doc_type': self.type_name,
      'id': date_str,
      'body': json_data
    }

    es_client = elasticsearch.Elasticsearch(config.es_host())

    # We put the same document into the index twice:
    # once keyed by its date, and once keyed `current`.
    # The API serves the `current` key as its default response.
    # How we serve archive records is TBD, but this preserves the data.
    es_client.index(**index_dict)
    index_dict['id'] = 'current'
    es_client.index(**index_dict)

    elasticsearch_requests.update_process_datetime(
      config.es_client(), self.index_name, arrow.utcnow().format('YYYY-MM-DD')
    )
Example 8
 def _run(self):
     logging.info('Optimizing %s', self.index_name)
     es = config.es_client()
     es.indices.optimize(index=self.index_name, max_num_segments=1)