Example #1
0
    def _run(self):
        """Push the sharded JSON input through LoadJSONMapper, then stamp the index.

        The shards found at the 'data' input path are fed to a mapreduce job
        that uses an identity reducer and a null output. Afterwards the
        current UTC date is recorded for the index, the index is optionally
        optimized, and the date is recorded a second time.
        """
        shard_dir = self.input()['data'].path

        mapper_options = dict(index_name=self.index_name,
                              type_name=self.type_name,
                              docid_key=self.docid_key,
                              incremental=self.use_checksum)
        loader = LoadJSONMapper(config.es_host(), **mapper_options)

        shards = parallel.Collection.from_sharded(shard_dir)
        parallel.mapreduce(
            shards,
            mapper=loader,
            reducer=parallel.IdentityReducer(),
            output_format=parallel.NullOutput(),
            map_workers=self.load_json_workers,
            num_shards=1,
            output_prefix=config.tmp_dir('%s/load-json' % self.index_name))

        def stamp_process_date():
            # Each call fetches a fresh client and a fresh UTC date.
            elasticsearch_requests.update_process_datetime(
                config.es_client(),
                self.index_name,
                arrow.utcnow().format('YYYY-MM-DD'))

        # Record the processing date in the metadata index.
        stamp_process_date()

        # Optimization is opt-in; the call does not wait for the merge.
        if self.optimize_index:
            optimize_index(self.index_name, wait_for_merge=False)

        # Stamp a second time on purpose: this is an attempt to track down
        # the mystery of missing "last_update_date" entries.
        stamp_process_date()
Example #2
0
  def _run(self):
    """Feed the sharded JSON input to LoadJSONMapper and stamp the index.

    Runs a mapreduce job (identity reducer, null output) over the shards at
    the 'data' input path, records the current UTC date for the index, and
    optionally starts an index optimization.
    """
    shard_dir = self.input()['data'].path

    loader = LoadJSONMapper(config.es_host(),
                            index_name=self.index_name,
                            type_name=self.type_name,
                            docid_key=self.docid_key,
                            incremental=self.use_checksum)

    shards = parallel.Collection.from_sharded(shard_dir)
    parallel.mapreduce(shards,
                       mapper=loader,
                       reducer=parallel.IdentityReducer(),
                       output_format=parallel.NullOutput(),
                       map_workers=self.load_json_workers,
                       num_shards=1,
                       output_prefix=config.tmp_dir(
                         '%s/load-json' % self.index_name))

    # Record the processing date in the metadata index.
    es_client = config.es_client()
    today = arrow.utcnow().format('YYYY-MM-DD')
    elasticsearch_requests.update_process_datetime(
      es_client, self.index_name, today)

    # Optimization is opt-in; the call does not wait for the merge.
    if self.optimize_index: optimize_index(self.index_name, wait_for_merge=False)
Example #3
0
    def _run(self):
        """Feed the sharded JSON input to LoadJSONMapper, then update and refresh the index.

        After the mapreduce job (identity reducer, null output) finishes, the
        metadata entry for the index receives the current UTC date as
        ``last_run_date`` and the task's ``last_update_date`` (called first
        if it is callable). The index is then refreshed and, when requested,
        optimized.
        """
        shard_dir = self.input()['data'].path

        loader = LoadJSONMapper(
            config.es_host(),
            index_name=self.index_name,
            type_name=self.type_name,
            docid_key=self.docid_key,
            incremental=self.use_checksum)

        shards = parallel.Collection.from_sharded(shard_dir)
        parallel.mapreduce(
            shards,
            mapper=loader,
            reducer=parallel.IdentityReducer(),
            output_format=parallel.NullOutput(),
            map_workers=self.load_json_workers,
            num_shards=1,
            output_prefix=config.tmp_dir('%s/load-json' % self.index_name))

        # Gather the metadata values in the order the update call needs them.
        es_client = config.es_client()
        run_date = arrow.utcnow().format('YYYY-MM-DD')
        # last_update_date may be either a plain value or something callable.
        if callable(self.last_update_date):
            updated_on = self.last_update_date()
        else:
            updated_on = self.last_update_date

        elasticsearch_requests.update_process_datetime(
            es_client,
            self.index_name,
            last_run_date=run_date,
            last_update_date=updated_on)

        # Refresh so the freshly loaded documents become visible to searches.
        refresh_index(self.index_name)

        # Optimization is opt-in; the call does not wait for the merge.
        if self.optimize_index:
            optimize_index(self.index_name, wait_for_merge=False)
Example #4
0
    def _run(self):
        """Feed the sharded JSON input to LoadJSONMapper and stamp the index.

        Runs a mapreduce job (identity reducer, null output) over the shards
        at the 'data' input path, then records the current UTC date for the
        index in the metadata index.
        """
        shard_dir = self.input()['data'].path

        loader = LoadJSONMapper(
            config.es_host(),
            index_name=self.index_name,
            type_name=self.type_name,
            docid_key=self.docid_key,
            incremental=self.use_checksum)

        shards = parallel.Collection.from_sharded(shard_dir)
        job_settings = dict(
            mapper=loader,
            reducer=parallel.IdentityReducer(),
            output_format=parallel.NullOutput(),
            map_workers=self.load_json_workers,
            output_prefix=config.tmp_dir('%s/load-json' % self.index_name))
        parallel.mapreduce(shards, **job_settings)

        # Record the processing date in the metadata index.
        es_client = config.es_client()
        today = arrow.utcnow().format('YYYY-MM-DD')
        elasticsearch_requests.update_process_datetime(
            es_client, self.index_name, today)
Example #5
0
import urlparse

import arrow
import simplejson as json

from openfda.tasks import DependencyTriggeredTask
from openfda import common, config, parallel, spl
from openfda.annotation_table import unii_harmonization
from openfda.spl import process_barcodes, extract

# Two directory levels up from this file's absolute path.
RUN_DIR = dirname(dirname(os.path.abspath(__file__)))

# Harmonization data is kept per batch; the batch is named after the end of
# the current UTC week (arrow's ceil('week')), formatted as YYYYMMDD.
data_dir = config.data_dir('harmonization')
BATCH_DATE = arrow.utcnow().ceil('week').format('YYYYMMDD')
BASE_DIR = config.data_dir('harmonization/batches/%s' % BATCH_DATE)
SPL_S3_DIR = config.data_dir('spl/s3_sync')
TMP_DIR = config.tmp_dir()

# Create the working directories at import time, in the same order as before.
for _required_dir in (data_dir, BASE_DIR, TMP_DIR):
  common.shell_cmd('mkdir -p %s', _required_dir)

SPL_SET_ID_INDEX = join(BASE_DIR, 'spl_index.db')
DAILYMED_PREFIX = 'ftp://public.nlm.nih.gov/nlmdata/.dailymed/'

# Download locations built on the DailyMed FTP prefix.
PHARM_CLASS_DOWNLOAD = (
  DAILYMED_PREFIX + 'pharmacologic_class_indexing_spl_files.zip')

RXNORM_DOWNLOAD = DAILYMED_PREFIX + 'rxnorm_mappings.zip'

NDC_DOWNLOAD_PAGE = \
Example #6
0
import urlparse

import arrow
import simplejson as json

from openfda.tasks import DependencyTriggeredTask
from openfda import common, config, parallel, spl
from openfda.annotation_table import unii_harmonization
from openfda.spl import process_barcodes, extract

# Two directory levels up from this file's absolute path.
RUN_DIR = dirname(dirname(os.path.abspath(__file__)))

# Harmonization data is kept per batch; the batch is named after the end of
# the current UTC week (arrow's ceil('week')), formatted as YYYYMMDD.
data_dir = config.data_dir('harmonization')
BATCH_DATE = arrow.utcnow().ceil('week').format('YYYYMMDD')
BASE_DIR = config.data_dir('harmonization/batches/%s' % BATCH_DATE)
SPL_S3_DIR = config.data_dir('spl/s3_sync')
TMP_DIR = config.tmp_dir()

# Create the working directories at import time, in the same order as before.
for _required_dir in (data_dir, BASE_DIR, TMP_DIR):
  common.shell_cmd('mkdir -p %s', _required_dir)

SPL_SET_ID_INDEX = join(BASE_DIR, 'spl_index.db')
DAILYMED_PREFIX = 'ftp://public.nlm.nih.gov/nlmdata/.dailymed/'

# Download locations built on the DailyMed FTP prefix.
PHARM_CLASS_DOWNLOAD = (
  DAILYMED_PREFIX + 'pharmacologic_class_indexing_spl_files.zip')

RXNORM_DOWNLOAD = DAILYMED_PREFIX + 'rxnorm_mappings.zip'

NDC_DOWNLOAD_PAGE = \