def run(self):
   ndc_file = self.input()[0].path
   pharma_class_dir = self.input()[1].path
   unii_file = self.input()[2].path
   output_file = self.output().path
   common.shell_cmd_quiet('mkdir -p %s' % dirname(self.output().path))
   unii_harmonization.harmonize_unii(output_file, ndc_file, unii_file, pharma_class_dir)
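Every example on this page funnels through common.shell_cmd_quiet, whose implementation is not shown here. Judging from the call sites (printf-style arguments, bytes returned, no echoed output), a minimal sketch might look like the following; the signature and logging details are assumptions:

import logging
import subprocess


def shell_cmd_quiet(fmt, *args):
  # Hypothetical reconstruction: format the command printf-style, run it
  # through a shell without echoing it, and return captured stdout as bytes.
  cmd = fmt % args if args else fmt
  logging.debug('shell_cmd_quiet: %s', cmd)
  return subprocess.check_output(cmd, shell=True)

Call sites that pre-format with % (e.g. 'mkdir -p %s' % path) and call sites that pass the arguments separately ('mkdir -p %s', path) both work with this shape.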
Example #2
 def run(self):
     output_dir = self.output().path
     common.shell_cmd_quiet('mkdir -p %s', output_dir)
     input_dir = self.local_dir
     for zip_filename in glob.glob(input_dir + '/*.zip'):
         common.shell_cmd_quiet('unzip -ouq "%s" -d %s', zip_filename,
                                output_dir)
Example #3
 def run(self):
   zip_filename = self.input().path
   output_filename = self.output().path
   common.shell_cmd_quiet('mkdir -p %s' % dirname(self.output().path))
   cmd = 'unzip -p %(zip_filename)s rxnorm_mappings.txt > %(output_filename)s' % locals()
   common.shell_cmd_quiet(cmd)
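When only a single archive member is needed, the shell round-trip can be avoided entirely. A pure-Python equivalent using the standard zipfile module (a sketch, not what the task actually does):

import zipfile

# Stream the single member straight into the output file.
with zipfile.ZipFile(zip_filename) as zf, open(output_filename, 'wb') as out:
  out.write(zf.read('rxnorm_mappings.txt'))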
Example #4
    def _run(self):
        sync_path = FILES_DIR
        target_bucket = 's3://%s/%s/' % (self.download_bucket, self.date_str)
        s3_cmd = [
            'aws', '--profile',
            config.aws_profile(), 's3', 'sync', sync_path, target_bucket,
            '--exclude "*"', '--include "*.zip"', '--include "*schema.json"'
        ]

        common.shell_cmd_quiet(' '.join(s3_cmd))
Example #5
 def map(self, zip_file, value, output):
   cmd = 'zipinfo -1 %(zip_file)s' % locals()
   xml_file_name = None
   zip_contents = common.shell_cmd_quiet(cmd).decode()
   xml_match = re.search(r'^([0-9a-f-]{36})\.xml$', zip_contents, re.I | re.M)
   if xml_match:
     xml_file_name = xml_match.group()
     spl_dir_name = os.path.join(self.output().path, xml_match.group(1))
     os.system('mkdir -p "%s"' % spl_dir_name)
     common.shell_cmd_quiet('unzip -oq %(zip_file)s -d %(spl_dir_name)s' % locals())
     output.add(xml_file_name, zip_file)
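The 36-character pattern corresponds to a hyphenated UUID, which is how SPL archives name their inner XML file. A quick sanity check against a made-up zipinfo listing:

import re

# Made-up listing: a UUID-named XML plus an unrelated image entry.
listing = 'd4b6bafc-96b2-4f14-9b02-5c0c20a1d2e3.xml\nimages/photo.jpg\n'
m = re.search(r'^([0-9a-f-]{36})\.xml$', listing, re.I | re.M)
assert m.group(1) == 'd4b6bafc-96b2-4f14-9b02-5c0c20a1d2e3'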
Example #6
    def run(self):
        sync_path = join(BASE_DIR, self.date_str)
        target_bucket = 's3://%s/%s/' % (self.download_bucket, self.date_str)
        s3_cmd = [
            'aws', '--profile',
            config.aws_profile(), 's3', 'sync', sync_path, target_bucket,
            '--exclude "*"', '--include "*.zip"', '--include "*schema.json"'
        ]

        common.shell_cmd_quiet(' '.join(s3_cmd))
        common.shell_cmd_quiet('touch %s', self.output().path)
Example #7
  def run(self):
    logging.info('Extracting: %s', self.input().path)

    extract_dir = dirname(self.input().path)
    gsrs_file_name = os.path.basename(self.input().path)
    gz_filename = os.path.splitext(gsrs_file_name)[0] + ".gz"
    gsrs_file = join(extract_dir, gsrs_file_name)

    gz_file = join(extract_dir, gz_filename)
    os.rename(gsrs_file, gz_file)
    common.shell_cmd_quiet('gunzip ' + gz_file)
    os.rename(os.path.splitext(gz_file)[0], os.path.splitext(gz_file)[0] + ".json")
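The double rename exists only because gunzip derives its output name from its input name. A pure-Python alternative to the whole sequence (a sketch, starting from the original download) needs no renames:

import gzip
import os
import shutil

# Decompress the gzipped download straight to its .json target.
with gzip.open(gsrs_file, 'rb') as f_in, \
     open(os.path.splitext(gsrs_file)[0] + '.json', 'wb') as f_out:
  shutil.copyfileobj(f_in, f_out)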
Example #8
    def run(self):
        src_dir = self.input().path
        os.system('mkdir -p "%s"' % self.output().path)
        pattern = join(src_dir, '*.zip')
        zip_files = glob.glob(pattern)

        if len(zip_files) == 0:
            logging.warning('Expected to find one or more daily med SPL files')

        extract_dir = self.output().path
        for zip_file in zip_files:
            common.shell_cmd_quiet(
                'unzip -oq -d %(extract_dir)s %(zip_file)s' % locals())
Example #9
 def run(self):
     common.shell_cmd_quiet('mkdir -p %s', self.local_dir)
     soup = BeautifulSoup(
         urllib.request.urlopen(DAILY_MED_DOWNLOADS_PAGE).read(), 'lxml')
     for a in soup.find_all(href=re.compile(r'.*\.zip')):
         if '_human_' in a.text:
             try:
                 common.download(
                     a['href'],
                     join(self.local_dir, a['href'].split('/')[-1]))
             except ProcessException as e:
                 logging.error(
                     "Could not download a DailyMed SPL archive: {0}: {1}".
                     format(a['href'], e))
Example #10
 def map(self, key, value, output):
     es_client = elasticsearch.Elasticsearch(config.es_host(), timeout=120)
     ep = common.ObjectDict(value)
     schema_file = join(SCHEMA_DIR, ep.index_name + '_schema.json')
     endpoint_dir = join(self.output_dir, ep.endpoint[1:])
     target_dir = join(endpoint_dir, ep.partition)
     common.shell_cmd('mkdir -p %s', target_dir)
     index_util.dump_index(es_client,
                           ep.index_name,
                           ep.endpoint,
                           target_dir,
                           cleaner=omit_internal_keys,
                           query=ep.query,
                           chunks=ep.chunks)
     # Copy the current JSON schema to the zip location so that it is included
     # in the sync to s3
     common.shell_cmd_quiet('cp %s %s', schema_file, endpoint_dir)
Example #11
  def run(self):
    cmd = 'iconv -f %s -t %s -c %s > %s' % \
      ('ISO-8859-1//TRANSLIT', 'UTF-8', self.input().path, self.output().path)
    common.shell_cmd_quiet(cmd)

    # CSV files exported by FDA iRes are often malformed because they can contain
    # multiple columns with the same name: "More Code Info". iRes most likely does
    # this when the code information is too large to fit into a single column; in
    # any case, the columns should have been named distinctly, e.g. "More Code Info 01",
    # "More Code Info 02", etc. We handle this case here with Pandas by merging
    # these columns back into a single "More Code Info" column.
    df = pd.read_csv(self.output().path, index_col=False, encoding='utf-8',
                     dtype=str)
    code_info_columns = list(filter(lambda col: col.startswith('More Code Info'), list(df.columns)))
    if len(code_info_columns) > 1:
      df['Code Info All'] = df[code_info_columns].apply(
        lambda row: ' '.join(list(filter(lambda v: not pd.isna(v), list(row.values)))).strip(), axis=1)
      df.drop(code_info_columns, axis=1, inplace=True)
      df.rename(columns={"Code Info All": "More Code Info"}, inplace=True)
      df.to_csv(self.output().path, encoding='utf-8', index=False, quoting=csv.QUOTE_ALL)
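For reference, pandas mangles duplicate CSV headers on read_csv ('More Code Info', 'More Code Info.1', ...), which is why the startswith() filter above catches all of them. A toy run of the same merge logic on made-up data:

import pandas as pd

# Duplicate headers as pandas would mangle them after read_csv.
df = pd.DataFrame([['lot 1', 'exp 2024', None]],
                  columns=['More Code Info', 'More Code Info.1', 'More Code Info.2'])
cols = [c for c in df.columns if c.startswith('More Code Info')]
merged = df[cols].apply(
    lambda row: ' '.join(v for v in row if not pd.isna(v)).strip(), axis=1)
assert merged[0] == 'lot 1 exp 2024'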
Example #12
 def map(self, key, value, output):
     es_client = elasticsearch.Elasticsearch(config.es_host(), timeout=120)
     ep = common.ObjectDict(value)
     schema_file = join(SCHEMA_DIR, ep.index_name + '_schema.json')
     endpoint_dir = join(self.output_dir, ep.endpoint[1:])
     target_dir = join(endpoint_dir, ep.partition)
     common.shell_cmd('mkdir -p %s', target_dir)
     if self.index_changed_since_last_export(es_client, ep.index_name,
                                             target_dir):
         index_util.dump_index(es_client,
                               ep.index_name,
                               ep.endpoint,
                               target_dir,
                               cleaner=omit_internal_keys,
                               query=ep.query,
                               chunks=ep.chunks)
     # Copy the current JSON schema to the zip location so that it is included
     # in the sync to s3. flock is required to avoid a race condition when copying the schema file.
     common.shell_cmd_quiet('flock --verbose %s cp %s %s', schema_file,
                            schema_file, endpoint_dir)
Example #13
    def map(self, xml_file, value, output):
        if os.path.getsize(xml_file) > 0:

            # Oddly enough, some SPL XML files arrive from FDA gzipped, which requires us to take an additional
            # uncompressing step.
            filetype = common.shell_cmd_quiet('file %(xml_file)s' % locals()).decode()
            if 'gzip compressed data' in filetype or 'DOS/MBR boot sector' in filetype:
                # logging.warning("SPL XML is gzipped: " + xml_file)
                gz_file = xml_file + '.gz'
                os.rename(xml_file, gz_file)
                with gzip.open(gz_file, 'rb') as f_in, open(xml_file,
                                                            'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)

            p = etree.XMLParser(huge_tree=True)
            try:
                tree = etree.parse(xml_file, parser=p)
                code = next(
                    iter(
                        tree.xpath(
                            "//ns:document/ns:code[@codeSystem='2.16.840.1.113883.6.1']/@displayName",
                            namespaces=self.NS)), '')
                if code.lower().find('human') != -1:
                    spl_id = tree.xpath('//ns:document/ns:id/@root',
                                        namespaces=self.NS)[0].lower()
                    spl_set_id = tree.xpath('//ns:document/ns:setId/@root',
                                            namespaces=self.NS)[0].lower()
                    version = tree.xpath(
                        '//ns:document/ns:versionNumber/@value',
                        namespaces=self.NS)[0]
                    output.add(spl_set_id, {
                        'spl_id': spl_id,
                        'version': version
                    })
                elif len(code) == 0:
                    logging.warning("Not a drug label SPL file: " + xml_file)
            except XMLSyntaxError as e:
                logging.warning("Invalid SPL file: " + xml_file)
                logging.warning(e)
            except:
                logging.error("Error processing SPL file: " + xml_file)
                traceback.print_exc()
                raise
        else:
            logging.warning("Zero length SPL file: " + xml_file)
Example #14
def ExtractXMLFromNestedZip(zip_filename, output_dir, exclude_images=True):
  for child_zip_filename in list_zip_files_in_zip(zip_filename):
    base_zip = basename(child_zip_filename)
    target_dir = base_zip.split('.')[0]
    cmd = 'unzip -j -d %(output_dir)s/%(target_dir)s %(zip_filename)s %(child_zip_filename)s' % locals()
    common.shell_cmd_quiet(cmd)

    cmd = 'unzip %(output_dir)s/%(target_dir)s/%(base_zip)s -d %(output_dir)s/%(target_dir)s' % locals()
    if exclude_images:
      cmd += ' -x *.jpg'

    common.shell_cmd_quiet(cmd)
    common.shell_cmd_quiet('rm %(output_dir)s/%(target_dir)s/%(base_zip)s' % locals())
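A hypothetical invocation, assuming list_zip_files_in_zip yields the member names of the inner archives; the file name is illustrative of DailyMed's weekly release naming:

# Explode a weekly DailyMed archive into per-SPL directories, skipping images.
ExtractXMLFromNestedZip('dm_spl_release_human_rx_part1.zip',
                        '/tmp/spl_extract', exclude_images=True)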
Example #15
 def map(self, _, value, output):
     value = value.strip()
     xml_file = join(self.spl_path, value, value + '.xml')
     if not os.path.exists(xml_file):
         logging.info('File does not exist, skipping %s', xml_file)
         return
     spl_js = SPL_JS
     loinc = LOINC
     cmd = 'node %(spl_js)s %(xml_file)s %(loinc)s' % locals()
     json_str = ''
     try:
         json_str = common.shell_cmd_quiet(cmd)
         json_obj = json.loads(json_str)
         if not json_obj.get('set_id'):
             logging.error('SPL file has no set_id: %s', xml_file)
         else:
             output.add(xml_file, json_obj)
     except:
         logging.error('Unable to convert SPL XML to JSON: %s', xml_file)
         logging.error('cmd: %s', cmd)
         logging.error('json: %s', json_str)
         logging.error(sys.exc_info()[0])
         raise
Example #16
 def map(self, _, value, output):
   value = value.strip()
   xml_file = join(self.spl_path, value, value + '.xml')
   if not os.path.exists(xml_file):
     logging.info('File does not exist, skipping %s', xml_file)
     return
   spl_js = SPL_JS
   loinc = LOINC
   cmd = 'node %(spl_js)s %(xml_file)s %(loinc)s' % locals()
   json_str = ''
   try:
     json_str = common.shell_cmd_quiet(cmd)
     json_obj = json.loads(json_str)
     if not json_obj.get('set_id'):
       logging.error('SPL file has no set_id: %s', xml_file)
     else:
       output.add(xml_file, json_obj)
   except:
     logging.error('Unable to convert SPL XML to JSON: %s', xml_file)
     logging.error('cmd: %s', cmd)
     logging.error('json: %s', json_str)
     logging.error(sys.exc_info()[0])
     raise
Example #17
  def run(self):
    for filename in glob.glob(SPL_S3_DIR + '/*/*.xml'):
      src_dir = dirname(filename)
      barcode_target = join(src_dir, 'barcodes')
      xml_out = join(barcode_target, 'otc-bars.xml')
      json_out = xml_out.replace('.xml', '.json')

      if not os.path.exists(xml_out):
        common.shell_cmd_quiet('mkdir -p %s', barcode_target)
        # logging.info('Zbarimg on directory %s', src_dir)
        cmd = ('find %(src_dir)s -name "*.jpg" -size +0 '
               '-exec zbarimg -q --xml {} \\; > %(xml_out)s') % locals()
        common.shell_cmd_quiet(cmd)

      if common.is_older(json_out, xml_out):
        # logging.info('%s does not exist, producing...', json_out)
        process_barcodes.XML2JSON(xml_out)

    common.shell_cmd_quiet('touch %s', self.output().path)
Example #18
    def run(self):
        cmd = 'iconv -f %s -t %s -c %s > %s' % \
          ('ISO-8859-1//TRANSLIT', 'UTF-8', self.input().path, self.output().path)

        common.shell_cmd_quiet(cmd)
Example #19
 def test_shell_cmd_quiet(self):
   tmpFile = '/tmp/'+(''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(32)))
   common.shell_cmd_quiet('touch %(tmpFile)s' % locals())
   assert common.shell_cmd_quiet('ls %(tmpFile)s' % locals()).startswith(tmpFile.encode())
Example #20
LOINC = join(RUN_DIR, 'spl/data/sections.csv')

SPL_S3_BUCKET = 's3://openfda-data-spl/data/'
SPL_STAGING_S3_BUCKET = 's3://openfda-data-spl-staging/'
SPL_S3_LOCAL_DIR = config.data_dir('spl/s3_sync')
SPL_INDEX_DIR = config.data_dir('spl/index.db')
SPL_JSON_DIR = config.data_dir('spl/json.db')
SPL_ANNOTATED_DIR = config.data_dir('spl/annotated.db')

DAILY_MED_DIR = config.data_dir('spl/dailymed')
DAILY_MED_DOWNLOADS_DIR = config.data_dir('spl/dailymed/raw')
DAILY_MED_EXTRACT_DIR = config.data_dir('spl/dailymed/extract')
DAILY_MED_FLATTEN_DIR = config.data_dir('spl/dailymed/flatten')
DAILY_MED_DOWNLOADS_PAGE = 'https://dailymed.nlm.nih.gov/dailymed/spl-resources-all-drug-labels.cfm'

common.shell_cmd_quiet('mkdir -p %s', SPL_S3_LOCAL_DIR)
common.shell_cmd_quiet('mkdir -p %s', DAILY_MED_DIR)


class DownloadDailyMedSPL(luigi.Task):
    local_dir = DAILY_MED_DOWNLOADS_DIR

    def output(self):
        return luigi.LocalTarget(self.local_dir)

    def run(self):
        common.shell_cmd_quiet('mkdir -p %s', self.local_dir)
        soup = BeautifulSoup(
            urllib.request.urlopen(DAILY_MED_DOWNLOADS_PAGE).read(), 'lxml')
        for a in soup.find_all(href=re.compile(r'.*\.zip')):
            if '_human_' in a.text:
                try:
                    common.download(
                        a['href'],
                        join(self.local_dir, a['href'].split('/')[-1]))
                except ProcessException as e:
                    logging.error(
                        'Could not download a DailyMed SPL archive: {0}: {1}'.format(
                            a['href'], e))
Example #21
 def map(self, zip_file, value, output):
     output_dir = self.output().path
     common.shell_cmd_quiet('mkdir -p %s', output_dir)
     common.shell_cmd_quiet('7z x "%s" -aoa -bd -y -o%s', zip_file,
                            output_dir)
Example #22
 def test_shell_cmd_quiet(self):
   tmpFile = '/tmp/'+(''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(32)))
   common.shell_cmd_quiet('touch %(tmpFile)s' % locals())
   assert common.shell_cmd_quiet('ls %(tmpFile)s' % locals()).startswith(tmpFile.encode())
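The .encode() above is the tell that shell_cmd_quiet returns bytes rather than str; callers that need text must decode, as Example #13 does:

# Illustration only: output arrives as bytes and must be decoded for text work.
listing = common.shell_cmd_quiet('ls /tmp')
assert isinstance(listing, bytes)
text = listing.decode()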
from os.path import join

import arrow
import luigi
import simplejson as json
from bs4 import BeautifulSoup

from openfda import common, config, parallel, spl
from openfda.annotation_table import unii_harmonization
from openfda.spl import process_barcodes, extract
from openfda.tasks import DependencyTriggeredTask

data_dir = config.data_dir('harmonization')
BATCH_DATE = arrow.utcnow().ceil('week').format('YYYYMMDD')
BASE_DIR = config.data_dir('harmonization/batches/%s' % BATCH_DATE)
SPL_S3_DIR = config.data_dir('spl/s3_sync')
TMP_DIR = config.tmp_dir()

common.shell_cmd_quiet('mkdir -p %s', data_dir)
common.shell_cmd_quiet('mkdir -p %s', BASE_DIR)
common.shell_cmd_quiet('mkdir -p %s', TMP_DIR)

SPL_SET_ID_INDEX = join(BASE_DIR, 'spl_index.db')
DAILYMED_PREFIX = 'ftp://public.nlm.nih.gov/nlmdata/.dailymed/'

PHARM_CLASS_DOWNLOAD = \
  DAILYMED_PREFIX + 'pharmacologic_class_indexing_spl_files.zip'

RXNORM_DOWNLOAD = \
  DAILYMED_PREFIX + 'rxnorm_mappings.zip'

UNII_DOWNLOAD = \
  'https://fdasis.nlm.nih.gov/srs/download/srs/UNIIs.zip'
 def run(self):
   zip_filename = self.input().path
   output_dir = self.output().path
   common.shell_cmd_quiet('mkdir -p %s' % output_dir)
   ExtractXMLFromNestedZip(zip_filename, output_dir)