Exemple #1
0
  def run(self):
    '''Download the NSDE zip, extract it and write the CSV as JSON lines.

    The archive is fetched to the 'nsde/raw/nsde.zip' path returned by
    config.data_dir and unpacked next to it. The CSV named by
    self.csv_file_name is then written to self.output().path with one
    JSON object per line.
    '''
    logging.basicConfig(level=logging.INFO)

    zip_filename = config.data_dir('nsde/raw/nsde.zip')
    output_dir = config.data_dir('nsde/raw')
    os.system('mkdir -p %s' % output_dir)
    common.download(NSDE_DOWNLOAD, zip_filename)
    os.system('unzip -o %(zip_filename)s -d %(output_dir)s' % locals())

    csv_file = join(output_dir, self.csv_file_name)
    logging.info("Reading csv file: %s", csv_file)
    os.system('mkdir -p %s' % dirname(self.output().path))
    # 'utf-8-sig' drops a leading byte-order mark when the file has one.
    df = pd.read_csv(csv_file, encoding='utf-8-sig')
    # No whole-frame df.to_json() here: opening the same path in "w" mode
    # below truncates the file, so such a write would only be wasted work.
    with open(self.output().path, "w") as f:
      for _, row in df.iterrows():
        row.to_json(f)
        f.write("\n")
Exemple #2
0
import simplejson as json
import sys
import time

import arrow
import elasticsearch
import luigi

from openfda import config, common, elasticsearch_requests, index_util, parallel
from openfda.annotation_table.pipeline import CombineHarmonization
from openfda.tasks import AlwaysRunTask
from openfda.spl import annotate
from openfda.parallel import IdentityReducer

# Parent of the directory that contains this module.
RUN_DIR = dirname(dirname(os.path.abspath(__file__)))
SPL_JS = join(RUN_DIR, 'spl/spl_to_json.js')
LOINC = join(RUN_DIR, 'spl/data/sections.csv')

# Directory used for task tracking; it is created when the module loads.
META_DIR = config.data_dir('spl/meta')
SPL_BATCH_DIR = join(META_DIR, 'batch')
common.shell_cmd('mkdir -p %s', META_DIR)

# S3 bucket URL and the local paths used alongside it.
SPL_S3_BUCKET = 's3://openfda-data-spl/data/'
SPL_S3_LOCAL_DIR = config.data_dir('spl/s3_sync')
SPL_S3_CHANGE_LOG = join(SPL_S3_LOCAL_DIR, 'change_log/SPLDocuments.csv')
SPL_PROCESS_DIR = config.data_dir('spl/batches')

common.shell_cmd('mkdir -p %s', SPL_S3_LOCAL_DIR)
common.shell_cmd('mkdir -p %s', SPL_PROCESS_DIR)
Exemple #3
0
 def output(self):
   '''Return the Luigi target at config.data_dir('device_pma/extracted').'''
   extracted_path = config.data_dir('device_pma/extracted')
   return luigi.LocalTarget(extracted_path)
Exemple #4
0
Pipeline for converting CSV nsde data to JSON and importing into Elasticsearch.
'''

import glob
import os
from os.path import join, dirname

import luigi

from openfda import common, config, parallel, index_util
from openfda.common import newest_file_timestamp

# Data-directory locations used by the NSDE tasks.
NSDE_RAW_DIR = config.data_dir('nsde/raw')
NSDE_EXTRACT_DB = 'nsde/nsde.db'
# URL of the zipped NSDE source file.
NSDE_DOWNLOAD = ('https://download.open.fda.gov/'
                 'Comprehensive_NDC_SPL_Data_Elements_File.zip')


class DownloadNSDE(luigi.Task):
    '''Download the NSDE zip archive and expose its CSV as nsde.csv.'''

    def output(self):
        '''Return the target for the extracted CSV: NSDE_RAW_DIR/nsde.csv.'''
        return luigi.LocalTarget(join(NSDE_RAW_DIR, 'nsde.csv'))

    def run(self):
        '''Fetch the archive, unzip it and rename the CSV to the output path.

        Raises:
          subprocess.CalledProcessError: unzip exited with a non-zero status.
          RuntimeError: no CSV file was found after extraction.
        '''
        # Local import so the block does not depend on a module-level import.
        import subprocess

        output_dir = dirname(self.output().path)
        zip_filename = join(output_dir, 'nsde.zip')
        common.download(NSDE_DOWNLOAD, zip_filename)
        # An argument list (no shell) keeps paths with spaces or shell
        # metacharacters intact, and check_call stops the task when unzip
        # fails instead of silently ignoring its exit status.
        subprocess.check_call(['unzip', '-o', zip_filename, '-d', output_dir])

        # Sort so the choice is deterministic if several CSVs are present.
        csv_files = sorted(glob.glob(join(output_dir, '*.csv')))
        if not csv_files:
            raise RuntimeError('No CSV file found in %s after extracting %s' %
                               (output_dir, zip_filename))
        os.rename(csv_files[0], self.output().path)


class NSDE2JSONMapper(parallel.Mapper):
Exemple #5
0
 def output(self):
     '''Return the Luigi target at config.data_dir('device_pma/extracted').'''
     target_path = config.data_dir('device_pma/extracted')
     return luigi.LocalTarget(target_path)
Exemple #6
0
 def output(self):
   '''Return the Luigi target at config.data_dir('classification/extracted').'''
   extracted_path = config.data_dir('classification/extracted')
   return luigi.LocalTarget(extracted_path)
Exemple #7
0
 def output(self):
   '''Return the Luigi target at config.data_dir('nsde/raw/nsde_raw.json').'''
   raw_json_path = config.data_dir('nsde/raw/nsde_raw.json')
   return luigi.LocalTarget(raw_json_path)
Exemple #8
0
 def output(self):
   '''Return the Luigi target named BATCH inside the 'device_udi/annotate.db' path.'''
   annotate_db = config.data_dir('device_udi/annotate.db')
   batch_path = os.path.join(annotate_db, BATCH)
   return luigi.LocalTarget(batch_path)
Exemple #9
0
 def output(self):
   '''Return the Luigi target at config.data_dir('registration/registration_listing.db').'''
   listing_db = config.data_dir('registration/registration_listing.db')
   return luigi.LocalTarget(listing_db)
Exemple #10
0
 def output(self):
   '''Return the Luigi target at config.data_dir('registration/owner_operator.db').'''
   owner_operator_db = config.data_dir('registration/owner_operator.db')
   return luigi.LocalTarget(owner_operator_db)
Exemple #11
0
 def output(self):
   '''Return the Luigi target at config.data_dir('registration/extracted').'''
   extracted_path = config.data_dir('registration/extracted')
   return luigi.LocalTarget(extracted_path)
Exemple #12
0
from bs4 import BeautifulSoup
import elasticsearch
import luigi
import pandas
import requests
import simplejson as json
import urllib2

from openfda import common, config, elasticsearch_requests, index_util, parallel
from openfda import download_util
from openfda.tasks import AlwaysRunTask
from openfda.device_harmonization.pipeline import (Harmonized2OpenFDA,
  DeviceAnnotateMapper)

# Remote locations used by the registration tasks.
DEVICE_REG_PAGE = (
  'http://www.fda.gov/MedicalDevices/DeviceRegulationandGuidance/'
  'HowtoMarketYourDevice/RegistrationandListing/ucm134495.htm')
S3_BUCKET = 's3://openfda-data-reglist/'

# Parent of the directory that contains this module.
RUN_DIR = dirname(dirname(os.path.abspath(__file__)))

BASE_DIR = config.data_dir('registration')
common.shell_cmd('mkdir -p %s', BASE_DIR)

# Holds the files that track Task state.
META_DIR = config.data_dir('registration/meta')
common.shell_cmd('mkdir -p %s', META_DIR)

S3_LOCAL_DIR = config.data_dir('registration/s3_sync')
common.shell_cmd('mkdir -p %s', S3_LOCAL_DIR)

REMAPPED_FILES = {
Exemple #13
0
import sys
import time

import arrow
import elasticsearch
import luigi

from openfda import config, common, elasticsearch_requests, index_util, parallel
from openfda.annotation_table.pipeline import CombineHarmonization
from openfda.tasks import AlwaysRunTask
from openfda.spl import annotate
from openfda.parallel import IdentityReducer


# Paths relative to the parent of this module's directory.
RUN_DIR = dirname(dirname(os.path.abspath(__file__)))
LOINC = join(RUN_DIR, 'spl/data/sections.csv')
SPL_JS = join(RUN_DIR, 'spl/spl_to_json.js')

# Task-tracking directory, created at import time.
META_DIR = config.data_dir('spl/meta')
common.shell_cmd('mkdir -p %s', META_DIR)
SPL_BATCH_DIR = join(META_DIR, 'batch')

SPL_S3_BUCKET = 's3://openfda-data-spl/data/'
SPL_S3_LOCAL_DIR = config.data_dir('spl/s3_sync')
SPL_PROCESS_DIR = config.data_dir('spl/batches')
SPL_S3_CHANGE_LOG = join(SPL_S3_LOCAL_DIR, 'change_log/SPLDocuments.csv')

# Both working directories must exist before any task runs.
common.shell_cmd('mkdir -p %s', SPL_S3_LOCAL_DIR)
common.shell_cmd('mkdir -p %s', SPL_PROCESS_DIR)
Exemple #14
0
 def output(self):
   '''Return the Luigi target named BATCH inside the 'device_udi/extracted' path.'''
   extracted_root = config.data_dir('device_udi/extracted')
   batch_path = os.path.join(extracted_root, BATCH)
   return luigi.LocalTarget(batch_path)
Exemple #15
0
 def output(self):
     '''Return the Luigi target at config.data_dir('510k/raw').'''
     raw_path = config.data_dir('510k/raw')
     return luigi.LocalTarget(raw_path)
Exemple #16
0
 def output(self):
     '''Return the Luigi target at config.data_dir('510k', 'json.db').'''
     json_db_path = config.data_dir('510k', 'json.db')
     return luigi.LocalTarget(json_db_path)
Exemple #17
0
 def output(self):
   '''Return the Luigi target at config.data_dir('registration/annotate.db').'''
   annotate_db = config.data_dir('registration/annotate.db')
   return luigi.LocalTarget(annotate_db)
Exemple #18
0
from os.path import basename, dirname, join
import re
import subprocess
import urllib2
import urlparse

import arrow
import simplejson as json

from openfda.tasks import DependencyTriggeredTask
from openfda import common, config, parallel, spl
from openfda.annotation_table import unii_harmonization
from openfda.spl import process_barcodes, extract

# Remote source of the DailyMed downloads.
DAILYMED_PREFIX = 'ftp://public.nlm.nih.gov/nlmdata/.dailymed/'
PHARM_CLASS_DOWNLOAD = (
  DAILYMED_PREFIX + 'pharmacologic_class_indexing_spl_files.zip')

# Parent of the directory that contains this module.
RUN_DIR = dirname(dirname(os.path.abspath(__file__)))

data_dir = config.data_dir('harmonization')
# End of the current UTC week, formatted YYYYMMDD; evaluated at import time.
BATCH_DATE = arrow.utcnow().ceil('week').format('YYYYMMDD')
BASE_DIR = config.data_dir('harmonization/batches/%s' % BATCH_DATE)
SPL_SET_ID_INDEX = join(BASE_DIR, 'spl_index.db')
SPL_S3_DIR = config.data_dir('spl/s3_sync')
TMP_DIR = config.tmp_dir()

# Create the working directories up front.
common.shell_cmd('mkdir -p %s', data_dir)
common.shell_cmd('mkdir -p %s', BASE_DIR)
common.shell_cmd('mkdir -p %s', TMP_DIR)
Exemple #19
0
import sys
import time

import arrow
import elasticsearch
import luigi

from openfda import config, common, elasticsearch_requests, index_util, parallel
from openfda.annotation_table.pipeline import CombineHarmonization
from openfda.tasks import AlwaysRunTask
from openfda.spl import annotate
from openfda.parallel import IdentityReducer


# Parent of the directory that contains this module, and files beneath it.
RUN_DIR = dirname(dirname(os.path.abspath(__file__)))
SPL_JS = join(RUN_DIR, "spl/spl_to_json.js")
LOINC = join(RUN_DIR, "spl/data/sections.csv")

# The meta directory is needed for task tracking, so create it right away.
META_DIR = config.data_dir("spl/meta")
SPL_BATCH_DIR = join(META_DIR, "batch")
common.shell_cmd("mkdir -p %s", META_DIR)

SPL_S3_BUCKET = "s3://openfda-data-spl/data/"
SPL_S3_LOCAL_DIR = config.data_dir("spl/s3_sync")
SPL_S3_CHANGE_LOG = join(SPL_S3_LOCAL_DIR, "change_log/SPLDocuments.csv")
SPL_PROCESS_DIR = config.data_dir("spl/batches")

common.shell_cmd("mkdir -p %s", SPL_S3_LOCAL_DIR)
common.shell_cmd("mkdir -p %s", SPL_PROCESS_DIR)
Exemple #20
0
import arrow
import elasticsearch
import luigi
import requests
import simplejson as json

from openfda import common, config, index_util, parallel
from openfda import device_common, download_util
from openfda.device_harmonization.pipeline import (Harmonized2OpenFDA,
  DeviceAnnotateMapper)
from openfda.tasks import AlwaysRunTask

# Zip archive downloaded by the classification pipeline.
DEVICE_CLASS_ZIP = 'http://www.accessdata.fda.gov/premarket/ftparea/foiclass.zip'

# Parent of the directory that contains this module.
RUN_DIR = dirname(dirname(os.path.abspath(__file__)))

# Holds the files that track Task state; created at import time.
META_DIR = config.data_dir('classification/meta')
common.shell_cmd('mkdir -p %s', META_DIR)


class DownloadFoiClass(luigi.Task):
  def requires(self):
    '''Return an empty list: this task declares no upstream tasks.'''
    upstream_tasks = []
    return upstream_tasks

  def output(self):
    '''Return the Luigi target at config.data_dir('classification/raw').'''
    raw_path = config.data_dir('classification/raw')
    return luigi.LocalTarget(raw_path)

  def run(self):
Exemple #21
0
 def output(self):
   '''Return the Luigi target for the 'res/batches/<YYYYMMDD>' path of self.batch.'''
   batch_stamp = self.batch.strftime('%Y%m%d')
   batch_path = config.data_dir('res/batches/%s' % batch_stamp)
   return luigi.LocalTarget(batch_path)
Exemple #22
0
 def output(self):
     '''Return the Luigi target for enforcement.csv in the self.batch batch path.'''
     batch_stamp = self.batch.strftime('%Y%m%d')
     csv_path = config.data_dir('res/batches/%s/enforcement.csv' % batch_stamp)
     return luigi.LocalTarget(csv_path)
Exemple #23
0
from os.path import dirname, join
import sys

import arrow
import elasticsearch
import luigi
import requests
import simplejson as json

from openfda import common, config, elasticsearch_requests, index_util, parallel
from openfda.device_harmonization.pipeline import (Harmonized2OpenFDA,
  DeviceAnnotateMapper)
from openfda.tasks import AlwaysRunTask

# S3 bucket URL for device recalls.
DEVICE_RECALL_BUCKET = 's3://openfda-device-recalls/'

# Parent of the directory that contains this module.
RUN_DIR = dirname(dirname(os.path.abspath(__file__)))

# The meta directory is created as soon as the module is imported.
META_DIR = config.data_dir('device_recall/meta')
common.shell_cmd('mkdir -p %s', META_DIR)

DEVICE_RECALL_LOCAL_DIR = config.data_dir('device_recall/s3_sync')

class SyncS3DeviceRecall(AlwaysRunTask):
  bucket = DEVICE_RECALL_BUCKET
  local_dir = DEVICE_RECALL_LOCAL_DIR

  def _run(self):
    common.cmd(['mkdir', '-p', self.local_dir])
    common.cmd(['aws',
                '--profile=' + config.aws_profile(),
                's3',
                'sync',
Exemple #24
0
'''

import collections
import logging
import os
from os.path import basename, dirname, join
import sys

import luigi
import simplejson as json

from openfda import common, config, parallel
from openfda.parallel import mapreduce, Collection, Mapper, PivotReducer

# Parent of the directory that contains this module.
RUN_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# Working directory for this pipeline; created at import time.
DATA_DIR = config.data_dir('device_harmonization')
common.shell_cmd('mkdir -p %s', DATA_DIR)

PLUCK_MAP = {
  'device_pma': [
    'applicant',
    'trade_name',
    'generic_name',
    'decision_date',
    'advisory_committee',
    'pma_number',
    'product_code',
    'advisory_committee',
  ],
  '510k': [
    'applicant',
Exemple #25
0
 def output(self):
   '''Return the Luigi target at config.data_dir('device_recall/annotate.db').'''
   annotate_db = config.data_dir('device_recall/annotate.db')
   return luigi.LocalTarget(annotate_db)
Exemple #26
0
import glob
import logging
import os
import re
from urllib.parse import urljoin
from urllib.request import urlopen
from os.path import join, dirname

import luigi
from bs4 import BeautifulSoup

from openfda import common, config, index_util, parallel
from openfda.common import convert_unicode, newest_file_timestamp

# Web page the CAERS pipeline downloads from.
CAERS_DOWNLOAD_PAGE_URL = (
  'https://www.fda.gov/food/compliance-enforcement-food/'
  'cfsan-adverse-event-reporting-system-caers')

# Parent of the directory that contains this module.
RUN_DIR = dirname(dirname(os.path.abspath(__file__)))

BASE_DIR = config.data_dir('caers')
DOWNLOAD_DIR = config.data_dir('caers/raw')
# Only BASE_DIR is created here.
common.shell_cmd('mkdir -p %s', BASE_DIR)

RENAME_MAP = {
  'report id': 'report_number',
  'caers created date': 'date_created',
  'date of event': 'date_started',
  'product type': 'role',
  'product': 'name_brand',
  'product code': 'industry_code',
  'description': 'industry_name',
  'patient age': 'age',
  'age units': 'age_unit',
Exemple #27
0
import arrow
import elasticsearch
import luigi
import requests
import simplejson as json

from openfda import common, config, elasticsearch_requests, index_util, parallel
from openfda import download_util
from openfda.device_clearance import transform
from openfda.device_harmonization.pipeline import (Harmonized2OpenFDA,
  DeviceAnnotateMapper)
from openfda.tasks import AlwaysRunTask

# Base URL and the individual 510(k) zip archives beneath it.
CLEARED_DEVICE_URL = 'http://www.accessdata.fda.gov/premarket/ftparea/'
CLEARED_DEV_ZIPS = [
  CLEARED_DEVICE_URL + 'pmn96cur.zip',
  CLEARED_DEVICE_URL + 'pmn9195.zip',
  CLEARED_DEVICE_URL + 'pmn8690.zip',
  CLEARED_DEVICE_URL + 'pmn8185.zip',
  CLEARED_DEVICE_URL + 'pmn7680.zip',
]

# Parent of the directory that contains this module.
RUN_DIR = dirname(dirname(os.path.abspath(__file__)))

# Holds the files that track Task state; created at import time.
META_DIR = config.data_dir('510k/meta')
common.shell_cmd('mkdir -p %s', META_DIR)


class Download_510K(luigi.Task):
  def requires(self):
    '''Return an empty list: no upstream tasks are declared.'''
    dependencies = []
    return dependencies
Exemple #28
0
 def output(self):
   '''Return the Luigi target named BATCH inside the 'device_udi/annotate.db' path.'''
   annotate_root = config.data_dir('device_udi/annotate.db')
   return luigi.LocalTarget(os.path.join(annotate_root, BATCH))
Exemple #29
0
 def output(self):
   '''Return the Luigi target at config.data_dir('510k/raw').'''
   raw_dir = config.data_dir('510k/raw')
   return luigi.LocalTarget(raw_dir)
Exemple #30
0
import arrow
import elasticsearch
import luigi
import requests
import simplejson as json

from openfda import common, config, elasticsearch_requests, index_util, parallel
from openfda import download_util
from openfda.device_clearance import transform
from openfda.device_harmonization.pipeline import (Harmonized2OpenFDA,
                                                   DeviceAnnotateMapper)
from openfda.tasks import AlwaysRunTask

# Base URL and the individual 510(k) zip archives beneath it.
CLEARED_DEVICE_URL = 'http://www.accessdata.fda.gov/premarket/ftparea/'
CLEARED_DEV_ZIPS = [
    CLEARED_DEVICE_URL + 'pmn96cur.zip',
    CLEARED_DEVICE_URL + 'pmn9195.zip',
    CLEARED_DEVICE_URL + 'pmn8690.zip',
    CLEARED_DEVICE_URL + 'pmn8185.zip',
    CLEARED_DEVICE_URL + 'pmn7680.zip',
]

# Parent of the directory that contains this module.
RUN_DIR = dirname(dirname(os.path.abspath(__file__)))

# Files that track Task state live here; the directory is created on import.
META_DIR = config.data_dir('510k/meta')
common.shell_cmd('mkdir -p %s', META_DIR)


class Download_510K(luigi.Task):
    def requires(self):
        '''Return an empty list: no upstream tasks are declared.'''
        prerequisites = []
        return prerequisites

    def output(self):
Exemple #31
0
 def output(self):
   '''Return the Luigi target at config.data_dir('510k/extracted').'''
   extracted_dir = config.data_dir('510k/extracted')
   return luigi.LocalTarget(extracted_dir)
Exemple #32
0
 def output(self):
     '''Return the Luigi target at config.data_dir('510k/extracted').'''
     target_path = config.data_dir('510k/extracted')
     return luigi.LocalTarget(target_path)
Exemple #33
0
 def output(self):
   '''Return the Luigi target at config.data_dir('510k', 'json.db').'''
   json_db = config.data_dir('510k', 'json.db')
   return luigi.LocalTarget(json_db)
Exemple #34
0
import glob
import logging
import os
import sys
import traceback

import arrow
import luigi
import xmltodict
from openfda import common, config, index_util, parallel
from openfda.device_harmonization.pipeline import (Harmonized2OpenFDA,
                                                   DeviceAnnotateMapper)

# Name of the AWS command-line executable.
AWS_CLI = 'aws'
# S3 bucket URL and the local directory paired with it.
DEVICE_UDI_BUCKET = 's3://cdrh-data'
DEVICE_UDI_LOCAL_DIR = config.data_dir('device_udi/s3_sync')
# Start of the current UTC day as YYYYMMDD; fixed when the module is imported.
BATCH = arrow.utcnow().floor('day').format('YYYYMMDD')


class SyncS3DeviceUDI(luigi.Task):
  bucket = DEVICE_UDI_BUCKET
  local_dir = DEVICE_UDI_LOCAL_DIR
  aws = AWS_CLI

  def flag_file(self):
    '''Return the path of the '.last_sync_time' file inside self.local_dir.'''
    marker_name = '.last_sync_time'
    return os.path.join(self.local_dir, marker_name)

  def complete(self):
    # Only run S3 sync once per day.
    if config.disable_downloads():
Exemple #35
0
 def output(self):
   '''Return the Luigi target at config.data_dir('nsde/raw/nsde_raw.json').'''
   json_path = config.data_dir('nsde/raw/nsde_raw.json')
   return luigi.LocalTarget(json_path)
Exemple #36
0
 def output(self):
   '''Return the Luigi target named BATCH inside the 'device_udi/extracted' path.'''
   extracted_dir = config.data_dir('device_udi/extracted')
   return luigi.LocalTarget(os.path.join(extracted_dir, BATCH))
Exemple #37
0
 def output(self):
   '''Return the Luigi target at config.data_dir(NSDE_EXTRACT_DB).'''
   extract_db_path = config.data_dir(NSDE_EXTRACT_DB)
   return luigi.LocalTarget(extract_db_path)
Exemple #38
0
import json
import os
import re
from os.path import join, dirname

import arrow
import luigi
import pandas as pd

from openfda import common, config, index_util, parallel
from openfda.tasks import AlwaysRunTask

# Directory that contains this module.
RUN_DIR = dirname(os.path.abspath(__file__))

# S3 bucket URLs holding the two kinds of access logs.
S3_ACCESS_LOGS_BUCKET = 's3://openfda-logs/download/'
CF_ACCESS_LOGS_BUCKET = 's3://openfda-splash-logs/download-cf-logs/'

S3_ACCESS_LOGS_CUTOFF = arrow.get('2017-03-01')

# Local paths for raw logs and for the stats databases built from them.
S3_ACCESS_LOGS_DIR = config.data_dir('downloadstats/s3_logs_raw')
S3_STATS_DB_DIR = config.data_dir('downloadstats/s3_stats.db')
CF_ACCESS_LOGS_DIR = config.data_dir('downloadstats/cf_logs_raw')
CF_STATS_DB_DIR = config.data_dir('downloadstats/cf_stats.db')
TOTAL_STATS_DB_DIR = config.data_dir('downloadstats/total_stats.db')

ENDPOINT_INDEX_MAP = {
  'animalandveterinary/event': 'animalandveterinarydrugevent',
  'drug/event': 'drugevent',
  'drug/label': 'druglabel',
  'drug/enforcement': 'drugenforcement',
  'drug/ndc': 'ndc',
Exemple #39
0
import arrow
import elasticsearch
import luigi
import requests
import simplejson as json

from openfda import common, config, index_util, parallel
from openfda import device_common, download_util
from openfda.device_harmonization.pipeline import (Harmonized2OpenFDA,
                                                   DeviceAnnotateMapper)
from openfda.tasks import AlwaysRunTask

# Zip archive downloaded by the classification pipeline.
DEVICE_CLASS_ZIP = 'https://www.accessdata.fda.gov/premarket/ftparea/foiclass.zip'

# Parent of the directory that contains this module.
RUN_DIR = dirname(dirname(os.path.abspath(__file__)))

# Files that track Task state are kept here; created at import time.
META_DIR = config.data_dir('classification/meta')
common.shell_cmd('mkdir -p %s', META_DIR)


class DownloadFoiClass(luigi.Task):
    def requires(self):
        '''Return an empty list: this task declares no upstream tasks.'''
        upstream = []
        return upstream

    def output(self):
        '''Return the Luigi target at config.data_dir('classification/raw').'''
        raw_dir = config.data_dir('classification/raw')
        return luigi.LocalTarget(raw_dir)

    def run(self):
        output_filename = join(self.output().path,
Exemple #40
0
 def output(self):
     '''Return the Luigi target at config.data_dir('classification/annotate.db').'''
     annotate_db = config.data_dir('classification/annotate.db')
     return luigi.LocalTarget(annotate_db)
Exemple #41
0
 def output(self):
     '''Return the Luigi target at config.data_dir('classification/extracted').'''
     extracted_dir = config.data_dir('classification/extracted')
     return luigi.LocalTarget(extracted_dir)
Exemple #42
0
 def output(self):
   '''Return the Luigi target at config.data_dir('classification/annotate.db').'''
   annotate_path = config.data_dir('classification/annotate.db')
   return luigi.LocalTarget(annotate_path)
Exemple #43
0
import time
import urllib2

import arrow
import elasticsearch
import luigi

from openfda import parallel, config, index_util, elasticsearch_requests
from openfda.annotation_table.pipeline import CombineHarmonization
from openfda.faers import annotate
from openfda.faers import xml_to_json
from openfda.tasks import AlwaysRunTask, DependencyTriggeredTask

# FDA pages that list the FAERS downloads.
FAERS_HISTORIC = (
    'http://www.fda.gov/Drugs/'
    'GuidanceComplianceRegulatoryInformation/Surveillance/'
    'AdverseDrugEffects/ucm083765.htm')
FAERS_CURRENT = (
    'http://www.fda.gov/Drugs/'
    'GuidanceComplianceRegulatoryInformation/Surveillance/'
    'AdverseDrugEffects/ucm082193.htm')

# NOTE(review): the original comment here said "this should be a symlink to
# wherever the real data directory is"; confirm which of the two names below
# it refers to.
RUN_DIR = dirname(dirname(os.path.abspath(__file__)))
BASE_DIR = config.data_dir()

MAX_RECORDS_PER_FILE = luigi.IntParameter(-1, is_global=True)


class DownloadDataset(AlwaysRunTask):
    '''
  This task downloads all datasets that have not yet been fetched.
  '''
    def _fetch(self):
Exemple #44
0
''' Device pipeline for downloading, transforming to JSON and loading COVID-19 Serological Testing Validation Project
 data into Elasticsearch.
'''

import glob
from os.path import dirname

import luigi
import arrow

from openfda import common, config, index_util, parallel
from openfda.tasks import AlwaysRunTask
from openfda.common import first_file_timestamp

# S3 bucket URL for the serology test data.
SEROLOGY_TEST_BUCKET = 's3://openfda-covid19serology/'

# Local paths resolved through config.data_dir.
SEROLOGY_TEST_SYNC_DIR = config.data_dir('covid19serology/s3_sync')
SEROLOGY_TEST_JSON_DB_DIR = config.data_dir('covid19serology/json.db')


class SyncS3SerologyTest(AlwaysRunTask):

  def _run(self):
    '''Create the local sync directory, then run `aws s3 sync` from the bucket into it.'''
    common.cmd(['mkdir', '-p', SEROLOGY_TEST_SYNC_DIR])

    profile_flag = '--profile=' + config.aws_profile()
    sync_command = ['aws', profile_flag, 's3', 'sync',
                    SEROLOGY_TEST_BUCKET, SEROLOGY_TEST_SYNC_DIR]
    common.cmd(sync_command)

  def output(self):
Exemple #45
0
import csv
import re
import logging
import os
from os.path import dirname, join

import arrow
import datetime
import luigi

from openfda import common, config, index_util, parallel

# Parent of the directory that contains this module.
RUN_DIR = dirname(dirname(os.path.abspath(__file__)))
BASE_DIR = config.data_dir('caers')
common.shell_cmd('mkdir -p %s', BASE_DIR)

S3_BUCKET = 's3://openfda-data-caers/'
S3_LOCAL_DIR = config.data_dir('caers/s3_sync')
# TODO(hansnelsen): initiate and resolve naming convention for this file and
#                   s3 bucket. Currently, the file is downloaded from
#                   s3://openfda-lonnie/caers/ (the naming of this file is
#                   not consistent). The pipeline engineer downloads it,
#                   renames it and then uploads it manually to the above
#                   bucket.
CAERS_FILE = 'caers.csv'
# The path is passed as a lazy %-argument. Using it as the format string with
# a stray extra argument makes the logging module report a formatting error
# whenever INFO records are emitted.
logging.info('%s dir', S3_LOCAL_DIR)
common.shell_cmd('mkdir -p %s', S3_LOCAL_DIR)

RENAME_MAP = {
  'Report #': 'report_number',
  'Created Date': 'date_created',
  'Event Start Date': 'date_started',
Exemple #46
0
import glob
import os
from os.path import dirname, join

import luigi

from openfda import common, config, download_util, index_util, parallel
from openfda.common import first_file_timestamp
from openfda.device_harmonization.pipeline import (Harmonized2OpenFDA,
                                                   DeviceAnnotateMapper)
from openfda.device_pma import transform

# Zip archive downloaded by the PMA pipeline.
DEVICE_PMA_ZIP = 'https://www.accessdata.fda.gov/premarket/ftparea/pma.zip'

# Parent of the directory that contains this module.
RUN_DIR = dirname(dirname(os.path.abspath(__file__)))

# META_DIR holds the files that track Task state; only it is created here.
META_DIR = config.data_dir('device_pma/meta')
RAW_DIR = config.data_dir('device_pma/raw')
common.shell_cmd('mkdir -p %s', META_DIR)


class DownloadPMA(luigi.Task):
    def requires(self):
        '''Return an empty list: no upstream tasks are declared.'''
        dependencies = []
        return dependencies

    def output(self):
        '''Return the Luigi target located at RAW_DIR.'''
        raw_target = luigi.LocalTarget(RAW_DIR)
        return raw_target

    def run(self):
        output_filename = join(self.output().path,
Exemple #47
0
import os
import re
import sys
import traceback

import arrow
import luigi
from lxml import etree

from openfda import common, config, index_util, parallel
from openfda.adae import annotate
from openfda.annotation_table.pipeline import CombineHarmonization
from openfda.common import newest_file_timestamp

# Name of the AWS command-line executable.
AWS_CLI = 'aws'
# S3 bucket URL and the local directory paired with it.
ADAE_BUCKET = 's3://openfda-data-adae'
ADAE_LOCAL_DIR = config.data_dir('adae/s3_sync')
# Start of the current UTC day as YYYYMMDD; fixed when the module is imported.
BATCH = arrow.utcnow().floor('day').format('YYYYMMDD')

# TODO: move to an external file once the list grows unmanageable.
NULLIFIED = [
    'US-FDACVM-2018-US-045311.xml', 'US-FDACVM-2018-US-048571.xml',
    'US-FDACVM-2018-US-046672.xml', 'US-FDACVM-2017-US-042492.xml',
    'US-FDACVM-2018-US-044065.xml', 'US-FDACVM-2017-US-070108.xml',
    'US-FDACVM-2017-US-002864.xml', 'US-FDACVM-2017-US-002866.xml',
    'US-FDACVM-2017-US-052458.xml', 'US-FDACVM-2017-US-055193.xml',
    'US-FDACVM-2017-US-043931.xml', 'US-FDACVM-2018-US-002321.xml',
    'US-FDACVM-2018-US-063536.xml', 'US-FDACVM-2015-US-221044.xml',
    'US-FDACVM2019-US-016263.xml', 'US-FDACVM-2016-US-062923.xml',
    'US-FDACVM-2017-US-001483.xml', 'US-FDACVM-2017-US-009155.xml',
    'US-FDACVM-2017-US-028125.xml', 'US-FDACVM-2017-US-033030.xml',
Exemple #48
0
 def output(self):
     '''Return the Luigi target at config.data_dir('device_pma/json.db').'''
     json_db = config.data_dir('device_pma/json.db')
     return luigi.LocalTarget(json_db)
Exemple #49
0
 def output(self):
     '''Return the Luigi target named BATCH inside the 'adae/json.db' path.'''
     json_db_root = config.data_dir('adae/json.db')
     batch_path = os.path.join(json_db_root, BATCH)
     return luigi.LocalTarget(batch_path)
Exemple #50
0
 def output(self):
     '''Return the Luigi target for the 'res/batches/<YYYYMMDD>' path of self.batch.'''
     batch_stamp = self.batch.strftime('%Y%m%d')
     batch_dir = config.data_dir('res/batches/%s' % batch_stamp)
     return luigi.LocalTarget(batch_dir)
Exemple #51
0
from os.path import dirname, join
import sys

import arrow
import elasticsearch
import luigi
import requests
import simplejson as json

from openfda import common, config, elasticsearch_requests, index_util, parallel
from openfda.device_harmonization.pipeline import (Harmonized2OpenFDA,
                                                   DeviceAnnotateMapper)
from openfda.tasks import AlwaysRunTask

# S3 bucket URL for device recalls.
DEVICE_RECALL_BUCKET = 's3://openfda-device-recalls/'

# Parent of the directory that contains this module.
RUN_DIR = dirname(dirname(os.path.abspath(__file__)))

# The meta directory is created at import time.
META_DIR = config.data_dir('device_recall/meta')
common.shell_cmd('mkdir -p %s', META_DIR)

DEVICE_RECALL_LOCAL_DIR = config.data_dir('device_recall/s3_sync')


class SyncS3DeviceRecall(AlwaysRunTask):
    bucket = DEVICE_RECALL_BUCKET
    local_dir = DEVICE_RECALL_LOCAL_DIR

    def _run(self):
        common.cmd(['mkdir', '-p', self.local_dir])
        common.cmd([
            'aws', '--profile=' + config.aws_profile(), 's3', 'sync',
            self.bucket, self.local_dir
Exemple #52
0
 def output(self):
     '''Return the Luigi target at config.data_dir(NSDE_EXTRACT_DB).'''
     extract_db = config.data_dir(NSDE_EXTRACT_DB)
     return luigi.LocalTarget(extract_db)
Exemple #53
0
 def output(self):
     '''Return the Luigi target at config.data_dir('device_recall/annotate.db').'''
     annotate_path = config.data_dir('device_recall/annotate.db')
     return luigi.LocalTarget(annotate_path)
Exemple #54
0
import arrow
import elasticsearch
import luigi
import requests
import simplejson as json

from openfda import common, config, download_util, elasticsearch_requests, index_util, parallel

from openfda.device_harmonization.pipeline import (Harmonized2OpenFDA,
  DeviceAnnotateMapper)
from openfda.device_pma import transform
from openfda.tasks import AlwaysRunTask

# Zip archive downloaded by the PMA pipeline.
DEVICE_PMA_ZIP = 'https://www.accessdata.fda.gov/premarket/ftparea/pma.zip'

# Parent of the directory that contains this module.
RUN_DIR = dirname(dirname(os.path.abspath(__file__)))

# Files that track Task state are kept here; created at import time.
META_DIR = config.data_dir('device_pma/meta')
common.shell_cmd('mkdir -p %s', META_DIR)

class DownloadPMA(luigi.Task):
  def requires(self):
    '''Return an empty list: this task declares no upstream tasks.'''
    prerequisites = []
    return prerequisites

  def output(self):
    '''Return the Luigi target at config.data_dir('device_pma/raw').'''
    raw_dir = config.data_dir('device_pma/raw')
    return luigi.LocalTarget(raw_dir)

  def run(self):
    output_filename = join(self.output().path, DEVICE_PMA_ZIP.split('/')[-1])
    common.download(DEVICE_PMA_ZIP, output_filename)
Exemple #55
0
import urllib2

import arrow
import elasticsearch
import luigi

from openfda import parallel, config, index_util, elasticsearch_requests
from openfda.annotation_table.pipeline import CombineHarmonization
from openfda.faers import annotate
from openfda.faers import xml_to_json
from openfda.tasks import AlwaysRunTask, DependencyTriggeredTask


# FDA pages that list the FAERS downloads.
FAERS_HISTORIC = (
  'http://www.fda.gov/Drugs/'
  'GuidanceComplianceRegulatoryInformation/Surveillance/'
  'AdverseDrugEffects/ucm083765.htm')
FAERS_CURRENT = (
  'http://www.fda.gov/Drugs/'
  'GuidanceComplianceRegulatoryInformation/Surveillance/'
  'AdverseDrugEffects/ucm082193.htm')

# NOTE(review): the original comment here said "this should be a symlink to
# wherever the real data directory is"; confirm which of the two names below
# it refers to.
RUN_DIR = dirname(dirname(os.path.abspath(__file__)))
BASE_DIR = config.data_dir()

MAX_RECORDS_PER_FILE = luigi.IntParameter(-1, is_global=True)

class DownloadDataset(AlwaysRunTask):
  '''
  This task downloads all datasets that have not yet been fetched.
  '''
  def _fetch(self):
    for page in [self._faers_current.find_all(href=re.compile('.*.zip')),
                 self._faers_historic.find_all(href=re.compile('.*.zip'))]:
      for a in page:
Exemple #56
0
 def output(self):
   '''Return the Luigi target at config.data_dir('device_pma/json.db').'''
   json_db_path = config.data_dir('device_pma/json.db')
   return luigi.LocalTarget(json_db_path)
Exemple #57
0
import glob
import logging
import os
import sys
import traceback

import arrow
import luigi
import xmltodict
from openfda import common, config, index_util, parallel
from openfda.device_harmonization.pipeline import (Harmonized2OpenFDA,
                                                   DeviceAnnotateMapper)

# Name of the AWS command-line executable.
AWS_CLI = 'aws'
# S3 bucket URL and the local directory paired with it.
DEVICE_UDI_BUCKET = 's3://cdrh-data'
DEVICE_UDI_LOCAL_DIR = config.data_dir('device_udi/s3_sync')
# Start of the current UTC day as YYYYMMDD; computed once at import time.
BATCH = arrow.utcnow().floor('day').format('YYYYMMDD')


class SyncS3DeviceUDI(luigi.Task):
  bucket = DEVICE_UDI_BUCKET
  local_dir = DEVICE_UDI_LOCAL_DIR
  aws = AWS_CLI

  def flag_file(self):
    '''Return the path of the '.last_sync_time' file inside self.local_dir.'''
    flag_name = '.last_sync_time'
    return os.path.join(self.local_dir, flag_name)

  def complete(self):
    # Only run S3 sync once per day.
    if config.disable_downloads():