def run(self):
    '''Download the NSDE zip, unpack it, and write its CSV as JSON lines.

    The archive at NSDE_DOWNLOAD is saved under the 'nsde/raw' data path and
    unzipped there. The CSV named by self.csv_file_name is then read with
    pandas and written to self.output().path, one JSON object per line.
    '''
    logging.basicConfig(level=logging.INFO)

    zip_filename = config.data_dir('nsde/raw/nsde.zip')
    output_dir = config.data_dir('nsde/raw')
    output_path = self.output().path

    os.system('mkdir -p %s' % output_dir)
    common.download(NSDE_DOWNLOAD, zip_filename)
    # Explicit arguments instead of '% locals()': same command string, but it
    # no longer depends on the names of local variables.
    os.system('unzip -o %s -d %s' % (zip_filename, output_dir))

    csv_file = join(output_dir, self.csv_file_name)
    logging.info("Reading csv file: %s", csv_file)
    os.system('mkdir -p %s' % dirname(output_path))

    # 'utf-8-sig' strips a leading byte-order mark from the CSV if present.
    df = pd.read_csv(csv_file, encoding='utf-8-sig')

    # Write one JSON object per row. No whole-frame
    # df.to_json(..., orient='records') dump is made to this path first:
    # opening it in "w" mode here would truncate that dump immediately.
    with open(output_path, "w") as f:
        for _, row in df.iterrows():
            row.to_json(f)
            f.write("\n")
import simplejson as json
import sys
import time

import arrow
import elasticsearch
import luigi

from openfda import config, common, elasticsearch_requests, index_util, parallel
from openfda.annotation_table.pipeline import CombineHarmonization
from openfda.tasks import AlwaysRunTask
from openfda.spl import annotate
from openfda.parallel import IdentityReducer

# Parent of the directory that contains this file.
RUN_DIR = dirname(dirname(os.path.abspath(__file__)))
META_DIR = config.data_dir('spl/meta')
# Ensure meta directory is available for task tracking
common.shell_cmd('mkdir -p %s', META_DIR)

# Files addressed relative to RUN_DIR.
SPL_JS = join(RUN_DIR, 'spl/spl_to_json.js')
LOINC = join(RUN_DIR, 'spl/data/sections.csv')

# NOTE(review): presumably SPL_S3_LOCAL_DIR is the local sync destination for
# SPL_S3_BUCKET -- confirm against the task that uses them.
SPL_S3_BUCKET = 's3://openfda-data-spl/data/'
SPL_S3_LOCAL_DIR = config.data_dir('spl/s3_sync')
SPL_S3_CHANGE_LOG = join(SPL_S3_LOCAL_DIR, 'change_log/SPLDocuments.csv')
SPL_BATCH_DIR = join(META_DIR, 'batch')
SPL_PROCESS_DIR = config.data_dir('spl/batches')

# Issue 'mkdir -p' for the remaining working paths when the module is loaded.
common.shell_cmd('mkdir -p %s', SPL_S3_LOCAL_DIR)
common.shell_cmd('mkdir -p %s', SPL_PROCESS_DIR)
def output(self):
    '''Return the local target for the 'device_pma/extracted' data path.'''
    extracted_path = config.data_dir('device_pma/extracted')
    return luigi.LocalTarget(extracted_path)
Pipeline for converting CSV nsde data to JSON and importing into Elasticsearch. ''' import glob import os from os.path import join, dirname import luigi from openfda import common, config, parallel, index_util from openfda.common import newest_file_timestamp NSDE_DOWNLOAD = \ 'https://download.open.fda.gov/Comprehensive_NDC_SPL_Data_Elements_File.zip' NSDE_EXTRACT_DB = 'nsde/nsde.db' NSDE_RAW_DIR = config.data_dir('nsde/raw') class DownloadNSDE(luigi.Task): def output(self): return luigi.LocalTarget(join(NSDE_RAW_DIR, 'nsde.csv')) def run(self): output_dir = dirname(self.output().path) zip_filename = join(output_dir, 'nsde.zip') common.download(NSDE_DOWNLOAD, zip_filename) os.system('unzip -o %(zip_filename)s -d %(output_dir)s' % locals()) os.rename(glob.glob(join(output_dir, '*.csv'))[0], self.output().path) class NSDE2JSONMapper(parallel.Mapper):
def output(self):
    '''Return the local target for the 'classification/extracted' data path.'''
    extracted_path = config.data_dir('classification/extracted')
    return luigi.LocalTarget(extracted_path)
def output(self):
    '''Return the local target for the 'nsde/raw/nsde_raw.json' data path.'''
    raw_json_path = config.data_dir('nsde/raw/nsde_raw.json')
    return luigi.LocalTarget(raw_json_path)
def output(self):
    '''Return the local target at BATCH under 'device_udi/annotate.db'.'''
    annotate_root = config.data_dir('device_udi/annotate.db')
    batch_path = os.path.join(annotate_root, BATCH)
    return luigi.LocalTarget(batch_path)
def output(self):
    '''Return the local target for 'registration/registration_listing.db'.'''
    listing_path = config.data_dir('registration/registration_listing.db')
    return luigi.LocalTarget(listing_path)
def output(self):
    '''Return the local target for 'registration/owner_operator.db'.'''
    owner_operator_path = config.data_dir('registration/owner_operator.db')
    return luigi.LocalTarget(owner_operator_path)
def output(self):
    '''Return the local target for the 'registration/extracted' data path.'''
    extracted_path = config.data_dir('registration/extracted')
    return luigi.LocalTarget(extracted_path)
from bs4 import BeautifulSoup import elasticsearch import luigi import pandas import requests import simplejson as json import urllib2 from openfda import common, config, elasticsearch_requests, index_util, parallel from openfda import download_util from openfda.tasks import AlwaysRunTask from openfda.device_harmonization.pipeline import (Harmonized2OpenFDA, DeviceAnnotateMapper) RUN_DIR = dirname(dirname(os.path.abspath(__file__))) BASE_DIR = config.data_dir('registration') common.shell_cmd('mkdir -p %s', BASE_DIR) # A directory for holding files that track Task state META_DIR = config.data_dir('registration/meta') common.shell_cmd('mkdir -p %s', META_DIR) DEVICE_REG_PAGE = ('http://www.fda.gov/MedicalDevices/' 'DeviceRegulationandGuidance/HowtoMarketYourDevice/' 'RegistrationandListing/ucm134495.htm') S3_BUCKET = 's3://openfda-data-reglist/' S3_LOCAL_DIR = config.data_dir('registration/s3_sync') common.shell_cmd('mkdir -p %s', S3_LOCAL_DIR) REMAPPED_FILES = {
import sys
import time

import arrow
import elasticsearch
import luigi

from openfda import config, common, elasticsearch_requests, index_util, parallel
from openfda.annotation_table.pipeline import CombineHarmonization
from openfda.tasks import AlwaysRunTask
from openfda.spl import annotate
from openfda.parallel import IdentityReducer

# Parent of the directory that contains this file.
RUN_DIR = dirname(dirname(os.path.abspath(__file__)))
META_DIR = config.data_dir('spl/meta')
# Ensure meta directory is available for task tracking
common.shell_cmd('mkdir -p %s', META_DIR)

# Files addressed relative to RUN_DIR.
SPL_JS = join(RUN_DIR, 'spl/spl_to_json.js')
LOINC = join(RUN_DIR, 'spl/data/sections.csv')

# NOTE(review): presumably SPL_S3_LOCAL_DIR is the local sync destination for
# SPL_S3_BUCKET -- confirm against the task that uses them.
SPL_S3_BUCKET = 's3://openfda-data-spl/data/'
SPL_S3_LOCAL_DIR = config.data_dir('spl/s3_sync')
SPL_S3_CHANGE_LOG = join(SPL_S3_LOCAL_DIR, 'change_log/SPLDocuments.csv')
SPL_BATCH_DIR = join(META_DIR, 'batch')
SPL_PROCESS_DIR = config.data_dir('spl/batches')

# Issue 'mkdir -p' for the remaining working paths when the module is loaded.
common.shell_cmd('mkdir -p %s', SPL_S3_LOCAL_DIR)
common.shell_cmd('mkdir -p %s', SPL_PROCESS_DIR)
def output(self):
    '''Return the local target at BATCH under 'device_udi/extracted'.'''
    extracted_root = config.data_dir('device_udi/extracted')
    batch_path = os.path.join(extracted_root, BATCH)
    return luigi.LocalTarget(batch_path)
def output(self):
    '''Return the local target for the '510k/raw' data path.'''
    raw_path = config.data_dir('510k/raw')
    return luigi.LocalTarget(raw_path)
def output(self):
    '''Return the local target for config.data_dir('510k', 'json.db').'''
    # Both components are passed to config.data_dir as separate arguments.
    json_db_path = config.data_dir('510k', 'json.db')
    return luigi.LocalTarget(json_db_path)
def output(self):
    '''Return the local target for the 'registration/annotate.db' data path.'''
    annotate_path = config.data_dir('registration/annotate.db')
    return luigi.LocalTarget(annotate_path)
from os.path import basename, dirname, join
import re
import subprocess
import urllib2
import urlparse

import arrow
import simplejson as json

from openfda.tasks import DependencyTriggeredTask
from openfda import common, config, parallel, spl
from openfda.annotation_table import unii_harmonization
from openfda.spl import process_barcodes, extract

# Parent of the directory that contains this file.
RUN_DIR = dirname(dirname(os.path.abspath(__file__)))
data_dir = config.data_dir('harmonization')
# Batch key: the current UTC time rounded up to the end of its week by
# arrow's ceil('week'), formatted as YYYYMMDD. Evaluated once, at import.
BATCH_DATE = arrow.utcnow().ceil('week').format('YYYYMMDD')
BASE_DIR = config.data_dir('harmonization/batches/%s' % BATCH_DATE)
SPL_S3_DIR = config.data_dir('spl/s3_sync')
TMP_DIR = config.tmp_dir()

# Issue 'mkdir -p' for each working path when the module is loaded.
common.shell_cmd('mkdir -p %s', data_dir)
common.shell_cmd('mkdir -p %s', BASE_DIR)
common.shell_cmd('mkdir -p %s', TMP_DIR)

SPL_SET_ID_INDEX = join(BASE_DIR, 'spl_index.db')

# Download URLs are built by appending a file name to DAILYMED_PREFIX.
DAILYMED_PREFIX = 'ftp://public.nlm.nih.gov/nlmdata/.dailymed/'
PHARM_CLASS_DOWNLOAD = \
  DAILYMED_PREFIX + 'pharmacologic_class_indexing_spl_files.zip'
import sys
import time

import arrow
import elasticsearch
import luigi

from openfda import config, common, elasticsearch_requests, index_util, parallel
from openfda.annotation_table.pipeline import CombineHarmonization
from openfda.tasks import AlwaysRunTask
from openfda.spl import annotate
from openfda.parallel import IdentityReducer

# Parent of the directory that contains this file.
RUN_DIR = dirname(dirname(os.path.abspath(__file__)))
META_DIR = config.data_dir("spl/meta")
# Ensure meta directory is available for task tracking
common.shell_cmd("mkdir -p %s", META_DIR)

# Files addressed relative to RUN_DIR.
SPL_JS = join(RUN_DIR, "spl/spl_to_json.js")
LOINC = join(RUN_DIR, "spl/data/sections.csv")

# NOTE(review): presumably SPL_S3_LOCAL_DIR is the local sync destination for
# SPL_S3_BUCKET -- confirm against the task that uses them.
SPL_S3_BUCKET = "s3://openfda-data-spl/data/"
SPL_S3_LOCAL_DIR = config.data_dir("spl/s3_sync")
SPL_S3_CHANGE_LOG = join(SPL_S3_LOCAL_DIR, "change_log/SPLDocuments.csv")
SPL_BATCH_DIR = join(META_DIR, "batch")
SPL_PROCESS_DIR = config.data_dir("spl/batches")

# Issue 'mkdir -p' for the remaining working paths when the module is loaded.
common.shell_cmd("mkdir -p %s", SPL_S3_LOCAL_DIR)
common.shell_cmd("mkdir -p %s", SPL_PROCESS_DIR)
import arrow import elasticsearch import luigi import requests import simplejson as json from openfda import common, config, index_util, parallel from openfda import device_common, download_util from openfda.device_harmonization.pipeline import (Harmonized2OpenFDA, DeviceAnnotateMapper) from openfda.tasks import AlwaysRunTask RUN_DIR = dirname(dirname(os.path.abspath(__file__))) # A directory for holding files that track Task state META_DIR = config.data_dir('classification/meta') common.shell_cmd('mkdir -p %s', META_DIR) DEVICE_CLASS_ZIP = ('http://www.accessdata.fda.gov/premarket/' 'ftparea/foiclass.zip') class DownloadFoiClass(luigi.Task): def requires(self): return [] def output(self): return luigi.LocalTarget(config.data_dir('classification/raw')) def run(self):
def output(self):
    '''Return the local target for this batch's path under 'res/batches'.

    The last path component is self.batch formatted as YYYYMMDD.
    '''
    batch_stamp = self.batch.strftime('%Y%m%d')
    batch_path = config.data_dir('res/batches/%s' % batch_stamp)
    return luigi.LocalTarget(batch_path)
def output(self):
    '''Return the local target for this batch's enforcement.csv.

    The file sits under 'res/batches/<YYYYMMDD>/', where the date part is
    self.batch formatted with strftime.
    '''
    batch_stamp = self.batch.strftime('%Y%m%d')
    csv_path = config.data_dir('res/batches/%s/enforcement.csv' % batch_stamp)
    return luigi.LocalTarget(csv_path)
from os.path import dirname, join import sys import arrow import elasticsearch import luigi import requests import simplejson as json from openfda import common, config, elasticsearch_requests, index_util, parallel from openfda.device_harmonization.pipeline import (Harmonized2OpenFDA, DeviceAnnotateMapper) from openfda.tasks import AlwaysRunTask RUN_DIR = dirname(dirname(os.path.abspath(__file__))) META_DIR = config.data_dir('device_recall/meta') common.shell_cmd('mkdir -p %s', META_DIR) DEVICE_RECALL_BUCKET = 's3://openfda-device-recalls/' DEVICE_RECALL_LOCAL_DIR = config.data_dir('device_recall/s3_sync') class SyncS3DeviceRecall(AlwaysRunTask): bucket = DEVICE_RECALL_BUCKET local_dir = DEVICE_RECALL_LOCAL_DIR def _run(self): common.cmd(['mkdir', '-p', self.local_dir]) common.cmd(['aws', '--profile=' + config.aws_profile(), 's3', 'sync',
''' import collections import logging import os from os.path import basename, dirname, join import sys import luigi import simplejson as json from openfda import common, config, parallel from openfda.parallel import mapreduce, Collection, Mapper, PivotReducer RUN_DIR = dirname(dirname(os.path.abspath(__file__))) DATA_DIR = config.data_dir('device_harmonization') common.shell_cmd('mkdir -p %s', DATA_DIR) PLUCK_MAP = { 'device_pma': [ 'applicant', 'trade_name', 'generic_name', 'decision_date', 'advisory_committee', 'pma_number', 'product_code', 'advisory_committee', ], '510k': [ 'applicant',
def output(self):
    '''Return the local target for the 'device_recall/annotate.db' data path.'''
    annotate_path = config.data_dir('device_recall/annotate.db')
    return luigi.LocalTarget(annotate_path)
import glob import logging import os import re from urllib.parse import urljoin from urllib.request import urlopen from os.path import join, dirname import luigi from bs4 import BeautifulSoup from openfda import common, config, index_util, parallel from openfda.common import convert_unicode, newest_file_timestamp RUN_DIR = dirname(dirname(os.path.abspath(__file__))) BASE_DIR = config.data_dir('caers') DOWNLOAD_DIR = config.data_dir('caers/raw') common.shell_cmd('mkdir -p %s', BASE_DIR) CAERS_DOWNLOAD_PAGE_URL = 'https://www.fda.gov/food/compliance-enforcement-food/cfsan-adverse-event-reporting-system-caers' RENAME_MAP = { 'report id': 'report_number', 'caers created date': 'date_created', 'date of event': 'date_started', 'product type': 'role', 'product': 'name_brand', 'product code': 'industry_code', 'description': 'industry_name', 'patient age': 'age', 'age units': 'age_unit',
import arrow import elasticsearch import luigi import requests import simplejson as json from openfda import common, config, elasticsearch_requests, index_util, parallel from openfda import download_util from openfda.device_clearance import transform from openfda.device_harmonization.pipeline import (Harmonized2OpenFDA, DeviceAnnotateMapper) from openfda.tasks import AlwaysRunTask RUN_DIR = dirname(dirname(os.path.abspath(__file__))) # A directory for holding files that track Task state META_DIR = config.data_dir('510k/meta') common.shell_cmd('mkdir -p %s', META_DIR) CLEARED_DEVICE_URL = 'http://www.accessdata.fda.gov/premarket/ftparea/' CLEARED_DEV_ZIPS = [CLEARED_DEVICE_URL + 'pmn96cur.zip', CLEARED_DEVICE_URL + 'pmn9195.zip', CLEARED_DEVICE_URL + 'pmn8690.zip', CLEARED_DEVICE_URL + 'pmn8185.zip', CLEARED_DEVICE_URL + 'pmn7680.zip'] class Download_510K(luigi.Task): def requires(self): return []
import arrow import elasticsearch import luigi import requests import simplejson as json from openfda import common, config, elasticsearch_requests, index_util, parallel from openfda import download_util from openfda.device_clearance import transform from openfda.device_harmonization.pipeline import (Harmonized2OpenFDA, DeviceAnnotateMapper) from openfda.tasks import AlwaysRunTask RUN_DIR = dirname(dirname(os.path.abspath(__file__))) # A directory for holding files that track Task state META_DIR = config.data_dir('510k/meta') common.shell_cmd('mkdir -p %s', META_DIR) CLEARED_DEVICE_URL = 'http://www.accessdata.fda.gov/premarket/ftparea/' CLEARED_DEV_ZIPS = [ CLEARED_DEVICE_URL + 'pmn96cur.zip', CLEARED_DEVICE_URL + 'pmn9195.zip', CLEARED_DEVICE_URL + 'pmn8690.zip', CLEARED_DEVICE_URL + 'pmn8185.zip', CLEARED_DEVICE_URL + 'pmn7680.zip' ] class Download_510K(luigi.Task): def requires(self): return [] def output(self):
def output(self):
    '''Return the local target for the '510k/extracted' data path.'''
    extracted_path = config.data_dir('510k/extracted')
    return luigi.LocalTarget(extracted_path)
import glob import logging import os import sys import traceback import arrow import luigi import xmltodict from openfda import common, config, index_util, parallel from openfda.device_harmonization.pipeline import (Harmonized2OpenFDA, DeviceAnnotateMapper) DEVICE_UDI_BUCKET = 's3://cdrh-data' DEVICE_UDI_LOCAL_DIR = config.data_dir('device_udi/s3_sync') BATCH = arrow.utcnow().floor('day').format('YYYYMMDD') AWS_CLI = 'aws' class SyncS3DeviceUDI(luigi.Task): bucket = DEVICE_UDI_BUCKET local_dir = DEVICE_UDI_LOCAL_DIR aws = AWS_CLI def flag_file(self): return os.path.join(self.local_dir, '.last_sync_time') def complete(self): # Only run S3 sync once per day. if config.disable_downloads():
def output(self):
    '''Return the local target for the NSDE_EXTRACT_DB data path.'''
    extract_db_path = config.data_dir(NSDE_EXTRACT_DB)
    return luigi.LocalTarget(extract_db_path)
import json import os import re from os.path import join, dirname import arrow import luigi import pandas as pd from openfda import common, config, index_util, parallel from openfda.tasks import AlwaysRunTask RUN_DIR = dirname(os.path.abspath(__file__)) S3_ACCESS_LOGS_BUCKET = 's3://openfda-logs/download/' S3_ACCESS_LOGS_DIR = config.data_dir('downloadstats/s3_logs_raw') S3_STATS_DB_DIR = config.data_dir('downloadstats/s3_stats.db') S3_ACCESS_LOGS_CUTOFF = arrow.get('2017-03-01') CF_ACCESS_LOGS_BUCKET = 's3://openfda-splash-logs/download-cf-logs/' CF_ACCESS_LOGS_DIR = config.data_dir('downloadstats/cf_logs_raw') CF_STATS_DB_DIR = config.data_dir('downloadstats/cf_stats.db') TOTAL_STATS_DB_DIR = config.data_dir('downloadstats/total_stats.db') ENDPOINT_INDEX_MAP = { 'animalandveterinary/event': 'animalandveterinarydrugevent', 'drug/event': 'drugevent', 'drug/label': 'druglabel', 'drug/enforcement': 'drugenforcement', 'drug/ndc': 'ndc',
import arrow import elasticsearch import luigi import requests import simplejson as json from openfda import common, config, index_util, parallel from openfda import device_common, download_util from openfda.device_harmonization.pipeline import (Harmonized2OpenFDA, DeviceAnnotateMapper) from openfda.tasks import AlwaysRunTask RUN_DIR = dirname(dirname(os.path.abspath(__file__))) # A directory for holding files that track Task state META_DIR = config.data_dir('classification/meta') common.shell_cmd('mkdir -p %s', META_DIR) DEVICE_CLASS_ZIP = ('https://www.accessdata.fda.gov/premarket/' 'ftparea/foiclass.zip') class DownloadFoiClass(luigi.Task): def requires(self): return [] def output(self): return luigi.LocalTarget(config.data_dir('classification/raw')) def run(self): output_filename = join(self.output().path,
def output(self):
    '''Return the local target for the 'classification/annotate.db' data path.'''
    annotate_path = config.data_dir('classification/annotate.db')
    return luigi.LocalTarget(annotate_path)
import time import urllib2 import arrow import elasticsearch import luigi from openfda import parallel, config, index_util, elasticsearch_requests from openfda.annotation_table.pipeline import CombineHarmonization from openfda.faers import annotate from openfda.faers import xml_to_json from openfda.tasks import AlwaysRunTask, DependencyTriggeredTask # this should be a symlink to wherever the real data directory is RUN_DIR = dirname(dirname(os.path.abspath(__file__))) BASE_DIR = config.data_dir() FAERS_HISTORIC = ( 'http://www.fda.gov/Drugs/GuidanceCompliance' 'RegulatoryInformation/Surveillance/AdverseDrugEffects/ucm083765.htm') FAERS_CURRENT = ( 'http://www.fda.gov/Drugs/GuidanceCompliance' 'RegulatoryInformation/Surveillance/AdverseDrugEffects/ucm082193.htm') MAX_RECORDS_PER_FILE = luigi.IntParameter(-1, is_global=True) class DownloadDataset(AlwaysRunTask): ''' This task downloads all datasets that have not yet been fetched. ''' def _fetch(self):
''' Device pipeline for downloading, transforming to JSON and loading COVID-19 Serological Testing Validation Project data into Elasticsearch. ''' import glob from os.path import dirname import luigi import arrow from openfda import common, config, index_util, parallel from openfda.tasks import AlwaysRunTask from openfda.common import first_file_timestamp SEROLOGY_TEST_BUCKET = 's3://openfda-covid19serology/' SEROLOGY_TEST_SYNC_DIR = config.data_dir('covid19serology/s3_sync') SEROLOGY_TEST_JSON_DB_DIR = config.data_dir('covid19serology/json.db') class SyncS3SerologyTest(AlwaysRunTask): def _run(self): common.cmd(['mkdir', '-p', SEROLOGY_TEST_SYNC_DIR]) common.cmd(['aws', '--profile=' + config.aws_profile(), 's3', 'sync', SEROLOGY_TEST_BUCKET, SEROLOGY_TEST_SYNC_DIR]) def output(self):
import csv import re import logging import os from os.path import dirname, join import arrow import datetime import luigi from openfda import common, config, index_util, parallel RUN_DIR = dirname(dirname(os.path.abspath(__file__))) BASE_DIR = config.data_dir('caers') common.shell_cmd('mkdir -p %s', BASE_DIR) S3_BUCKET = 's3://openfda-data-caers/' S3_LOCAL_DIR = config.data_dir('caers/s3_sync') # TODO(hansnelsen): initiate and resolve naming convention for this file and # s3 bucket. Currently, the file is downloaded from # s3://openfda-lonnie/caers/ (the naming of this file is # not consistent). The pipeline engineer downloads it, renames # it and then uploaded manually to the above bucket. CAERS_FILE = 'caers.csv' logging.info(S3_LOCAL_DIR, 'dir') common.shell_cmd('mkdir -p %s', S3_LOCAL_DIR) RENAME_MAP = { 'Report #': 'report_number', 'Created Date': 'date_created', 'Event Start Date': 'date_started',
import glob import os from os.path import dirname, join import luigi from openfda import common, config, download_util, index_util, parallel from openfda.common import first_file_timestamp from openfda.device_harmonization.pipeline import (Harmonized2OpenFDA, DeviceAnnotateMapper) from openfda.device_pma import transform RUN_DIR = dirname(dirname(os.path.abspath(__file__))) # A directory for holding files that track Task state META_DIR = config.data_dir('device_pma/meta') RAW_DIR = config.data_dir('device_pma/raw') common.shell_cmd('mkdir -p %s', META_DIR) DEVICE_PMA_ZIP = 'https://www.accessdata.fda.gov/premarket/ftparea/pma.zip' class DownloadPMA(luigi.Task): def requires(self): return [] def output(self): return luigi.LocalTarget(RAW_DIR) def run(self): output_filename = join(self.output().path,
import os import re import sys import traceback import arrow import luigi from lxml import etree from openfda import common, config, index_util, parallel from openfda.adae import annotate from openfda.annotation_table.pipeline import CombineHarmonization from openfda.common import newest_file_timestamp ADAE_BUCKET = 's3://openfda-data-adae' ADAE_LOCAL_DIR = config.data_dir('adae/s3_sync') BATCH = arrow.utcnow().floor('day').format('YYYYMMDD') AWS_CLI = 'aws' # TODO: move to an external file once the list grows unmanageable. NULLIFIED = [ 'US-FDACVM-2018-US-045311.xml', 'US-FDACVM-2018-US-048571.xml', 'US-FDACVM-2018-US-046672.xml', 'US-FDACVM-2017-US-042492.xml', 'US-FDACVM-2018-US-044065.xml', 'US-FDACVM-2017-US-070108.xml', 'US-FDACVM-2017-US-002864.xml', 'US-FDACVM-2017-US-002866.xml', 'US-FDACVM-2017-US-052458.xml', 'US-FDACVM-2017-US-055193.xml', 'US-FDACVM-2017-US-043931.xml', 'US-FDACVM-2018-US-002321.xml', 'US-FDACVM-2018-US-063536.xml', 'US-FDACVM-2015-US-221044.xml', 'US-FDACVM2019-US-016263.xml', 'US-FDACVM-2016-US-062923.xml', 'US-FDACVM-2017-US-001483.xml', 'US-FDACVM-2017-US-009155.xml', 'US-FDACVM-2017-US-028125.xml', 'US-FDACVM-2017-US-033030.xml',
def output(self):
    '''Return the local target for the 'device_pma/json.db' data path.'''
    json_db_path = config.data_dir('device_pma/json.db')
    return luigi.LocalTarget(json_db_path)
def output(self):
    '''Return the local target at BATCH under 'adae/json.db'.'''
    json_db_root = config.data_dir('adae/json.db')
    batch_path = os.path.join(json_db_root, BATCH)
    return luigi.LocalTarget(batch_path)
def output(self):
    '''Return the local target for this batch's path under 'res/batches'.

    The last path component is self.batch formatted as YYYYMMDD.
    '''
    batch_stamp = self.batch.strftime('%Y%m%d')
    batch_path = config.data_dir('res/batches/%s' % batch_stamp)
    return luigi.LocalTarget(batch_path)
from os.path import dirname, join import sys import arrow import elasticsearch import luigi import requests import simplejson as json from openfda import common, config, elasticsearch_requests, index_util, parallel from openfda.device_harmonization.pipeline import (Harmonized2OpenFDA, DeviceAnnotateMapper) from openfda.tasks import AlwaysRunTask RUN_DIR = dirname(dirname(os.path.abspath(__file__))) META_DIR = config.data_dir('device_recall/meta') common.shell_cmd('mkdir -p %s', META_DIR) DEVICE_RECALL_BUCKET = 's3://openfda-device-recalls/' DEVICE_RECALL_LOCAL_DIR = config.data_dir('device_recall/s3_sync') class SyncS3DeviceRecall(AlwaysRunTask): bucket = DEVICE_RECALL_BUCKET local_dir = DEVICE_RECALL_LOCAL_DIR def _run(self): common.cmd(['mkdir', '-p', self.local_dir]) common.cmd([ 'aws', '--profile=' + config.aws_profile(), 's3', 'sync', self.bucket, self.local_dir
import arrow
import elasticsearch
import luigi
import requests
import simplejson as json

from openfda import common, config, download_util, elasticsearch_requests, index_util, parallel
from openfda.device_harmonization.pipeline import (Harmonized2OpenFDA,
                                                   DeviceAnnotateMapper)
from openfda.device_pma import transform
from openfda.tasks import AlwaysRunTask

# Parent of the directory that contains this file.
RUN_DIR = dirname(dirname(os.path.abspath(__file__)))
# A directory for holding files that track Task state
META_DIR = config.data_dir('device_pma/meta')
common.shell_cmd('mkdir -p %s', META_DIR)

DEVICE_PMA_ZIP = 'https://www.accessdata.fda.gov/premarket/ftparea/pma.zip'


class DownloadPMA(luigi.Task):
  '''Task that downloads DEVICE_PMA_ZIP into the 'device_pma/raw' data path.'''

  def requires(self):
    '''Return an empty list: this task declares no upstream tasks.'''
    return []

  def output(self):
    '''Return the local target for the 'device_pma/raw' data path.'''
    return luigi.LocalTarget(config.data_dir('device_pma/raw'))

  def run(self):
    '''Download DEVICE_PMA_ZIP to a file inside the output path.'''
    # The file keeps the URL's last path component as its name.
    output_filename = join(self.output().path, DEVICE_PMA_ZIP.split('/')[-1])
    common.download(DEVICE_PMA_ZIP, output_filename)
import urllib2 import arrow import elasticsearch import luigi from openfda import parallel, config, index_util, elasticsearch_requests from openfda.annotation_table.pipeline import CombineHarmonization from openfda.faers import annotate from openfda.faers import xml_to_json from openfda.tasks import AlwaysRunTask, DependencyTriggeredTask # this should be a symlink to wherever the real data directory is RUN_DIR = dirname(dirname(os.path.abspath(__file__))) BASE_DIR = config.data_dir() FAERS_HISTORIC = ('http://www.fda.gov/Drugs/GuidanceCompliance' 'RegulatoryInformation/Surveillance/AdverseDrugEffects/ucm083765.htm') FAERS_CURRENT = ('http://www.fda.gov/Drugs/GuidanceCompliance' 'RegulatoryInformation/Surveillance/AdverseDrugEffects/ucm082193.htm') MAX_RECORDS_PER_FILE = luigi.IntParameter(-1, is_global=True) class DownloadDataset(AlwaysRunTask): ''' This task downloads all datasets that have not yet been fetched. ''' def _fetch(self): for page in [self._faers_current.find_all(href=re.compile('.*.zip')), self._faers_historic.find_all(href=re.compile('.*.zip'))]: for a in page: