Example #1
 def run(self):
   common.shell_cmd('mkdir -p %s', self.local_dir)
   soup = BeautifulSoup(urlopen(CAERS_DOWNLOAD_PAGE_URL).read(), 'lxml')
   # Normalize whitespace in the link text before matching, since the label
   # may contain newlines or extra spaces.
   for a in soup.find_all(title=re.compile('CAERS ASCII.*')):
     if 'Download CAERS ASCII' in re.sub(r'\s', ' ', a.text):
       fileURL = urljoin('https://www.fda.gov', a['href'])
       common.download(fileURL, join(self.output().path, a.attrs['title'] + '.csv'))
Example #2
    def run(self):
        logging.basicConfig(level=logging.INFO)
        output_dir = TOBACCO_RAW_DIR
        os.system('mkdir -p %s' % output_dir)

        # Download all csv source files (current year and archived years).
        soup = BeautifulSoup(
            urlopen(TOBACCO_PROBLEM_DOWNLOAD_PAGE).read(), 'lxml')
        for a in soup.find_all(title=re.compile(r'\d{4}.*tppr', re.IGNORECASE)):
            file_name = a['title'] if '.csv' in a['title'] else a['title'] + '.csv'
            common.download(urljoin('https://www.fda.gov', a['href']),
                            join(output_dir, file_name))

        # Combine the CSV files into a single newline-delimited JSON file.
        all_csv_files = glob.glob(join(output_dir, '*.csv'))
        logging.info("Reading csv files: %s", all_csv_files)
        os.system('mkdir -p %s' % dirname(self.output().path))
        df = pd.concat(
            pd.read_csv(f, encoding='cp1252', skiprows=3)
            for f in all_csv_files)
        # Write one JSON object per line (newline-delimited JSON).
        with open(self.output().path, "w") as f:
            for row in df.iterrows():
                row[1].to_json(f)
                f.write("\n")
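Note: recent pandas versions can produce the same newline-delimited output in one call, avoiding the manual row loop. A minimal equivalent, assuming the same df and output path as above:

    df.to_json(self.output().path, orient='records', lines=True)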
Example #3
  def run(self):
    fileURL = None
    soup = BeautifulSoup(urlopen(SUBSTANCE_DATA_DOWNLOAD_PAGE_URL).read(), 'lxml')
    for a in soup.find_all(href=re.compile(r'.*\.gsrs')):
      if 'Full Public Data Dump' in a.text:
        fileURL = urljoin(GINAS_ROOT_URL, a['href'])

    common.download(fileURL, self.output().path)
Example #4
 def run(self):
   common.shell_cmd('mkdir -p %s', self.local_dir)
   soup = BeautifulSoup(urllib2.urlopen(DAILY_MED_DOWNLOADS_PAGE).read(), 'lxml')
   for a in soup.find_all(href=re.compile(r'.*\.zip')):
     if '_human_' in a.text:
       try:
         common.download(a['href'], join(self.local_dir, a['href'].split('/')[-1]))
       except ProcessException as e:
         logging.error("Could not download a DailyMed SPL archive: {0}: {1}".format(a['href'], e))
Example #5
 def run(self):
     soup = BeautifulSoup(
         urllib2.urlopen(CLEARED_DEVICE_URL).read(), 'lxml')
     for a in soup.find_all(href=re.compile(r'.*\.zip')):
         if a.text.startswith('PMN') and a.text != 'PMNLSTMN.ZIP':
             fileURL = a['href']
             common.download(
                 fileURL, join(self.output().path,
                               a['href'].split('/')[-1]))
Example #6
 def run(self):
   zip_urls = []
   soup = BeautifulSoup(urlopen(DEVICE_REG_PAGE).read(), 'lxml')
   for a in soup.find_all(href=re.compile(r'.*\.zip')):
     zip_urls.append(a['href'])
   if not zip_urls:
     logging.info('No Registration Zip Files Found At %s' % DEVICE_REG_PAGE)
   for zip_url in zip_urls:
     filename = zip_url.split('/')[-1]
     common.download(zip_url, join(self.output().path, filename))
Example #7
 def _run(self):
   zip_urls = []
   soup = BeautifulSoup(urllib2.urlopen(DEVICE_DOWNLOAD_PAGE).read(), 'lxml')
   for a in soup.find_all(href=re.compile(r'.*\.zip')):
     zip_urls.append(a['href'])
   if not zip_urls:
     logging.fatal('No MAUDE Zip Files Found At %s' % DEVICE_DOWNLOAD_PAGE)
   for zip_url in zip_urls:
     filename = zip_url.split('/')[-1]
     common.download(zip_url, join(self.output().path, filename))
Example #8
 def run(self):
     zip_urls = []
     soup = BeautifulSoup(urllib2.urlopen(DEVICE_REG_PAGE).read(), "lxml")
     for a in soup.find_all(href=re.compile(r".*\.zip")):
         zip_urls.append(a["href"])
     if not zip_urls:
         logging.info("No Registration Zip Files Found At %s" % DEVICE_REG_PAGE)
     for zip_url in zip_urls:
         filename = zip_url.split("/")[-1]
         common.download(zip_url, join(self.output().path, filename))
Example #9
    def run(self):
        zip_url = None
        soup = BeautifulSoup(urllib2.urlopen(NDC_DOWNLOAD_PAGE).read(), 'lxml')
        for a in soup.find_all(href=re.compile(r'.*\.zip')):
            if 'NDC Database File' in a.text:
                zip_url = urlparse.urljoin('http://www.fda.gov', a['href'])
                break

        if not zip_url:
            logging.fatal('NDC database file not found!')

        common.download(zip_url, self.output().path)
Example #10
  def run(self):
    zip_url = None
    soup = BeautifulSoup(urllib2.urlopen(NDC_DOWNLOAD_PAGE).read(), 'lxml')
    for a in soup.find_all(href=re.compile(r'.*\.zip')):
      if 'NDC Database File' in a.text:
        zip_url = urlparse.urljoin('http://www.fda.gov', a['href'])
        break

    if not zip_url:
      logging.fatal('NDC database file not found!')

    common.download(zip_url, self.output().path)
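Note that logging.fatal is an alias for logging.critical and does not abort the task, so in the two examples above common.download(None, ...) still runs when no matching link is found. A minimal fail-fast variant, using the same constants as above (the raise is an illustrative addition, not part of the original pipeline):

    if not zip_url:
        logging.fatal('NDC database file not found!')
        raise RuntimeError('No NDC zip link found at %s' % NDC_DOWNLOAD_PAGE)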
Example #11
 def run(self):
   # TODO(hansnelsen): copied from the FAERS pipeline, consider refactoring
   #                   into a generalized approach
   zip_urls = []
   soup = BeautifulSoup(urllib2.urlopen(DEVICE_DOWNLOAD_PAGE).read(), 'lxml')
   for a in soup.find_all(href=re.compile(r'.*\.zip')):
     zip_urls.append(a['href'])
   if not zip_urls:
     logging.fatal('No MAUDE Zip Files Found At %s' % DEVICE_DOWNLOAD_PAGE)
   for zip_url in zip_urls:
     filename = zip_url.split('/')[-1]
     common.download(zip_url, join(self.output().path, filename))
Example #12
 def run(self):
     # TODO(hansnelsen): copied from the FAERS pipeline, consider refactoring
     #                   into a generalized approach
     zip_urls = []
     soup = BeautifulSoup(urllib2.urlopen(DEVICE_DOWNLOAD_PAGE).read(), 'lxml')
     for a in soup.find_all(href=re.compile(r'.*\.zip')):
         zip_urls.append(a['href'])
     if not zip_urls:
         logging.fatal('No MAUDE Zip Files Found At %s' %
                       DEVICE_DOWNLOAD_PAGE)
     for zip_url in zip_urls:
         filename = zip_url.split('/')[-1]
         common.download(zip_url, join(self.output().path, filename))
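The TODO in the last two examples points at the obvious refactor: every MAUDE and registration snippet here scrapes a page for .zip links and downloads each one. A minimal generalized sketch using the same imports and common.download helper as the snippets above (download_zips is a hypothetical name, not part of the pipeline):

    def download_zips(page_url, output_dir):
      # Collect every zip link on the page, then fetch each into output_dir.
      soup = BeautifulSoup(urllib2.urlopen(page_url).read(), 'lxml')
      zip_urls = [a['href'] for a in soup.find_all(href=re.compile(r'.*\.zip'))]
      if not zip_urls:
        logging.fatal('No Zip Files Found At %s' % page_url)
      for zip_url in zip_urls:
        common.download(zip_url, join(output_dir, zip_url.split('/')[-1]))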
Example #13
    def _download_with_retry(self, url, target_name):
        if os.path.exists(target_name):
            return

        # Retry up to 10 times; a download only counts as successful once the
        # zip passes an integrity check.
        for _ in range(10):
            try:
                logging.info('Downloading: ' + url)
                common.download(url, target_name)
                subprocess.check_call('unzip -t %s' % target_name, shell=True)
                return
            except Exception:
                logging.info(
                    'Problem while unzipping [download URL: %s, zip file: %s], retrying...',
                    url, target_name)
        logging.fatal(
            'Zip File: %s from URL :%s is not valid, stop all processing',
            target_name, url)
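The unzip -t check is what makes the retry loop meaningful: it catches truncated or corrupt archives that a bare download would miss. A typical call site, assuming a Luigi-style task like the ones above (the zip_url variable is illustrative):

    self._download_with_retry(zip_url, join(self.output().path, zip_url.split('/')[-1]))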
Example #14
  def run(self):
    finished_ndc_url = None
    unfinished_ndc_url = None

    soup = BeautifulSoup(urlopen(NDC_DOWNLOAD_PAGE).read(), 'lxml')
    for a in soup.find_all(href=re.compile(r'.*\.zip')):
      if 'NDC Database File' in a.text and 'text' in a['href']:
        finished_ndc_url = urljoin('https://www.fda.gov', a['href'])
      if 'NDC Unfinished' in a.text and 'unfinished.zip' in a['href']:
        unfinished_ndc_url = urljoin('https://www.fda.gov', a['href'])

    if not finished_ndc_url:
      logging.fatal('NDC finished database file not found!')
    if not unfinished_ndc_url:
      logging.fatal('NDC unfinished drugs database file not found!')

    common.download(finished_ndc_url, join(RAW_DIR, 'finished.zip'))
    common.download(unfinished_ndc_url, join(RAW_DIR, 'unfinished.zip'))
Example #15
  def run(self):
    logging.basicConfig(level=logging.INFO)

    zip_filename = config.data_dir('nsde/raw/nsde.zip')
    output_dir = config.data_dir('nsde/raw')
    os.system('mkdir -p %s' % output_dir)
    common.download(NSDE_DOWNLOAD, zip_filename)
    os.system('unzip -o %(zip_filename)s -d %(output_dir)s' % locals())

    csv_file = join(output_dir, self.csv_file_name)
    logging.info("Reading csv file: %s", (csv_file))
    os.system('mkdir -p %s' % dirname(self.output().path))
    df = pd.read_csv(csv_file, encoding='utf-8-sig')
    # Write one JSON object per line (newline-delimited JSON), as in Example #2.
    with open(self.output().path, "w") as f:
      for row in df.iterrows():
        row[1].to_json(f)
        f.write("\n")
Example #16
 def run(self):
   common.download(self.url, os.path.join(self.local_dir, 'registration_listing.txt'))
Example #17
 def run(self):
   output_filename = join(self.output().path, DEVICE_PMA_ZIP.split('/')[-1])
   common.download(DEVICE_PMA_ZIP, output_filename)
Example #18
 def run(self):
   for url in SPL_DOWNLOADS:
     filename = join(self.output().path, url.split('/')[-1])
     common.download(url, filename)
Example #19
  def run(self):
    output_dir = self.output().path

    for zip_url in CLEARED_DEV_ZIPS:
      output_filename = join(output_dir, zip_url.split('/')[-1])
      common.download(zip_url, output_filename)
Example #20
 def run(self):
     output_dir = dirname(self.output().path)
     zip_filename = join(output_dir, 'nsde.zip')
     common.download(NSDE_DOWNLOAD, zip_filename)
     os.system('unzip -o %(zip_filename)s -d %(output_dir)s' % locals())
     os.rename(glob.glob(join(output_dir, '*.csv'))[0], self.output().path)
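Shelling out to unzip via % locals() works here, but it breaks on paths containing spaces and is shell-injection-prone; the standard-library zipfile module performs the same extraction portably. A minimal equivalent, assuming the same zip_filename and output_dir as above:

    import zipfile
    with zipfile.ZipFile(zip_filename) as zf:
        zf.extractall(output_dir)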
Example #21
 def run(self):
     output_filename = join(self.output().path,
                            DEVICE_PMA_ZIP.split('/')[-1])
     common.download(DEVICE_PMA_ZIP, output_filename)
Example #22
 def run(self):
   common.download(PHARM_CLASS_DOWNLOAD, self.output().path)
Example #23
 def run(self):
   common.download(RXNORM_DOWNLOAD, self.output().path)
Example #24
    def run(self):
        output_dir = self.output().path

        for zip_url in CLEARED_DEV_ZIPS:
            output_filename = join(output_dir, zip_url.split('/')[-1])
            common.download(zip_url, output_filename)
Example #25
 def run(self):
     common.download(RXNORM_DOWNLOAD, self.output().path)
Example #26
 def run(self):
     common.download(PHARM_CLASS_DOWNLOAD, self.output().path)
Example #27
 def run(self):
     common.download(DOWNLOAD_FILE, RAW_DATA_FILE)