def extract_report(report):
    report_path = path_for(report, report['file_type'])
    real_report_path = os.path.abspath(
        os.path.expandvars(os.path.join(utils.data_dir(), report_path)))

    text_path = "%s.txt" % os.path.splitext(report_path)[0]
    real_text_path = os.path.abspath(
        os.path.expandvars(os.path.join(utils.data_dir(), text_path)))

    if os.path.exists(real_text_path):
        # This report has already had its text extracted
        return text_path

    file_type_lower = report['file_type'].lower()
    if file_type_lower == "pdf":
        utils.text_from_pdf(real_report_path, real_text_path)
        return text_path
    elif file_type_lower == "doc":
        utils.text_from_doc(real_report_path, real_text_path)
        return text_path
    elif file_type_lower in FILE_EXTENSIONS_HTML:
        utils.text_from_html(real_report_path, real_text_path)
        return text_path
    else:
        logging.warning("Unknown file type, don't know how to extract text!")
        return None
def extract_report(report):
  report_path = path_for(report, report['file_type'])
  real_report_path = os.path.abspath(os.path.expandvars(os.path.join(utils.data_dir(), report_path)))

  text_path = "%s.txt" % os.path.splitext(report_path)[0]
  real_text_path = os.path.abspath(os.path.expandvars(os.path.join(utils.data_dir(), text_path)))

  if os.path.exists(real_text_path):
    # This report has already had its text extracted
    return text_path

  file_type_lower = report['file_type'].lower()
  if file_type_lower == "pdf":
    if utils.check_pdf_decryption(real_report_path):
      # Encrypted PDF: decrypt to a sibling ".decrypted.pdf" file first
      # (reusing any previous decrypted copy), then extract from that copy.
      real_decrypted_path = real_report_path[:-4] + ".decrypted.pdf"
      if os.path.isfile(real_decrypted_path) or utils.decrypt_pdf(real_report_path, real_decrypted_path):
        utils.text_from_pdf(real_decrypted_path, real_text_path)
        return text_path
      # Decryption failed, so no text file was written
      return None
    else:
      utils.text_from_pdf(real_report_path, real_text_path)
      return text_path
  elif file_type_lower == "doc":
    utils.text_from_doc(real_report_path, real_text_path)
    return text_path
  elif file_type_lower == "docx":
    utils.text_from_docx(real_report_path, real_text_path)
    return text_path
  elif file_type_lower in FILE_EXTENSIONS_HTML:
    utils.text_from_html(real_report_path, real_text_path)
    return text_path
  else:
    logging.warning("Unknown file type, don't know how to extract text!")
    return None
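
A hypothetical usage sketch for the function above; the report fields and the inspector/year/report_id layout are assumptions inferred from the other snippets in this listing, not a confirmed schema:

# Hypothetical usage sketch; the field values and on-disk layout are
# assumptions inferred from the surrounding snippets.
report = {
  'inspector': 'dod',
  'year': '2014',
  'report_id': 'DODIG-2014-001',
  'file_type': 'pdf',
}
text_path = extract_report(report)
if text_path is None:
  logging.warning("No text could be extracted for %s", report['report_id'])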
def extract_metadata(report):
    report_path = path_for(report, report['file_type'])

    file_type_lower = report['file_type'].lower()
    if file_type_lower == "pdf":
        real_report_path = os.path.abspath(
            os.path.expandvars(os.path.join(utils.data_dir(), report_path)))
        if utils.check_pdf_decryption(real_report_path):
            real_decrypted_path = real_report_path[:-4] + ".decrypted.pdf"
            decrypted_path = report_path[:-4] + ".decrypted.pdf"
            if os.path.isfile(real_decrypted_path) or utils.decrypt_pdf(
                    real_report_path, real_decrypted_path):
                metadata = utils.metadata_from_pdf(decrypted_path)
            else:
                metadata = None
        else:
            metadata = utils.metadata_from_pdf(report_path)
        if metadata:
            report['pdf'] = metadata
            return metadata
    elif file_type_lower == "doc":
        metadata = utils.metadata_from_doc(report_path)
        if metadata:
            report['doc'] = metadata
            return metadata
    elif file_type_lower == "docx":
        metadata = utils.metadata_from_docx(report_path)
        if metadata:
            report['docx'] = metadata
            return metadata
    elif file_type_lower in FILE_EXTENSIONS_HTML:
        return None
    else:
        logging.warning("Unknown file type, don't know how to extract metadata!")
        return None
def scan_disk(self, inspector, scraper):
    self.disk[inspector] = {}
    data_dir = utils.data_dir()
    inspector_path = os.path.join(data_dir, inspector)
    if os.path.isdir(inspector_path):
        for year_folder in os.listdir(inspector_path):
            year_disk = int(year_folder)
            year_path = os.path.join(inspector_path, year_folder)
            if os.path.isdir(year_path):
                for report_id_disk in os.listdir(year_path):
                    report_path = os.path.join(year_path, report_id_disk)
                    report_id_disk = CaseInsensitiveString(report_id_disk)
                    if os.path.isdir(report_path):
                        if report_id_disk in self.disk[inspector]:
                            year_last = self.disk[inspector][report_id_disk]
                            msg = "[%s] Duplicate report_id: %s is saved under %d and %d" % \
                                (inspector, report_id_disk, year_last, year_disk)
                            print(msg)
                            admin.log_duplicate_id(inspector, report_id_disk, msg)
                        self.disk[inspector][report_id_disk] = year_disk
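
scan_disk wraps directory names in CaseInsensitiveString so duplicate detection ignores case. That class is not shown in this listing; a minimal stand-in might look like this (an assumption, not the project's actual implementation):

# Hypothetical stand-in for CaseInsensitiveString (the real class is not
# shown here): a str that hashes and compares on its casefolded value.
class CaseInsensitiveString(str):
    def __hash__(self):
        return hash(self.casefold())

    def __eq__(self, other):
        return isinstance(other, str) and self.casefold() == str(other).casefold()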
def write_report(report):
  data_path = path_for(report, "json")

  utils.write(
    utils.json_for(report),
    os.path.join(utils.data_dir(), data_path)
  )
  return data_path
def write_report(report):
  data_path = path_for(report, "json")

  utils.write(
    utils.json_for(report),
    "%s/%s" % (utils.data_dir(), data_path)
  )
  return data_path
def download_report(report):
    report_path = path_for(report, report["file_type"])
    binary = report["file_type"].lower() == "pdf"

    result = utils.download(report["url"], os.path.join(utils.data_dir(), report_path), {"binary": binary})
    if result:
        return report_path
    else:
        return None
def download_report(report):
    report_path = path_for(report, report['file_type'])
    binary = (report['file_type'].lower() in ('pdf', 'doc', 'ppt'))

    result = utils.download(report['url'],
                            os.path.join(utils.data_dir(), report_path),
                            {'binary': binary})
    if result:
        return report_path
    else:
        return None
def check_uniqueness(inspector, report_id, report_year):
    '''Given the name of an inspector, the ID of a report, and the year of the
    report, this function will check whether a duplicate report_id exists
    on-disk under a different year, or whether a duplicate report_id has been
    saved this session, in the same year or any other year. The index of
    reports already saved is lazily built on the first call from each
    inspector. Duplicate reports detected here will be collected, and a
    summary will be sent via admin.notify().'''

    # Be conservative, don't allow report_id to only differ in case
    report_id = report_id.lower()

    # Lazily set up data structures and read existing IDs from disk
    if inspector not in _uniqueness_storage_runtime:
        _uniqueness_storage_runtime[inspector] = set()
    if inspector not in _uniqueness_storage_disk:
        _uniqueness_storage_disk[inspector] = {}
        data_dir = utils.data_dir()
        inspector_path = os.path.join(data_dir, inspector)
        if os.path.isdir(inspector_path):
            for year_folder in os.listdir(inspector_path):
                year_disk = int(year_folder)
                year_path = os.path.join(inspector_path, year_folder)
                if os.path.isdir(year_path):
                    for report_id_disk in os.listdir(year_path):
                        report_path = os.path.join(year_path, report_id_disk)
                        if os.path.isdir(report_path):
                            if report_id_disk in _uniqueness_storage_disk[inspector]:
                                msg = "[%s] Duplicate report_id: %s is saved under %d and %d" % \
                                    (inspector,
                                     report_id_disk,
                                     _uniqueness_storage_disk[inspector][report_id_disk],
                                     year_disk)
                                print(msg)
                                _uniqueness_messages.append(msg)
                            _uniqueness_storage_disk[inspector][report_id_disk] = year_disk

    if report_id in _uniqueness_storage_runtime[inspector]:
        msg = "[%s] Duplicate report_id: %s has been used twice this session" % \
                (inspector, report_id)
        print(msg)
        _uniqueness_messages.append(msg)
    elif report_id in _uniqueness_storage_disk[inspector]:
        if report_year != _uniqueness_storage_disk[inspector][report_id]:
            msg = "[%s] Duplicate report_id: %s is saved under %d and %d" % \
                    (inspector,
                    report_id,
                    _uniqueness_storage_disk[inspector][report_id],
                    report_year)
            print(msg)
            _uniqueness_messages.append(msg)
    _uniqueness_storage_runtime[inspector].add(report_id)
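
The function relies on three module-level containers; a minimal sketch of that state plus a hypothetical call site (the initializers and the caller are assumptions based on the names used above):

# Assumed module-level state for check_uniqueness; the names come from the
# function above, the initializers themselves are an assumption.
_uniqueness_storage_runtime = {}
_uniqueness_storage_disk = {}
_uniqueness_messages = []

# Hypothetical call site, before saving a report:
check_uniqueness(report['inspector'], report['report_id'], int(report['year']))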
def download_report(report, caller_scraper=None):
    report_path = path_for(report, report['file_type'])
    binary = (report['file_type'].lower()
              in ('pdf', 'doc', 'ppt', 'docx', 'xls'))

    result = utils.download(report['url'],
                            os.path.join(utils.data_dir(), report_path),
                            {'binary': binary},
                            scraper_slug=caller_scraper)
    if result:
        return report_path
    else:
        return None
def download_report(report):
  report_path = path_for(report, report['file_type'])
  binary = (report['file_type'] == 'pdf')

  result = utils.download(
    report['url'],
    "%s/%s" % (utils.data_dir(), report_path),
    {'binary': binary}
  )
  if result:
    return report_path
  else:
    return None
def extract_metadata(report):
    report_path = path_for(report, report["file_type"])
    real_report_path = os.path.join(utils.data_dir(), report_path)

    file_type_lower = report["file_type"].lower()
    if file_type_lower == "pdf":
        metadata = utils.metadata_from_pdf(report_path)
        if metadata:
            report["pdf"] = metadata
            return metadata
    elif file_type_lower == "htm" or file_type_lower == "html":
        return None
    else:
        logging.warning("Unknown file type, don't know how to extract metadata!")
        return None
def extract_metadata(report):
  report_path = path_for(report, report['file_type'])
  real_report_path = os.path.join(utils.data_dir(), report_path)

  file_type_lower = report['file_type'].lower()
  if file_type_lower == "pdf":
    metadata = utils.metadata_from_pdf(report_path)
    if metadata:
      report['pdf'] = metadata
      return metadata
  elif file_type_lower in FILE_EXTENSIONS_HTML:
    return None
  else:
    logging.warning("Unknown file type, don't know how to extract metadata!")
    return None
def report_from(tds, options):
  report = {
    'inspector': 'dod',
    'inspector_url': 'http://www.dodig.mil/',
    'agency': 'dod',
    'agency_name': 'Department of Defense',
  }

  title_link = tds[2].select('a')[0]
  title = title_link.text.strip().replace('\r\n', ' ')
  landing_url = urljoin(BASE_URL, title_link['href'])

  if landing_url in LANDING_PAGE_BLACKLIST:
    return

  published_date = datetime.datetime.strptime(tds[0].text.strip(), '%m-%d-%Y')
  published_on = published_date.strftime('%Y-%m-%d')

  topic = tds[1].text

  report_id = tds[2].select('strong')
  if report_id:
    report_id = report_id[0].text.strip()
  else:
    title_slug = re.sub(r'\W', '', title[:16])
    report_id = (published_on + '-' + title_slug)

  # helper: use --report_id to skip all but that one
  only_id = options.get('report_id')
  if only_id and (only_id != report_id):
    return

  # helper: use --skip_downloaded to skip reports whose PDFs are on disk
  #   (drastically reduces calls to DOD landing pages)
  if options.get('skip_downloaded'):
    pdf_path = inspector.path_for({
      'report_id': report_id,
      'year': str(published_date.year),
      'inspector': 'dod'
    }, 'pdf')

    if os.path.exists(os.path.join(utils.data_dir(), pdf_path)):
      logging.warning("\tSkipping previously downloaded report, as asked.")
      return

  report_url, summary, maybe_unreleased, skip = fetch_from_landing_page(landing_url)

  if skip:
    return

  if (report_url is None) and maybe_unreleased:
    report['unreleased'] = True

  # Broken reports: mark as unreleased, but also as missing. This covers
  # blacklisted reports and, from now on, anything in 2001 and before.
  # I'll investigate the batch of 'missing' later.
  if (report_url is None) and ((report_id in BLACKLIST) or (published_date.year <= 2001)):
    report['unreleased'] = True
    report['missing'] = True

  # giving up on any more Guam errors, we've caught as many cases
  # as we reasonably can, and there are Guam entries that aren't reports.
  elif (report_url is None) and (re.search("guam", landing_url)):
    return

  office = tds[3].text.strip()

  report.update({
    'report_id': report_id,
    'url': report_url,
    'landing_url': landing_url,
    'summary': summary,
    'title': title,
    'topic': topic,
    'office': office,
    'published_on': published_on
  })
  return report
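
report_from expects the four <td> cells of one row from the DOD results table; a hedged sketch of the presumed caller (the BeautifulSoup document `doc` and the `save_report` helper are assumptions, not the project's confirmed API):

# Hypothetical caller: walk the results table and hand each row's cells
# to report_from. `doc` and `save_report` are assumptions here.
for row in doc.select('table tr'):
  tds = row.select('td')
  if len(tds) >= 4:
    report = report_from(tds, options)
    if report:
      save_report(report)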
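
Taken together, these snippets suggest a per-report pipeline; a minimal sketch, assuming the helpers above live in one module (the 'text' key is illustrative, not a confirmed field):

# Minimal end-to-end sketch under the assumptions stated above.
def process_report(report):
  if download_report(report) is None:
    return None
  report['text'] = extract_report(report)  # storing under 'text' is an assumption
  extract_metadata(report)  # annotates the report in place when it can
  return write_report(report)  # writes the JSON and returns its relative path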