def extract_report(report):
    """Extract a report's text into a ``.txt`` file next to the report.

    The report file path comes from ``path_for(report, report['file_type'])``
    and is resolved against ``utils.data_dir()``. The text file uses the same
    path with its extension replaced by ``.txt``. If the text file already
    exists, nothing is extracted again.

    Args:
        report: Mapping with at least a ``'file_type'`` key (matched
            case-insensitively); it is also handed to ``path_for``.

    Returns:
        The unresolved text path (derived from ``path_for``'s result) for
        PDF, DOC and HTML reports, or ``None`` for any other file type.
        NOTE(review): the return values of the ``utils.text_from_*`` helpers
        are ignored here, so a path is returned whether or not extraction
        produced a file -- confirm callers check for that.
    """
    report_path = path_for(report, report['file_type'])
    text_path = "%s.txt" % os.path.splitext(report_path)[0]

    data_dir = utils.data_dir()
    real_report_path = os.path.abspath(
        os.path.expandvars(os.path.join(data_dir, report_path)))
    real_text_path = os.path.abspath(
        os.path.expandvars(os.path.join(data_dir, text_path)))

    if os.path.exists(real_text_path):
        # This report has already had its text extracted
        return text_path

    # Pick the extractor first so the call/return happens in one place.
    file_type_lower = report['file_type'].lower()
    if file_type_lower == "pdf":
        extract_text = utils.text_from_pdf
    elif file_type_lower == "doc":
        extract_text = utils.text_from_doc
    elif file_type_lower in FILE_EXTENSIONS_HTML:
        extract_text = utils.text_from_html
    else:
        # logging.warn is a deprecated alias of logging.warning.
        logging.warning("Unknown file type, don't know how to extract text!")
        return None

    extract_text(real_report_path, real_text_path)
    return text_path
def extract_report(report):
  """Extract a report's text into a ``.txt`` file next to the report.

  The report file path comes from ``path_for(report, report['file_type'])``
  and is resolved against ``utils.data_dir()``. The text file uses the same
  path with its extension replaced by ``.txt``. If the text file already
  exists, nothing is extracted again.

  PDFs for which ``utils.check_pdf_decryption`` returns a truthy value are
  read from a ``.decrypted.pdf`` copy, which is created with
  ``utils.decrypt_pdf`` when it is not already on disk.

  Args:
    report: Mapping with at least a ``'file_type'`` key (matched
      case-insensitively); it is also handed to ``path_for``.

  Returns:
    The unresolved text path (derived from ``path_for``'s result) for PDF,
    DOC, DOCX and HTML reports, or ``None`` for any other file type.
    NOTE(review): a path is returned even when decryption fails or an
    extractor writes nothing -- confirm callers check that the file exists.
  """
  report_path = path_for(report, report['file_type'])
  text_path = "%s.txt" % os.path.splitext(report_path)[0]

  data_dir = utils.data_dir()
  real_report_path = os.path.abspath(os.path.expandvars(os.path.join(data_dir, report_path)))
  real_text_path = os.path.abspath(os.path.expandvars(os.path.join(data_dir, text_path)))

  if os.path.exists(real_text_path):
    # This report has already had its text extracted
    return text_path

  file_type_lower = report['file_type'].lower()
  if file_type_lower == "pdf":
    if not utils.check_pdf_decryption(real_report_path):
      utils.text_from_pdf(real_report_path, real_text_path)
      return text_path

    # Drops the last four characters, i.e. assumes the path ends in ".pdf".
    real_decrypted_path = real_report_path[:-4] + ".decrypted.pdf"
    # Reuse an existing decrypted copy; otherwise try to create one now.
    have_decrypted = (os.path.isfile(real_decrypted_path) or
                      utils.decrypt_pdf(real_report_path, real_decrypted_path))
    if have_decrypted:
      utils.text_from_pdf(real_decrypted_path, real_text_path)
    return text_path

  if file_type_lower == "doc":
    utils.text_from_doc(real_report_path, real_text_path)
    return text_path

  if file_type_lower == "docx":
    utils.text_from_docx(real_report_path, real_text_path)
    return text_path

  if file_type_lower in FILE_EXTENSIONS_HTML:
    utils.text_from_html(real_report_path, real_text_path)
    return text_path

  # logging.warn is a deprecated alias of logging.warning.
  logging.warning("Unknown file type, don't know how to extract text!")
  return None
def extract_report(report):
  """Extract text from a report file, dispatching on its file type.

  Args:
    report: Mapping with at least a ``'file_type'`` key; it is also handed
      to ``path_for`` to locate the report file.

  Returns:
    Whatever ``utils.text_from_pdf`` or ``utils.text_from_html`` returns for
    the report path, or ``None`` when the file type is neither "pdf" nor
    something starting with "htm".
  """
  report_path = path_for(report, report['file_type'])

  # Compare case-insensitively so "PDF" / "HTML" are not treated as unknown.
  file_type_lower = report['file_type'].lower()
  if file_type_lower == "pdf":
    return utils.text_from_pdf(report_path)
  if file_type_lower.startswith("htm"):
    return utils.text_from_html(report_path)

  # logging.warn is a deprecated alias of logging.warning.
  logging.warning("Unknown file type, don't know how to extract text!")
  return None
def extract_report(report):
    """Extract text from a report file, dispatching on its file type.

    Args:
        report: Mapping with at least a ``"file_type"`` key (matched
            case-insensitively); it is also handed to ``path_for`` to locate
            the report file.

    Returns:
        Whatever ``utils.text_from_pdf`` or ``utils.text_from_html`` returns
        for the report path, or ``None`` when the file type is neither "pdf"
        nor something starting with "htm".
    """
    report_path = path_for(report, report["file_type"])

    file_type_lower = report["file_type"].lower()
    if file_type_lower == "pdf":
        return utils.text_from_pdf(report_path)
    if file_type_lower.startswith("htm"):
        return utils.text_from_html(report_path)

    # logging.warn is a deprecated alias of logging.warning.
    logging.warning("Unknown file type, don't know how to extract text!")
    return None
def extract_report(report):
  """Extract text from a report file, dispatching on its file type.

  Args:
    report: Mapping with at least a ``'file_type'`` key (matched
      case-insensitively); it is also handed to ``path_for`` to locate the
      report file.

  Returns:
    Whatever ``utils.text_from_pdf`` or ``utils.text_from_html`` returns for
    the report path, or ``None`` when the file type is neither "pdf" nor a
    member of ``FILE_EXTENSIONS_HTML``.
  """
  report_path = path_for(report, report['file_type'])

  file_type_lower = report['file_type'].lower()
  if file_type_lower == "pdf":
    return utils.text_from_pdf(report_path)
  if file_type_lower in FILE_EXTENSIONS_HTML:
    return utils.text_from_html(report_path)

  # logging.warn is a deprecated alias of logging.warning.
  logging.warning("Unknown file type, don't know how to extract text!")
  return None
# Example #6
0
def extract_report(report):
    """Extract text from a report file, dispatching on its file type.

    Args:
        report: Mapping with at least a ``'file_type'`` key (matched
            case-insensitively); it is also handed to ``path_for`` to locate
            the report file.

    Returns:
        Whatever ``utils.text_from_pdf``, ``utils.text_from_doc`` or
        ``utils.text_from_html`` returns for the report path, or ``None``
        when the file type is not "pdf", "doc" or a member of
        ``FILE_EXTENSIONS_HTML``.
    """
    report_path = path_for(report, report['file_type'])

    file_type_lower = report['file_type'].lower()
    if file_type_lower == "pdf":
        return utils.text_from_pdf(report_path)
    if file_type_lower == "doc":
        return utils.text_from_doc(report_path)
    if file_type_lower in FILE_EXTENSIONS_HTML:
        return utils.text_from_html(report_path)

    # logging.warn is a deprecated alias of logging.warning.
    logging.warning("Unknown file type, don't know how to extract text!")
    return None