Esempio n. 1
0
def run(options):
  """Check soft-404 detection for inspectors general (IG) sites.

  Phase 1: fetch each IG's known 404 URL from URLS and report any whose
  error page no longer matches PAGE_NOT_FOUND_STRING_RE (a false negative).
  Phase 2: scan already-downloaded files for IGs in IGS_WITH_BAD_404 and
  report any saved file that contains the error-page text (a soft 404).

  options: dict; the optional "inspectors" entry limits checks to those IGs.
  """

  ig_list = options.get("inspectors")

  # Phase 1: verify the not-found regex still matches each IG's 404 page.
  for inspector, url in URLS.items():
    if (not ig_list) or (inspector in ig_list):
      logging.debug("[%s] Checking..." % inspector)
      result = utils.scraper.urlopen(url)
      match = PAGE_NOT_FOUND_STRING_RE.search(result)
      if not match:
        print("False negative for %s" % inspector)

  # Phase 2: look for error pages saved to disk as if they were reports.
  data_dir = utils.data_dir()
  for inspector in os.listdir(data_dir):
    if (not ig_list or inspector in ig_list) and inspector in IGS_WITH_BAD_404:
      inspector_path = os.path.join(data_dir, inspector)
      if os.path.isdir(inspector_path):
        for dirpath, dirnames, filenames in os.walk(inspector_path):
          for filename in filenames:
            path = os.path.join(dirpath, filename)
            try:
              # Context managers close handles promptly; the original
              # leaked one open file per scanned path.
              with open(path, 'r', encoding='utf-8') as f:
                for line in f:
                  if PAGE_NOT_FOUND_STRING_RE.search(line):
                    print("Soft 404 found: %s" % path)
            except UnicodeDecodeError:
              # Not valid UTF-8 — rescan the same file as raw bytes.
              with open(path, 'rb') as f:
                for line in f:
                  if PAGE_NOT_FOUND_BYTES_RE.search(line):
                    print("Soft 404 found: %s" % path)
Esempio n. 2
0
def run(options):
    """Check soft-404 detection for inspectors general (IG) sites.

    Phase 1: fetch each IG's known 404 URL from URLS and report any whose
    error page no longer matches PAGE_NOT_FOUND_STRING_RE (false negative).
    Phase 2: scan already-downloaded files for IGs in IGS_WITH_BAD_404 and
    report any saved file containing the error-page text (a soft 404).

    options: dict; the optional "inspectors" entry limits checks to those IGs.
    """

    ig_list = options.get("inspectors")

    # Phase 1: verify the not-found regex still matches each IG's 404 page.
    for inspector, url in URLS.items():
        if (not ig_list) or (inspector in ig_list):
            logging.debug("[%s] Checking..." % inspector)
            result = utils.scraper.urlopen(url)
            match = PAGE_NOT_FOUND_STRING_RE.search(result)
            if not match:
                print("False negative for %s" % inspector)

    # Phase 2: look for error pages saved to disk as if they were reports.
    data_dir = utils.data_dir()
    for inspector in os.listdir(data_dir):
        if (not ig_list
                or inspector in ig_list) and inspector in IGS_WITH_BAD_404:
            inspector_path = os.path.join(data_dir, inspector)
            if os.path.isdir(inspector_path):
                for dirpath, dirnames, filenames in os.walk(inspector_path):
                    for filename in filenames:
                        path = os.path.join(dirpath, filename)
                        try:
                            # Context managers close handles promptly; the
                            # original leaked one open file per scanned path.
                            with open(path, 'r', encoding='utf-8') as f:
                                for line in f:
                                    if PAGE_NOT_FOUND_STRING_RE.search(line):
                                        print("Soft 404 found: %s" % path)
                        except UnicodeDecodeError:
                            # Not valid UTF-8 — rescan the file as raw bytes.
                            with open(path, 'rb') as f:
                                for line in f:
                                    if PAGE_NOT_FOUND_BYTES_RE.search(line):
                                        print("Soft 404 found: %s" % path)
def run(options):
  """Scan downloaded report.json files and report duplicate report_id values.

  Walks data_dir/<inspector>/<year>/<report>/report.json and tracks each
  report_id's paths. Duplicates are reported as they are found. Unless
  "global" is present in options, the history resets between inspectors,
  so only duplicates within a single inspector are flagged.

  options: dict; optional "inspectors" limits the scan, optional "global"
  makes report_id uniqueness checks span all inspectors.
  """
  data_dir = utils.data_dir()
  ig_list = options.get("inspectors")

  report_id_history = {}
  for inspector in os.listdir(data_dir):
    logging.debug("[%s] Checking..." % inspector)

    if not ig_list or inspector in ig_list:
      inspector_path = os.path.join(data_dir, inspector)
      if os.path.isdir(inspector_path):
        for year in os.listdir(inspector_path):
          year_path = os.path.join(inspector_path, year)
          if os.path.isdir(year_path):
            for report in os.listdir(year_path):
              report_path = os.path.join(year_path, report)
              if os.path.isdir(report_path):
                json_path = os.path.join(report_path, "report.json")
                if os.path.isfile(json_path):
                  # Close the JSON file promptly; the original
                  # json.load(open(...)) leaked the handle.
                  with open(json_path, "r", encoding="utf-8") as f:
                    report_data = json.load(f)
                  report_id = report_data["report_id"]
                  if report_id in report_id_history:
                    report_id_history[report_id].append(json_path)
                    print("Duplicate report_id %s in %s" % (repr(report_id), ", ".join(report_id_history[report_id])))
                  else:
                    report_id_history[report_id] = [json_path]
        # Without "global", duplicate tracking is per-inspector only.
        if "global" not in options:
          report_id_history = {}
def run(options):
    """Check both the HTTP-level and content-level 404 detection per IG.

    Phase 1: request each URL in URLS; a scrapelib.HTTPError with status 404
    means the handler rewrote the status correctly (following one redirect
    via the 'location' header if present). Reports IGs whose handler did not
    rewrite the code, and IGs whose error page no longer matches
    PAGE_NOT_FOUND_STRING_RE.
    Phase 2: scan downloaded files for IGs in IGS_WITH_BAD_404 for saved
    error pages (soft 404s).

    options: dict; optional "inspectors" limits checks to those IGs.
    """

    ig_list = options.get("inspectors")

    for inspector, url in URLS.items():
        if (not ig_list) or (inspector in ig_list):
            logging.debug("[%s] Checking..." % inspector)
            result = None
            status_code_rewritten = False
            # Loop only to follow a redirect supplied on the 404 response.
            while True:
                try:
                    verify_options = utils.domain_verify_options(url)
                    response = utils.scraper.get(url, verify=verify_options)
                    result = response.text
                    break
                except scrapelib.HTTPError as e:
                    if e.response.status_code == 404:
                        status_code_rewritten = True
                        if 'location' in e.response.headers:
                            url = e.response.headers['location']
                            continue
                    result = e.body
                    break

            if not status_code_rewritten:
                print(
                    "False negative for %s (handler did not rewrite error code)"
                    % inspector)

            match = PAGE_NOT_FOUND_STRING_RE.search(result)
            if not match:
                print(
                    "False negative for %s (regular expression did not match error "
                    "page contents)" % inspector)

    # Phase 2: look for error pages saved to disk as if they were reports.
    data_dir = utils.data_dir()
    for inspector in os.listdir(data_dir):
        if (not ig_list
                or inspector in ig_list) and inspector in IGS_WITH_BAD_404:
            inspector_path = os.path.join(data_dir, inspector)
            if os.path.isdir(inspector_path):
                for dirpath, dirnames, filenames in os.walk(inspector_path):
                    for filename in filenames:
                        path = os.path.join(dirpath, filename)
                        try:
                            # Context managers close handles promptly; the
                            # original leaked one open file per scanned path.
                            with open(path, 'r', encoding='utf-8') as f:
                                for line in f:
                                    if PAGE_NOT_FOUND_STRING_RE.search(line):
                                        print("Soft 404 found: %s" % path)
                        except UnicodeDecodeError:
                            # Not valid UTF-8 — rescan the file as raw bytes.
                            with open(path, 'rb') as f:
                                for line in f:
                                    if PAGE_NOT_FOUND_BYTES_RE.search(line):
                                        print("Soft 404 found: %s" % path)
Esempio n. 5
0
def run(options):
    """Scan downloaded PDFs for embedded file attachments.

    Every *.pdf under each selected inspector's data directory is first
    decrypted with qpdf into a temp file, then pdftk "unpack_files" extracts
    any attachments into a temp directory; attachment names are printed.
    Requires the qpdf and pdftk command-line tools on PATH.

    options: dict; the optional "inspectors" entry limits the scan.
    """
    ig_list = options.get("inspectors")

    data_dir = utils.data_dir()
    for inspector in os.listdir(data_dir):
        if not ig_list or inspector in ig_list:
            logging.debug("[%s] Checking..." % inspector)

            inspector_path = os.path.join(data_dir, inspector)
            if os.path.isdir(inspector_path):
                for dirpath, dirnames, filenames in os.walk(inspector_path):
                    for filename in filenames:
                        _, extension = os.path.splitext(filename.lower())
                        if extension == ".pdf":
                            try:
                                original = os.path.join(dirpath, filename)
                                # mkstemp returns an open fd; close it right
                                # away since qpdf writes to the path by name.
                                decrypted_file, decrypted_path = tempfile.mkstemp(
                                    suffix=".pdf")
                                os.close(decrypted_file)
                                # None marks the fd as closed for the cleanup
                                # in the outer finally block below.
                                decrypted_file = None
                                logging.debug("Decrypting %s to %s" %
                                              (original, decrypted_path))
                                subprocess.check_call([
                                    "qpdf", "--decrypt", original,
                                    decrypted_path
                                ])
                                try:
                                    extract_dir = tempfile.mkdtemp()
                                    logging.debug(
                                        "Extracting %s to %s" %
                                        (decrypted_path, extract_dir))
                                    subprocess.check_call([
                                        "pdftk", decrypted_path, "unpack_files"
                                    ],
                                                          cwd=extract_dir)
                                    attachments = os.listdir(extract_dir)
                                    if attachments:
                                        print(
                                            "%s has the following attachments: %s"
                                            %
                                            (original, ', '.join(attachments)))
                                finally:
                                    # Remove the extraction dir even if pdftk
                                    # fails.
                                    shutil.rmtree(extract_dir)
                            except subprocess.CalledProcessError as e:
                                # A qpdf/pdftk failure on one PDF should not
                                # abort the whole scan.
                                print(e)
                            finally:
                                try:
                                    # Close the fd if it is somehow still
                                    # open, then always delete the decrypted
                                    # temp file.
                                    if decrypted_file:
                                        os.close(decrypted_file)
                                        decrypted_file = None
                                finally:
                                    os.remove(decrypted_path)
def run(options):
  """Check both the HTTP-level and content-level 404 detection per IG.

  Phase 1: request each URL in URLS; a scrapelib.HTTPError with status 404
  means the handler rewrote the status correctly (following one redirect via
  the 'location' header if present). Reports IGs whose handler did not
  rewrite the code, and IGs whose error page no longer matches
  PAGE_NOT_FOUND_STRING_RE.
  Phase 2: scan downloaded files for IGs in IGS_WITH_BAD_404 for saved
  error pages (soft 404s).

  options: dict; optional "inspectors" limits checks to those IGs.
  """

  ig_list = options.get("inspectors")

  for inspector, url in URLS.items():
    if (not ig_list) or (inspector in ig_list):
      logging.debug("[%s] Checking..." % inspector)
      result = None
      status_code_rewritten = False
      # Loop only to follow a redirect supplied on the 404 response.
      while True:
        try:
          verify_options = utils.domain_verify_options(url)
          response = utils.scraper.get(url, verify=verify_options)
          result = response.text
          break
        except scrapelib.HTTPError as e:
          if e.response.status_code == 404:
            status_code_rewritten = True
            if 'location' in e.response.headers:
              url = e.response.headers['location']
              continue
          result = e.body
          break

      if not status_code_rewritten:
        print("False negative for %s (handler did not rewrite error code)" %
              inspector)

      match = PAGE_NOT_FOUND_STRING_RE.search(result)
      if not match:
        print("False negative for %s (regular expression did not match error "
              "page contents)" % inspector)

  # Phase 2: look for error pages saved to disk as if they were reports.
  data_dir = utils.data_dir()
  for inspector in os.listdir(data_dir):
    if (not ig_list or inspector in ig_list) and inspector in IGS_WITH_BAD_404:
      inspector_path = os.path.join(data_dir, inspector)
      if os.path.isdir(inspector_path):
        for dirpath, dirnames, filenames in os.walk(inspector_path):
          for filename in filenames:
            path = os.path.join(dirpath, filename)
            try:
              # Context managers close handles promptly; the original
              # leaked one open file per scanned path.
              with open(path, 'r', encoding='utf-8') as f:
                for line in f:
                  if PAGE_NOT_FOUND_STRING_RE.search(line):
                    print("Soft 404 found: %s" % path)
            except UnicodeDecodeError:
              # Not valid UTF-8 — rescan the same file as raw bytes.
              with open(path, 'rb') as f:
                for line in f:
                  if PAGE_NOT_FOUND_BYTES_RE.search(line):
                    print("Soft 404 found: %s" % path)
def run(options):
  """Walk every downloaded file and report sets of duplicate files.

  options: dict; the optional "inspectors" entry limits the scan.
  """
  ig_list = options.get("inspectors")

  dedup = Deduplicator()
  data_dir = utils.data_dir()
  for inspector in os.listdir(data_dir):
    # Skip inspectors not selected by the options filter.
    if ig_list and inspector not in ig_list:
      continue
    logging.debug("[%s] Checking..." % inspector)

    inspector_path = os.path.join(data_dir, inspector)
    if not os.path.isdir(inspector_path):
      continue
    for dirpath, _, names in os.walk(inspector_path):
      for name in names:
        duplicates = dedup.add_and_check_file(os.path.join(dirpath, name))
        if duplicates:
          print("Duplicate files: " + ", ".join(duplicates))
Esempio n. 8
0
def run(options):
    """Walk every downloaded file and report sets of duplicate files.

    options: dict; the optional "inspectors" entry limits the scan.
    """
    ig_list = options.get("inspectors")

    dedup = Deduplicator()
    data_dir = utils.data_dir()
    for inspector in os.listdir(data_dir):
        # Skip inspectors not selected by the options filter.
        if ig_list and inspector not in ig_list:
            continue
        logging.debug("[%s] Checking..." % inspector)

        inspector_path = os.path.join(data_dir, inspector)
        if not os.path.isdir(inspector_path):
            continue
        for dirpath, _, names in os.walk(inspector_path):
            for name in names:
                duplicates = dedup.add_and_check_file(
                    os.path.join(dirpath, name))
                if duplicates:
                    print("Duplicate files: " + ", ".join(duplicates))
def run(options):
  """Scan downloaded PDFs for embedded file attachments.

  Every *.pdf under each selected inspector's data directory is first
  decrypted with qpdf into a temp file, then pdftk "unpack_files" extracts
  any attachments into a temp directory; attachment names are printed.
  Requires the qpdf and pdftk command-line tools on PATH.

  options: dict; the optional "inspectors" entry limits the scan.
  """
  ig_list = options.get("inspectors")

  data_dir = utils.data_dir()
  for inspector in os.listdir(data_dir):
    if not ig_list or inspector in ig_list:
      logging.debug("[%s] Checking..." % inspector)

      inspector_path = os.path.join(data_dir, inspector)
      if os.path.isdir(inspector_path):
        for dirpath, dirnames, filenames in os.walk(inspector_path):
          for filename in filenames:
            _, extension = os.path.splitext(filename.lower())
            if extension == ".pdf":
              try:
                original = os.path.join(dirpath, filename)
                # mkstemp returns an open fd; close it right away since
                # qpdf writes to the path by name.
                decrypted_file, decrypted_path = tempfile.mkstemp(suffix=".pdf")
                os.close(decrypted_file)
                # None marks the fd as closed for the outer finally below.
                decrypted_file = None
                logging.debug("Decrypting %s to %s" % (original, decrypted_path))
                subprocess.check_call(["qpdf", "--decrypt", original, decrypted_path])
                try:
                  extract_dir = tempfile.mkdtemp()
                  logging.debug("Extracting %s to %s" % (decrypted_path, extract_dir))
                  subprocess.check_call(["pdftk", decrypted_path, "unpack_files"], cwd=extract_dir)
                  attachments = os.listdir(extract_dir)
                  if attachments:
                    print("%s has the following attachments: %s" % (original, ', '.join(attachments)))
                finally:
                  # Remove the extraction dir even if pdftk fails.
                  shutil.rmtree(extract_dir)
              except subprocess.CalledProcessError as e:
                # A qpdf/pdftk failure on one PDF should not abort the scan.
                print(e)
              finally:
                try:
                  # Close the fd if it is somehow still open, then always
                  # delete the decrypted temp file.
                  if decrypted_file:
                    os.close(decrypted_file)
                    decrypted_file = None
                finally:
                  os.remove(decrypted_path)
Esempio n. 10
0
def run(options):
    """Scan downloaded report.json files and report duplicate report_id values.

    Walks data_dir/<inspector>/<year>/<report>/report.json and tracks each
    report_id's paths. Duplicates are reported as they are found. Unless
    "global" is present in options, the history resets between inspectors,
    so only duplicates within a single inspector are flagged.

    options: dict; optional "inspectors" limits the scan, optional "global"
    makes report_id uniqueness checks span all inspectors.
    """
    data_dir = utils.data_dir()
    ig_list = options.get("inspectors")

    report_id_history = {}
    for inspector in os.listdir(data_dir):
        logging.debug("[%s] Checking..." % inspector)

        if not ig_list or inspector in ig_list:
            inspector_path = os.path.join(data_dir, inspector)
            if os.path.isdir(inspector_path):
                for year in os.listdir(inspector_path):
                    year_path = os.path.join(inspector_path, year)
                    if os.path.isdir(year_path):
                        for report in os.listdir(year_path):
                            report_path = os.path.join(year_path, report)
                            if os.path.isdir(report_path):
                                json_path = os.path.join(
                                    report_path, "report.json")
                                if os.path.isfile(json_path):
                                    # Close the JSON file promptly; the
                                    # original json.load(open(...)) leaked
                                    # the handle.
                                    with open(json_path, "r",
                                              encoding="utf-8") as f:
                                        report_data = json.load(f)
                                    report_id = report_data["report_id"]
                                    if report_id in report_id_history:
                                        report_id_history[report_id].append(
                                            json_path)
                                        print(
                                            "Duplicate report_id %s in %s" %
                                            (repr(report_id), ", ".join(
                                                report_id_history[report_id])))
                                    else:
                                        report_id_history[report_id] = [
                                            json_path
                                        ]
                # Without "global", duplicate tracking is per-inspector only.
                if "global" not in options:
                    report_id_history = {}