def filter_links(link_list, base_url):
    """Extract, normalize, and deduplicate the href targets of a list of links.

    Args:
      link_list: iterable of BeautifulSoup-style elements (each supports
        element.get('href'), which may return None).
      base_url: URL used to resolve relative hrefs.

    Returns:
      A deduplicated list of absolute, fragment-free URLs, excluding
      blacklisted report URLs and mailto: links. Order is not guaranteed
      (a set is used for deduplication).
    """
    # Drop anchors without an href up front: element.get('href') returns None
    # for them, and the .startswith/urljoin calls below would raise on None.
    href_list = [element.get('href') for element in link_list
                 if element.get('href')]
    # go.usa.gov short links hide the real file name; follow the redirect.
    for i, href in enumerate(href_list):
        if href.startswith("http://go.usa.gov/"):
            href_list[i] = utils.resolve_redirect(href)
    # Make every URL absolute and strip any #fragment.
    href_list = [urldefrag(urljoin(base_url, href))[0] for href in href_list]
    filtered_list = [href for href in href_list
                     if href and href not in BLACKLIST_REPORT_URLS and
                     not href.startswith("mailto:")]
    return list(set(filtered_list))
def filter_links(link_list, base_url):
    """Extract, normalize, and deduplicate the href targets of a list of links.

    Args:
      link_list: iterable of BeautifulSoup-style elements (each supports
        element.get("href"), which may return None).
      base_url: URL used to resolve relative hrefs.

    Returns:
      A deduplicated list of absolute, fragment-free URLs, excluding
      blacklisted report URLs and mailto: links. Order is not guaranteed
      (a set is used for deduplication).
    """
    href_list = [element.get("href") for element in link_list]
    # Skip None hrefs (anchors without an href attribute): calling
    # .startswith on None would raise AttributeError.
    for i, href in enumerate(href_list):
        if href and href.startswith("http://go.usa.gov/"):
            # go.usa.gov short links hide the real file name; resolve them.
            href_list[i] = utils.resolve_redirect(href)
    # Make every URL absolute and strip any #fragment. None hrefs are
    # dropped here as well — urljoin would raise a TypeError on them.
    href_list = [
        urldefrag(urljoin(base_url, href))[0] for href in href_list if href
    ]
    filtered_list = [
        href
        for href in href_list
        if href and href not in BLACKLIST_REPORT_URLS and not href.startswith("mailto:")
    ]
    return list(set(filtered_list))
def report_from(result, year_range):
    """Build a report dict from one CPSC search-result element.

    Args:
      result: a BeautifulSoup element containing a report link and,
        possibly, a ".date-display-single" span with the publish date.
      year_range: iterable of years the caller wants reports for.

    Returns:
      A dict describing the report, or None when the URL is blacklisted,
      no publish date can be determined, or the date falls outside
      year_range.
    """
    link = result.find("a")
    report_url = urljoin(REPORTS_URL, link.get('href'))
    # Strip params, query, and fragment so the URL is a stable identifier.
    report_url = urlunparse(list(urlparse(report_url)[:4]) + ["", ""])
    if report_url in BLACKLIST_REPORT_URLS:
        return

    # Follow redirects to get real file names
    if report_url.startswith("https://www.cpsc.gov/Media/"):
        report_url = utils.resolve_redirect(report_url)

    # URLs with /PageFiles in them need to use the filename and its
    # directory to be unique. Other URLs can just use the filename.
    if "PageFiles" in report_url:
        # e.g. /../132643/fy11fisma.pdf -> 132643-fy11fisma.pdf
        report_filename = "-".join(report_url.split("/")[-2:])
    else:
        report_filename = report_url.split("/")[-1]
    report_id, _ = os.path.splitext(report_filename)

    title = link.text

    if report_id in REPORT_PUBLISHED_MAP:
        # Hardcoded dates for reports whose pages carry no usable date.
        published_on = REPORT_PUBLISHED_MAP[report_id]
    else:
        date_spans = result.select(".date-display-single")
        if date_spans:
            published_on_text = date_spans[0].text
            published_on = datetime.datetime.strptime(published_on_text,
                                                      '%A, %B %d, %Y')
        else:
            admin.log_no_date("cpsc", report_id, title, report_url)
            return

    if published_on.year not in year_range:
        # Lazy %-args: the message is only built when DEBUG is enabled.
        logging.debug("[%s] Skipping, not in requested range.", report_url)
        return

    report_type = report_type_from_title(title)

    report = {
        'inspector': 'cpsc',
        'inspector_url': 'https://www.cpsc.gov/About-CPSC/Inspector-General/',
        'agency': 'cpsc',
        'agency_name': 'Consumer Product Safety Commission',
        'type': report_type,
        'report_id': report_id,
        'url': report_url,
        'title': title,
        'published_on': published_on.strftime("%Y-%m-%d"),
    }
    return report
def filter_links(link_list, base_url):
    """Extract, normalize, and deduplicate the href targets of a list of links.

    Args:
      link_list: iterable of BeautifulSoup-style elements (each supports
        .text and .get('href'); the latter may return None).
      base_url: URL used to resolve relative hrefs.

    Returns:
      A deduplicated list of absolute, fragment-free URLs, excluding
      blacklisted report URLs, mailto: links, and .jpg files. Order is
      not guaranteed (a set is used for deduplication).
    """
    # "Report in Brief" links duplicate the main report; drop them.
    link_list = [
        link for link in link_list if "Report in Brief" not in link.text
    ]
    # Skip anchors without an href: element.get('href') returns None for
    # them, and .startswith/urljoin below would raise on None.
    href_list = [element.get('href') for element in link_list
                 if element.get('href')]
    for i, href in enumerate(href_list):
        # go.usa.gov short links can chain; keep following until resolved.
        # The None guard protects against a redirect that resolves to None.
        while href and href.startswith(
                ("http://go.usa.gov/", "https://go.usa.gov/")):
            href = utils.resolve_redirect(href)
        href_list[i] = href
    # Make every URL absolute and strip any #fragment.
    href_list = [urldefrag(urljoin(base_url, href))[0]
                 for href in href_list if href]
    filtered_list = [
        href
        for href in href_list
        if href and href not in BLACKLIST_REPORT_URLS
        and not href.startswith("mailto:")
        and not href.endswith(".jpg")
    ]
    return list(set(filtered_list))
def filter_links(link_list, base_url):
    """Extract, normalize, and deduplicate the href targets of a list of links.

    Args:
      link_list: iterable of BeautifulSoup-style elements (each supports
        .text and .get('href'); the latter may return None).
      base_url: URL used to resolve relative hrefs.

    Returns:
      A deduplicated list of absolute, fragment-free URLs, excluding
      blacklisted report URLs, mailto: links, and .jpg/.xlsx files.
      Order is not guaranteed (a set is used for deduplication).
    """
    # Drop supplementary links (briefs, cross-references, graphics) that
    # duplicate or accompany the main report.
    link_list = [link for link in link_list
                 if "Report in Brief" not in link.text
                 and "related audit" not in link.text
                 and "(OEI-04-15-00430)" not in link.text
                 and "this graphic" not in link.text]
    # Skip anchors without an href: element.get('href') returns None for
    # them, and .startswith/urljoin below would raise on None.
    href_list = [element.get('href') for element in link_list
                 if element.get('href')]
    for i, href in enumerate(href_list):
        # go.usa.gov short links can chain; keep following until resolved.
        # The None guard protects against a redirect that resolves to None.
        while href and href.startswith(
                ("http://go.usa.gov/", "https://go.usa.gov/")):
            href = utils.resolve_redirect(href)
        href_list[i] = href
    # Make every URL absolute and strip any #fragment.
    href_list = [urldefrag(urljoin(base_url, href))[0]
                 for href in href_list if href]
    filtered_list = [href for href in href_list
                     if href and href not in BLACKLIST_REPORT_URLS
                     and not href.startswith("mailto:")
                     and not href.endswith((".jpg", ".xlsx"))]
    return list(set(filtered_list))