def main():
    """Command-line entry point: extract paper URLs from a page and save them.

    Parses the four required options (``--page_url``,
    ``--link_parent_selector``, ``--link_selector``, ``--output_path``),
    passes the first three to ``extract_page_urls``, reports the number of
    extracted URLs on stderr, resolves every URL against ``--page_url`` and
    writes the results to ``--output_path`` as UTF-8, one URL per line
    (newline-separated, without a trailing newline).
    """
    # Raw string: the trailing backslashes are shell line continuations that
    # must appear literally in the help text, not be treated as Python escapes.
    description = r"""Extract paper urls in the parent page.

Example:
    python download_paper_urls.py \
        --page_url "http://papers.nips.cc/book/advances-in-neural-information-processing-systems-26-2013" \
        --link_parent_selector "div.main ul li" \
        --link_selector "a:eq(0)" \
        --output_path data/nips-2013.txt"""

    # RawDescriptionHelpFormatter keeps the example's line breaks; the default
    # formatter would re-wrap it into a single unreadable paragraph.
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("--page_url", required=True, type=str,
                        help="URL of the page that contains paper urls")
    parser.add_argument("--link_parent_selector", required=True, type=str,
                        help="CSS selector of the link parent element")
    parser.add_argument("--link_selector", required=True, type=str,
                        help="CSS selector of the link element")
    parser.add_argument("--output_path", required=True, type=str,
                        help="Path of file to save the result")
    args = parser.parse_args()

    # Materialize the result so it can be both counted and written out.
    urls = list(extract_page_urls(args.page_url,
                                  args.link_parent_selector,
                                  args.link_selector))
    sys.stderr.write("Extracted {} urls from {}\n".format(
        len(urls), args.page_url))

    # Resolve each extracted link against the page URL before touching the
    # output file, so the file is only opened once the data is ready.
    absolute_urls = [urlparse.urljoin(args.page_url, url) for url in urls]
    with codecs.open(args.output_path, "w", "utf8") as output_file:
        output_file.write("\n".join(absolute_urls))
def test_extract_page_urls():
    # Runs extract_page_urls against the NIPS 2013 proceedings page with the
    # same selectors shown in the command-line example, and checks how many
    # URLs come back.
    # NOTE(review): presumably this fetches the live page, so the expected
    # count depends on external content -- confirm before relying on it in CI.
    extracted = extract_page_urls(
        "http://papers.nips.cc/book/advances-in-neural-information-processing-systems-26-2013",
        "div.main ul li",
        "a:eq(0)",
    )
    # list() forces full evaluation so the result can be counted.
    assert_equals(len(list(extracted)), 360)