import spider

# Crawl limits; these values are examples and should be tuned for the target site.
MAX_NUMBER_OF_PAGES_TO_CRAWL = 200
MAX_NUMBER_OF_LINKS_TO_FOLLOW = 5

def get_urls(uri, output):
    print "Spidering %s, getting %d maximum pages and following %d links deep." % (
        uri, MAX_NUMBER_OF_PAGES_TO_CRAWL, MAX_NUMBER_OF_LINKS_TO_FOLLOW)
    urls = spider.weburls(uri, width=MAX_NUMBER_OF_PAGES_TO_CRAWL,
                          depth=MAX_NUMBER_OF_LINKS_TO_FOLLOW)
    # Stash the harvested URL list on the module before generating the report.
    spider.urls = urls
    print "Generating report..."
    spider.webreport(output)
    print "Report of URLs written to %s" % output
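# A minimal driver sketch (not part of the original snippet): it assumes get_urls()
# above is defined in the same file and that a start URI and an output filename are
# passed on the command line, e.g. python get_urls.py http://localhost/ urls.txt
import sys

if __name__ == '__main__':
    start_uri = sys.argv[1]
    report_file = sys.argv[2]
    get_urls(start_uri, report_file)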
import spider
from pprint import pprint

if __name__ == '__main__':
    # Exercise the FTP helpers against a local server.
    a = spider.ftpurls('ftp://localhost/')
    print 1; pprint(a)
    a = spider.ftppaths('ftp://localhost')
    print 2; pprint(a)
    # Exercise the Web crawlers, with and without explicit limits.
    a = spider.weburls('http://localhost/')
    print 3; pprint(a)
    a = spider.weburls('http://localhost/', 200, 5, 3)
    print 4; pprint(a)
    # Mirroring and combined spidering.
    spider.ftpmirror('e:\\ftp\\', 14, 'ftp://localhost/')
    a = spider.ftpspider('ftp://localhost/')
    print 5; pprint(a)
    a = spider.webpaths('http://localhost/')
    print 6; pprint(a)
    spider.webreport('e:\\web1.txt', 'http://localhost/')
    spider.webmirror('e:\\web\\', 18, 'http://localhost/')
    a = spider.webspider('http://localhost/')
    print 7; pprint(a)
    # Report variants for different categories of harvested URLs.
    spider.urlreport('e:\\web2.txt', 'http://localhost/')
    spider.badurlreport('e:\\web3.txt', 'http://localhost/')
    spider.badhtmreport('e:\\web4.txt', 'http://localhost/')
    spider.redireport('e:\\web5.txt', 'http://localhost/')
    spider.outreport('e:\\web6.txt', 'http://localhost')
    spider.othereport('e:\\web7.txt', 'http://localhost/')
    # The same operations through the Spider class interface.
    a = spider.Spider('ftp://localhost/', 200, 16)
    a.ftppaths()
    print 1; pprint(a.paths)
    a.ftpurls()
import spider

# Harvest the URLs linked from the arXiv "catchup" page.
urllist = spider.weburls(base="https://arxiv.org/catchup")
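# A small follow-up sketch (an assumption, not from the original): weburls() is used
# above as returning a list of URL strings, so the harvested list can be written to
# a plain-text file for later inspection.
with open('catchup_urls.txt', 'w') as out:
    for url in urllist:
        out.write(url + '\n')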