import os
import sys

# WebScraper is a project-local class; indexer is the project's indexing module.
import indexer


def go_index(page):
    """
    Recursively scrape web pages: for every URL found on a page, call
    go_index() again on that URL and continue.

    Caution: this is a big memory hog and IT WILL fail eventually, once
    the link chain exceeds Python's recursion limit.
    """
    try:
        scraper = WebScraper()
        if scraper.scrape(page):
            domain = indexer.get_domain(page)
            indexer.index_domain(domain)
            indexer.index_file(domain, domain, True)
            urls = scraper.get_link_urls()
            if indexer.has_crawable(urls):
                for url in urls:
                    # Hash the URL to get a stable on-disk marker filename,
                    # so already-visited links can be skipped.
                    title = url.encode(encoding="utf-8", errors="replace")
                    hash_path = "{0}.link".format(indexer.do_hash(title))
                    path = os.path.join(domain, hash_path)
                    if not os.path.exists(path) and url != scraper.page:
                        indexer.index_file(url, domain, True)
                        go_index(url)
        else:
            print("Cannot scrape requested page {0}".format(page))
    except RuntimeError:
        # Exceeding the recursion limit raises RuntimeError (RecursionError
        # in Python 3.5+). TODO: respawn in another thread instead of dying.
        print("Runtime Error occurred. Killing script.")
        sys.exit()
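# The docstring's caution is real: a long chain of links will exhaust the
# recursion limit. Below is a minimal iterative sketch of the same traversal,
# reusing the WebScraper/indexer calls from above; the in-memory `seen` set
# standing in for the on-disk .link markers is an assumption for illustration,
# not the author's design.
from collections import deque

import indexer


def go_index_iterative(seed):
    # Breadth-first variant of go_index(): bounded stack depth, so it
    # cannot hit the recursion limit.
    queue = deque([seed])
    seen = set()  # assumption: replaces the .link marker files
    while queue:
        page = queue.popleft()
        if page in seen:
            continue
        seen.add(page)
        scraper = WebScraper()
        if not scraper.scrape(page):
            print("Cannot scrape requested page {0}".format(page))
            continue
        domain = indexer.get_domain(page)
        indexer.index_domain(domain)
        indexer.index_file(domain, domain, True)
        urls = scraper.get_link_urls()
        if indexer.has_crawable(urls):
            for url in urls:
                if url != scraper.page and url not in seen:
                    indexer.index_file(url, domain, True)
                    queue.append(url)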
import os
import shutil

import indexer


def do_index(config, testrun):
    dbpath = testrun.dbpath(config)
    indexlogpath = testrun.indexlogpath(config)
    # Rebuild the index unless we are preserving an existing, complete run.
    if not config.preserve or \
       not os.path.exists(dbpath) or \
       not os.path.exists(indexlogpath):
        if os.path.exists(dbpath):
            shutil.rmtree(dbpath)
        if os.path.exists(indexlogpath):
            os.unlink(indexlogpath)
        print("Starting index run (creating %s)" % dbpath)
        indexer.index_file(inputfile=testrun.inputfile,
                           dbpath=dbpath,
                           logpath=indexlogpath,
                           flushspeed=testrun.flushspeed,
                           description=testrun.description,
                           maxdocs=testrun.maxdocs,
                           logspeed=testrun.logspeed)
        print("Ending index run")
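# do_index() only needs a few attributes and methods from its two arguments.
# A minimal sketch of that interface follows; the class names and values are
# hypothetical, chosen only to show what the calls above require.
class Config:
    preserve = False  # force a rebuild


class TestRun:
    inputfile = "input.dump"      # hypothetical path
    description = "sample index run"
    flushspeed = 10000
    maxdocs = None
    logspeed = 1000

    def dbpath(self, config):
        return "testdb"

    def indexlogpath(self, config):
        return "index.log"


do_index(Config(), TestRun())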
import pickle

import data_load
import indexer
import searcher

# visit_url() and getWeather() are defined elsewhere in this project.

crawler_backlog = {}
crawler_data = []

seed = "http://www.newhaven.edu/"
crawler_backlog[seed] = 0

print("Creating Web Pickle....")
visit_url(seed, "www.newhaven.edu")

# Create raw_web.pickle with the web contents (a list of tuples).
with open("raw_web.pickle", "wb") as out:
    pickle.dump(crawler_data, out)

print("Creating Data Pickle....")
# Creates raw_data.pickle with the file contents (a list of tuples).
data_load.traverse("fortune1")

print("Indexing Web Pickle....")
indexer.index_file("raw_web.pickle", "out_data")

print("Indexing Data Pickle....")
indexer.index_file("raw_data.pickle", "out_data")

getWeather("West Haven", "CT")
searcher.searchFile("out_data")
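# A quick sanity check on the crawl output before indexing it; nothing here
# is project-specific beyond the raw_web.pickle filename used above.
import pickle

with open("raw_web.pickle", "rb") as f:
    pages = pickle.load(f)
print("{0} pages crawled".format(len(pages)))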
import data_load
import searcher
import indexer

# data_load.traverse("fortune1")  # regenerate raw_data.pickle if needed
wordDictionary = indexer.index_file("raw_data.pickle", "the_shelve")
searcher.searchFile("the_shelve")
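# The "the_shelve" name suggests the index is stored with Python's shelve
# module; that is an inference, not confirmed above. Assuming it holds a
# word -> posting-data mapping, a minimal peek at its contents would be:
import shelve

with shelve.open("the_shelve") as db:
    for word in list(db)[:5]:
        print(word, db[word])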