import argparse
import sys
import time

# initArgParser, Emitter, and WebCrawler are provided elsewhere in the project.


def main(args=None):
    """The main routine: parse CLI arguments, then crawl until done (Python 2)."""
    # Force UTF-8 as the default encoding (Python 2 workaround).
    reload(sys)
    sys.setdefaultencoding('utf8')

    parser = argparse.ArgumentParser(description='Web mining exercise 1')
    initArgParser(parser)
    args = parser.parse_args(args)

    # At least one output target (console or file) must be selected.
    if not args.console and args.file is None:
        parser.exit("Error: invalid output target!")

    with Emitter(args.console, args.file) as output:
        output.clear()
        c = WebCrawler(args, depth=args.depth)
        start = time.time()
        while not c.done:
            c.crawl()
        end = time.time()
        print "Exec time: ", end - start
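# A minimal sketch of the initArgParser helper assumed by main() above. The
# flag names and defaults are assumptions inferred from how main() reads the
# parsed namespace (args.console, args.file, args.depth), not the project's
# actual definition.
def initArgParser(parser):
    """Register the command-line options that main() expects (hypothetical)."""
    parser.add_argument('--console', action='store_true',
                        help='emit crawl results to the console')
    parser.add_argument('--file', default=None,
                        help='path of a file to write crawl results to')
    parser.add_argument('--depth', type=int, default=2,
                        help='maximum crawl depth')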
from hashlib import md5
from time import sleep
from urlparse import urlsplit
import robotparser

from redis import Redis
from rq import Queue

# WebCrawler, DocList, and the process worker function are defined elsewhere
# in the project.


def crawl(url, config, skip_delay=False):
    '''
    RQ worker function which extracts URLs from the page contents at the
    given URL, then passes new URLs to both the CRAWL and PROCESS queues
    for further action.
    '''
    DELAY = int(config.get('crawler', 'crawl_delay'))
    MAX_DOCS = int(config.get('crawler', 'max_docs'))
    FRNT_LIST_FILE = config.get('crawler', 'url_frontier_file')
    TARGET_DOMAIN = config.get('crawler', 'target_domain')
    ROBOTS_LOC = config.get('crawler', 'robots_loc')

    # Politeness delay between requests, unless the caller opted out.
    if not skip_delay:
        sleep(float(DELAY))

    # Fetch the page and extract its outgoing links.
    wc = WebCrawler()
    urls = wc.crawl(url)

    # Load and parse the site's robots.txt rules.
    rp = robotparser.RobotFileParser(ROBOTS_LOC)
    rp.read()

    # The URL frontier, persisted to disk.
    dl = DocList(FRNT_LIST_FILE)
    if len(dl) < MAX_DOCS:
        redis_conn = Redis()
        for url in urls:
            did = md5(url).hexdigest()
            domain = urlsplit(url).netloc
            try:
                fetchable = rp.can_fetch('*', url)
            except KeyError:
                fetchable = False
            # Only follow new, same-domain URLs that robots.txt allows.
            if (did not in dl) and (domain == TARGET_DOMAIN) and fetchable:
                dl.append(url)
                cq = Queue('crawl', connection=redis_conn)
                cq.enqueue(crawl, args=(url, config))
                pq = Queue('process', connection=redis_conn)
                pq.enqueue(process, args=(url, config))
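# A minimal sketch of how a crawl could be kicked off, assuming the settings
# live in an INI file named crawler.cfg and that the seed job should skip the
# politeness delay; the file name and seed URL are assumptions made for
# illustration, not part of the original project.
from ConfigParser import ConfigParser

from redis import Redis
from rq import Queue

config = ConfigParser()
config.read('crawler.cfg')

seed_url = 'http://example.com/'
cq = Queue('crawl', connection=Redis())
# Workers started with `rq worker crawl process` will pick this job up and
# fan out from the seed URL.
cq.enqueue(crawl, args=(seed_url, config, True))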
# Hint:
# 1. While your solution must handle the case for Web(size=123, degree=5) in
#    the test script, you may want to use different size and degree settings
#    for faster tests and for better test coverage.
import time

from crawler import WebCrawler
from web import Web

size = 1000
degree = 10

web = Web(size=size, degree=degree)
crawler = WebCrawler()

start = time.time()
urls = crawler.crawl(web)
finish = time.time()

print("Time taken to crawl the URLs: ", finish - start)
print("Number of URLs found: ", len(urls))

# Every page in the synthetic web should have been discovered.
assert len(urls) == size
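# A minimal sketch of the breadth-first traversal the test above exercises,
# i.e. one way crawler.WebCrawler.crawl could reach every page. The Web
# interface used here (web.start_url() and web.links(url)) is an assumption
# made for illustration; the real fixture may expose the graph differently.
from collections import deque


class WebCrawler(object):
    def crawl(self, web):
        """Return every URL reachable from the starting page."""
        seen = {web.start_url()}
        frontier = deque(seen)
        while frontier:
            url = frontier.popleft()
            for link in web.links(url):
                if link not in seen:  # visit each page exactly once
                    seen.add(link)
                    frontier.append(link)
        return seen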
from sys import argv

from crawler import WebCrawler

# runner file
if __name__ == '__main__':
    # Use the URL passed on the command line, or fall back to a default.
    search_url = argv[1] if len(argv) > 1 else 'https://github.com'
    print('WebCrawler started, scanning {0}'.format(search_url))

    # Initiate a crawl; the crawler encapsulates the event loop.
    c = WebCrawler(search_url)
    c.crawl()
from crawler import WebCrawler

wc = WebCrawler()
urls = wc.crawl('https://en.wikipedia.org/wiki/Main_Page')

# Show each discovered URL, then report how many were found.
for url in urls:
    print url
print len(urls)