Example #1
import argparse
import sys
import time


def main(args=None):
    """The main routine."""
    # Python 2 hack: force UTF-8 as the default string encoding
    reload(sys)
    sys.setdefaultencoding('utf8')

    parser = argparse.ArgumentParser(description='Web mining exercise 1')
    initArgParser(parser)
    args = parser.parse_args()

    if not args.console and args.file is None:
        parser.exit("Error: invalid output target!")
    with Emitter(args.console, args.file) as output:
        output.clear()

    c = WebCrawler(args, depth=args.depth)
    start = time.time()
    while not c.done:
        c.crawl()
    end = time.time()
    print "Exec time: ", end - start
Example #2
from hashlib import md5
from time import sleep
from urlparse import urlsplit
import robotparser

from redis import Redis
from rq import Queue

# WebCrawler, DocList and process are defined in the project's own modules.


def crawl(url, config, skip_delay=False):
    '''
    RQ worker function which extracts URLs from the page contents at the
    given URL, then passes new URLs to both the CRAWL and PROCESS queues
    for further action.
    '''

    DELAY = int(config.get('crawler', 'crawl_delay'))
    MAX_DOCS = int(config.get('crawler', 'max_docs'))
    FRNT_LIST_FILE = config.get('crawler', 'url_frontier_file')
    TARGET_DOMAIN = config.get('crawler', 'target_domain')
    ROBOTS_LOC = config.get('crawler', 'robots_loc')

    # Be polite: wait between requests unless explicitly told not to.
    if not skip_delay:
        sleep(float(DELAY))

    wc = WebCrawler()
    urls = wc.crawl(url)
    rp = robotparser.RobotFileParser(ROBOTS_LOC)
    rp.read()

    dl = DocList(FRNT_LIST_FILE)
    if len(dl) < MAX_DOCS:
        redis_conn = Redis()
        for url in urls:
            did = md5(url).hexdigest()
            domain = urlsplit(url).netloc
            try:
                fetchable = rp.can_fetch('*', url)
            except KeyError:
                fetchable = False
            if (did not in dl) and (domain == TARGET_DOMAIN) and fetchable:
                dl.append(url)
                cq = Queue('crawl', connection=redis_conn)
                cq.enqueue(crawl, args=(url, config))
                pq = Queue('process', connection=redis_conn)
                pq.enqueue(process, args=(url, config))
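The worker above pulls five settings from a [crawler] section via config.get(), so it expects an INI-style file of the kind ConfigParser loads. A file satisfying those lookups could look like the following; every value is an illustrative placeholder rather than the project's real configuration.

[crawler]
crawl_delay = 1
max_docs = 1000
url_frontier_file = frontier.txt
target_domain = example.com
robots_loc = https://example.com/robots.txt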
Example #3
# Hint:
#   1. While your solution must handle the case for Web(size=123, degree=5) in
#      the test script, you may want to use different size and degree settings
#      for faster tests and for better test coverage.

import time

from crawler import WebCrawler
from web import Web

size = 1000
degree = 10
web = Web(size=size, degree=degree)
crawler = WebCrawler()
start = time.time()
urls = crawler.crawl(web)
finish = time.time()
print("Time took to crawl the URLs: ", finish - start)
print("Number of URLs found: ", len(urls))
assert len(urls) == size
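Following the hint, the same check can be repeated over several size and degree settings; the sketch below is one way to organize such a sweep, and apart from the required (123, 5) case the combinations are arbitrary.

import time

from crawler import WebCrawler
from web import Web

# sweep a few (size, degree) combinations, including the required test case,
# and check that every URL in the generated web is found
for size, degree in [(123, 5), (200, 3), (1000, 10)]:
    web = Web(size=size, degree=degree)
    crawler = WebCrawler()
    start = time.time()
    urls = crawler.crawl(web)
    finish = time.time()
    print("size={0}, degree={1}: {2} URLs in {3:.3f}s".format(
        size, degree, len(urls), finish - start))
    assert len(urls) == size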
Example #4
from sys import argv

from crawler import WebCrawler

# runner file

if __name__ == '__main__':
    # fall back to a default URL when none is given on the command line
    search_url = argv[1] if len(argv) > 1 else 'https://github.com'
    print('WebCrawler started, scanning {0}'.format(search_url))
    # initiate a crawl; the crawler encapsulates the event loop
    c = WebCrawler(search_url)
    c.crawl()
Example #5
from crawler import WebCrawler

# crawl the Wikipedia main page and report every URL that was discovered
wc = WebCrawler()
urls = wc.crawl('https://en.wikipedia.org/wiki/Main_Page')
for url in urls:
    print(url)
print(len(urls))