Example #1
import urllib2        # Python 2 stdlib; use urllib.request on Python 3
import demo_helpers   # project-local helper providing extract_hrefs()

def fetch_and_extract(url, data_queue):
    # Download the page, extract its links, and hand the result to the
    # queue without blocking the calling thread.
    r = urllib2.urlopen(url)
    html = r.read()

    hrefs = demo_helpers.extract_hrefs(html)
    data_queue.put_nowait((url, hrefs))
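
For context, a minimal sketch of how fetch_and_extract might be driven from the main thread; the thread wiring and the example URL are assumptions, not part of the original snippet.

import threading
import Queue  # named 'queue' on Python 3

data_queue = Queue.Queue()
t = threading.Thread(target=fetch_and_extract,
                     args=('http://example.com', data_queue))
t.start()
t.join()
url, hrefs = data_queue.get()  # (url, list of extracted links)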
Example #2
from __future__ import print_function  # print is called Python 3 style below

import urllib2        # Python 2 stdlib; use urllib.request on Python 3
import demo_helpers   # project-local helper providing extract_hrefs()

def url_worker(name, tq, dq):
    # Long-running worker: take URLs from the task queue tq, fetch and
    # parse each page, and push (url, hrefs) results onto the data queue dq.
    print('worker start', name)
    while True:
        url = tq.get()  # blocks until a URL is available
        print('worker {} get {}'.format(name, url))

        r = urllib2.urlopen(url)
        html = r.read()

        hrefs = demo_helpers.extract_hrefs(html)

        dq.put_nowait((url, hrefs))
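
A sketch of how a pool of these workers might be wired up; the thread count, the daemon flag, and the seed URL are assumptions rather than part of the original.

import threading
import Queue  # named 'queue' on Python 3

task_queue = Queue.Queue()
data_queue = Queue.Queue()
for i in range(4):
    t = threading.Thread(target=url_worker,
                         args=('w{}'.format(i), task_queue, data_queue))
    t.daemon = True  # workers loop forever; let the process exit anyway
    t.start()

task_queue.put('http://example.com')
url, hrefs = data_queue.get()  # blocks until one page has been processed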
Example #3
from __future__ import print_function  # print is called Python 3 style below

import urllib2        # Python 2 stdlib; use urllib.request on Python 3
import demo_helpers   # project-local helper providing extract_hrefs()

def url_worker(name, processed_urls, add_to_all, q):
    # Crawler worker: take a URL off the shared queue, fetch it, record
    # every link it contains, and enqueue any link not yet processed.
    print("worker start", name)
    while True:
        url = q.get()  # blocks until a URL is available
        print("worker {} get {}".format(name, url))

        r = urllib2.urlopen(url)
        html = r.read()

        hrefs = demo_helpers.extract_hrefs(html)
        for sub_url in hrefs:
            add_to_all(sub_url)
            if sub_url not in processed_urls:
                q.put_nowait(sub_url)

        # The URL is marked as processed only after the fetch, so two
        # workers that grab the same URL close together may both fetch it.
        if url in processed_urls:
            print("Duplicate processed url {}".format(url))
        else:
            processed_urls.add(url)
        q.task_done()  # pairs with q.join() in the producer
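
A sketch of how this crawling worker might be launched; the shared sets, the set.add bound method passed as add_to_all, and the seed URL are assumptions. The bare set relies on CPython's GIL making set.add atomic, and on a real site the crawl may run indefinitely, so this only shows the wiring.

import threading
import Queue  # named 'queue' on Python 3

q = Queue.Queue()
processed_urls = set()
all_urls = set()

for i in range(4):
    t = threading.Thread(target=url_worker,
                         args=('w{}'.format(i), processed_urls, all_urls.add, q))
    t.daemon = True  # workers never return; daemonize so the process can exit
    t.start()

q.put('http://example.com')
q.join()  # returns once every queued URL has been matched by a task_done()
print(len(all_urls))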