Example #1
def crawl():
    # Log a debug message so runs are traceable.
    Logger.debug('Hello google')
    # Fetch the page and parse it into a BeautifulSoup tree.
    soup = get_b_soup('http://www.google.com')
    # Send the page title and the meta content URL to the output pipeline.
    Queues.send_to_output(soup.head.title.text)
    Queues.send_to_output('http://www.google.com' + soup.head.meta['content'])
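All of these examples lean on three framework-provided names: Logger, Queues, and get_b_soup. The framework's source is not shown here, so the following is only a minimal sketch of what get_b_soup might look like, assuming it wraps requests and BeautifulSoup; the encoding parameter mirrors its use in a later example.

import requests
from bs4 import BeautifulSoup

def get_b_soup(url, encoding=None):
    # Fetch the page; a production crawler would likely add retries
    # and a custom User-Agent on top of the timeout.
    response = requests.get(url, timeout=10)
    if encoding is not None:
        # Honor an explicitly declared page encoding.
        response.encoding = encoding
    # Parse the document into a BeautifulSoup tree.
    return BeautifulSoup(response.text, 'html.parser')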
Example #2
import re

# Assumed module-level constant: the site's base URL. It is used but not
# defined in the original snippet.
base_path = 'https://www.amazon.com'

def collect(url_queue):
    # Log a debug message so runs are traceable.
    Logger.debug('Hello amazon')
    # Fetch the base page and parse it into a BeautifulSoup tree.
    soup = get_b_soup(base_path)
    # Walk every anchor whose href points at a product page.
    for link in soup.findAll('a', href=re.compile('/gp/product/')):
        # REQUIRED: forward every matching product URL through url_queue.
        url_queue.put(base_path + link['href'])
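How url_queue is consumed is not shown in the snippet. As an illustration only, assuming it is a standard queue.Queue, a caller could fill and drain it like this:

import queue

url_queue = queue.Queue()
collect(url_queue)  # fills the queue with product URLs

# Drain the queue; queue.Empty signals there is nothing left to read.
while True:
    try:
        url = url_queue.get_nowait()
    except queue.Empty:
        break
    print(url)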
Example #3
import traceback

def crawl_thread(url, module):
    Logger.debug('Thread start')
    # A crawler module may declare an optional ENCODING attribute.
    encoding = getattr(module, 'ENCODING', None)
    try:
        soup = get_b_soup(url, encoding=encoding)
        module.crawl(soup)
    except Exception:
        # Log the failing URL with the full traceback, then give up on it.
        Logger.error('Crawl error url: ' + url)
        Logger.error(traceback.format_exc())
        return
    Logger.debug('Thread done')
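The snippet above is only the thread body. A hedged sketch of a driver, assuming one Python thread per (url, module) pair; the jobs list and the google_module name are hypothetical:

import threading

# Hypothetical work list; each entry pairs a URL with its crawler module.
jobs = [('http://www.google.com', google_module)]

threads = [threading.Thread(target=crawl_thread, args=(url, module))
           for url, module in jobs]
for t in threads:
    t.start()
# Wait for every crawl to finish before exiting.
for t in threads:
    t.join()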