Example #1
def crawl():
    crawler = Crawler()

    param_did = input("Enter the did value from this user's cookie: ")
    crawler.set_did(param_did)

    uid = input("Enter the user id to crawl this time: ")
    crawler.add_to_list(uid)

    crawler.crawl()

    input("Press Enter to exit......")
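
Both snippets assume a Crawler class from the surrounding project that exposes set_did, add_to_list, and crawl. The stub below is only a hypothetical sketch reconstructed from those calls so the example can be run in isolation; the real class does the actual fetching.

# Hypothetical stand-in for the project's Crawler, inferred from the calls above.
class Crawler(object):
    def __init__(self, interactive=True):
        # Example #2 passes False here; the flag name is an assumption.
        self.interactive = interactive
        self.did = None
        self.uids = []

    def set_did(self, did):
        # 'did' is the value taken from the user's cookie.
        self.did = did

    def add_to_list(self, uid):
        self.uids.append(uid)

    def crawl(self):
        for uid in self.uids:
            print("would crawl user %s with did=%s" % (uid, self.did))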
Example #2
def crawl():
    crawler = Crawler(False)
    # param_did is not defined in this snippet; it is expected to come from
    # the surrounding module (for example, the cookie value read in Example #1).
    crawler.set_did(param_did)
    crawler.crawl()
# Imports required by main(); Crawler, DotWriter, getLinks, parse_options,
# toSeoFriendly and LOG_DIRECTORY are provided by the surrounding project.
import logging
import math
import os
import re
import sys
import time
import urlparse

import requests


def main():

    opts, args = parse_options()

    url = args[0]

    if opts.links:
        getLinks(url)
        raise SystemExit(0)

    depth_limit = opts.depth_limit
    confine_prefix = opts.confine
    exclude = opts.exclude

    sTime = time.time()

    print >> sys.stderr, "Crawling %s (Max Depth: %d)" % (url, depth_limit)
    crawler = Crawler(url, depth_limit, confine_prefix, exclude)
    crawler.crawl()

    # create log directory
    if not os.path.exists(LOG_DIRECTORY):
        os.makedirs(LOG_DIRECTORY)

    num_links = 0
    if opts.out_urls:
        for url_crawl in crawler.urls_seen:

            parsed_uri = urlparse.urlparse(url_crawl)

            # skip URLs whose host does not appear in the start URL
            if not re.match(".*%s" % re.escape(parsed_uri.netloc.replace('www.', '')),
                            url):  # and not opts.skip_host:
                continue

            if not opts.out_path:
                print url_crawl
            else:
                domain = '{uri.netloc}'.format(uri=parsed_uri)
                log_file = "%s/%s.log" % (LOG_DIRECTORY, domain)

                # NOTE: logging.basicConfig only configures the root logger the
                # first time it is called, so the log file chosen for the first
                # domain is reused for every later domain.
                logging.basicConfig(
                    filename=log_file,
                    filemode='w+',
                    level=logging.DEBUG,
                    format='%(asctime)-15s [%(levelname)s] (%(threadName)-10s) %(message)s',
                    datefmt='%m/%d/%Y %I:%M:%S %p')

                try:
                    directory = opts.out_path + domain + '/'
                    path = directory + toSeoFriendly(url_crawl, 50) + '.html'

                    if not os.path.exists(directory):
                        os.makedirs(directory)

                    r = requests.get(url_crawl,
                                     allow_redirects=True,
                                     timeout=30)
                    if not os.path.exists(path):
                        with open(path, 'w') as target:
                            target.write(r.text.encode('utf-8'))

                        num_links = num_links + 1
                        logging.debug("Saving: {0}".format(url_crawl))

                except IOError as e:
                    logging.error("IOError: {0} {1}".format(url, e))

                except Exception as e:
                    logging.error("Error({0}): {1}".format(url, e),
                                  exc_info=True)

    if opts.out_links:
        print "\n".join([str(l) for l in crawler.links_remembered])

    if opts.out_dot:
        d = DotWriter()
        d.asDot(crawler.links_remembered)

    eTime = time.time()
    tTime = eTime - sTime

    print >> sys.stderr, "Found:    %d" % num_links
    print >> sys.stderr, "Stats:    (%d/s after %0.2fs)" % (int(
        math.ceil(float(num_links) / tTime)), tTime)
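
One caveat worth calling out: because logging.basicConfig only configures the root logger once, the per-domain log_file computed inside the loop is effectively ignored after the first domain. A minimal sketch of one way to get truly per-domain log files, assuming a helper of this shape (the name get_domain_logger is illustrative, not part of the original code):

import logging

def get_domain_logger(domain, log_directory):
    # One named logger per domain, each with its own FileHandler.
    logger = logging.getLogger("crawler.%s" % domain)
    if not logger.handlers:
        handler = logging.FileHandler("%s/%s.log" % (log_directory, domain), mode='w')
        handler.setFormatter(logging.Formatter(
            '%(asctime)-15s [%(levelname)s] (%(threadName)-10s) %(message)s',
            datefmt='%m/%d/%Y %I:%M:%S %p'))
        logger.addHandler(handler)
        logger.setLevel(logging.DEBUG)
    return logger

# Usage inside the loop above:
#     logger = get_domain_logger(domain, LOG_DIRECTORY)
#     logger.debug("Saving: {0}".format(url_crawl))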