Example #1
def work(k):
    """Each worker thread takes a URL from the queue, crawls the page,
    and then marks the task as done."""
    for _ in range(k):
        url = fifo_queue.get()
        Spider.crawl_page(threading.current_thread().name, url)
        fifo_queue.task_done()
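A worker like this only runs once threads have been started and fifo_queue has been filled. A minimal sketch of that wiring, assuming the work function and Spider class above and a hypothetical NUMBER_OF_THREADS constant:

import threading
from queue import Queue

fifo_queue = Queue()
NUMBER_OF_THREADS = 8  # illustrative value, not part of the original snippet

def create_workers(jobs_per_thread):
    # Spawn daemon threads so they die when the main program exits
    for _ in range(NUMBER_OF_THREADS):
        t = threading.Thread(target=work, args=(jobs_per_thread,))
        t.daemon = True
        t.start()

def create_jobs(links):
    # Every queued link becomes one job for the workers
    for link in links:
        fifo_queue.put(link)
    fifo_queue.join()  # blocks until task_done() has been called for every link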
Example #2
def go():
    """Get link from queue and put it to Spider"""

    while True:
        url = queue.get()
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
Example #3
def work():
    while True:
        item = thread_queue.get()
        url = item['url']
        distance = item['distance']
        Spider.crawl_page(threading.current_thread().name, url, distance)
        thread_queue.task_done()
Example #4
def doCrawling():
    while True:
        # Remove the next URL from the queue
        url = queue.get()
        # Use the name of the current thread to see what is going on
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
Example #5
 def work(self):
     while True:
         url = self.queue.get()
         # self.output.append(threading.current_thread().name + ' now crawling ' + url)
         # self.output.append('Queue: ' + str(len(Spider.queue)) + ' | Crawled: ' + str(len(Spider.crawled)))
         Spider.crawl_page(threading.current_thread().name, url)
         self.queue.task_done()
Example #6
def work(Num=30):
    print("Work")
    while Num > 0:
        link = QUEUE.get()
        Spider.crawl_page(threading.current_thread().name, link)
        QUEUE.task_done()
        Num -= 1
Example #7
def work():
    while True:
        url = queue.get()
        if url is None:
            # A None sentinel signals this worker to shut down
            break
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
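The None check in this variant is a shutdown sentinel. A sketch of the producer side that uses it, assuming the same module-level queue and a list of the started worker threads:

def stop_workers(threads):
    # Queue one sentinel per worker so each thread sees exactly one None
    for _ in threads:
        queue.put(None)
    # Wait for every worker to break out of its loop
    for t in threads:
        t.join()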
Example #9
def work():
    print('main.py/work()')
    while True:
        url = queue.get()
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
    print('main.py/work()/end')  # unreachable: the loop above never exits
Example #10
def work():
    while True:
        url = queue.get()
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
        if queue.empty():
            print("end of process......")
            break
Example #11
 def work():
     max_pages = 10
     # Crawl a fixed number of pages, then let the thread exit
     for _ in range(max_pages):
         url = queue.get()
         Spider.crawl_page(threading.current_thread().name, url)
         queue.task_done()
Example #12
def work():
    """
    Do next job in queue
    """
    while True:
        url = queue.get()
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
Example #13
def work():
    while True:
        url = queue.get()
        try:
            Spider.crawl_page(threading.current_thread().name, url)
        except Exception as e:
            print('Error in',
                  threading.current_thread().name, 'crawling', url, '\n\t', e)
        finally:
            # Always mark the task as done so queue.join() cannot hang on errors
            queue.task_done()
Example #14
def work():
    """
    do the next job in the queue
    """
    while True:
        url = queue.get()
        # Crawl the URL, tagging the work with the current thread's name
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
Example #15
def work():
    count = 0
    while True:
        url = queue.get()
        Spider.crawl_page(threading.current_thread().name, url)
        count += 1
        if count % 100 == 0:
            # Update the spider's files every 100 crawled pages
            Spider.update_files()
        queue.task_done()
Example #16
def work():
    # As long as work is not done
    while True:
        # Get the next URL to be parsed
        url = queue.get()
        # Crawl the webpage at that url
        Spider.crawl_page(threading.current_thread().name, url)
        # Mark the task as done so queue.join() can unblock
        queue.task_done()
Example #17
def work():
    while True:
        url = queue.get()
        if url == '':
            print('URL not found')
        else:
            spider = Spider(url)
            spider.crawl_page(threading.current_thread().name)
            print(spider.socialLinks())
        queue.task_done()
Example #18
def work():
    while True:
        time.sleep(SLEEP_TIME)
        url = queue.get()
        print(threading.current_thread().name + ' now crawling ' + url)
        print('Queue ' + str(len(Spider.queue)) + ' | Crawled ' + str(len(Spider.crawled)))
        Spider.crawl_page(threading.current_thread().name, url)
        print('Crawled web ' + str(Spider.file_order))
        Spider.file_order += 1
        queue.task_done()
Example #19
def work():

    while True:
        url = queue.get()
        #tup = (url, )

        #print(tup)  ### This prints all the queued websites

        print(url, "main.work()")
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
Example #20
def create_jobs():
    logging.error("In Create Jobs")
    for link in file_to_set(QUEUE_FILE):
        logging.error("Link {}".format(link))
        Spider.crawl_page(link,link)
        queue.put(link)

    queue.join()
    logging.error("After Queue Join")
    logging.error(queue)
    crawl()
Example #21
def work():
    global total_retrieved_pages
    while True:
        print("retrieved pages: " + str(total_retrieved_pages))
        url = queue.get()
        if (total_retrieved_pages < max_retrieved_pages):
            Spider.crawl_page(threading.current_thread().name, url)
        else:
            Spider.crawl_page_graph(threading.current_thread().name, url)
        print("Crawling task by thread is done")
        queue.task_done()
Example #22
def work():
    global working
    while working:
        url = queue.get()

        #if(queue.empty()):
        # working=False
        #sys.exit()
        #print("###############################################################\n###############################################################\n###############################################################\n")

        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
Example #23
def do_task(url):
    content_type = ''
    try:
        resp = requests.head(url)
        # checking if the link is a webpage or a file
        content_type = resp.headers['Content-Type']

        if 'text/html' in content_type:
            Spider.crawl_page(threading.current_thread().name, url)
        else:
            # the link contains a file
            Spider.crawl_file(threading.current_thread().name, url,
                              content_type)

    except Exception as e:
        # Silently skip links that cannot be reached or inspected
        pass
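Because do_task takes the URL as an argument instead of reading a shared queue, it also fits a thread pool. A possible driver, assuming requests and Spider are importable and urls is any iterable of links (max_workers is only illustrative):

from concurrent.futures import ThreadPoolExecutor

def run_tasks(urls, max_workers=8):
    # Dispatch each URL to do_task on a pool thread and wait for completion
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        list(executor.map(do_task, urls))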
Example #24
def work():
    while True:
        url = queue.get()
        table_name = 'url_title_rel'
        title = Spider.crawl_page(threading.current_thread().name, url, DB_FILE_PATH, table_name)
        #print title
        queue.task_done()
Example #25
class Detected:
    PROJECT_NAME = ''
    HOMEPAGE = ''
    DOMAIN_NAME = ''
    QUEUE_FILE = ''
    CRAWLED_FILE = ''
    NUMBER_OF_THREADS = 8
    queue = Queue()
    spi = None

    def __init__(self, project_name, homepage):
        self.PROJECT_NAME = project_name
        self.HOMEPAGE = homepage

        self.DOMAIN_NAME = get_domain_name(self.HOMEPAGE)
        self.QUEUE_FILE = self.PROJECT_NAME + '/queue.txt'
        self.CRAWLED_FILE = self.PROJECT_NAME + '/crawled.txt'

        self.spi = Spider(self.PROJECT_NAME, self.HOMEPAGE, self.DOMAIN_NAME)

    # Create worker threads (will die when main exits)
    def create_workers(self):
        for _ in range(self.NUMBER_OF_THREADS):
            t = threading.Thread(target=self.work)
            t.daemon = True
            t.start()

    # Do the next job in the queue
    def work(self):
        while True:
            url = self.queue.get()
            self.spi.crawl_page(threading.current_thread().name, url)
            self.queue.task_done()

    # Each queued link is a new job
    def create_jobs(self):
        for link in file_to_set(self.QUEUE_FILE):
            self.queue.put(link)
        self.queue.join()
        self.crawl()

    # Check if there are items in the queue, if so crawl them
    def crawl(self):
        queued_links = file_to_set(self.QUEUE_FILE)
        if len(queued_links) > 0:
            print(str(len(queued_links)) + ' links in the queue')
            self.create_jobs()
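A usage sketch for this class, assuming the Spider constructor seeds PROJECT_NAME/queue.txt with the homepage (as in the usual tutorial layout); the project name and homepage below are placeholders:

if __name__ == '__main__':
    detector = Detected('example_project', 'https://example.com/')
    detector.create_workers()  # start the daemon worker threads
    detector.crawl()           # read queue.txt and hand its links to the workers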
Example #26
def work():
    while True:
        url = queue.get()
        global ALLNUM
        try:
            if (lock.acquire()):
                response = urlopen(url, timeout=3)
                childrenlink = Spider.gather_links(url)
                childrenfile = open(
                    PROJECT_NAME + '/childrenlink/' + str(ALLNUM) + '.txt',
                    'w')
                childrenfile.write(url + '\n')
                for each_child in childrenlink:
                    if ('javascript' not in each_child):
                        childrenfile.write(each_child + '\n')
                childrenfile.close()

                #write html file by utf8 encoding
                html_byte = response.read()
                chardit1 = chardet.detect(html_byte)
                file1 = open(
                    PROJECT_NAME + '/html/utf8/' + str(ALLNUM) + '.html', 'wb')
                html_string = html_byte.decode(
                    chardit1['encoding']).encode('utf-8')
                file1.write(html_string)
                file1.close()

                #for smj encode as GBK
                file2 = open(
                    PROJECT_NAME + '/html/gbk/' + str(ALLNUM) + '.html', 'wb')
                html_string = html_byte.decode(chardit1['encoding'],
                                               'ignore').encode(
                                                   'gbk', 'ignore')
                file2.write(html_string)
                file2.close()

        except Exception as e:
            print(str(e))
            queue.task_done()
            lock.release()
        else:
            append_link(url)
            ALLNUM = ALLNUM + 1
            Spider.crawl_page(threading.current_thread().name, url)
            queue.task_done()
            lock.release()
Example #27
def work():
    while True:
        url = queue.get()
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
Example #28
def work():
    global links_found
    #   While not reaching link limit
    while queue.qsize() < params.max_links:
        #   Grab the links from file
        queued_links = file_to_set(QUEUE_FILE)
        links_found = len(queued_links)

        #   Take a link and crawl it
        url = queue.get()
        start_crawl_time = time.time()
        Spider.crawl_page(threading.current_thread().name, url)
        end_crawl_time = time.time()

        #   Log the task time
        with open(params.projectName + '/timeLog.txt', 'a') as file:
            file.write(threading.current_thread().name + ": " +
                       str(round(end_crawl_time - start_crawl_time, 2)) +
                       ' seconds\n')

        #   Flag the queue that a task is done
        queue.task_done()
Example #29
 def work(self):
     while True:
         url = self.queue.get()
         Spider.crawl_page(threading.current_thread().name, url)
         self.queue.task_done()
Example #30
def work():
    while True:
        url = queue.get()
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
        print('calling work function')
Example #31
def work():
    while True:
        link = queue.get()
        Spider.crawl_page(threading.current_thread().name, link)
        queue.task_done()
Example #32
def work():
    while True:
        url = queue.get()
        print("work-------url:", url)
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
Example #33
def spider(domain, url, depth):
    """Crawler test."""
    spider_engine = Spider(domain)
    spider_engine.crawl_page([url], depth)
Example #34
def work():
    while True:
        url = q.get()
        Spider.crawl_page(threading.current_thread().name, url)
        q.task_done()
Example #35
def work():
    while True:
        url = queue.get()  # get url from queue.txt
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()  # end task for worker. It can now find more jobs, if available
Example #36
def work():
    while True:
        url = queue.get()
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
Example #37
def get_next(url_set):
    # Add newly discovered URLs to the waiting list, then crawl the most recent one
    for url in url_set:
        waiting_list.append(url)
    current_url = waiting_list.pop()
    Spider.crawl_page(current_url)