Example #1
def work(k):
    """Here the threads do the main job: each thread takes one URL from the
    queue, crawls the page, and then marks the item as done."""
    for i in range(k):
        url = fifo_queue.get()
        Spider.crawl_page(threading.current_thread().name, url)
        fifo_queue.task_done()
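A minimal sketch of the harness that snippets like this usually assume: a shared queue plus a pool of daemon worker threads. The fifo_queue and Spider names follow Example #1; create_workers and NUMBER_OF_THREADS are illustrative additions, not part of the original example.

import threading
from queue import Queue

fifo_queue = Queue()
NUMBER_OF_THREADS = 8

def create_workers():
    # Daemon threads exit automatically when the main thread exits
    for _ in range(NUMBER_OF_THREADS):
        t = threading.Thread(target=work, args=(10,))  # work(k) as defined in Example #1
        t.daemon = True
        t.start()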
Example #2
def go():
    """Get link from queue and put it to Spider"""

    while True:
        url = queue.get()
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
Example #3
def work():
    while True:
        item = thread_queue.get()
        url = item['url']
        distance = item['distance']
        Spider.crawl_page(threading.current_thread().name, url, distance)
        thread_queue.task_done()
Example #4
def doCrawling():
    while True:
        # Remove the next URL from the queue
        url = queue.get()
        # Use the name of the current thread to see what is going on
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
Example #5
def work(self):
    while True:
        url = self.queue.get()
        # self.output.append(threading.current_thread().name + ' now crawling ' + url)
        # self.output.append('Queue: ' + str(len(Spider.queue)) + ' | Crawled: ' + str(len(Spider.crawled)))
        Spider.crawl_page(threading.current_thread().name, url)
        self.queue.task_done()
Example #6
def work(Num=30):
    print("Work")
    while Num > 0:
        link = QUEUE.get()
        Spider.crawl_page(threading.current_thread().name, link)
        QUEUE.task_done()
        Num -= 1
Example #7
def work():
    while True:
        url = queue.get()
        if url is None:
            # Mark the sentinel as done so queue.join() does not hang
            queue.task_done()
            break
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
Example #8
def work():
    while True:
        item = thread_queue.get()
        url = item['url']
        distance = item['distance']
        Spider.crawl_page(threading.current_thread().name, url, distance)
        thread_queue.task_done()
Example #9
def work():
    print('main.py/work()')
    while True:
        url = queue.get()
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
    print('main.py/work()/end')
Example #10
def work():
    while True:
        url = queue.get()
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
        if queue.empty():
            print("end of process......")
            break
Example #11
def work():
    max = 10
    i = 0
    while i < max:
        url = queue.get()
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
        i += 1
Example #12
def work():
    """
    Do next job in queue
    """
    while True:
        url = queue.get()
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
Example #13
def work():
    while True:
        url = queue.get()
        try:
            Spider.crawl_page(threading.current_thread().name, url)
        except Exception as e:
            print('Error in',
                  threading.current_thread().name, 'crawling', url, '\n\t', e)
        finally:
            # Always mark the task as done so queue.join() cannot hang
            queue.task_done()
Example #14
def work():
    """
    do the next job in the queue
    """
    while True:
        url = queue.get()
        # Have the Spider crawl the URL, tagged with the current thread's name
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
Example #15
def work():
    count = 0
    while True:
        url = queue.get()
        Spider.crawl_page(threading.current_thread().name, url)
        count += 1
        if count % 100 == 0:
            Spider.update_files()
        queue.task_done()
Example #16
def work():
    # As long as work is not done
    while True:
        # Get the next URL to be parsed
        url = queue.get()
        # Crawl the webpage at that url
        Spider.crawl_page(threading.current_thread().name, url)
        # Mark the task as done
        queue.task_done()
Example #17
def work():
    while True:
        url = queue.get()
        if url == '':
            print('URL not found')
        else:
            spider = Spider(url)
            spider.crawl_page(threading.current_thread().name)
            print(spider.socialLinks())
        queue.task_done()
Example #18
def work():
    while True:
        time.sleep(SLEEP_TIME)
        url = queue.get()
        print(threading.current_thread().name + ' now crawling ' + url)
        print('Queue ' + str(len(Spider.queue)) + ' | Crawled ' + str(len(Spider.crawled)))
        Spider.crawl_page(threading.current_thread().name, url)
        print('Crawled web' + str(Spider.file_order))
        Spider.file_order += 1
        queue.task_done()
Example #19
def work():

    while True:
        url = queue.get()
        # tup = (url, )
        # print(tup)  # this would print every queued website

        print(url, "main.work()")
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
Example #20
def create_jobs():
    logging.error("In Create Jobs")
    for link in file_to_set(QUEUE_FILE):
        logging.error("Link {}".format(link))
        Spider.crawl_page(link, link)
        queue.put(link)

    queue.join()
    logging.error("After Queue Join")
    logging.error(queue)
    crawl()
Example #21
def work():
    global total_retrieved_pages
    while True:
        print("retrieved pages: " + str(total_retrieved_pages))
        url = queue.get()
        if total_retrieved_pages < max_retrieved_pages:
            Spider.crawl_page(threading.current_thread().name, url)
        else:
            Spider.crawl_page_graph(threading.current_thread().name, url)
        print("Crawling task by thread is done")
        queue.task_done()
Example #22
def work():
    global working
    while working:
        url = queue.get()

        # if queue.empty():
        #     working = False
        #     sys.exit()
        #print("###############################################################\n###############################################################\n###############################################################\n")

        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
Example #23
def do_task(url):
    content_type = ''
    try:
        resp = requests.head(url)
        # checking if the link is a webpage or a file
        content_type = resp.headers['Content-Type']

        if 'text/html' in content_type:
            Spider.crawl_page(threading.current_thread().name, url)
        else:
            # the link contains a file
            Spider.crawl_file(threading.current_thread().name, url,
                              content_type)

    except Exception:
        # Ignore links that cannot be reached or parsed
        pass
Example #24
def work():
    while True:
        url = queue.get()
        table_name = 'url_title_rel'
        title = Spider.crawl_page(threading.current_thread().name, url, DB_FILE_PATH, table_name)
        # print(title)
        queue.task_done()
Example #25
class Detected:
    PROJECT_NAME = ''
    HOMEPAGE = ''
    DOMAIN_NAME = ''
    QUEUE_FILE = ''
    CRAWLED_FILE = ''
    NUMBER_OF_THREADS = 8
    queue = Queue()
    spi = None

    def __init__(self, project_name, homepage):
        self.PROJECT_NAME = project_name
        self.HOMEPAGE = homepage

        self.DOMAIN_NAME = get_domain_name(self.HOMEPAGE)
        self.QUEUE_FILE = self.PROJECT_NAME + '/queue.txt'
        self.CRAWLED_FILE = self.PROJECT_NAME + '/crawled.txt'

        self.spi = Spider(self.PROJECT_NAME, self.HOMEPAGE, self.DOMAIN_NAME)

    # Create worker threads (will die when main exits)
    def create_workers(self):
        for _ in range(self.NUMBER_OF_THREADS):
            t = threading.Thread(target=self.work)
            t.daemon = True
            t.start()

    # Do the next job in the queue
    def work(self):
        while True:
            url = self.queue.get()
            self.spi.crawl_page(threading.current_thread().name, url)
            self.queue.task_done()

    # Each queued link is a new job
    def create_jobs(self):
        for link in file_to_set(self.QUEUE_FILE):
            self.queue.put(link)
        self.queue.join()
        self.crawl()

    # Check if there are items in the queue, if so crawl them
    def crawl(self):
        queued_links = file_to_set(self.QUEUE_FILE)
        if len(queued_links) > 0:
            print(str(len(queued_links)) + ' links in the queue')
            self.create_jobs()
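A short usage sketch for the Detected class above, assuming get_domain_name, file_to_set and Spider are importable from the surrounding project (the project name and homepage here are placeholders):

if __name__ == '__main__':
    detector = Detected('example_project', 'https://example.com/')
    detector.create_workers()  # start the daemon worker threads
    detector.crawl()           # crawl as long as queue.txt still has links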
Example #26
def work():
    while True:
        url = queue.get()
        global ALLNUM
        try:
            if (lock.acquire()):
                response = urlopen(url, timeout=3)
                childrenlink = Spider.gather_links(url)
                childrenfile = open(
                    PROJECT_NAME + '/childrenlink/' + str(ALLNUM) + '.txt',
                    'w')
                childrenfile.write(url + '\n')
                for each_child in childrenlink:
                    if ('javascript' not in each_child):
                        childrenfile.write(each_child + '\n')
                childrenfile.close()

                # write the HTML file using UTF-8 encoding
                html_byte = response.read()
                chardit1 = chardet.detect(html_byte)
                file1 = open(
                    PROJECT_NAME + '/html/utf8/' + str(ALLNUM) + '.html', 'wb')
                html_string = html_byte.decode(
                    chardit1['encoding']).encode('utf-8')
                file1.write(html_string)
                file1.close()

                # also write a GBK-encoded copy for smj
                file2 = open(
                    PROJECT_NAME + '/html/gbk/' + str(ALLNUM) + '.html', 'wb')
                html_string = html_byte.decode(chardit1['encoding'],
                                               'ignore').encode(
                                                   'gbk', 'ignore')
                file2.write(html_string)
                file2.close()

        except Exception as e:
            print(str(e))
            queue.task_done()
            lock.release()
        else:
            append_link(url)
            ALLNUM = ALLNUM + 1
            Spider.crawl_page(threading.current_thread().name, url)
            queue.task_done()
            lock.release()
Example #27
def work():
    while True:
        url = queue.get()
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
Example #28
def work():
    global links_found
    #   While not reaching link limit
    while queue.qsize() < params.max_links:
        #   Grab the links from file
        queued_links = file_to_set(QUEUE_FILE)
        links_found = len(queued_links)

        #   Take a link and crawl it
        url = queue.get()
        start_crawl_time = time.time()
        Spider.crawl_page(threading.current_thread().name, url)
        end_crawl_time = time.time()

        #   Log the task time
        with open(params.projectName + '/timeLog.txt', 'a') as file:
            file.write(threading.current_thread().name + ": " +
                       str(round(end_crawl_time - start_crawl_time, 2)) +
                       ' seconds\n')

        #   Flag the queue that a task is done
        queue.task_done()
Example #29
def work(self):
    while True:
        url = self.queue.get()
        Spider.crawl_page(threading.current_thread().name, url)
        self.queue.task_done()
Example #30
def work():
    while True:
        url = queue.get()
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
        print('calling work function')
Example #31
def work():
    while True:
        link = queue.get()
        Spider.crawl_page(threading.current_thread().name, link)
        queue.task_done()
Example #32
def work():
    while True:
        url = queue.get()
        print("work-------url:", url)
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
Example #33
def spider(domain, url, depth):
    """Crawler test"""
    spider_engine = Spider(domain)
    spider_engine.crawl_page([url], depth)
Example #34
def work():
    while True:
        url = q.get()
        Spider.crawl_page(threading.current_thread().name, url)
        q.task_done()
Example #35
def work():
    while True:
        url = queue.get()  # get url from queue.txt
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()  # end task for worker. It can now find more jobs, if available
Example #36
def work():
    while True:
        url = queue.get()
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
Example #37
def get_next(url_set):
    # Add the newly found URLs to the waiting list
    for url in url_set:
        waiting_list.append(url)
    # Take the most recently added URL and crawl it
    current_url = waiting_list.pop()
    Spider.crawl_page(current_url)