Example #1
    def request_uniq(self):
        if not self.is_data_loaded:
            self.load_data()

        # fill queue with only unique urls
        for i in self.get_unique_urls():
            self.queue.put({"host": self.host, "url": i})

        # add one 'None' sentinel per thread - each worker exits when it dequeues one
        for i in range(self.threads):
            self.queue.put(None)

        # start the threads
        for i in range(self.threads):
            w = RequesterThread(i, self.queue, self.cache, self.requested)
            w.daemon = True
            self.workers.append(w)
            w.start()

        # block until every queued item has been processed (workers must call task_done())
        self.queue.join()

        # drain the result queue to a list,
        # pairing each response with its matching item from self.get_all_items()
        result_list = []
        all_items = self.get_all_items()
        while not self.requested.empty():
            url, response = self.requested.get()
            for item in all_items:
                if item['url'] == url:
                    item["response"] = response
                    result_list.append(item)

        return result_list
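
Both examples hand the queue to a RequesterThread worker whose implementation is not part of this listing. The sketch below shows one way such a worker might look, assuming a standard producer/consumer pattern: each thread pulls a {"host", "url"} dict off the queue, fetches the url, pushes the (url, response) pair onto the shared result queue, and exits when it dequeues the None sentinel. Only the constructor signature is taken from the examples; the fetching logic and url joining are assumptions.

    import threading
    import urllib.request

    class RequesterThread(threading.Thread):
        # hypothetical sketch - only the constructor signature comes from the examples
        def __init__(self, thread_id, queue, cache, results):
            super().__init__()
            self.thread_id = thread_id
            self.queue = queue
            self.cache = cache
            self.results = results

        def run(self):
            while True:
                item = self.queue.get()

                # the 'None' sentinel tells the worker to stop
                if item is None:
                    self.queue.task_done()
                    break

                url = item["host"].rstrip("/") + "/" + item["url"].lstrip("/")
                try:
                    response = urllib.request.urlopen(url, timeout=5)
                    self.results.put((item["url"], response.read()))
                except OSError:
                    self.results.put((item["url"], None))

                # every get() must be matched by a task_done() call,
                # otherwise queue.join() in the caller never returns
                self.queue.task_done()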
Example #2
    def run(self):
        for req in self.cache.get_responses():
            # only scrape pages that can contain links/references
            # (responses without a content-type header are skipped)
            if 'text/html' in req.headers.get('content-type', ''):
                self.parser.feed(str(req.content))

                for i in self.parser.get_results():

                    # ensure that only resources located on the domain/sub-domain are requested
                    if i.startswith('http'):
                        parts = i.split('/')
                        host = parts[2]

                        # if the resource is outside of the domain, skip it
                        if host not in self.host.split('/')[2]:
                            continue

                        # else update the url so that it only contains the relative location
                        else:
                            i = '/'.join(parts[3:])

                    self.queue.put({"host": self.host, "url": i})

        # add one 'None' sentinel per thread - each worker exits when it dequeues one
        for i in range(self.threads):
            self.queue.put(None)

        # start the threads
        for i in range(self.threads):
            w = RequesterThread(i, self.queue, self.cache, self.results)
            w.daemon = True
            self.workers.append(w)
            w.start()

        self.queue.join()
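
The parser fed in run() is also external to these examples. Below is a minimal sketch of how it could be built on the standard library's html.parser, assuming it simply collects every href/src attribute it sees (the class name LinkParser is hypothetical; feed() and get_results() match the calls above):

    from html.parser import HTMLParser

    class LinkParser(HTMLParser):
        # hypothetical sketch of the parser that run() feeds with page content
        def __init__(self):
            super().__init__()
            self.results = set()

        def handle_starttag(self, tag, attrs):
            # keep any attribute value that can reference another resource
            for name, value in attrs:
                if name in ('href', 'src') and value:
                    self.results.add(value)

        def get_results(self):
            return list(self.results)

One caveat: because run() reuses self.parser for every cached response, a result set like the one above would accumulate links across pages; a real implementation would presumably clear it between feed() calls to avoid re-queuing urls from earlier responses.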