class Crawler(ttk.Frame):
    """Tkinter front-end for a multi-threaded web crawler.

    Builds a small form (base URL + worker count), then drives a pool of
    daemon worker threads that consume URLs from a queue while live counters
    (queue / crawled / pdf / webpages / media / errors) are refreshed from a
    Spider instance every 500 ms.
    """

    def __init__(self, parent, *args, **kwargs):
        """Create the frame, the Tk variables backing the UI, and the grid.

        :param parent: the Tk root (also stored as ``self.root`` so the
            window title can be set from ``init_gui``).
        """
        ttk.Frame.__init__(self, parent, *args, **kwargs)
        self.root = parent
        # Live counters; each is bound to a Label built in init_gui().
        self.queue_count = tkinter.IntVar()
        self.crawled_count = tkinter.IntVar()
        self.pdf_count = tkinter.IntVar()
        self.webpage_count = tkinter.IntVar()
        self.media_count = tkinter.IntVar()
        self.error_count = tkinter.IntVar()
        self.message = tkinter.StringVar(value="")   # status line text
        self.PATH = tkinter.StringVar(value="")      # logs path shown in UI
        self.done = False  # flipped by crawl() once the queue file drains
        self.n = 0         # animation counter for the "Running..." dots
        self.init_gui()

    def init_gui(self):
        """Lay out all widgets: inputs, start button, counters, status."""
        self.root.title("Crawler")
        self.grid(column=0, row=0, sticky='nsew')
        self.url = ttk.Entry(self, width=30)
        self.url.grid(column=1, row=1, sticky="w")
        self.num_crawlers = ttk.Entry(self, width=5)
        self.num_crawlers.insert(0, "16")  # default worker-thread count
        # FIX: original passed stick= (worked only via Tk's option-prefix
        # abbreviation); use the documented sticky= option.
        self.num_crawlers.grid(column=1, row=2, sticky="w")
        ttk.Label(self, text='Crawler').grid(column=0, row=0, columnspan=4)
        ttk.Label(self, text='Base URL').grid(column=0, row=1, sticky="ew")
        ttk.Label(self, text='Number of crawlers').grid(
            column=0, row=2, sticky="ew")
        self.start_button = ttk.Button(
            self, text='Start Crawl', command=self.start)
        self.start_button.grid(column=0, row=3, columnspan=4)
        ttk.Separator(self, orient='horizontal').grid(
            column=0, row=4, columnspan=4, sticky='ew')
        # Counter rows: static caption in column 0, live value in column 1.
        ttk.Label(self, text='Queue').grid(column=0, row=5)
        ttk.Label(self, textvariable=self.queue_count).grid(column=1, row=5)
        ttk.Label(self, text='Crawled').grid(column=0, row=6)
        ttk.Label(self, textvariable=self.crawled_count).grid(column=1, row=6)
        ttk.Label(self, text='PDF').grid(column=0, row=7)
        ttk.Label(self, textvariable=self.pdf_count).grid(column=1, row=7)
        ttk.Label(self, text='Webpages').grid(column=0, row=8)
        ttk.Label(self, textvariable=self.webpage_count).grid(column=1, row=8)
        ttk.Label(self, text='Media').grid(column=0, row=9)
        ttk.Label(self, textvariable=self.media_count).grid(column=1, row=9)
        ttk.Label(self, text='Errors').grid(column=0, row=10)
        ttk.Label(self, textvariable=self.error_count).grid(column=1, row=10)
        ttk.Separator(self, orient='horizontal').grid(
            column=0, row=11, columnspan=4, sticky='ew')
        ttk.Label(self, text='Logs path').grid(column=0, row=12)
        ttk.Label(self, textvariable=self.PATH).grid(
            column=1, row=12, sticky="w")
        ttk.Label(self, textvariable=self.message).grid(
            column=0, row=14, columnspan=4, sticky="ew")
        for child in self.winfo_children():
            child.grid_configure(padx=5, pady=5)

    def start(self):
        """Read the form, derive project paths, spin up workers and crawl."""
        self.HOMEPAGE = self.url.get()
        if '://' not in self.HOMEPAGE:
            # Bare host entered; assume plain HTTP with a trailing slash.
            self.HOMEPAGE = "http://" + self.HOMEPAGE + '/'
        self.DOMAIN_NAME = get_domain_name(self.HOMEPAGE)
        # Project name is the domain up to the first dot. split('.') also
        # tolerates a dot-less domain, where the original index('.') raised
        # ValueError; for dotted domains the result is identical.
        self.PROJECT_NAME = self.DOMAIN_NAME.split('.')[0]
        self.PATH.set(os.path.join(
            os.path.dirname(os.path.realpath(__file__)), self.PROJECT_NAME))
        # NOTE(review): these relative, slash-joined paths match whatever
        # layout Spider writes — keep in sync with the Spider class.
        self.QUEUE_FILE = 'projects/' + self.PROJECT_NAME + '/queue.txt'
        self.CRAWLED_FILE = 'projects/' + self.PROJECT_NAME + '/crawled.txt'
        self.SUMMARY_FILE = 'projects/' + self.PROJECT_NAME + '/summary.txt'
        self.MEDIA_FILE = 'projects/' + self.PROJECT_NAME + '/media.txt'
        self.NUMBER_OF_THREADS = int(self.num_crawlers.get())
        self.queue = Queue()
        self.spider = Spider(self.PROJECT_NAME, self.HOMEPAGE,
                             self.DOMAIN_NAME)
        self.create_workers()
        # A crawl cannot be restarted from this window; remove the button.
        self.start_button.destroy()
        clock = threading.Thread(target=self.clock)
        crawl = threading.Thread(target=self.crawl)
        clock.start()
        crawl.start()

    def create_workers(self):
        """Start NUMBER_OF_THREADS daemon workers (die when main exits)."""
        for _ in range(self.NUMBER_OF_THREADS):
            t = threading.Thread(target=self.work)
            t.daemon = True
            t.start()

    def work(self):
        """Worker loop: crawl the next queued URL, forever."""
        while True:
            url = self.queue.get()
            Spider.crawl_page(threading.current_thread().name, url)
            self.queue.task_done()

    def update_nums(self):
        """Copy the spider's current statistics into the Tk counters."""
        data = self.spider.data()
        self.queue_count.set(data['queue'])
        self.crawled_count.set(data['crawled'])
        self.pdf_count.set(data['pdf'])
        self.webpage_count.set(data['webpage'])
        self.media_count.set(data['media'])
        self.error_count.set(data['error'])
        self.update_idletasks()

    def clock(self):
        """Refresh the status line every 500 ms until the crawl finishes.

        NOTE(review): the first call runs on a worker thread (started in
        ``start``); subsequent calls run on the Tk event loop via ``after``.
        Touching Tk variables from a non-main thread is not guaranteed safe
        — confirm before relying on this on all platforms.
        """
        # Cycle n through 1..3 to animate the trailing dots.
        self.n += 1
        if self.n > 3:
            self.n = 1
        dots = "." * self.n
        if self.done:
            self.message.set("Complete!")
            self.update_nums()
            self.clean_files()
            # FIX: the original rescheduled unconditionally, re-running
            # clean_files() every 500 ms forever after completion.
            return
        self.message.set("Running" + dots)
        self.update_nums()
        self.after(500, self.clock)

    def clean_files(self):
        """Make the output files readable once the crawl is complete."""
        make_file_readable(self.CRAWLED_FILE)
        make_file_readable(self.MEDIA_FILE)

    def crawl(self):
        """Feed queued links to the workers until the queue file is empty."""
        queued_links = file_to_set(self.QUEUE_FILE)
        while len(queued_links) > 0:
            # Re-read each pass: workers append newly discovered links.
            queued_links = file_to_set(self.QUEUE_FILE)
            for link in queued_links:
                self.queue.put(link)
                print(link)
            self.queue.join()
            print(str(len(queued_links)) + ' links in the queue')
            if len(queued_links) == 0:
                self.done = True