def compare_site_thread_csv(file, progress_var=None, step=100.0):
    """Compare every old/new site pair listed in a CSV file on one thread pool.

    Each usable row supplies the old url in its first column and the new url
    in its second column. Rows that are blank, have fewer than two columns,
    or have an empty url are skipped instead of aborting the whole run.

    Args:
        file: path of the CSV file to read.
        progress_var: optional progress object; it is only forwarded to
            compare_site_thread.
        step: total progress amount for this call; it is split evenly
            between the usable rows.

    Returns:
        None.
    """
    # read everything up front so the handle is closed (even on errors)
    # before any site is checked
    with open(file, 'r') as f:
        rows = [
            row for row in csv.reader(f)
            if len(row) >= 2 and row[0].strip() and row[1].strip()
        ]

    # nothing to compare; this also avoids dividing the step by zero
    if not rows:
        return

    # create the pool only once we know there is work to do
    if status["INTERFACE_MODE"]:
        thread_pool_csv = ThreadPool(settings["THREADPOOL_SIZE"])
    else:
        thread_pool_csv = ThreadPool(20)

    # calculate the step for each site
    site_step = step / len(rows)

    for row in rows:
        compare_site_thread(row[0], row[1], progress_var=progress_var,
                            step=site_step, thread_pool_csv=thread_pool_csv)

        # check program status
        if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
            # NOTE(review): the pool is not destroyed here because
            # compare_site_thread can already have destroyed it when it saw
            # the cancellation; confirm ThreadPool.destroy() is safe to call
            # twice before adding teardown on this path.
            return

    thread_pool_csv.wait_completion()
    thread_pool_csv.destroy()
def _normalize_site_url(url):
    """Return url stripped, without one trailing "/" and with a scheme."""
    url = url.strip()
    # remove the "/" at the end of the url; endswith() is safe on an empty
    # string, unlike indexing url[-1]
    if url.endswith("/"):
        url = url[:-1]
    # add "http://" before url; only a real scheme prefix counts, so a host
    # such as "httpbin.org" still gets one
    if not url.lower().startswith(("http://", "https://")):
        url = "http://" + url
    return url


def _advance_progress(progress_var, amount):
    """Add amount to progress_var when a progress object was supplied."""
    if progress_var:
        progress_var.set(progress_var.get() + amount)


def compare_site_thread(old_url, new_url, progress_var=None, step=100.0,
                        thread_pool_csv=None):
    """Queue the comparison of an old site against its new version.

    The homepage and every sitemap page (except those starting with "/home"
    or "/main") are added to a thread pool as tasks; blog pages, when both
    sites have a blog, are compared through compare_blog.

    Args:
        old_url: url of the old site; whitespace, one trailing "/" and a
            missing scheme are normalised.
        new_url: url of the new site, normalised the same way.
        progress_var: optional object with get()/set() used to report
            progress.
        step: total progress amount this call may add to progress_var.
        thread_pool_csv: shared pool supplied in multiple-sites mode. When it
            is not given, a private pool is created, waited on and destroyed
            by this function.

    Returns:
        True when the comparison was queued (and, in single site mode,
        completed); False when get_sites returned None; -1 when the new site
        shows a "Login" title; None when the check was cancelled or the site
        map could not be read.
    """
    # check program status
    if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
        return

    # checking multiple sites mode: reuse the caller's pool, otherwise this
    # call owns (and must clean up) its own pool
    owns_pool = not thread_pool_csv
    if owns_pool:
        thread_pool = ThreadPool(settings["THREADPOOL_SIZE"])
    else:
        thread_pool = thread_pool_csv

    create_path()

    old_url = _normalize_site_url(old_url)
    new_url = _normalize_site_url(new_url)

    # print out the information for old and new sites
    entry_print("-----------------------------------------------------", True)
    entry_print("Old URL: " + old_url, True)
    entry_print("New URL: " + new_url, True)
    entry_print("-----------------------------------------------------", True)

    _advance_progress(progress_var, step * 0.01)

    # check if the new site needs login
    new_test = get_soup(new_url)
    if new_test:
        title = new_test.find("title")
        if title and title.get_text().strip() == "Login":
            entry_print(
                "New site needs login. Please use login mode to check this site!\n", True)
            # no task was queued yet; do not leak a privately created pool
            if owns_pool:
                thread_pool.destroy()
            return -1

    _advance_progress(progress_var, step * 0.01)

    # get the subpages of old and new sites
    try:
        sites = get_sites(old_url)
    except AttributeError:
        entry_print(
            "Can't find the site map from " + old_url + ". Please check if the url is valid!", True)
        # a shared pool is still needed by the caller for the remaining
        # sites, so only a privately created pool is torn down here
        if owns_pool:
            thread_pool.destroy()
        return

    old_blog = get_blog_site(old_url)
    new_blog = get_blog_site(new_url)

    # check program status
    if status["INTERFACE_MODE"] and not status["CHECKING_STATUS"]:
        # the whole run is being cancelled, so the pool is torn down even
        # when it is the shared one
        thread_pool.destroy()
        return

    blog_exists = bool(old_blog and new_blog)

    # if urls for subpages are not found
    if sites is None:
        record_error(new_url, "sites")
        _advance_progress(progress_var, step)
        if owns_pool:
            thread_pool.destroy()
        return False

    # if blog page is not found
    if old_blog is not None and new_blog is None:
        record_error(new_url, "blog")
    elif old_blog is None and new_blog is not None:
        record_error(old_url, "blog")

    _advance_progress(progress_var, step * 0.02)

    # print out site information
    entry_print("Site Information: ", True)

    # calculate the step for each page; 4% of the step was used by the setup
    # above, and the blog (when present) gets half of what is left
    step *= 0.96
    if blog_exists:
        page_step = step / 2 / (len(sites) + 1)
        entry_print("Old Blog: " + old_blog, True)
        entry_print("New Blog: " + new_blog, True)
    else:
        page_step = step / (len(sites) + 1)

    entry_print("Number of non-blog pages: " + str(len(sites)), True)

    # check the homepage
    thread_pool.add_task(compare_homepage, old_url=old_url, new_url=new_url,
                         progress_var=progress_var, step=page_step)

    # check all the sites in sitemap
    for site in sites:
        # skip sitemap entries under "/home" or "/main"
        if site.startswith("/home") or site.startswith("/main"):
            continue
        thread_pool.add_task(compare_page, old_url=old_url + site,
                             new_url=new_url + site,
                             progress_var=progress_var, step=page_step)

    # check all the blog pages
    if blog_exists:
        old_blog_soup = get_soup(old_blog)
        new_blog_soup = get_soup(new_blog)
        compare_blog(old_blog_soup, new_blog_soup, old_blog, new_blog,
                     progress_var=progress_var, step=step / 2)

    # single site mode
    if owns_pool:
        thread_pool.wait_completion()
        thread_pool.destroy()

    entry_print("-----------------------------------------------------\n")

    return True