def get_all_comments(restaurants_url, pages_tracker=None, max_workers=64):
    # pages_tracker maps restaurant_url -> [total_pages, pages_scraped]; the
    # default is None rather than a shared mutable {}.
    restaurants_url_to_do_iterator = iter(restaurants_url)
    pages_comments = []
    pbar = tqdm(total=len(restaurants_url))
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {}
        # Prime the pool with at most max_workers (restaurant, page) tasks.
        for restaurant_url, page_number in itertools.islice(
            restaurants_url_to_do_iterator, max_workers
        ):
            futures_executor = executor.submit(
                get_page_comments, restaurant_url, page_number
            )
            futures[futures_executor] = restaurant_url
        while futures:
            done, _ = concurrent.futures.wait(
                futures, return_when=concurrent.futures.FIRST_COMPLETED
            )
            for future in done:
                pbar.update(1)
                restaurant_url = futures.pop(future)
                try:
                    comments = future.result()
                except Exception as exc:
                    tqdm.write(f"{restaurant_url} generated an exception: {exc}")
                else:
                    if pages_tracker:
                        # Once every page of a restaurant has been scraped,
                        # mark its URL as visited.
                        pages_tracker[restaurant_url][1] += 1
                        if (
                            pages_tracker[restaurant_url][1]
                            >= pages_tracker[restaurant_url][0]
                        ):
                            with DimnaDatabase(db_path, logger) as db:
                                db.update_page_visit_status(
                                    base_url, restaurant_url, True
                                )
                    pages_comments.append(comments)
                    with DimnaDatabase(db_path, logger) as db:
                        for comment, rating in comments["comments"]:
                            # Strip NUL bytes before writing to the database.
                            db.insert_rating(
                                base_url, comment.replace("\x00", ""), rating
                            )
            # Refill the window: one new task per completed future.
            for restaurant_url, page_number in itertools.islice(
                restaurants_url_to_do_iterator, len(done)
            ):
                futures_executor = executor.submit(
                    get_page_comments, restaurant_url, page_number
                )
                futures[futures_executor] = restaurant_url
    pbar.close()
    return pages_comments

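# A minimal, self-contained sketch of the sliding-window submission pattern
# used by get_all_comments and the other helpers in this module: keep at most
# max_workers futures in flight and submit one replacement task per completed
# future, so a large URL list is never queued all at once. Illustrative only;
# nothing in this module calls it, and `worker` stands in for any of the
# per-page scraping functions.
def _bounded_map_demo(items, worker, max_workers=4):
    items_iterator = iter(items)
    results = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Prime the window.
        futures = {
            executor.submit(worker, item): item
            for item in itertools.islice(items_iterator, max_workers)
        }
        while futures:
            done, _ = concurrent.futures.wait(
                futures, return_when=concurrent.futures.FIRST_COMPLETED
            )
            for future in done:
                futures.pop(future)
                results.append(future.result())
            # Refill with exactly as many new tasks as just finished.
            for item in itertools.islice(items_iterator, len(done)):
                futures[executor.submit(worker, item)] = item
    return results
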
def find_all_offers():
    with DimnaDatabase(db_path, logger) as db:
        old_offers = [row[1] for row in db.pages_url(base_url)]
    # One listing URL per (city, category) pair.
    urls = [
        f"https://{base_url}/{city}/{category}"
        for city in cities
        for category in categories
    ]
    with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor:
        future_to_url = {}
        futures = []
        for url in urls:
            futures_executor = executor.submit(find_offers, url=url)
            future_to_url[futures_executor] = url
            futures.append(futures_executor)
        for future in tqdm(
            concurrent.futures.as_completed(futures), total=len(futures)
        ):
            url = future_to_url[future]
            try:
                offers = future.result()
            except Exception as exc:
                tqdm.write(f"{url} generated an exception: {exc}")
            else:
                with DimnaDatabase(db_path, logger) as db:
                    # Record only pages we have not already stored.
                    for page in offers:
                        if page not in old_offers:
                            db.insert_single_page_url(base_url, page, False)
    with DimnaDatabase(db_path, logger) as db:
        db.insert_last_scrap_time(base_url, datetime.now())

def find_all_doctors_url(base_url, cities_url, max_workers=128):
    cities_url_iterator = iter(cities_url)
    pbar = tqdm(total=len(cities_url))
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {}
        # Prime the pool with at most max_workers cities.
        for city_url in itertools.islice(cities_url_iterator, max_workers):
            futures_executor = executor.submit(
                find_doctors_url, base_url=base_url, city_url=city_url
            )
            futures[futures_executor] = city_url
        while futures:
            done, _ = concurrent.futures.wait(
                futures, return_when=concurrent.futures.FIRST_COMPLETED
            )
            for future in done:
                pbar.update(1)
                city_url = futures.pop(future)
                try:
                    doctors_url = future.result()
                except Exception as exc:
                    tqdm.write(f"{city_url} generated an exception: {exc}")
                else:
                    with DimnaDatabase(db_path, logger) as db:
                        db.insert_all_pages_url(base_url, doctors_url)
            # Refill the window: one new city per completed future.
            for city_url in itertools.islice(cities_url_iterator, len(done)):
                futures_executor = executor.submit(
                    find_doctors_url, base_url=base_url, city_url=city_url
                )
                futures[futures_executor] = city_url
    pbar.close()

def scrap_all_comments(base_url, urls, max_workers=256):
    # Resume support: skip URLs already marked as visited in the database,
    # and start the progress bar at the number already done.
    urls_to_do = [url for (_, url, is_visited) in urls if not is_visited]
    urls_to_do_iterator = iter(urls_to_do)
    pbar = tqdm(initial=len(urls) - len(urls_to_do), total=len(urls))
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {}
        for url in itertools.islice(urls_to_do_iterator, max_workers):
            futures_executor = executor.submit(scrap_comments, url=url)
            futures[futures_executor] = url
        while futures:
            done, _ = concurrent.futures.wait(
                futures, return_when=concurrent.futures.FIRST_COMPLETED
            )
            for future in done:
                pbar.update(1)
                url = futures.pop(future)
                try:
                    comments = future.result()
                except Exception as exc:
                    tqdm.write(f"{url} generated an exception: {exc}")
                else:
                    with DimnaDatabase(db_path, logger) as db:
                        db.update_page_visit_status(base_url, url, True)
                        if comments:
                            db.insert_all_rating(base_url, comments)
            for url in itertools.islice(urls_to_do_iterator, len(done)):
                futures_executor = executor.submit(scrap_comments, url=url)
                futures[futures_executor] = url
    pbar.close()

def find_all_comments_pages(pages_url, max_workers=128):
    # Resume support: skip books already marked as visited.
    book_url_to_do = [
        book_url for (_, book_url, is_visited) in pages_url if not is_visited
    ]
    book_url_to_do_iterator = iter(book_url_to_do)
    pbar = tqdm(initial=len(pages_url) - len(book_url_to_do), total=len(pages_url))
    comments_url = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {}
        for book_url in itertools.islice(book_url_to_do_iterator, max_workers):
            book_id, book_name = book_url.split("/")[-2:]
            first_comment_url = f"{comments_base_url}/{book_id}/{book_name}.json"
            futures_executor = executor.submit(
                find_number_of_comments, comment_url=first_comment_url
            )
            futures[futures_executor] = book_url
        while futures:
            done, _ = concurrent.futures.wait(
                futures, return_when=concurrent.futures.FIRST_COMPLETED
            )
            for future in done:
                pbar.update(1)
                book_url = futures.pop(future)
                book_id, book_name = book_url.split("/")[-2:]
                try:
                    num_pages = future.result()
                except Exception as exc:
                    tqdm.write(f"{book_url} generated an exception: {exc}")
                else:
                    if num_pages:
                        # One paginated JSON endpoint per comments page.
                        for page in range(1, num_pages + 1):
                            comment_url = (
                                f"{comments_base_url}/{book_id}/{book_name}.json?p={page}"
                            )
                            comments_url.append([book_url, comment_url])
                    else:
                        # No comments at all: mark the book as done.
                        with DimnaDatabase(db_path, logger) as db:
                            db.update_page_visit_status(base_url, book_url, True)
            for book_url in itertools.islice(book_url_to_do_iterator, len(done)):
                book_id, book_name = book_url.split("/")[-2:]
                first_comment_url = f"{comments_base_url}/{book_id}/{book_name}.json"
                futures_executor = executor.submit(
                    find_number_of_comments, comment_url=first_comment_url
                )
                futures[futures_executor] = book_url
    pbar.close()
    return comments_url

def scrap_all_rattings(pages_url):
    visited_urls = []
    pages_url = [list(row) for row in pages_url]
    random.shuffle(pages_url)
    with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
        future_to_url = {}
        futures = []
        for _, url, is_visited in pages_url:
            if not is_visited:
                futures_executor = executor.submit(scrap_rattings, url=url)
                future_to_url[futures_executor] = url
                futures.append(futures_executor)
        for future in tqdm(
            concurrent.futures.as_completed(futures),
            initial=len(pages_url) - len(futures),
            total=len(pages_url),
        ):
            url = future_to_url[future]
            try:
                ratings = future.result()
            except Exception as exc:
                tqdm.write(f"{url} generated an exception: {exc}")
            else:
                if url not in visited_urls:
                    with DimnaDatabase(db_path, logger) as db:
                        db.update_page_visit_status(base_url, url, True)
                        for comment, rate in ratings:
                            # Collapse runs of repeated punctuation down to
                            # their final character, then normalize the text.
                            comment = re.sub(
                                r"[،؟\?\.\!]+(?=[،؟\?\.\!])", "", normalizer(comment)
                            )
                            db.insert_rating(base_url, comment, rate)
                    visited_urls.append(url)

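# A quick illustration of the punctuation-collapsing regex above (a sketch;
# it assumes `normalizer` leaves this input unchanged): a run of mixed
# Persian/Latin punctuation is reduced to its final character.
#
#     >>> re.sub(r"[،؟\?\.\!]+(?=[،؟\?\.\!])", "", "عالی بود!!!؟")
#     'عالی بود؟'
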
def scrap_all_comments(comments_url, max_workers=128):
    comments_url_iterator = iter(comments_url)
    pbar = tqdm(total=len(comments_url))
    # Honor the max_workers parameter instead of a hardcoded pool size.
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {}
        for book_url, comment_url in itertools.islice(
            comments_url_iterator, max_workers
        ):
            futures_executor = executor.submit(scrap_comments, comment_url=comment_url)
            futures[futures_executor] = book_url
        while futures:
            done, _ = concurrent.futures.wait(
                futures, return_when=concurrent.futures.FIRST_COMPLETED
            )
            for future in done:
                pbar.update(1)
                book_url = futures.pop(future)
                try:
                    comments = future.result()
                except Exception as exc:
                    tqdm.write(f"{book_url} generated an exception: {exc}")
                else:
                    with DimnaDatabase(db_path, logger) as db:
                        db.update_page_visit_status(base_url, book_url, True)
                        db.insert_all_rating(base_url, comments)
            for book_url, comment_url in itertools.islice(
                comments_url_iterator, len(done)
            ):
                futures_executor = executor.submit(
                    scrap_comments, comment_url=comment_url
                )
                futures[futures_executor] = book_url
    pbar.close()

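# Typical wiring of the two book helpers above (a sketch; pages_url rows are
# assumed to be (id, url, is_visited) tuples as read back from DimnaDatabase):
#
#     comments_url = find_all_comments_pages(pages_url)
#     scrap_all_comments(comments_url)
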
logfile_path = os.path.join(dir_path, "logs", f"{base_url}.log")
os.makedirs(os.path.dirname(logfile_path), exist_ok=True)
logger.basicConfig(
    level=logger.INFO,
    # handlers=[logger.FileHandler(logfile_path), logger.StreamHandler()],
    handlers=[logger.FileHandler(logfile_path)],
    format="%(asctime)s %(levelname)s %(message)s",
)

# Search for new URLs only if the last scrape is older than the cutoff
# (currently one day).
SEARCH_FOR_NEW_URLS = False
refresh_cutoff = datetime.now() - timedelta(days=1)
with DimnaDatabase(db_path, logger) as db:
    last_scrap_time = db.last_scrap_time(base_url)
    if last_scrap_time:
        if refresh_cutoff >= last_scrap_time:
            SEARCH_FOR_NEW_URLS = True
        else:
            print(f"Loading {base_url} pages from db🦁")
    else:
        SEARCH_FOR_NEW_URLS = True

if SEARCH_FOR_NEW_URLS:
    print(f"Finding all doctors on {base_url}🦦...")
    cities_url = find_cities_url(base_url)
    find_all_doctors_url(base_url, cities_url)
with DimnaDatabase(db_path, logger) as db: