Beispiel #1
0
def get_all_comments(restaurants_url, pages_tracker=None, max_workers=64):
    """Fetch comment pages for every (restaurant_url, page_number) pair.

    Keeps at most ``max_workers`` requests in flight, refilling the executor
    with one new job per completed future.

    Args:
        restaurants_url: Sized iterable of (restaurant_url, page_number) pairs.
        pages_tracker: Optional dict mapping restaurant_url ->
            [total_pages, pages_done]; when the done-counter reaches the total,
            the restaurant page is marked visited in the database.
        max_workers: Upper bound on concurrent requests.

    Returns:
        List of per-page comment payloads as returned by ``get_page_comments``.
    """
    # Fix: the original default was a shared mutable dict (pages_tracker={}),
    # which leaks state across calls. A None sentinel preserves the old
    # behavior exactly (an empty dict is falsy in the check below).
    if pages_tracker is None:
        pages_tracker = {}

    restaurants_url_to_do_iterator = iter(restaurants_url)
    pages_comments = []
    pbar = tqdm(total=len(restaurants_url))
    try:
        with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers
        ) as executor:
            futures = {}
            # Prime the pool with the first max_workers jobs.
            for restaurant_url, page_number in itertools.islice(
                restaurants_url_to_do_iterator, max_workers
            ):
                future = executor.submit(
                    get_page_comments, restaurant_url, page_number
                )
                futures[future] = restaurant_url
            while futures:
                done, _ = concurrent.futures.wait(
                    futures, return_when=concurrent.futures.FIRST_COMPLETED
                )
                for future in done:
                    pbar.update(1)
                    restaurant_url = futures.pop(future)
                    try:
                        comments = future.result()
                    except Exception as exc:
                        tqdm.write(
                            f"{restaurant_url} generated an exception: {exc}"
                        )
                    else:
                        if pages_tracker:
                            # One more page of this restaurant done; once all
                            # its pages are in, flag the URL as fully visited.
                            pages_tracker[restaurant_url][1] += 1
                            if (
                                pages_tracker[restaurant_url][1]
                                >= pages_tracker[restaurant_url][0]
                            ):
                                with DimnaDatabase(db_path, logger) as db:
                                    db.update_page_visit_status(
                                        base_url, restaurant_url, True,
                                    )

                        pages_comments.append(comments)

                        with DimnaDatabase(db_path, logger) as db:
                            for comment, rating in comments["comments"]:
                                # Strip embedded NUL bytes before inserting —
                                # presumably required by the DB text layer.
                                db.insert_rating(
                                    base_url, comment.replace("\x00", ""), rating
                                )

                # Refill the pool with as many new jobs as just finished.
                for restaurant_url, page_number in itertools.islice(
                    restaurants_url_to_do_iterator, len(done)
                ):
                    future = executor.submit(
                        get_page_comments, restaurant_url, page_number
                    )
                    futures[future] = restaurant_url
    finally:
        # Close the bar even if a worker raised, so the terminal isn't left
        # with a dangling progress line.
        pbar.close()
    return pages_comments
Beispiel #2
0
def find_all_offers():
    """Scan every city/category listing page and store newly found offers.

    Pages already known from a previous run (``db.pages_url``) are skipped;
    finally the scrape timestamp for ``base_url`` is recorded.
    """
    with DimnaDatabase(db_path, logger) as db:
        old_offers = [row[1] for row in db.pages_url(base_url)]

    # One listing URL per (city, category) combination.
    urls = [
        f"https://{base_url}/{city}/{category}"
        for city in cities
        for category in categories
    ]

    with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor:
        # Map each future back to the listing URL it was spawned for.
        future_to_url = {
            executor.submit(find_offers, url=url): url for url in urls
        }
        completed = concurrent.futures.as_completed(future_to_url)
        for future in tqdm(completed, total=len(future_to_url)):
            url = future_to_url[future]
            try:
                offers = future.result()
            except Exception as exc:
                tqdm.write(f"{url} generated an exception: {exc}")
            else:
                with DimnaDatabase(db_path, logger) as db:
                    for page in offers:
                        # Only insert pages we have not seen before.
                        if page not in old_offers:
                            db.insert_single_page_url(base_url, page, False)

    with DimnaDatabase(db_path, logger) as db:
        db.insert_last_scrap_time(base_url, datetime.now())
Beispiel #3
0
def find_all_doctors_url(base_url, cities_url, max_workers=128):
    """Collect doctor-page URLs for every city with bounded concurrency.

    At most ``max_workers`` requests run at once; each completed future is
    replaced by a new submission until the city iterator is exhausted.
    """
    pending = iter(cities_url)
    progress = tqdm(total=len(cities_url))

    def _submit_batch(executor, batch):
        # Returns {future: city_url} for each city in the batch.
        return {
            executor.submit(find_doctors_url, base_url=base_url,
                            city_url=city_url): city_url
            for city_url in batch
        }

    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers) as executor:
        in_flight = _submit_batch(
            executor, itertools.islice(pending, max_workers))
        while in_flight:
            done, _ = concurrent.futures.wait(
                in_flight, return_when=concurrent.futures.FIRST_COMPLETED)
            for future in done:
                progress.update(1)
                city_url = in_flight.pop(future)
                try:
                    doctors_url = future.result()
                except Exception as exc:
                    tqdm.write(f"{city_url} generated an exception: {exc}")
                else:
                    with DimnaDatabase(db_path, logger) as db:
                        db.insert_all_pages_url(base_url, doctors_url)
            # Top the pool back up with one job per completed future.
            in_flight.update(
                _submit_batch(executor, itertools.islice(pending, len(done))))
    progress.close()
Beispiel #4
0
def scrap_all_comments(base_url, urls, max_workers=256):
    """Scrape comments for every not-yet-visited URL with bounded concurrency.

    ``urls`` rows are (id, url, is_visited); visited rows only advance the
    progress bar's initial position. Each successful scrape marks the URL
    visited and stores any comments found.
    """
    remaining = [url for (_, url, is_visited) in urls if not is_visited]
    remaining_iter = iter(remaining)

    progress = tqdm(initial=len(urls) - len(remaining), total=len(urls))
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Prime the pool, mapping each future to its URL.
        in_flight = {
            executor.submit(scrap_comments, url=url): url
            for url in itertools.islice(remaining_iter, max_workers)
        }
        while in_flight:
            done, _ = concurrent.futures.wait(
                in_flight, return_when=concurrent.futures.FIRST_COMPLETED
            )
            for future in done:
                progress.update(1)
                url = in_flight.pop(future)
                try:
                    comments = future.result()
                except Exception as exc:
                    tqdm.write(f"{url} generated an exception: {exc}")
                else:
                    with DimnaDatabase(db_path, logger) as db:
                        db.update_page_visit_status(
                            base_url, url, True,
                        )
                        if comments:
                            db.insert_all_rating(base_url, comments)
            # Replace each finished job with a fresh one.
            for url in itertools.islice(remaining_iter, len(done)):
                in_flight[executor.submit(scrap_comments, url=url)] = url
    progress.close()
Beispiel #5
0
def find_all_comments_pages(pages_url, max_workers=128):
    """Probe each unvisited book for its comment-page count.

    Returns a list of [book_url, comment_url] pairs, one per paginated
    comments page. Books with zero comment pages are marked visited
    immediately since there is nothing left to scrape for them.
    """
    todo = [
        book_url for (_, book_url, is_visited) in pages_url if not is_visited
    ]
    todo_iter = iter(todo)
    progress = tqdm(initial=len(pages_url) - len(todo), total=len(pages_url))
    comments_url = []

    def _first_page_url(book_url):
        # The unpaginated JSON endpoint doubles as the page-count probe.
        book_id, book_name = book_url.split("/")[-2:]
        return f"{comments_base_url}/{book_id}/{book_name}.json"

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        in_flight = {}
        for book_url in itertools.islice(todo_iter, max_workers):
            future = executor.submit(
                find_number_of_comments, comment_url=_first_page_url(book_url)
            )
            in_flight[future] = book_url
        while in_flight:
            done, _ = concurrent.futures.wait(
                in_flight, return_when=concurrent.futures.FIRST_COMPLETED
            )
            for future in done:
                progress.update(1)
                book_url = in_flight.pop(future)
                book_id, book_name = book_url.split("/")[-2:]
                try:
                    num_pages = future.result()
                except Exception as exc:
                    tqdm.write(f"{book_url} generated an exception: {exc}")
                else:
                    if num_pages:
                        # One URL per comments page, 1-indexed.
                        for page in range(1, num_pages + 1):
                            comment_url = f"{comments_base_url}/{book_id}/{book_name}.json?p={page}"
                            comments_url.append([book_url, comment_url])
                    else:
                        # No comment pages at all: nothing more to fetch.
                        with DimnaDatabase(db_path, logger) as db:
                            db.update_page_visit_status(
                                base_url, book_url, True,
                            )
            # Backfill the pool with one probe per completed future.
            for book_url in itertools.islice(todo_iter, len(done)):
                future = executor.submit(
                    find_number_of_comments,
                    comment_url=_first_page_url(book_url),
                )
                in_flight[future] = book_url
    progress.close()
    return comments_url
Beispiel #6
0
def scrap_all_rattings(pages_url):
    """Scrape ratings for every unvisited page and persist them.

    ``pages_url`` rows are (id, url, is_visited). Rows are shuffled so
    repeated runs hit the site in a different order; each stored comment is
    normalized and stripped of repeated punctuation first.
    """
    visited_urls = []

    pages_url = [list(row) for row in pages_url]
    random.shuffle(pages_url)

    with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
        # One future per unvisited row, mapped back to its URL.
        future_to_url = {
            executor.submit(scrap_rattings, url=url): url
            for _, url, is_visited in pages_url
            if not is_visited
        }
        completed = tqdm(
            concurrent.futures.as_completed(future_to_url),
            initial=len(pages_url) - len(future_to_url),
            total=len(pages_url),
        )
        for future in completed:
            url = future_to_url[future]
            try:
                ratings = future.result()
            except Exception as exc:
                tqdm.write(f"{url} generated an exception: {exc}")
            else:
                # Guard against storing the same URL's ratings twice.
                if url in visited_urls:
                    continue
                with DimnaDatabase(db_path, logger) as db:
                    db.update_page_visit_status(
                        base_url,
                        url,
                        True,
                    )
                    for comment, rate in ratings:
                        # Normalize, then collapse runs of repeated
                        # punctuation (Arabic and Latin) down to one mark.
                        cleaned = re.sub(r"[،؟\?\.\!]+(?=[،؟\?\.\!])", "",
                                         normalizer(comment))
                        db.insert_rating(
                            base_url,
                            cleaned,
                            rate,
                        )
                visited_urls.append(url)
Beispiel #7
0
def scrap_all_comments(comments_url, max_workers=128):
    """Scrape every comments page and persist the ratings found.

    Args:
        comments_url: Sized iterable of (book_url, comment_url) pairs.
        max_workers: Upper bound on concurrent requests.

    Each successful scrape marks the book's page visited and bulk-inserts
    the scraped ratings.
    """
    comments_url_iterator = iter(comments_url)
    pbar = tqdm(total=len(comments_url))
    # Fix: the pool size was hard-coded to 128, silently ignoring the
    # max_workers parameter; honor the parameter so callers can tune it.
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:

        futures = {}
        # Prime the pool with the first max_workers jobs.
        for book_url, comment_url in itertools.islice(
            comments_url_iterator, max_workers
        ):
            futures_executor = executor.submit(scrap_comments, comment_url=comment_url)
            futures.update({futures_executor: book_url})
        while futures:
            done, _ = concurrent.futures.wait(
                futures, return_when=concurrent.futures.FIRST_COMPLETED
            )
            for future in done:
                pbar.update(1)
                book_url = futures[future]
                futures.pop(future)
                try:
                    comments = future.result()
                except Exception as exc:
                    tqdm.write(f"{book_url} generated an exception: {exc}")
                else:
                    with DimnaDatabase(db_path, logger) as db:
                        db.update_page_visit_status(
                            base_url, book_url, True,
                        )
                        db.insert_all_rating(base_url, comments)
            # Refill the pool with one new job per completed future.
            for book_url, comment_url in itertools.islice(
                comments_url_iterator, len(done)
            ):
                futures_executor = executor.submit(
                    scrap_comments, comment_url=comment_url
                )
                futures.update({futures_executor: book_url})
    pbar.close()
Beispiel #8
0
    # Per-site log file: logs/<base_url>.log under the script directory.
    logfile_path = os.path.join(dir_path, "logs", f"{base_url}.log")
    if not os.path.exists(os.path.dirname(logfile_path)):
        os.mkdir(os.path.dirname(logfile_path))

    # NOTE(review): `logger` is used like the stdlib logging module here
    # (basicConfig / INFO / FileHandler) — confirm it is `import logging as logger`.
    logger.basicConfig(
        level=logger.INFO,
        # handlers=[logger.FileHandler(logfile_path), logger.StreamHandler()],
        handlers=[logger.FileHandler(logfile_path)],
        format="%(asctime)s %(levelname)s %(message)s",
    )

    SEARCH_FOR_NEW_URLS = False

    # NOTE(review): named "last_week" but computed as 1 day ago —
    # confirm whether days=1 or days=7 is intended.
    last_week = datetime.now() - timedelta(days=1)

    with DimnaDatabase(db_path, logger) as db:
        last_scrap_time = db.last_scrap_time(base_url)

    # Re-scrape when the last scrape is older than the threshold
    # (or when no scrape has ever been recorded).
    if last_scrap_time:
        if last_week >= last_scrap_time:
            SEARCH_FOR_NEW_URLS = True
        else:
            print(f"Loading {base_url} pages from db🦁")
    else:
        SEARCH_FOR_NEW_URLS = True

    if SEARCH_FOR_NEW_URLS:
        print(f"Finding all doctors on {base_url}🦦...")
        cities_url = find_cities_url(base_url)
        find_all_doctors_url(base_url, cities_url)
        with DimnaDatabase(db_path, logger) as db: