コード例 #1
0
def get_all_catalog_urls(catalog_url=BASE_CATALOG_URL):
    """
    Scrapes all catalog pages by following the "next page" links, starting
    from the given catalog page.

    The result is cached on disk: when ``static/catalog_urls.pickle`` already
    exists under ROOT_DIR, its content is returned and nothing is scraped.

    :param catalog_url: url of the first catalog page
    :return: list of urls, beginning with ``catalog_url``
    """
    filepath = os.path.join(ROOT_DIR, "static/catalog_urls.pickle")
    if file_exists(filepath):
        return read_pickled_object(filepath)

    print("Starting catalog page scraping!\n")
    print("-------------------------\n")
    page_list = [catalog_url]

    # Fetch the page that was actually requested. The previous code called
    # get_html() without an argument, so a non-default catalog_url was put
    # into the result list but never used as the crawl's starting point.
    current_html = get_html(catalog_url)
    nextpage = BeautifulSoup(current_html, 'lxml').find(
        'a', class_='paging-next')
    while nextpage:
        href = nextpage.get('href')
        if not href:
            # A "next" anchor without a target cannot be followed; stop here
            # rather than recording and requesting a bogus url.
            break
        new_page = f'{BASE_URL}{href}'
        print(f"found new catalog page {new_page}")
        page_list.append(new_page)

        current_html = get_html(new_page)
        nextpage = BeautifulSoup(current_html, 'lxml').find(
            'a', class_='paging-next')

    pickle_object(page_list, filepath)
    print(f"Written catalog pages to {filepath}!")
    return page_list
コード例 #2
0
def get_hotel_review_pages(catalog_url_list=None):
    """
    Scrapes all hotel review pages from a list of catalog pages.

    The result is cached on disk: when ``static/hotel_review_urls.pickle``
    already exists under ROOT_DIR, its content is returned and nothing is
    scraped.

    :param catalog_url_list: list of catalog page urls; when omitted (None)
        the list is obtained from get_all_catalog_urls() at call time
    :return: list of urls
    """
    filepath = os.path.join(ROOT_DIR, "static/hotel_review_urls.pickle")
    if file_exists(filepath):
        return read_pickled_object(filepath)

    # Resolve the default lazily. A call expression in the signature is
    # evaluated once, when the module is imported, which made importing this
    # file start the catalog scrape (or read its cache) even if the result
    # was never needed.
    if catalog_url_list is None:
        catalog_url_list = get_all_catalog_urls()

    print("Starting hotel review page scraping!")
    print("-------------------------\n")
    hotel_review_page_list = []
    for catalog_url in catalog_url_list:
        print(f"Scraping hotel links from {catalog_url}")
        catalog_soup = BeautifulSoup(get_html(catalog_url), 'lxml')
        hotel_links = catalog_soup.find_all(
            'a', class_='hotel_name_link url')
        for link in hotel_links:
            # Point the link at the reviews tab instead of the hotel template
            # anchor, then prefix the site root and drop embedded newlines.
            href = link.get('href').replace("#hotelTmpl", "#tab-reviews")
            hotel_review_page_list.append(
                f"{BASE_URL}{href}".replace("\n", ""))

    pickle_object(hotel_review_page_list, filepath)
    print(f"Written hotel review pages to {filepath}!")
    return hotel_review_page_list
コード例 #3
0
def df_to_geojson(df):
    """
    Serialises the hotels dataframe as a GeoJSON feature collection on disk.

    Nothing is written when ``static/all_hotels.geojson`` already exists
    under ROOT_DIR.

    :param df: dataframe of distinct hotels; every row must provide ``lng``,
        ``lat``, ``count``, ``_id``, ``Hotel_Address``, ``Average_Score`` and
        ``Hotel_Name``
    """
    target = os.path.join(ROOT_DIR, "static/all_hotels.geojson")
    if file_exists(target):
        return

    collected = []

    def row_to_feature(row):
        # One point feature per hotel row; GeoJSON positions are (lng, lat).
        location = geojson.Point((row["lng"], row["lat"]))
        attributes = {
            'count': row["count"],
            '_id': str(row["_id"]),
            'Hotel_Address': row['Hotel_Address'],
            'Average_Score': row['Average_Score'],
            'Hotel_Name': row['Hotel_Name']
        }
        collected.append(
            geojson.Feature(geometry=location, properties=attributes))

    # apply() is used purely to visit each row; its return value is unused.
    df.apply(row_to_feature, axis=1)
    collection = geojson.FeatureCollection(collected)

    with open(target, 'w', encoding='utf8') as out:
        geojson.dump(collection, out, sort_keys=True, ensure_ascii=False)
コード例 #4
0
def get_reviews(review_urls=None):
    """
    Scrapes review text from hotel pages (with the review tab open).

    The result is cached on disk: when ``static/scraped_reviews.pickle``
    already exists under ROOT_DIR, its content is returned and nothing is
    scraped.

    :param review_urls: list of hotel page urls; when omitted (None) the list
        is obtained from get_hotel_review_pages() at call time
    :return: list of reviews, each one a list of
        ``[hotel_address, average_score, hotel_name, nationality,
        negative_review, positive_review, score]``; fields missing from a
        review block are filled with the string "Nothing"
    """
    filepath = os.path.join(ROOT_DIR, "static/scraped_reviews.pickle")
    if file_exists(filepath):
        return read_pickled_object(filepath)

    # Resolve the default lazily. A call expression in the signature is
    # evaluated once at import time, which made importing this file run the
    # whole upstream scraping chain (or read its caches) unconditionally.
    if review_urls is None:
        review_urls = get_hotel_review_pages()

    print("Starting review scraping!\n")
    print("-------------------------\n")
    review_list = []
    for review_url in review_urls:
        print(f"Scraping reviews for page {review_url}")
        review_html = get_html(review_url)
        review_soup = BeautifulSoup(review_html, 'lxml')

        # Skip hotels with no average score, as this indicates they have no reviews
        score_badge = review_soup.find("div", class_="bui-review-score__badge")
        if not score_badge:
            continue

        average_score = score_badge.text.strip()
        hotel_name = review_soup.find(
            id="hp_hotel_name_reviews").text.strip()
        hotel_address = review_soup.find(
            "span", class_="hp_address_subtitle").text.strip()

        for r in review_soup.select(".c-review-block"):
            nationality = r.find("span", class_="bui-avatar-block__subtitle")
            nationality = nationality.text.strip() if nationality else "Nothing"

            score = r.find(class_="bui-review-score__badge")
            score = score.text.strip() if score else "Nothing"

            # The first review row of the block is taken as the positive part.
            positive_review = r.find(class_="c-review__row")
            if positive_review:
                positive_review = positive_review.p.find(
                    class_="c-review__body").text.strip()
            else:
                positive_review = "Nothing"

            # NOTE(review): "lalala" reads like a placeholder but is presumably
            # the class the site puts on the negative row - confirm against
            # the live markup before changing it.
            negative_review = r.find(class_="lalala")
            if negative_review:
                negative_review = negative_review.p.find(
                    class_="c-review__body").text.strip()
            else:
                negative_review = "Nothing"

            review = [
                hotel_address, average_score, hotel_name, nationality,
                negative_review, positive_review, score
            ]
            print(f'Adding review for hotel "{hotel_name}"')
            review_list.append(review)

    pickle_object(review_list, filepath)
    print(f"Written reviews to {filepath}!")
    return review_list
コード例 #5
0
def read_geojson():
    """
    Reads geojson hotel data from file

    :return: the parsed content of ``static/all_hotels.geojson`` as returned
        by ``json.load``, or None when that file does not exist
    """
    filepath = os.path.join(ROOT_DIR, "static/all_hotels.geojson")
    if not file_exists(filepath):
        return None

    # df_to_geojson writes this file as UTF-8 with ensure_ascii=False, so it
    # may contain raw non-ASCII characters. Decode it as UTF-8 explicitly
    # instead of relying on the platform's default encoding.
    with open(filepath, encoding='utf8') as f:
        return json.load(f)
コード例 #6
0
def clean_df_text(df):
    """
    Builds the cleaned ``Review`` column from ``Review_Original``.

    When ``static/cleaned_df.pickle`` already exists under ROOT_DIR, the
    pickled dataframe is returned and ``df`` is left untouched. Otherwise the
    new column is added to ``df`` itself, which is then pickled and returned.

    :param df: dataframe with a ``Review_Original`` column
    :return: dataframe
    """
    cache_path = os.path.join(ROOT_DIR, "static/cleaned_df.pickle")
    if file_exists(cache_path):
        return read_pickled_dataframe(cache_path)

    print("\nPre-processing text...")
    cleaned_reviews = df['Review_Original'].apply(pre_process_text)
    df['Review'] = cleaned_reviews

    pickle_dataframe(df, cache_path)
    print(f"\nWritten reviews to {cache_path}!")
    return df
コード例 #7
0
def preliminary_clean(df):
    """
    Perform early cleaning to allow for labeling.

    Merges the positive and negative review text into a single
    ``Review_Original`` column, drops the two source columns and keeps only
    the rows for which ``is_en`` accepts the merged text. When
    ``static/preliminary_clean_df.pickle`` already exists under ROOT_DIR, the
    pickled dataframe is returned instead.

    Note that the ``Review_Original`` column is added to the passed-in
    dataframe itself before the reduced copy is created.

    :param df: dataframe with ``Positive_Review`` and ``Negative_Review``
        columns
    :return: dataframe
    """
    filepath = os.path.join(ROOT_DIR, "static/preliminary_clean_df.pickle")
    if file_exists(filepath):
        return read_pickled_dataframe(filepath)

    print("\nMerging columns...")
    df['Review_Original'] = (df['Positive_Review'] + ' ' +
                             df['Negative_Review'])
    # Use the keyword form: passing the axis positionally (drop(name, 1)) is
    # deprecated since pandas 1.3 and raises TypeError from pandas 2.0.
    df = df.drop(columns=['Positive_Review', 'Negative_Review'])

    print("\nRemoving non-english reviews...")
    df = df[df['Review_Original'].apply(is_en)]
    pickle_dataframe(df, filepath)
    print(f"\nWritten reviews to {filepath}!")
    return df
コード例 #8
0
def label_sentiment(df):
    """
    Adds a ``Sentiment`` column to the dataframe, derived from the polarity
    that TextBlob reports for each review.

    When ``static/labeled_df.pickle`` already exists under ROOT_DIR, the
    pickled dataframe is returned and ``df`` is left untouched. Otherwise the
    column is added to ``df`` itself, which is then pickled and returned.

    :param df: dataframe with a ``Review`` column
    :return: dataframe
    """
    cache_path = os.path.join(ROOT_DIR, "static/labeled_df.pickle")
    if file_exists(cache_path):
        return read_pickled_dataframe(cache_path)

    def polarity_label(text):
        """
        Maps a text to 1 when its polarity is zero or above, else to 0
        :param text: value to judge; converted with str() first
        :return: 1 or 0
        """
        blob = TextBlob(str(text))
        if blob.polarity >= 0:
            return 1
        return 0

    labels = df['Review'].apply(polarity_label)
    df['Sentiment'] = labels
    pickle_dataframe(df, cache_path)
    print(f"\nWritten reviews to {cache_path}!")
    return df
コード例 #9
0
def clean_and_label(df):
    """
    Explodes negative and positive review columns into 2 rows and labels the
    sentiment at the same time, then cleans the text.

    Every input row yields two output rows: one carrying ``Positive_Review``
    with ``Sentiment`` 1 and one carrying ``Negative_Review`` with
    ``Sentiment`` 0. Rows whose cleaned text is one of the custom stop-words
    are dropped. When ``static/cleaned_df.pickle`` already exists under
    ROOT_DIR, the pickled dataframe is returned instead.

    :param df: dataframe with the hotel/reviewer columns plus
        ``Positive_Review`` and ``Negative_Review``
    :return: dataframe
    """
    # NOTE(review): clean_df_text uses the same cache path, so whichever of
    # the two runs first decides what the other one returns - confirm that
    # this sharing is intended.
    filepath = os.path.join(ROOT_DIR, "static/cleaned_df.pickle")
    if file_exists(filepath):
        return read_pickled_dataframe(filepath)

    print("\nSplitting columns...")
    df_size = len(df)
    # Columns copied unchanged into both the positive and the negative row.
    shared_columns = [
        'Hotel_Address', 'Review_Date', 'Average_Score', 'Hotel_Name',
        'Reviewer_Nationality', 'Reviewer_Score', 'lat', 'lng'
    ]
    columns = shared_columns + ['Review', 'Sentiment']
    row_list = []

    for index, row in df.iterrows():
        print(f"Splitting row {index}//{df_size}")
        shared = {name: row[name] for name in shared_columns}
        row_list.append({
            **shared,
            'Review': row['Positive_Review'],
            'Sentiment': 1
        })
        row_list.append({
            **shared,
            'Review': row['Negative_Review'],
            'Sentiment': 0
        })
    new_df = pd.DataFrame(row_list, columns=columns)

    print("\nPre-processing text...")
    new_df['Review'] = new_df['Review'].apply(pre_process_text)

    print("\nDropping reviews with custom stop-words...")
    # Custom stop-words based on manual observation of the data
    custom_stop_words = ['negative', 'nothing', 'positive', 'n']
    new_df = new_df[~new_df['Review'].isin(custom_stop_words)]

    # Report success only after the file has really been written.
    pickle_dataframe(new_df, filepath)
    print(f"\nWritten reviews to {filepath}!")
    return new_df