def get_all_catalog_urls(catalog_url=BASE_CATALOG_URL):
    """
    Scrapes all catalog pages, starting from the base catalog page.

    :param catalog_url: url of the first catalog page
    :return: list of urls
    """
    filepath = os.path.join(ROOT_DIR, "static/catalog_urls.pickle")
    if file_exists(filepath):
        return read_pickled_object(filepath)
    print("Starting catalog page scraping!\n")
    print("-------------------------\n")
    # Fetch the page we were actually asked to start from, rather than
    # relying on get_html()'s default
    base_html = get_html(catalog_url)
    base_soup = BeautifulSoup(base_html, 'lxml')
    page_list = [catalog_url]
    nextpage = base_soup.find('a', class_='paging-next')
    # Follow the "next page" link until the last catalog page is reached
    while nextpage:
        href = nextpage.get('href')
        new_page = f'{BASE_URL}{href}'
        print(f"Found new catalog page {new_page}")
        page_list.append(new_page)
        new_html = get_html(new_page)
        new_soup = BeautifulSoup(new_html, 'lxml')
        nextpage = new_soup.find('a', class_='paging-next')
    pickle_object(page_list, filepath)
    print(f"Written catalog pages to {filepath}!")
    return page_list
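# The functions in this module lean on a handful of small helpers
# (get_html, file_exists, pickle_object, read_pickled_object) whose
# definitions are not shown here. Minimal sketches follow, assuming
# requests for fetching and plain pickle for the on-disk cache; the
# project's real implementations may differ.
import os
import pickle

import requests


def get_html(url):
    """Fetches a page and returns its HTML (sketch; the real helper may
    add headers, retries or rate-limiting)."""
    response = requests.get(url)
    response.raise_for_status()
    return response.text


def file_exists(filepath):
    """True if a cached artifact is already on disk."""
    return os.path.isfile(filepath)


def pickle_object(obj, filepath):
    """Caches an arbitrary object to disk."""
    with open(filepath, 'wb') as f:
        pickle.dump(obj, f)


def read_pickled_object(filepath):
    """Loads a cached object back from disk."""
    with open(filepath, 'rb') as f:
        return pickle.load(f)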
def get_hotel_review_pages(catalog_url_list=None):
    """
    Scrapes all hotel review pages from a list of catalog pages.

    :param catalog_url_list: list of urls
    :return: list of urls
    """
    filepath = os.path.join(ROOT_DIR, "static/hotel_review_urls.pickle")
    if file_exists(filepath):
        return read_pickled_object(filepath)
    # Resolve the default lazily: a call in the default argument would run
    # catalog scraping at import time, when this module is first loaded
    if catalog_url_list is None:
        catalog_url_list = get_all_catalog_urls()
    print("Starting hotel review page scraping!")
    print("-------------------------\n")
    hotel_review_page_list = []
    for catalog_url in catalog_url_list:
        print(f"Scraping hotel links from {catalog_url}")
        catalog_html = get_html(catalog_url)
        catalog_soup = BeautifulSoup(catalog_html, 'lxml')
        hotel_page_list = catalog_soup.find_all(
            'a', class_='hotel_name_link url')
        # Rewrite each hotel link so the page opens on the reviews tab
        hotel_page_list = [
            i.get('href').replace("#hotelTmpl", "#tab-reviews")
            for i in hotel_page_list
        ]
        hotel_page_list = [
            f"{BASE_URL}{i}".replace("\n", "") for i in hotel_page_list
        ]
        hotel_review_page_list.extend(hotel_page_list)
    pickle_object(hotel_review_page_list, filepath)
    print(f"Written hotel review pages to {filepath}!")
    return hotel_review_page_list
def df_to_geojson(df):
    """
    Writes a dataframe of distinct hotels to geojson.

    :param df: dataframe of distinct hotels
    """
    filepath = os.path.join(ROOT_DIR, "static/all_hotels.geojson")
    if file_exists(filepath):
        return

    features = []

    def append_feature(x):
        feature = geojson.Feature(
            geometry=geojson.Point((x["lng"], x["lat"])),
            properties={
                'count': x["count"],
                '_id': str(x["_id"]),
                'Hotel_Address': x['Hotel_Address'],
                'Average_Score': x['Average_Score'],
                'Hotel_Name': x['Hotel_Name']
            })
        features.append(feature)

    # Build one point feature per hotel row
    df.apply(append_feature, axis=1)
    feature_collection = geojson.FeatureCollection(features)
    with open(filepath, 'w', encoding='utf8') as f:
        geojson.dump(feature_collection, f, sort_keys=True,
                     ensure_ascii=False)
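# For reference, a single feature emitted by df_to_geojson() has the
# shape below (all values are illustrative, not real data).
EXAMPLE_FEATURE = {
    "type": "Feature",
    "geometry": {"type": "Point", "coordinates": [4.90, 52.36]},
    "properties": {
        "count": 42,
        "_id": "60f1a2",
        "Hotel_Address": "Example Street 1, Amsterdam",
        "Average_Score": 8.4,
        "Hotel_Name": "Example Hotel",
    },
}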
def get_reviews(review_urls=None):
    """
    Scrapes review text from a hotel page (with the review tab open).

    :param review_urls: list of hotel page urls
    :return: list of reviews
    """
    filepath = os.path.join(ROOT_DIR, "static/scraped_reviews.pickle")
    if file_exists(filepath):
        return read_pickled_object(filepath)
    # Resolve the default lazily so importing this module does not kick
    # off the full catalog scrape
    if review_urls is None:
        review_urls = get_hotel_review_pages()
    print("Starting review scraping!\n")
    print("-------------------------\n")
    review_list = []
    for review_url in review_urls:
        print(f"Scraping reviews for page {review_url}")
        review_html = get_html(review_url)
        review_soup = BeautifulSoup(review_html, 'lxml')
        # Skip hotels with no average score, as this indicates they have
        # no reviews
        if score_badge := review_soup.find(
                "div", class_="bui-review-score__badge"):
            average_score = score_badge.text.strip()
            hotel_name = review_soup.find(
                id="hp_hotel_name_reviews").text.strip()
            hotel_address = review_soup.find(
                "span", class_="hp_address_subtitle").text.strip()
            review_blocks = review_soup.select(".c-review-block")
            for r in review_blocks:
                # Fall back to "Nothing" wherever a block is missing a field
                nationality = r.find("span",
                                     class_="bui-avatar-block__subtitle")
                if nationality:
                    nationality = nationality.text.strip()
                else:
                    nationality = "Nothing"
                score = r.find(class_="bui-review-score__badge")
                if score:
                    score = score.text.strip()
                else:
                    score = "Nothing"
                positive_review = r.find(class_="c-review__row")
                if positive_review:
                    positive_review = positive_review.p.find(
                        class_="c-review__body").text.strip()
                else:
                    positive_review = "Nothing"
                # The oddly named "lalala" class marks the negative review
                # row in the scraped markup
                negative_review = r.find(class_="lalala")
                if negative_review:
                    negative_review = negative_review.p.find(
                        class_="c-review__body").text.strip()
                else:
                    negative_review = "Nothing"
                review = [
                    hotel_address, average_score, hotel_name, nationality,
                    negative_review, positive_review, score
                ]
                print(f'Adding review for hotel "{hotel_name}"')
                review_list.append(review)
    pickle_object(review_list, filepath)
    print(f"Written reviews to {filepath}!")
    return review_list
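# A sketch of how the scraped rows could be loaded into a DataFrame. The
# column names here are assumptions chosen to line up with the cleaning
# functions below; the ordering matches the review list built above.
if __name__ == '__main__':
    scraped_df = pd.DataFrame(get_reviews(),
                              columns=[
                                  'Hotel_Address', 'Average_Score',
                                  'Hotel_Name', 'Reviewer_Nationality',
                                  'Negative_Review', 'Positive_Review',
                                  'Reviewer_Score'
                              ])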
def read_geojson():
    """
    Reads geojson hotel data from file.

    :return: geojson as json
    """
    filepath = os.path.join(ROOT_DIR, "static/all_hotels.geojson")
    if file_exists(filepath):
        # Read with the same encoding df_to_geojson() writes with
        with open(filepath, encoding='utf8') as f:
            return json.load(f)
def clean_df_text(df):
    """
    Cleans the review column.

    :param df: dataframe
    :return: dataframe
    """
    filepath = os.path.join(ROOT_DIR, "static/cleaned_df.pickle")
    if file_exists(filepath):
        return read_pickled_dataframe(filepath)
    print("\nPre-processing text...")
    df['Review'] = df['Review_Original'].apply(pre_process_text)
    pickle_dataframe(df, filepath)
    print(f"\nWritten reviews to {filepath}!")
    return df
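# pre_process_text() is referenced here but defined elsewhere in the
# project. Below is a minimal sketch of a typical implementation
# (lowercasing, stripping non-letters, removing NLTK stop-words); this is
# an assumption, and the real version may normalise differently.
import re

from nltk.corpus import stopwords  # requires nltk.download('stopwords')

EN_STOP_WORDS = set(stopwords.words('english'))


def pre_process_text(text):
    """Normalises a review string for downstream labeling (sketch)."""
    text = re.sub(r'[^a-z\s]', ' ', str(text).lower())
    return ' '.join(t for t in text.split() if t not in EN_STOP_WORDS)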
def preliminary_clean(df):
    """
    Performs early cleaning to allow for labeling.

    :param df: dataframe
    :return: dataframe
    """
    filepath = os.path.join(ROOT_DIR, "static/preliminary_clean_df.pickle")
    if file_exists(filepath):
        return read_pickled_dataframe(filepath)
    print("\nMerging columns...")
    df['Review_Original'] = df['Positive_Review'] + ' ' + df[
        'Negative_Review']
    # Positional axis arguments to drop() were removed in pandas 2.0, so
    # name the columns explicitly
    df = df.drop(columns=['Positive_Review', 'Negative_Review'])
    print("\nRemoving non-English reviews...")
    df = df[df['Review_Original'].apply(is_en)]
    pickle_dataframe(df, filepath)
    print(f"\nWritten reviews to {filepath}!")
    return df
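# is_en() is likewise defined elsewhere; here is a plausible sketch using
# langdetect. This is an assumption, as the project may use a different
# language detector.
from langdetect import LangDetectException, detect


def is_en(text):
    """True if the text is detected as English (sketch)."""
    try:
        return detect(str(text)) == 'en'
    except LangDetectException:
        # Empty or non-linguistic strings raise instead of detecting
        return False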
def label_sentiment(df):
    """
    Adds a new column to the dataframe, labeling the sentiment of the
    reviews using the TextBlob PatternAnalyzer.

    :param df: dataframe
    :return: dataframe
    """

    def return_sentiment(text):
        """
        Judges the sentiment of a string: 1 is positive, 0 is negative.

        :param text: string
        :return: 1 or 0
        """
        obj = TextBlob(str(text))
        return 1 if obj.polarity >= 0 else 0

    filepath = os.path.join(ROOT_DIR, "static/labeled_df.pickle")
    if file_exists(filepath):
        return read_pickled_dataframe(filepath)
    df['Sentiment'] = df['Review'].apply(return_sentiment)
    pickle_dataframe(df, filepath)
    print(f"\nWritten reviews to {filepath}!")
    return df
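# A quick illustration of the threshold above: TextBlob's default
# PatternAnalyzer returns a polarity in [-1.0, 1.0], and anything
# non-negative is labeled 1. The example strings are illustrative, not
# taken from the data.
if __name__ == '__main__':
    print(TextBlob("The staff were friendly and helpful").polarity)  # > 0, labeled 1
    print(TextBlob("The room was dirty and the wifi was awful").polarity)  # < 0, labeled 0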
def clean_and_label(df):
    """
    Explodes the negative and positive review columns into two rows,
    labeling the sentiment at the same time, then cleans the text.

    :param df: dataframe
    :return: dataframe
    """
    filepath = os.path.join(ROOT_DIR, "static/cleaned_df.pickle")
    if file_exists(filepath):
        return read_pickled_dataframe(filepath)
    print("\nSplitting columns...")
    df_size = len(df)
    columns = [
        'Hotel_Address', 'Review_Date', 'Average_Score', 'Hotel_Name',
        'Reviewer_Nationality', 'Reviewer_Score', 'lat', 'lng', 'Review',
        'Sentiment'
    ]
    row_list = []
    for index, row in df.iterrows():
        print(f"Splitting row {index}/{df_size}")
        # Fields shared by both halves of the exploded row
        base = {
            'Hotel_Address': row['Hotel_Address'],
            'Review_Date': row['Review_Date'],
            'Average_Score': row['Average_Score'],
            'Hotel_Name': row['Hotel_Name'],
            'Reviewer_Nationality': row['Reviewer_Nationality'],
            'Reviewer_Score': row['Reviewer_Score'],
            'lat': row['lat'],
            'lng': row['lng']
        }
        row_list.append({
            **base, 'Review': row['Positive_Review'],
            'Sentiment': 1
        })
        row_list.append({
            **base, 'Review': row['Negative_Review'],
            'Sentiment': 0
        })
    new_df = pd.DataFrame(row_list, columns=columns)
    print("\nPre-processing text...")
    new_df['Review'] = new_df['Review'].apply(pre_process_text)
    print("\nDropping reviews with custom stop-words...")
    # Custom stop-words based on manual observation of the data
    custom_stop_words = ['negative', 'nothing', 'positive', 'n']
    new_df = new_df[~new_df['Review'].isin(custom_stop_words)]
    pickle_dataframe(new_df, filepath)
    print(f"\nWritten reviews to {filepath}!")
    return new_df
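# Assumed wiring of the cleaning and labeling stages (not shown in the
# source): either the one-pass clean_and_label(), or the staged
# preliminary_clean -> clean_df_text -> label_sentiment chain. Note that
# clean_and_label() and clean_df_text() cache to the same pickle, so the
# two paths are alternatives rather than steps to combine. The sample row
# below is illustrative.
if __name__ == '__main__':
    sample_df = pd.DataFrame([{
        'Hotel_Address': 'Example Street 1, Amsterdam',
        'Review_Date': '8/3/2017',
        'Average_Score': 8.4,
        'Hotel_Name': 'Example Hotel',
        'Reviewer_Nationality': 'United Kingdom',
        'Reviewer_Score': 9.2,
        'lat': 52.36,
        'lng': 4.90,
        'Positive_Review': 'Lovely staff and a great location',
        'Negative_Review': 'The lift was slow'
    }])
    print(clean_and_label(sample_df).head())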