def splinter_scrape_ta_reviews(city='', state='', write_to_db=False, start_num=0, end_num=-1):
    """
    Scrape TripAdvisor reviews for every hotel in a city and optionally
    append them to the ta_reviews table.

    :param city: city name used to look up hotel URLs in the DB.
    :param state: state abbreviation (passed through to get_hotel_urls).
    :param write_to_db: if True, append the scraped reviews to ta_reviews.
    :param start_num: index of the first hotel in the URL list to scrape.
    :param end_num: number of hotels to keep (applied after start_num
        slicing); -1 means scrape to the end of the list.
    """
    engine = cadb.connect_aws_db(write_unicode=True)
    blinks = get_hotel_urls(city, state, engine)

    # only do the specified hotel range
    # NOTE(review): end_num is applied to the list *after* the start_num
    # slice, so it counts from start_num, not from the start of the full
    # list — confirm that is the intended semantics
    if start_num != 0:
        blinks = blinks[start_num:]
    if end_num != -1:
        if len(blinks) < end_num:
            print('end_num exceeded number of hotels. resetting to max.')
            end_num = len(blinks)
        blinks = blinks[:end_num]

    br = Browser()

    # business_ids that already have reviews in the DB; those hotels are skipped
    donebids = get_done_business_ids(city, engine)

    for hotel_id, biz_id, link in blinks:
        # check to see if there are already reviews for that hotel
        if int(biz_id) not in donebids:
            bigdf = scrape_hotel(link, br, engine)
            bigdf['hotel_id'] = hotel_id
            bigdf['business_id'] = biz_id
            bigdf['biz_review_id'] = np.int64(bigdf['biz_review_id'].values)
            bigdf = remove_duplicates(bigdf, city, engine)
            if write_to_db:
                try:
                    bigdf.to_sql('ta_reviews', engine, if_exists='append', index=False)
                except Exception as e:
                    # don't abort the whole scrape on one failed insert,
                    # but do report what went wrong instead of swallowing it
                    print('WRITING TO DB FAILED!!!')
                    print(e)
        else:
            print('business_id {} already scraped.'.format(biz_id))
# Ejemplo n.º 2
# 0
def retrieve_best_hotels(city, state=''):
    """
    Return the ten hotels in *city* whose Yelp reviews mentioning 'dog'
    rate them highest.

    :param city: city name matched against yelp_hotels.hotel_city.
    :param state: unused here; kept for interface compatibility.
    :returns: tuple of (array of hotel names, list of average-rating
        strings), best first.
    """
    engine = cadb.connect_aws_db(write_unicode=True)

    yelp_reviews = pd.read_sql('SELECT * FROM yelp_reviews', engine)
    yelp_hotels = pd.read_sql('SELECT * FROM yelp_hotels', engine)
    yelp = pd.merge(yelp_hotels, yelp_reviews, on='business_id', how='inner')

    yelp_city = yelp[yelp['hotel_city'] == city.strip()]

    # keep only the reviews whose text mentions 'dog'
    yelp_dog_review = yelp_city[yelp_city['review_text'].str.contains('dog')].copy().reset_index()

    # mean dog-review rating per hotel, aligned explicitly by hotel_id
    # (the original list built from np.unique order could mis-align with
    # drop_duplicates' first-occurrence row order)
    avg_dog_ratings = yelp_dog_review.groupby('hotel_id')['review_rating'].mean()

    # one row per hotel; drop_duplicates(cols=...) was removed from
    # pandas — subset= is the supported keyword
    unique_hotels = yelp_dog_review.drop_duplicates(subset='hotel_id').copy()
    unique_hotels['average_rating'] = unique_hotels['hotel_id'].map(avg_dog_ratings).values

    # DataFrame.sort() was removed in pandas 0.20; sort once with sort_values
    srtd = unique_hotels.sort_values(by='average_rating', ascending=False).head(10)

    best_dog_hotel_names = srtd['hotel_name'].values

    best_dog_hotel_ratings = np.round(srtd['average_rating'].values, 1)

    string_ratings = [str(rat) for rat in best_dog_hotel_ratings]

    return best_dog_hotel_names, string_ratings
def get_biz_ids(city, engine):
    """
    Return the business_ids of all TripAdvisor hotels in *city*.

    :param city: city name with words joined by underscores
        (e.g. 'san_francisco'); underscores become spaces for the match.
    :param engine: DB engine/connection accepted by pd.read_sql_query.
    :returns: list of ints.
    """
    # NOTE: city is interpolated directly into the SQL; callers pass
    # internal city slugs, not untrusted input
    cmd = "SELECT business_id FROM ta_hotels "
    cmd += "where hotel_city = "
    cmd += '"' + " ".join(city.split("_")) + '"'
    try:
        xstng_bizs = [int(biz_id[0]) for biz_id in pd.read_sql_query(cmd, engine).values]
    except Exception:
        # the AWS connection may have gone stale; reconnect once and retry
        # (a bare except here would also swallow KeyboardInterrupt/SystemExit)
        engine = cadb.connect_aws_db(write_unicode=True)
        xstng_bizs = [int(biz_id[0]) for biz_id in pd.read_sql_query(cmd, engine).values]
    return xstng_bizs
def get_biz_review_ids(city, engine):
    """
    Return the biz_review_ids of all TripAdvisor reviews for hotels
    in *city*.

    :param city: city name with words joined by underscores; underscores
        become spaces for the SQL match.
    :param engine: DB engine/connection accepted by pd.read_sql_query.
    :returns: list of ints.
    """
    cmd = 'select biz_review_id from ta_reviews r inner join '
    cmd += 'ta_hotels h on r.business_id=h.business_id '
    cmd += 'where h.hotel_city = '
    cmd += '"' + ' '.join(city.split('_')) + '"'
    try:
        xstng_revs = [int(rev_id[0]) for rev_id in pd.read_sql_query(cmd, engine).values]
    except Exception:
        # reconnect once if the pooled connection has gone stale
        # (narrowed from a bare except, which would also swallow
        # KeyboardInterrupt/SystemExit)
        engine = cadb.connect_aws_db(write_unicode=True)
        xstng_revs = [int(rev_id[0]) for rev_id in pd.read_sql_query(cmd, engine).values]
    return xstng_revs
# Ejemplo n.º 5
# 0
def retrieve_best_hotels3(city, state='',
                          revdb='ta',
                          max_revs=10):
    """
    PURPOSE:
    To retrieve the reviews for all hotels for a given city,
    compute the hotels ratings, and return the data for the
    top ten hotels for that city.

    The rating for a hotel is
    0.8 * mean(dog_rating) + min(1, num_reviews / max_revs),
    i.e. the mean review sentiment plus a review-count bonus that
    saturates at max_revs reviews.

    :param city: city name (lower-cased internally).
    :param state: unused here; kept for interface compatibility.
    :param revdb: review source, 'ta' or 'yelp'.
    :param max_revs: review count at which the count bonus saturates.
    :returns: tuple of (names, rating strings, image URLs, hotel URLs,
        price strings) for the top ten hotels.
    :raises ValueError: if revdb is not 'yelp' or 'ta'.
    """
    # convert city input in all lower case
    city = str(city).lower()

    engine = cadb.connect_aws_db(write_unicode=True)

    if revdb == 'yelp':
        revdf = get_yelp_reviews(city, engine)
    elif revdb == 'ta':
        revdf = get_ta_reviews(city, engine)
    else:
        # previously an unknown revdb fell through to a NameError on revdf
        raise ValueError("revdb must be 'yelp' or 'ta'")

    # one row per hotel (first occurrence kept)
    unique_hotel_df = revdf.drop_duplicates(subset='business_id').copy()

    # aggregate per business_id once instead of re-filtering revdf for
    # every hotel (the original pattern was O(n_hotels * n_reviews))
    grouped = revdf.groupby('business_id')['dog_rating']
    review_counts = grouped.size()
    mean_ratings = grouped.mean()

    unique_hotel_df['num_dog_reviews'] = unique_hotel_df['business_id'].map(review_counts).values

    print('max reviews is: {}'.format(max_revs))

    # 0.8 * mean sentiment + review-count bonus capped at 1
    count_bonus = np.minimum(1.0, unique_hotel_df['num_dog_reviews'].values / max_revs)
    unique_hotel_df['average_dog_rating'] = (
        unique_hotel_df['business_id'].map(mean_ratings).values * 0.8 + count_bonus
    )

    uniq_df_srtd = unique_hotel_df.sort_values(by='average_dog_rating', ascending=False).head(10).copy()

    best_dog_hotel_names = uniq_df_srtd['hotel_name'].values

    best_dog_hotel_ratings = np.round(uniq_df_srtd['average_dog_rating'].values, 1)

    string_ratings = [str(rat) for rat in best_dog_hotel_ratings]

    best_dog_hotel_imgs = uniq_df_srtd['hotel_img_url'].values
    best_dog_hotel_urls = uniq_df_srtd['hotel_url'].values
    # non-positive (or NaN-cast) prices are reported as the string "nan"
    best_dog_hotel_prices = [str(prc) if prc > 0 else "nan" for prc in np.int64(np.round(uniq_df_srtd['hotel_price'].values))]

    return best_dog_hotel_names, string_ratings, best_dog_hotel_imgs, best_dog_hotel_urls, best_dog_hotel_prices
# Ejemplo n.º 6
# 0
def classify_review_type(traindb="yelp", ntrain_dog_revs=1500, ntrain_gen_revs=1500, classdb="ta", verbose=False):
    """
    PURPOSE: To classify reviews as being either pet-related or general.

    A linear SVM is trained on BringFido reviews (labeled 'dog') versus
    reviews from *traindb* (labeled 'general'), then used to label every
    review in *classdb*; the predicted categories are written back via
    update_table_rev_cat.

    :param traindb:
    The data set to use for training. The default is Yelp.

    :param ntrain_dog_revs:
    The number of dog-related hotel reviews to use in the training.

    :param ntrain_gen_revs:
    The number of general hotel reviews to use in the training.

    :param classdb:
    The data set to classify (either yelp or ta)

    :param verbose:
    Set this to True to print progress messages.

    :raises ValueError: if traindb or classdb is not 'yelp' or 'ta'.
    """
    engine = cadb.connect_aws_db(write_unicode=True)

    if verbose:
        print("grabbing bf data...")
    # the bringfido reviews serve as the positive ('dog') training class
    bfdf = get_bf_reviews(engine)

    if verbose:
        print("grabbing general review data...")
    if traindb == "ta":
        print("using TA data for training...")
        gentraindf = get_ta_reviews(engine)
    elif traindb == "yelp":
        print("using Yelp data for training...")
        gentraindf = get_yelp_reviews(engine)
    else:
        # previously an unknown traindb fell through to a NameError below
        raise ValueError("traindb must be 'yelp' or 'ta'")

    train_data = np.hstack(
        (bfdf["review_text"].values[:ntrain_dog_revs], gentraindf["review_text"].values[:ntrain_gen_revs])
    )

    # labels are ordered to match train_data: dog reviews first
    labels = ["dog"] * ntrain_dog_revs
    labels.extend(["general"] * ntrain_gen_revs)
    y_train = labels

    if verbose:
        print("vectorizing...")
    t0 = time()
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words="english")
    X_train = vectorizer.fit_transform(train_data)
    duration = time() - t0
    print("vectorized in {:.2f} seconds.".format(duration))

    # loss="l2" was the old scikit-learn alias for the squared hinge loss;
    # it was deprecated in 0.16 and later removed — "squared_hinge" is the
    # supported spelling of the same loss
    penalty = "l2"
    clf = LinearSVC(loss="squared_hinge", penalty=penalty, dual=False, tol=1e-3)

    if verbose:
        print("training model...")
    clf.fit(X_train, y_train)

    if classdb == "yelp":
        print("categorizing Yelp data...")
        classdf = get_yelp_reviews(engine, remove_shorts=False)
    elif classdb == "ta":
        print("categorizing TA data...")
        classdf = get_ta_reviews(engine, remove_shorts=False)
    else:
        raise ValueError("classdb must be 'yelp' or 'ta'")

    X_yrevs = vectorizer.transform(classdf["review_text"].values)

    if verbose:
        print("predicting...")
    pred = clf.predict(X_yrevs)

    classdf["review_category"] = pred

    update_table_rev_cat(classdf, engine)
def splinter_scrape_ta_hotels(city_url="", city="new_haven", state="ct", write_to_db=False, max_pages=20):
    """
    Scrape the TripAdvisor hotel-results pages for a city.

    Drives a splinter Browser through the city's hotel listings. A
    check-in/check-out date pair (the third Friday of next month, one
    night) is selected first so TripAdvisor displays prices; then up to
    *max_pages* result pages are scraped for hotel name, link, image URL,
    price, and business_id.

    :param city_url:
    TripAdvisor hotel-results URL to scrape; if empty it is looked up
    from *city* and *state* via get_city().

    :param city:
    Underscore-separated city name, used only when city_url is empty.

    :param state:
    Two-letter state abbreviation, used only when city_url is empty.

    :param write_to_db:
    If True, remove ad/duplicate hotels and append the results to the
    ta_hotels table.

    :param max_pages:
    Maximum number of result pages to scrape.
    """
    # this only needs to be done at the very beginning
    br = Browser()

    if city_url == "":
        city_url = get_city(city.lower() + "_" + state.lower())

    print("using the following url:")
    print("{}".format(city_url))
    # city_url = "http://www.tripadvisor.com/Hotels-g31310-Phoenix_Arizona-Hotels.html"

    #####################################################
    # do not edit below this line
    #####################################################
    # more_pages is used to keep track if there is more
    # than one page of hotel results for the given city
    more_pages = True

    # scraping will start on page 1 of the hotel results
    page = 1

    # open the URL in a browser object:
    br.visit(city_url)

    # find the div to enter the date range. This is needed to get pricing info:
    date_bar = br.find_by_xpath('//*[contains(@class, "meta_date_wrapper")]')

    # find the check in calendar span:
    cin_btn = date_bar.find_by_xpath('span[contains(@class, "meta_date_field   check_in")]/span')[0]

    # now click the check_in span to activate it
    cin_btn.click()

    # select the right calendar div (next month)
    rightcal = br.find_by_xpath('//div[contains(@class, "month")]')[1]

    # now select the third Friday of next month as the check in date
    fri_btn = rightcal.find_by_xpath("table/tbody/tr[3]/td[6]/div")

    # and click it
    fri_btn.click()

    # now choose the next day (saturday) as the check out date
    cout_btn = date_bar.find_by_xpath('span[contains(@class, "meta_date_field   check_out")]/span')[0]
    cout_btn.click()
    leftcal = br.find_by_xpath('//div[contains(@class, "month")]')[0]
    sat_btn = leftcal.find_by_xpath("table/tbody/tr[3]/td[7]/div")
    sat_btn.click()
    print("Dates selected.")

    # wait a few seconds for ta to retrieve prices
    time.sleep(5)

    # get the city and state info
    # (breadcrumb items are separated by the U+203A angle-quote character)
    loclist = br.find_by_xpath('//*[contains(@id, "BREADCRUMBS")]')
    locstring = loclist.text.split(u"\u203a")
    # NOTE(review): this rebinds the *city* parameter to the breadcrumb text
    hotel_city = city = locstring[2].lower()

    # the state appears in the breadcrumb as e.g. "Arizona (AZ)"
    hotel_state = re.findall("\w+ \(([A-Z][A-Z])\)", locstring[1])[0].lower()

    # create a pandas dataframe that will be used for writing
    # the results to the DB:

    columns = [
        "hotel_id",
        "hotel_url",
        "hotel_img_url",
        "hotel_name",
        "hotel_address",
        "hotel_city",
        "hotel_state",
        "hotel_rating",
        "hotel_latitude",
        "hotel_longitude",
        "hotel_price",
        "business_id",
        "review_count",
        "dog_review_count",
    ]

    bigdf = pd.DataFrame(columns=columns)

    # create some lists to fill w. the results from each page
    # (these accumulate across ALL pages, not per page)
    hotel_names = []
    links = []
    img_url = []
    hotel_price = []
    business_id = []
    print("starting scraper loop.")
    while more_pages and page <= max_pages:
        print("*" * 75)
        print("Now scraping page {} of {} of the hotel results".format(page, max_pages))
        print("*" * 75)
        # get all the review divs
        print("waiting a few seconds before scraping...")
        time.sleep(np.random.uniform(8, 20))
        listing_div = br.find_by_xpath('//*[contains(@class, "hotels_lf_condensed")]')
        xsts1 = br.is_element_present_by_xpath('//*[contains(@class, "photo_booking")]', wait_time=1)
        xsts2 = br.is_element_present_by_xpath('//*[contains(@class, "property_details")]', wait_time=1)
        xsts3 = br.is_element_present_by_xpath(
            '//*[contains(@class, "prw_rup")]/div/div/div/div[@class="headerContents"]/div[contains(@class, "price")]',
            wait_time=1,
        )
        # poll until the listing divs and all three page elements have loaded
        # NOTE(review): there is no retry limit — a page that never loads
        # these elements will spin here forever
        while len(listing_div) < 1 or not xsts1 or not xsts2 or not xsts3:
            print("now waiting for DOIs to return")
            time.sleep(5)
            listing_div = br.find_by_xpath('//*[contains(@class, "hotels_lf_condensed")]')
            xsts1 = br.is_element_present_by_xpath('//*[contains(@class, "photo_booking")]', wait_time=1)
            xsts2 = br.is_element_present_by_xpath('//*[contains(@class, "property_details")]', wait_time=1)
            xsts3 = br.is_element_present_by_xpath(
                '//*[contains(@class, "prw_rup")]/div/div/div/div[@class="headerContents"]/div[contains(@class, "price")]',
                wait_time=1,
            )
            print("# of listings: {}".format(len(listing_div)))
            print("photo_booking exists: {}".format(xsts1))
            print("property_details exists: {}".format(xsts2))
            print("prw_up exists: {}".format(xsts3))

        print("Number of hotel listings on this page: {}".format(len(listing_div)))

        df = pd.DataFrame(columns=columns)

        # scrape each listing; every field appends None on failure so the
        # per-hotel lists stay the same length
        for listing in listing_div:
            try:
                # the business id is embedded in the div id, e.g. "hotel_12345"
                biz_id = re.findall("hotel_(\d+)", listing["id"])
                if len(biz_id) > 0:
                    biz_id = biz_id[0]
                else:
                    biz_id = None
                print("business_id: {}".format(biz_id))
                business_id.append(biz_id)
            except:
                print("!" * 80)
                print("biz_id DOES NOT EXIST!")
                print("!" * 80)
                business_id.append(None)
            try:
                prop = listing.find_by_xpath('div/div/div/div[contains(@class, "property_details")]')
            except:
                # NOTE(review): if this fails, the following lookups reuse
                # the previous listing's `prop` — verify that is acceptable
                print("!" * 80)
                print("prop DIV DOES NOT EXIST!")
                print("!" * 80)
            try:
                title = prop.find_by_xpath('div/div[@class="listing_title"]')
                print(title.text)
                hotel_names.append(title.text)
            except:
                print("!" * 80)
                print("TITLE DIV DOES NOT EXIST!")
                print("!" * 80)
                hotel_names.append(None)
            try:
                hotel_link = title.find_by_xpath("a")["href"]
                print(hotel_link)
                links.append(hotel_link)
            except:
                print("!" * 80)
                print("hotel_link DOES NOT EXIST!")
                print("!" * 80)
                links.append(None)
            try:
                hotel_img = prop.find_by_xpath('div[@class="photo_booking"]/div/div/a/img')["src"]
                print("Hotel img URL: {}".format(hotel_img))
                img_url.append(hotel_img)
            except:
                print("!" * 80)
                print("hotel_img DIV DOES NOT EXIST!")
                print("!" * 80)
                img_url.append(None)
            try:
                # price shows as text like "$123"; keep the first digit run
                price_text = prop.find_by_xpath(
                    'div[contains(@class, "prw_rup")]/div/div/div/div[@class="headerContents"]/div[contains(@class, "price")]'
                ).text
                price = re.findall("(\d+)", price_text)[0]
                print("Price: ${}".format(price))
                hotel_price.append(price)
            except:
                print("!" * 80)
                print("price DIV DOES NOT EXIST!")
                print("!" * 80)
                hotel_price.append(None)
            print("*" * 50)

        if len(hotel_names) > 0:
            print("len of hotel_names: {}".format(len(hotel_names)))
            print("len of hotel_price: {}".format(len(hotel_price)))
            print("len of img_url: {}".format(len(img_url)))
            print("len of business_id: {}".format(len(business_id)))
            # hotel_city/hotel_state are scalar strings here, so these two
            # lens are character counts, not row counts
            print("len of hotel_city: {}".format(len(hotel_city)))
            print("len of hotel_state: {}".format(len(hotel_state)))
            df["hotel_name"] = hotel_names
            df["hotel_price"] = hotel_price
            df["hotel_img_url"] = img_url
            df["hotel_url"] = links
            df["business_id"] = business_id
            df["hotel_city"] = hotel_city
            df["hotel_state"] = hotel_state
            # NOTE(review): the per-hotel lists accumulate across pages, so
            # each page's df (and hence bigdf) re-contains all earlier rows;
            # DataFrame.append was also removed in pandas 2.0 (pd.concat
            # is the replacement) — this code requires an older pandas
            bigdf = bigdf.append(df)

        # update the page number
        page += 1

        # if more pages are desired, look for a "next" button
        if page <= max_pages:
            nxt_btn = br.find_by_xpath(
                '//div[contains(@class, "deckTools")]/div[contains(@class, "unified")]/a[contains(@class, "next")]'
            )
            # if there is a next button, click it
            # else exit the while loop
            if len(nxt_btn) > 0:
                nxt_btn.click()
            else:
                more_pages = False

    if write_to_db:
        engine = cadb.connect_aws_db(write_unicode=True)
        bigdf = remove_ad_hotels(bigdf)
        remove_duplicate_hotels(bigdf, city, engine)
        bigdf.to_sql("ta_hotels", engine, if_exists="append", index=False)
# Ejemplo n.º 8
# 0
def retrieve_best_hotels2(city, state='',
                          revdb='ta',
                          max_revs=10):
    """
    Predict a per-review dog-friendliness rating with a trained sentiment
    model, then return data for the top ten hotels in *city*.

    The hotel rating is
    0.8 * mean(predicted dog_rating) + min(1, num_reviews / max_revs).

    :param city: city name (lower-cased internally).
    :param state: unused here; kept for interface compatibility.
    :param revdb: review source, 'ta' or 'yelp'.
    :param max_revs: review count at which the count bonus saturates.
    :returns: tuple of (names, rating strings, image URLs, hotel URLs,
        price strings) for the top ten hotels.
    :raises ValueError: if revdb is not 'yelp' or 'ta'.
    """
    city = str(city).lower()

    engine = cadb.connect_aws_db(write_unicode=True)

    # create and train the model to use to classify ratings:
    clf, vectorizer = create_and_train_model(engine)

    if revdb == 'yelp':
        revdf = get_yelp_reviews(city, engine)
    elif revdb == 'ta':
        revdf = get_ta_reviews(city, engine)
    else:
        # previously an unknown revdb fell through to a NameError on revdf
        raise ValueError("revdb must be 'yelp' or 'ta'")

    review_text = revdf['review_text'].values
    print('Number of reviews: {}'.format(len(review_text)))

    t0 = time()
    X_pred = vectorizer.transform(review_text)
    duration = time() - t0
    print('transformed test data in {:.2f} seconds.'.format(duration))

    # now predict the rating based on the sentiment of the review:
    y_pred = clf.predict(X_pred)
    revdf['dog_rating'] = y_pred

    # one row per hotel; drop_duplicates(cols=...) was removed from
    # pandas — subset= is the supported keyword
    unique_hotel_df = revdf.drop_duplicates(subset='business_id').copy()
    unique_hotel_df['num_dog_reviews'] = [len(revdf[revdf['business_id'] == business_id]) for business_id in unique_hotel_df['business_id'].values]

    print('max reviews is: {}'.format(max_revs))

    # 0.8 * mean predicted rating + review-count bonus capped at 1
    unique_hotel_df['average_dog_rating'] = [np.mean(revdf[revdf['business_id'] == business_id]['dog_rating'].values) * 0.8 + min(1, unique_hotel_df[unique_hotel_df['business_id'] == business_id]['num_dog_reviews'].values[0]/max_revs) for business_id in unique_hotel_df['business_id'].values]

    print('dog ratings are...')
    print(unique_hotel_df['average_dog_rating'].values)

    # DataFrame.sort() was removed in pandas 0.20; sort_values replaces it
    uniq_df_srtd = unique_hotel_df.sort_values(by='average_dog_rating', ascending=False).head(10).copy()

    best_dog_hotel_names = uniq_df_srtd['hotel_name'].values

    best_dog_hotel_ratings = np.round(uniq_df_srtd['average_dog_rating'].values, 1)

    string_ratings = [str(rat) for rat in best_dog_hotel_ratings]

    best_dog_hotel_imgs = uniq_df_srtd['hotel_img_url'].values
    best_dog_hotel_urls = uniq_df_srtd['hotel_url'].values
    # non-positive (or NaN-cast) prices are reported as the string "nan"
    best_dog_hotel_prices = [str(prc) if prc > 0 else "nan" for prc in np.int64(np.round(uniq_df_srtd['hotel_price'].values))]
    print('best dog hotel prices')
    print(best_dog_hotel_prices)

    return best_dog_hotel_names, string_ratings, best_dog_hotel_imgs, best_dog_hotel_urls, best_dog_hotel_prices
# Ejemplo n.º 9
# 0
def splinter_scrape_bf(city, state):
    """
    Scrape BringFido lodging reviews for a city and write them to the DB.

    Walks every page of the city's lodging results, follows each hotel
    link, collects the hotel's address block and all of its reviews, and
    appends the results to the bf_reviews and bf_hotels tables.

    :param city:
    City name (lower-cased internally).

    :param state:
    Two-letter state abbreviation (lower-cased internally).
    """
    city = city.lower()
    state = state.lower()
    br = Browser()
    citystate_string = city+'_'+state+'_us/'
    url = 'http://www.bringfido.com/lodging/city/'+citystate_string
    br.visit(url)

    page = 1
    # number of paging-control spans == number of result pages
    npages = len(br.find_by_xpath('//*[@id="results_paging_controls_bottom"]/span'))

    columns = ['hotel_id',
               'hotel_img_url',
               'hotel_url',
               'hotel_name',
               'hotel_address',
               'hotel_city',
               'hotel_state',
               'hotel_rating',
               'hotel_latitude',
               'hotel_longitude',
               'review_count',
               'hotel_address',
               'business_id',
               'review_id',
               'user_id',
               'username',
               'review_title',
               'review_text',
               'review_rating',
               'review_date']

    bigdf = pd.DataFrame(columns=columns)

    # the "page == 1" clause guarantees at least one pass even when no
    # paging controls were found (npages == 0)
    while (page == 1 or page < npages):
        print('*'*70)
        print('Now on page {}'.format(page))
        # each div in the results list is one hotel listing
        archive_links = br.find_by_xpath('//*[@id="results_list"]/div')

        # per-page hotel summary data
        hotel_names = []
        text_summaries = []
        links = []
        biz_ids = []
        hotel_img_urls = []

        df = pd.DataFrame(columns=columns)

        # per-page review data (one entry per review, across all hotels)
        texts = []
        titles = []
        authors = []
        ratings = []
        hnms = []
        hiurls = []
        bids = []
        lnks = []

        # first pass: collect name/link/image/business_id for every hotel
        # listed on this results page
        for lnk in archive_links:
            hotel_names.append(lnk.find_by_xpath('div[2]/h1/a').value)
            text_summaries.append(lnk.text)
            this_link = lnk.find_by_xpath('div/h1/a')['href']
            links.append(this_link)
            hotel_img_urls.append(lnk.find_by_xpath('div/div[@class="photo_inner"]/a/img')['src'])
            # the business id is the numeric suffix of the listing's div id
            biz_ids.append(lnk['id'].split('_')[-1])

        # second pass: visit each hotel page and scrape its reviews
        for hotel_id, link in enumerate(links):
            print('*'*75)
            print('Now on {}: {}'.format(hotel_id, link))
            print('*'*75)
            br.visit(link)

            # hotel_description = br.find_by_xpath('//*[@class="body"]').text

            # scrape the address details section of the page
            details = br.find_by_xpath('//*[@class="address"]').text.split('\n')

            # now get just the address:
            address = details[0]

            # and just the city, state, country, and zip code:
            csczip = details[1]

            # and just the phone number
            # phone = details[2]

            # now separate the city, state, and zip:
            # NOTE(review): this rebinds the city/state parameters to the
            # values from the hotel page
            city, state, zipcode = csczip.strip().split(',')
            zipcode = zipcode[3:]

            #Now using correct Xpath we are fetching URL of archives
            reviews = br.find_by_xpath('//*[@class="review_container"]')

            print(reviews)
            print('')
            for rev in reviews:
                titles.append(rev.find_by_xpath('div/div[1]').text)
                authors.append(rev.find_by_xpath('div/div[2]').text)
                texts.append(rev.find_by_xpath('div/div[3]').text)
                # the star rating is the first character of the rating-image
                # filename
                ratings.append(rev.find_by_xpath('div[2]/img')['src'].split('/')[-1][0:1])
                hnms.append(hotel_names[hotel_id])
                hiurls.append(hotel_img_urls[hotel_id])
                bids.append(biz_ids[hotel_id])
                lnks.append(link)
                print(rev.find_by_xpath('div[2]/img')['src'].split('/')[-1][0:1])

        print('Number of new titles: {}'.format(len(titles)))
        print('Number of new ratings: {}'.format(len(ratings)))

        df['review_title'] = titles
        df['username'] = authors
        df['review_text'] = texts
        df['review_rating'] = ratings
        # NOTE(review): hotel_id, address, city, and state below are the
        # leftover loop values from the *last* hotel scraped on this page,
        # so the same scalar is applied to every row — looks suspect; verify
        df['hotel_id'] = hotel_id
        df['hotel_name'] = hnms
        df['hotel_url'] = lnks
        df['hotel_img_url'] = hiurls
        df['hotel_address'] = address
        df['hotel_city'] = city
        df['hotel_state'] = state
        # mean star rating across every review collected on this page
        df['hotel_rating'] = np.mean([int(rat) for rat in ratings])
        df['hotel_latitude'] = None
        df['hotel_longitude'] = None
        df['review_count'] = len(texts)
        df['review_id'] = 0
        df['user_id'] = 0
        df['business_id'] = bids

        print('new entries from this page: {}'.format(len(df)))
        # NOTE(review): DataFrame.append was removed in pandas 2.0
        # (pd.concat is the replacement) — this code requires an older pandas
        bigdf = bigdf.append(df.copy())
        page += 1
        if page < npages:
            page_timeout = True
            # NOTE(review): if the paging button never appears, this loop
            # retries forever — there is no timeout despite the flag name
            while page_timeout:
                br.visit(url)
                time.sleep(1)
                print('Now scraping page {} of {}'.format(page, npages))
                button = br.find_by_id('page_'+str(page))
                print(button)
                if len(button) > 0:
                    button.click()
                    page_timeout = False

    # split the accumulated rows into the review-table columns...
    bigdf_reviews = bigdf[['hotel_id', 'review_id', 'business_id', 'user_id',
                          'username', 'review_title', 'review_text', 'review_rating']].copy()

    # ...and the hotel-table columns (one row per business_id)
    bigdf_hotels = bigdf[['hotel_id', 'hotel_url', 'hotel_img_url', 'hotel_name',
                          'hotel_address', 'hotel_city', 'hotel_state', 'hotel_rating',
                          'hotel_latitude', 'hotel_longitude', 'business_id', 'review_count']].copy()

    bigdf_hotels.drop_duplicates(subset='business_id', inplace=True)
    # let the DB assign the primary-key ids on insert
    bigdf_hotels['hotel_id'] = None
    bigdf_reviews['review_id'] = None

    print('Number of bf reviews to add: {}'.format(len(bigdf_reviews)))

    engine = cadb.connect_aws_db(write_unicode=True)
    bigdf_reviews.to_sql('bf_reviews', engine, if_exists='append', index=False)
    bigdf_hotels.to_sql('bf_hotels', engine, if_exists='append', index=False)