def load_data():
    """Return listings joined with their neighborhood's city and traits.

    Reads every document from the ``neighborhoods`` and ``listings``
    collections of the ``airbnb`` database, discards listings whose
    ``description_raw`` is null, and merges what is left with the
    ``neighborhood``, ``city`` and ``traits`` columns of the neighborhood
    frame, keyed on ``neighborhood``.

    Returns:
        pandas.DataFrame: the merged frame. The merge uses pandas' default
        inner join, so listings without a matching neighborhood row are
        dropped. Columns present in both frames keep their listing-side
        name; the neighborhood-side copy gets a ``_copy`` suffix.
    """
    air_hood = AirBnBNeighborhood(db_name='airbnb', coll_name='neighborhoods')
    hood_df = pd.DataFrame(list(air_hood.coll.find({})))

    air_listing = AirBnBNeighborhood(db_name='airbnb', coll_name='listings')
    listing_df = pd.DataFrame(list(air_listing.coll.find({})))

    # Keep only listings that actually have a raw description.
    listing_df = listing_df[listing_df['description_raw'].notnull()]

    return listing_df.merge(
        right=hood_df[['neighborhood', 'city', 'traits']],
        on='neighborhood',
        suffixes=('', '_copy'))
def main():
    """Scrape and store every neighborhood listed in the neighborhood CSV.

    Loads NEIGHBORHOOD_FILEPATH with pandas and, for each row, calls
    ``AirBnBNeighborhood.scrape_and_insert`` with that row's
    ``neighborhood_id``, ``neighborhood``, ``neighborhood_url``,
    ``city_id`` and ``city`` values, pausing between rows.
    """
    # Columns forwarded to scrape_and_insert as same-named keyword arguments.
    fields = ('neighborhood_id', 'neighborhood', 'neighborhood_url',
              'city_id', 'city')

    scraper = AirBnBNeighborhood(db_name=DB, coll_name=COLL)
    records = pd.read_csv(NEIGHBORHOOD_FILEPATH).to_dict('records')

    for record in records:
        kwargs = dict((field, record[field]) for field in fields)
        scraper.scrape_and_insert(**kwargs)
        # Throttle requests so AirBnB does not ban the scraper.
        time.sleep(2.5)
def main():
    """Extract and add features for every document in the collection.

    Queries the collection for all document ``_id`` values, then for each
    one loads it via ``pull_from_db`` and runs ``extract_and_add_features``
    on the same ``AirBnBNeighborhood`` instance, printing a progress line
    once that id has been processed.
    """
    air_hood = AirBnBNeighborhood(db_name=DB_NAME, coll_name=COLL_NAME)

    # Only _id is projected; the id list is fully built before processing
    # starts, so the query is consumed before any document is touched.
    hood_ids = [doc['_id'] for doc in air_hood.coll.find({}, {'_id': 1})]

    for hood_id in hood_ids:
        air_hood.pull_from_db(neighborhood_id=hood_id)
        air_hood.extract_and_add_features()
        # Parenthesized single-argument form prints the same on Python 2
        # and is valid syntax on Python 3.
        print("Extracting Features for %s" % hood_id)