Example #1
0
def structure_records_total_counts():
    """Build the empty per-cluster accounting structures in module globals.

    Populates ``records_total_counts``, ``records_averages`` and
    ``records_averages_full_year`` with one slot for every
    (location_cluster, listing_cluster) pair present in the database, under
    both the "ENQUIRY" and "CANCELLED" statuses.

    Final shape: status -> location_cluster -> listing_cluster -> date data
    (``records_averages_full_year`` is keyed status -> year -> clusters).
    """
    data_thesis = database.database("Thesis")

    #records_averages / records_averages_full_year are only mutated (never
    #rebound) but are declared here for clarity; unused `max_num` removed.
    global records_total_counts, records_averages, records_averages_full_year

    #get location clusters: rows are (listing_id, location_cluster, cluster)
    location_clusters = data_thesis.get_data("SELECT `features_plain_gobig`.`listing_id`, `location_cluster`, `cluster` FROM `features_plain_gobig` INNER JOIN `listing_clusters_big_all` ON `features_plain_gobig`.`listing_id` = `listing_clusters_big_all`.`listing_id` WHERE location_cluster != -1") #trying go big for now

    years = ["2013", "2014", "2015", "2016"]
    statuses = ["ENQUIRY", "CANCELLED"]

    for year in years:
        for status in statuses:
            records_averages_full_year[status][year] = {}

    for entry in location_clusters:
        #first time this location cluster is seen
        if entry[1] not in records_total_counts["ENQUIRY"]:
            for status in statuses:
                records_total_counts[status][entry[1]] = {}
                records_averages[status][entry[1]] = {}
                for year in years:
                    records_averages_full_year[status][year][entry[1]] = {}
        #first time this listing cluster is seen within the location cluster
        if entry[2] not in records_total_counts["ENQUIRY"][entry[1]]:
            for status in statuses:
                records_total_counts[status][entry[1]][entry[2]] = time.default_date_structure()
                records_averages[status][entry[1]][entry[2]] = time.default_date_structure()
                for year in years:
                    records_averages_full_year[status][year][entry[1]][entry[2]] = {}

    data_thesis.destroy_connection()
def insert_zeros():
    """Replace None occupancy with 0 for days after a listing's first activity.

    Walks the module-global ``all_dict`` (status -> listing_id -> year ->
    day -> occupancy) and, for every listing in the `earliest_date` table,
    zero-fills days that fall strictly after that listing's earliest
    recorded date. Mutates the module-global ``listing_dict`` in place.
    """
    global listing_dict

    #get earliest_dates (might as well):
    data_thesis = database.database("Thesis")
    #rows are (listing_id, earliest_date)
    earliest_dates = data_thesis.get_data("SELECT * FROM `earliest_date`")

    for entry in earliest_dates:  #for every listing
        #fetch entry date data
        #all_dict: status, listing_id, year
        for status, all_data in all_dict.iteritems():
            #entry[0] = listing_id
            try:
                for year, days in all_data[entry[0]].iteritems():
                    for day, occupancy in days.iteritems():  #within the year
                        try:
                            #entry[1] is a date; strptime parses the "YYYY-MM-DD" day key
                            if int((datetime.datetime.strptime(
                                    day, "%Y-%m-%d").date() -
                                    entry[1]).days) > 0:
                                #if the day is after the earliest recorded date of activity
                                if occupancy is None:
                                    listing_dict[entry[0]][year][day] = 0
                        except Exception as e:
                            #best-effort: a malformed day key or date math error
                            #just skips this day (intentionally broad)
                            pass
                            #sys.exit()
            except Exception as e:  #most likely listing has no data for this year
                pass

    data_thesis.destroy_connection()
Example #3
0
def fill_cluster_data():
    _fill_cluster_listings()
    global clusters_listings, clusters_listings_with_data

    print "Finished making getting the data structures ready.  Time to find averages"
    '''
    Now we begin to find the averages per day
    '''
    data_thesis = database.database("Thesis")
    #find average for each day (pain in ass)
    for location_cluster, listing_cluster in clusters_listings.iteritems():
        for listing_cluster_id, date_data in listing_cluster.iteritems():
            for year, days_data in date_data.iteritems():  #day is a string?
                for this_day in days_data.keys():
                    #find the average
                    query = "SELECT `listing_clusters_plain`.`listing_id` FROM `listing_clusters_plain` INNER JOIN `listing_locations_DBSCAN_final` ON `listing_clusters_plain`.`listing_id` = `listing_locations_DBSCAN_final`.`listing_id` WHERE `label_id` = %s AND `cluster_id` = %s GROUP BY `listing_id`"
                    occupancy_set = data_thesis.get_data(
                        query % (location_cluster, listing_cluster_id))

                #get average and put it in
                average = _get_average(occupancy_set, this_day)
                clusters_listings[location_cluster][listing_cluster_id][year][
                    this_day] = average
                if average:
                    clusters_listings_with_data.append(
                        {location_cluster: listing_cluster})

    print "finished here's a sample"
    #print clusters_listings[1][len(clusters_listings[1].keys()) - 2]
    data_thesis.destroy_connection()
Example #4
0
def organize_data():
	thesis_data = database.database("Thesis")
	print "\nNow let's shape up our features"
	my_data = thesis_data.get_data("SELECT `all_data_better`.`listing_id`, property_title, property_type, size, description, room_description, bed_count, bathroom_boolean, bed_type, accomodates, label_id FROM `all_data_better`  INNER JOIN `dbscan_labels_full` on `dbscan_labels_full`.`listing_id` = `all_data_better`.`listing_id` INNER JOIN `listings_accomodation` ON `listings_accomodation`.`listing_id` = `all_data_better`.`listing_id` WHERE label_id IN(0, 1, 19)")
	print "I have data! "
	print my_data[1]
	thesis_data.destroy_connection()

	final = {entry[0]:{"property_title": entry[1], "property_type": entry[2], "description": entry[4], "size": entry[3], "rooms": [], "beds": {}, "bathroom_count": 0, "accomodates": entry[9],"location": entry[9]} for entry in my_data}

	for entry in my_data:
		#my_data: listing_id, property_title, property_type, size, description, room_description, bed_count, bathroom_boolean, bed_type, accomodates, label_id (location)

		if entry[5] is not None and entry[5] > ' ':
			final[entry[0]]["rooms"].append(entry[5])
		else:
			final[entry[0]]["rooms"].append("empty_content")

		#beds
		if entry[8] is not None and entry[8] > " ":
			if entry[8] not in final[entry[0]]["beds"].keys():
				final[entry[0]]["beds"][entry[8]] = 1
			else:
				final[entry[0]]["beds"][entry[8]] += 1
		else:
			if "empty_content" not in final[entry[0]]["beds"].keys():
				final[entry[0]]["beds"]["empty_content"] = 1
			else:
				final[entry[0]]["beds"]["empty_content"] += 1

			#bathroom
		if entry[7] is not None:
			final[entry[0]]["bathroom_count"] += entry[7]

	return final
def get_testing_listings(list_of_location_ids=[1]):
    """Return (listing_id, cluster_id, label_id) rows restricted to the given
    location label ids, keeping only listings that have occupancy data in
    the module-global ``my_features``.

    NOTE: the mutable default list is never mutated here, so it is safe,
    but callers should still prefer passing their own list.
    """
    global my_features
    testing_listings = []
    thesis_data = database.database("Thesis")

    #comma-separated id list for the SQL IN(...) clause (replaces the old
    #quadratic += build with a trailing-comma trim)
    query_entries = ",".join(str(x) for x in list_of_location_ids)

    query = "SELECT `listing_locations_DBSCAN_final`.`listing_id`, `listing_clusters_plain`.`cluster_id`, `listing_locations_DBSCAN_final`.`label_id` FROM `listing_locations_DBSCAN_final` INNER JOIN `listing_clusters_plain` ON `listing_locations_DBSCAN_final`.`listing_id` = `listing_clusters_plain`.`listing_id` WHERE `label_id` IN(" + query_entries + ");"

    initial_data = thesis_data.get_data(query)

    for listing_data in initial_data:
        #keep only listings with at least one year of occupancy data
        try:
            sample = my_features.json_files["occupancy_dict"][str(
                listing_data[0])]
            testing_listings.append(listing_data)
        except KeyError:
            pass

    thesis_data.destroy_connection()

    return testing_listings
def insert_zeros():
    """Replace None occupancy with 0 for days after a listing's first activity.

    Walks the module-global ``all_dict`` (status -> listing_id -> year ->
    day -> occupancy) and, for every listing in the `earliest_date` table,
    zero-fills days strictly after that listing's earliest recorded date.
    Mutates the module-global ``listing_dict`` in place.
    """
    global listing_dict

    #get earliest_dates (might as well):
    data_thesis = database.database("Thesis")
    #rows are (listing_id, earliest_date)
    earliest_dates = data_thesis.get_data("SELECT * FROM `earliest_date`")

    for entry in earliest_dates: #for every listing
        #fetch entry date data
        #all_dict: status, listing_id, year
        for status, all_data in all_dict.iteritems():
            #entry[0] = listing_id
            try:
                for year, days in all_data[entry[0]].iteritems():
                    for day, occupancy in days.iteritems(): #within the year
                        try:
                            #entry[1] is a date; strptime parses the "YYYY-MM-DD" key
                            if int( (datetime.datetime.strptime(day, "%Y-%m-%d").date() - entry[1]).days ) > 0:
                        #if the day is after the earliest recorded date of activity
                                if occupancy is None:
                                    listing_dict[entry[0]][year][day] = 0
                        except Exception as e:
                            #best-effort: malformed day key / date math error
                            #just skips this day (intentionally broad)
                            pass
                            #sys.exit()
            except Exception as e: #most likely listing has no data for this year
                pass


    data_thesis.destroy_connection()
Example #7
0
def fill_cluster_data():
    _fill_cluster_listings()
    global clusters_listings, clusters_listings_with_data

    print "Finished making getting the data structures ready.  Time to find averages"
    '''
    Now we begin to find the averages per day
    '''
    data_thesis = database.database("Thesis")
    #find average for each day (pain in ass)
    for location_cluster, listing_cluster in clusters_listings.iteritems():
        for listing_cluster_id, date_data in listing_cluster.iteritems():
            for year, days_data in date_data.iteritems(): #day is a string?
                for this_day in days_data.keys():
                #find the average
                    query = "SELECT `listing_clusters_plain`.`listing_id` FROM `listing_clusters_plain` INNER JOIN `listing_locations_DBSCAN_final` ON `listing_clusters_plain`.`listing_id` = `listing_locations_DBSCAN_final`.`listing_id` WHERE `label_id` = %s AND `cluster_id` = %s GROUP BY `listing_id`"
                    occupancy_set = data_thesis.get_data(query % (location_cluster, listing_cluster_id))

                #get average and put it in
                average = _get_average(occupancy_set, this_day)
                clusters_listings[location_cluster][listing_cluster_id][year][this_day] = average
                if average:
                    clusters_listings_with_data.append({location_cluster:listing_cluster})

    print "finished here's a sample"
    #print clusters_listings[1][len(clusters_listings[1].keys()) - 2]
    data_thesis.destroy_connection()
def get_valid_ids():
    """Load every listing_id from `listing_clusters_plain` into the
    module-global ``valid_ids`` list."""
    global valid_ids
    db = database.database("Thesis")
    rows = db.get_data("SELECT `listing_id` FROM listing_clusters_plain")
    collected = []
    for row in rows:
        collected.append(row[0])
    valid_ids = collected
    db.destroy_connection()
Example #9
0
def _get_city(location_cluster):
    """Look up the city name for a DBSCAN location-cluster label id."""
    label = int(location_cluster)
    query = "SELECT `city` FROM `listing_locations_DBSCAN_final` WHERE `listing_locations_DBSCAN_final`.`label_id` = %s GROUP BY `label_id`"
    db = database.database("Thesis")
    rows = db.get_data(query % (label))
    db.destroy_connection()
    #single-row, single-column result: the city string
    return rows[0][0]
def get_testing_listings(list_of_location_ids = [1]):
    """Return (listing_id, cluster_id, label_id) rows restricted to the given
    location label ids, keeping only listings that have occupancy data in
    the module-global ``my_features``.

    NOTE: the mutable default list is never mutated here, so it is safe,
    but callers should still prefer passing their own list.
    """
    global my_features
    testing_listings = []
    thesis_data = database.database("Thesis")

    #comma-separated id list for the SQL IN(...) clause (replaces the old
    #quadratic += build with a trailing-comma trim)
    query_entries = ",".join(str(x) for x in list_of_location_ids)

    query = "SELECT `listing_locations_DBSCAN_final`.`listing_id`, `listing_clusters_plain`.`cluster_id`, `listing_locations_DBSCAN_final`.`label_id` FROM `listing_locations_DBSCAN_final` INNER JOIN `listing_clusters_plain` ON `listing_locations_DBSCAN_final`.`listing_id` = `listing_clusters_plain`.`listing_id` WHERE `label_id` IN(" + query_entries + ");"

    initial_data = thesis_data.get_data(query)

    for listing_data in initial_data:
        #keep only listings with at least one year of occupancy data
        try:
            sample = my_features.json_files["occupancy_dict"][str(listing_data[0])]
            testing_listings.append(listing_data)
        except KeyError:
            pass

    thesis_data.destroy_connection()

    return testing_listings
def get_data():
    """Fetch confirmed-family reservations from the worldhomes database.

    BUG FIX: the original query used the simple-CASE form
    ``CASE status WHEN 'A' OR 'B' ... THEN`` — in MySQL that ORs the string
    literals as numbers (all 0), so the comparison never matched the intent.
    Rewritten as a searched CASE with ``status IN (...)``.
    """
    worldhomes_data = database.database("worldhomes")

    query = "SELECT `listing_id`,`created_at`, `checkin`, `checkout`, `status`, `id`, CASE WHEN `status` IN ('CONFIRMED', 'BLOCKEDBYCONFIRMED', 'CANCELLATIONREQUESTED', 'UNAVAILABLE', 'DOUBLEBOOKING') THEN 'CONFIRMED' END AS status FROM `reservations` WHERE `additional_description` NOT LIKE 'test ' AND `checkin` >= '2014-01-01' AND `checkout` < '2016-01-30' AND `listing_id` IS NOT NULL AND DATEDIFF(`checkin`, `created_at`) <= 365 AND DATEDIFF(`checkout`, `checkin`) >= 0 ORDER BY `status`;"
    full_data = worldhomes_data.get_data(query)
    worldhomes_data.destroy_connection()

    return full_data
def get_data():
    """Fetch confirmed-family reservations from the worldhomes database.

    BUG FIX: the original query used the simple-CASE form
    ``CASE status WHEN 'A' OR 'B' ... THEN`` — in MySQL that ORs the string
    literals as numbers (all 0), so the comparison never matched the intent.
    Rewritten as a searched CASE with ``status IN (...)``.
    """
    worldhomes_data = database.database("worldhomes")

    query = "SELECT `listing_id`,`created_at`, `checkin`, `checkout`, `status`, `id`, CASE WHEN `status` IN ('CONFIRMED', 'BLOCKEDBYCONFIRMED', 'CANCELLATIONREQUESTED', 'UNAVAILABLE', 'DOUBLEBOOKING') THEN 'CONFIRMED' END AS status FROM `reservations` WHERE `additional_description` NOT LIKE 'test ' AND `checkin` >= '2014-01-01' AND `checkout` < '2016-01-30' AND `listing_id` IS NOT NULL AND DATEDIFF(`checkin`, `created_at`) <= 365 AND DATEDIFF(`checkout`, `checkin`) >= 0 ORDER BY `status`;"
    full_data = worldhomes_data.get_data(query)
    worldhomes_data.destroy_connection()

    return full_data
Example #13
0
def insert_zeros():
    global listing_dict

    active_listing = []
    #get earliest_dates (might as well):
    data_worldhomes = database.database("worldhomes")
    earliest_dates = data_worldhomes.get_data("SELECT `id`, `created_at` FROM `listings`")
    earliest_dates_dict = {entry[0]: entry[1] for entry in earliest_dates}

    end_dates_list = data_worldhomes.get_data("SELECT `id`, `updated_at`, `deleted_at`, `active` FROM `listings`;")
    #sort into a dictionary
    end_dates = {entry[0]: {'updated_at': entry[1], 'deleted_at': entry[2], 'active': entry[3]} for entry in end_dates_list}


    for listing in listing_dict.keys(): #for every listing
        #fetch entry date data
        for year, days in listing_dict[listing].iteritems():
            for day, occupancy in days.iteritems(): #within the year
                #see if there is an end date entry
                end_date = _get_end_date(int(listing), end_dates)
                if end_date:
                #if end is still after current day, then positive number
                    days_before_end = int(
                        (end_date.date() - datetime.datetime.strptime(day, "%Y-%m-%d").date()).days)
                else:
                    days_before_end = None


                days_after_earliest = int( (datetime.datetime.strptime(day, "%Y-%m-%d").date() - earliest_dates_dict[int(listing)].date()).days)

                #even if occupancy, before official created date we make none
                if days_after_earliest:
                    if days_after_earliest < 0:
                        listing_dict[listing][year][day] = None

                if days_after_earliest and days_before_end:
                    if days_after_earliest >= 0 and days_before_end > 0:
                        if occupancy is None:
                            #print "made something 0 with days_before_end"
                            listing_dict[listing][year][day] = 0
                    if days_before_end <=0:
                        listing_dict[listing][year][day] = None

                elif days_after_earliest and not days_before_end:
                    if days_after_earliest >= 0:
                    #if the day is after the earliest recorded date of activity
                        if occupancy is None:
                                #print "made something 0 without days_before end"
                            listing_dict[listing][year][day] = 0
                    #else:
                        #print "nothing is happening"


    print "finished inserting zeros, here's what something looks like"
    print listing_dict[listing_dict.keys()[10]]


    data_worldhomes.destroy_connection()
def get_data():
    """Fetch ENQUIRY/CANCELLED reservation rows from the worldhomes database.

    Returns a list of (listing_id, checkin, checkout, status, created_at, id)
    tuples, filtered to non-test reservations in the 2013-2016 window with
    sane checkin/checkout date arithmetic.
    """
    query = "SELECT `listing_id`,`checkin`, `checkout`, `status`, `created_at`, `id` FROM `reservations` WHERE  (`additional_description` IS NULL OR (`additional_description` NOT LIKE 'test ' AND `additional_description` NOT LIKE ' test')) AND `checkin` >= '2013-01-01' AND `checkout` < '2016-01-30' AND `status` IN('ENQUIRY', 'CANCELLED') AND `listing_id` IS NOT NULL AND DATEDIFF(`checkin`, `created_at`) <= 365 AND DATEDIFF(`checkin`, `created_at`) >= 0 AND DATEDIFF(`checkout`, `checkin`) > 0 ORDER BY `status`;"

    db = database.database("worldhomes")
    db.cursor.execute(query)
    rows = list(db.cursor.fetchall())
    db.destroy_connection()

    return rows
def get_data():
    """Fetch ENQUIRY/CANCELLED reservation rows from the worldhomes database.

    Returns a list of (listing_id, checkin, checkout, status, created_at, id)
    tuples, filtered to non-test reservations in the 2013-2016 window with
    sane checkin/checkout date arithmetic.
    """
    query = "SELECT `listing_id`,`checkin`, `checkout`, `status`, `created_at`, `id` FROM `reservations` WHERE  (`additional_description` IS NULL OR (`additional_description` NOT LIKE 'test ' AND `additional_description` NOT LIKE ' test')) AND `checkin` >= '2013-01-01' AND `checkout` < '2016-01-30' AND `status` IN('ENQUIRY', 'CANCELLED') AND `listing_id` IS NOT NULL AND DATEDIFF(`checkin`, `created_at`) <= 365 AND DATEDIFF(`checkin`, `created_at`) >= 0 AND DATEDIFF(`checkout`, `checkin`) > 0 ORDER BY `status`;"

    db = database.database("worldhomes")
    db.cursor.execute(query)
    rows = list(db.cursor.fetchall())
    db.destroy_connection()

    return rows
Example #16
0
def fill_weights():
	"""Load listing-type and bed-type weights from the Thesis database
	into the module globals ``listing_weights`` and ``bed_weights``."""
	global listing_weights, bed_weights
	from library import database

	thesis_data = database.database("Thesis")

	#types_of_listings rows: weight columns at index 3 ('big') and 4 ('small')
	listing_rows = thesis_data.get_data("SELECT * FROM `types_of_listings`;")
	weights = {}
	for row in listing_rows:
		weights[row[0]] = {'small': row[4], 'big': row[3]}
	listing_weights = weights

	#bed_types rows: weight columns at index 1 ('big') and 2 ('small')
	bed_rows = thesis_data.get_data("SELECT * FROM `bed_types`;")
	weights = {}
	for row in bed_rows:
		weights[row[0]] = {"small": row[2], 'big': row[1]}
	bed_weights = weights

	thesis_data.destroy_connection()
Example #17
0
def fill_global_data(inclusive = True):
    """Reload all registered json blobs and the per-listing important dates.

    Re-reads every file registered in the module-global ``json_files``
    (occupancy_dict, date_dict, ...) via ``_load_json`` and rebuilds the
    module-global ``listing_important_dates`` from the `listings` table.
    """
    global listing_important_dates, json_files, feature_data_space, listing_cluster_normalisation

    for name in json_files.keys():
        json_files[name] = _load_json(name, inclusive)

    #per-listing lifecycle dates
    db = database.database("worldhomes")
    rows = db.get_data("SELECT `id`, `created_at`, `updated_at`, `deleted_at`, `active` FROM `listings`")
    dates = {}
    for row in rows:
        dates[row[0]] = {
            "created_at": row[1],
            "updated_at": row[2],
            "deleted_at": row[3],
            "active": row[4],
        }
    listing_important_dates = dates

    db.destroy_connection()
def fill_global_data(inclusive = True):
    """Reload all registered json blobs and the per-listing important dates.

    Re-reads every file registered in the module-global ``json_files``
    (occupancy_dict, date_dict, ...) via ``_load_json`` and rebuilds the
    module-global ``listing_important_dates`` from the `listings` table.
    """
    global listing_important_dates, json_files, feature_data_space, listing_cluster_normalisation

    for name in json_files.keys():
        json_files[name] = _load_json(name, inclusive)

    #per-listing lifecycle dates
    db = database.database("worldhomes")
    rows = db.get_data("SELECT `id`, `created_at`, `updated_at`, `deleted_at`, `active` FROM `listings`")
    dates = {}
    for row in rows:
        dates[row[0]] = {
            "created_at": row[1],
            "updated_at": row[2],
            "deleted_at": row[3],
            "active": row[4],
        }
    listing_important_dates = dates

    db.destroy_connection()
Example #19
0
def main():
    query = "SELECT `listing_id`,`checkin`, `checkout`, `created_at`, `additional_description` FROM `reservations` WHERE (`additional_description` IS NULL OR (`additional_description` NOT LIKE 'test ' AND `additional_description` NOT LIKE ' test')) AND `checkin` >= '2013-01-01' AND `checkout` < '2016-1-29' AND `listing_id` IS NOT NULL AND `status` IN('CONFIRMED','BLOCKEDBYCONFIRMED','UNAVAILABLE', 'DOUBLEBOOKING', 'CANCELLATIONREQUESTED') AND DATEDIFF(`checkin`, `created_at`) <= 365 AND DATEDIFF(`checkin`, `created_at`) > 0;"

    woldhomes_data = database.database("worldhomes")
    my_data = woldhomes_data.get_data(query)

    thesis_data = database.database("Thesis")
    valid_listings = thesis_data.get_data(
        "SELECT `listing_id` FROM listing_locations_DBSCAN_final WHERE label_id IN(0, 1, 19);"
    )
    valid_listings = [entry[0] for entry in valid_listings]

    for entry in my_data:
        process_entry(entry, valid_listings)

    print "finished processing entries in my data, now need to insert 0's"

    insert_zeros()

    print "now let's find the average cluster data for each day"
    fill_cluster_data()

    global clusters_listings
    #
    print "at least putting things into a json"

    cluster_filepath = "data/cluster_averages.json"
    occupancy_filepath = "data/occupancy_dict.json"

    with open(cluster_filepath, 'w') as outfile:
        json.dump(clusters_listings, outfile)

    with open(occupancy_filepath, 'w') as outfile2:
        json.dump(listing_dict, outfile2)

    print "Finished saving the average cluster listings data.  Onto occupancy_dict"

    woldhomes_data.destroy_connection()
Example #20
0
def get_data(custom_prices = True):
    """Fetch pricing data from the worldhomes database.

    Also (re)builds the module-global ``earliest_dates`` map of
    listing_id -> created_at. With ``custom_prices`` True, returns
    per-date-range custom price rows; otherwise returns default
    (id, rate) pairs for master listings.
    """
    global earliest_dates
    worldhomes_data = database.database("worldhomes")
    created_rows = worldhomes_data.get_data("SELECT `id`, `created_at` FROM `listings`")
    earliest_dates = {row[0]: row[1] for row in created_rows}

    if custom_prices is True:
        query = 'SELECT `listing_id`,`date_start`, `date_end`, `rate` FROM `listing_custom_prices` WHERE date_end >= "2013-01-01" AND date_start < "2016-1-30" AND date_start > "2012-01-01" ORDER BY `date_start` ASC;'
    else:
        query = 'SELECT `id`,`rate` FROM `listings` WHERE master_id IS NULL;'

    result = worldhomes_data.get_data(query)
    worldhomes_data.destroy_connection()
    return result
Example #21
0
def main():
    query = "SELECT `listing_id`,`checkin`, `checkout`, `created_at`, `additional_description` FROM `reservations` WHERE (`additional_description` IS NULL OR (`additional_description` NOT LIKE 'test ' AND `additional_description` NOT LIKE ' test')) AND `checkin` >= '2013-01-01' AND `checkout` < '2016-1-29' AND `listing_id` IS NOT NULL AND `status` IN('CONFIRMED','BLOCKEDBYCONFIRMED','UNAVAILABLE', 'DOUBLEBOOKING', 'CANCELLATIONREQUESTED') AND DATEDIFF(`checkin`, `created_at`) <= 365 AND DATEDIFF(`checkin`, `created_at`) > 0;"

    woldhomes_data = database.database("worldhomes")
    my_data = woldhomes_data.get_data(query)

    thesis_data = database.database("Thesis")
    valid_listings = thesis_data.get_data("SELECT `listing_id` FROM listing_locations_DBSCAN_final WHERE label_id IN(0, 1, 19);")
    valid_listings = [entry[0] for entry in valid_listings]

    for entry in my_data:
        process_entry(entry, valid_listings)

    print "finished processing entries in my data, now need to insert 0's"

    insert_zeros()


    print "now let's find the average cluster data for each day"
    fill_cluster_data()

    global clusters_listings
    #
    print "at least putting things into a json"

    cluster_filepath = "data/cluster_averages.json"
    occupancy_filepath = "data/occupancy_dict.json"

    with open(cluster_filepath, 'w') as outfile:
        json.dump(clusters_listings, outfile)

    with open(occupancy_filepath, 'w') as outfile2:
        json.dump(listing_dict, outfile2)

    print "Finished saving the average cluster listings data.  Onto occupancy_dict"

    woldhomes_data.destroy_connection()
Example #22
0
def main():
    global all_prices_calendar, valid_listings

    all_data = get_data()

    #make structure and only do it for listing_ids that have data
    thesis_data = database.database("Thesis")
    pot_listings = thesis_data.get_data(
        "SELECT `listing_id` FROM listing_clusters_plain")
    valid_listings = [entry[0] for entry in pot_listings]
    thesis_data.destroy_connection()

    for listing_id in valid_listings:
        all_prices_calendar[listing_id] = time.default_date_structure()

    #fill structure:
    for item in all_data:
        process_entry(item)

    #get default_rate data:
    #just id, rate
    all_data = get_data(False)

    for entry in all_data:
        listing_id = entry[0]
        if listing_id not in all_prices_calendar.keys():
            #need to get like this:
            #listing_id`,`date_start`, `date_end`, `rate`
            all_prices_calendar[listing_id] = time.default_date_structure()

            to_process = [
                listing_id,
                datetime.date(2014, 1, 1),
                datetime.date(2016, 1, 29), entry[1]
            ]

            is_none = process_entry(to_process)
            if is_none is True:
                del all_prices_calendar[listing_id]

    #fill the ends of the dates and really make sure there are no 0's or None
    for listing_id, default_date_dict in all_prices_calendar.iteritems():
        fill_nones(listing_id, datetime.date(2016, 1, 29))

    #save it
    with open("data/price_dict.json", 'w') as jsonFile:
        json.dump(all_prices_calendar, jsonFile)

    print "Finished!!!  "
Example #23
0
def get_data(custom_prices=True):
    """Fetch pricing data from the worldhomes database.

    Also (re)builds the module-global ``earliest_dates`` map of
    listing_id -> created_at. With ``custom_prices`` True, returns
    per-date-range custom price rows; otherwise returns default
    (id, rate) pairs for master listings.
    """
    global earliest_dates
    worldhomes_data = database.database("worldhomes")
    created_rows = worldhomes_data.get_data(
        "SELECT `id`, `created_at` FROM `listings`")
    earliest_dates = {row[0]: row[1] for row in created_rows}

    if custom_prices is True:
        query = 'SELECT `listing_id`,`date_start`, `date_end`, `rate` FROM `listing_custom_prices` WHERE date_end >= "2013-01-01" AND date_start < "2016-1-30" AND date_start > "2012-01-01" ORDER BY `date_start` ASC;'
    else:
        query = 'SELECT `id`,`rate` FROM `listings` WHERE master_id IS NULL;'

    result = worldhomes_data.get_data(query)
    worldhomes_data.destroy_connection()
    return result
def save_to_database(city, feature_header, feature_varience, experiment_name):
    """Replace the stored feature variances for one city/experiment pair.

    Deletes existing `feature_varience` rows for the city (mapped from its
    numeric label via ``location_dict``) and the experiment, then inserts
    one row per feature name with its variance.

    NOTE(review): queries are built by string interpolation. Inputs appear
    to be internal (dict-mapped city, fixed feature names), but if
    experiment_name can ever come from outside, this is injectable —
    confirm whether thesis_data.execute supports parameterized queries.
    """
    thesis_data = database.database("Thesis")

    location_dict = {1: "Barcelona", 0: "Rome", 19: "Amsterdam"}
    #clear table
    query = "DELETE FROM `feature_varience` WHERE `city` = '" + location_dict[city] + "' AND `type_standardsiation` = '" + experiment_name + "';"
    thesis_data.execute(query)
    #table: feature_variance
    #city, feature_name, variance, type_standardsiation
    query = "INSERT INTO `feature_varience` VALUES ('%s', '%s', %s, '%s');"
    for x, entry in enumerate(feature_header):

        #one row per feature: (city, feature_name, variance, experiment)
        to_insert = (location_dict[city], entry, feature_varience[x], experiment_name)

        thesis_data.execute(query % to_insert)

    print "Finished saving to database"
Example #25
0
def main():
	"""Compute and persist listing feature vectors.

	Reads the plain listing features (only rows with a known accomodates
	value), computes both the 'plain' and 'fancy' (0-1 normalized) feature
	sets, and saves each to its database table.
	"""
	thesis_data = database.database("Thesis")
	query = "SELECT `listing_id`, `property_type`, `room_count`, `bathroom_count`, `beds`, `accomodates` FROM `listing_plain_features` WHERE `accomodates` IS NOT NULL;"
	rows = thesis_data.get_data(query)

	#shape rows into a per-listing feature dict
	data_dict = {}
	for row in rows:
		data_dict[row[0]] = {
			"property_type": row[1],
			"room_count": row[2],
			"bathroom_count": row[3],
			"beds": row[4],
			'accommodates': row[5],
		}

	#set weights
	fill_weights()

	features_plain = calculate_features(data_dict, "plain")
	save_calculated_features(features_plain, "features_plain_gobig")

	#fancy: all dimensions normalized to the 0-1 range
	features_fancy = calculate_features(data_dict, "fancy")
	save_calculated_features(features_fancy, "features_fancy")
Example #26
0
def save_calculated_features(feature_list, database_name):
	"""Replace the contents of `database_name` with the given feature rows.

	Clears the table, then inserts one row per entry in ``feature_list``
	(each entry is a sequence of values forming one row).

	IMPROVEMENT: the VALUES list is now built with ", ".join instead of
	the quadratic += concatenation plus trailing-comma trim (identical
	output string).
	"""
	thesis_data = database.database("Thesis")
	thesis_data.clear_table(database_name)

	for entry in feature_list:
		insert_string = ", ".join(str(item) for item in entry)
		#NOTE(review): values are interpolated straight into SQL — fine
		#for numeric features, but would break/inject on arbitrary strings
		query = "INSERT INTO `" + database_name + "` VALUES(" + insert_string + ");"

		thesis_data.execute(query)

	thesis_data.destroy_connection()
def main():
    """Build the monte-carlo reservation dict and dump it to json.

    Creates a default date structure for every valid listing, folds all
    reservation rows for those listings into it via ``process_data``, and
    saves data/monte_carlo_reservation_dict.json.

    IMPROVEMENT: membership was tested against ``.keys()``, which builds a
    list on every check in Python 2 (O(n) per row); direct dict membership
    is O(1).
    """
    global full_reservation_data

    all_data = get_data()

    #get valid_listings and make reservation structure
    thesis_data = database.database("Thesis")
    valid_listings = thesis_data.get_data("SELECT `listing_id` FROM     listing_clusters_plain")
    full_reservation_data = {entry[0]: time.default_date_structure() for entry in valid_listings}

    for entry in all_data:
        if entry[0] in full_reservation_data:
            process_data(entry)

    with open("data/monte_carlo_reservation_dict.json", 'w') as outFile:
        json.dump(full_reservation_data, outFile)

    thesis_data.destroy_connection()
Example #28
0
def main():
    global all_prices_calendar, valid_listings

    all_data = get_data()

    #make structure and only do it for listing_ids that have data
    thesis_data = database.database("Thesis")
    pot_listings = thesis_data.get_data("SELECT `listing_id` FROM listing_clusters_plain")
    valid_listings = [entry[0] for entry in pot_listings]
    thesis_data.destroy_connection()

    for listing_id in valid_listings:
        all_prices_calendar[listing_id] = time.default_date_structure()

    #fill structure:
    for item in all_data:
        process_entry(item)

    #get default_rate data:
    #just id, rate
    all_data = get_data(False)

    for entry in all_data:
        listing_id = entry[0]
        if listing_id not in all_prices_calendar.keys():
            #need to get like this:
            #listing_id`,`date_start`, `date_end`, `rate`
            all_prices_calendar[listing_id] = time.default_date_structure()

            to_process = [listing_id, datetime.date(2014, 1, 1), datetime.date(2016, 1, 29), entry[1]]

            is_none = process_entry(to_process)
            if is_none is True:
                del all_prices_calendar[listing_id]

    #fill the ends of the dates and really make sure there are no 0's or None
    for listing_id, default_date_dict in all_prices_calendar.iteritems():
        fill_nones(listing_id, datetime.date(2016, 1, 29))

    #save it
    with open("data/price_dict.json", 'w') as jsonFile:
        json.dump(all_prices_calendar, jsonFile)

    print "Finished!!!  "
Example #29
0
def _fill_cluster_listings():
    """Initialize the module-global ``clusters_listings`` skeleton.

    Shape: location_cluster -> {listing_cluster -> default date structure
    (year -> {day: percentage occupied})}, covering every pair present in
    the database (noise label -1 excluded).

    IMPROVEMENT: membership tests used ``.keys()``, which builds a list on
    every check in Python 2 (O(n) per row); direct dict membership is O(1).
    """
    data_thesis = database.database("Thesis")
    global clusters_listings
    #get location clusters: rows are (listing_id, location_cluster, cluster)
    location_clusters = data_thesis.get_data("SELECT `listing_clusters_plain`.`listing_id`,`listing_locations_DBSCAN_final`.`label_id`, `listing_clusters_plain`.`cluster_id` FROM `listing_clusters_plain` INNER JOIN `listing_locations_DBSCAN_final` ON `listing_locations_DBSCAN_final`.`listing_id` = `listing_clusters_plain`.`listing_id` WHERE label_id != -1;") #trying go big for now

    for entry in location_clusters:
        #first time this location cluster is seen
        if entry[1] not in clusters_listings:
            clusters_listings[entry[1]] = {}
        #first time this listing cluster is seen within the location cluster
        if entry[2] not in clusters_listings[entry[1]]:
            clusters_listings[entry[1]][entry[2]] = default_date_structure()

    data_thesis.destroy_connection()
def main():
    """Build the per-listing reservation structure and dump it to JSON.

    Only listings present in listing_clusters_plain are kept; every raw
    record whose listing id survives the filter is folded in via
    process_data().
    """
    global full_reservation_data

    all_data = get_data()

    # restrict to listings that made it into the plain clustering
    thesis_data = database.database("Thesis")
    valid_listings = thesis_data.get_data(
        "SELECT `listing_id` FROM     listing_clusters_plain")

    full_reservation_data = {}
    for row in valid_listings:
        full_reservation_data[row[0]] = time.default_date_structure()

    for entry in all_data:
        if entry[0] in full_reservation_data:
            process_data(entry)

    with open("data/monte_carlo_reservation_dict.json", 'w') as outFile:
        json.dump(full_reservation_data, outFile)

    thesis_data.destroy_connection()
Example #31
0
def main():
    from library import database  #don't know why all of the sudden this needs to be in the function
    data_thesis = database.database("Thesis")

    query_plain = "SELECT * FROM `features_plain_gobig`;"
    query_fancy = "SELECT * FROM `features_fancy`;"

    data_plain = data_thesis.get_data(query_plain)
    data_fancy = data_thesis.get_data(query_fancy)

    #sorting data
    go_plain = [[float(item) for item in entry[1:]] for entry in data_plain]
    go_plain_identification = [entry[0] for entry in data_plain]

    go_fancy = [[float(item) for item in entry[1:]] for entry in data_fancy]
    go_fancy_identification = [entry[0] for entry in data_fancy]

    print "Let's try to isolate clusters by location!"

    #_DBSCAN(cluster_data, identification, db)
    plain_final = _Dirichlet(go_plain, go_plain_identification)
    fancy_final = _Dirichlet(go_fancy, go_fancy_identification)
    '''
	go big: 11 clusters (full data)
	go small = 13 clusters (full data)
	data: listing_id, cluster
	'''

    #save data
    data_thesis.clear_table("listing_clusters_fancy")
    data_thesis.clear_table("listing_clusters_plain")

    for x in range(0, len(plain_final)):
        data_thesis.add_entry_to_table("listing_clusters_plain",
                                       plain_final[x])
        data_thesis.add_entry_to_table("listing_clusters_fancy",
                                       fancy_final[x])

    data_thesis.destroy_connection()
Example #32
0
def main():
    from library import database  # don't know why all of the sudden this needs to be in the function

    data_thesis = database.database("Thesis")

    query_plain = "SELECT * FROM `features_plain_gobig`;"
    query_fancy = "SELECT * FROM `features_fancy`;"

    data_plain = data_thesis.get_data(query_plain)
    data_fancy = data_thesis.get_data(query_fancy)

    # sorting data
    go_plain = [[float(item) for item in entry[1:]] for entry in data_plain]
    go_plain_identification = [entry[0] for entry in data_plain]

    go_fancy = [[float(item) for item in entry[1:]] for entry in data_fancy]
    go_fancy_identification = [entry[0] for entry in data_fancy]

    print "Let's try to isolate clusters by location!"

    # _DBSCAN(cluster_data, identification, db)
    plain_final = _Dirichlet(go_plain, go_plain_identification)
    fancy_final = _Dirichlet(go_fancy, go_fancy_identification)

    """
	go big: 11 clusters (full data)
	go small = 13 clusters (full data)
	data: listing_id, cluster
	"""

    # save data
    data_thesis.clear_table("listing_clusters_fancy")
    data_thesis.clear_table("listing_clusters_plain")

    for x in range(0, len(plain_final)):
        data_thesis.add_entry_to_table("listing_clusters_plain", plain_final[x])
        data_thesis.add_entry_to_table("listing_clusters_fancy", fancy_final[x])

    data_thesis.destroy_connection()
Example #33
0
def save_to_database(table_name, experiment_name, city_name, full_dict):
    thesis_data = database.database("Thesis")

    #delete similar entries
    query = "DELETE FROM `" + table_name + "` WHERE `city` = '" + city_name + "' AND `experiment` = '" + experiment_name + "';"
    #print query
    thesis_data.execute(query)

    print "saving to database " + table_name + " experiment results: " + experiment_name

    #put entries in, then the keys are lists and what I want to store are the true_true,
    if full_dict and isinstance(full_dict.keys()[0], long):
        insert_query = "INSERT INTO " + table_name + "  VALUES('%s','%s',%s,'%s',%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        for listing_id, full_records in full_dict.iteritems():
            #experiment ,  city , listing_id, method, true_true, true_false, false_true, false_false, occupancy_precision, occupancy_recall, empty_precision, empty_recall, occupancy_fOne, empty_fOne, correct_overall
            for method, method_results in full_records.iteritems():
                to_insert = [experiment_name, city_name, listing_id, method]
                for this_thing in ["true_true", "true_false", "false_true", "false_false", "occupancy_precision", "occupancy_recall", "empty_precision", "empty_recall", "occupancy_fOne", "empty_fOne", "correct_overall"]:
                    if method_results[this_thing]:
                        to_insert.append(method_results[this_thing])
                    else:
                        to_insert.append("null")
                #print (insert_query % to_insert)
                thesis_data.execute(insert_query % tuple(to_insert))
    elif full_dict:
        insert_query = "INSERT INTO " + table_name + " VALUES('%s','%s','%s',%s,%s,%s,%s,%s, %s)"
        #experiment ,  city ,  method, occupancy_precision, occupancy_recall, empty_precision, empty_recall, occupancy_fOne, empty_fOne,
        for method, method_results in full_dict.iteritems():
            to_insert = [experiment_name, city_name, method]
            for this_thing in ["occupancy_precision", "occupancy_recall", "empty_precision", "empty_recall", "occupancy_fOne", "empty_fOne"]:
                if method_results[this_thing]:
                    to_insert.append(method_results[this_thing])
                else:
                    to_insert.append("null")

            thesis_data.execute(insert_query % tuple(to_insert))

    thesis_data.destroy_connection()
def save_to_database(table_name, experiment_name, city_name, full_dict):
    thesis_data = database.database("Thesis")

    #delete similar entries
    query = "DELETE FROM `" + table_name + "` WHERE `city` = '" + city_name + "' AND `experiment` = '" + experiment_name + "';"
    #print query
    thesis_data.execute(query)

    print "saving to database " + table_name + " experiment results: " + experiment_name

    #put entries in, then the keys are lists and what I want to store are the true_true,
    if full_dict and isinstance(full_dict.keys()[0], long):
        insert_query = "INSERT INTO " + table_name + "  VALUES('%s','%s',%s,'%s',%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        for listing_id, full_records in full_dict.iteritems():
            #experiment ,  city , listing_id, method, true_true, true_false, false_true, false_false, occupancy_precision, occupancy_recall, empty_precision, empty_recall, occupancy_fOne, empty_fOne, correct_overall
            for method, method_results in full_records.iteritems():
                to_insert = [experiment_name, city_name, listing_id, method]
                for this_thing in ["true_true", "true_false", "false_true", "false_false", "occupancy_precision", "occupancy_recall", "empty_precision", "empty_recall", "occupancy_fOne", "empty_fOne", "correct_overall"]:
                    if method_results[this_thing]:
                        to_insert.append(method_results[this_thing])
                    else:
                        to_insert.append("null")
                #print (insert_query % to_insert)
                thesis_data.execute(insert_query % tuple(to_insert))
    elif full_dict:
        insert_query = "INSERT INTO " + table_name + " VALUES('%s','%s','%s',%s,%s,%s,%s,%s, %s)"
        #experiment ,  city ,  method, occupancy_precision, occupancy_recall, empty_precision, empty_recall, occupancy_fOne, empty_fOne,
        for method, method_results in full_dict.iteritems():
            to_insert = [experiment_name, city_name, method]
            for this_thing in ["occupancy_precision", "occupancy_recall", "empty_precision", "empty_recall", "occupancy_fOne", "empty_fOne"]:
                if method_results[this_thing]:
                    to_insert.append(method_results[this_thing])
                else:
                    to_insert.append("null")

            thesis_data.execute(insert_query % tuple(to_insert))

    thesis_data.destroy_connection()
Example #35
0
def _fill_cluster_listings():
    """Populate the global clusters_listings skeleton.

    Resulting shape: {location_label: {listing_cluster: date_structure}};
    rows with a DBSCAN label of -1 are filtered out by the SQL itself.
    """
    global clusters_listings
    db = database.database("Thesis")
    rows = db.get_data(
        "SELECT `listing_clusters_plain`.`listing_id`,`listing_locations_DBSCAN_final`.`label_id`, `listing_clusters_plain`.`cluster_id` FROM `listing_clusters_plain` INNER JOIN `listing_locations_DBSCAN_final` ON `listing_locations_DBSCAN_final`.`listing_id` = `listing_clusters_plain`.`listing_id` WHERE label_id != -1;"
    )  # trying go big for now

    # each row: (listing_id, location_label, listing_cluster)
    for _listing_id, location_label, listing_cluster in rows:
        per_location = clusters_listings.setdefault(location_label, {})
        if listing_cluster not in per_location:
            per_location[listing_cluster] = default_date_structure()

    db.destroy_connection()
Example #36
0
def save_general_features(full_listing_data):
	thesis_data = database.database("Thesis")
	#listing_id INT, property_title TEXT, description TEXT, property_type INT, size INT, room_count INT, bathroom_count INT, beds TEXT

	#truncate
	thesis_data.clear_table("listing_plain_features")
	query = "INSERT INTO `listing_plain_features` VALUES(%s, \"%s\", '%s', %s,%s, %s, %s, \"%s\", %s)"
	for listing_id, listing_data in full_listing_data.iteritems():
		beds = str(listing_data["beds"])
		#print query % (listing_id, listing_data["property_title"], listing_data["description"], listing_data["property_type"], listing_data["size"], len(listing_data["rooms"]), listing_data["bathroom_count"], beds)
		if listing_data["property_title"]:
			title = listing_data["property_title"]
		else:
			title = "null"
		if listing_data["description"]:
			description = listing_data["description"]
		else:
			description = "null"
		if listing_data["property_type"]:
			property_type = listing_data["property_type"]
		else:
			property_type = "null"
		if listing_data["size"]:
			size = listing_data["size"]
		else:
			size = "null"
		if listing_data["rooms"]:
			room_count = len(listing_data["rooms"])
		else:
			room_count = "null"
		if listing_data["accomodates"]:
			accomodates = listing_data["accomodates"]
		else:
			accomodates = "null"

		thesis_data.execute(query % (listing_id, title, description, property_type, size, room_count, listing_data["bathroom_count"], beds, accomodates))
		print "saved entry for, ", listing_id
with open("data/monte_carlo_reservation_dict.json") as jsonFile:
        all_reservations_dict = json.load(jsonFile)

with open('data/k-means_season_clusters.json') as jsonFile:
    k_means = json.load(jsonFile)
with open ('data/occupancy_dict.json') as jsonFile:
    occupancy_dict = json.load(jsonFile)

listing_location = common_database_functions.location_listing_pairings(True)

#final: listing_id: day: {reservations: {}, k_cluster: , occupancy: }
final = {}

#find valid listings with the right created-at dates
worldhomes_data = database.database("worldhomes")
enough_data_listings = worldhomes_data.get_data("SELECT `id` FROM `listings` WHERE `created_at` > '2013-12-31';")
enough_data_listings = [entry[0] for entry in enough_data_listings]

valid_ids = [this_id for this_id in occupancy_dict.keys() if int(this_id) in enough_data_listings]

for listing_id in valid_ids:
    final[listing_id] = {}
    for day in my_time._daterange(datetime.date(2014, 1, 1) , datetime.date(2016, 1,30) ): #does not include the outer bound
        year = str(day.year)

        final[listing_id][str(day)] = {}
        try:
            if all_reservations_dict[listing_id][year][str(day)]:
                final[listing_id][str(day)]['reservations'] = all_reservations_dict[listing_id][year][str(day)]
            else:
Example #38
0
def insert_zeros():
    global listing_dict

    active_listing = []
    #get earliest_dates (might as well):
    data_worldhomes = database.database("worldhomes")
    earliest_dates = data_worldhomes.get_data(
        "SELECT `id`, `created_at` FROM `listings`")
    earliest_dates_dict = {entry[0]: entry[1] for entry in earliest_dates}

    end_dates_list = data_worldhomes.get_data(
        "SELECT `id`, `updated_at`, `deleted_at`, `active` FROM `listings`;")
    #sort into a dictionary
    end_dates = {
        entry[0]: {
            'updated_at': entry[1],
            'deleted_at': entry[2],
            'active': entry[3]
        }
        for entry in end_dates_list
    }

    for listing in listing_dict.keys():  #for every listing
        #fetch entry date data
        for year, days in listing_dict[listing].iteritems():
            for day, occupancy in days.iteritems():  #within the year
                #see if there is an end date entry
                end_date = _get_end_date(int(listing), end_dates)
                if end_date:
                    #if end is still after current day, then positive number
                    days_before_end = int(
                        (end_date.date() - datetime.datetime.strptime(
                            day, "%Y-%m-%d").date()).days)
                else:
                    days_before_end = None

                days_after_earliest = int(
                    (datetime.datetime.strptime(day, "%Y-%m-%d").date() -
                     earliest_dates_dict[int(listing)].date()).days)

                #even if occupancy, before official created date we make none
                if days_after_earliest:
                    if days_after_earliest < 0:
                        listing_dict[listing][year][day] = None

                if days_after_earliest and days_before_end:
                    if days_after_earliest >= 0 and days_before_end > 0:
                        if occupancy is None:
                            #print "made something 0 with days_before_end"
                            listing_dict[listing][year][day] = 0
                    if days_before_end <= 0:
                        listing_dict[listing][year][day] = None

                elif days_after_earliest and not days_before_end:
                    if days_after_earliest >= 0:
                        #if the day is after the earliest recorded date of activity
                        if occupancy is None:
                            #print "made something 0 without days_before end"
                            listing_dict[listing][year][day] = 0
                    #else:
                    #print "nothing is happening"

    print "finished inserting zeros, here's what something looks like"
    print listing_dict[listing_dict.keys()[10]]

    data_worldhomes.destroy_connection()
Example #39
0
def fill_records_total_counts_data():
    structure_records_total_counts()
    global records, records_total_counts, max_num

    print "Finished making getting the data structures ready"
    '''
    Now we begin to find the averages per day
    '''
    data_thesis = database.database("Thesis")
    #find average for each day (pain in ass)
    for filename in ["ENQUIRY", "CANCELLED"]:
        print "on: ", filename
    #for filename in ["ENQUIRY"]:
        for location_cluster, listing_cluster in records_total_counts[filename].iteritems():
            print "On location, ", location_cluster
        #for location_cluster in [1]:
            #print "On location: ", location_cluster
            for listing_cluster_id, date_data in records_total_counts[filename][location_cluster].iteritems():
            #for x in range(1,5):
                for year, days_data in date_data.iteritems(): #day is a string?
                #print "On lisitng_cluster, ", x

                #listing_cluster_id = x
                #for year in ["2013"]:
                    for this_day, day_list in days_data.iteritems():
                        days_data[this_day] = []
                #find the average
                        query = "SELECT `features_plain_gobig`.`listing_id` FROM `features_plain_gobig` INNER JOIN `listing_clusters_big_all` ON `features_plain_gobig`.`listing_id` = `listing_clusters_big_all`.`listing_id` WHERE `location_cluster` = %s AND `cluster` = %s GROUP BY `listing_id`"
                        listing_id_set = data_thesis.get_data(query % (location_cluster, listing_cluster_id))

                        #type check
                        if isinstance(listing_id_set[0], tuple) or isinstance(listing_id_set[0], list):
                            listing_id_set = [entry[0] for entry in listing_id_set]

                        #process
                        for listing_id in listing_id_set:
                            try:
                                days_data[this_day].append(len( records[filename][str(listing_id)][year][this_day].keys()) )
                                #print "adding entry for cluster", x, ": ", this_day, ": ", days_data[this_day]

                            except Exception:
                                #there's nothing on this day in the database

                                #check to see if active date
                                try:
                                    date_activity = records["occupancy_dict"][str(listing_id)][year][this_day]
                                    if date_activity == 0:
                                        days_data[this_day].append(0)
                                except KeyError: #there is no occupancy data, let's delete this entire listing
                                    pass

                    #do average calculations here in the year section
                    print "doing averages for year, ", year
                    to_add = fill_records_averages(days_data)

                    records_averages[filename][location_cluster][listing_cluster_id][year] = fill_records_averages(days_data)
                    if to_add is False:
                        del records_averages_full_year[filename][year][location_cluster][listing_cluster_id]
                    else:
                        "Huzzah some records for year, ", year, "location, ", location_cluster
                        records_averages_full_year[filename][year][location_cluster][listing_cluster_id][year] = to_add

    data_thesis.destroy_connection()
Example #40
0
def _get_city(location_cluster):
    """Return the city name recorded for the given DBSCAN location label."""
    db = database.database("Thesis")
    rows = db.get_data(
        "SELECT `city` FROM `listing_locations_DBSCAN_final` WHERE `listing_locations_DBSCAN_final`.`label_id` = %s GROUP BY `label_id`" % (int(location_cluster)))
    db.destroy_connection()
    # single-row, single-column result
    return rows[0][0]
def get_valid_ids():
    """Load every listing id from listing_clusters_plain into the
    module-level valid_ids list."""
    global valid_ids
    db = database.database("Thesis")
    rows = db.get_data("SELECT `listing_id` FROM listing_clusters_plain")
    valid_ids = []
    for row in rows:
        valid_ids.append(row[0])
    db.destroy_connection()