import json
import math
import operator
import os
import sqlite3

import pandas as pd


def query_yelp_with_year_and_zip(gentri_chunk_cache_path, yelp_db, res_cache_path, chunk_id, tol):
    # `tol` is the total number of chunks, used only for the progress message.
    gentri_chunk_cache = cache_load(gentri_chunk_cache_path)
    if not gentri_chunk_cache:
        # seperate_gentri_to_chunks returns a chunk count, not the cache itself,
        # so build the chunks first and then reload the cache file.
        seperate_gentri_to_chunks(gentrification_db_path, gentri_chunk_cache_path)
        gentri_chunk_cache = cache_load(gentri_chunk_cache_path)
    gentri_rows = gentri_chunk_cache["chunk" + str(chunk_id)]
    conn = sqlite3.connect(yelp_db)
    c = conn.cursor()
    cache = cache_load(res_cache_path)
    for row in gentri_rows:
        year = row[0]
        zipcode = row[1]
        gentri_status = row[2]
        # Reviews are sharded into one table per year (yelp_<year>).
        state = "SELECT review_id, user_id, res_id FROM yelp_" + str(year)
        state += " WHERE year = ? AND zipcode = ?"
        c.execute(state, (year, zipcode))
        res = c.fetchall()
        key = str(year) + "_" + str(zipcode)
        print(key)
        val = dict()
        val['gentri_status'] = gentri_status
        val['yelp_info'] = res
        cache[key] = val
    # Flush the accumulated results once per chunk.
    cache_write(res_cache_path, cache)
    conn.close()
    print(str(chunk_id) + " of " + str(tol) + " finished!")
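# Every function in this module goes through a pair of JSON cache helpers,
# cache_load and cache_write, defined elsewhere in the repo, and module-level
# constants such as gentrification_db_path, yelp_db_path, zillow_db_path,
# sql_cache_path, and cols_opt are likewise assumed to be defined elsewhere.
# A minimal sketch of the behavior the helpers are assumed to have (load a
# dict from a JSON file, returning {} when the file is missing; dump a dict
# back out); treat this as an assumption, not the repo's actual code.

def cache_load(cache_path):
    # Assumed: return the cached dict, or {} if the cache file does not exist.
    if not os.path.exists(cache_path):
        return {}
    with open(cache_path) as fp:
        return json.load(fp)


def cache_write(cache_path, cache):
    # Assumed: overwrite the cache file with the given dict.
    with open(cache_path, 'w') as fp:
        json.dump(cache, fp)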
def category_feature_extract():
    merge_yelp_cache = cache_load(yelp_merge_gentr_cache)
    res_cache = cache_load(res_json_path)
    # Despite the name, this tallies how often each category lands in the
    # top 10% of a single record's distribution, across all records.
    category_dict_top20 = dict()
    category_distribution_cache = dict()  # key -> "year_zipcode", value -> category distribution dict
    print("fetch data from sql")
    conn = sqlite3.connect(yelp_db_path)
    c = conn.cursor()
    c.execute("SELECT year, zipcode FROM Yelp_gentrification")
    rows = c.fetchall()
    print("traverse all records")
    for row in rows:
        year = row[0]
        zipcode = row[1]
        key = str(year) + "_" + str(zipcode)
        sub_category_distribution = dict()
        res_all = set(info[2] for info in merge_yelp_cache[key]['yelp_info'])
        for res in res_all:
            try:
                category_list = res_cache[res]['categories'].split(",")
            except (KeyError, AttributeError):
                # Restaurant missing from the cache, or it has no categories.
                continue
            for cat in category_list:  # don't shadow the cursor `c`
                lower_c = cat.lower().strip()
                if lower_c not in sub_category_distribution:
                    sub_category_distribution[lower_c] = 0
                sub_category_distribution[lower_c] += 1
        category_distribution_cache[key] = sub_category_distribution
        print(sub_category_distribution)
        sorted_category = sorted(sub_category_distribution.items(),
                                 key=operator.itemgetter(1), reverse=True)
        for i in range(math.ceil(len(sorted_category) * 0.1)):
            if sorted_category[i][0] not in category_dict_top20:
                category_dict_top20[sorted_category[i][0]] = 0
            category_dict_top20[sorted_category[i][0]] += 1
        print("finish one record!")
    cache_write('../category_top20.json', category_dict_top20)
    cache_write('../category_distribution.json', category_distribution_cache)
def add_top_category_feature_and_build_csv(limit=50):
    csv_dict = dict()
    category_distribution_cache = cache_load("../category_distribution.json")
    print("fetch data from sql")
    conn = sqlite3.connect(yelp_db_path)
    c = conn.cursor()
    c.execute("SELECT year_zip, gentri_status FROM Yelp_gentrification")
    rows = c.fetchall()
    sorted_category = sort(limit)
    # One CSV column per top category: the count of that category in each record.
    for pair in sorted_category[:limit]:
        cat = pair[0]  # don't reuse `c`, which is the cursor
        vals = []
        for row in rows:
            key = row[0]
            category_dict = category_distribution_cache[key]
            vals.append(category_dict.get(cat, 0))
        csv_dict[cat] = vals
    csv_dict['key'] = [row[0] for row in rows]
    csv_dict['gentri_status'] = [row[1] for row in rows]
    df = pd.DataFrame.from_dict(csv_dict)
    df.to_csv("../category.csv")
def select_all_zillow_records_by_zipcode(zipcode, fields=cols_opt, table_name="Zillow"):
    conn = create_connection(zillow_db_path)
    cur = conn.cursor()
    key = "select_all_zillow_records_by_zipcode_" + str(zipcode) + str(fields)
    sql_cache = cache_load(sql_cache_path)
    if key not in sql_cache:
        state = "SELECT Date, " + ", ".join(fields)
        state += " FROM " + table_name
        state += " WHERE RegionName = ? ORDER BY Date"
        cur.execute(state, (zipcode,))
        sql_cache[key] = cur.fetchall()
        with open(sql_cache_path, 'w') as fp:
            json.dump(sql_cache, fp)
    # Note: once rows round-trip through JSON they come back as lists, not tuples.
    return sql_cache[key]
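# `create_connection` (used here and in select_all_restaurants_by_zipcode) is
# not defined in this file. A minimal sketch, assuming it is the usual thin
# wrapper around sqlite3.connect:

def create_connection(db_path):
    # Assumed: open the SQLite database at db_path, or report the error.
    try:
        return sqlite3.connect(db_path)
    except sqlite3.Error as e:
        print(e)
        return None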
def get_useful_zillow_record():
    conn = sqlite3.connect(zillow_db_path)
    c = conn.cursor()
    # "Useful" rows are the ones with a labeled gentrification status.
    c.execute("SELECT * FROM Zillow WHERE gentrifying = 0 OR gentrifying = 1")
    rows = c.fetchall()
    cache = cache_load(baseline_cache_path)
    cache['useful_zillow'] = rows
    cache_write(baseline_cache_path, cache)
    conn.close()
    return rows
def merge_data_to_sql(merge_data_cache_path, yelp_db):
    merge_cache = cache_load(merge_data_cache_path)
    keys = []
    years = []
    zipcodes = []
    gentri_status = []
    rev_nums = []
    rev_ids_str = []
    rev_ids_all = []
    for key, val in merge_cache.items():
        if len(val['yelp_info']) == 0:
            continue
        keys.append(key)
        years.append(key.split("_")[0])
        zipcodes.append(key.split("_")[1])
        gentri_status.append(val['gentri_status'])
        rev_ids = [info[0] for info in val['yelp_info']]
        rev_ids_all.extend(rev_ids)
        rev_nums.append(len(rev_ids))
        rev_ids_str.append(",".join(rev_ids))
    all_rev_ids = dict()
    all_rev_ids['all_ids'] = rev_ids_all
    cache_write('./data/all_rev_ids.json', all_rev_ids)
    # create table
    conn = sqlite3.connect(yelp_db)
    c = conn.cursor()
    state = """CREATE TABLE IF NOT EXISTS Yelp_gentrification
               (year_zip CHAR(50) PRIMARY KEY NOT NULL,
                year INT,
                zipcode CHAR(10),
                gentri_status INT,
                rev_ids TEXT,
                rev_num INT)"""
    c.execute(state)
    print("Table created successfully")
    conn.commit()
    # insert data into table
    state = ("INSERT INTO Yelp_gentrification "
             "(year_zip, year, zipcode, gentri_status, rev_ids, rev_num) "
             "VALUES (?,?,?,?,?,?);")
    c.executemany(state, zip(keys, years, zipcodes, gentri_status, rev_ids_str, rev_nums))
    print("insert successfully")
    conn.commit()
    conn.close()
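# Usage sketch: once merge_data_to_sql has populated Yelp_gentrification, the
# per-(year, zipcode) review counts can be read back like this. The helper
# below is hypothetical, not part of the original repo; the column names match
# the CREATE TABLE above.

def count_reviews_by_year_zip(yelp_db):
    conn = sqlite3.connect(yelp_db)
    c = conn.cursor()
    c.execute("SELECT year_zip, rev_num FROM Yelp_gentrification ORDER BY rev_num DESC")
    rows = c.fetchall()
    conn.close()
    return rows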
def update_zillow():
    cache = cache_load(baseline_cache_path)
    ids = cache['ids']
    status = cache['status']
    conn = sqlite3.connect(zillow_db_path)
    c = conn.cursor()
    c.executemany('UPDATE Zillow SET gentrifying = ? WHERE id = ?', zip(status, ids))
    conn.commit()
    conn.close()
def select_all_restaurants_by_zipcode(zipcode):
    conn = create_connection(yelp_db_path)
    cur = conn.cursor()
    key = "select_all_restaurants_by_zipcode_" + str(zipcode)
    sql_cache = cache_load(sql_cache_path)
    if key not in sql_cache:
        cur.execute("SELECT res_id, review_year, review_month FROM Reviews WHERE zipcode = ?",
                    (zipcode,))
        sql_cache[key] = cur.fetchall()
        with open(sql_cache_path, 'w') as fp:
            json.dump(sql_cache, fp)
    return sql_cache[key]
def seperate_gentri_to_chunks(gentri_path, cache_path):
    # If the chunks were already built, just report how many there are.
    existing = cache_load(cache_path)
    if existing:
        return len(existing.keys())
    conn = sqlite3.connect(gentri_path)
    c = conn.cursor()
    c.execute("SELECT year, RegionName_zip, eligible_gentrification FROM gentrification")
    rows = c.fetchall()
    num = 0
    count = 0
    chunk = []
    # Flush every 100 rows into its own "chunk<num>" entry in the cache file.
    for row in rows:
        if count >= 100:
            cache = cache_load(cache_path)
            cache["chunk" + str(num)] = chunk
            cache_write(cache_path, cache)
            num += 1
            count = 0
            chunk = []
        chunk.append(row)
        count += 1
    if len(chunk) > 0:
        cache = cache_load(cache_path)
        cache["chunk" + str(num)] = chunk
        cache_write(cache_path, cache)
        num += 1
    conn.close()
    return num
def choose_zillow_metrics(bar=0.1):
    cache = cache_load(baseline_cache_path)
    if 'useful_zillow' in cache:
        rows = cache['useful_zillow']
    else:
        rows = get_useful_zillow_record()
    # Get the sorted Zillow columns, keeping only the actual metrics: the
    # first 5 columns and the very last one are not Zillow metrics. `rows`
    # is a list of tuples, each 80 elements long.
    headers_sorted = read_csv_header_list(zillow_path)
    remove_cols = ['RegionName', 'Date']
    cols_na_count = {}
    cols_index = {}
    index = 5
    rows_len = len(rows)
    for col in headers_sorted:
        if col in remove_cols:
            continue
        cols_na_count[col] = 0
        cols_index[col] = index
        index += 1
    # Drop any metric whose missing-value (zero) count reaches bar * rows_len.
    for row in rows:
        for col_name, index in cols_index.items():
            if col_name not in cols_na_count:
                continue
            if row[index] == 0:
                cols_na_count[col_name] += 1
                if cols_na_count[col_name] >= bar * rows_len:
                    cols_na_count.pop(col_name, None)
    res_dict = {}
    for col_name, val in cols_na_count.items():
        res_dict[col_name] = cols_index[col_name]
    return res_dict
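# `read_csv_header_list` is also defined elsewhere; a minimal sketch, assuming
# it returns the header row of the raw Zillow CSV in column order:

import csv

def read_csv_header_list(csv_path):
    # Assumed: read just the first (header) line of the CSV.
    with open(csv_path) as fp:
        return next(csv.reader(fp))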
def data_prep_before_insert(baseline_cache_path, bar=0.1):
    cache = cache_load(baseline_cache_path)
    if 'useful_zillow' in cache:
        rows = cache['useful_zillow']
    else:
        rows = get_useful_zillow_record()
    zillow_metrics = choose_zillow_metrics(bar)
    # Add the fixed-position bookkeeping columns alongside the chosen metrics.
    zillow_metrics['id'] = 0
    zillow_metrics['year'] = 2
    zillow_metrics['RegionName'] = 4
    zillow_metrics['gentrifying'] = 79
    vals_dict = {}
    for col in list(zillow_metrics.keys()):
        vals_dict[col] = []
    for row in rows:
        for col_name, index in zillow_metrics.items():
            vals_dict[col_name].append(row[index])
    print(vals_dict.keys())
    return vals_dict
def sort(limit=50):
    # Note: `limit` is unused here; callers such as
    # add_top_category_feature_and_build_csv slice the returned list themselves.
    category_top20 = cache_load('../category_top20.json')
    sorted_category = sorted(category_top20.items(),
                             key=operator.itemgetter(1), reverse=True)
    return sorted_category
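# A hypothetical end-to-end driver showing the intended call order of the
# pieces above. The cache paths and the chunk loop are assumptions for
# illustration, not values from the original repo.

if __name__ == "__main__":
    total = seperate_gentri_to_chunks(gentrification_db_path, "./data/gentri_chunks.json")
    for chunk_id in range(total):
        query_yelp_with_year_and_zip("./data/gentri_chunks.json", yelp_db_path,
                                     "./data/merge_cache.json", chunk_id, total)
    merge_data_to_sql("./data/merge_cache.json", yelp_db_path)
    category_feature_extract()
    add_top_category_feature_and_build_csv(limit=50)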