def query_zillow(zillow_db_path, gentrification_db_path):
    """Pair gentrification rows with their Zillow record ids and cache the result.

    For each (year, zipcode, eligibility) row pulled from the gentrification
    database, look up the matching Zillow row id (by year + RegionName) and
    collect the id together with its eligibility status.  The pairing is
    written to ``baseline_cache_path`` as ``{'ids': [...], 'status': [...]}``.

    Args:
        zillow_db_path: path to the SQLite database containing the Zillow table.
        gentrification_db_path: path to the gentrification SQLite database.
    """
    rows = query_gentrification(gentrification_db_path)
    conn = sqlite3.connect(zillow_db_path)
    try:
        c = conn.cursor()
        ids = []
        status = []
        # renamed from `zip` — the original shadowed the builtin
        for year, zipcode, eligible in rows:
            # Parameterized query: values are bound, never string-formatted.
            c.execute(" SELECT id FROM Zillow Where year = ? and RegionName = ?",
                      (year, zipcode))
            res = c.fetchone()
            if res is not None:
                print(res[0])
                ids.append(res[0])
                status.append(eligible)
        cache_write(baseline_cache_path, {'ids': ids, 'status': status})
    finally:
        # Release the connection even if a query raises (the original leaked it).
        conn.close()
def query_yelp_with_year_and_zip(gentri_chunk_cache_path, yelp_db, res_cache_path, chunk_id, tol):
    """Fetch yelp reviews for one chunk of gentrification rows and cache them.

    For every (year, zipcode, gentri_status) row in chunk ``chunk_id``, query
    the per-year yelp table and store the review tuples in ``res_cache_path``
    keyed by "<year>_<zipcode>".

    Args:
        gentri_chunk_cache_path: JSON cache of gentrification rows split into chunks.
        yelp_db: path to the yelp SQLite database.
        res_cache_path: cache file that accumulates the per-key results.
        chunk_id: which chunk to process.
        tol: total number of chunks (used only for the progress message).
    """
    gentri_chunk_cache = cache_load(gentri_chunk_cache_path)
    if not gentri_chunk_cache:
        # BUG FIX: seperate_gentri_to_chunks() returns the chunk COUNT, not
        # the chunk dict, so the original crashed when indexing the return
        # value.  Build the cache file, then re-load it.
        seperate_gentri_to_chunks(gentrification_db_path, gentri_chunk_cache_path)
        gentri_chunk_cache = cache_load(gentri_chunk_cache_path)
    key = "chunk" + str(chunk_id)
    gentri_rows = gentri_chunk_cache[key]
    conn = sqlite3.connect(yelp_db)
    try:
        c = conn.cursor()
        cache = cache_load(res_cache_path)
        for year, zipcode, gentri_status in gentri_rows:
            # Table name is derived from the (trusted, DB-sourced) year; the
            # filter values themselves are bound as parameters.
            state = "Select review_id, user_id, res_id from yelp_" + str(year)
            state += " where year = ? and zipcode = ?"
            c.execute(state, (year, zipcode))
            res = c.fetchall()
            key = str(year) + "_" + str(zipcode)
            print(key)
            cache[key] = {'gentri_status': gentri_status, 'yelp_info': res}
        cache_write(res_cache_path, cache)
        print(str(chunk_id) + " of " + str(tol) + " finished!")
    finally:
        # The original never closed the connection.
        conn.close()
def get_useful_zillow_record():
    """Return Zillow rows whose gentrifying flag is 0 or 1, caching them too.

    The fetched rows are stored under the 'useful_zillow' key of the baseline
    cache (``baseline_cache_path``) before being returned.
    """
    connection = sqlite3.connect(zillow_db_path)
    cursor = connection.cursor()
    cursor.execute("SELECT * FROM Zillow WHERE gentrifying = 0 or gentrifying = 1")
    records = cursor.fetchall()

    # Persist the useful rows alongside whatever the baseline cache already holds.
    baseline = cache_load(baseline_cache_path)
    baseline['useful_zillow'] = records
    cache_write(baseline_cache_path, baseline)

    connection.close()
    return records
def merge_data_to_sql(merge_data_cache_path, yelp_db):
    """Flatten the merged yelp/gentrification cache into a SQL table.

    Reads the merge cache (keys of the form "<year>_<zipcode>"), skips entries
    with no yelp reviews, dumps every review id to ./data/all_rev_ids.json,
    and inserts one row per key into the Yelp_gentrification table of
    ``yelp_db`` (created if absent).
    """
    merge_cache = cache_load(merge_data_cache_path)
    keys, years, zipcodes = [], [], []
    gentri_status, rev_nums, rev_ids_str = [], [], []
    rev_ids_all = []
    for key, val in merge_cache.items():
        yelp_info = val['yelp_info']
        if not yelp_info:
            continue  # nothing to insert for this year/zip pair
        parts = key.split("_")
        keys.append(key)
        years.append(parts[0])
        zipcodes.append(parts[1])
        gentri_status.append(val['gentri_status'])
        # First column of each yelp_info tuple is the review id.
        rev_ids = [info[0] for info in yelp_info]
        rev_ids_all.extend(rev_ids)
        rev_nums.append(len(rev_ids))
        rev_ids_str.append(",".join(rev_ids))

    cache_write('./data/all_rev_ids.json', {'all_ids': rev_ids_all})

    #create table
    conn = sqlite3.connect(yelp_db)
    c = conn.cursor()
    state = "CREATE TABLE IF NOT EXISTS Yelp_gentrification\
 (year_zip CHAR(50) PRIMARY KEY NOT NULL, \
 year INT, \
 zipcode CHAR(10),\
 gentri_status INT, \
 rev_ids TEXT, \
 rev_num INT)"
    c.execute(state)
    print("Table created successfully")
    conn.commit()

    #insert data into table
    state = "INSERT INTO Yelp_gentrification (year_zip, year, zipcode, gentri_status, rev_ids, rev_num) VALUES (?,?,?,?,?,?);"
    c.executemany(
        state,
        zip(keys, years, zipcodes, gentri_status, rev_ids_str, rev_nums))
    print("insert successully")
    conn.commit()
    conn.close()
def category_feature_extract():
    """Build per-(year, zipcode) restaurant category distributions.

    For every row of Yelp_gentrification, count how often each (lowercased,
    stripped) category appears among that area's restaurants, then tally which
    categories land in each area's top slice.  Results are written to
    ../category_top20.json and ../category_distribution.json.
    """
    merge_yelp_cache = cache_load(yelp_merge_gentr_cache)
    res_cache = cache_load(res_json_path)
    category_dict_top20 = dict()  # global tally: how often a category makes a local top slice
    category_distribution_cache = dict()  # key -> "year_zipcode", value -> {category: count}
    print("fetch data from sql")
    conn = sqlite3.connect(yelp_db_path)
    cursor = conn.cursor()
    cursor.execute("SELECT year, zipcode FROM Yelp_gentrification")
    rows = cursor.fetchall()
    conn.close()  # the original leaked this connection
    print("traverse all records")
    for year, zipcode in rows:
        key = str(year) + "_" + zipcode
        sub_category_distribution = dict()
        # De-duplicate restaurant ids (third column of each yelp_info tuple).
        res_all = set(info[2] for info in merge_yelp_cache[key]['yelp_info'])
        for res in res_all:
            try:
                category_list = res_cache[res]['categories'].split(",")
            except (KeyError, AttributeError, TypeError):
                # Restaurant missing from the cache or 'categories' is not a
                # parsable string — skip it.  (Was a bare `except`, which
                # silently hid every kind of error.)
                continue
            # Renamed from `c`, which shadowed the sqlite cursor above.
            for category in category_list:
                lower_c = category.lower().strip()
                sub_category_distribution[lower_c] = \
                    sub_category_distribution.get(lower_c, 0) + 1
        category_distribution_cache[key] = sub_category_distribution
        print(sub_category_distribution)
        sorted_category = sorted(sub_category_distribution.items(),
                                 key=operator.itemgetter(1), reverse=True)
        # NOTE: despite the "top20" name, this takes the top 10% of each
        # area's categories (ceil(len * 0.1)), not a fixed top 20.
        for i in range(math.ceil(len(sorted_category) * 0.1)):
            name = sorted_category[i][0]
            category_dict_top20[name] = category_dict_top20.get(name, 0) + 1
        print("finish one record!")
    cache_write('../category_top20.json', category_dict_top20)
    cache_write('../category_distribution.json', category_distribution_cache)
def seperate_gentri_to_chunks(gentri_path, cache_path):
    """Split all gentrification rows into 100-row chunks stored in a cache file.

    Chunks are saved under keys "chunk0", "chunk1", ... in ``cache_path``.
    If the cache already has content, nothing is rebuilt.

    Returns:
        The number of chunks (existing or newly written).
    """
    existing = cache_load(cache_path)  # renamed from `c`, which was later reused for the cursor
    if existing:
        return len(existing.keys())
    conn = sqlite3.connect(gentri_path)
    try:
        cursor = conn.cursor()
        cursor.execute(
            "SELECT year, RegionName_zip, eligible_gentrification FROM gentrification"
        )
        rows = cursor.fetchall()
    finally:
        # Close even on error (the original skipped close if a query raised).
        conn.close()
    # Build the full chunk dict in memory and write once: the original
    # re-loaded and re-wrote the whole cache file for every 100-row chunk,
    # i.e. O(n^2) file I/O, for the same final file contents.
    cache = cache_load(cache_path)
    num = 0
    for start in range(0, len(rows), 100):
        cache["chunk" + str(num)] = rows[start:start + 100]
        num += 1
    if num > 0:
        cache_write(cache_path, cache)
    return num