Ejemplo n.º 1
0
def query_yelp_with_year_and_zip(gentri_chunk_cache_path, yelp_db,
                                 res_cache_path, chunk_id, tol):
    """Collect Yelp review rows for one chunk of gentrification records.

    For every (year, zipcode, gentri_status) row in chunk ``chunk_id`` of
    the gentrification chunk cache, query the per-year yelp table and store
    the matching (review_id, user_id, res_id) rows in the result cache
    under the key "<year>_<zipcode>".

    Args:
        gentri_chunk_cache_path: JSON cache holding chunked gentrification
            rows under keys "chunk0", "chunk1", ...
        yelp_db: path to the sqlite database containing yelp_<year> tables.
        res_cache_path: JSON cache the results are appended to.
        chunk_id: index of the chunk to process.
        tol: total number of chunks (progress output only).
    """
    gentri_chunk_cache = cache_load(gentri_chunk_cache_path)
    if not gentri_chunk_cache:
        # Cache missing/empty: rebuild the chunks from the raw database.
        gentri_chunk_cache = seperate_gentri_to_chunks(
            gentrification_db_path, gentri_chunk_cache_path)

    gentri_rows = gentri_chunk_cache["chunk" + str(chunk_id)]

    conn = sqlite3.connect(yelp_db)
    try:
        c = conn.cursor()
        cache = cache_load(res_cache_path)

        for row in gentri_rows:
            year = row[0]
            zipcode = row[1]
            gentri_status = row[2]
            # The table name cannot be a bound parameter; `year` comes from
            # our own cache, not user input, so the concatenation is safe.
            state = "Select review_id, user_id, res_id from yelp_" + str(year)
            state += " where year = ? and zipcode = ?"
            c.execute(state, (year, zipcode))
            res = c.fetchall()
            key = str(year) + "_" + str(zipcode)
            print(key)
            cache[key] = {'gentri_status': gentri_status, 'yelp_info': res}

        cache_write(res_cache_path, cache)
    finally:
        conn.close()  # original leaked the connection
    print(str(chunk_id) + " of " + str(tol) + " finished!")
Ejemplo n.º 2
0
def category_feature_extract():
    """Build per-(year, zipcode) restaurant-category distributions.

    For every (year, zipcode) row in Yelp_gentrification, counts how often
    each category appears among that area's restaurants, then tallies each
    area's top ~10% categories into a global counter.

    Writes two JSON caches:
        ../category_top20.json        -- category -> number of areas where
                                         it ranked in the area's top 10%
        ../category_distribution.json -- "year_zipcode" -> {category: count}
    """
    merge_yelp_cache = cache_load(yelp_merge_gentr_cache)
    res_cache = cache_load(res_json_path)

    category_dict_top20 = dict()  # global top 20 distribution
    category_distribution_cache = dict()  # key->year_zipcode, value->dict, distribution

    print("fetch data from sql")
    conn = sqlite3.connect(yelp_db_path)
    c = conn.cursor()
    c.execute("SELECT year, zipcode FROM Yelp_gentrification")
    rows = c.fetchall()
    conn.close()  # original leaked the connection

    print("traverse all records")
    for year, zipcode in rows:
        key = str(year) + "_" + zipcode

        sub_category_distribution = dict()
        # Deduplicate restaurants for this area; info[2] is the res_id.
        res_all = set(info[2] for info in merge_yelp_cache[key]['yelp_info'])

        for res in res_all:
            try:
                category_list = res_cache[res]['categories'].split(",")
            except (KeyError, TypeError, AttributeError):
                # Restaurant missing from the cache or has no usable
                # category string -- skip it (original used a bare except).
                continue
            # NOTE: original reused `c` here, shadowing the sqlite cursor;
            # a distinct loop name avoids that hazard.
            for cat in category_list:
                lower_cat = cat.lower().strip()
                sub_category_distribution[lower_cat] = \
                    sub_category_distribution.get(lower_cat, 0) + 1

        category_distribution_cache[key] = sub_category_distribution
        print(sub_category_distribution)
        sorted_category = sorted(sub_category_distribution.items(),
                                 key=operator.itemgetter(1), reverse=True)

        # Tally this area's top 10% categories into the global counter.
        for i in range(math.ceil(len(sorted_category) * 0.1)):
            cat_name = sorted_category[i][0]
            category_dict_top20[cat_name] = \
                category_dict_top20.get(cat_name, 0) + 1

        print("finish one record!")

    cache_write('../category_top20.json', category_dict_top20)
    cache_write('../category_distribution.json', category_distribution_cache)
Ejemplo n.º 3
0
def add_top_category_feature_and_build_csv(limit=50):
    """Build ../category.csv with one column per top category.

    Each row of the CSV corresponds to a year_zip area; each of the top
    *limit* global categories becomes a column holding that area's count
    for the category (0 when absent).  'key' and 'gentri_status' columns
    carry the area id and label.

    Args:
        limit: number of top categories to keep as features.
    """
    csv_dict = dict()
    category_distribution_cache = cache_load("../category_distribution.json")

    print("fetch data from sql")
    conn = sqlite3.connect(yelp_db_path)
    cur = conn.cursor()
    cur.execute("SELECT year_zip, gentri_status FROM Yelp_gentrification")
    rows = cur.fetchall()
    conn.close()  # original leaked the connection

    # sort() returns the full descending list; slice replaces the original
    # manual i/break counter.  NOTE: original bound `c = pair[0]`, which
    # shadowed the sqlite cursor name.
    for category, _count in sort(limit)[:limit]:
        vals = []
        for row in rows:
            key = row[0]
            category_dict = category_distribution_cache[key]
            vals.append(category_dict.get(category, 0))
        csv_dict[category] = vals

    csv_dict['key'] = [row[0] for row in rows]
    csv_dict['gentri_status'] = [row[1] for row in rows]

    df = pd.DataFrame.from_dict(csv_dict)
    df.to_csv("../category.csv")
Ejemplo n.º 4
0
def select_all_zillow_records_by_zipcode(zipcode, fields = cols_opt, table_name = "Zillow"):
    """Return all rows for *zipcode* from the Zillow table, ordered by Date.

    Results are memoized in the JSON sql cache, keyed by zipcode + field
    list; the database is only touched on a cache miss.

    Args:
        zipcode: RegionName value to filter on.
        fields: column names to select in addition to Date.
        table_name: table to query (defaults to "Zillow").
    """
    key = "select_all_zillow_records_by_zipcode_" + str(zipcode) + str(fields)
    sql_cache = cache_load(sql_cache_path)

    if key not in sql_cache:
        # Only open the db on a cache miss (original connected eagerly and
        # leaked the connection).
        conn = create_connection(zillow_db_path)
        cur = conn.cursor()

        # Column/table names cannot be bound parameters; they come from
        # module-level configuration, not user input.  The join produces
        # the same " SELECT Date ,f1 ,f2 FROM ..." text as the original loop.
        state = (" SELECT Date ," + " ,".join(fields)
                 + " FROM " + table_name
                 + " WHERE RegionName = ? ORDER BY Date")

        cur.execute(state, (zipcode, ))
        sql_cache[key] = cur.fetchall()
        conn.close()

        # `with` closes fp itself; the original's trailing fp.close() was a no-op.
        with open(sql_cache_path, 'w') as fp:
            json.dump(sql_cache, fp)

    return sql_cache[key]
Ejemplo n.º 5
0
def get_useful_zillow_record():
    """Fetch every Zillow row whose gentrifying flag is 0 or 1, store the
    result under 'useful_zillow' in the baseline cache, and return it."""
    conn = sqlite3.connect(zillow_db_path)
    cursor = conn.cursor()
    cursor.execute("SELECT * FROM Zillow WHERE gentrifying = 0 or gentrifying = 1")
    records = cursor.fetchall()
    conn.close()

    baseline = cache_load(baseline_cache_path)
    baseline['useful_zillow'] = records
    cache_write(baseline_cache_path, baseline)
    return records
Ejemplo n.º 6
0
def merge_data_to_sql(merge_data_cache_path, yelp_db):
    """Persist the merged yelp/gentrification cache into sqlite.

    Creates (if needed) the Yelp_gentrification table in *yelp_db* and
    inserts one row per non-empty "year_zipcode" cache entry.  Also dumps
    the complete list of review ids to ./data/all_rev_ids.json.
    """
    merge_cache = cache_load(merge_data_cache_path)

    insert_rows = []   # tuples in INSERT column order
    rev_ids_all = []   # every review id across all areas

    for key, val in merge_cache.items():
        # Skip areas with no yelp reviews at all.
        if len(val['yelp_info']) == 0:
            continue
        parts = key.split("_")           # "year_zipcode"
        rev_ids = [info[0] for info in val['yelp_info']]
        rev_ids_all.extend(rev_ids)
        insert_rows.append((key, parts[0], parts[1], val['gentri_status'],
                            ",".join(rev_ids), len(rev_ids)))

    cache_write('./data/all_rev_ids.json', {'all_ids': rev_ids_all})

    #create table
    conn = sqlite3.connect(yelp_db)
    c = conn.cursor()
    state = "CREATE TABLE IF NOT EXISTS Yelp_gentrification\
        (year_zip  CHAR(50)  PRIMARY KEY     NOT NULL, \
        year INT, \
        zipcode     CHAR(10),\
        gentri_status INT, \
        rev_ids  TEXT, \
        rev_num INT)"

    c.execute(state)
    print("Table created successfully")
    conn.commit()

    #insert data into table
    state = "INSERT INTO Yelp_gentrification (year_zip, year, zipcode, gentri_status, rev_ids, rev_num) VALUES (?,?,?,?,?,?);"
    c.executemany(state, insert_rows)
    print("insert successully")
    conn.commit()

    conn.close()
Ejemplo n.º 7
0
def update_zillow():
    """Write the cached gentrifying labels back onto the Zillow table,
    matching each cached status to its row id."""
    cached = cache_load(baseline_cache_path)
    row_ids = cached['ids']
    labels = cached['status']

    conn = sqlite3.connect(zillow_db_path)
    cur = conn.cursor()
    cur.executemany('UPDATE Zillow SET  gentrifying = ? WHERE id=?',
                    zip(labels, row_ids))
    conn.commit()
    conn.close()
Ejemplo n.º 8
0
def select_all_restaurants_by_zipcode(zipcode):
    """Return (res_id, review_year, review_month) for every review in
    *zipcode*, memoized in the JSON sql cache.

    Args:
        zipcode: zipcode value to filter Reviews on.
    """
    key = "select_all_restaurants_by_zipcode_" + str(zipcode)
    sql_cache = cache_load(sql_cache_path)
    if key not in sql_cache:
        # Only open the db on a cache miss (original connected eagerly and
        # leaked the connection).
        conn = create_connection(yelp_db_path)
        cur = conn.cursor()
        cur.execute(" SELECT res_id, review_year, review_month FROM Reviews Where zipcode = ?", (zipcode, ))
        sql_cache[key] = cur.fetchall()
        conn.close()
        # `with` closes fp itself; the original's extra fp.close() was a no-op.
        with open(sql_cache_path, 'w') as fp:
            json.dump(sql_cache, fp)
    return sql_cache[key]
Ejemplo n.º 9
0
def seperate_gentri_to_chunks(gentri_path, cache_path):
    """Split the gentrification table into 100-row chunks in a JSON cache.

    Chunks are stored under keys "chunk0", "chunk1", ...  If the cache is
    already populated, the existing chunk count is returned and the
    database is not touched.

    Args:
        gentri_path: path to the sqlite db with the gentrification table.
        cache_path: JSON cache to write the chunks into.

    Returns:
        The number of chunks.
    """
    cache = cache_load(cache_path)
    if cache:
        return len(cache.keys())

    conn = sqlite3.connect(gentri_path)
    # NOTE: original reused `c` for both the cache and the cursor.
    cur = conn.cursor()
    cur.execute(
        "SELECT year, RegionName_zip, eligible_gentrification FROM gentrification"
    )
    rows = cur.fetchall()
    conn.close()  # original leaked the connection

    # Slice rows into fixed-size chunks and write the cache once at the
    # end; the original reloaded and rewrote the whole cache per chunk
    # (quadratic I/O) with the same final contents.
    num = 0
    if rows:
        for start in range(0, len(rows), 100):
            cache["chunk" + str(num)] = rows[start:start + 100]
            num += 1
        cache_write(cache_path, cache)

    return num
Ejemplo n.º 10
0
def choose_zillow_metrics(bar=0.1):
    """Select Zillow metric columns whose zero ("NA") rate stays below *bar*.

    Args:
        bar: maximum tolerated fraction of rows where a column is 0.

    Returns:
        Dict mapping surviving column name -> column index in the cached
        'useful_zillow' rows.
    """
    cache = cache_load(baseline_cache_path)
    if 'useful_zillow' in cache:
        rows = cache['useful_zillow']
    else:
        rows = get_useful_zillow_record()

    #get sorted cols in zillow with only zillow metrics
    headers_sorted = read_csv_header_list(zillow_path)
    remove_cols = {'RegionName', 'Date'}

    #first 5 is not zillow metrics, and the very last index, the structure is [()()], each tuple has 80 long
    cols_na_count = {}
    cols_index = {}
    index = 5
    for col in headers_sorted:
        if col in remove_cols:
            continue
        cols_na_count[col] = 0
        cols_index[col] = index
        index += 1

    # Hoisted loop invariant: the disqualification threshold.
    na_limit = bar * len(rows)
    for row in rows:
        for col_name, idx in cols_index.items():
            if col_name not in cols_na_count:
                continue  # column already disqualified
            if row[idx] == 0:
                cols_na_count[col_name] += 1
                if cols_na_count[col_name] >= na_limit:
                    # Too many zeros: drop the column from consideration.
                    cols_na_count.pop(col_name, None)

    # Survivors keep their original row index.
    return {col_name: cols_index[col_name] for col_name in cols_na_count}
Ejemplo n.º 11
0
def data_prep_before_insert(baseline_cache_path, bar=0.1):
    """Reshape the cached Zillow rows into a column-oriented dict.

    Maps each selected metric (plus the id/year/RegionName/gentrifying
    bookkeeping fields) to the list of its values across all rows.
    """
    cache = cache_load(baseline_cache_path)
    if 'useful_zillow' in cache:
        rows = cache['useful_zillow']
    else:
        rows = get_useful_zillow_record()

    zillow_metrics = choose_zillow_metrics(bar)
    # Fixed positions of the non-metric bookkeeping fields in each row.
    zillow_metrics['id'] = 0
    zillow_metrics['year'] = 2
    zillow_metrics['RegionName'] = 4
    zillow_metrics['gentrifying'] = 79
    #print(zillow_metrics.keys())

    vals_dict = {name: [] for name in zillow_metrics}

    for row in rows:
        for col_name, idx in zillow_metrics.items():
            vals_dict[col_name].append(row[idx])

    print(vals_dict.keys())
    return vals_dict
Ejemplo n.º 12
0
def sort(limit = 50):
    """Return all global category counts sorted descending by count.

    NOTE(review): *limit* is currently unused -- the full sorted list is
    returned and callers truncate it themselves.  (The name also shadows
    no builtin method here but is easily confused with list.sort.)
    """
    counts = cache_load('../category_top20.json')
    return sorted(counts.items(), key=lambda kv: kv[1], reverse=True)