Exemple #1
0
def query_zillow(zillow_db_path, gentrification_db_path):
    """Match gentrification rows to Zillow record ids and cache the result.

    Each row returned by ``query_gentrification`` is read positionally as
    ``(year, zip code, eligibility)``. For every row, the first ``Zillow``
    record whose ``year`` and ``RegionName`` match is looked up; rows without
    a match are skipped. The matched ids and the eligibility value of each
    matched row are written to ``baseline_cache_path`` as
    ``{'ids': [...], 'status': [...]}`` (two parallel lists).

    Args:
        zillow_db_path: Path of the SQLite database holding the ``Zillow``
            table.
        gentrification_db_path: Forwarded to ``query_gentrification``.
    """
    rows = query_gentrification(gentrification_db_path)

    ids = []
    status = []

    conn = sqlite3.connect(zillow_db_path)
    try:
        c = conn.cursor()
        for row in rows:
            # ``zipcode`` rather than ``zip`` so the builtin is not shadowed.
            year, zipcode, eligi = row[0], row[1], row[2]
            c.execute(" SELECT id FROM Zillow Where year = ? and RegionName = ?",
                      (year, zipcode))
            res = c.fetchone()
            if res is not None:
                print(res[0])
                ids.append(res[0])
                status.append(eligi)
    finally:
        # Release the database handle even when a query fails.
        conn.close()

    res_dict = {'ids': ids, 'status': status}
    cache_write(baseline_cache_path, res_dict)
Exemple #2
0
def query_yelp_with_year_and_zip(gentri_chunk_cache_path, yelp_db,
                                 res_cache_path, chunk_id, tol):
    """Collect Yelp review rows for one chunk of gentrification records.

    Loads the chunked gentrification cache (building it first when it is
    empty), then for every ``(year, zipcode, gentri_status)`` row of chunk
    ``chunk_id`` selects ``review_id, user_id, res_id`` from the per-year
    table ``yelp_<year>``. Results are stored in the cache at
    ``res_cache_path`` under the key ``"<year>_<zipcode>"`` as
    ``{'gentri_status': ..., 'yelp_info': [...]}``.

    Args:
        gentri_chunk_cache_path: Path of the chunked gentrification cache.
        yelp_db: Path of the SQLite database holding the ``yelp_<year>``
            tables.
        res_cache_path: Path of the cache that receives the results.
        chunk_id: Index of the chunk to process (key ``"chunk<chunk_id>"``).
        tol: Total number of chunks; only used in the progress message.
    """
    gentri_chunk_cache = cache_load(gentri_chunk_cache_path)
    if not gentri_chunk_cache:
        # seperate_gentri_to_chunks returns the number of chunks, not the
        # chunk mapping, so the freshly written cache has to be reloaded.
        seperate_gentri_to_chunks(gentrification_db_path,
                                  gentri_chunk_cache_path)
        gentri_chunk_cache = cache_load(gentri_chunk_cache_path)

    gentri_rows = gentri_chunk_cache["chunk" + str(chunk_id)]

    cache = cache_load(res_cache_path)

    conn = sqlite3.connect(yelp_db)
    try:
        c = conn.cursor()
        for row in gentri_rows:
            year = row[0]
            zipcode = row[1]
            gentri_status = row[2]
            # Table names cannot be bound as SQL parameters, so the per-year
            # table name is concatenated; the filter values are still bound.
            state = "Select review_id, user_id, res_id from yelp_" + str(year)
            state += " where year = ? and zipcode = ?"
            c.execute(state, (year, zipcode))
            res = c.fetchall()
            res_key = str(year) + "_" + str(zipcode)
            print(res_key)
            cache[res_key] = {'gentri_status': gentri_status, 'yelp_info': res}
    finally:
        # The original never closed this connection.
        conn.close()

    cache_write(res_cache_path, cache)
    print(str(chunk_id) + " of " + str(tol) + " finished!")
Exemple #3
0
def get_useful_zillow_record():
    """Fetch the Zillow rows whose ``gentrifying`` flag is 0 or 1 and cache them.

    Reads from the database at ``zillow_db_path`` and stores the rows under
    the ``'useful_zillow'`` key of the cache at ``baseline_cache_path``.

    Returns:
        The list of full ``Zillow`` rows that were selected.
    """
    conn = sqlite3.connect(zillow_db_path)
    try:
        c = conn.cursor()
        c.execute("SELECT * FROM Zillow WHERE gentrifying = 0 or gentrifying = 1")
        rows = c.fetchall()
    finally:
        # Close the handle even if the query raises.
        conn.close()

    cache = cache_load(baseline_cache_path)
    cache['useful_zillow'] = rows
    cache_write(baseline_cache_path, cache)
    return rows
Exemple #4
0
def merge_data_to_sql(merge_data_cache_path, yelp_db,
                      all_rev_ids_path='./data/all_rev_ids.json'):
    """Load the merged Yelp/gentrification cache into ``Yelp_gentrification``.

    Every cache entry is keyed ``"<year>_<zipcode>"`` and holds
    ``'gentri_status'`` plus ``'yelp_info'`` (a list of records whose first
    element is the review id). Entries with no Yelp records are skipped. For
    the rest, one table row is inserted with the review ids joined by commas
    and their count. The flat list of all review ids is additionally written
    to ``all_rev_ids_path`` under the key ``'all_ids'``.

    Args:
        merge_data_cache_path: Path of the merged cache to read.
        yelp_db: Path of the SQLite database that receives the table.
        all_rev_ids_path: Where the flat review-id list is written. Defaults
            to the previously hard-coded location.
    """
    merge_cache = cache_load(merge_data_cache_path)

    table_rows = []
    rev_ids_all = []

    for key, val in merge_cache.items():
        yelp_info = val['yelp_info']
        if not yelp_info:
            continue
        # Split the composite key once instead of once per column.
        key_parts = key.split("_")
        year, zipcode = key_parts[0], key_parts[1]
        rev_ids = [info[0] for info in yelp_info]
        rev_ids_all.extend(rev_ids)
        table_rows.append((key, year, zipcode, val['gentri_status'],
                           ",".join(rev_ids), len(rev_ids)))

    cache_write(all_rev_ids_path, {'all_ids': rev_ids_all})

    conn = sqlite3.connect(yelp_db)
    try:
        c = conn.cursor()

        # create table
        state = "CREATE TABLE IF NOT EXISTS Yelp_gentrification\
        (year_zip  CHAR(50)  PRIMARY KEY     NOT NULL, \
        year INT, \
        zipcode     CHAR(10),\
        gentri_status INT, \
        rev_ids  TEXT, \
        rev_num INT)"
        c.execute(state)
        print("Table created successfully")
        conn.commit()

        # insert data into table
        # NOTE(review): year_zip is the primary key, so running this again
        # against an already populated table raises sqlite3.IntegrityError.
        state = "INSERT INTO Yelp_gentrification (year_zip, year, zipcode, gentri_status, rev_ids, rev_num) VALUES (?,?,?,?,?,?);"
        c.executemany(state, table_rows)
        print("insert successully")
        conn.commit()
    finally:
        # Release the database handle even when an insert fails.
        conn.close()
Exemple #5
0
def category_feature_extract():
    """Build per-record and global restaurant-category statistics.

    For every ``(year, zipcode)`` row of ``Yelp_gentrification`` the distinct
    restaurant ids found in the merged Yelp cache are looked up in the
    restaurant cache, and their comma-separated ``'categories'`` strings are
    counted (lower-cased and stripped). Two files are written:

    * ``../category_distribution.json`` maps ``"<year>_<zipcode>"`` to that
      record's category counts.
    * ``../category_top20.json`` maps a category to the number of records in
      which it ranked among that record's most frequent categories.
    """
    merge_yelp_cache = cache_load(yelp_merge_gentr_cache)
    res_cache = cache_load(res_json_path)

    category_dict_top20 = dict()  # global distribution of top categories
    category_distribution_cache = dict()  # key -> year_zipcode, value -> counts

    print("fetch data from sql")
    conn = sqlite3.connect(yelp_db_path)
    try:
        cursor = conn.cursor()
        cursor.execute("SELECT year, zipcode FROM Yelp_gentrification")
        rows = cursor.fetchall()
    finally:
        # The original never closed this connection.
        conn.close()

    print("traverse all records")
    for row in rows:
        year = row[0]
        zipcode = row[1]
        key = str(year) + "_" + zipcode

        sub_category_distribution = dict()
        res_all = set(info[2] for info in merge_yelp_cache[key]['yelp_info'])

        for res in res_all:
            try:
                category_list = res_cache[res]['categories'].split(",")
            except (KeyError, TypeError, AttributeError):
                # Restaurant missing from the cache, or it has no usable
                # categories string: skip it, as the original best-effort
                # bare ``except`` did, without hiding unrelated errors.
                continue
            for category in category_list:
                lower_c = category.lower().strip()
                sub_category_distribution[lower_c] = (
                    sub_category_distribution.get(lower_c, 0) + 1)

        category_distribution_cache[key] = sub_category_distribution
        print(sub_category_distribution)
        sorted_category = sorted(sub_category_distribution.items(),
                                 key=operator.itemgetter(1), reverse=True)

        # NOTE(review): the names say "top20" but the cut-off below keeps the
        # top 10% of categories (rounded up) - confirm which is intended.
        top_count = math.ceil(len(sorted_category) * 0.1)
        for name, _ in sorted_category[:top_count]:
            category_dict_top20[name] = category_dict_top20.get(name, 0) + 1

        print("finish one record!")

    cache_write('../category_top20.json', category_dict_top20)
    cache_write('../category_distribution.json', category_distribution_cache)
Exemple #6
0
def seperate_gentri_to_chunks(gentri_path, cache_path, chunk_size=100):
    """Split the gentrification table into fixed-size chunks stored in a cache.

    If the cache at ``cache_path`` already has content, nothing is recomputed
    and the number of keys it holds is returned. Otherwise every
    ``(year, RegionName_zip, eligible_gentrification)`` row is read from the
    ``gentrification`` table and stored in groups of ``chunk_size`` rows
    under the keys ``"chunk0"``, ``"chunk1"``, ... (the last chunk may be
    shorter).

    Args:
        gentri_path: Path of the SQLite database holding ``gentrification``.
        cache_path: Path of the chunk cache to read and write.
        chunk_size: Maximum number of rows per chunk. Defaults to 100, the
            previously hard-coded size.

    Returns:
        The number of chunks in the cache.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")

    existing = cache_load(cache_path)
    if existing:
        return len(existing.keys())

    conn = sqlite3.connect(gentri_path)
    try:
        cursor = conn.cursor()
        cursor.execute(
            "SELECT year, RegionName_zip, eligible_gentrification FROM gentrification"
        )
        rows = cursor.fetchall()
    finally:
        # Release the database handle even when the query fails.
        conn.close()

    if not rows:
        # Nothing to store; leave the cache file untouched.
        return 0

    cache = existing if existing is not None else {}
    num = 0
    for start in range(0, len(rows), chunk_size):
        cache["chunk" + str(num)] = rows[start:start + chunk_size]
        num += 1

    # Write once, instead of reloading and rewriting the file per chunk.
    cache_write(cache_path, cache)

    return num