Beispiel #1
0
def main():
    # get zip codes
    zip_codes = [row.zip_code for row in session.query(ZipCode).all()]

    # # add leading 0's to zip codes due to excel's stupidness
    # zip_codes_df['zip_code'] = zip_codes_df['zip_code'].astype(str)
    # zip_codes_df['zip_code'] = zip_codes_df['zip_code'].apply(lambda x: '0' * (5 - len(x)) + x)

    current_month = datetime.date.today().month
    current_rows = session.query(Indeed).filter(
        extract('month', Indeed.date_created) == current_month).all()
    current_rows = [row.as_dict() for row in current_rows]
    existing_zip_codes = [row['zip_code'] for row in current_rows]
    remaining_zip_codes = [
        zip_code for zip_code in zip_codes
        if zip_code not in existing_zip_codes
    ]

    LOGGER.info(
        'Found {} rows for current month: {}. Extracting {} remaining zip codes'
        .format(len(current_rows), current_month, len(remaining_zip_codes)))

    for i, zip_code in enumerate(remaining_zip_codes):
        job_count = get_num_job_postings(zip_code)
        row = Indeed(zip_code=zip_code,
                     job_count=job_count,
                     date_created=datetime.date.today())
        session.merge(row)
        session.commit()

        LOGGER.info("Extracting zip code {} ({} of {})".format(
            zip_code, i, len(remaining_zip_codes)))
    session.close()
def main():
    # get zip codes
    zip_codes = [row.zip_code for row in session.query(ZipCode).all()]

    # # add leading 0's to zip codes due to excel's stupidness
    # zip_codes_df['zip_code'] = zip_codes_df['zip_code'].astype(str)
    # zip_codes_df['zip_code'] = zip_codes_df['zip_code'].apply(lambda x: '0' * (5 - len(x)) + x)

    current_month = datetime.date.today().month
    current_rows = session.query(Indeed).filter(extract('month', Indeed.date_created) == current_month).all()
    current_rows = [row.as_dict() for row in current_rows]
    existing_zip_codes = [row['zip_code'] for row in current_rows]
    remaining_zip_codes = [zip_code for zip_code in zip_codes if zip_code not in existing_zip_codes]

    LOGGER.info('Found {} rows for current month: {}. Extracting {} remaining zip codes'.format(len(current_rows),
                                                                                                current_month,
                                                                                                len(
                                                                                                    remaining_zip_codes)))

    for i, zip_code in enumerate(remaining_zip_codes):
        job_count = get_num_job_postings(zip_code)
        row = Indeed(zip_code=zip_code, job_count=job_count, date_created=datetime.date.today())
        session.merge(row)
        session.commit()

        LOGGER.info("Extracting zip code {} ({} of {})".format(zip_code, i, len(remaining_zip_codes)))
    session.close()
Beispiel #3
0
def persist_zip_code_data(df):
    zip_code_labels_df = df[['zip_code', 'city', 'metro', 'state',
                             'county']].drop_duplicates()
    session.query(ZipCode).delete(
    )  # TODO: should append to existing data in case zillow changes something
    session.add_all(
        [ZipCode(**row) for row in zip_code_labels_df.to_dict('records')])
    session.commit()
def persist_zillow_metrics(df):
    metrics_df = df.drop(['city', 'metro', 'state', 'county'], axis=1)
    session.query(ZillowMetrics).delete()  # TODO: should append to existing data in case zillow changes something
    session.commit()
    insert_chunk = 100000
    index_start = 0
    while index_start < len(metrics_df):
        LOGGER.info('Persisting Zillow Metrics rows: {} of {}'.format(index_start + insert_chunk,
                                                                      len(metrics_df)))
        engine.execute(
            ZillowMetrics.__table__.insert(metrics_df[index_start:index_start + insert_chunk].to_dict('records')))
        index_start += insert_chunk
Beispiel #5
0
def persist_zillow_metrics(df):
    metrics_df = df.drop(['city', 'metro', 'state', 'county'], axis=1)
    session.query(ZillowMetrics).delete(
    )  # TODO: should append to existing data in case zillow changes something
    session.commit()
    insert_chunk = 100000
    index_start = 0
    while index_start < len(metrics_df):
        LOGGER.info('Persisting Zillow Metrics rows: {} of {}'.format(
            index_start + insert_chunk, len(metrics_df)))
        engine.execute(
            ZillowMetrics.__table__.insert(
                metrics_df[index_start:index_start +
                           insert_chunk].to_dict('records')))
        index_start += insert_chunk
Beispiel #6
0
def main():
    yelp_api = YelpAPI(login_data['yelp_consumer_key'],
                       login_data['yelp_consumer_secret'],
                       login_data['yelp_token'],
                       login_data['yelp_token_secret'])

    zip_codes = [row.zip_code for row in session.query(ZipCode).all()]

    current_month = datetime.date.today().month
    current_rows = session.query(YelpAPIDb).filter(
        extract('month', YelpAPIDb.date_created) == current_month).all()
    current_rows = [row.as_dict() for row in current_rows]
    existing_zip_codes = [row['zip_code'] for row in current_rows]
    remaining_zip_codes = [
        zip_code for zip_code in zip_codes
        if zip_code not in existing_zip_codes
    ]

    category_list = [
        "cafes", "newamerican", "indpak", "italian", "japanese", "thai"
    ]

    for i, zip_code in enumerate(remaining_zip_codes):
        zip_code_results = []
        for category in category_list:
            offset = 0
            total_count = 21
            results_per_query_limit = 20
            business_counter = 1
            remaining_count = 1

            LOGGER.info(
                "Extracting {} restaurants from zip code {} ({} out of {})".
                format(category, zip_code, i, len(remaining_zip_codes)))
            while remaining_count > 0:
                try:

                    search_results = yelp_api.search_query(
                        location=zip_code,
                        category_filter=category,
                        sort=0,
                        limit=20,
                        offset=offset)
                    total_count = search_results['total']
                except YelpAPI.YelpAPIError as e:
                    print e
                    break
                if search_results['total'] == 0:
                    session.merge(
                        YelpAPIDb(zip_code=zip_code,
                                  date_created=datetime.date.today(),
                                  avg_rating=None,
                                  business_count=0))
                    session.commit()
                    break
                for business in search_results['businesses']:
                    if is_business_valid(business, zip_code):
                        print "{} out of {} businesses".format(
                            business_counter, total_count)
                        zip_code_results.append({
                            "zip_code":
                            zip_code,
                            "rating":
                            business['rating'],
                            "review_count":
                            business["review_count"]
                        })
                    business_counter += 1

                remaining_count = total_count - business_counter
                offset += results_per_query_limit

        if zip_code_results:
            total_review_count = sum(
                [business['review_count'] for business in zip_code_results])
            zip_code_avg_rating = sum([
                business['rating'] * business['review_count']
                for business in zip_code_results
            ]) / total_review_count
            row = YelpAPIDb(zip_code=zip_code,
                            date_created=datetime.date.today(),
                            avg_rating=zip_code_avg_rating,
                            business_count=len(zip_code_results))
            session.merge(row)
            session.commit()
        else:
            session.merge(
                YelpAPIDb(zip_code=zip_code,
                          date_created=datetime.date.today(),
                          avg_rating=None,
                          business_count=0))
            session.commit()
    session.close()
Beispiel #7
0
        model_df = df.copy()
        for column in column_shift:
            model_df[column[1]] = model_df[column[1]].shift(column[0])
        model.add_data(model_df, y_col=[y_col])
        model.model_data()
        results.append({'shift': column_shift,
                        'RMSE': model.RMSE,
                        'Score': model.score,
                        'Coefs': model.coefficients})

    return max(results, key=lambda x: x['Score'])


# yelp_df = pd.DataFrame([row.as_dict() for row in session.query(Yelp).all()])
# yelp_df = yelp_df.rename(columns={'date_published': 'month'})
zillow_results = session.query(ZillowMetrics).filter(ZillowMetrics.ZRI != None, ZillowMetrics.ZHVI != None).all()

zillow_metrics_df = pd.DataFrame([row.as_dict() for row in zillow_results])
# merged_df = yelp_df.merge(zillow_metrics_df, on=['month', 'zip_code'])

lin_model = Model(linear_model.LinearRegression())

min_periods = 8
lag_columns = ['ZRI']

zip_code_sample = zillow_metrics_df.zip_code.unique()
random.shuffle(zip_code_sample)
zip_code_sample = zip_code_sample[:10]

best_models = []
for zip_code in zip_code_sample:
def main():
    yelp_api = YelpAPI(login_data['yelp_consumer_key'],
                       login_data['yelp_consumer_secret'],
                       login_data['yelp_token'],
                       login_data['yelp_token_secret'])

    zip_codes = [row.zip_code for row in session.query(ZipCode).all()]

    current_month = datetime.date.today().month
    current_rows = session.query(YelpAPIDb).filter(extract('month', YelpAPIDb.date_created) == current_month).all()
    current_rows = [row.as_dict() for row in current_rows]
    existing_zip_codes = [row['zip_code'] for row in current_rows]
    remaining_zip_codes = [zip_code for zip_code in zip_codes if zip_code not in existing_zip_codes]

    category_list = ["cafes",
                     "newamerican",
                     "indpak",
                     "italian",
                     "japanese",
                     "thai"]

    for i, zip_code in enumerate(remaining_zip_codes):
        zip_code_results = []
        for category in category_list:
            offset = 0
            total_count = 21
            results_per_query_limit = 20
            business_counter = 1
            remaining_count = 1

            LOGGER.info("Extracting {} restaurants from zip code {} ({} out of {})".format(category, zip_code, i,
                                                                                           len(remaining_zip_codes)))
            while remaining_count > 0:
                try:

                    search_results = yelp_api.search_query(location=zip_code,
                                                           category_filter=category, sort=0, limit=20,
                                                           offset=offset)
                    total_count = search_results['total']
                except YelpAPI.YelpAPIError as e:
                    print e
                    break
                if search_results['total'] == 0:
                    session.merge(YelpAPIDb(zip_code=zip_code, date_created=datetime.date.today(), avg_rating=None,
                                            business_count=0))
                    session.commit()
                    break
                for business in search_results['businesses']:
                    if is_business_valid(business, zip_code):
                        print "{} out of {} businesses".format(business_counter, total_count)
                        zip_code_results.append({"zip_code": zip_code,
                                                 "rating": business['rating'],
                                                 "review_count": business["review_count"]})
                    business_counter += 1

                remaining_count = total_count - business_counter
                offset += results_per_query_limit

        if zip_code_results:
            total_review_count = sum([business['review_count'] for business in zip_code_results])
            zip_code_avg_rating = sum(
                [business['rating'] * business['review_count'] for business in zip_code_results]) / total_review_count
            row = YelpAPIDb(zip_code=zip_code, date_created=datetime.date.today(), avg_rating=zip_code_avg_rating,
                            business_count=len(zip_code_results))
            session.merge(row)
            session.commit()
        else:
            session.merge(
                YelpAPIDb(zip_code=zip_code, date_created=datetime.date.today(), avg_rating=None, business_count=0))
            session.commit()
    session.close()
def persist_zip_code_data(df):
    zip_code_labels_df = df[['zip_code', 'city', 'metro', 'state', 'county']].drop_duplicates()
    session.query(ZipCode).delete()  # TODO: should append to existing data in case zillow changes something
    session.add_all([ZipCode(**row) for row in zip_code_labels_df.to_dict('records')])
    session.commit()