def run_sentiment_analysis(with_graphs=False):
    conn = db_conn(db_config())
    sql = 'SELECT * FROM reviews'
    df = db_table(conn, sql)
    # Detect the language of each review comment
    df['lang'] = df['comments'].apply(detect_language)
    df_english = df.loc[df['lang'] == 'english']
    df_scores = get_sentiment_scores(df_english)

    if with_graphs:
        plot_score_histograms(df_scores['positive'],
                              score_type='Positive',
                              filename='pos_sentiment.png')
        plot_score_histograms(df_scores['neutral'],
                              score_type='Neutral',
                              filename='neu_sentiment.png')
        plot_score_histograms(df_scores['negative'],
                              score_type='Negative',
                              filename='neg_sentiment.png')
        plot_score_histograms(df_scores['compound'],
                              score_type='Compound',
                              filename='compound_sentiment.png')

    df_avg = sentiment_by_listing(df_scores)
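    # INTEGER and FLOAT are presumably imported from sqlalchemy.types so that
    # write_to_db can map each DataFrame column onto a SQL column type.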
    dtypes = {'listing_id': INTEGER,
              'compound': FLOAT,
              'positive': FLOAT,
              'neutral': FLOAT,
              'negative': FLOAT}

    write_to_db(conn, df_avg, name='listings_sentiment', dtypes=dtypes)
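
# A minimal usage sketch (assumed entry point, not part of the original
# example):
if __name__ == '__main__':
    run_sentiment_analysis(with_graphs=True)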
Example #2
def main():
    config = db_config()
    conn = db_conn(config)

    # Create scorer to train models using RMSE
    scorer = make_scorer(rmse, greater_is_better=False)
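    # With greater_is_better=False, scikit-learn negates the RMSE internally,
    # so grid search can still maximize the score.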

    # Load features
    features_a = preprocess_features(conn)
    features_b = preprocess_features(conn, with_sa=False)

    # Get train and test sets for both feature sets
    X_train_a, X_test_a, y_train_a, y_test_a = split_data(features_a)
    X_train_b, X_test_b, y_train_b, y_test_b = split_data(features_b)

    # Train regression models with sentiment score features
    modelpath = 'models'
    linear_models = {
        'linear': LinearRegression(),
        'ridge': Ridge(),
        'lasso': Lasso()
    }

    for name, model in linear_models.items():
        grid = train_model(model_object=model,
                           model_type=name,
                           X_train=X_train_a,
                           y_train=y_train_a,
                           score_function=scorer,
                           cv=3)
        filename = get_abspath('{}.model'.format(name), modelpath)
        ensure_dir_exists(filename)
        save_pickled_model(grid, filename)

    # Train XGBoost model with sentiment score features
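    # Note: 'reg:linear' is the squared-error objective in older XGBoost
    # releases; newer versions rename it 'reg:squarederror'.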
    xgb = XGBRegressor(objective='reg:linear')
    grid = train_model(model_object=xgb,
                       model_type='xgb',
                       X_train=X_train_a,
                       y_train=y_train_a,
                       score_function=scorer,
                       cv=3)
    filename = get_abspath('xgb_a.model', modelpath)
    save_pickled_model(grid, filename)

    # Train XGBoost model without sentiment score features
    grid = train_model(model_object=xgb,
                       model_type='xgb',
                       X_train=X_train_b,
                       y_train=y_train_b,
                       score_function=scorer,
                       cv=3)
    filename = get_abspath('xgb_b.model', modelpath)
    save_pickled_model(grid, filename)
Example #3
def main():
    print(REPORT_NUMS)
    conn, cursor, sql = db_config()
    save_in_db(conn=conn, cursor=cursor, sql=sql)
Example #4
from bs4 import BeautifulSoup
from utils import insert_in_db, db_config, apply_multiprocessing
import requests
import pickle
import pymysql

MATERIALS_URL = "http://www.foodsafetykorea.go.kr/portal/healthyfoodlife/searchHfPrdlstRawmtrl.do"
DETAIL_URL = "http://www.foodsafetykorea.go.kr/portal/healthyfoodlife/searchHomeHFDetail.do"

CONN, CURSOR, SQL = db_config()


def get_materials_data(product_report_no: int):
    materials_param = {"prdlst_report_no": product_report_no}
    res_json = requests.post(MATERIALS_URL, data=materials_param).json()
    materials = ""
    print("======Material_Json_Data===========")
    print(res_json)
    if res_json is None:
        return ""

    for js in res_json:
        if "rawmtrl_nm" not in js.keys():
            continue
        material_name = js["rawmtrl_nm"]
        materials += material_name + "|"
    print(materials)
    return materials
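
# Usage sketch (the report number below is hypothetical):
#   materials = get_materials_data(2004001234)
# The result is the product's raw material names joined by "|".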


def save_product_detail_data(detail_keywords: tuple):
    ...  # body omitted in this example
Example #5
def main():
    conn, cursor, sql = db_config()
    save_all_data(sql, conn, cursor)
Example #6
def preprocess_features(conn, with_sa=True):
    # ... (feature loading steps omitted in this example)
    # Drop the sentiment score columns when with_sa is False
    if not with_sa:
        sa_features = ['compound', 'positive', 'neutral', 'negative']
        features.drop(columns=sa_features, inplace=True)

    return features


def preprocess_calendar_features(conn, limit, offset):
    """Cleans and returns AIBNB calendar features to be used for scoring. Uses
    one-hot encoding for categorical features.

    """
    sql = '''SELECT * FROM calendar_features
             LIMIT {0} OFFSET {1}'''.format(limit, offset)
    features = db_table(conn, sql)

    # Split out entity columns and drop from features
    entity_cols = ['listing_id', 'date', 'available', 'actual_price']
    entities = features[entity_cols]
    features.drop(columns=entity_cols, inplace=True)

    return entities, features
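
# A minimal usage sketch, assuming an open DB connection; the limit and
# offset values are illustrative and page through calendar_features:
#   entities, features = preprocess_calendar_features(conn, 10000, 0)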


if __name__ == '__main__':
    # Get DB connection
    config = db_config()
    conn = db_conn(config)

    # Generate features table in the Airbnb DB
    execute_sql(conn, sql_file='sql/generate_features.sql')
Example #7
def main():
    config = db_config()
    conn = db_conn(config)

    # Remove basic results file
    try:
        combined = get_abspath('basic_results.csv', 'outputs')
        os.remove(combined)
    except FileNotFoundError:
        pass

    # Load features
    features_a = preprocess_features(conn)
    features_b = preprocess_features(conn, with_sa=False)

    # Create scorer to train models using RMSE
    scorer = make_scorer(rmse, greater_is_better=False)

    # Load models in a dict
    models = {
        'linear': load_pickled_model('models/linear.model'),
        'ridge': load_pickled_model('models/ridge.model'),
        'lasso': load_pickled_model('models/lasso.model'),
        'xgb_a': load_pickled_model('models/xgb_a.model'),
        'xgb_b': load_pickled_model('models/xgb_b.model')
    }

    # Validation curve parameter names and ranges
    vc_params = {
        'xgb_a': ('max_depth', np.arange(1, 20, 1)),
        'xgb_b': ('max_depth', np.arange(1, 20, 1))
    }

    # Split into train and test sets
    X_train_a, X_test_a, y_train_a, y_test_a = split_data(features_a)
    X_train_b, X_test_b, y_train_b, y_test_b = split_data(features_b)

    # Generate basic results and learning curves for all models
    for name, grid in models.items():
        if name in ['linear', 'ridge', 'lasso', 'xgb_a']:
            basic_results(grid, X_test_a, y_test_a, name)
            train_scores, test_scores = create_learning_curve(
                grid.best_estimator_,
                scorer,
                X_train_a,
                y_train_a,
                model_name=name,
                cv=3)
            plot_learning_curve(name, train_scores, test_scores)
        elif name == 'xgb_b':
            basic_results(grid, X_test_b, y_test_b, name)
            train_scores, test_scores = create_learning_curve(
                grid.best_estimator_,
                scorer,
                X_train_b,
                y_train_b,
                model_name=name,
                cv=3)
            plot_learning_curve(name, train_scores, test_scores)

    # Generate validation curves for XGBoost models
    create_validation_curve(models['xgb_a'].best_estimator_,
                            X_train_a,
                            y_train_a,
                            model_name='xgb_a',
                            param_name=vc_params['xgb_a'][0],
                            param_range=vc_params['xgb_a'][1],
                            scorer=scorer)

    create_validation_curve(models['xgb_b'].best_estimator_,
                            X_train_b,
                            y_train_b,
                            model_name='xgb_b',
                            param_name=vc_params['xgb_b'][0],
                            param_range=vc_params['xgb_b'][1],
                            scorer=scorer)

    # Generate XGBoost feature importance plots and results
    fi_a = get_feature_importances('xgb_a', models['xgb_a'], features_a)
    fi_b = get_feature_importances('xgb_b', models['xgb_b'], features_b)
    plot_feature_importances('xgb_a', fi_a, nfeats=15)
    plot_feature_importances('xgb_b', fi_b, nfeats=15)

    # Plot test set learning curves of all five models
    plot_lc_all()