def run_sentiment_analysis(with_graphs=False):
    """Runs sentiment analysis on review comments and writes the
    per-listing average scores back to the database.
    """
    conn = db_conn(db_config())
    sql = 'SELECT * FROM reviews'
    df = db_table(conn, sql)

    # Detect the language of each review and keep English comments only
    df['lang'] = df.apply(lambda x: detect_language(x['comments']), axis=1)
    df_english = df.loc[df['lang'] == 'english']
    df_scores = get_sentiment_scores(df_english)

    # Optionally plot a histogram for each sentiment score type
    if with_graphs:
        plot_score_histograms(df_scores['positive'],
                              score_type='Positive',
                              filename='pos_sentiment.png')
        plot_score_histograms(df_scores['neutral'],
                              score_type='Neutral',
                              filename='neu_sentiment.png')
        plot_score_histograms(df_scores['negative'],
                              score_type='Negative',
                              filename='neg_sentiment.png')
        plot_score_histograms(df_scores['compound'],
                              score_type='Compound',
                              filename='compound_sentiment.png')

    # Average scores by listing and persist them
    df_avg = sentiment_by_listing(df_scores)
    dtypes = {'listing_id': INTEGER, 'compound': FLOAT, 'positive': FLOAT,
              'neutral': FLOAT, 'negative': FLOAT}
    write_to_db(conn, df_avg, name='listings_sentiment', dtypes=dtypes)
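# detect_language and get_sentiment_scores are defined elsewhere in this
# repo; below is a minimal sketch of what they might look like, assuming
# langdetect for language detection and NLTK's VADER for scoring (both
# assumptions; the ISO-code-to-name mapping is hypothetical).
import pandas as pd
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from nltk.sentiment.vader import SentimentIntensityAnalyzer


def detect_language(text):
    """Return a language name for a comment, or 'unknown' on failure."""
    iso_names = {'en': 'english'}  # extend for other languages as needed
    try:
        return iso_names.get(detect(text), 'other')
    except (LangDetectException, TypeError):
        return 'unknown'


def get_sentiment_scores(df):
    """Append VADER polarity scores to each English review."""
    sia = SentimentIntensityAnalyzer()
    scores = df['comments'].apply(sia.polarity_scores).apply(pd.Series)
    scores = scores.rename(columns={'pos': 'positive', 'neu': 'neutral',
                                    'neg': 'negative'})
    return df.join(scores[['compound', 'positive', 'neutral', 'negative']])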
def main():
    config = db_config()
    conn = db_conn(config)

    # Create scorer to train models using RMSE (negated so that
    # scikit-learn's score maximization still minimizes the error)
    scorer = make_scorer(rmse, greater_is_better=False)

    # Load features with (a) and without (b) sentiment scores
    features_a = preprocess_features(conn)
    features_b = preprocess_features(conn, with_sa=False)

    # Get train and test sets for both feature sets
    X_train_a, X_test_a, y_train_a, y_test_a = split_data(features_a)
    X_train_b, X_test_b, y_train_b, y_test_b = split_data(features_b)

    # Train linear regression models with sentiment score features
    modelpath = 'models'
    linear_models = {
        'linear': LinearRegression(),
        'ridge': Ridge(),
        'lasso': Lasso()
    }
    for name, model in linear_models.items():
        grid = train_model(model_object=model, model_type=name,
                           X_train=X_train_a, y_train=y_train_a,
                           score_function=scorer, cv=3)
        filename = get_abspath('{}.model'.format(name), modelpath)
        ensure_dir_exists(filename)
        save_pickled_model(grid, filename)

    # Train XGBoost model with sentiment score features
    # ('reg:linear' is the legacy alias of 'reg:squarederror')
    xgb = XGBRegressor(objective='reg:linear')
    grid = train_model(model_object=xgb, model_type='xgb',
                       X_train=X_train_a, y_train=y_train_a,
                       score_function=scorer, cv=3)
    filename = get_abspath('xgb_a.model', modelpath)
    save_pickled_model(grid, filename)

    # Train XGBoost model without sentiment score features
    grid = train_model(model_object=xgb, model_type='xgb',
                       X_train=X_train_b, y_train=y_train_b,
                       score_function=scorer, cv=3)
    filename = get_abspath('xgb_b.model', modelpath)
    save_pickled_model(grid, filename)
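# rmse and train_model are helpers defined elsewhere in the repo; a minimal
# sketch of how they could be implemented with scikit-learn's GridSearchCV.
# The per-model hyperparameter grids below are hypothetical placeholders.
import numpy as np
from sklearn.model_selection import GridSearchCV

PARAM_GRIDS = {
    'linear': {},
    'ridge': {'alpha': [0.01, 0.1, 1.0, 10.0]},
    'lasso': {'alpha': [0.001, 0.01, 0.1, 1.0]},
    'xgb': {'max_depth': [3, 5, 7], 'n_estimators': [100, 300]},
}


def rmse(y_true, y_pred):
    """Root mean squared error, used as the grid-search metric."""
    return np.sqrt(np.mean((np.asarray(y_true) - np.asarray(y_pred)) ** 2))


def train_model(model_object, model_type, X_train, y_train,
                score_function, cv=3):
    """Grid-search the model and return the fitted GridSearchCV object."""
    grid = GridSearchCV(model_object, PARAM_GRIDS[model_type],
                        scoring=score_function, cv=cv)
    grid.fit(X_train, y_train)
    return grid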
def main():
    print(REPORT_NUMS)
    conn, cursor, sql = db_config()
    save_in_db(conn=conn, cursor=cursor, sql=sql)
from bs4 import BeautifulSoup
from utils import insert_in_db, db_config, apply_multiprocessing
import requests
import pickle
import pymysql

MATERIALS_URL = ("http://www.foodsafetykorea.go.kr/portal/healthyfoodlife/"
                 "searchHfPrdlstRawmtrl.do")
DETAIL_URL = ("http://www.foodsafetykorea.go.kr/portal/healthyfoodlife/"
              "searchHomeHFDetail.do")

CONN, CURSOR, SQL = db_config()


def get_materials_data(product_report_no: int):
    """Fetch the raw-material list for a product and return it as a
    '|'-delimited string."""
    materials_param = {"prdlst_report_no": product_report_no}
    res_json = requests.post(MATERIALS_URL, data=materials_param).json()
    materials = ""
    print("======Material_Json_Data===========")
    print(res_json)
    if res_json is None:
        return ""
    for js in res_json:
        if "rawmtrl_nm" not in js.keys():
            continue
        material_name = js["rawmtrl_nm"]
        materials += material_name + "|"
    print(materials)
    return materials


def save_product_detail_data(detail_keywords: tuple):
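# db_config is imported from utils and not shown here. Given that callers
# unpack it as (conn, cursor, sql), a minimal sketch of what it might look
# like with pymysql; the host, credentials, schema, and INSERT statement
# below are all hypothetical placeholders.
import pymysql


def db_config():
    """Open a MySQL connection and return (conn, cursor, insert SQL)."""
    conn = pymysql.connect(host='localhost', user='user',
                           password='password', db='food_db',
                           charset='utf8mb4')
    cursor = conn.cursor()
    sql = ("INSERT INTO products (report_no, materials) "
           "VALUES (%s, %s)")  # column layout is an assumption
    return conn, cursor, sql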
def main():
    conn, cursor, sql = db_config()
    save_all_data(sql, conn, cursor)
    sa_features = ['compound', 'positive', 'neutral', 'negative']
    features.drop(columns=sa_features, inplace=True)
    return features


def preprocess_calendar_features(conn, limit, offset):
    """Cleans and returns Airbnb calendar features to be used for scoring.
    Uses one-hot encoding for categorical features.
    """
    sql = '''SELECT * FROM calendar_features
             LIMIT {0} OFFSET {1}'''.format(limit, offset)
    features = db_table(conn, sql)

    # Split out entity columns and drop from features
    entity_cols = ['listing_id', 'date', 'available', 'actual_price']
    entities = features[entity_cols]
    features.drop(columns=entity_cols, inplace=True)
    return entities, features


if __name__ == '__main__':
    # Get DB connection
    config = db_config()
    conn = db_conn(config)

    # Generate features table in the Airbnb DB
    execute_sql(conn, sql_file='sql/generate_features.sql')
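# The one-hot encoding step mentioned in the docstring above is not visible
# in this excerpt; a minimal sketch of how it might look with pandas. The
# categorical column names are assumptions, not taken from the repo.
import pandas as pd


def one_hot_encode(features, categorical_cols=('property_type', 'room_type')):
    """One-hot encode the given categorical columns (names are
    placeholders) and return the expanded feature frame."""
    cols = [c for c in categorical_cols if c in features.columns]
    return pd.get_dummies(features, columns=cols)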
def main():
    config = db_config()
    conn = db_conn(config)

    # Remove any existing combined basic results file
    try:
        combined = get_abspath('basic_results.csv', 'outputs')
        os.remove(combined)
    except OSError:
        pass

    # Load features with (a) and without (b) sentiment scores
    features_a = preprocess_features(conn)
    features_b = preprocess_features(conn, with_sa=False)

    # Create scorer to train models using RMSE
    scorer = make_scorer(rmse, greater_is_better=False)

    # Load models in a dict
    models = {
        'linear': load_pickled_model('models/linear.model'),
        'ridge': load_pickled_model('models/ridge.model'),
        'lasso': load_pickled_model('models/lasso.model'),
        'xgb_a': load_pickled_model('models/xgb_a.model'),
        'xgb_b': load_pickled_model('models/xgb_b.model')
    }

    # Validation curve parameter names and ranges
    vc_params = {
        'xgb_a': ('max_depth', np.arange(1, 20, 1)),
        'xgb_b': ('max_depth', np.arange(1, 20, 1))
    }

    # Split into train and test sets
    X_train_a, X_test_a, y_train_a, y_test_a = split_data(features_a)
    X_train_b, X_test_b, y_train_b, y_test_b = split_data(features_b)

    # Generate basic results and learning curves for all models;
    # every model except xgb_b was trained on feature set A
    for name, grid in models.items():
        if name == 'xgb_b':
            basic_results(grid, X_test_b, y_test_b, name)
            train_scores, test_scores = create_learning_curve(
                grid.best_estimator_, scorer, X_train_b, y_train_b,
                model_name=name, cv=3)
        else:
            basic_results(grid, X_test_a, y_test_a, name)
            train_scores, test_scores = create_learning_curve(
                grid.best_estimator_, scorer, X_train_a, y_train_a,
                model_name=name, cv=3)
        plot_learning_curve(name, train_scores, test_scores)

    # Generate validation curves for XGBoost models
    create_validation_curve(models['xgb_a'].best_estimator_,
                            X_train_a, y_train_a, model_name='xgb_a',
                            param_name=vc_params['xgb_a'][0],
                            param_range=vc_params['xgb_a'][1],
                            scorer=scorer)
    create_validation_curve(models['xgb_b'].best_estimator_,
                            X_train_b, y_train_b, model_name='xgb_b',
                            param_name=vc_params['xgb_b'][0],
                            param_range=vc_params['xgb_b'][1],
                            scorer=scorer)

    # Generate XGBoost feature importance plots and results
    fi_a = get_feature_importances('xgb_a', models['xgb_a'], features_a)
    fi_b = get_feature_importances('xgb_b', models['xgb_b'], features_b)
    plot_feature_importances('xgb_a', fi_a, nfeats=15)
    plot_feature_importances('xgb_b', fi_b, nfeats=15)

    # Plot test set learning curves of all five models
    plot_lc_all()
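# basic_results is defined elsewhere in the repo; given that main() deletes
# outputs/basic_results.csv up front, a minimal sketch of a version that
# appends one summary row per model. The column layout and the reuse of
# rmse from the training script are assumptions.
import csv


def basic_results(grid, X_test, y_test, model_name,
                  outfile='outputs/basic_results.csv'):
    """Score a fitted grid on the test set and append a row of summary
    results to the combined CSV."""
    preds = grid.best_estimator_.predict(X_test)
    test_rmse = rmse(y_test, preds)
    with open(outfile, 'a', newline='') as f:
        writer = csv.writer(f)
        writer.writerow([model_name, grid.best_params_, test_rmse])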