def run_sentiment_analysis(with_graphs=False):
    conn = db_conn(db_config())
    sql = 'SELECT * FROM reviews'
    df = db_table(conn, sql)

    # Keep only English-language reviews before scoring
    df['lang'] = df.apply(lambda x: detect_language(x['comments']), axis=1)
    df_english = df.loc[df['lang'] == 'english']
    df_scores = get_sentiment_scores(df_english)

    if with_graphs:
        plot_score_histograms(df_scores['positive'], score_type='Positive',
                              filename='pos_sentiment.png')
        plot_score_histograms(df_scores['neutral'], score_type='Neutral',
                              filename='neu_sentiment.png')
        plot_score_histograms(df_scores['negative'], score_type='Negative',
                              filename='neg_sentiment.png')
        plot_score_histograms(df_scores['compound'], score_type='Compound',
                              filename='compound_sentiment.png')

    # Average scores per listing and persist them
    df_avg = sentiment_by_listing(df_scores)
    dtypes = {'listing_id': INTEGER, 'compound': FLOAT, 'positive': FLOAT,
              'neutral': FLOAT, 'negative': FLOAT}
    write_to_db(conn, df_avg, name='listings_sentiment', dtypes=dtypes)
def get_publisher_specific_is_about_climate_change_dfs() -> typing.Dict[typing.Any, pd.DataFrame]:
    dfs = {}
    for publisher in utils.publishers:
        with utils.db_conn() as conn:
            dfs[publisher] = pd.read_sql_query(f"""
                SELECT
                    articles_total.published_date AS published,
                    (SELECT CAST(articles_about_climate_change_absolute.n AS real)
                        / articles_total.n) * 100 AS articles_about_climate_change_percent
                FROM (
                    SELECT TO_CHAR(published, 'YYYY') AS published_date, COUNT(*) AS n
                    FROM article
                    WHERE publisher = '{publisher}'
                        AND ((SELECT EXTRACT(YEAR FROM published)) BETWEEN 2015 AND 2020)
                        AND {is_about_climate_change_sql_statement[publisher.language]}
                    GROUP BY TO_CHAR(published, 'YYYY')
                ) AS articles_about_climate_change_absolute
                JOIN (
                    SELECT TO_CHAR(published, 'YYYY') AS published_date, COUNT(*) AS n
                    FROM article
                    WHERE publisher = '{publisher}'
                        AND ((SELECT EXTRACT(YEAR FROM published)) BETWEEN 2015 AND 2020)
                    GROUP BY TO_CHAR(published, 'YYYY')
                ) AS articles_total
                    ON articles_total.published_date = articles_about_climate_change_absolute.published_date
                ORDER BY articles_total.published_date;
                """, conn, index_col="published")
        dfs[publisher].index = pd.to_datetime(dfs[publisher].index, format="%Y")
    return dfs
def insert_to_db(payload):
    try:
        db = db_conn()
        db.insert(payload)
        print("insert finished")
    except Exception as exc:
        # Report the actual failure instead of swallowing it silently
        print(f"insert failed: {exc}")
def main():
    graph = Graph()
    for publisher in utils.publishers:
        with utils.db_conn() as conn:
            df = pd.read_sql_query(f"""
                SELECT
                    articles_total.published_date AS published_date,
                    (SELECT CAST(articles_about_climate_change_absolute.n AS real)
                        / articles_total.n) * 100 AS articles_about_climate_change_percent
                FROM (
                    SELECT to_timestamp(TO_CHAR(published, 'YYYYMM'), 'YYYYMM') AS published_date, COUNT(*) AS n
                    FROM article
                    WHERE publisher = '{publisher}'
                        AND (SELECT EXTRACT(YEAR FROM published)) >= 2015
                        AND {is_about_climate_change_sql_statement[publisher.language]}
                    GROUP BY TO_CHAR(published, 'YYYYMM')
                ) AS articles_about_climate_change_absolute
                JOIN (
                    SELECT to_timestamp(TO_CHAR(published, 'YYYYMM'), 'YYYYMM') AS published_date, COUNT(*) AS n
                    FROM article
                    WHERE publisher = '{publisher}'
                        AND (SELECT EXTRACT(YEAR FROM published)) >= 2015
                    GROUP BY TO_CHAR(published, 'YYYYMM')
                ) AS articles_total
                    ON articles_total.published_date = articles_about_climate_change_absolute.published_date
                ORDER BY TO_CHAR(articles_total.published_date, 'YYYYMM');
                """, conn)
        graph.plot(publisher, df["published_date"], df["articles_about_climate_change_percent"])
    graph.save()
def _write_user_to_db(telegram_id, name):
    with db_conn() as conn:
        with conn.cursor() as cur:
            cur.execute(f"""
                INSERT INTO {os.environ.get('DB_PROD_LEVEL')}.users
                    (telegram_id, name, status_timestamp)
                VALUES ({telegram_id}, '{name}', '{datetime.datetime.now(tz=pytz.timezone('Europe/Berlin'))}')
                """)
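# The helper above (and _write_event_to_db below) interpolates values directly into
# the SQL string. A minimal alternative sketch, assuming db_conn() yields a
# psycopg2-style connection; the function name below is hypothetical, while the
# table and environment variable come from the original:
import datetime
import os

import pytz


def _write_user_to_db_parameterized(telegram_id, name):
    with db_conn() as conn:
        with conn.cursor() as cur:
            # The schema name cannot be a bind parameter, but the values can,
            # so the driver handles quoting and escaping.
            cur.execute(
                f"""
                INSERT INTO {os.environ.get('DB_PROD_LEVEL')}.users
                    (telegram_id, name, status_timestamp)
                VALUES (%s, %s, %s)
                """,
                (telegram_id, name,
                 datetime.datetime.now(tz=pytz.timezone('Europe/Berlin'))),
            )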
def __init__(self, subreddit_names):
    self.num_active_threads = 0
    self.subreddit_names = subreddit_names
    self.subreddits = {}
    self.subreddits_dao = {}
    self.index = 0
    self.conn = utils.db_conn()
    for _subreddit in subreddit_names:
        self.add_subreddit(_subreddit)
def f_df_save(df, table_name, sql_option='append'):
    '''
    Storing: DataFrame -> SQL
    Used extensively for scraping
    '''
    if not table_name:
        table_name = str(df)
    # Create the SQL table and the schema if it's the initial run
    df.to_sql(table_name, db_conn(), if_exists=sql_option)
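# Usage sketch for f_df_save. The DataFrame contents and table name below are
# purely illustrative; db_conn() is the original helper and is assumed to return
# a connection type that DataFrame.to_sql accepts (e.g. a SQLAlchemy connectable).
import pandas as pd

batch = pd.DataFrame({
    "url": ["https://example.com/item/1", "https://example.com/item/2"],
    "price": [19.99, 24.50],
})

# Appends to (or creates, on the first run) the 'scraped_items' table.
f_df_save(batch, "scraped_items")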
def main():
    config = db_config()
    conn = db_conn(config)

    # Create scorer to train models using RMSE
    scorer = make_scorer(rmse, greater_is_better=False)

    # Load features
    features_a = preprocess_features(conn)
    features_b = preprocess_features(conn, with_sa=False)

    # Get train and test sets for both feature sets
    X_train_a, X_test_a, y_train_a, y_test_a = split_data(features_a)
    X_train_b, X_test_b, y_train_b, y_test_b = split_data(features_b)

    # Train regression models with sentiment score features
    modelpath = 'models'
    linear_models = {
        'linear': LinearRegression(),
        'ridge': Ridge(),
        'lasso': Lasso()
    }
    for name, model in linear_models.items():
        grid = train_model(model_object=model, model_type=name,
                           X_train=X_train_a, y_train=y_train_a,
                           score_function=scorer, cv=3)
        filename = get_abspath('{}.model'.format(name), modelpath)
        ensure_dir_exists(filename)
        save_pickled_model(grid, filename)

    # Train XGBoost model with sentiment score features
    xgb = XGBRegressor(objective='reg:linear')
    grid = train_model(model_object=xgb, model_type='xgb',
                       X_train=X_train_a, y_train=y_train_a,
                       score_function=scorer, cv=3)
    filename = get_abspath('xgb_a.model', modelpath)
    save_pickled_model(grid, filename)

    # Train XGBoost model without sentiment score features
    grid = train_model(model_object=xgb, model_type='xgb',
                       X_train=X_train_b, y_train=y_train_b,
                       score_function=scorer, cv=3)
    filename = get_abspath('xgb_b.model', modelpath)
    save_pickled_model(grid, filename)
def _write_event_to_db(chat_id, event_id, event_name, telegram_id, msg_text):
    with db_conn() as conn:
        with conn.cursor() as cur:
            cur.execute(f"""
                INSERT INTO {os.environ.get('DB_PROD_LEVEL')}.group_events
                    (chat_id, event_id, event_name, telegram_id, msg_text, status_timestamp)
                VALUES (
                    {chat_id},
                    {event_id},
                    '{event_name}',
                    {telegram_id},
                    '{msg_text}',
                    '{datetime.datetime.now(tz=pytz.timezone('Europe/Berlin'))}'
                )
                """)
def _load_users():
    """Get the most recent telegram_id-name combination for each telegram_id."""
    with db_conn() as conn:
        with conn.cursor() as cur:
            cur.execute(f"""
                SELECT u.telegram_id, u.name
                FROM {os.environ.get('DB_PROD_LEVEL')}.users u
                JOIN (
                    SELECT telegram_id, MAX(status_timestamp) AS max_timestamp
                    FROM {os.environ.get('DB_PROD_LEVEL')}.users
                    GROUP BY telegram_id
                ) s
                    ON u.telegram_id = s.telegram_id
                    AND u.status_timestamp = s.max_timestamp;
                """)
            df_users = cur.fetchall()
    return {row[0]: row[1] for row in df_users}
def main():
    if not os.path.isdir("output"):
        os.mkdir("output")
    for publisher in utils.publishers:
        with utils.db_conn() as conn:
            df = pd.read_sql_query(f"""
                SELECT url, publisher, TO_CHAR(published, 'YYYY-MM-DD') AS published
                FROM article TABLESAMPLE BERNOULLI(2)
                WHERE publisher = '{publisher}'
                    AND (SELECT EXTRACT(YEAR FROM published)) >= 2015
                    AND {is_about_climate_change_sql_statement[publisher.language]};
                """, conn)
        df.to_excel(f"output/{publisher}.ods", engine="odf")
def main():
    args = arguments()

    browser = args.browser
    if browser.lower() in ('c', 'chrome'):
        driver = driver_chrome()
    elif browser.lower() in ('f', 'firefox'):
        driver = driver_firefox()
    else:
        # Avoid a NameError further down when an unsupported browser is passed
        raise ValueError(f"Unsupported browser: {browser}")

    print('\n-- RECLAME AQUI SCRAPER --')

    file = args.file
    id_page = args.id
    pages = args.pages

    conn, cursor = db_conn()
    coletor = url_collector(driver, file, id_page, pages, conn, cursor)
    scraper(driver, coletor, id_page, conn, cursor)
    driver.quit()
def main():
    dfs = []
    for publisher in utils.publishers:
        with utils.db_conn() as conn:
            dfs.append(pd.read_sql_query(f"""
                SELECT
                    articles_total.published_date AS published_date,
                    (SELECT CAST(articles_about_climate_change_absolute.n AS real)
                        / articles_total.n) * 100 AS articles_about_climate_change_percent
                FROM (
                    SELECT TO_CHAR(published, 'YYYY') AS published_date, COUNT(*) AS n
                    FROM article
                    WHERE publisher = '{publisher}'
                        AND ((SELECT EXTRACT(YEAR FROM published)) BETWEEN 2015 AND 2020)
                        AND {is_about_climate_change_sql_statement[publisher.language]}
                    GROUP BY TO_CHAR(published, 'YYYY')
                ) AS articles_about_climate_change_absolute
                JOIN (
                    SELECT TO_CHAR(published, 'YYYY') AS published_date, COUNT(*) AS n
                    FROM article
                    WHERE publisher = '{publisher}'
                        AND ((SELECT EXTRACT(YEAR FROM published)) BETWEEN 2015 AND 2020)
                    GROUP BY TO_CHAR(published, 'YYYY')
                ) AS articles_total
                    ON articles_total.published_date = articles_about_climate_change_absolute.published_date
                ORDER BY articles_total.published_date;
                """, conn))

    graph = Graph()
    df_combined = pd.concat(dfs)
    df_combined = df_combined.groupby(df_combined["published_date"]).mean()
    graph.plot(df_combined.index, df_combined["articles_about_climate_change_percent"])
    graph.save()
import requests
import time
from tqdm import tqdm
import json
from bs4 import BeautifulSoup

from utils import db_conn

# Read parameters from the `CONSTANTS.json` file
with open("CONSTANTS.json", "r") as file:
    constants = json.loads(file.read())

# Initialise/connect to the DB
cnxn, cursor = db_conn(constants["DB_name"])

# Get session IDs from the DB
query = "SELECT session_id FROM sessions"
cursor.execute(query)
session_ids = [row[0] for row in cursor.fetchall()]

# Call the API; get sittings for each session
xmls = []
sittings_url = "http://apps.lrs.lt/sip/p2b.ad_seimo_posedziai?sesijos_id="
for ids in tqdm(session_ids):
    result = requests.get(sittings_url + ids)
    xmls.append(result.text)
    time.sleep(0.2)
import pandas as pd
import matplotlib.pyplot as plt

import utils

for publisher in utils.publishers:
    with utils.db_conn() as conn:
        df_m = pd.read_sql_query(f"""
            SELECT to_timestamp(TO_CHAR(published, 'YYYYMM'), 'YYYYMM') AS published_date, COUNT(*) AS n
            FROM article
            WHERE publisher = '{publisher}'
                AND (SELECT EXTRACT(YEAR FROM published)) >= 2015
            GROUP BY TO_CHAR(published, 'YYYYMM')
            ORDER BY TO_CHAR(published, 'YYYYMM');
            """, conn)
        df_d = pd.read_sql_query(f"""
            SELECT DATE(published) AS published_date, COUNT(*) AS n
            FROM article
            WHERE publisher = '{publisher}'
                AND (SELECT EXTRACT(YEAR FROM published)) >= 2015
            GROUP BY DATE(published)
            ORDER BY DATE(published);
            """, conn)
    fig, ax = plt.subplots(2, figsize=(42, 4.2))
    ax[0].plot(df_m["published_date"], df_m["n"])
    ax[1].plot(df_d["published_date"], df_d["n"])
def run(self):
    self.conn = utils.db_conn()
    self.stream()
    self.conn.close()
    sa_features = ['compound', 'positive', 'neutral', 'negative']
    features.drop(columns=sa_features, inplace=True)
    return features


def preprocess_calendar_features(conn, limit, offset):
    """Cleans and returns AIBNB calendar features to be used for scoring.

    Uses one-hot encoding for categorical features.
    """
    sql = '''SELECT * FROM calendar_features LIMIT {0} OFFSET {1}'''.format(limit, offset)
    features = db_table(conn, sql)

    # Split out entity columns and drop from features
    entity_cols = ['listing_id', 'date', 'available', 'actual_price']
    entities = features[entity_cols]
    features.drop(columns=entity_cols, inplace=True)
    return entities, features


if __name__ == '__main__':
    # Get DB connection
    config = db_config()
    conn = db_conn(config)

    # Generate features table in the AIBNB DB
    execute_sql(conn, sql_file='sql/generate_features.sql')
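# Because preprocess_calendar_features exposes LIMIT/OFFSET arguments, scoring can
# be done in batches. The sketch below is illustrative only: the function name,
# batch size, and the fitted sklearn-style `model` are hypothetical assumptions,
# not part of the original code.
def score_calendar_in_batches(conn, model, batch_size=50000):
    offset = 0
    while True:
        entities, features = preprocess_calendar_features(conn, batch_size, offset)
        if features.empty:
            break
        # Attach predictions to the entity columns so results stay keyed by listing/date
        preds = model.predict(features)
        yield entities.assign(predicted_price=preds)
        offset += batch_size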
def create_tables():
    """Create the staging and prod schemas and their tables."""
    commands = (
        """
        CREATE SCHEMA IF NOT EXISTS prod
        """,
        """
        CREATE SCHEMA IF NOT EXISTS staging
        """,
        """
        DROP TABLE IF EXISTS staging.users;
        CREATE TABLE staging.users (
            id SERIAL PRIMARY KEY,
            telegram_id BIGINT NOT NULL,
            name VARCHAR(60) NOT NULL,
            status_timestamp TIMESTAMP NOT NULL)
        """,
        """
        INSERT INTO staging.users (telegram_id, name, status_timestamp)
        VALUES
            (459871623, 'tester1', '2021-12-29 16:24:52'),
            (918237832, 'tester2', '2021-12-29 18:24:52')
        """,
        """
        CREATE TABLE IF NOT EXISTS prod.users (
            id SERIAL PRIMARY KEY,
            telegram_id BIGINT NOT NULL,
            name VARCHAR(60) NOT NULL,
            status_timestamp TIMESTAMP NOT NULL)
        """,
        """
        DROP TABLE IF EXISTS staging.aya_messages;
        CREATE TABLE staging.aya_messages (
            id SERIAL PRIMARY KEY,
            chat_id BIGINT NOT NULL,
            telegram_id BIGINT NOT NULL,
            update_id BIGINT,
            message_text VARCHAR(255) NOT NULL,
            event_name VARCHAR(60),
            timestamp_received TIMESTAMP NOT NULL,
            timestamp_saved TIMESTAMP NOT NULL)
        """,
        """
        INSERT INTO staging.aya_messages
            (chat_id, telegram_id, update_id, message_text, event_name, timestamp_received, timestamp_saved)
        VALUES
            (918237832, 918237832, 123, '/fasten di 12 12', 'fast_start', '2021-01-15 16:24:52', '2021-01-15 16:24:52'),
            (918237832, 918237832, 126, '/teilnehmen', 'fast_end', '2021-01-15 18:24:52', '2021-01-15 18:24:52')
        """,
        """
        CREATE TABLE IF NOT EXISTS prod.aya_messages (
            id SERIAL PRIMARY KEY,
            chat_id BIGINT NOT NULL,
            telegram_id BIGINT NOT NULL,
            update_id BIGINT,
            message_text VARCHAR(255) NOT NULL,
            event_name VARCHAR(60),
            timestamp_received TIMESTAMP NOT NULL,
            timestamp_saved TIMESTAMP NOT NULL)
        """,
        """
        DROP TABLE IF EXISTS staging.aya_events;
        CREATE TABLE staging.aya_events (
            id SERIAL PRIMARY KEY,
            chat_id BIGINT NOT NULL,
            telegram_id BIGINT NOT NULL,
            event_name VARCHAR(60),
            event_value FLOAT,
            timestamp_saved TIMESTAMP NOT NULL)
        """,
        """
        INSERT INTO staging.aya_events (chat_id, telegram_id, event_name, event_value, timestamp_saved)
        VALUES
            (918237832, 918237832, 'fast_end', 18.75, '2021-01-15 16:24:52'),
            (918237832, 918237832, 'fast_end', 14.23, '2021-01-15 18:24:52')
        """,
        """
        CREATE TABLE IF NOT EXISTS prod.aya_events (
            id SERIAL PRIMARY KEY,
            chat_id BIGINT NOT NULL,
            telegram_id BIGINT NOT NULL,
            event_name VARCHAR(60),
            event_value FLOAT,
            timestamp_saved TIMESTAMP NOT NULL)
        """,
        """
        DROP TABLE IF EXISTS staging.group_events;
        CREATE TABLE staging.group_events (
            id SERIAL PRIMARY KEY,
            chat_id BIGINT NOT NULL,
            event_id BIGINT NOT NULL,
            event_name VARCHAR(60) NOT NULL,
            telegram_id BIGINT NOT NULL,
            msg_text VARCHAR(255) NOT NULL,
            status_timestamp TIMESTAMP NOT NULL)
        """,
        """
        INSERT INTO staging.group_events (chat_id, event_id, event_name, telegram_id, msg_text, status_timestamp)
        VALUES
            (-123456789, 123, 'fast_create', 918237832, '/fasten di 12 12', '2021-01-15 16:24:52'),
            (-123456789, 126, 'fast_accept', 918237832, '/teilnehmen', '2021-01-15 18:24:52'),
            (-123456789, 129, 'fast_decline', 918237832, '/ablehnen', '2021-01-15 18:25:52'),
            (-123456789, 138, 'fast_delete', 918237832, '/loeschen', '2021-01-15 19:24:52')
        """,
        """
        CREATE TABLE IF NOT EXISTS prod.group_events (
            id SERIAL PRIMARY KEY,
            chat_id BIGINT NOT NULL,
            event_id BIGINT NOT NULL,
            event_name VARCHAR(60) NOT NULL,
            telegram_id BIGINT NOT NULL,
            msg_text VARCHAR(255) NOT NULL,
            status_timestamp TIMESTAMP NOT NULL)
        """,
    )

    with db_conn() as conn:
        with conn.cursor() as cur:
            for command in commands:
                cur.execute(command)
                logging.info(f"executed stmt: {command}")
def main():
    config = db_config()
    conn = db_conn(config)

    # Remove basic results file
    try:
        combined = get_abspath('basic_results.csv', 'outputs')
        os.remove(combined)
    except IOError:
        pass

    # Load features
    features_a = preprocess_features(conn)
    features_b = preprocess_features(conn, with_sa=False)

    # Create scorer to train models using RMSE
    scorer = make_scorer(rmse, greater_is_better=False)

    # Load models in a dict
    models = {
        'linear': load_pickled_model('models/linear.model'),
        'ridge': load_pickled_model('models/ridge.model'),
        'lasso': load_pickled_model('models/lasso.model'),
        'xgb_a': load_pickled_model('models/xgb_a.model'),
        'xgb_b': load_pickled_model('models/xgb_b.model')
    }

    # Validation curve parameter names and ranges
    vc_params = {
        'xgb_a': ('max_depth', np.arange(1, 20, 1)),
        'xgb_b': ('max_depth', np.arange(1, 20, 1))
    }

    # Split into train and test sets
    X_train_a, X_test_a, y_train_a, y_test_a = split_data(features_a)
    X_train_b, X_test_b, y_train_b, y_test_b = split_data(features_b)

    # Generate basic results and learning curves for all models
    for name, grid in models.items():
        # Models trained on the feature set with sentiment scores
        if name in ['linear', 'ridge', 'lasso', 'xgb_a']:
            basic_results(grid, X_test_a, y_test_a, name)
            train_scores, test_scores = create_learning_curve(
                grid.best_estimator_, scorer, X_train_a, y_train_a,
                model_name=name, cv=3)
            plot_learning_curve(name, train_scores, test_scores)
        # Model trained on the feature set without sentiment scores
        if name == 'xgb_b':
            basic_results(grid, X_test_b, y_test_b, name)
            train_scores, test_scores = create_learning_curve(
                grid.best_estimator_, scorer, X_train_b, y_train_b,
                model_name=name, cv=3)
            plot_learning_curve(name, train_scores, test_scores)

    # Generate validation curves for XGBoost models
    create_validation_curve(models['xgb_a'].best_estimator_, X_train_a, y_train_a,
                            model_name='xgb_a',
                            param_name=vc_params['xgb_a'][0],
                            param_range=vc_params['xgb_a'][1],
                            scorer=scorer)
    create_validation_curve(models['xgb_b'].best_estimator_, X_train_b, y_train_b,
                            model_name='xgb_b',
                            param_name=vc_params['xgb_b'][0],
                            param_range=vc_params['xgb_b'][1],
                            scorer=scorer)

    # Generate XGBoost feature importance plots and results
    fi_a = get_feature_importances('xgb_a', models['xgb_a'], features_a)
    fi_b = get_feature_importances('xgb_b', models['xgb_b'], features_b)
    plot_feature_importances('xgb_a', fi_a, nfeats=15)
    plot_feature_importances('xgb_b', fi_b, nfeats=15)

    # Plot test set learning curves of all five models
    plot_lc_all()