def run_sentiment_analysis(with_graphs=False):
    conn = db_conn(db_config())
    sql = 'SELECT * FROM reviews'
    df = db_table(conn, sql)
    df['lang'] = df.apply(lambda x: detect_language(x['comments']), axis=1)
    df_english = df.loc[df['lang'] == 'english']
    df_scores = get_sentiment_scores(df_english)

    if with_graphs:
        plot_score_histograms(df_scores['positive'],
                              score_type='Positive',
                              filename='pos_sentiment.png')
        plot_score_histograms(df_scores['neutral'],
                              score_type='Neutral',
                              filename='neu_sentiment.png')
        plot_score_histograms(df_scores['negative'],
                              score_type='Negative',
                              filename='neg_sentiment.png')
        plot_score_histograms(df_scores['compound'],
                              score_type='Compound',
                              filename='compound_sentiment.png')

    df_avg = sentiment_by_listing(df_scores)
    dtypes = {'listing_id': INTEGER,
              'compound': FLOAT,
              'positive': FLOAT,
              'neutral': FLOAT,
              'negative': FLOAT}

    write_to_db(conn, df_avg, name='listings_sentiment', dtypes=dtypes)
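# The positive/neutral/negative/compound score columns used above match the output
# of NLTK's VADER analyzer. get_sentiment_scores itself is not part of this example,
# so the following is only an illustrative sketch (the function name and column
# mapping are assumptions), not necessarily the author's implementation.
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer  # needs nltk.download('vader_lexicon')

def get_sentiment_scores_vader(df_english):
    # Score each review text and expand the VADER result dict into columns.
    sia = SentimentIntensityAnalyzer()
    scores = df_english['comments'].apply(sia.polarity_scores).apply(pd.Series)
    # VADER uses short keys; rename them to the column names expected above.
    scores = scores.rename(columns={'pos': 'positive',
                                    'neu': 'neutral',
                                    'neg': 'negative'})
    return pd.concat([df_english.reset_index(drop=True),
                      scores.reset_index(drop=True)], axis=1)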
def get_publisher_specific_is_about_climate_change_dfs(
) -> typing.Dict[typing.Any, pd.DataFrame]:
    dfs = {}

    for publisher in utils.publishers:
        with utils.db_conn() as conn:
            dfs[publisher] = pd.read_sql_query(f"""
                SELECT articles_total.published_date AS published,
                    (SELECT CAST(articles_about_climate_change_absolute.n AS real) / articles_total.n) * 100
                    AS articles_about_climate_change_percent
                FROM (
                    SELECT TO_CHAR(published, 'YYYY') AS published_date, COUNT(*) as n
                    FROM article
                    WHERE publisher = '{publisher}' AND ((SELECT EXTRACT(YEAR FROM published)) BETWEEN 2015 AND 2020)
                        AND {is_about_climate_change_sql_statement[publisher.language]}
                    GROUP BY TO_CHAR(published, 'YYYY')
                ) AS articles_about_climate_change_absolute
                JOIN (
                    SELECT TO_CHAR(published, 'YYYY') AS published_date, COUNT(*) AS n
                    FROM article
                    WHERE publisher = '{publisher}' AND ((SELECT EXTRACT(YEAR FROM published)) BETWEEN 2015 AND 2020)
                    GROUP BY TO_CHAR(published, 'YYYY')
                ) AS articles_total
                ON articles_total.published_date = articles_about_climate_change_absolute.published_date
                ORDER BY articles_total.published_date;
                """,
                                               conn,
                                               index_col="published")
            dfs[publisher].index = pd.to_datetime(dfs[publisher].index,
                                                  format="%Y")
    return dfs
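# Usage sketch for the function above (the matplotlib plotting and the output
# filename are illustrative, not taken from the original project): plot the yearly
# share of climate-change articles for each publisher.
import matplotlib.pyplot as plt

def plot_climate_change_share():
    dfs = get_publisher_specific_is_about_climate_change_dfs()
    fig, ax = plt.subplots()
    for publisher, df in dfs.items():
        ax.plot(df.index, df["articles_about_climate_change_percent"],
                label=str(publisher))
    ax.set_ylabel("articles about climate change (%)")
    ax.legend()
    fig.savefig("climate_change_share_by_publisher.png")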
Example #3
def insert_to_db(payload):
    try:
        db = db_conn()
        db.insert(payload)
        print("insert finished")
    except Exception as exc:
        print(f"insert failed: {exc}")
def main():
    graph = Graph()

    for publisher in utils.publishers:

        with utils.db_conn() as conn:
            df = pd.read_sql_query(
                f"""
                SELECT articles_total.published_date AS published_date,
                    (SELECT CAST(articles_about_climate_change_absolute.n AS real) / articles_total.n) * 100
                    AS articles_about_climate_change_percent
                FROM (
                    SELECT to_timestamp(TO_CHAR(published, 'YYYYMM'), 'YYYYMM') AS published_date, COUNT(*) as n
                    FROM article
                    WHERE publisher = '{publisher}' AND (SELECT EXTRACT(YEAR FROM published)) >= 2015
                    AND {is_about_climate_change_sql_statement[publisher.language]}
                    GROUP BY TO_CHAR(published, 'YYYYMM')
                ) AS articles_about_climate_change_absolute
                JOIN (
                    SELECT to_timestamp(TO_CHAR(published, 'YYYYMM'), 'YYYYMM') AS published_date, COUNT(*) AS n
                    FROM article
                    WHERE publisher = '{publisher}' AND (SELECT EXTRACT(YEAR FROM published)) >= 2015
                    GROUP BY TO_CHAR(published, 'YYYYMM')
                ) AS articles_total
                ON articles_total.published_date = articles_about_climate_change_absolute.published_date
                ORDER BY TO_CHAR(articles_total.published_date, 'YYYYMM');
                """
                , conn)

        graph.plot(publisher, df["published_date"], df["articles_about_climate_change_percent"])

    graph.save()
Example #5
def _write_user_to_db(telegram_id, name):
    with db_conn() as conn:
        with conn.cursor() as cur:
            # Pass the values as query parameters (psycopg2-style %s placeholders)
            # so a name containing quotes cannot break the statement; only the
            # schema name, which cannot be parameterized, is still interpolated
            # from the environment.
            cur.execute(
                f"""
                INSERT INTO {os.environ.get('DB_PROD_LEVEL')}.users
                    (telegram_id, name, status_timestamp)
                VALUES (%s, %s, %s)
                """,
                (telegram_id, name,
                 datetime.datetime.now(tz=pytz.timezone('Europe/Berlin'))))
    def __init__(self, subreddit_names):
        self.num_active_threads = 0
        self.subreddit_names = subreddit_names
        self.subreddits = {}
        self.subreddits_dao = {}
        self.index = 0
        self.conn = utils.db_conn()
        for _subreddit in subreddit_names:
            self.add_subreddit(_subreddit)
def f_df_save(df, table_name, sql_option='append'):
    '''
    Storing: DataFrame -> SQL 
    Used extensively for scraping
    '''
    if not table_name:
        table_name = str(df)

    #Create the SQL table and the schema if it's the initial run
    df.to_sql(table_name, db_conn(), if_exists=sql_option)
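# Usage sketch for f_df_save (table name and data are illustrative): append a
# batch of scraped rows, or pass sql_option='replace' to rebuild the table.
import pandas as pd

scraped = pd.DataFrame({"url": ["https://example.org"], "title": ["Example"]})
f_df_save(scraped, "scraped_pages")                         # append (default)
f_df_save(scraped, "scraped_pages", sql_option="replace")   # overwrite instead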
Example #9
def main():
    config = db_config()
    conn = db_conn(config)

    # Create scorer to train models using RMSE
    scorer = make_scorer(rmse, greater_is_better=False)

    # Load features
    features_a = preprocess_features(conn)
    features_b = preprocess_features(conn, with_sa=False)

    # Get train and test sets for both feature sets
    X_train_a, X_test_a, y_train_a, y_test_a = split_data(features_a)
    X_train_b, X_test_b, y_train_b, y_test_b = split_data(features_b)

    # Train regression models with sentiment score features
    modelpath = 'models'
    linear_models = {
        'linear': LinearRegression(),
        'ridge': Ridge(),
        'lasso': Lasso()
    }

    for name, model in linear_models.items():
        grid = train_model(model_object=model,
                           model_type=name,
                           X_train=X_train_a,
                           y_train=y_train_a,
                           score_function=scorer,
                           cv=3)
        filename = get_abspath('{}.model'.format(name), modelpath)
        ensure_dir_exists(filename)
        save_pickled_model(grid, filename)

    # Train XGBoost model with sentiment score features
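    # Note: newer XGBoost releases emit a deprecation warning for objective='reg:linear'
    # in favour of the equivalent 'reg:squarederror'; the line below keeps the original value.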
    xgb = XGBRegressor(objective='reg:linear')
    grid = train_model(model_object=xgb,
                       model_type='xgb',
                       X_train=X_train_a,
                       y_train=y_train_a,
                       score_function=scorer,
                       cv=3)
    filename = get_abspath('xgb_a.model', modelpath)
    save_pickled_model(grid, filename)

    # Train XGBoost model without sentiment score features
    grid = train_model(model_object=xgb,
                       model_type='xgb',
                       X_train=X_train_b,
                       y_train=y_train_b,
                       score_function=scorer,
                       cv=3)
    filename = get_abspath('xgb_b.model', modelpath)
    save_pickled_model(grid, filename)
Example #10
def _write_event_to_db(chat_id, event_id, event_name, telegram_id, msg_text):
    with db_conn() as conn:
        with conn.cursor() as cur:
            # As above, pass the values as psycopg2-style %s parameters instead of
            # formatting them into the SQL string.
            cur.execute(
                f"""
                INSERT INTO {os.environ.get('DB_PROD_LEVEL')}.group_events
                    (chat_id, event_id, event_name, telegram_id, msg_text, status_timestamp)
                VALUES (%s, %s, %s, %s, %s, %s)
                """,
                (chat_id, event_id, event_name, telegram_id, msg_text,
                 datetime.datetime.now(tz=pytz.timezone('Europe/Berlin'))))
Example #11
def _load_users():
    """get the most recent telegram_id-name combination for each telegram_id"""
    with db_conn() as conn:
        with conn.cursor() as cur:
            cur.execute(f"""
                select u.telegram_id, u.name 
                from {os.environ.get('DB_PROD_LEVEL')}.users u
                join (
                        select telegram_id, max(status_timestamp) as max_timestamp 
                        from {os.environ.get('DB_PROD_LEVEL')}.users group by telegram_id
                    ) s
                on u.telegram_id = s.telegram_id and u.status_timestamp = s.max_timestamp
                ;""")
            df_users = cur.fetchall()
    return {row[0]: row[1] for row in df_users}
def main():

    if not os.path.isdir("output"):
        os.mkdir("output")

    for publisher in utils.publishers:
        with utils.db_conn() as conn:
            df = pd.read_sql_query(
                f"""
                SELECT url, publisher, TO_CHAR(published, 'YYYY-MM-DD') AS published
                FROM article
                TABLESAMPLE BERNOULLI(2)
                WHERE publisher = '{publisher}' AND (SELECT EXTRACT(YEAR FROM published)) >= 2015
                AND {is_about_climate_change_sql_statement[publisher.language]};
                """, conn)
            df.to_excel(f"output/{publisher}.ods", engine="odf")
def main():
    args = arguments()
    browser = args.browser
    if browser.lower() in ('c', 'chrome'):
        driver = driver_chrome()
    elif browser.lower() in ('f', 'firefox'):
        driver = driver_firefox()
    else:
        raise ValueError(f"unsupported browser '{browser}': use c/chrome or f/firefox")
    print('\n-- RECLAME AQUI SCRAPER --')

    file = args.file
    id_page = args.id
    pages = args.pages

    conn, cursor = db_conn()

    coletor = url_collector(driver, file, id_page, pages, conn, cursor)
    scraper(driver, coletor, id_page, conn, cursor)
    driver.quit()
def main():

    dfs = []

    for publisher in utils.publishers:

        with utils.db_conn() as conn:
            dfs.append(
                pd.read_sql_query(
                    f"""
                SELECT articles_total.published_date AS published_date,
                    (SELECT CAST(articles_about_climate_change_absolute.n AS real) / articles_total.n) * 100
                    AS articles_about_climate_change_percent
                FROM (
                    SELECT TO_CHAR(published, 'YYYY') AS published_date, COUNT(*) as n
                    FROM article
                    WHERE publisher = '{publisher}' AND ((SELECT EXTRACT(YEAR FROM published)) BETWEEN 2015 AND 2020)
                    AND {is_about_climate_change_sql_statement[publisher.language]}
                    GROUP BY TO_CHAR(published, 'YYYY')
                ) AS articles_about_climate_change_absolute
                JOIN (
                    SELECT TO_CHAR(published, 'YYYY') AS published_date, COUNT(*) AS n
                    FROM article
                    WHERE publisher = '{publisher}' AND ((SELECT EXTRACT(YEAR FROM published)) BETWEEN 2015 AND 2020)
                    GROUP BY TO_CHAR(published, 'YYYY')
                ) AS articles_total
                ON articles_total.published_date = articles_about_climate_change_absolute.published_date
                ORDER BY articles_total.published_date;
                """, conn))

    graph = Graph()

    df_combined = pd.concat(dfs)
    df_combined = df_combined.groupby(df_combined["published_date"]).mean()
    graph.plot(df_combined.index,
               df_combined["articles_about_climate_change_percent"])

    graph.save()
Example #15
import requests
import time
from tqdm import tqdm
import json
from bs4 import BeautifulSoup
from utils import db_conn



# Read parameters from the `CONSTANTS.json` file
with open("CONSTANTS.json", "r") as file:
    constants = json.loads(file.read())

# Initialise / connect to the DB
cnxn, cursor = db_conn(constants["DB_name"])

# Get session IDs from the DB
query = "SELECT session_id FROM sessions"
cursor.execute(query)
session_ids = [row[0] for row in cursor.fetchall()]

# Call the API: get the sittings for each session
xmls = []
sittings_url = "http://apps.lrs.lt/sip/p2b.ad_seimo_posedziai?sesijos_id="
for session_id in tqdm(session_ids):
    result = requests.get(sittings_url + str(session_id))
    xmls.append(result.text)

    time.sleep(0.2)
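# BeautifulSoup is imported above but not used in the part of the example shown
# here. A minimal follow-up sketch (the element names in the p2b.ad_seimo_posedziai
# XML are not shown, so only the document structure is inspected): parse each
# response and list its top-level tags.
for xml in xmls:
    soup = BeautifulSoup(xml, "xml")  # the "xml" parser requires lxml
    print({tag.name for tag in soup.find_all(recursive=False)})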
Example #16
import pandas as pd
import matplotlib.pyplot as plt
import utils


for publisher in utils.publishers:
    with utils.db_conn() as conn:
        df_m = pd.read_sql_query(
            f"""
            SELECT to_timestamp(TO_CHAR(published, 'YYYYMM'), 'YYYYMM') AS published_date, COUNT(*) AS n
            FROM article
            WHERE publisher = '{publisher}' AND (SELECT EXTRACT(YEAR FROM published)) >= 2015
            GROUP BY TO_CHAR(published, 'YYYYMM')
            ORDER BY TO_CHAR(published, 'YYYYMM');
            """
            , conn)
        df_d = pd.read_sql_query(
            f"""
            SELECT DATE(published) AS published_date, COUNT(*) AS n
            FROM article
            WHERE publisher = '{publisher}' AND (SELECT EXTRACT(YEAR FROM published)) >= 2015
            GROUP BY DATE(published)
            ORDER BY DATE(published);
            """
            , conn)

    fig, ax = plt.subplots(2, figsize=(42, 4.2))

    ax[0].plot(df_m["published_date"], df_m["n"])
    ax[1].plot(df_d["published_date"], df_d["n"])
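    # Not part of the original snippet: persist each publisher's figure so the
    # loop produces output (the filename pattern here is only illustrative).
    fig.savefig(f"articles_per_month_and_day_{publisher}.png")
    plt.close(fig)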
    def run(self):
        self.conn = utils.db_conn()
        self.stream()
        self.conn.close()
Example #18
        sa_features = ['compound', 'positive', 'neutral', 'negative']
        features.drop(columns=sa_features, inplace=True)

    return features


def preprocess_calendar_features(conn, limit, offset):
    """Cleans and returns AIBNB calendar features to be used for scoring. Uses
    one-hot encoding for categorical features.

    """
    sql = '''SELECT * FROM calendar_features
             LIMIT {0} OFFSET {1}'''.format(limit, offset)
    features = db_table(conn, sql)

    # Split out entity columns and drop from features
    entity_cols = ['listing_id', 'date', 'available', 'actual_price']
    entities = features[entity_cols]
    features.drop(columns=entity_cols, inplace=True)

    return entities, features


if __name__ == '__main__':
    # Get DB connection
    config = db_config()
    conn = db_conn(config)

    # Generate features table in the Airbnb DB
    execute_sql(conn, sql_file='sql/generate_features.sql')
Example #19
def create_tables():
    """ create tables"""
    commands = ("""
        CREATE SCHEMA IF NOT EXISTS prod
        """, """
        CREATE SCHEMA IF NOT EXISTS staging
        """, """
        DROP TABLE IF EXISTS staging.users;
        CREATE TABLE staging.users (
            id SERIAL PRIMARY KEY,
            telegram_id BIGINT NOT NULL,
            name VARCHAR(60) NOT NULL,
            status_timestamp TIMESTAMP NOT NULL)
        """, """
        INSERT INTO staging.users (telegram_id, name, status_timestamp)
            VALUES  (459871623, 'tester1', '2021-12-29 16:24:52'),
                    (918237832, 'tester2', '2021-12-29 18:24:52')
        """, """
        CREATE TABLE IF NOT EXISTS prod.users (
            id SERIAL PRIMARY KEY,
            telegram_id BIGINT NOT NULL,
            name VARCHAR(60) NOT NULL,
            status_timestamp TIMESTAMP NOT NULL)
        """, """
        DROP TABLE IF EXISTS staging.aya_messages;
        CREATE TABLE staging.aya_messages (
            id SERIAL PRIMARY KEY,
            chat_id BIGINT NOT NULL,
            telegram_id BIGINT NOT NULL,
            update_id BIGINT,
            message_text VARCHAR(255) NOT NULL,
            event_name VARCHAR(60),
            timestamp_received TIMESTAMP NOT NULL,
            timestamp_saved TIMESTAMP NOT NULL)
        """, """
        INSERT INTO staging.aya_messages (chat_id, telegram_id, update_id, 
            message_text, event_name, timestamp_received, timestamp_saved)
            VALUES  (918237832, 918237832, 123, '/fasten di 12 12', 
                        'fast_start', '2021-01-15 16:24:52', '2021-01-15 16:24:52'),
                    (918237832, 918237832, 126, '/teilnehmen',
                        'fast_end', '2021-01-15 18:24:52', '2021-01-15 18:24:52')
        """, """
        CREATE TABLE IF NOT EXISTS prod.aya_messages (
            id SERIAL PRIMARY KEY,
            chat_id BIGINT NOT NULL,
            telegram_id BIGINT NOT NULL,
            update_id BIGINT,
            message_text VARCHAR(255) NOT NULL,
            event_name VARCHAR(60),
            timestamp_received TIMESTAMP NOT NULL,
            timestamp_saved TIMESTAMP NOT NULL)
        """, """
        DROP TABLE IF EXISTS staging.aya_events;
        CREATE TABLE staging.aya_events (
            id SERIAL PRIMARY KEY,
            chat_id BIGINT NOT NULL,
            telegram_id BIGINT NOT NULL,
            event_name VARCHAR(60),
            event_value FLOAT,
            timestamp_saved TIMESTAMP NOT NULL)
        """, """
        INSERT INTO staging.aya_events (chat_id, telegram_id, 
            event_name, event_value, timestamp_saved)
            VALUES  (918237832, 918237832, 'fast_end', 
                        18.75, '2021-01-15 16:24:52'),
                    (918237832, 918237832, 'fast_end',
                        14.23, '2021-01-15 18:24:52')
        """, """
        CREATE TABLE IF NOT EXISTS prod.aya_events (
            id SERIAL PRIMARY KEY,
            chat_id BIGINT NOT NULL,
            telegram_id BIGINT NOT NULL,
            event_name VARCHAR(60),
            event_value FLOAT,
            timestamp_saved TIMESTAMP NOT NULL)
        """, """
        DROP TABLE IF EXISTS staging.group_events;
        CREATE TABLE staging.group_events (
            id SERIAL PRIMARY KEY,
            chat_id BIGINT NOT NULL,
            event_id BIGINT NOT NULL,
            event_name VARCHAR(60) NOT NULL,
            telegram_id BIGINT NOT NULL,
            msg_text VARCHAR(255) NOT NULL,
            status_timestamp TIMESTAMP NOT NULL)
        """, """
        INSERT INTO staging.group_events (chat_id, event_id, event_name,
            telegram_id, msg_text, status_timestamp)
            VALUES  (-123456789, 123, 'fast_create', 918237832,
                        '/fasten di 12 12', '2021-01-15 16:24:52'),
                    (-123456789, 126, 'fast_accept', 918237832,
                        '/teilnehmen', '2021-01-15 18:24:52'),
                    (-123456789, 129, 'fast_decline', 918237832,
                        '/ablehnen', '2021-01-15 18:25:52'),
                    (-123456789, 138, 'fast_delete', 918237832,
                        '/loeschen', '2021-01-15 19:24:52')
        """, """
        CREATE TABLE IF NOT EXISTS prod.group_events (
            id SERIAL PRIMARY KEY,
            chat_id BIGINT NOT NULL,
            event_id BIGINT NOT NULL,
            event_name VARCHAR(60) NOT NULL,
            telegram_id BIGINT NOT NULL,
            msg_text VARCHAR(255) NOT NULL,
            status_timestamp TIMESTAMP NOT NULL)
        """)
    with db_conn() as conn:
        with conn.cursor() as cur:
            for command in commands:
                cur.execute(command)
                logging.info(f"executed stmt: {command}")
Example #20
def main():
    config = db_config()
    conn = db_conn(config)

    # Remove basic results file
    try:
        combined = get_abspath('basic_results.csv', 'outputs')
        os.remove(combined)
    except IOError:
        pass

    # Load features
    features_a = preprocess_features(conn)
    features_b = preprocess_features(conn, with_sa=False)

    # Create scorer to train models using RMSE
    scorer = make_scorer(rmse, greater_is_better=False)

    # Load models in a dict
    models = {
        'linear': load_pickled_model('models/linear.model'),
        'ridge': load_pickled_model('models/ridge.model'),
        'lasso': load_pickled_model('models/lasso.model'),
        'xgb_a': load_pickled_model('models/xgb_a.model'),
        'xgb_b': load_pickled_model('models/xgb_b.model')
    }

    # Validation curve parameter names and ranges
    vc_params = {
        'xgb_a': ('max_depth', np.arange(1, 20, 1)),
        'xgb_b': ('max_depth', np.arange(1, 20, 1))
    }

    # Split into train and test sets
    X_train_a, X_test_a, y_train_a, y_test_a = split_data(features_a)
    X_train_b, X_test_b, y_train_b, y_test_b = split_data(features_b)

    # Generate basic results and learning curves for all models
    for name, grid in models.items():
        if name in ['linear', 'ridge', 'lasso']:
            basic_results(grid, X_test_a, y_test_a, name)
            train_scores, test_scores = create_learning_curve(
                grid.best_estimator_,
                scorer,
                X_train_a,
                y_train_a,
                model_name=name,
                cv=3)
            plot_learning_curve(name, train_scores, test_scores)
        if name == 'xgb_b':
            basic_results(grid, X_test_b, y_test_b, name)
            train_scores, test_scores = create_learning_curve(
                grid.best_estimator_,
                scorer,
                X_train_b,
                y_train_b,
                model_name=name,
                cv=3)
            plot_learning_curve(name, train_scores, test_scores)

    # Generate validation curves for XGBoost models
    create_validation_curve(models['xgb_a'].best_estimator_,
                            X_train_a,
                            y_train_a,
                            model_name='xgb_a',
                            param_name=vc_params['xgb_a'][0],
                            param_range=vc_params['xgb_a'][1],
                            scorer=scorer)

    create_validation_curve(models['xgb_b'].best_estimator_,
                            X_train_b,
                            y_train_b,
                            model_name='xgb_b',
                            param_name=vc_params['xgb_b'][0],
                            param_range=vc_params['xgb_b'][1],
                            scorer=scorer)

    # Generate XGBoost feature importance plots and results
    fi_a = get_feature_importances('xgb_a', models['xgb_a'], features_a)
    fi_b = get_feature_importances('xgb_b', models['xgb_b'], features_b)
    plot_feature_importances('xgb_a', fi_a, nfeats=15)
    plot_feature_importances('xgb_b', fi_b, nfeats=15)

    # Plot test set learning curves of all five models
    plot_lc_all()