예제 #1
0
def main():
    """Read locally pre-processed tweets and push them to the cloud database."""

    # Read the already pre-processed tweets from the local database file.
    fresh_tweets = stats_helpers.get_raw_data(local=True, db_file=args.file)

    # Upload the records to the cloud tweet database, authenticating with
    # the DB_PASS environment variable (empty string if unset).
    db_password = os.environ.get('DB_PASS', '')
    helpers.cloud_upload_local_to_tweet_database(fresh_tweets, pw=db_password)
예제 #2
0
def main():
    """Load the raw tweet data, score it with BERT, and save the probabilities."""

    # fetch the raw data from the local database file
    raw = stats_helpers.get_raw_data(local=True, db_file=args.file)

    # score every tweet with the BERT model
    proba = bert.run_bert(raw)

    # persist the result, timestamped down to the minute
    stamp = datetime.datetime.now().strftime('%Y-%m-%d_%H:%M')
    np.save(f'bert_proba_{stamp}.npy', proba)
예제 #3
0
def main():
    """Run Non-negative Matrix Factorization (NMF) on the hashtag column of the
    dataset and store the resulting topics as JSON.
    """

    # load the dataset
    raw = stats_helpers.get_raw_data(db_file=args.file)

    # factorize the hashtags into topics
    topics, _ = my_nlp.perform_NMF_analysis(raw, verbose=1)

    # write the topics out as JSON
    with open('bundestweets/data/nmf_topics.json', 'w+') as out:
        json.dump(topics, out)
예제 #4
0
def get_data(local=False, db_file='bundestweets/data/tweets_data.db'):
    """Load all tweets from the SQL file and filter for relevant ones.

    Args:
        local: whether to read the local database file
        db_file: path to the SQLite database

    Returns:
        df: DataFrame with the raw data
        content_tweets: subset of rows that actually contain text
    """

    # full tweet table
    df = stats_helpers.get_raw_data(local=local, db_file=db_file)

    # keep only the rows whose text field is populated
    has_text = df.text.notna()
    content_tweets = df.loc[has_text, :]

    return df, content_tweets
예제 #5
0
def _add_column(cur, ddl, column):
    """Execute one ALTER TABLE statement, tolerating an already-existing column.

    Args:
        cur: open sqlite3 cursor on the tweets database
        ddl: full 'ALTER TABLE tweets ADD ...' statement to run
        column: column name, used only for the diagnostic message
    """
    try:
        cur.execute(ddl)
    except sqlite3.OperationalError:
        # SQLite raises OperationalError on a duplicate column; re-running
        # the script is expected, so this is informational only.
        print(f'Column "{column}" exists already.')


def _upload_column(cur, conn, data, column):
    """Bulk-write one derived column back to the tweets table, keyed on id.

    Args:
        cur: open sqlite3 cursor
        conn: connection owning *cur*, committed after the batch update
        data: DataFrame holding the new values and an 'id' column
        column: name of the column to update (also the DataFrame column)
    """
    print(f"Uploading '{column}' column...")
    records = list(zip(data[column], data.id.astype('int')))
    cur.executemany(f'UPDATE tweets set {column} = ? where id = ?', records)
    conn.commit()


def main():
    """Pre-process newly scraped tweets and write the derived columns back.

    Loads the raw tweets from args.file, cleans and stems the text, scores
    each tweet with the BERT offensive-language model, then stores the
    results in the 'text_stemmed', 'text_cleaned' and 'offensive_proba'
    columns of the same SQLite database.
    """

    # load data
    data = stats_helpers.get_raw_data(local=True, db_file=args.file)
    print(f'Pre-processing {len(data)} new tweets.')

    # preprocess (the translation set returned alongside is not persisted)
    print('Cleaning and stemming text data...')
    data, _translation_set = my_nlp.preprocess_for_nlp(data)

    # run bert model for offensive language identification
    print('Running BERT model for offensive language identification...')
    bert_proba = bert.run_bert(data)
    data['offensive_proba'] = bert_proba[:, 1]

    # open database file; try/finally guarantees cursor and connection are
    # released even if an upload raises (the original leaked both on error)
    conn = sqlite3.connect(args.file)
    try:
        cur = conn.cursor()
        try:
            # create new columns "text_stemmed" and "text_cleaned"
            _add_column(cur, 'ALTER TABLE tweets ADD text_stemmed TEXT;',
                        'text_stemmed')
            _add_column(cur, 'ALTER TABLE tweets ADD text_cleaned TEXT;',
                        'text_cleaned')

            # update the text columns
            _upload_column(cur, conn, data, 'text_stemmed')
            _upload_column(cur, conn, data, 'text_cleaned')

            # generate and update column "offensive_proba"
            _add_column(
                cur,
                'ALTER TABLE tweets ADD offensive_proba FLOAT '
                'CONSTRAINT d_offensive_zero DEFAULT 0;',
                'offensive_proba',
            )
            _upload_column(cur, conn, data, 'offensive_proba')
        finally:
            cur.close()
    finally:
        conn.close()