def generate_model_comparison_by_user(input_directory, output_directory):
    """Produce one cross-model comparison per user.

    Entries of ``input_directory`` whose names contain no '.' are treated as
    models, and entries of each model directory whose names contain no '.'
    are treated as users. For every user found under at least one model, the
    ``own.json`` and ``other.json`` files of each model that has a directory
    for that user are loaded (column 0 of each) and handed to
    ``generate_analysis_output`` together with ``<output_directory>/<user>``.

    Args:
        input_directory: Root directory laid out as ``<model>/<user>/``.
        output_directory: Root under which one sub-directory per user is
            created.
    """
    model_names = [entry for entry in os.listdir(input_directory)
                   if '.' not in entry]

    # Not every model is assumed to cover every user, so gather the users
    # seen under any model and then drop the duplicates.
    seen_users = [
        entry
        for model_name in model_names
        for entry in os.listdir(os.path.join(input_directory, model_name))
        if '.' not in entry
    ]
    unique_users = list(set(seen_users))

    # One comparison across the available models for each user.
    for user in unique_users:
        destination = os.path.join(output_directory, user)
        create_dir_if_not_there(destination)

        per_model = []
        for model_name in model_names:
            source = os.path.join(input_directory, model_name, user)
            # A model with no directory for this user contributes nothing.
            if os.path.exists(source):
                own = pd.read_json(
                    os.path.join(source, 'own.json'))[0].values
                other = pd.read_json(
                    os.path.join(source, 'other.json'))[0].values
                per_model.append((model_name, own, other))

        generate_analysis_output(destination, per_model, 'Model', user)
def generate_model_comparison(in_dir, out_dir):
    """Produce a single comparison across all models found in ``in_dir``.

    Entries of ``in_dir`` whose names contain no '.' are treated as models.
    For each one, column 0 of ``<in_dir>/<model>/own.json`` and of
    ``<in_dir>/<model>/other.json`` is loaded, and the resulting
    ``(model, own, other)`` tuples are passed to
    ``generate_analysis_output`` with the label ``'Model'``.

    Args:
        in_dir: Directory containing one sub-directory per model.
        out_dir: Directory that receives the analysis output; it is created
            via ``create_dir_if_not_there`` before anything is read.
    """
    # The parameters are named in_dir/out_dir; the body must use those names
    # rather than input_directory/output_directory, which are not bound here.
    create_dir_if_not_there(out_dir)

    models = [x for x in os.listdir(in_dir) if '.' not in x]

    dat = [
        (model,
         pd.read_json(os.path.join(in_dir, model, 'own.json'))[0].values,
         pd.read_json(os.path.join(in_dir, model, 'other.json'))[0].values)
        for model in models
    ]

    generate_analysis_output(out_dir, dat, 'Model')
def generate_user_comparison_by_model(input_directory, output_directory):
    """Produce one cross-user comparison per model.

    Entries of ``input_directory`` whose names contain no '.' are treated as
    models, and entries of each model directory whose names contain no '.'
    are treated as users. For each model, column 0 of every user's
    ``own.json`` and ``other.json`` is loaded and the resulting
    ``(user, own, other)`` tuples are handed to ``generate_analysis_output``
    together with ``<output_directory>/<model>``.

    Args:
        input_directory: Root directory laid out as ``<model>/<user>/``.
        output_directory: Root under which one sub-directory per model is
            created.
    """
    model_names = [entry for entry in os.listdir(input_directory)
                   if '.' not in entry]

    for model in model_names:
        source_root = os.path.join(input_directory, model)
        destination = os.path.join(output_directory, model)
        create_dir_if_not_there(destination)

        user_names = [entry for entry in os.listdir(source_root)
                      if '.' not in entry]

        # For each user, own.json is read before other.json.
        per_user = [
            (user,
             pd.read_json(
                 os.path.join(source_root, user, 'own.json'))[0].values,
             pd.read_json(
                 os.path.join(source_root, user, 'other.json'))[0].values)
            for user in user_names
        ]

        generate_analysis_output(destination, per_user, 'User', model)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Reads the user tweet files under 'raw_data_path', applies the configured
filters, and writes the result to 'preprocessed_data_path'.

June, 2019

@author: Joshua Rubin
"""
from get_config import (get_config, create_dir_if_not_there)
from tweetvalidator.data_processing import filter_tweets_from_directories

# All filter settings come from the global configuration.
config = get_config()

# Make sure the destination exists before any filtering starts.
filtered_dir = config['preprocessed_data_path']
create_dir_if_not_there(filtered_dir)

raw_dir = config['raw_data_path']
tweet_filters = config['regexp_tweet_filters']
# The minimum length is coerced to int before being passed on.
min_characters = int(config['min_tweet_characters'])

filter_tweets_from_directories(raw_dir,
                               filtered_dir,
                               tweet_filters,
                               min_characters)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Runs train_models with a RandomForestModel on the 'embedding' input, writing
to 'eval_output_path', and prints the elapsed wall-clock time in seconds.

September, 2019

@author: Joshua Rubin
"""
import time

from get_config import (get_config, create_dir_if_not_there)

config = get_config()
create_dir_if_not_there(config['eval_output_path'])

# These imports intentionally stay after the output directory is created,
# preserving the original order of side effects.
from tweetvalidator.models import RandomForestModel
from tweetvalidator import train_models

# Directory arguments shared by the training call.
dir_args = dict(
    input_directory=config['processed_data_path'],
    negative_input_directory=config['processed_negative_data_path'],
    output_directory=config['eval_output_path'],
)

start = time.time()

model = RandomForestModel(verbose=True)
train_models(model,
             'embedding',
             file_prefix='random_forest_model',
             **dir_args)

end = time.time()
print(end - start)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Fetches tweets for every user listed in the 'twitter_users' field of the
configuration and stores them under 'raw_data_path'.

Jan, 2020

@author: Yagna
"""
from get_config import (get_config, create_dir_if_not_there)
from tweetvalidator.data_processing import get_tweets_by_user

config = get_config()

raw_dir = config['raw_data_path']
tweet_limit = config['max_tweets_per_user']

# The destination must exist before the first download.
create_dir_if_not_there(raw_dir)

for handle in config['twitter_users']:
    # Echo the user currently being fetched.
    print(handle)
    get_tweets_by_user(handle,
                       max_tweets=tweet_limit,
                       output_path=raw_dir)
# -*- coding: utf-8 -*- """ Takes filtered user tweet files from 'preprocessed_data_path', generates embeddings, and writes output processed_data_path. June, 2019 @author: Joshua Rubin """ import time from get_config import (get_config, create_dir_if_not_there) from tweetvalidator.data_processing import embed_tweets_from_directories from tweetvalidator.models import RandomForestModel from tweetvalidator import train_models config = get_config() create_dir_if_not_there(config['processed_data_path']) start = time.time() embed_tweets_from_directories(config['preprocessed_data_path'], config['processed_data_path']) create_dir_if_not_there(config['eval_output_path']) dir_args = { 'input_directory' : config['processed_data_path'], 'negative_input_directory' : config['processed_negative_data_path'], 'output_directory' : config['eval_output_path']} train_models(RandomForestModel(verbose=True), 'embedding', **dir_args,