def test_get_rankings_year_after_sofifa():
    """get_rankings for a season starting after SOFIFA OVA coverage begins (2006)."""
    year = 2008
    temp_folder = os.path.join(os.getcwd(), 'temp')
    csv_file = '{}-{}.csv'.format(year, year + 1)
    from_file = os.path.join(RAW_CLEANED_DATA_FILE_PATH, csv_file)
    to_file = os.path.join(temp_folder, csv_file)
    make_directory(temp_folder)
    get_rankings(from_file, to_file, '{}-12-31'.format(year + 1),
                 include_prediction=False)
    cmp_file = os.path.join(STANDINGS_PATH, csv_file)
    assert compare_csv(cmp_file, to_file)
    remove_directory(temp_folder)
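# The test above leans on small filesystem helpers (make_directory,
# compare_csv, remove_directory) defined elsewhere in the repo. As a hedged
# sketch only -- the real implementation may differ -- compare_csv could be a
# row-by-row equality check over both files:
import csv

def compare_csv(expected_file, actual_file):
    """Return True if two CSV files contain exactly the same rows (sketch)."""
    with open(expected_file, newline='') as f_expected, \
            open(actual_file, newline='') as f_actual:
        return list(csv.reader(f_expected)) == list(csv.reader(f_actual))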
def get_html():
    season_record = wins()
    streak_record = streak()
    lost = determine_if_lost()
    ranking = get_rankings()
    days_since_lost = countdown_script()
    return (
        '<!DOCTYPE html><html><head><style>'
        '*{font-family: Arial, sans-serif;text-align: center;position: relative;}'
        '.top_title{font-size:50pt;top: 5%;}'
        '.big_text{font-size: 300pt;top: 25%;}'
        '.record{font-size: 50pt;top: 40%;}'
        '</style><title>Has FSU Lost Yet?</title></head><body>'
        '<div class="top_title">Has FSU Lost Yet?</div>'
        '<div class="big_text"><b>' + lost + '</b></div>'
        '<div class="record" id="winningStreak">Winning Streak: '
        + streak_record + '</div>'
        '<div class="record" id="thisSeason">This Season: '
        + season_record + '</div>'
        '<div class="record">College Football Playoff Ranking: '
        + ranking + '</div>'
        + days_since_lost +
        # Google Analytics bootstrap snippet.
        "<script>(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;"
        "i[r]=i[r]||function(){(i[r].q=i[r].q||[]).push(arguments)},"
        "i[r].l=1*new Date();a=s.createElement(o),"
        "m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;"
        "m.parentNode.insertBefore(a,m)})"
        "(window,document,'script','//www.google-analytics.com/analytics.js','ga');"
        "ga('create', 'UA-56656391-1', 'auto');"
        "ga('require', 'displayfeatures');"
        "ga('send', 'pageview');</script></body></html>")
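# Hypothetical usage sketch (not part of the original app): get_html() returns
# a complete page, so it can be served with nothing but the standard library.
# The handler name and port here are assumptions for illustration.
from http.server import BaseHTTPRequestHandler, HTTPServer

class _Handler(BaseHTTPRequestHandler):
    def do_GET(self):
        body = get_html().encode('utf-8')
        self.send_response(200)
        self.send_header('Content-Type', 'text/html; charset=utf-8')
        self.send_header('Content-Length', str(len(body)))
        self.end_headers()
        self.wfile.write(body)

# To serve locally: HTTPServer(('', 8000), _Handler).serve_forever()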
def get_rankings(self, include_scores=False):
    """Return a list of players in ranked order.

    If include_scores is True, return a list of (player ID, score) tuples
    instead, where "score" is the player's internal score (higher is better).
    """
    # Delegates to the module-level get_rankings helper, which this method's
    # name shadows inside the class body.
    return get_rankings(self, include_scores)
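# Usage sketch for the accessor above; the owning object ("ladder") and its
# construction are assumptions -- only the get_rankings contract comes from
# the docstring.
#
#     top_three = ladder.get_rankings()[:3]            # player IDs only
#     for player_id, score in ladder.get_rankings(include_scores=True):
#         print(player_id, score)                      # higher score is better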
def run():
    """Contains the main logic for this analysis."""
    FileLog.set_log_dir()
    seeds = Util.load_seeds()
    for category in _CATEGORIES:
        log('Performing analysis for category: %s' % category)
        size_top_news = _SIZE_TOP_NEWS
        if category:
            size_top_news = 0.10

        data_set = DataSet.TESTING
        retweets = set()
        if _SWITCHED:
            data_set = DataSet.TRAINING
        if _EXCLUDE_RETWEETS:
            retweets = ground_truths.find_retweets(_TESTING_SET_MONTHS)
        log('Num retweets to exclude: %s' % len(retweets))

        gt_rankings = ground_truths.get_gt_rankings(
            seeds, data_set, category,
            exclude_tweets_within_delta=_EXCLUDE_TWEETS_WITHIN_DELTA,
            retweets=retweets)
        log('Num ground_truth_rankings: %s' % len(gt_rankings))

        # Format for use later.
        ground_truth_url_to_rank = {}
        for rank, (url, count) in enumerate(gt_rankings):
            ground_truth_url_to_rank[url] = rank

        target_news = ground_truths.find_target_news(gt_rankings, size_top_news)
        log('Size target_news: %s' % len(target_news))

        for delta in _DELTAS:
            run_params_str = 'd%s_t%s_e%s_%s' % (
                delta, int(size_top_news * 100), int(_SIZE_EXPERTS * 100),
                category)
            info_output_dir = '../graph/FolkWisdom/%s/info/' % run_params_str
            Util.ensure_dir_exist(info_output_dir)

            groups, d_num_followers = user_groups.get_all_user_groups(
                delta, category)
            log('Num experts (precision): %s' % len(groups.precision))
            log('Num experts (fscore): %s' % len(groups.fscore))
            log('Num experts (ci): %s' % len(groups.ci))
            log('Num Super Experts: %s' % len(groups.super_experts))
            log('Num Social Bias Experts: %s' % len(groups.social_bias))

            log('Finding rankings with an %s hour delta.' % delta)
            ranks = rankings.get_rankings(delta, seeds, groups, category,
                                          d_num_followers)

            # Output some interesting info to file.
            size_market_unfiltered = '0'
            with open('../data/FolkWisdom/size_of_market_unfiltered.txt') as in_file:
                size_market_unfiltered = in_file.readline().strip()

            with open('%suser_demographics_%s.txt'
                      % (info_output_dir, run_params_str), 'w') as output_file:
                output_file.write('Number of Newsaholics: %s\n'
                                  % len(groups.newsaholics))
                output_file.write('Number of Active Users: %s\n'
                                  % len(groups.active_users))
                output_file.write('Number of Common Users: %s\n'
                                  % len(groups.common_users))
                output_file.write('\n')
                output_file.write('Number of Precision Experts: %s\n'
                                  % len(groups.precision))
                output_file.write('Number of F-Score Experts: %s\n'
                                  % len(groups.fscore))
                output_file.write('Number of CI Experts: %s\n' % len(groups.ci))
                output_file.write('Number of Social Bias Experts: %s\n'
                                  % len(groups.social_bias))
                output_file.write('Total number of unique experts: %s\n'
                                  % len(groups.all_experts))
                output_file.write('Number of Precision and F-Score Experts: %s\n'
                                  % len(groups.precision.intersection(groups.fscore)))
                output_file.write('Number of Precision and CI Experts: %s\n'
                                  % len(groups.precision.intersection(groups.ci)))
                output_file.write('Number of F-Score and CI Experts: %s\n'
                                  % len(groups.fscore.intersection(groups.ci)))
                output_file.write('Number of Super Experts: %s\n'
                                  % len(groups.super_experts))
                output_file.write('\n')
                output_file.write('Number of Users (Total): %s\n'
                                  % (len(groups.newsaholics)
                                     + len(groups.active_users)
                                     + len(groups.common_users)))
                output_file.write('Size of market (unfiltered): %s\n'
                                  % size_market_unfiltered)
                output_file.write('\n')
                # output_file.write('Number of votes by Newsaholics: %s\n'
                #                   % num_votes_newsaholics)
                # output_file.write('Number of votes by Market: %s\n'
                #                   % num_votes_market)
                # output_file.write('Number of votes by Active Users: %s\n'
                #                   % num_votes_active)
                # output_file.write('Number of votes by Common Users: %s\n'
                #                   % num_votes_common)
                # output_file.write('\n')
                # output_file.write('Number of votes by Expert (Precision) Users: %s\n'
                #                   % num_votes_expert_precision)
                # output_file.write('Number of votes by Expert (fscore) Users: %s\n'
                #                   % num_votes_expert_fscore)
                # output_file.write('Number of votes by Expert (ci) Users: %s\n'
                #                   % num_votes_expert_ci)
                # output_file.write('Number of votes by Super Experts: %s\n'
                #                   % num_votes_expert_s)
                # output_file.write('Number of votes by Social Bias Experts: %s\n'
                #                   % num_votes_expert_sb)
                # output_file.write('\n')
                # output_file.write('Total Number of votes cast: %s\n'
                #                   % (num_votes_newsaholics + num_votes_active
                #                      + num_votes_common))
                # output_file.write('\n')
                output_file.write('Total Number of Good News: %s\n'
                                  % len(target_news))

            log('Ground Truth Top 50')
            for i in range(min(len(gt_rankings), 50)):
                url, count = gt_rankings[i]
                log('[%s] %s\t%s' % (i, url.strip(), count))
            log('-----------------------------------')
            log('Newsaholic Top 5')
            for i in range(min(len(ranks.newsaholics), 5)):
                url, count = ranks.newsaholics[i]
                log('[%s] %s\t%s' % (i, url.strip(), count))
            log('-----------------------------------')
            log('Active Top 5')
            for i in range(min(len(ranks.active_users), 5)):
                url, count = ranks.active_users[i]
                log('[%s] %s\t%s' % (i, url.strip(), count))
            log('-----------------------------------')
            log('Common Top 5')
            for i in range(min(len(ranks.common_users), 5)):
                url, count = ranks.common_users[i]
                log('[%s] %s\t%s' % (i, url.strip(), count))
            log('-----------------------------------')
            log('nonexpert Top 5')
            for i in range(min(len(ranks.non_experts), 5)):
                url, count = ranks.non_experts[i]
                log('[%s] %s\t%s' % (i, url.strip(), count))
            log('-----------------------------------')
            log('Expert (Precision) Top 5')
            for i in range(min(len(ranks.precision), 5)):
                url, count = ranks.precision[i]
                log('[%s] %s\t%s' % (i, url.strip(), count))
            log('-----------------------------------')
            log('Expert (fscore) Top 5')
            for i in range(min(len(ranks.fscore), 5)):
                url, count = ranks.fscore[i]
                log('[%s] %s\t%s' % (i, url.strip(), count))
            log('-----------------------------------')
            log('Expert (ci) Top 5')
            for i in range(min(len(ranks.ci), 5)):
                url, count = ranks.ci[i]
                log('[%s] %s\t%s' % (i, url.strip(), count))
            log('-----------------------------------')
            log('Super Expert Top 5')
            for i in range(min(len(ranks.super_experts), 5)):
                url, count = ranks.super_experts[i]
                log('[%s] %s\t%s' % (i, url.strip(), count))
            log('-----------------------------------')
            log('Social Bias Expert Top 5')
            for i in range(min(len(ranks.social_bias), 5)):
                url, count = ranks.social_bias[i]
                log('[%s] %s\t%s' % (i, url.strip(), count))

            market_rank_to_url = {}
            newsaholic_rank_to_url = {}
            active_rank_to_url = {}
            common_rank_to_url = {}
            expert_p_rank_to_url = {}
            expert_f_rank_to_url = {}
            expert_c_rank_to_url = {}
            expert_s_rank_to_url = {}
            for rank, (url, count) in enumerate(ranks.newsaholics):
                newsaholic_rank_to_url[rank] = url
            for rank, (url, count) in enumerate(ranks.population):
                market_rank_to_url[rank] = url
            for rank, (url, count) in enumerate(ranks.active_users):
                active_rank_to_url[rank] = url
            for rank, (url, count) in enumerate(ranks.common_users):
                common_rank_to_url[rank] = url
            for rank, (url, count) in enumerate(ranks.precision):
                expert_p_rank_to_url[rank] = url
            for rank, (url, count) in enumerate(ranks.fscore):
                expert_f_rank_to_url[rank] = url
            for rank, (url, count) in enumerate(ranks.ci):
                expert_c_rank_to_url[rank] = url
            for rank, (url, count) in enumerate(ranks.super_experts):
                expert_s_rank_to_url[rank] = url

            population_url_to_rank = {}
            market_url_to_rank = {}
            precision_url_to_rank = {}
            fscore_url_to_rank = {}
            ci_url_to_rank = {}
            ci_1_url_to_rank = {}
            ci_2_url_to_rank = {}
            ci_3_url_to_rank = {}
            common_url_to_rank = {}
            for rank, (url, count) in enumerate(ranks.population):
                population_url_to_rank[url] = rank
            for rank, (url, count) in enumerate(ranks.non_experts):
                market_url_to_rank[url] = rank
            for rank, (url, count) in enumerate(ranks.precision):
                precision_url_to_rank[url] = rank
            for rank, (url, count) in enumerate(ranks.fscore):
                fscore_url_to_rank[url] = rank
            for rank, (url, count) in enumerate(ranks.ci):
                ci_url_to_rank[url] = rank
            for rank, (url, count) in enumerate(ranks.ci_1):
                ci_1_url_to_rank[url] = rank
            for rank, (url, count) in enumerate(ranks.ci_2):
                ci_2_url_to_rank[url] = rank
            for rank, (url, count) in enumerate(ranks.ci_3):
                ci_3_url_to_rank[url] = rank
            for rank, (url, count) in enumerate(ranks.common_users):
                common_url_to_rank[url] = rank

            precisions, recalls = precision_recall.get_precision_recalls(
                gt_rankings, ranks)

            mixed_rankings = mixed_model.get_mixed_rankings(
                market_url_to_rank, precisions.non_experts,
                precision_url_to_rank, precisions.precision,
                fscore_url_to_rank, precisions.fscore,
                ci_url_to_rank, precisions.ci,
                ground_truth_url_to_rank)
            mixed_inact_rankings = mixed_model.get_mixed_rankings(
                common_url_to_rank, precisions.common_users,
                precision_url_to_rank, precisions.precision,
                fscore_url_to_rank, precisions.fscore,
                ci_url_to_rank, precisions.ci,
                ground_truth_url_to_rank)
            mixed_ci_rankings = mixed_model.get_mixed_rankings(
                market_url_to_rank, precisions.non_experts,
                ci_1_url_to_rank, precisions.ci_1,
                ci_2_url_to_rank, precisions.ci_2,
                ci_3_url_to_rank, precisions.ci_3,
                ground_truth_url_to_rank)

            mixed_precisions, mixed_recalls = precision_recall.calc_precision_recall(
                gt_rankings, mixed_rankings)
            mixed_inact_precisions, mixed_inact_recalls = precision_recall.calc_precision_recall(
                gt_rankings, mixed_inact_rankings)
            mixed_ci_precisions, mixed_ci_recalls = precision_recall.calc_precision_recall(
                gt_rankings, mixed_ci_rankings)

            log('-----------------------------------')
            log('Mixed (min) Top 5')
            for i in range(min(len(mixed_rankings), 5)):
                url, count = mixed_rankings[i]
                log('[%s] %s\t%s' % (i + 1, url, count))
            log('-----------------------------------')

            with open('%sranking_comparisons_%s.tsv'
                      % (info_output_dir, run_params_str), 'w') as out_file:
                for gt_rank, (gt_url, _) in enumerate(gt_rankings):
                    market_rank = 0
                    precision_rank = 0
                    ci_rank = 0
                    fscore_rank = 0
                    inactive_crowd_rank = 0
                    if gt_url in market_url_to_rank:
                        market_rank = market_url_to_rank[gt_url] + 1
                    if gt_url in precision_url_to_rank:
                        precision_rank = precision_url_to_rank[gt_url] + 1
                    if gt_url in ci_url_to_rank:
                        ci_rank = ci_url_to_rank[gt_url] + 1
                    if gt_url in fscore_url_to_rank:
                        fscore_rank = fscore_url_to_rank[gt_url] + 1
                    if gt_url in common_url_to_rank:
                        inactive_crowd_rank = common_url_to_rank[gt_url] + 1
                    line = '%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (
                        gt_url, gt_rank + 1, market_rank, inactive_crowd_rank,
                        precision_rank, ci_rank, fscore_rank)
                    out_file.write(line)

            with open('%sground_truth_rankings_%s.tsv'
                      % (info_output_dir, run_params_str), 'w') as output_file:
                for url, count in gt_rankings:
                    output_file.write('%s\t%s\n' % (url.strip(), count))
            with open('%smarket_rankings_%s.tsv'
                      % (info_output_dir, run_params_str), 'w') as output_file:
                for rank, (url, count) in enumerate(ranks.common_users):
                    output_file.write('%s\t%s\t(%s,%s)\n'
                                      % (url.strip(), count, rank,
                                         ground_truth_url_to_rank[url]))
            with open('%snewsaholic_rankings_%s.tsv'
                      % (info_output_dir, run_params_str), 'w') as output_file:
                for rank, (url, count) in enumerate(ranks.newsaholics):
                    output_file.write('%s\t%s\t(%s,%s)\n'
                                      % (url.strip(), count, rank,
                                         ground_truth_url_to_rank[url]))
            with open('%sactive_user_rankings_%s.tsv'
                      % (info_output_dir, run_params_str), 'w') as output_file:
                for rank, (url, count) in enumerate(ranks.active_users):
                    output_file.write('%s\t%s\t(%s,%s)\n'
                                      % (url.strip(), count, rank,
                                         ground_truth_url_to_rank[url]))
            with open('%scommon_user_rankings_%s.tsv'
                      % (info_output_dir, run_params_str), 'w') as output_file:
                for rank, (url, count) in enumerate(ranks.common_users):
                    output_file.write('%s\t%s\t(%s,%s)\n'
                                      % (url.strip(), count, rank,
                                         ground_truth_url_to_rank[url]))
            with open('%snonexpert_user_rankings_%s.tsv'
                      % (info_output_dir, run_params_str), 'w') as output_file:
                for rank, (url, count) in enumerate(ranks.non_experts):
                    output_file.write('%s\t%s\t(%s,%s)\n'
                                      % (url.strip(), count, rank,
                                         ground_truth_url_to_rank[url]))
            with open('%sexpert_p_user_rankings_%s.tsv'
                      % (info_output_dir, run_params_str), 'w') as output_file:
                for rank, (url, count) in enumerate(ranks.precision):
                    output_file.write('%s\t%s\t(%s,%s)\n'
                                      % (url.strip(), count, rank,
                                         ground_truth_url_to_rank[url]))
            with open('%sexpert_f_user_rankings_%s.tsv'
                      % (info_output_dir, run_params_str), 'w') as output_file:
                for rank, (url, count) in enumerate(ranks.fscore):
                    output_file.write('%s\t%s\t(%s,%s)\n'
                                      % (url.strip(), count, rank,
                                         ground_truth_url_to_rank[url]))
            with open('%sexpert_c_user_rankings_%s.tsv'
                      % (info_output_dir, run_params_str), 'w') as output_file:
                for rank, (url, count) in enumerate(ranks.ci):
                    output_file.write('%s\t%s\t(%s,%s)\n'
                                      % (url.strip(), count, rank,
                                         ground_truth_url_to_rank[url]))
            with open('%sexpert_s_user_rankings_%s.tsv'
                      % (info_output_dir, run_params_str), 'w') as output_file:
                for rank, (url, count) in enumerate(ranks.super_experts):
                    output_file.write('%s\t%s\t(%s,%s)\n'
                                      % (url.strip(), count, rank,
                                         ground_truth_url_to_rank[url]))
            with open('%smixed_rankings_%s.tsv'
                      % (info_output_dir, run_params_str), 'w') as output_file:
                for rank, (url, count) in enumerate(mixed_rankings):
                    output_file.write('%s\t%s\t(%s,%s)\n'
                                      % (url.strip(), count, rank,
                                         ground_truth_url_to_rank[url]))

            with open('../data/FolkWisdom/market_precisions_%s.txt'
                      % run_params_str, 'w') as out_file:
                for precision in precisions.common_users:
                    out_file.write('%s\n' % precision)
            with open('../data/FolkWisdom/nonexpert_precisions_%s.txt'
                      % run_params_str, 'w') as out_file:
                for precision in precisions.non_experts:
                    out_file.write('%s\n' % precision)
            with open('../data/FolkWisdom/expert_p_precisions_%s.txt'
                      % run_params_str, 'w') as out_file:
                for precision in precisions.precision:
                    out_file.write('%s\n' % precision)
            with open('../data/FolkWisdom/expert_f_precisions_%s.txt'
                      % run_params_str, 'w') as out_file:
                for precision in precisions.fscore:
                    out_file.write('%s\n' % precision)
            with open('../data/FolkWisdom/expert_c_precisions_%s.txt'
                      % run_params_str, 'w') as out_file:
                for precision in precisions.ci:
                    out_file.write('%s\n' % precision)

            log('Drawing summary precision-recall graphs...')
            # draw_precision_recall_graph(market_precisions, market_recalls,
            precision_recall.draw(
                [precisions.newsaholics, precisions.active_users,
                 precisions.common_users, precisions.precision,
                 precisions.fscore, precisions.ci, precisions.super_experts],
                [recalls.newsaholics, recalls.active_users,
                 recalls.common_users, recalls.precision,
                 recalls.fscore, recalls.ci, recalls.super_experts],
                ['Newsaholics', 'Active', 'Common', 'Precision', 'F-score',
                 'CI', 'Super Experts'],
                'precision_recall_all', run_params_str)

            # Draw via old method because it has fancy markings.
            experts.draw_precision_recall_experts(
                precisions.non_experts, recalls.non_experts,
                precisions.precision, recalls.precision,
                precisions.fscore, recalls.fscore,
                precisions.ci, recalls.ci,
                run_params_str)

            log('Drawing experts precision-recall graph...')
            # precision_recall.draw_with_markers(
            #     [precisions.population, precisions.non_experts,
            #      precisions.precision, precisions.fscore, precisions.ci],
            #     [recalls.population, recalls.non_experts, recalls.precision,
            #      recalls.fscore, recalls.ci],
            #     ['Population', 'Crowd', 'Precision', 'F-score', 'CI'],
            #     'precision_recall_experts', 0, run_params_str)

            log('Drawing mixed + inact graph...')
            precision_recall.draw_with_markers(
                [precisions.non_experts, precisions.common_users,
                 mixed_inact_precisions],
                [recalls.non_experts, recalls.common_users,
                 mixed_inact_recalls],
                ['Crowd', 'Inactive Crowd', 'Mixed + Inactive'],
                'precision_recall_mixed_and_inactive', 3, run_params_str,
                zoom=True)

            log('Drawing ci breakdown by followers precision-recall graph...')
            precision_recall.draw(
                [precisions.non_experts, precisions.ci, precisions.ci_hi,
                 precisions.ci_li],
                [recalls.non_experts, recalls.ci, recalls.ci_hi,
                 recalls.ci_li],
                ['Crowd', 'CI', 'CI High', 'CI Low'],
                'precision_recall_ci_followers_breakdown', run_params_str)

            log('Drawing social bias precision-recall graph...')
            precision_recall.draw(
                [precisions.non_experts, precisions.social_bias,
                 precisions.precision, precisions.fscore, precisions.ci],
                [recalls.non_experts, recalls.social_bias,
                 recalls.precision, recalls.fscore, recalls.ci],
                ['Crowd', 'Influence Experts', 'Precision', 'F-score', 'CI'],
                'precision_recall_social_bias', run_params_str)

            log('Drawing basic groups precision-recall graph...')
            precision_recall.draw(
                [precisions.newsaholics, precisions.active_users,
                 precisions.common_users],
                [recalls.newsaholics, recalls.active_users,
                 recalls.common_users],
                ['Newsaholics', 'Active Users', 'Common Users'],
                'precision_recall_basic_groups', run_params_str)

            log('Drawing crowd def precision-recall graph...')
            precision_recall.draw(
                [precisions.non_experts, precisions.common_users],
                [recalls.non_experts, recalls.common_users],
                ['Crowd', 'Inactive Crowd'],
                'precision_recall_crowd_def', run_params_str, zoom=True)

            log('Drawing non_expert_sampling precision-recall graph...')
            precision_recall.draw_with_markers(
                [precisions.non_experts, precisions.non_experts_sampled,
                 precisions.non_experts_10, precisions.non_experts_25,
                 precisions.non_experts_1, precisions.ci],
                [recalls.non_experts, recalls.non_experts_sampled,
                 recalls.non_experts_10, recalls.non_experts_25,
                 recalls.non_experts_1, recalls.ci],
                ['Crowd', 'Crowd (33% sample)', 'Crowd (10% sample)',
                 'Crowd (5% sample)', 'Crowd (2% sample)', 'Experts (CI)'],
                'precision_recall_non_expert_sampling', 3, run_params_str,
                ncol=2)

            # TODO: Replace with new method.
            log('Drawing mixed model precision-recall graph...')
            mixed_model.draw_precision_recall_mixed(
                precisions.non_experts, recalls.non_experts,
                mixed_precisions, mixed_recalls, run_params_str, zoom=True)

            log('Drawing mixed ci model precision-recall graph...')
            precision_recall.draw(
                [precisions.non_experts, mixed_ci_precisions],
                [recalls.non_experts, mixed_ci_recalls],
                ['Crowd', 'Mixed'],
                'precision_recall_mixed_ci', run_params_str)

            log('Drawing weighted followers precision-recall graph...')
            precision_recall.draw(
                [precisions.non_experts, precisions.weighted_followers,
                 precisions.ci],
                [recalls.non_experts, recalls.weighted_followers, recalls.ci],
                ['Crowd', 'Weighted Followers', 'CI'],
                'precision_recall_weighted_followers', run_params_str)

            log('Drawing ci weighted graph...')
            precision_recall.draw(
                [precisions.population, precisions.ci, precisions.ci_weighted],
                [recalls.population, recalls.ci, recalls.ci_weighted],
                ['Crowd', 'CI', 'CI (Weighted)'],
                'precision_recall_ci_weighted', run_params_str)

            log('Drawing weighted graph...')
            precision_recall.draw(
                [precisions.population, precisions.weighted],
                [recalls.population, recalls.weighted],
                ['Crowd', 'Crowd (Weighted)'],
                'precision_recall_weighted', run_params_str)

            log('Drawing weighted both graph...')
            precision_recall.draw(
                [precisions.population, precisions.weighted,
                 precisions.weighted_both],
                [recalls.population, recalls.weighted, recalls.weighted_both],
                ['Crowd', 'Crowd (Weighted)', 'Crowd (Weighted Both)'],
                'precision_recall_weighted_both', run_params_str)
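# A note on assumed helpers: run() relies on a module-level log() configured
# through FileLog.set_log_dir(). As a hedged stand-in consistent with how it
# is called above (a single pre-formatted message), it might be no more than:
#
#     def log(message):
#         print('[FolkWisdom] %s' % message)
#
# and, if this module were executed directly, a plain entry point would do:
#
#     if __name__ == '__main__':
#         run()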
def magic(should_train=True, should_scrape=False,
          data_year_available_from=1993, data_year_collect_from=2006):
    """End-to-end pipeline: scrape, preprocess, train, predict, and persist."""
    # Steps that don't have to be executed every time:
    # 1. Scrape team overall (OVA) ratings with sofifa_scraper.
    #    Warning: this takes a long time to run. SOFIFA updates its stats two
    #    or three times a month, but the data doesn't change much.
    if should_scrape:
        scrape_team_ova_all(OVA_FILE_PATH, data_year_collect_from,
                            CURRENT_YEAR)

    # Preprocessing.
    # 1. Latest Premier League results. This data can also be retrieved from
    #    http://www.football-data.co.uk/englandm.php
    get_current_fixtures(RAW_DATA_FILE_PATH_CURRENT)

    # 2. Standings (from 1993 to the current year).
    get_rankings_all(data_year_available_from, CURRENT_YEAR,
                     RAW_CLEANED_DATA_FILE_PATH, STANDINGS_PATH)

    # The functions below generate the necessary data.
    # 1. From the raw data, drop all but the selected columns.
    #    Produces: cleaned CSVs in RAW_CLEANED_DATA_FILE_PATH.
    clean_all(RAW_DATA_FILE_PATH, RAW_CLEANED_DATA_FILE_PATH,
              data_year_available_from, CURRENT_YEAR)

    # 2. From 1, add overall rating (OVA) columns.
    #    Produces: the cleaned CSVs from 2006 on now have an OVA column.
    merge_ova_to_cleaned_all(OVA_FILE_PATH, RAW_CLEANED_DATA_FILE_PATH,
                             data_year_collect_from, CURRENT_YEAR)

    # 3. From 2, copy the cleaned raw data over for prediction purposes.
    #    Produces: CSVs copied from RAW_CLEANED_DATA_FILE_PATH to
    #    CLEANED_DATA_FILE_PATH.
    copy_csv(RAW_CLEANED_DATA_FILE_PATH, CLEANED_DATA_FILE_PATH)

    # 4. From 3, add current-status columns (current points; goals for,
    #    against, and difference; matches played; losing/winning streaks;
    #    last 5 games).
    #    Produces: the cleaned CSVs from 1993 on now have these columns.
    add_current_details_all(CLEANED_DATA_FILE_PATH, CLEANED_DATA_FILE_PATH,
                            STANDINGS_PATH, data_year_available_from,
                            CURRENT_YEAR, data_year_available_from)

    # 5. From 4, merge the per-season CSVs into one file. For now, data is
    #    collected only from 2006 because SOFIFA provides OVA data only from
    #    2006, and the model tends to perform better with this approach.
    #    Produces: a new CSV at FINAL_FILE.
    combine_matches(CLEANED_DATA_FILE_PATH, FINAL_FILE,
                    data_year_collect_from, CURRENT_YEAR)

    # 6. From 5, add head-to-head results (match results against the other
    #    team over time).
    #    Produces: an edited final.csv under DATA_PATH.
    get_match_results_against(FINAL_FILE, CLEANED_DATA_FILE_PATH, DATA_PATH,
                              data_year_available_from, CURRENT_YEAR)

    # 7. With all data aggregated, build a classifier that makes predictions.
    #    If 'recalculate' is True, run multiple classifiers on the data, grid
    #    search where necessary, and generate 'model confidence.csv' recording
    #    each classifier's confidence score. If 'recalculate' is False and
    #    clf_file exists, simply load the clf from clf_file.
    #    Produces: the best clf.
    best_clf, _, best_clf_average = get_clf(FINAL_FILE, CONFIDENCE_FILE,
                                            CLF_FILE, recalculate=should_train)

    # 8. Make predictions: predict the upcoming round, aggregate the result,
    #    predict the next round, and repeat until there are no more games.
    #    "predict_next_round" also produces prediction probabilities for each
    #    match on stat_path.
    #    - 1. predict_next_round predicts the next round and saves the result
    #         in RAW_CLEANED_DATA_FILE_PATH_CURRENT.
    #    - 2. add_current_details, as its name suggests, adds current details.
    #    - 3. combine_matches combines all matches from 2006 to 2018.
    #    - 4. get_match_results_against adds head-to-head results between the
    #         two teams of each match.
    is_first = True
    # First save the current ranking before predicting results.
    remove_directory(STATISTICS_PATH)
    now = datetime.datetime.now().date().strftime('%Y-%m-%d')
    pred_ranking_round_file = os.path.join(
        PRED_RANKING_ROUND_PATH, 'prediction_ranking_{}.csv'.format(now))
    get_rankings(RAW_CLEANED_DATA_FILE_PATH_CURRENT, pred_ranking_round_file,
                 include_prediction=True, predicted_date_so_far=now,
                 ranking_summary_file=PRED_RANKING_ROUND_SUMMARY_FILE)
    while True:
        is_next_round, date = predict_next_round(
            best_clf, FINAL_FILE, RAW_CLEANED_DATA_FILE_PATH_CURRENT,
            statistics=True, stat_path=PREDICTION_FILE, first=is_first)
        if not is_next_round:
            break
        add_current_details(RAW_CLEANED_DATA_FILE_PATH_CURRENT,
                            CLEANED_DATA_FILE_PATH_CURRENT, STANDINGS_PATH,
                            data_year_available_from)
        combine_matches(CLEANED_DATA_FILE_PATH, FINAL_FILE,
                        data_year_collect_from, CURRENT_YEAR)
        get_match_results_against(FINAL_FILE, CLEANED_DATA_FILE_PATH,
                                  DATA_PATH, data_year_available_from,
                                  CURRENT_YEAR)
        pred_ranking_round_file = os.path.join(
            PRED_RANKING_ROUND_PATH, 'prediction_ranking_{}.csv'.format(date))
        get_rankings(PREDICTION_FILE, pred_ranking_round_file,
                     include_prediction=True, predicted_date_so_far=date,
                     ranking_summary_file=PRED_RANKING_ROUND_SUMMARY_FILE)
        is_first = False

    # 9. Prediction is done. Produce a season standing using the prediction
    #    results.
    winning_team = get_rankings(PREDICTION_FILE, PRED_RANKING_FILE,
                                include_prediction=True)

    # 10. Save previous results, prediction results, and standing predictions
    #     to the database.
    save_new_data_to_database(DATABASE_PATH, FINAL_FILE, PREDICTION_FILE,
                              PRED_RANKING_ROUND_SUMMARY_FILE)

    # 11. Save a summary to the database.
    if should_train:
        save_summary_to_database(DATABASE_PATH, best_clf_average, winning_team)
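# Hypothetical entry point for the pipeline above; the parameter values mirror
# magic()'s defaults, and the refresh policy is an assumption for illustration.
if __name__ == '__main__':
    # Set should_scrape=True occasionally to refresh SOFIFA OVA ratings (slow);
    # routine runs only retrain and re-predict.
    magic(should_train=True, should_scrape=False,
          data_year_available_from=1993, data_year_collect_from=2006)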