def main():
    args = parse_args()
    db = mongo.get_mongo_database_with_auth(args.dbhost, args.dbport,
                                            args.dbname, args.username,
                                            args.password)
    year = int(args.phrase_year)
    query = db[args.phrase_collection].find({'year': year})
    titles = []
    for document in query:
        # Drop the last 7 characters, presumably a " (YYYY)" year suffix
        titles.append(document['title'][:-7])
    stream.init_crawler(args.consumer_key, args.consumer_secret,
                        args.access_token, args.access_token_secret,
                        db[args.collection], titles)
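# stream.init_crawler is project-local and not shown in this excerpt. Given
# its parameters (OAuth1 credentials, a Mongo collection, a phrase list), a
# plausible sketch is a tweepy 3.x filtered stream that stores matching
# tweets; everything below is inferred from the call site, not taken from
# the repo.
import tweepy


class _MongoListener(tweepy.StreamListener):
    def __init__(self, collection):
        super().__init__()
        self.collection = collection

    def on_status(self, status):
        # Persist the raw tweet payload into MongoDB
        self.collection.insert_one(status._json)

    def on_error(self, status_code):
        # Returning False on HTTP 420 (rate limited) disconnects the stream
        return status_code != 420


def init_crawler(consumer_key, consumer_secret, access_token,
                 access_token_secret, collection, titles):
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    stream = tweepy.Stream(auth=auth, listener=_MongoListener(collection))
    # Track tweets that mention any of the movie titles
    stream.filter(track=titles)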
def main():
    args = parse_args()
    db = mongo.get_mongo_database_with_auth(args.dbhost, args.dbport,
                                            args.dbname, args.username,
                                            args.password)
    crawler.init_crawler(args.app_key, args.app_secret, args.access_token,
                         args.refresh_token, db[args.collection])
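# parse_args is defined elsewhere in this script. A minimal argparse sketch
# covering exactly the attributes main() reads; the flag names and defaults
# are assumptions, only the attribute names come from the code above.
import argparse


def parse_args():
    parser = argparse.ArgumentParser(
        description="Crawl an API into a MongoDB collection.")
    # MongoDB connection
    parser.add_argument('--dbhost', default='localhost')
    parser.add_argument('--dbport', type=int, default=27017)
    parser.add_argument('--dbname', required=True)
    parser.add_argument('--username', required=True)
    parser.add_argument('--password', required=True)
    # Target collection and API credentials (refresh_token suggests OAuth2)
    parser.add_argument('--collection', required=True)
    parser.add_argument('--app-key', dest='app_key', required=True)
    parser.add_argument('--app-secret', dest='app_secret', required=True)
    parser.add_argument('--access-token', dest='access_token', required=True)
    parser.add_argument('--refresh-token', dest='refresh_token',
                        required=True)
    return parser.parse_args()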
import gc
import sys

import matplotlib.pyplot as plt
import numpy as np
from joblib import Parallel, delayed


def main():
    import persistent as p
    args, parser = parse_args()

    # Output filenames are prefixed with the run description
    filename_prefix = "data/" + args.description
    # filename_prefix = datetime.today().strftime("%d-%m-%Y-%H.%M.%S")
    if args.city:
        external = args.external or str(args.k_min)
        city = args.city
        filename_prefix = '_'.join([city, external, str(args.n_components)])
        filename_prefix = 'comparisons/' + filename_prefix
        args.query = '{{"bboxCity": "{}"}}'.format(args.city)

    # Connect to mongo, load and standardize data
    db = get_mongo_database_with_auth(args.dbhost, args.dbport, args.dbname,
                                      args.username, args.password)
    # TODO: Get this from command line
    venue_extractors = [io.venue_primary_category_extractor]
    checkin_extractors = [io.checkin_time_extractor_hard,
                          io.checkin_user_extractor,
                          io.checkin_day_extractor]
    data, scaler = io.load_data_mongo(db[args.venuecoll],
                                      db[args.checkincoll], args.query,
                                      venue_extractors, checkin_extractors,
                                      filename_prefix, args.n_components,
                                      args.venue_threshold)

    # Split into train and test
    train, test = io.split_train_test_with_common_vocabulary(data,
                                                             test_size=0.2)
    print("Loaded {0} ({1} train, {2} test) data points.".format(
        data["coordinates"].shape[0], train["coordinates"].shape[0],
        test["coordinates"].shape[0]), file=sys.stderr)

    # Set centers of topics, optionally from an external preset
    initial_topic_centers = None
    initial_topic_covar = None
    if args.external:
        initial_topic_centers, initial_topic_covar = \
            p.load_var('comparisons/{}_{}.preset'.format(city, args.external))

    # Run EM args.runs times for every (lambda, k) pair, keep the best model
    best_train_likelihood = -np.inf
    best_test_likelihood = -np.inf
    best_k = None
    best_lambda = None
    best_model = None
    lambda_list = args.lambdas
    k_list = range(args.k_min, 1 + args.k_max, args.k_step)
    train_likelihood_across_k = -np.inf * np.ones((len(lambda_list),
                                                   len(k_list)))
    test_likelihood_across_k = -np.inf * np.ones((len(lambda_list),
                                                  len(k_list)))
    track_params = args.trackparams
    if args.plot:
        likelihood_fig = plt.figure()
    if initial_topic_centers is not None:
        k_list = [len(initial_topic_centers)]

    for lidx, Lambda in enumerate(lambda_list):
        for kidx, num_topics in enumerate(k_list):
            print("\n====== lambda = {0}, k = {1} ======\n\n".format(
                Lambda, num_topics), file=sys.stderr)
            # n_jobs=-2 -> leave only one logical core unused
            models = Parallel(n_jobs=-2, backend="threading")(
                delayed(run)(train, Lambda, num_topics, i, args,
                             initial_topic_centers, initial_topic_covar,
                             track_params)
                for i in range(args.runs))
            # TODO remove this or add command line option
            # Swap to this for serial processing
            # models = [run(train, Lambda, num_topics, i, args,
            #               initial_topic_centers, initial_topic_covar,
            #               track_params)
            #           for i in range(args.runs)]
            best_model_index_for_parameters = np.argmax(
                [model.latest_statistics.likelihood for model in models])
            best_model_in_k = models[best_model_index_for_parameters]
            train_likelihood_across_k[lidx][kidx] = \
                best_model_in_k.latest_statistics.likelihood
            test_likelihood_for_parameters = \
                best_model_in_k.predict_log_probs(test)
            test_likelihood_across_k[lidx][kidx] = \
                test_likelihood_for_parameters
            if test_likelihood_for_parameters > best_test_likelihood:
                # Remember the winning model and its parameters
                best_train_likelihood = \
                    best_model_in_k.latest_statistics.likelihood
                best_test_likelihood = test_likelihood_for_parameters
                best_k = num_topics
                best_lambda = Lambda
                best_model = best_model_in_k
            gc.collect()

    print("Results of the best model:\n", file=sys.stderr)
    print_stuff(data["unigrams"], best_model.get_params())
    print("Best train likelihood: {0}\n".format(best_train_likelihood),
          file=sys.stderr)
    print("Best test likelihood: {0}\n".format(best_test_likelihood),
          file=sys.stderr)
    print("PROB VS VARIATIONAL")
    print(best_model.predict_log_probs(test))
    print(best_model.predict_log_probs_variational(test))

    if args.save:
        query = args.query if getattr(args, 'query', None) else "synthetic"
        io.save_model(best_model, scaler, query, data["unigrams"],
                      filename_prefix)

    # PLOTS
    if args.plot:
        x_plot_num = 1
        y_plot_num = 1
        if len(k_list) > 1:
            plotting.plot_across_lambda_and_k(
                lambda_list, k_list, train_likelihood_across_k,
                test_likelihood_across_k, train["coordinates"].shape[0],
                data["coordinates"].shape[0], filename_prefix, save=True)
        if track_params:
            best_statistics_history = best_model.get_statistics_history()
            # Plot likelihood graph
            likelihood_plot = plotting.plot_statistics_history(
                likelihood_fig, best_statistics_history, x_plot_num,
                y_plot_num, 0)
            # Put the legend on the last likelihood plot
            likelihood_fig.legend(list(likelihood_plot),
                                  ['Likelihood', 'User likelihood',
                                   'Location likelihood', 'Topic likelihood',
                                   'Sigma likelihood', 'Phi entropy'])
        # TODO add command line option
        # Uncomment to enable animated plots
        # phi_animated_fig, phi_animated_ax = plt.subplots(1, 1)
        # anim = plotting.plot_phi_animated(phi_animated_fig,
        #                                   phi_animated_ax, train,
        #                                   best_statistics_history)
        # anim.save('anim.gif', writer='imagemagick', fps=10, dpi=300)
        plt.show()
def main():
    args = parse_args()
    db = mongo.get_mongo_database_with_auth(args.dbhost, args.dbport,
                                            args.dbname, args.username,
                                            args.password)
    # Print the collection object; its repr shows the client and name
    print(db["imdb_data"])
        log_theta_for_z = np.log(theta[0, z])
        geo_log_prob[z] += log_theta_for_z
        feature_log_prob[z] += log_theta_for_z

    # Log-sum-exp over topics, then sum over data points
    final_geo_log_prob = log_sum(geo_log_prob, axis=0)
    final_feature_log_prob = log_sum(feature_log_prob, axis=0)
    return np.sum(final_geo_log_prob + final_feature_log_prob)


if __name__ == '__main__':
    if len(sys.argv) != 6:
        print("Usage: python -m visualization.feature_contribution "
              "dbhost dbport dbuser dbpassword model_dir")
        sys.exit(1)
    dbhost = sys.argv[1]
    dbport = int(sys.argv[2])
    dbuser = sys.argv[3]
    dbpassword = sys.argv[4]
    model_path = sys.argv[5]
    db = get_mongo_database_with_auth(dbhost, dbport, "combined", dbuser,
                                      dbpassword)
    venue_extractors = [io.venue_primary_category_extractor]
    checkin_extractors = [io.checkin_time_extractor_hard,
                          io.checkin_user_extractor,
                          io.checkin_day_extractor]
    total_results = compute_feature_contribution(db, model_path,
                                                 venue_extractors,
                                                 checkin_extractors)
    print(total_results)
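# log_sum is a project-local helper; the comment above identifies it as a
# log-sum-exp reduction. A numerically stable sketch of that operation
# (scipy.special.logsumexp is an equivalent replacement), shown here as an
# illustration rather than the repo's actual implementation:
import numpy as np


def log_sum(a, axis=None):
    """Compute log(sum(exp(a), axis)) without overflow.

    Subtracting the per-slice maximum keeps every exponent <= 0, so exp()
    cannot overflow; the maximum is added back outside the log.
    """
    a = np.asarray(a, dtype=float)
    a_max = np.max(a, axis=axis, keepdims=True)
    # Guard the all--infinity slice, where a - a_max would be NaN
    a_max = np.where(np.isfinite(a_max), a_max, 0.0)
    out = np.log(np.sum(np.exp(a - a_max), axis=axis, keepdims=True)) + a_max
    return np.squeeze(out, axis=axis)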