def main():
    args = parse_args()
    db = mongo.get_mongo_database_with_auth(args.dbhost, args.dbport,
                                            args.dbname, args.username,
                                            args.password)

    year = int(args.phrase_year)
    query = db[args.phrase_collection].find({'year': year})
    titles = []
    for doc in query:
        # Drop the last 7 characters, presumably a " (YYYY)" year suffix
        titles.append(doc['title'][:-7])
    stream.init_crawler(args.consumer_key, args.consumer_secret, args.access_token, args.access_token_secret,
                        db[args.collection], titles)
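
All of these examples obtain their database handle through the same helper. A minimal pymongo-based sketch of what such a helper typically looks like (an assumption for illustration, not the project's actual implementation):

from pymongo import MongoClient

def get_mongo_database_with_auth(host, port, dbname, username, password):
    # Authenticate with the given credentials; pymongo connects lazily
    # on the first actual operation.
    client = MongoClient(host=host, port=port,
                         username=username, password=password)
    return client[dbname]
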
def main():
    args = parse_args()
    db = mongo.get_mongo_database_with_auth(args.dbhost, args.dbport, args.dbname, args.username, args.password)
    crawler.init_crawler(args.app_key, args.app_secret, args.access_token, args.refresh_token, db[args.collection])
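
Each script defines its own parse_args; for this crawler, the attributes read above suggest a parser roughly like the following (a hedged sketch: only the attribute names come from the call sites, the flags and defaults are assumptions):

import argparse

def parse_args():
    parser = argparse.ArgumentParser(description="Crawl into MongoDB")
    for flag in ("dbhost", "dbname", "username", "password", "collection",
                 "app_key", "app_secret", "access_token", "refresh_token"):
        parser.add_argument("--" + flag, required=True)
    parser.add_argument("--dbport", type=int, default=27017)
    return parser.parse_args()
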
def main():
    import persistent as p
    args, parser = parse_args()

    # Get current time to use it as a filename for output files
    filename_prefix = "data/" + args.description
    # filename_prefix = datetime.today().strftime("%d-%m-%Y-%H.%M.%S")
    if args.city:
        external = args.external or str(args.k_min)
        city = args.city
        filename_prefix = '_'.join([city, external, str(args.n_components)])
        filename_prefix = 'comparisons/' + filename_prefix
        # Restrict the Mongo query to the chosen city's bounding box,
        # e.g. '{"bboxCity": "paris"}'
        args.query = '{{"bboxCity": "{}"}}'.format(args.city)

    # connect to mongo, load and standardize data
    db = get_mongo_database_with_auth(args.dbhost, args.dbport, args.dbname,
        args.username, args.password)

    # TODO: Get this from command line
    venue_extractors = [io.venue_primary_category_extractor]
    checkin_extractors = [io.checkin_time_extractor_hard,
                    io.checkin_user_extractor, io.checkin_day_extractor]

    data, scaler = io.load_data_mongo(db[args.venuecoll],
        db[args.checkincoll], args.query, venue_extractors,
        checkin_extractors, filename_prefix, args.n_components,
        args.venue_threshold)

    # Split into train and test
    train, test = io.split_train_test_with_common_vocabulary(data,
        test_size=0.2)

    print("Loaded {0} ({1} train, {2} test) data points.".format(
        data["coordinates"].shape[0], train["coordinates"].shape[0],
        test["coordinates"].shape[0]), file=sys.stderr)

    # set centers of topics
    initial_topic_centers = None
    initial_topic_covar = None
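    # Optionally seed EM with externally saved topic geometry
    # (assumption: p.load_var unpickles values stored by an earlier run)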
    if args.external:
        initial_topic_centers, initial_topic_covar = \
            p.load_var('comparisons/{}_{}.preset'.format(city, args.external))

    # Run EM n times
    best_train_likelihood = -np.inf
    best_test_likelihood = -np.inf
    best_k = None
    best_lambda = None
    best_model = None

    lambda_list = args.lambdas
    k_list = range(args.k_min, 1 + args.k_max, args.k_step)
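    # e.g. k_min=5, k_max=20, k_step=5 gives k_list = [5, 10, 15, 20]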
    train_likelihood_across_k = -np.inf * np.ones((len(lambda_list), len(k_list)))
    test_likelihood_across_k = -np.inf * np.ones((len(lambda_list), len(k_list)))

    track_params = args.trackparams

    if args.plot:
        likelihood_fig = plt.figure()

    if initial_topic_centers is not None:
        k_list = [len(initial_topic_centers)]

    for lidx, Lambda in enumerate(lambda_list):

        for kidx, num_topics in enumerate(k_list):
            print("\n====== lambda = {0}, k = {1} ======\n\n".format(Lambda,
                 num_topics), file=sys.stderr)

            # n_jobs=-2 -> Leave only one logical core unused
            models = Parallel(n_jobs=-2, backend="threading")(
                delayed(run)(train, Lambda, num_topics, i, args,
                             initial_topic_centers, initial_topic_covar,
                             track_params) for i in
                range(args.runs))

            # TODO remove this or add command line option
            # Swap to this for serial processing
            # models = [run(train, Lambda, num_topics, i, args,
            #               initial_topic_centers, initial_topic_covar,
            #               track_params)
            #           for i in range(args.runs)]
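            # For reference, the joblib pattern in isolation (`square` is a
            # toy function, made up for illustration):
            #     from joblib import Parallel, delayed
            #     square = lambda x: x * x
            #     Parallel(n_jobs=-2, backend="threading")(
            #         delayed(square)(i) for i in range(8))
            #     # -> [0, 1, 4, 9, 16, 25, 36, 49]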

            best_model_index_for_parameters = np.argmax(
                [model.latest_statistics.likelihood for model in models])

            best_model_in_k = models[best_model_index_for_parameters]

            train_likelihood_across_k[lidx][kidx] = \
                best_model_in_k.latest_statistics.likelihood
            test_likelihood_for_parameters = \
                best_model_in_k.predict_log_probs(test)
            test_likelihood_across_k[lidx][kidx] = \
                test_likelihood_for_parameters

            if test_likelihood_for_parameters > best_test_likelihood:
                best_train_likelihood = \
                    best_model_in_k.latest_statistics.likelihood
                best_test_likelihood = test_likelihood_for_parameters

                best_k = num_topics
                best_lambda = Lambda
                best_model = best_model_in_k

            gc.collect()

    print("Results of the best model:\n", file=sys.stderr)
    print_stuff(data["unigrams"], best_model.get_params())
    print("Best train likelihood: {0}\n".format(best_train_likelihood),
        file=sys.stderr)
    print("Best test likelihood: {0}\n".format(best_test_likelihood),
        file=sys.stderr)

    print("PROB VS VARIATIONAL")
    print(best_model.predict_log_probs(test))
    print(best_model.predict_log_probs_variational(test))

    if args.save:
        query = "synthetic"
        try:
            if args.query:
                query = args.query
        except:
            pass

        io.save_model(best_model, scaler, query, data["unigrams"], filename_prefix)

    # PLOTS
    if args.plot:
        x_plot_num = 1
        y_plot_num = 1

        if len(k_list) > 1:
            plotting.plot_across_lambda_and_k(lambda_list, k_list,
                train_likelihood_across_k, test_likelihood_across_k,
                train["coordinates"].shape[0], data["coordinates"].shape[0],
                filename_prefix, save=True)

        if track_params:
            best_statistics_history = best_model.get_statistics_history()

            # Plot likelihood graph
            likelihood_plot = plotting.plot_statistics_history(likelihood_fig,
               best_statistics_history, x_plot_num, y_plot_num, 0)

            # Put the legend on the last likelihood plot
            likelihood_fig.legend(list(likelihood_plot),
                ['Likelihood', 'User likelihood', 'Location likelihood',
                'Topic likelihood', 'Sigma likelihood', 'Phi entropy'])

            # TODO add command line option
            #  Uncomment to enable animated plots
            # phi_animated_fig, phi_animated_ax = plt.subplots(1, 1)
            # anim = plotting.plot_phi_animated(phi_animated_fig,
            #     phi_animated_ax, train, best_statistics_history)

            # anim.save('anim.gif', writer='imagemagick', fps=10, dpi=300)

        plt.show()
def main():
    args = parse_args()
    db = mongo.get_mongo_database_with_auth(args.dbhost, args.dbport,
                                            args.dbname, args.username,
                                            args.password)
    print(db["imdb_data"])
        # Inside a loop over topics z: add each topic's log-prior
        # log(theta_z) to its geographic and feature log-probabilities
        log_theta_for_z = np.log(theta[0, z])
        geo_log_prob[z] += log_theta_for_z
        feature_log_prob[z] += log_theta_for_z

    # Log-sum-exp over topics, then sum over data points
    final_geo_log_prob = log_sum(geo_log_prob, axis=0)
    final_feature_log_prob = log_sum(feature_log_prob, axis=0)

    return np.sum(final_geo_log_prob + final_feature_log_prob)


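A numerically stable log-sum-exp is the natural reading of the log_sum helper used above (an assumption; scipy.special.logsumexp behaves the same way):

import numpy as np

def log_sum(a, axis=0):
    # Stable log(sum(exp(a))) along `axis`: subtract the per-column max
    # before exponentiating so large negative log-probs don't underflow.
    m = np.max(a, axis=axis, keepdims=True)
    return np.squeeze(m, axis=axis) + np.log(np.sum(np.exp(a - m), axis=axis))
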
if __name__ == '__main__':
    if len(sys.argv) != 6:
        print("Usage: python -m visualization.feature_contribution "
              "dbhost dbport dbuser dbpassword model_dir")
        sys.exit(1)

    dbhost = sys.argv[1]
    dbport = int(sys.argv[2])
    dbuser = sys.argv[3]
    dbpassword = sys.argv[4]
    model_path = sys.argv[5]

    db = get_mongo_database_with_auth(dbhost, dbport, "combined", dbuser,
                                      dbpassword)

    venue_extractors = [io.venue_primary_category_extractor]
    checkin_extractors = [io.checkin_time_extractor_hard,
                          io.checkin_user_extractor,
                          io.checkin_day_extractor]

    total_results = compute_feature_contribution(db, model_path,
                                                 venue_extractors,
                                                 checkin_extractors)
    print(total_results)