Ejemplo n.º 1
0
def publish_results_via_rabbitmq(rabbitmq_connection,
                                 rabbitmq_queue,
                                 rabbitmq_exchange,
                                 rabbitmq_routing_key,
                                 user_topic_gen):
    """
    What is says on the tin.

    Inputs:
            - user_topic_gen: A python generator that generates users and a generator of associated topic keywords.
    """
    for user_twitter_id, topic_to_score in user_topic_gen:
        results_dictionary = dict()
        results_dictionary["contributor_id"] = str(user_twitter_id)
        results_dictionary["type"] = [[str(user_type), repr(score)] for user_type, score in topic_to_score.items()]

        results_string = json.dumps(results_dictionary)

        # Publish the message.
        simple_notification(rabbitmq_connection,
                            rabbitmq_queue,
                            rabbitmq_exchange,
                            rabbitmq_routing_key,
                            results_string)
def user_network_profile_classifier(mongo_uri,
                                    assessment_id,
                                    twitter_app_key,
                                    twitter_app_secret,
                                    rabbitmq_uri,
                                    rabbitmq_queue,
                                    rabbitmq_exchange,
                                    rabbitmq_routing_key,
                                    pserver_host_name,
                                    pserver_client_name,
                                    pserver_client_pass,
                                    latest_n,
                                    lower_timestamp,
                                    upper_timestamp,
                                    restart_probability,
                                    number_of_threads,
                                    number_of_users_to_annotate,
                                    max_number_of_labels,
                                    user_network_profile_classifier_db,
                                    local_resources_folder):
    """
    Performs Online Social Network user classification.

    Specifically:
           - Establishes a connection with a Mongo database and gets the newest tweets.
           - Forms graphs and text-based vector representation for the users involved.
           - Fetches Twitter lists for influential users.
           - Extracts keywords from Twitter lists and thus annotates these users as experts in these topics.
           - Extracts graph-based features using the ARCTE algorithm.
           - Performs user classification for the rest of the users.
           - Stores the results at PServer.

    Input: - mongo_uri: A mongo client URI.
           - assessment_id:
           - twitter_app_key:
           - twitter_app_secret:
           - rabbitmq_uri:
           - rabbitmq_queue:
           - rabbitmq_exchange:
           - rabbitmq_routing_key:
           - pserver_host_name:
           - pserver_client_name:
           - pserver_client_pass:
           - latest_n: Get only the N most recent documents.
           - lower_timestamp: Get only documents created after this UNIX timestamp.
           - upper_timestamp: Get only documents created before this UNIX timestamp.
           - restart_probability:
           - number_of_threads:
           - number_of_users_to_annotate:
           - max_number_of_labels:
           - user_network_profile_classifier_db:
           - local_resources_folder: The preprocessed Twitter lists for a number of users are stored here.
    """
    ####################################################################################################################
    # Manage argument input.
    ####################################################################################################################
    spec = make_time_window_filter(lower_timestamp, upper_timestamp)

    if number_of_threads is None:
        number_of_threads = get_threads_number()

    ####################################################################################################################
    # Establish MongoDB connection.
    ####################################################################################################################
    client,\
    tweet_input_database_name,\
    tweet_input_collection_name = safe_establish_mongo_connection(mongo_uri, assessment_id)

    ####################################################################################################################
    # Preprocess tweets.
    ####################################################################################################################
    mention_graph,\
    retweet_graph,\
    user_id_set,\
    node_to_id = get_graphs_and_lemma_matrix(client,
                                             tweet_input_database_name,
                                             tweet_input_collection_name,
                                             spec,
                                             latest_n)

    adjacency_matrix, node_to_id, features, node_importances = integrate_graphs(mention_graph,
                                                                                retweet_graph,
                                                                                node_to_id,
                                                                                restart_probability,
                                                                                number_of_threads)

    ####################################################################################################################
    # Annotate users.
    ####################################################################################################################
    twitter_lists_gen,\
    user_ids_to_annotate,\
    user_twitter_ids_mongo,\
    user_twitter_ids_local = fetch_twitter_lists(client,
                                                 twitter_app_key,
                                                 twitter_app_secret,
                                                 user_network_profile_classifier_db,
                                                 local_resources_folder,
                                                 node_importances,
                                                 number_of_users_to_annotate,
                                                 node_to_id)

    user_label_matrix,\
    annotated_user_ids,\
    label_to_lemma,\
    lemma_to_keyword = annotate_users(client,
                                      twitter_lists_gen,
                                      user_ids_to_annotate,
                                      user_twitter_ids_mongo,
                                      user_twitter_ids_local,
                                      local_resources_folder,
                                      user_network_profile_classifier_db,
                                      node_to_id,
                                      max_number_of_labels)

    ####################################################################################################################
    # Perform user classification.
    ####################################################################################################################
    prediction = user_classification(features, user_label_matrix, annotated_user_ids, node_to_id, number_of_threads)

    ####################################################################################################################
    # Write to Mongo, PServer and/or RabbitMQ.
    ####################################################################################################################
    # Write data to mongo.
    write_results_to_mongo(client,
                           user_network_profile_classifier_db,
                           get_user_topic_generator(prediction,
                                                    node_to_id,
                                                    label_to_lemma,
                                                    lemma_to_keyword))

    # Write data to pserver.
    if pserver_host_name is not None:
        topic_list = list(lemma_to_keyword.values())
        try:
            write_topics_to_pserver(pserver_host_name,
                                    pserver_client_name,
                                    pserver_client_pass,
                                    get_user_topic_generator(prediction,
                                                             node_to_id,
                                                             label_to_lemma,
                                                             lemma_to_keyword),
                                    topic_list)
        except Exception:
            print("Unable to write results to PServer.")

    # Publish results and success message on RabbitMQ.
    rabbitmq_server_service("restart")
    rabbitmq_connection = establish_rabbitmq_connection(rabbitmq_uri)

    publish_results_via_rabbitmq(rabbitmq_connection,
                                 rabbitmq_queue,
                                 rabbitmq_exchange,
                                 rabbitmq_routing_key,
                                 get_user_topic_generator(prediction,
                                                          node_to_id,
                                                          label_to_lemma,
                                                          lemma_to_keyword))

    simple_notification(rabbitmq_connection, rabbitmq_queue, rabbitmq_exchange, rabbitmq_routing_key, "SUCCESS")
    rabbitmq_connection.close()
def user_network_profile_classifier(mongo_uri,
                                    assessment_id,
                                    twitter_app_key,
                                    twitter_app_secret,
                                    rabbitmq_uri,
                                    rabbitmq_queue,
                                    rabbitmq_exchange,
                                    rabbitmq_routing_key,
                                    wp5_rabbitmq_uri,
                                    wp5_rabbitmq_queue,
                                    wp5_rabbitmq_exchange,
                                    wp5_rabbitmq_routing_key,
                                    pserver_host_name,
                                    pserver_client_name,
                                    pserver_client_pass,
                                    latest_n,
                                    lower_timestamp,
                                    upper_timestamp,
                                    timestamp_sort_key,
                                    restart_probability,
                                    number_of_threads,
                                    number_of_users_to_annotate,
                                    max_number_of_labels,
                                    user_network_profile_classifier_db,
                                    local_resources_folder,
                                    twitter_credentials):
    """
    Performs Online Social Network user classification.

    Specifically:
           - Establishes a connection with a Mongo database and gets the newest tweets.
           - Forms graphs and text-based vector representation for the users involved.
           - Fetches Twitter lists for influential users.
           - Extracts keywords from Twitter lists and thus annotates these users as experts in these topics.
           - Extracts graph-based features using the ARCTE algorithm.
           - Performs user classification for the rest of the users.
           - Stores the results at PServer.

    Input: - mongo_uri: A mongo client URI.
           - assessment_id:
           - twitter_app_key:
           - twitter_app_secret:
           - rabbitmq_uri:
           - rabbitmq_queue:
           - rabbitmq_exchange:
           - rabbitmq_routing_key:
           - pserver_host_name:
           - pserver_client_name:
           - pserver_client_pass:
           - latest_n: Get only the N most recent documents.
           - lower_timestamp: Get only documents created after this UNIX timestamp.
           - upper_timestamp: Get only documents created before this UNIX timestamp.
           - timestamp_sort_key:
           - restart_probability:
           - number_of_threads:
           - number_of_users_to_annotate:
           - max_number_of_labels:
           - user_network_profile_classifier_db:
           - local_resources_folder: The preprocessed Twitter lists for a number of users are stored here.
    """
    good_labels_path = local_resources_folder + "good_labels.txt"
    bad_labels_path = local_resources_folder + "bad_labels.txt"

    ####################################################################################################################
    # Manage argument input.
    ####################################################################################################################
    spec = make_time_window_filter(lower_timestamp, upper_timestamp)

    if number_of_threads is None:
        number_of_threads = get_threads_number()

    ####################################################################################################################
    # Establish MongoDB connection.
    ####################################################################################################################
    client,\
    tweet_input_database_name,\
    tweet_input_collection_name = safe_establish_mongo_connection(mongo_uri, assessment_id)
    print("MongoDB connection established.")

    ####################################################################################################################
    # Preprocess tweets.
    ####################################################################################################################
    mention_graph,\
    retweet_graph,\
    user_lemma_matrix,\
    user_id_set,\
    node_to_id,\
    lemma_to_attribute,\
    id_to_name,\
    id_to_username,\
    id_to_listedcount = get_graphs_and_lemma_matrix(client,
                                                    tweet_input_database_name,
                                                    tweet_input_collection_name,
                                                    spec,
                                                    latest_n,
                                                    timestamp_sort_key)
    print("Users and user interactions extracted")

    adjacency_matrix,\
    node_to_id,\
    features,\
    node_importances,\
    old_node_list = integrate_graphs(mention_graph,
                                     retweet_graph,
                                     user_lemma_matrix,
                                     node_to_id,
                                     lemma_to_attribute,
                                     restart_probability,
                                     number_of_threads)
    number_of_users = adjacency_matrix.shape[0]
    print("Number of users in fused graph: ", number_of_users)
    if number_of_users < 2:
        rabbitmq_server_service("restart")
        rabbitmq_connection = establish_rabbitmq_connection(rabbitmq_uri)

        simple_notification(rabbitmq_connection, rabbitmq_queue, rabbitmq_exchange, rabbitmq_routing_key, "NOT_ENOUGH_CONNECTIONS")
        rabbitmq_connection.close()
        print("Failure message published via RabbitMQ.")
        return

    ####################################################################################################################
    # Annotate users.
    ####################################################################################################################

    twitter_lists_gen,\
    user_ids_to_annotate,\
    user_twitter_ids_mongo,\
    user_twitter_ids_local = fetch_twitter_lists(client,
                                                 twitter_app_key,
                                                 twitter_app_secret,
                                                 tweet_input_database_name,
                                                 local_resources_folder,
                                                 id_to_listedcount,
                                                 node_importances,
                                                 number_of_users_to_annotate,
                                                 node_to_id,
                                                 twitter_credentials)

    print("Annotating users with Twitter ids: ", user_ids_to_annotate)


    user_label_matrix,\
    annotated_user_ids,\
    label_to_lemma,\
    lemma_to_keyword = annotate_users(client,
                                      twitter_lists_gen,
                                      user_ids_to_annotate,
                                      user_twitter_ids_mongo,
                                      user_twitter_ids_local,
                                      local_resources_folder,
                                      tweet_input_database_name,
                                      node_to_id,
                                      max_number_of_labels,
                                      good_labels_path,
                                      bad_labels_path,
                                      user_lemma_matrix,
                                      old_node_list,
                                      lemma_to_attribute)
    number_of_labels = user_label_matrix.shape[1]
    print("Number of labels for classification: ", number_of_labels)
    if number_of_labels < 2:
        rabbitmq_server_service("restart")
        rabbitmq_connection = establish_rabbitmq_connection(rabbitmq_uri)

        simple_notification(rabbitmq_connection, rabbitmq_queue, rabbitmq_exchange, rabbitmq_routing_key, "NOT_ENOUGH_KEYWORDS")
        rabbitmq_connection.close()
        print("Failure message published via RabbitMQ.")
        return

    ####################################################################################################################
    # Perform user classification.
    ####################################################################################################################
    prediction = user_classification(features, user_label_matrix, annotated_user_ids, node_to_id, number_of_threads)
    print("User classification complete.")

    ####################################################################################################################
    # Write to Mongo, PServer and/or RabbitMQ.
    ####################################################################################################################
    # Write data to mongo.
    # write_results_to_mongo(client,
    #                        user_network_profile_classifier_db,
    #                        get_user_topic_generator(prediction,
    #                                                 node_to_id,
    #                                                 label_to_lemma,
    #                                                 lemma_to_keyword))
    write_results_to_mongo(client,
                           tweet_input_database_name,
                           get_user_topic_generator(prediction,
                                                    node_to_id,
                                                    label_to_lemma,
                                                    lemma_to_keyword),
                                                    id_to_name,
                                                    id_to_username)
    print("Results written in MongoDB.")

    # write_results_to_txt("/home/georgerizos/Documents/test.txt",
    #                      get_user_topic_generator(prediction,
    #                                               node_to_id,
    #                                               label_to_lemma,
    #                                               lemma_to_keyword))

    # Write data to pserver.
    if pserver_host_name is not None:
        topic_list = list(lemma_to_keyword.values())
        try:
            write_topics_to_pserver(pserver_host_name,
                                    pserver_client_name,
                                    pserver_client_pass,
                                    get_user_topic_generator(prediction,
                                                             node_to_id,
                                                             label_to_lemma,
                                                             lemma_to_keyword),
                                    topic_list)
            print("Results written in PServer.")
        except Exception:
            print("Unable to write results to PServer.")

    # Publish results and success message on RabbitMQ.
    wp5_rabbitmq_uri,\
    wp5_rabbitmq_queue,\
    wp5_rabbitmq_exchange,\
    wp5_rabbitmq_routing_key = check_wp5_rabbitmq_connection(wp5_rabbitmq_uri,
                                                             wp5_rabbitmq_queue,
                                                             wp5_rabbitmq_exchange,
                                                             wp5_rabbitmq_routing_key,
                                                             rabbitmq_uri,
                                                             rabbitmq_queue,
                                                             rabbitmq_exchange,
                                                             rabbitmq_routing_key,
                                                             tweet_input_database_name)

    rabbitmq_server_service("restart")
    wp5_rabbitmq_connection = establish_rabbitmq_connection(wp5_rabbitmq_uri)

    publish_results_via_rabbitmq(rabbitmq_connection=wp5_rabbitmq_connection,
                                 rabbitmq_queue=wp5_rabbitmq_queue,
                                 rabbitmq_exchange=wp5_rabbitmq_exchange,
                                 rabbitmq_routing_key=wp5_rabbitmq_routing_key,
                                 user_topic_gen=get_user_topic_generator(prediction,
                                                                         node_to_id,
                                                                         label_to_lemma,
                                                                         lemma_to_keyword),
                                 id_to_name=id_to_name)
    print("Results published via RabbitMQ.")

    rabbitmq_server_service("restart")
    rabbitmq_connection = establish_rabbitmq_connection(rabbitmq_uri)

    simple_notification(rabbitmq_connection, rabbitmq_queue, rabbitmq_exchange, rabbitmq_routing_key, "SUCCESS")
    print("Success message published via RabbitMQ.")