def annotate_users(client,
                   twitter_lists_gen,
                   user_ids_to_annotate,
                   user_twitter_ids_mongo,
                   user_twitter_ids_local,
                   local_resources_folder,
                   user_network_profile_classifier_db,
                   node_to_id,
                   max_number_of_labels):
    """
    Forms a user-to-label matrix by annotating certain users.

    Newly extracted Twitter list keywords are first stored in mongo; keyword documents are then read back
    from the local resources folder and from mongo, chained together, and turned into a user-to-label matrix.

    Inputs:  - client: A MongoDB client.
             - twitter_lists_gen: A python generator that generates Twitter list generators.
             - user_ids_to_annotate: A list of Twitter user ids.
                                     NOTE: Not read anywhere in this function; kept for interface compatibility.
             - user_twitter_ids_mongo: Twitter user ids whose keyword documents are requested from mongo.
                                       Only the ids that also appear among the node_to_id values are fetched.
             - user_twitter_ids_local: Twitter user ids whose annotations are requested from the local folder.
                                       Only the ids that also appear among the node_to_id values are read.
             - local_resources_folder: Passed to read_local_user_annotations as its json_folder argument.
             - user_network_profile_classifier_db: The mongo database name used for both storing and reading.
             - node_to_id: A node to Twitter id map as a python dictionary.
             - max_number_of_labels: Forwarded unchanged to form_user_label_matrix
                                     (presumably an upper bound on the number of labels - TODO confirm).

    Outputs: - user_label_matrix: A user-to-label matrix in scipy sparse matrix format.
             - annotated_user_ids: A list of Twitter user ids.
             - label_to_lemma: As returned by form_user_label_matrix (presumably a label to lemma map - TODO confirm).
             - lemma_to_keyword: As returned by form_user_label_matrix (presumably a lemma to keyword map - TODO confirm).
    """
    # The same collection is written to here and read back from further down.
    keywords_collection_name = "twitter_list_keywords_collection"

    # Process lists and store keywords in mongo.
    # TODO: Do asynchronous I/O and preprocessing.
    extracted_keywords_gen = extract_user_keywords_generator(twitter_lists_gen, lemmatizing="wordnet")
    store_user_documents(extracted_keywords_gen,
                         client=client,
                         mongo_database_name=user_network_profile_classifier_db,
                         mongo_collection_name=keywords_collection_name)

    # Requested ids are restricted to those in the node map; build the id array once and reuse it for both sources.
    graph_twitter_ids = np.array(list(node_to_id.values()), dtype=int)

    # Read local resources: calculate which local user annotations to read.
    local_ids_to_read = np.intersect1d(graph_twitter_ids,
                                       np.array(user_twitter_ids_local, dtype=int))
    local_keywords_gen = read_local_user_annotations(json_folder=local_resources_folder,
                                                     user_twitter_ids=local_ids_to_read)

    # Read mongo resources: calculate which user annotations to fetch.
    mongo_ids_to_fetch = np.intersect1d(graph_twitter_ids,
                                        np.array(user_twitter_ids_mongo, dtype=int))
    mongo_keywords_gen = read_user_documents_generator(mongo_ids_to_fetch,
                                                       client=client,
                                                       mongo_database_name=user_network_profile_classifier_db,
                                                       mongo_collection_name=keywords_collection_name)

    # Local documents are yielded before the mongo ones.
    all_keywords_gen = itertools.chain(local_keywords_gen, mongo_keywords_gen)

    # Annotate users. The label matrix builder needs the inverse (Twitter id to node) map.
    id_to_node = {twitter_id: node for node, twitter_id in node_to_id.items()}
    user_label_matrix, annotated_user_ids, label_to_lemma, lemma_to_keyword = form_user_label_matrix(
        all_keywords_gen,
        id_to_node,
        max_number_of_labels)

    return user_label_matrix, annotated_user_ids, label_to_lemma, lemma_to_keyword
def write_results_to_mongo(client, user_network_profile_classifier_db, user_topic_gen):
    """
    Stores the generated user topics in the "user_topics_collection" mongo collection.

    Inputs: - client: A MongoDB client.
            - user_network_profile_classifier_db: The name of the mongo database to write to.
            - user_topic_gen: A python generator that generates users and a generator of associated topic keywords.
    """
    topics_collection_name = "user_topics_collection"

    # All the work is delegated to the generic document store routine.
    store_user_documents(
        user_topic_gen,
        client=client,
        mongo_database_name=user_network_profile_classifier_db,
        mongo_collection_name=topics_collection_name,
    )