def test_keyword_analysis_methods(self, log_data, nicks, nick_same_list):
        """Compare the list results of user.keywords() with stored JSON fixtures.

        Args:
            log_data: log data handed to user.keywords() and used to select
                the expected-output directory.
            nicks: list of nicks passed straight to user.keywords().
            nick_same_list: nick grouping passed straight to user.keywords().

        The fifth value returned by user.keywords() (the channel keywords) is
        unpacked but not checked here.
        """
        update_expected_output_directory(log_data)
        (actual_keywords, actual_freq, actual_words, actual_stop_nicks,
         _channel_keywords) = user.keywords(log_data, nicks, nick_same_list)

        wanted_keywords = []
        wanted_freq = []
        wanted_words = []
        wanted_stop_nicks = []

        # NOTE(review): unjson presumably appends the decoded file content to
        # the list it is given (the checks below read element [0]) -- confirm.
        fixtures = (
            ('keywords_filtered.json', wanted_keywords),
            ('user_words_dict.json', wanted_words),
            ('user_keyword_freq_dict.json', wanted_freq),
            ('nicks_for_stop_words.json', wanted_stop_nicks),
        )
        for json_name, sink in fixtures:
            unjson(json_name, sink)

        comparisons = (
            (actual_freq, wanted_freq),
            (actual_words, wanted_words),
            (actual_keywords, wanted_keywords),
            (actual_stop_nicks, wanted_stop_nicks),
        )
        for actual, wanted in comparisons:
            self.assertListEqual(actual, wanted[0])
Beispiel #2
0
    def test_keywords(self):
        """Check user.keywords() return values and printed output against fixtures.

        Every expected value is read with util.load_from_disk from paths built
        relative to self.current_directory. user.keywords() is then run on
        self.log_data, self.nicks and self.nick_same_list while stdout is
        redirected into an in-memory buffer, and all five returned values plus
        the captured text are compared with the expected ones.
        """
        expected_keywords_filtered = util.load_from_disk(
            self.current_directory +
            "/../../../data/user_test/keywords/keywords_filtered")
        expected_user_keyword_freq_dict = util.load_from_disk(
            self.current_directory +
            "/../../../data/user_test/user_keyword_freq_dict")
        expected_user_words_dict = util.load_from_disk(
            self.current_directory +
            "/../../../data/user_test/keywords/user_words_dict")
        expected_nicks_for_stop_words = util.load_from_disk(
            self.current_directory +
            "/../../../data/user_test/keywords/nicks_for_stop_words")
        expected_sorted_keywords_for_channels = util.load_from_disk(
            self.current_directory +
            "/../../../data/user_test/keywords/sorted_keywords_for_channels")
        expected_captured_output = util.load_from_disk(
            self.current_directory +
            "/data/user_test/keywords/stdout_captured_output")

        # Redirect stdout into a buffer. The stream that was active before is
        # saved and put back in `finally`, so a failure inside user.keywords()
        # cannot leave stdout redirected for the rest of the test run, and a
        # test runner's own stdout replacement is not clobbered.
        captured_output = StringIO.StringIO()
        previous_stdout = sys.stdout
        sys.stdout = captured_output
        try:
            (keywords_filtered, user_keyword_freq_dict, user_words_dict,
             nicks_for_stop_words, sorted_keywords_for_channels) = user.keywords(
                 self.log_data, self.nicks, self.nick_same_list)
            # read the buffer before it is closed below
            output = captured_output.getvalue()
        finally:
            sys.stdout = previous_stdout
            captured_output.close()

        self.assertEqual(expected_captured_output, output)
        self.assertEqual(expected_keywords_filtered, keywords_filtered)
        self.assertEqual(expected_user_keyword_freq_dict,
                         user_keyword_freq_dict)
        self.assertEqual(expected_user_words_dict, user_words_dict)
        self.assertEqual(expected_nicks_for_stop_words, nicks_for_stop_words)
        self.assertEqual(expected_sorted_keywords_for_channels,
                         sorted_keywords_for_channels)
Beispiel #3
0
    def test_keywords(self, mock_top_keywords_for_nick,
                      mock_extended_stop_words, mock_rec_list_splice,
                      mock_correct_nick_for_, mock_splice_find,
                      mock_correct_last_char_list, mock_correctLastCharCR,
                      mock_check_if_msg_line, mock_get_nick_representative):
        """Check user.keywords() against fixtures with its helpers mocked out.

        Each mock argument is primed with data read by util.load_from_disk
        (as `side_effect`, or `return_value` for mock_extended_stop_words).
        user.keywords() is then run on self.log_data, self.nicks and
        self.nick_same_list while stdout is redirected into an in-memory
        buffer, and all five returned values plus the captured text are
        compared with the expected ones loaded from disk.

        NOTE(review): the mock_* arguments are presumably supplied by patch
        decorators that are not visible in this snippet -- confirm that their
        order matches the parameter order.
        """
        # Recorded data for the mocked helpers.
        mock_get_nick_representative.side_effect = util.load_from_disk(
            self.current_directory +
            "/data/user_test/get_nick_representative_list")
        mock_check_if_msg_line.side_effect = util.load_from_disk(
            self.current_directory + "/data/user_test/check_if_msg_line_list")
        mock_correctLastCharCR.side_effect = util.load_from_disk(
            self.current_directory + "/data/user_test/correctLastCharCR_list")
        mock_correct_last_char_list.side_effect = util.load_from_disk(
            self.current_directory +
            "/data/user_test/correct_last_char_list_list")
        mock_splice_find.side_effect = util.load_from_disk(
            self.current_directory +
            "/data/user_test/keywords/splice_find_list")
        mock_correct_nick_for_.side_effect = util.load_from_disk(
            self.current_directory + "/data/user_test/correct_nick_for_list")
        mock_rec_list_splice.side_effect = util.load_from_disk(
            self.current_directory + "/data/user_test/rec_list_splice_list")
        mock_extended_stop_words.return_value = util.load_from_disk(
            self.current_directory +
            "/data/user_test/keywords/extended_stop_words")
        mock_top_keywords_for_nick.side_effect = util.load_from_disk(
            self.current_directory +
            "/data/user_test/keywords/top_keywords_for_nick")

        # Expected results of user.keywords().
        expected_keywords_filtered = util.load_from_disk(
            self.current_directory +
            "/../../../data/user_test/keywords/keywords_filtered")
        expected_user_keyword_freq_dict = util.load_from_disk(
            self.current_directory +
            "/../../../data/user_test/user_keyword_freq_dict")
        expected_user_words_dict = util.load_from_disk(
            self.current_directory +
            "/../../../data/user_test/keywords/user_words_dict")
        expected_nicks_for_stop_words = util.load_from_disk(
            self.current_directory +
            "/../../../data/user_test/keywords/nicks_for_stop_words")
        expected_sorted_keywords_for_channels = util.load_from_disk(
            self.current_directory +
            "/../../../data/user_test/keywords/sorted_keywords_for_channels")
        expected_captured_output = util.load_from_disk(
            self.current_directory +
            "/data/user_test/keywords/stdout_captured_output")

        # Redirect stdout into a buffer. The stream that was active before is
        # saved and put back in `finally`, so a failure inside user.keywords()
        # cannot leave stdout redirected for the rest of the test run, and a
        # test runner's own stdout replacement is not clobbered.
        captured_output = StringIO.StringIO()
        previous_stdout = sys.stdout
        sys.stdout = captured_output
        try:
            (keywords_filtered, user_keyword_freq_dict, user_words_dict,
             nicks_for_stop_words, sorted_keywords_for_channels) = user.keywords(
                 self.log_data, self.nicks, self.nick_same_list)
            # read the buffer before it is closed below
            output = captured_output.getvalue()
        finally:
            sys.stdout = previous_stdout
            captured_output.close()

        self.assertEqual(expected_captured_output, output)
        self.assertEqual(expected_keywords_filtered, keywords_filtered)
        self.assertEqual(expected_user_keyword_freq_dict,
                         user_keyword_freq_dict)
        self.assertEqual(expected_user_words_dict, user_words_dict)
        self.assertEqual(expected_nicks_for_stop_words, nicks_for_stop_words)
        self.assertEqual(expected_sorted_keywords_for_channels,
                         sorted_keywords_for_channels)
Beispiel #4
0
def identify_hubs_and_experts(log_dict, nicks, nick_same_list):
    """
        uses message_number graph to identify hubs and experts in the network

    Args:
        log_dict (dict): with key as dateTime.date object and value as {"data":datalist,"channel_name":channels name}
        nicks(list): list of all the nicks
        nick_same_list(list): list of lists mentioning nicks which belong to same users
    """
    message_graph = message_number_graph(log_dict, nicks, nick_same_list)
    hubs, authority_values = nx.hits(message_graph)

    keyword_dict_list, user_keyword_freq_dict, user_words_dict_list, nicks_for_stop_words, keywords_for_channels = user.keywords(log_dict, nicks, nick_same_list)
    if config.DEBUGGER:
        print "========> USERS"
        print user_keyword_freq_dict
        print "========> CHANNELS"
        print keywords_for_channels, len(keywords_for_channels)

    top_keywords_for_channels = []
    for word_tuple in keywords_for_channels[:config.NUMBER_OF_KEYWORDS_CHANNEL_FOR_OVERLAP]:
        top_keywords_for_channels.append(word_tuple[0])

    overlap_word_number = []
    for keyword_tuple in user_keyword_freq_dict:
        keywords_for_user = keyword_tuple['keywords']
        username = keyword_tuple['nick']
        overlapping_keywords = list(set(top_keywords_for_channels).intersection([x[0] for x in keywords_for_user]))
        if len(overlapping_keywords) > 0:
            overlap_word_number.append([username, len(overlapping_keywords)])

    top_hubs_with_score = util.find_top_n_element_after_sorting(hubs.items(), 1, True, config.HOW_MANY_TOP_EXPERTS)
    top_auth_with_score = util.find_top_n_element_after_sorting(authority_values.items(), 1, True, config.HOW_MANY_TOP_EXPERTS)
    top_keyword_overlap_with_score = util.find_top_n_element_after_sorting(overlap_word_number, 1, True, config.HOW_MANY_TOP_EXPERTS)

    print "TOP " + str(config.HOW_MANY_TOP_EXPERTS) + " HUBS\n", top_hubs_with_score
    print "TOP " + str(config.HOW_MANY_TOP_EXPERTS) + " AUTH\n", top_auth_with_score
    print "TOP " + str(config.HOW_MANY_TOP_EXPERTS) + " KEYWORD OVERLAP\n", top_keyword_overlap_with_score

    top_hub = [hub_tuple[0] for hub_tuple in top_hubs_with_score]
    top_auth = [auth_tuple[0] for auth_tuple in top_auth_with_score]
    top_keyword_overlap = [key_overlap_tuple[0] for key_overlap_tuple in top_keyword_overlap_with_score]

    for node_name in message_graph:
        # mark EXPERTS
        message_graph.node[node_name]['style'] = 'filled'
        if node_name in top_auth and node_name in top_keyword_overlap:
            message_graph.node[node_name]['color'] = '#ff000'
        elif node_name in top_auth:
            message_graph.node[node_name]['color'] = '#00ff00'
        elif node_name in top_keyword_overlap:
            message_graph.node[node_name]['color'] = '#0000ff'
        else:
            message_graph.node[node_name]['color'] = '#cccccc'
        # mark HUBS
        if node_name in top_hub:
            message_graph.node[node_name]['shape'] = 'square'

    return message_graph, top_hub, top_keyword_overlap, top_auth