コード例 #1
0
def correlational_activity(log_directory, output_directory, channel_name):
    """
    Compute the Pearson correlation between the activity heatmap bins of
    every pair of consecutive months in 2013, then save a box plot and a
    CSV of the resulting coefficients.

    Args:
        log_directory(str): path to the location of Logs
        output_directory(str): path to the location where the results are stored
        channel_name(list): channels for which the analysis is to be done

    Returns:
       null
    """
    pearson = []
    # Months 1..11 only, so month + 1 below never leaves 2013.
    for month in xrange(1, 12):
        # Heatmap bins for the current month.
        log_data_m1 = reader.linux_input(log_directory, channel_name,
                                         "2013-" + str(month) + "-1",
                                         "2013-" + str(month) + "-31")
        nicks_m1, nick_same_list_m1 = nickTracker.nick_tracker(log_data_m1)
        bin_matrix_m1, total_messages_m1 = network.message_number_bins_csv(
            log_data_m1, nicks_m1, nick_same_list_m1)
        # Sum each column of the bin matrix — presumably one total per
        # time bin across the month's rows; confirm against
        # message_number_bins_csv's output layout.
        monthly_sum_bins_m1 = [sum(i) for i in zip(*bin_matrix_m1)]

        # Heatmap bins for the following month, built the same way.
        log_data_m2 = reader.linux_input(log_directory, channel_name,
                                         "2013-" + str(month + 1) + "-1",
                                         "2013-" + str(month + 1) + "-31")
        nicks_m2, nick_same_list_m2 = nickTracker.nick_tracker(log_data_m2)
        bin_matrix_m2, total_messages_m2 = network.message_number_bins_csv(
            log_data_m2, nicks_m2, nick_same_list_m2)
        monthly_sum_bins_m2 = [sum(i) for i in zip(*bin_matrix_m2)]
        # Off-diagonal entry of the 2x2 correlation matrix = Pearson r.
        corr = np.corrcoef(monthly_sum_bins_m1, monthly_sum_bins_m2)[0, 1]

        print "\n----------------------------------"
        print "For months", month, "and", month + 1
        print "Bins for M1:", monthly_sum_bins_m1
        print "Bins for M2:", monthly_sum_bins_m2
        print "Pearson correlation:", corr
        pearson.append(corr)

    vis.box_plot(pearson, output_directory, "pearson2013")
    saver.save_csv([pearson], output_directory, "pearson2013")
コード例 #2
0
    def test_linux_input_invalid_path(self):
        """linux_input must raise IOError naming the missing log path."""
        with self.assertRaises(IOError) as ctx:
            reader.linux_input("some non existent path/", self.channel_name,
                               self.starting_date, self.starting_date)

        self.assertEqual(
            "Path some non existent path/2013/01/01/ doesn't exist",
            str(ctx.exception))
コード例 #3
0
ファイル: validate.py プロジェクト: soundarya98/IRCLogParser
def correlational_activity(log_directory, output_directory, channel_name,
                           start_date, end_date):
    """
        The function selects a month in the given date range and creates heatmap bins for the current month and the next
        month. It then calculates the correlational calculates the correlational vectors between the two heatmaps and
        then produces a box plot for all the correlational coefficients of the months in the given date range.

    Args:
        log_directory(str): path to the location of Logs
        output_directory(str):  path to the location where the results are to be stored
        channel_name(list): channels for which the analysis is to be done
        start_date(datetime): starting date for the logs to be analysed. This has to be the beginning of the month.
        end_date(datetime): ending date for which the logs are to be analysed. This has to be the end of the month.

    Returns:
       null

    """
    start_date = start_date.strptime('%Y-%m-%d')
    end_date = end_date.strptime('%Y-%m-%d')
    pearson = []
    for dt in rrule(MONTHLY, dtstart=start_date, until=end_date):
        last_day_of_the_month1 = dt + relativedelta(
            months=1) - datetime.timedelta(days=1)

        log_data_m1 = reader.linux_input(
            log_directory, channel_name, dt.strftime("%Y-%m-%d"),
            last_day_of_the_month1.strftime("%Y-%m-%d"))
        nicks_m1, nick_same_list_m1 = nickTracker.nick_tracker(log_data_m1)
        bin_matrix_m1, total_messages_m1 = network.message_number_bins_csv(
            log_data_m1, nicks_m1, nick_same_list_m1)
        monthly_sum_bins_m1 = [sum(i) for i in zip(*bin_matrix_m1)]

        next_month_dt = dt + relativedelta(months=1)
        last_day_of_the_month2 = next_month_dt + relativedelta(
            months=1) - datetime.timedelta(days=1)
        log_data_m2 = reader.linux_input(
            log_directory, channel_name, next_month_dt.strftime("%Y-%m-%d"),
            last_day_of_the_month2.strftime("%Y-%m-%d"))
        nicks_m2, nick_same_list_m2 = nickTracker.nick_tracker(log_data_m2)
        bin_matrix_m2, total_messages_m2 = network.message_number_bins_csv(
            log_data_m2, nicks_m2, nick_same_list_m2)
        monthly_sum_bins_m2 = [sum(i) for i in zip(*bin_matrix_m2)]
        corr = np.corrcoef(monthly_sum_bins_m1, monthly_sum_bins_m2)[0, 1]

        print "\n----------------------------------"
        print "For months", dt.month, "and", dt.month + 1
        print "Bins for M1:", monthly_sum_bins_m1
        print "Bins for M2:", monthly_sum_bins_m2
        print "Pearson correlation:", corr
        pearson.append(corr)

    vis.box_plot(pearson, output_directory, "pearson2013")
    saver.save_csv([pearson], output_directory, "pearson2013")
コード例 #4
0
    def test_community_analysis_single_channel_cutoff_20(self):
        """Infomap community detection on #kubuntu-devel with cutoff 20."""
        expected_result = util.load_from_disk(
            self.current_directory +
            '/data/output/community_analysis_single_channel_cutoff_20')
        log_data = reader.linux_input(self.log_data_dir, ["#kubuntu-devel"],
                                      self.start_date, self.end_date)
        nicks, nick_same_list = nickTracker.nick_tracker(log_data)

        # Raise the edge-weight cutoff for this test only; restored below.
        saved_cutoff = config.THRESHOLD_MESSAGE_NUMBER_GRAPH
        config.THRESHOLD_MESSAGE_NUMBER_GRAPH = 20
        message_number_graph = network.message_number_graph(
            log_data, nicks, nick_same_list, False)
        base_name = ("message-exchange-" + self.start_date + "-cutoff-" +
                     str(config.THRESHOLD_MESSAGE_NUMBER_GRAPH))
        saver.save_net_nx_graph(message_number_graph, self.current_directory,
                                base_name)

        net_file = self.current_directory + "/" + base_name + ".net"
        expected_output = community.infomap_igraph(
            ig_graph=None, net_file_location=net_file)
        os.remove(net_file)
        config.THRESHOLD_MESSAGE_NUMBER_GRAPH = saved_cutoff

        self.assertTrue(expected_result[0].isomorphic(expected_output[0]))
        self.assertEqual(
            0, compare_communities(expected_result[1], expected_output[1]))
コード例 #5
0
 def setUp(self):
     """Parse one week (Jan 1-7, 2013) of logs for three channels."""
     here = os.path.dirname(os.path.realpath(__file__))
     self.current_directory = here
     self.out_dir = here + "/data/output/2013/01/user_tracking/"
     self.log_data = reader.linux_input(
         here + "/data/input/",
         ["#kubuntu-devel", "#kubuntu", "#ubuntu-devel"], "2013-1-1",
         "2013-1-7")
コード例 #6
0
 def setUp(self):
     """Parse January 2013 #kubuntu-devel logs and track nick aliases."""
     self.current_directory = os.path.dirname(os.path.realpath(__file__))
     self.log_data_dir = self.current_directory + "/data/input/"
     self.log_data_kubuntu_devel = reader.linux_input(
         self.log_data_dir, ["#kubuntu-devel"], '2013-01-01', '2013-01-31')
     tracked = nickTracker.nick_tracker(self.log_data_kubuntu_devel)
     self.nicks, self.nick_same_list = tracked
コード例 #7
0
def keywords_hits_overlap(log_directory, output_directory, channel_name):
    # Correlational: overlap for keyword digest and HITS
    for month in xrange(1, 13):
        log_data_m1 = reader.linux_input(log_directory, channel_name,
                                         "2013-" + str(month) + "-1",
                                         "2013-" + str(month) + "-31")
        nicks_m1, nick_same_list_m1 = nickTracker.nick_tracker(log_data_m1)
        message_graph_m1, top_hubs_m1, top_keyword_overlap_m1, top_auth_m1 = network.identify_hubs_and_experts(
            log_data_m1, nicks_m1, nick_same_list_m1)
        saver.draw_nx_graph(message_graph_m1, output_directory,
                            "expert-month-" + str(month))

        log_data_m2 = reader.linux_input(log_directory, channel_name,
                                         "2013-" + str(month + 1) + "-1",
                                         "2013-" + str(month + 1) + "-31")
        nicks_m2, nick_same_list_m2 = nickTracker.nick_tracker(log_data_m1)
        message_graph_m2, top_hubs_m2, top_keyword_overlap_with_score_m2, top_auth_m2 = network.identify_hubs_and_experts(
            log_data_m2, nicks_m2, nick_same_list_m2)

        print "Top 10 HUBS for Month [HITS]", month, ":", top_hubs_m1
        print "Top 10 HUBS for Month [HITS]", month + 1, ":", top_hubs_m2
        print "Number of common HUBS (from 10) between above 2 months:", len(
            list(set(top_hubs_m1).intersection(top_hubs_m2)))

        print "Top 10 Experts by keywords for Months", month, ":", top_keyword_overlap_m1
        print "Top 10 Experts by keywords for Months", month + 1, ":", top_keyword_overlap_with_score_m2
        print "Number of common Experts by keywords (from 10) between above 2 months:", len(
            list(
                set(top_keyword_overlap_m1).intersection(
                    top_keyword_overlap_with_score_m2)))

        print "Top 10 AUTH for Month [HITS]", month, ":", top_auth_m1
        print "Top 10 AUTH for Month [HITS]", month + 1, ":", top_auth_m2
        print "Number of common AUTH (from 10) between above 2 months:", len(
            list(set(top_auth_m1).intersection(top_auth_m2)))

        print "Number of users common btw HUBS from HITS and Experts by Keywords (from 10) for month", month, ":", len(
            list(set(top_keyword_overlap_m1).intersection(top_hubs_m1)))
        print "Number of users common btw AUTH from HITS and Experts by Keywords (from 10) for month", month, ":", len(
            list(set(top_keyword_overlap_m1).intersection(top_auth_m1)))
        print "Number of users common btw HUBS from HITS and AUTH from HITS (from 10) for month", month, ":", len(
            list(set(top_hubs_m1).intersection(top_auth_m1)))
        print "Number of users common btw HUBS, HITS and KEYWORDS", month, ":", len(
            set(list(set(top_keyword_overlap_m1).intersection(
                top_hubs_m1))).intersection(top_auth_m1))
コード例 #8
0
def box_plot_for_degree(log_directory, output_directory, channel_name):
    """
    For each channel, fit the monthly out/in/total degree distributions of
    the 2013 message-exchange graphs and box-plot each fit parameter
    (slope, intercept, r_square) across the twelve months. Plots and CSVs
    are written to output_directory.
    """
    cutoff = 0
    # Fixed order matters: plots/CSVs are emitted out, in, total.
    degree_kinds = ["out_degree", "in_degree", "total_degree"]
    for chan in channel_name:
        # One (12 months x 4 values) parameter matrix per degree kind.
        fit_params = dict((kind, np.zeros((12, 4))) for kind in degree_kinds)
        for month in range(1, 13):
            log_data = reader.linux_input(log_directory, chan,
                                          "2013-" + str(month) + "-1",
                                          "2013-" + str(month) + "-31")
            nicks, nick_same_list = nickTracker.nick_tracker(log_data)

            message_number_graph = network.message_number_graph(
                log_data, nicks, nick_same_list, False)
            degree_anal = network.degree_analysis_on_graph(
                message_number_graph)

            for kind in degree_kinds:
                fit_params[kind][month - 1] = vis.generate_log_plots(
                    degree_anal[kind]["raw_for_vis"], output_directory,
                    chan[0])

        parameters = ['slope', 'intercept', 'r_square']
        for para_ind in range(len(parameters)):
            for kind in degree_kinds:
                vis.box_plot(
                    fit_params[kind][:, para_ind], output_directory,
                    kind + "_" + str(parameters[para_ind]) + "_2013_" +
                    chan[0] + "_cut_" + str(cutoff))
            for kind in degree_kinds:
                saver.save_csv(
                    [fit_params[kind][:, para_ind].tolist()],
                    output_directory,
                    kind + "_" + str(parameters[para_ind]) + "_2013_" +
                    chan[0] + "_cut_" + str(cutoff))
コード例 #9
0
 def test_message_exchange_network(self):
     """Degree analysis of the #kubuntu-devel message-exchange graph."""
     expected_result = util.load_from_disk(
         self.current_directory +
         '/data/output/degree_anal_message_number_graph_kubuntu-devel')
     log_data = reader.linux_input(self.log_data_dir, ["#kubuntu-devel"],
                                   self.start_date, self.end_date)
     nicks, nick_same_list = nickTracker.nick_tracker(log_data)
     graph = network.message_number_graph(log_data, nicks, nick_same_list,
                                          False)
     actual = network.degree_analysis_on_graph(graph)
     self.assertEqual(expected_result, actual)
コード例 #10
0
    def test_linux_input_non_existent_file(self):
        """A missing channel file is reported but the rest still parses."""
        expected_captured_output = util.load_from_disk(
            self.current_directory +
            "/data/stdout_captured_linux_input_non_existent_file")

        capture = StringIO.StringIO()
        sys.stdout = capture
        expected_log_data = reader.linux_input(
            self.current_directory + "/data/log/",
            ["some non existent file", "#kubuntu-devel"],
            self.starting_date, self.ending_date)
        output = capture.getvalue()
        capture.close()
        sys.stdout = sys.__stdout__

        # Strip the machine-specific absolute-path prefix from stdout so the
        # comparison is host independent.
        output = re.sub(r'(?P<begin>.+ )/.+/(?P<constant>IRCLogParser/.+\n)',
                        r'\g<begin>\g<constant>', output)
        self.assertEqual(self.log_data, expected_log_data)
        self.assertEqual(expected_captured_output, output)
コード例 #11
0
 def test_reduced_networks_cutoff_20(self):
     """Message-number graph built with the edge cutoff raised to 20."""
     saved_cutoff = config.THRESHOLD_MESSAGE_NUMBER_GRAPH
     config.THRESHOLD_MESSAGE_NUMBER_GRAPH = 20
     log_data = reader.linux_input(self.log_data_dir, ["#kubuntu-devel"],
                                   self.start_date, self.end_date)
     expected_result = util.load_from_disk(
         self.current_directory +
         '/data/output/message_number_graph_cutoff_20')
     nicks, nick_same_list = nickTracker.nick_tracker(log_data, False)
     actual = network.message_number_graph(log_data, nicks, nick_same_list,
                                           False)
     # Restore the shared config before asserting.
     config.THRESHOLD_MESSAGE_NUMBER_GRAPH = saved_cutoff
     self.assertTrue(nx.is_isomorphic(expected_result, actual))
コード例 #12
0
    def test_linux_input_all_channels(self):
        """Passing ["ALL"] should pick up every channel in the log directory."""
        expected_capturedOutput = util.load_from_disk(
            self.current_directory +
            "/data/stdout_captured_linux_input_all_channels")
        expected_log_data = util.load_from_disk(
            self.current_directory +
            "/data/log_data_for_test_linux_input_all_channels")

        capture = StringIO.StringIO()
        sys.stdout = capture
        log_data = reader.linux_input(
            self.current_directory + "/data/log_to_test_for_all_channels/",
            ["ALL"], "2013-1-1", "2013-1-2")
        output = capture.getvalue()
        capture.close()
        sys.stdout = sys.__stdout__

        # Normalise the absolute path in the captured stdout.
        # See https://docs.python.org/2/library/re.html for more details.
        output = re.sub(r'(?P<begin>.+ )/.+/(?P<constant>IRCLogParser/.+\n)',
                        r'\g<begin>\g<constant>', output)

        self.assertEqual(expected_log_data, log_data)
        self.assertEqual(expected_capturedOutput, output)
コード例 #13
0
    def test_linux_input(self):
        """Parsed log data and captured stdout must match stored fixtures."""
        expected_capturedOutput = util.load_from_disk(
            self.current_directory + "/data/stdout_captured_linux_input")

        capture = StringIO.StringIO()
        sys.stdout = capture
        log_data = reader.linux_input(self.current_directory + "/data/log/",
                                      self.channel_name, self.starting_date,
                                      self.ending_date)
        output = capture.getvalue()
        capture.close()
        sys.stdout = sys.__stdout__

        # 'Working on: /any_valid_path/IRCLogParser/test/.../#kubuntu-devel.txt\n'
        # is reduced to 'Working on: IRCLogParser/.../#kubuntu-devel.txt\n' so
        # the comparison is machine independent.
        # See https://docs.python.org/2/library/re.html for more details.
        output = re.sub(r'(?P<begin>.+ )/.+/(?P<constant>IRCLogParser/.+\n)',
                        r'\g<begin>\g<constant>', output)

        self.assertEqual(log_data, self.log_data)
        self.assertEqual(expected_capturedOutput, output)
コード例 #14
0
    def test_degree_distribution_multi_channel(self):
        """
        Curve-fit the degree distributions of the CC, CU and UU presence
        graphs built from all channels and compare the fits against the
        stored reference results.
        """
        log_data = reader.linux_input(self.log_data_dir, ["ALL"],
                                      self.start_date, self.end_date)
        expected_result_CC_degree_curve_fit = util.load_from_disk(
            self.current_directory + '/data/output/CC_degree_curve_fit')
        expected_result_CU_degree_curve_fit = util.load_from_disk(
            self.current_directory + '/data/output/CU_degree_curve_fit')
        expected_result_UU_degree_curve_fit = util.load_from_disk(
            self.current_directory + '/data/output/UU_degree_curve_fit')

        # True => also build channel/user membership structures.
        nicks, nick_same_list, channels_for_user, nick_channel_dict, nicks_hash, channels_hash = nickTracker.nick_tracker(
            log_data, True)
        dict_out, graph = network.channel_user_presence_graph_and_csv(
            nicks, nick_same_list, channels_for_user, nick_channel_dict,
            nicks_hash, channels_hash)
        degree_anal_message_number_CC = network.degree_analysis_on_graph(
            dict_out["CC"]["graph"], directed=False)
        degree_anal_message_number_UU = network.degree_analysis_on_graph(
            dict_out["UU"]["graph"], directed=False)
        degree_anal_message_number_CU = network.degree_analysis_on_graph(
            dict_out["CU"]["graph"], directed=False)

        # CU data: drop the first entry (presumably the degree-0 bucket —
        # confirm) and truncate the table at 0.5 before fitting.
        Y = degree_anal_message_number_CU["degree"]["raw_for_vis"][1:]
        data = [(i, Y[i]) for i in range(len(Y))]
        CU_truncated, cutoff = channel.truncate_table(data, 0.5)
        CU_T = [data[1] for data in list(CU_truncated)]
        expected_output_CC_degree_curve_fit = vis.generate_log_plots(
            degree_anal_message_number_CC["degree"]["raw_for_vis"],
            self.current_directory, "CC_degree_curve_fit")

        expected_output_CU_degree_curve_fit = vis.generate_log_plots(
            CU_T, self.current_directory, "CU_degree_curve_fit")

        expected_output_UU_degree_curve_fit = vis.generate_log_plots(
            degree_anal_message_number_UU["degree"]["raw_for_vis"],
            self.current_directory, "UU_degree_curve_fit")
        # generate_log_plots leaves a .png per fit; remove the side effects.
        os.remove(self.current_directory + "/CC_degree_curve_fit" + ".png")
        os.remove(self.current_directory + "/CU_degree_curve_fit" + ".png")
        os.remove(self.current_directory + "/UU_degree_curve_fit" + ".png")

        self.assertEqual(expected_result_CC_degree_curve_fit,
                         expected_output_CC_degree_curve_fit)
        self.assertEqual(expected_result_CU_degree_curve_fit,
                         expected_output_CU_degree_curve_fit)
        self.assertEqual(expected_result_UU_degree_curve_fit,
                         expected_output_UU_degree_curve_fit)
コード例 #15
0
 def test_degree_distribution_message_exchange_network(self):
     """Curve fits for out/in/total degree distributions of the network."""
     expected_result = util.load_from_disk(
         self.current_directory +
         '/data/output/message_exchange_network_curve_fit')
     log_data = reader.linux_input(self.log_data_dir, ["#kubuntu-devel"],
                                   self.start_date, self.end_date)
     nicks, nick_same_list = nickTracker.nick_tracker(log_data)
     graph = network.message_number_graph(log_data, nicks, nick_same_list,
                                          False)
     degree_anal = network.degree_analysis_on_graph(graph)
     actual = {}
     for degree_kind in ["out_degree", "in_degree", "total_degree"]:
         actual[degree_kind] = vis.generate_log_plots(
             degree_anal[degree_kind]["raw_for_vis"],
             self.current_directory, "#kubuntu-devel" + degree_kind)
         # generate_log_plots writes a .png as a side effect; clean it up.
         os.remove(self.current_directory + "/#kubuntu-devel" + degree_kind +
                   ".png")
     self.assertEqual(expected_result, actual)
コード例 #16
0
def codelengths(log_directory, output_directory, channel_name):
    """
    Compute the infomap codelength of the message-exchange graph for each
    month of 2013 and save a box plot and CSV of the twelve values.

    Args:
        log_directory(str): path to the location of Logs
        output_directory(str): path to the location where the results are stored
        channel_name(list): channels for which the analysis is to be done

    Returns:
       null
    """
    codelengths = []
    for month in xrange(1, 13):
        log_data_m1 = reader.linux_input(log_directory, channel_name,
                                         "2013-" + str(month) + "-1",
                                         "2013-" + str(month) + "-31")
        nicks_m1, nick_same_list_m1 = nickTracker.nick_tracker(log_data_m1)
        message_number_graph_m1 = network.message_number_graph(
            log_data_m1, nicks_m1, nick_same_list_m1, False)
        try:
            #FOS is a reserved word in igraph and if 'fos' is a username in the nx graph, it generates an error
            saver.save_net_nx_graph(message_number_graph_m1, output_directory,
                                    "message-exchange-" + str(month))
        except:
            # Rename the offending 'fos' node and save the graph again.
            node_labels = message_number_graph_m1.nodes()
            labels = {}
            for label in node_labels:
                if label == "fos":
                    labels[label] = "fos_"
                else:
                    labels[label] = label

            message_number_graph_m1 = nx.relabel_nodes(message_number_graph_m1,
                                                       labels)
            saver.save_net_nx_graph(message_number_graph_m1, output_directory,
                                    "message-exchange-" + str(month))
            print "error in", month

        # BUG FIX: the codelength was previously appended both inside the
        # try block and again unconditionally here, double-counting every
        # month that saved successfully. Compute and record it exactly once.
        msg_igraph, msg_community = community.infomap_igraph(
            ig_graph=None,
            net_file_location=output_directory + "message-exchange-" +
            str(month) + '.net')
        codelengths.append(msg_community.codelength)

    vis.box_plot(codelengths, output_directory, "codelengths2013")
    saver.save_csv([codelengths], output_directory, "codelengths2013")
コード例 #17
0
    def test_community_analysis_multi_channel(self):
        """
        Infomap community detection on the full and reduced CC/UU/CU
        presence graphs built from all channels; communities must match the
        stored reference partitions.
        """
        log_data = reader.linux_input(self.log_data_dir, ["ALL"],
                                      self.start_date, self.end_date)
        expected_result = util.load_from_disk(
            self.current_directory +
            '/data/output/community_analysis_multi_channel')
        # True => also build channel/user membership structures.
        nicks, nick_same_list, channels_for_user, nick_channel_dict, nicks_hash, channels_hash = nickTracker.nick_tracker(
            log_data, True)
        dict_out, graph = network.channel_user_presence_graph_and_csv(
            nicks, nick_same_list, channels_for_user, nick_channel_dict,
            nicks_hash, channels_hash)

        presence_type = ["CC", "UU", "CU"]
        expected_output = {ptype: {} for ptype in presence_type}
        for ptype in presence_type:
            # Write each graph to a .net file, run infomap on it, then
            # delete the temporary file.
            saver.save_net_nx_graph(dict_out[ptype]["graph"],
                                    self.current_directory, "adj" + ptype)
            saver.save_net_nx_graph(dict_out[ptype]["reducedGraph"],
                                    self.current_directory, "radj" + ptype)
            expected_output[ptype]['adj'] = community.infomap_igraph(
                ig_graph=None,
                net_file_location=self.current_directory + '/adj' + ptype +
                '.net')
            expected_output[ptype]['radj'] = community.infomap_igraph(
                ig_graph=None,
                net_file_location=self.current_directory + '/radj' + ptype +
                '.net')

            os.remove(self.current_directory + '/adj' + ptype + '.net')
            os.remove(self.current_directory + '/radj' + ptype + '.net')

            # Graphs must be isomorphic and the community partitions
            # identical (distance 0) for both full and reduced variants.
            self.assertTrue(expected_result[ptype]['adj'][0].isomorphic(
                expected_output[ptype]['adj'][0]))
            self.assertEqual(
                compare_communities(expected_output[ptype]['adj'][1],
                                    expected_result[ptype]['adj'][1]), 0)
            self.assertTrue(expected_result[ptype]['radj'][0].isomorphic(
                expected_output[ptype]['radj'][0]))
            self.assertEqual(
                compare_communities(expected_output[ptype]['radj'][1],
                                    expected_result[ptype]['radj'][1]), 0)
コード例 #18
0
 def test_presence_networks(self):
     """CC/UU/CU presence graphs must match the stored reference graphs."""
     expected_result = util.load_from_disk(
         self.current_directory + '/data/output/presence_graph_dict')
     log_data = reader.linux_input(self.log_data_dir, ["ALL"],
                                   self.start_date, self.end_date)
     nicks, nick_same_list, channels_for_user, nick_channel_dict, nicks_hash, channels_hash = nickTracker.nick_tracker(
         log_data, True)
     actual, graph = network.channel_user_presence_graph_and_csv(
         nicks, nick_same_list, channels_for_user, nick_channel_dict,
         nicks_hash, channels_hash)
     for edge_type in ('CC', 'UU', 'CU'):
         self.assertTrue(
             nx.is_isomorphic(actual[edge_type]['graph'],
                              expected_result[edge_type]['graph']))
         self.assertTrue(
             nx.is_isomorphic(actual[edge_type]['reducedGraph'],
                              expected_result[edge_type]['reducedGraph']))
         # Graphs are compared above; drop them so the remaining dict
         # contents can be compared directly.
         for key in ('graph', 'reducedGraph'):
             actual[edge_type].pop(key)
             expected_result[edge_type].pop(key)
     np.testing.assert_equal(actual, expected_result)
コード例 #19
0
ファイル: sample.py プロジェクト: rohangoel96/IRCLogParser
import sys
sys.path.insert(0, "IRCLogParser/")
from lib.in_out import reader, saver
from lib import nickTracker, config, vis, validate
from lib.analysis import network, channel, user, community

# All run-time settings come from lib.config.
log_directory = config.LOG_DIRECTORY
channel_name = config.CHANNEL_NAME
starting_date = config.STARTING_DATE
ending_date = config.ENDING_DATE
output_directory = config.OUTPUT_DIRECTORY

# ============== INPUT==================
log_data = reader.linux_input(log_directory, channel_name, starting_date, ending_date)
nicks, nick_same_list = nickTracker.nick_tracker(log_data)

# ============== ANALYSIS =============
message_number_graph = network.message_number_graph(log_data, nicks, nick_same_list, False)
message_number_graph_day_list = network.message_number_graph(log_data, nicks, nick_same_list, True)
# NOTE(review): "numder" looks like a typo for "number"; renaming is safe
# only if nothing imports this name from sample.py — confirm before fixing.
degree_anal_message_numder = network.degree_analysis_on_graph(message_number_graph)
message_time_graph_list = network.message_time_graph(log_data, nicks, nick_same_list, True)
message_time_graph = network.message_time_graph(log_data, nicks, nick_same_list, False)
out_degree_node_number, in_degree_node_number, total_degree_node_number = network.degree_node_number_csv(log_data, nicks, nick_same_list)
nick_change_graph_list =  user.nick_change_graph(log_data, True)
bin_matrix, total_messages = network.message_number_bins_csv(log_data, nicks, nick_same_list)
conv_len, conv_ref_time = channel.conv_len_conv_refr_time(log_data, nicks, nick_same_list)
resp_time = channel.response_time(log_data, nicks, nick_same_list)

user.keywords_clusters(log_data, nicks, nick_same_list)
network.degree_analysis_on_graph(message_number_graph)
コード例 #20
0
ファイル: tests.py プロジェクト: rohangoel96/IRCLogParser
import json
from os import path
# NOTE(review): `sys` is used on the next line but never imported in this
# excerpt — presumably `import sys` appears earlier in the file; confirm,
# otherwise this raises NameError at import time.
sys.path.insert(0, '../IRCLogParser')
sys.path.append(path.dirname(path.dirname(path.abspath(__file__))))
from lib.analysis import network, user, channel
from lib.nickTracker import nick_tracker
from ddt import ddt, data, unpack
from lib import config
from lib.in_out import reader
import networkx as nx
# NOTE(review): `os` is used below but only `from os import path` is
# imported above — confirm `import os` exists elsewhere in the file.
current_dir = os.path.dirname(__file__)

log_directory = os.path.join(current_dir, 'data/input/')
expected_output_directory = os.path.join(current_dir, 'data/output/')
channel_name = config.CHANNEL_NAME
# Module-level fixtures: January and August 2013 logs parsed once for all tests.
log_for_jan = reader.linux_input(log_directory, channel_name, "2013-1-1", "2013-1-31")
nicks_for_jan, nick_same_list_for_jan = nick_tracker(log_for_jan)
log_for_aug = reader.linux_input(log_directory, channel_name, "2013-8-1", "2013-8-31")
nicks_for_aug, nick_same_list_for_aug = nick_tracker(log_for_aug)

def update_expected_output_directory(log_data):
    """
    Point the module-global `expected_output_directory` at the
    data/output/<year>/<month>/ folder that matches `log_data`.

    Args:
        log_data(dict): monthly log data; any key works because months and
            year are the same since log_data has monthly data.

    Returns:
       null
    """
    # BUG FIX: dict.keys()[0] only works on Python 2 (keys() returns a view
    # on Python 3); next(iter(...)) fetches an arbitrary key on both.
    key = next(iter(log_data))
    global expected_output_directory
    expected_output_directory = os.path.join(current_dir, 'data/output/' + str(key.year) + '/')
    # Zero-pad single-digit months so the path matches the on-disk layout.
    month = key.month
    temp = str(month)
    if (month < 10):
        temp = '0' + str(month)
    expected_output_directory += temp + '/'
コード例 #21
0
ファイル: validate.py プロジェクト: soundarya98/IRCLogParser
def codelengths(log_directory, output_directory, channel_name, start_date,
                end_date):
    """
        The function iterates through the months in the given date range and computes the infomap number. It then plots a
        box plot for the infomap numbers of all the whole months in the given time period.

    Args:
        log_directory(str): path to the location of Logs
        output_directory(str):  path to the location where the results are to be stored
        channel_name(list): channels for which the analysis is to be done
        start_date(str): starting date ('%Y-%m-%d') for the logs to be analysed. This has to be the beginning of the month.
        end_date(str): ending date ('%Y-%m-%d') for which the logs are to be analysed. This has to be the end of the month.

    Returns:
       null

    """
    # BUG FIX: strptime is a classmethod taking (date_string, format);
    # start_date.strptime('%Y-%m-%d') passed only the format and raised
    # TypeError. Parse the incoming date strings explicitly instead.
    start_date = datetime.datetime.strptime(start_date, '%Y-%m-%d')
    end_date = datetime.datetime.strptime(end_date, '%Y-%m-%d')
    codelengths = []
    for dt in rrule(MONTHLY, dtstart=start_date, until=end_date):
        # Last calendar day of the current month.
        last_day_of_the_month1 = dt + relativedelta(
            months=1) - datetime.timedelta(days=1)
        log_data_m1 = reader.linux_input(
            log_directory, channel_name, dt.strftime("%Y-%m-%d"),
            last_day_of_the_month1.strftime("%Y-%m-%d"))
        nicks_m1, nick_same_list_m1 = nickTracker.nick_tracker(log_data_m1)
        message_number_graph_m1 = network.message_number_graph(
            log_data_m1, nicks_m1, nick_same_list_m1, False)
        try:
            #FOS is a reserved word in igraph and if 'fos' is a username in the nx graph, it generates an error
            saver.save_net_nx_graph(message_number_graph_m1, output_directory,
                                    "message-exchange-" + str(dt.month))
        except:
            # Rename the offending 'fos' node and save the graph again.
            node_labels = message_number_graph_m1.nodes()
            labels = {}
            for label in node_labels:
                if label == "fos":
                    labels[label] = "fos_"
                else:
                    labels[label] = label

            message_number_graph_m1 = nx.relabel_nodes(message_number_graph_m1,
                                                       labels)
            saver.save_net_nx_graph(message_number_graph_m1, output_directory,
                                    "message-exchange-" + str(dt.month))
            print "error in", dt.month

        # BUG FIX: the codelength was previously appended both inside the
        # try block and again unconditionally here, double-counting every
        # month that saved successfully. Compute and record it exactly once.
        msg_igraph, msg_community = community.infomap_igraph(
            ig_graph=None,
            net_file_location=output_directory + "message-exchange-" +
            str(dt.month) + '.net')
        codelengths.append(msg_community.codelength)

    vis.box_plot(codelengths, output_directory, "codelengths2013")
    saver.save_csv([codelengths], output_directory, "codelengths2013")
コード例 #22
0
ファイル: validate.py プロジェクト: soundarya98/IRCLogParser
def keywords_hits_overlap(log_directory, output_directory, channel_name,
                          start_date, end_date):
    """
        The function iterates through the months in the given date range and produces the authorities, top keywords and
        top hubs for the current month and the next month. It also produces the overlap of authorities, top keywords and
        top hubs between the current and the next month.

    Args:
        log_directory(str): path to the location of Logs
        output_directory(str):  path to the location where the results are to be stored
        channel_name(list): channels for which the analysis is to be done
        start_date(datetime): starting date for the logs to be analysed. This has to be the beginning of the month.
        end_date(datetime): ending date for which the logs are to be analysed. This has to be the end of the month.

    Returns:
       null

    """
    start_date = start_date.strptime('%Y-%m-%d')
    end_date = end_date.strptime('%Y-%m-%d')
    for dt in rrule(MONTHLY, dtstart=start_date, until=end_date):
        last_day_of_the_month1 = dt + relativedelta(
            months=1) - datetime.timedelta(days=1)
        log_data_m1 = reader.linux_input(
            log_directory, channel_name, dt.strftime("%Y-%m-%d"),
            last_day_of_the_month1.strftime("%Y-%m-%d"))
        nicks_m1, nick_same_list_m1 = nickTracker.nick_tracker(log_data_m1)
        message_graph_m1, top_hubs_m1, top_keyword_overlap_m1, top_auth_m1 = network.identify_hubs_and_experts(
            log_data_m1, nicks_m1, nick_same_list_m1)
        saver.draw_nx_graph(message_graph_m1, output_directory,
                            "expert-month-" + str(dt.month))

        next_month_dt = dt + relativedelta(months=1)
        last_day_of_the_month2 = next_month_dt + relativedelta(
            months=1) - datetime.timedelta(days=1)
        log_data_m2 = reader.linux_input(
            log_directory, channel_name, next_month_dt.strftime("%Y-%m-%d"),
            last_day_of_the_month2.strftime("%Y-%m-%d"))
        nicks_m2, nick_same_list_m2 = nickTracker.nick_tracker(log_data_m2)
        message_graph_m2, top_hubs_m2, top_keyword_overlap_with_score_m2, top_auth_m2 = network.identify_hubs_and_experts(
            log_data_m2, nicks_m2, nick_same_list_m2)

        print "Top 10 HUBS for Month [HITS]", dt.month, ":", top_hubs_m1
        print "Top 10 HUBS for Month [HITS]", next_month_dt.month, ":", top_hubs_m2
        print "Number of common HUBS (from 10) between above 2 months:", len(
            list(set(top_hubs_m1).intersection(top_hubs_m2)))

        print "Top 10 Experts by keywords for Months", dt.month, ":", top_keyword_overlap_m1
        print "Top 10 Experts by keywords for Months", next_month_dt.month, ":", top_keyword_overlap_with_score_m2
        print "Number of common Experts by keywords (from 10) between above 2 months:", len(
            list(
                set(top_keyword_overlap_m1).intersection(
                    top_keyword_overlap_with_score_m2)))

        print "Top 10 AUTH for Month [HITS]", dt.month, ":", top_auth_m1
        print "Top 10 AUTH for Month [HITS]", next_month_dt.month, ":", top_auth_m2
        print "Number of common AUTH (from 10) between above 2 months:", len(
            list(set(top_auth_m1).intersection(top_auth_m2)))

        print "Number of users common btw HUBS from HITS and Experts by Keywords (from 10) for month", dt.month, ":", len(
            list(set(top_keyword_overlap_m1).intersection(top_hubs_m1)))
        print "Number of users common btw AUTH from HITS and Experts by Keywords (from 10) for month", dt.month, ":", len(
            list(set(top_keyword_overlap_m1).intersection(top_auth_m1)))
        print "Number of users common btw HUBS from HITS and AUTH from HITS (from 10) for month", dt.month, ":", len(
            list(set(top_hubs_m1).intersection(top_auth_m1)))
        print "Number of users common btw HUBS, HITS and KEYWORDS", dt.month, ":", len(
            set(list(set(top_keyword_overlap_m1).intersection(
                top_hubs_m1))).intersection(top_auth_m1))
コード例 #23
0
ファイル: validate.py プロジェクト: soundarya98/IRCLogParser
def correlational_CL_RT_CRT(log_directory, output_directory, start_date,
                            end_date):
    """
        Correlational : statistical distribution as illustrated by box plot for RT, CL, CRT parameters. The function
        takes the given time duration and selects one month at a time for generation of a degree distribution sample. Each
        degree distribution sample shall have 3 curve fit parameters namely a,b & c. The function collects these parameters
        for all the months of the given time duration. The function produces box plot separately for each parameter.


    Args:
        log_directory(str): path to the location of Logs
        output_directory(str):  path to the location where the results are to be stored
        start_date(str/datetime): starting date ("YYYY-MM-DD") for the logs to be analysed. This has to be the beginning of the month.
        end_date(str/datetime): ending date ("YYYY-MM-DD") for which the logs are to be analysed. This has to be the end of the month.

    Returns:
       null

    """
    # BUGFIX: the original called start_date.strptime('%Y-%m-%d'), which always
    # raises TypeError -- datetime.strptime is a classmethod that needs BOTH the
    # date string and the format. Parse "YYYY-MM-DD" strings here, and pass
    # through values that are already datetime-like (they expose strftime).
    if not hasattr(start_date, "strftime"):
        start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d")
    if not hasattr(end_date, "strftime"):
        end_date = datetime.datetime.strptime(end_date, "%Y-%m-%d")
    # BUGFIX: capture the configured default ONCE, before any iteration mutates
    # config.CUTOFF_PERCENTILE. The original re-read it inside the month loop,
    # so from the second iteration onwards it captured the previous cutoff value
    # instead of the real default (and raised NameError at the end of the
    # function whenever the loops did not run).
    default_cutoff = config.CUTOFF_PERCENTILE
    percentiles = [0, 1, 5, 10, 20]
    for channel_name_iter in [["#kubuntu-devel"], ["#ubuntu-devel"],
                              ["#kubuntu"]]:
        for cutoff in percentiles:
            # One row per month; columns hold the curve-fit parameters.
            conv_len_curve_fit_parameters = np.zeros((12, 4))
            resp_time_curve_fit_parameters = np.zeros((12, 4))
            conv_ref_time_curve_fit_parameters = np.zeros((12, 5))
            for dt in rrule(MONTHLY, dtstart=start_date, until=end_date):
                # Last day of the month: first day of next month minus one day.
                last_day_of_the_month = dt + relativedelta(
                    months=1) - datetime.timedelta(days=1)

                log_data = reader.linux_input(
                    log_directory, channel_name_iter, dt.strftime("%Y-%m-%d"),
                    last_day_of_the_month.strftime("%Y-%m-%d"))
                nicks, nick_same_list = nickTracker.nick_tracker(log_data)

                config.CUTOFF_PERCENTILE = cutoff
                truncated_rt, rt_cutoff_time = channel.response_time(
                    log_data, nicks, nick_same_list, config.CUTOFF_PERCENTILE)
                conv_len, conv_ref_time = channel.conv_len_conv_refr_time(
                    log_data, nicks, nick_same_list, rt_cutoff_time,
                    config.CUTOFF_PERCENTILE)
                conv_len_curve_fit_parameters[
                    dt.month - 1] = vis.exponential_curve_fit_and_plot(
                        conv_len, output_directory,
                        "conv_len_cutoff" + str(cutoff))
                resp_time_curve_fit_parameters[
                    dt.month - 1] = vis.exponential_curve_fit_and_plot(
                        truncated_rt, output_directory,
                        "resp_time_cutoff" + str(cutoff))
                conv_ref_time_curve_fit_parameters[
                    dt.month -
                    1] = vis.exponential_curve_fit_and_plot_x_shifted(
                        conv_ref_time, output_directory,
                        "conv_ref_time_cutoff" + str(cutoff))

            # Box-plot and persist each curve-fit parameter across the months.
            parameters = ['a', 'b', 'c']
            for para_ind in range(len(parameters)):
                vis.box_plot(
                    conv_len_curve_fit_parameters[:, para_ind],
                    output_directory, "conv_len_" + str(parameters[para_ind]) +
                    "_2013_" + channel_name_iter[0] + "_cut_" + str(cutoff))
                vis.box_plot(
                    resp_time_curve_fit_parameters[:,
                                                   para_ind], output_directory,
                    "resp_time_" + str(parameters[para_ind]) + "_2013_" +
                    channel_name_iter[0] + "_cut_" + str(cutoff))
                vis.box_plot(
                    conv_ref_time_curve_fit_parameters[:, para_ind],
                    output_directory,
                    "conv_refr_" + str(parameters[para_ind]) + "_2013_" +
                    channel_name_iter[0] + "_cut_" + str(cutoff))

                saver.save_csv(
                    [conv_len_curve_fit_parameters[:, para_ind].tolist()],
                    output_directory, "conv_len_" + str(parameters[para_ind]) +
                    "_2013_" + channel_name_iter[0] + "_cut_" + str(cutoff))
                saver.save_csv(
                    [resp_time_curve_fit_parameters[:, para_ind].tolist()],
                    output_directory,
                    "resp_time_" + str(parameters[para_ind]) + "_2013_" +
                    channel_name_iter[0] + "_cut_" + str(cutoff))
                saver.save_csv(
                    [conv_ref_time_curve_fit_parameters[:, para_ind].tolist()],
                    output_directory,
                    "conv_refr_" + str(parameters[para_ind]) + "_2013_" +
                    channel_name_iter[0] + "_cut_" + str(cutoff))

    # Restore the globally-shared cutoff so later analyses see the default.
    config.CUTOFF_PERCENTILE = default_cutoff
コード例 #24
0
ファイル: sample.py プロジェクト: krishnacharya/IRCLogParser
import sys
sys.path.insert(0, "IRCLogParser/")
from lib.in_out import reader, saver
from lib import nickTracker, config, vis, validate
from lib.analysis import network, channel, user, community

# Sample driver script: reads the configured IRC logs and runs the main
# graph analyses end-to-end. All knobs come from lib.config.
log_directory = config.LOG_DIRECTORY
channel_name = config.CHANNEL_NAME
starting_date = config.STARTING_DATE
ending_date = config.ENDING_DATE
output_directory = config.OUTPUT_DIRECTORY

# ============== INPUT==================
# Parse the raw logs for the configured channel(s) and date range, then
# resolve nick aliases so one user maps to one identity.
log_data = reader.linux_input(log_directory, channel_name, starting_date,
                              ending_date)
nicks, nick_same_list = nickTracker.nick_tracker(log_data)

# ============== ANALYSIS =============
# Message-exchange graphs: the boolean flag selects aggregate (False) vs
# per-day list (True) output -- presumably; confirm against network module.
message_number_graph = network.message_number_graph(log_data, nicks,
                                                    nick_same_list, False)
message_number_graph_day_list = network.message_number_graph(
    log_data, nicks, nick_same_list, True)
# Degree statistics on the aggregate message-exchange graph.
degree_anal_message_numder = network.degree_analysis_on_graph(
    message_number_graph)
# Time-based message graphs, per-day (True) and aggregate (False).
message_time_graph_list = network.message_time_graph(log_data, nicks,
                                                     nick_same_list, True)
message_time_graph = network.message_time_graph(log_data, nicks,
                                                nick_same_list, False)
out_degree_node_number, in_degree_node_number, total_degree_node_number = network.degree_node_number_csv(
    log_data, nicks, nick_same_list)
# Graph(s) of nick-change events per user.
nick_change_graph_list = user.nick_change_graph(log_data, True)
コード例 #25
0
import json
from os import path
sys.path.insert(0, '../IRCLogParser')
sys.path.append(path.dirname(path.dirname(path.abspath(__file__))))
from lib.analysis import network, user, channel
from lib.nickTracker import nick_tracker
from ddt import ddt, data, unpack
from lib import config
from lib.in_out import reader
import networkx as nx
current_dir = os.path.dirname(__file__)

log_directory = os.path.join(current_dir, 'data/input/')
expected_output_directory = os.path.join(current_dir, 'data/output/')
channel_name = config.CHANNEL_NAME
log_for_jan = reader.linux_input(log_directory, channel_name, "2013-1-1",
                                 "2013-1-31")
nicks_for_jan, nick_same_list_for_jan = nick_tracker(log_for_jan)
log_for_aug = reader.linux_input(log_directory, channel_name, "2013-8-1",
                                 "2013-8-31")
nicks_for_aug, nick_same_list_for_aug = nick_tracker(log_for_aug)


def update_expected_output_directory(log_data):
    """Repoint the module-level expected_output_directory at the year
    sub-folder matching the month covered by log_data.

    Args:
        log_data(dict): parsed monthly log data keyed by date; all keys are
            assumed to share the same month and year.

    NOTE(review): this sample looks truncated by the scraper -- `temp` is
    computed but never used here; presumably the month string is appended to
    the path further down in the original file.
    """
    key = list(
        log_data.keys()
    )[0]  #get any key as months and year will be same since log_data has monthly data
    global expected_output_directory
    expected_output_directory = os.path.join(
        current_dir, 'data/output/' + str(key.year) + '/')
    month = key.month
    temp = str(month)
コード例 #26
0
ファイル: validate.py プロジェクト: soundarya98/IRCLogParser
def box_plot_for_degree(log_directory, output_directory, channel_name,
                        start_date, end_date):
    """
        Correlational : statistical distribution of curve fit parameters generated for degree distribution. The function
        takes the given time duration and selects one month at a time for generation of a degree distribution sample. Each
        degree distribution sample shall have 3 curve fit parameters namely slope, intercept & r_square. The function collects these parameters
        for all the months of the given time duration. The function produces box plot separately for each parameter.

    Args:
        log_directory(str): path to the location of Logs
        output_directory(str):  path to the location where the results are to be stored
        channel_name(list): channels for which the analysis is to be done.
        start_date(str/datetime): starting date ("YYYY-MM-DD") for the logs to be analysed. This has to be the beginning of the month.
        end_date(str/datetime): ending date ("YYYY-MM-DD") for which the logs are to be analysed. This has to be the end of the month.

    Returns:
       null

    """
    # BUGFIX: the original called start_date.strptime('%Y-%m-%d'), which always
    # raises TypeError -- datetime.strptime is a classmethod that needs BOTH the
    # date string and the format. Parse "YYYY-MM-DD" strings here, and pass
    # through values that are already datetime-like (they expose strftime).
    if not hasattr(start_date, "strftime"):
        start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d")
    if not hasattr(end_date, "strftime"):
        end_date = datetime.datetime.strptime(end_date, "%Y-%m-%d")
    cutoff = 0
    for channel_name_iter in channel_name:
        # One row per month; columns hold the log-plot fit parameters.
        out_degree_fit_parameters = np.zeros((12, 4))
        in_degree_fit_parameters = np.zeros((12, 4))
        total_degree_fit_parameters = np.zeros((12, 4))
        for dt in rrule(MONTHLY, dtstart=start_date, until=end_date):
            # Last day of the month: first day of next month minus one day.
            last_day_of_the_month = dt + relativedelta(
                months=1) - datetime.timedelta(days=1)
            log_data = reader.linux_input(
                log_directory, channel_name_iter, dt.strftime("%Y-%m-%d"),
                last_day_of_the_month.strftime("%Y-%m-%d"))
            nicks, nick_same_list = nickTracker.nick_tracker(log_data)

            message_number_graph = network.message_number_graph(
                log_data, nicks, nick_same_list, False)
            degree_anal_message_number = network.degree_analysis_on_graph(
                message_number_graph)

            # Fit the month's degree distributions; store the fit parameters
            # in the row for this month (dt.month is 1-based).
            out_degree_fit_parameters[dt.month - 1] = vis.generate_log_plots(
                degree_anal_message_number["out_degree"]["raw_for_vis"],
                output_directory, channel_name_iter[0])
            in_degree_fit_parameters[dt.month - 1] = vis.generate_log_plots(
                degree_anal_message_number["in_degree"]["raw_for_vis"],
                output_directory, channel_name_iter[0])
            total_degree_fit_parameters[dt.month - 1] = vis.generate_log_plots(
                degree_anal_message_number["total_degree"]["raw_for_vis"],
                output_directory, channel_name_iter[0])

        # Box-plot and persist each fit parameter across the months.
        parameters = ['slope', 'intercept', 'r_square']
        for para_ind in range(len(parameters)):
            vis.box_plot(
                out_degree_fit_parameters[:, para_ind], output_directory,
                "out_degree_" + str(parameters[para_ind]) + "_2013_" +
                channel_name_iter[0] + "_cut_" + str(cutoff))
            vis.box_plot(
                in_degree_fit_parameters[:, para_ind], output_directory,
                "in_degree_" + str(parameters[para_ind]) + "_2013_" +
                channel_name_iter[0] + "_cut_" + str(cutoff))
            vis.box_plot(
                total_degree_fit_parameters[:, para_ind], output_directory,
                "total_degree_" + str(parameters[para_ind]) + "_2013_" +
                channel_name_iter[0] + "_cut_" + str(cutoff))

            saver.save_csv([out_degree_fit_parameters[:, para_ind].tolist()],
                           output_directory, "out_degree_" +
                           str(parameters[para_ind]) + "_2013_" +
                           channel_name_iter[0] + "_cut_" + str(cutoff))
            saver.save_csv([in_degree_fit_parameters[:, para_ind].tolist()],
                           output_directory, "in_degree_" +
                           str(parameters[para_ind]) + "_2013_" +
                           channel_name_iter[0] + "_cut_" + str(cutoff))
            saver.save_csv([total_degree_fit_parameters[:, para_ind].tolist()],
                           output_directory, "total_degree_" +
                           str(parameters[para_ind]) + "_2013_" +
                           channel_name_iter[0] + "_cut_" + str(cutoff))
コード例 #27
0
ファイル: ubuntu.py プロジェクト: Kausam/IRCLogParser
from lib.in_out import reader, saver
import lib.nickTracker as nickTracker, lib.config as config, lib.vis as vis, lib.validate as validate, lib.util as util
from lib.analysis import network, channel, user, community
import numpy as np
import networkx as nx
# Driver script for the Ubuntu-channel analyses; all knobs come from lib.config.
log_directory = config.LOG_DIRECTORY
channel_name = config.CHANNEL_NAME
starting_date = config.STARTING_DATE
ending_date = config.ENDING_DATE
output_directory = config.OUTPUT_DIRECTORY

# Labels used when iterating degree / presence result categories below.
degree_type = ["out_degree", "in_degree", "total_degree"]
presence_type = ["CC", "UU", "CU"]

# ============== INPUT==================
# Parse the raw logs and resolve nick aliases to canonical identities.
log_data = reader.linux_input(log_directory, channel_name, starting_date, ending_date)
nicks, nick_same_list = nickTracker.nick_tracker(log_data)

# ============== ANALYSIS =============
# Aggregate message-exchange graph and its degree statistics.
message_number_graph = network.message_number_graph(log_data, nicks, nick_same_list, False)

degree_anal_message_number = network.degree_analysis_on_graph(message_number_graph)

# Per-bin message counts; build [bin_index_row, per_bin_totals_row] for saving.
bin_matrix, total_messages = network.message_number_bins_csv(log_data, nicks, nick_same_list)
data = [[i for i in range(len(bin_matrix[0]))]]
data.append([sum(i) for i in zip(*bin_matrix)])

# Remember the configured cutoff so it can be restored after the sweep below.
default_cutoff = config.CUTOFF_PERCENTILE
percentiles = [0, 1, 5, 10, 20]
for cutoff in percentiles: