def nick_change_graph(log_dict, DAY_BY_DAY_ANALYSIS=False):
    """Creates a graph which tracks the nick changes of the users, where each
    edge has a time stamp denoting the time at which the nick was changed.

    Args:
        log_dict (dict): dictionary of logs created using reader.py
        DAY_BY_DAY_ANALYSIS (bool): if True return one graph per day.

    Returns:
        list of day-to-day nick-change graphs if DAY_BY_DAY_ANALYSIS=True,
        otherwise a single aggregate nick-change graph for the whole period.
    """
    rem_time = None  # time of the last message of the previously parsed file
    nick_change_day_list = []
    # graph for nick changes over the whole time span (not day to day)
    aggregate_nick_change_graph = nx.MultiDiGraph()

    for day_content_all_channels in log_dict.values():
        for day_content in day_content_all_channels:
            day_log = day_content["log_data"]
            today_nick_change_graph = nx.MultiDiGraph()  # using networkx

            for current_line_no, line in enumerate(day_log):
                # nick-change lines start with '='; exclude topic changes
                if line[0] == '=' and "changed the topic of" not in line:
                    nick1 = util.splice_find(line, "=", " is", 3)
                    nick2 = util.splice_find(line, "wn as", "\n", 5)
                    # search backwards for the nearest real message line so
                    # its time stamp can serve as the time of the nick change
                    found_earlier_msg = False
                    for earlier_line_no in range(current_line_no - 1, -1, -1):
                        if day_log[earlier_line_no][0] != '=':
                            year, month, day = util.get_year_month_day(day_content)
                            util.build_graphs(nick1, nick2,
                                              day_log[earlier_line_no][1:6],
                                              year, month, day,
                                              today_nick_change_graph,
                                              aggregate_nick_change_graph)
                            found_earlier_msg = True
                            break
                    if not found_earlier_msg:
                        # BUG FIX: the old while-loop decremented the index to
                        # -1 and read day_log[-1] (Python wraps to the *last*
                        # line of the day), picking up a wrong, later time.
                        # When no earlier message exists, fall back to the
                        # last message time of the previous file instead.
                        today_nick_change_graph.add_edge(nick1, nick2,
                                                         weight=rem_time)
                        aggregate_nick_change_graph.add_edge(nick1, nick2,
                                                             weight=rem_time)

            # remember the time of the last message of this file so the next
            # file can use it as a fallback time stamp
            for count in range(len(day_log) - 1, -1, -1):
                if day_log[count][0] != '=':
                    rem_time = day_log[count][1:6]
                    break

            nick_change_day_list.append(today_nick_change_graph)

    if DAY_BY_DAY_ANALYSIS:
        return nick_change_day_list
    else:
        return aggregate_nick_change_graph
def parse_log_lines_for_conv(log_dict, nicks, conn_comp_list, conversations):
    """Scan all log lines and record sender/receiver message exchanges.

    Args:
        log_dict (dict): dictionary of logs created using reader.py
        nicks (list): list of all nicknames
        conn_comp_list (list): connected components grouping aliases of a user
        conversations (list): per-user-pair conversation accumulator

    Returns:
        tuple: (conversations, nick_receiver, send_time) reflecting the state
        after the final processed message line.
    """
    dateadd = -1  # day offset used for response time calculation (0-365)
    for day_content_all_channels in log_dict.values():
        for day_content in day_content_all_channels:
            day_log = day_content["log_data"]
            dateadd += 1
            send_time = []  # times at which a user messaged another user
            # build the relation map between clients
            for line in day_log:
                if not util.check_if_msg_line(line):
                    continue
                comma_seen = False
                nick_sender = ""
                nick_receiver = ""
                match = re.search(r"\<(.*?)\>", line)
                nick_to_search = util.correctLastCharCR(match.group(0)[1:-1])
                nick_sender = util.get_nick_sen_rec(
                    len(nicks), nick_to_search, conn_comp_list, nick_sender)
                for nick in nicks:
                    rec_list = [token.strip() for token in line.split(':')]
                    util.rec_list_splice(rec_list)
                    if not rec_list[1]:  # index 0 holds the time stamp
                        break
                    rec_list = util.correct_last_char_list(rec_list)
                    conversations, nick_receiver, send_time = \
                        build_conversation(rec_list, nick, send_time,
                                           nick_to_search, nick_receiver,
                                           nick_sender, dateadd,
                                           conversations, conn_comp_list,
                                           line)
                    if "," in rec_list[1]:
                        comma_seen = True
                        rec_list_2 = [token.strip()
                                      for token in rec_list[1].split(',')]
                        rec_list_2 = util.correct_last_char_list(rec_list_2)
                        conversations, nick_receiver, send_time = \
                            build_conversation(rec_list_2, nick, send_time,
                                               nick_to_search, nick_receiver,
                                               nick_sender, dateadd,
                                               conversations, conn_comp_list,
                                               line)
                    if not comma_seen:
                        rec = util.splice_find(line, ">", ", ", 1)
                        conversations, nick_receiver, send_time = \
                            conv_helper(rec, nick, send_time, nick_to_search,
                                        nick_receiver, nick_sender, dateadd,
                                        conversations, conn_comp_list, line)
    return conversations, nick_receiver, send_time
def nick_change_graph(log_dict, DAY_BY_DAY_ANALYSIS=False):
    """Creates a graph which tracks the nick changes of the users, where each
    edge has a time stamp denoting the time at which the nick was changed.

    Args:
        log_dict (dict): dictionary of logs created using reader.py
        DAY_BY_DAY_ANALYSIS (bool): if True return one graph per day.

    Returns:
        list of day-to-day nick-change graphs if DAY_BY_DAY_ANALYSIS=True,
        otherwise a single aggregate nick-change graph for the whole period.
    """
    rem_time = None  # time of the last message of the previously parsed file
    nick_change_day_list = []
    # graph for nick changes over the whole time span (not day to day)
    aggregate_nick_change_graph = nx.MultiDiGraph()

    for day_content_all_channels in log_dict.values():
        for day_content in day_content_all_channels:
            day_log = day_content["log_data"]
            today_nick_change_graph = nx.MultiDiGraph()  # using networkx

            for current_line_no, line in enumerate(day_log):
                # nick-change lines start with '='; exclude topic changes
                if line[0] == '=' and "changed the topic of" not in line:
                    nick1 = util.splice_find(line, "=", " is", 3)
                    nick2 = util.splice_find(line, "wn as", "\n", 5)
                    # search backwards for the nearest real message line so
                    # its time stamp can serve as the time of the nick change
                    found_earlier_msg = False
                    for earlier_line_no in range(current_line_no - 1, -1, -1):
                        if day_log[earlier_line_no][0] != '=':
                            year, month, day = util.get_year_month_day(day_content)
                            util.build_graphs(nick1, nick2,
                                              day_log[earlier_line_no][1:6],
                                              year, month, day,
                                              today_nick_change_graph,
                                              aggregate_nick_change_graph)
                            found_earlier_msg = True
                            break
                    if not found_earlier_msg:
                        # BUG FIX: the old while-loop decremented the index to
                        # -1 and read day_log[-1] (Python wraps to the *last*
                        # line of the day), picking up a wrong, later time.
                        # When no earlier message exists, fall back to the
                        # last message time of the previous file instead.
                        today_nick_change_graph.add_edge(nick1, nick2,
                                                         weight=rem_time)
                        aggregate_nick_change_graph.add_edge(nick1, nick2,
                                                             weight=rem_time)

            # remember the time of the last message of this file so the next
            # file can use it as a fallback time stamp
            for count in range(len(day_log) - 1, -1, -1):
                if day_log[count][0] != '=':
                    rem_time = day_log[count][1:6]
                    break

            nick_change_day_list.append(today_nick_change_graph)

    if DAY_BY_DAY_ANALYSIS:
        return nick_change_day_list
    else:
        return aggregate_nick_change_graph
def keywords(log_dict, nicks, nick_same_list): """ Returns keywods for all users Args: log_dict (str): Dictionary of logs data created using reader.py nicks(List) : list of nickname created using nickTracker.py nick_same_list :List of same_nick names created using nickTracker.py Returns keywords_filtered: filtered keywords for user user_keyword_freq_dict: dictionary for each user having keywords and their frequency user_words_dict: keywods for user nicks_for_stop_words: stop words """ user_words_dict = [] user_keyword_freq_dict = [] keywords_filtered = [] no_messages = 0 def get_nick_receiver(nick_receiver, rec, nick_to_compare, nick_name, nicks, nick_same_list): if (rec == nick_name): if (nick_to_compare != nick_name): nick_receiver = iter_nicks(nick_receiver, nicks, nick_same_list, nick_name) return nick_receiver def iter_nicks(nick_sender_receiver, nicks, nick_same_list, nick_comp): for i in range(len(nicks)): if nick_comp in nick_same_list[i]: nick_sender_receiver = nick_same_list[i][0] break else: nick_sender_receiver = nick_comp return nick_sender_receiver for day_content_all_channels in log_dict.values(): for day_content in day_content_all_channels: day_log = day_content["log_data"] for line in day_log: flag_comma = 0 if (util.check_if_msg_line(line)): m = re.search(r"\<(.*?)\>", line) nick_to_compare = util.correctLastCharCR( (m.group(0)[1:-1])) nick_sender = '' nick_sender = iter_nicks(nick_sender, nicks, nick_same_list, nick_to_compare) nick_receiver = '' for nick_name in nicks: rec_list = [e.strip() for e in line.split(':') ] #receiver list splited about : util.rec_list_splice(rec_list) if not rec_list[1]: #index 0 will contain time 14:02 break rec_list = util.correct_last_char_list(rec_list) for rec in rec_list: nick_receiver = get_nick_receiver( nick_receiver, rec, nick_to_compare, nick_name, nicks, nick_same_list) if "," in rec_list[ 1]: #receiver list may of the form <Dhruv> Rohan, Ram : flag_comma = 1 rec_list_2 = [ e.strip() for e in 
rec_list[1].split(',') ] rec_list_2 = util.correct_last_char_list( rec_list_2) for rec in rec_list_2: nick_receiver = get_nick_receiver( nick_receiver, rec, nick_to_compare, nick_name, nicks, nick_same_list) if (flag_comma == 0 ): #receiver list can be <Dhruv> Rohan, Hi! rec = util.splice_find(line, ">", ", ", 1) nick_receiver = get_nick_receiver( nick_receiver, rec, nick_to_compare, nick_name, nicks, nick_same_list) #generating the words written by the sender message = rec_list[1:] no_messages += 1 correctedNickReciever = util.correct_nick_for_( nick_receiver) if correctedNickReciever in message: message.remove(correctedNickReciever) lmtzr = WordNetLemmatizer() #limit word size = 3, drop numbers. word_list_temp = re.sub( r'\d+', '', " ".join( re.findall(r'\w{3,}', ":".join(message).replace( ",", " ")))).split(" ") word_list = [] #remove punctuations for word in word_list_temp: word = word.lower() word_list.append(word.replace("'", "")) word_list_lemmatized = [] try: word_list_lemmatized = map( lmtzr.lemmatize, map(lambda x: lmtzr.lemmatize(x, 'v'), word_list)) except UnicodeDecodeError: pass fr = 1 for dic in user_words_dict: if dic['sender'] == nick_sender: dic['words'].extend(word_list_lemmatized) fr = 0 if fr: user_words_dict.append({ 'sender': nick_sender, 'words': word_list_lemmatized }) nicks_for_stop_words = [] stop_word_without_apostrophe = [] for l in nick_same_list: nicks_for_stop_words.extend(l) for dictonary in user_words_dict: nicks_for_stop_words.append(dictonary['sender']) nicks_for_stop_words.extend([x.lower() for x in nicks_for_stop_words]) for words in common_english_words.words: stop_word_without_apostrophe.append(words.replace("'", "")) stop_words_extended = extended_stop_words(nicks_for_stop_words, stop_word_without_apostrophe) count_vect = CountVectorizer(analyzer='word', stop_words=stop_words_extended, min_df=1) for dictonary in user_words_dict: try: matrix = count_vect.fit_transform(dictonary['words']) freqs = [[word, 
matrix.getcol(idx).sum()] for word, idx in count_vect.vocabulary_.items()] keywords = sorted(freqs, key=lambda x: -x[1]) total_freq = 0.0 for freq_tuple in keywords: total_freq += freq_tuple[1] for freq_tuple in keywords: freq_tuple.append(round(freq_tuple[1] / float(total_freq), 5)) user_keyword_freq_dict.append({ 'nick': dictonary['sender'], 'keywords': keywords }) except ValueError: pass for data in user_keyword_freq_dict: keywords, normal_scores = top_keywords_for_nick( user_keyword_freq_dict, data['nick'], config.KEYWORDS_THRESHOLD, config.KEYWORDS_MIN_WORDS) if config.DEBUGGER: print "Nick:", data['nick'] print "Keywords with normalised score > 0.01\n", keywords print "Their Normal scores\n", normal_scores print "\n" if keywords: keywords_filtered.append({ 'nick': data['nick'], 'keywords': keywords }) return keywords_filtered, user_keyword_freq_dict, user_words_dict, nicks_for_stop_words
def test_splice_find(self, line, search_param1, search_param2, splice_index,
                     expected_result):
    """Checks util.splice_find against the expected spliced substring."""
    actual = util.splice_find(line, search_param1, search_param2,
                              splice_index)
    self.assertEqual(actual, expected_result)
def response_time(log_dict, nicks, nick_same_list, cutoff_percentile):
    """Finds the response time of a message, i.e. the best guess for the
    time at which one can expect a reply to a message.

    Args:
        log_dict (dict): dictionary of log data created using reader.py
        nicks (list): list of nicknames created using nickTracker.py
        nick_same_list (list): list of lists of nicks of the same user
        cutoff_percentile (int): cutoff percentile indicating statistical
            significance

    Returns:
        tuple: (truncated_rt, rt_cutoff_time); both None when no response
        times were observed.
    """
    G = util.to_graph(nick_same_list)
    conn_comp_list = list(connected_components(G))
    util.create_connected_nick_list(conn_comp_list)
    graph_cumulative = []
    graph_x_axis = []
    graph_y_axis = []

    def build_mean_list(conversations, index, mean_list):
        # Entries from position 2 onwards are response times (positions 0
        # and 1 hold the sender and receiver nicks).
        mean_list.extend(conversations[index][2:])
        return mean_list

    def resp_helper(rec, nick, send_time, nick_to_search, nick_receiver,
                    nick_sender, conversations, conn_comp_list):
        # `line` is the current log line of the enclosing loop (closure).
        if rec == nick:
            send_time.append(line[1:6])
            if nick_to_search != nick:
                nick_receiver = util.get_nick_sen_rec(
                    len(nicks), nick, conn_comp_list, nick_receiver)
            for slot in range(config.MAX_RESPONSE_CONVERSATIONS):
                if (nick_sender in conversations[slot]
                        and nick_receiver in conversations[slot]):
                    conversations[slot].append(line[1:6])
                    break
                if not conversations[slot]:
                    conversations[slot].append(nick_sender)
                    conversations[slot].append(nick_receiver)
                    conversations[slot].append(line[1:6])
                    break
        return conversations, nick_receiver, send_time

    for day_content_all_channels in log_dict.values():
        for day_content in day_content_all_channels:
            day_log = day_content["log_data"]
            send_time = []  # times at which a user messaged another user
            meanstd_list = []
            totalmeanstd_list = []
            x_axis = []
            y_axis = []
            real_y_axis = []
            conversations = [
                [] for _ in range(config.MAX_RESPONSE_CONVERSATIONS)]
            # build the relation map between clients
            for line in day_log:
                flag_comma = 0
                if util.check_if_msg_line(line):
                    nick_sender = ""
                    nick_receiver = ""
                    match = re.search(r"\<(.*?)\>", line)
                    nick_to_search = util.correctLastCharCR(
                        match.group(0)[1:-1])
                    nick_sender = util.get_nick_sen_rec(
                        len(nicks), nick_to_search, conn_comp_list,
                        nick_sender)
                    for nick in nicks:
                        rec_list = [e.strip() for e in line.split(':')]
                        util.rec_list_splice(rec_list)
                        if not rec_list[1]:
                            break
                        rec_list = util.correct_last_char_list(rec_list)
                        for name in rec_list:
                            conversations, nick_receiver, send_time = \
                                resp_helper(name, nick, send_time,
                                            nick_to_search, nick_receiver,
                                            nick_sender, conversations,
                                            conn_comp_list)
                        if "," in rec_list[1]:
                            flag_comma = 1
                            rec_list_2 = [e.strip()
                                          for e in rec_list[1].split(',')]
                            rec_list_2 = util.correct_last_char_list(
                                rec_list_2)
                            for name in rec_list_2:
                                conversations, nick_receiver, send_time = \
                                    resp_helper(name, nick, send_time,
                                                nick_to_search,
                                                nick_receiver, nick_sender,
                                                conversations,
                                                conn_comp_list)
                        if flag_comma == 0:
                            rec = util.splice_find(line, ">", ", ", 1)
                            conversations, nick_receiver, send_time = \
                                resp_helper(rec, nick, send_time,
                                            nick_to_search, nick_receiver,
                                            nick_sender, conversations,
                                            conn_comp_list)

            # convert consecutive "HH:MM" stamps into deltas in minutes
            for i in range(config.MAX_RESPONSE_CONVERSATIONS):
                if conversations[i]:
                    for j in range(2, len(conversations[i]) - 1):
                        conversations[i][j] = (
                            int(conversations[i][j + 1][0:2]) *
                            config.MINS_PER_HOUR +
                            int(conversations[i][j + 1][3:5])) - (
                            int(conversations[i][j][0:2]) *
                            config.MINS_PER_HOUR +
                            int(conversations[i][j][3:5]))

            for i in range(config.MAX_RESPONSE_CONVERSATIONS):
                if conversations[i]:
                    if len(conversations[i]) == 3:
                        # a single time stamp: convert it in place
                        conversations[i][2] = (
                            int(conversations[i][2][0:2]) *
                            config.MINS_PER_HOUR +
                            int(conversations[i][2][3:5]))
                    else:
                        # drop the last, still unconverted, time stamp
                        del conversations[i][-1]

            for i in range(config.MAX_RESPONSE_CONVERSATIONS):
                if conversations[i]:
                    totalmeanstd_list = build_mean_list(
                        conversations, i, totalmeanstd_list)

            if totalmeanstd_list:
                for i in range(max(totalmeanstd_list) + 1):
                    x_axis.append(i)
                for i in x_axis:
                    # probability of each RT = occurrences / total occurrences
                    y_axis.append(float(totalmeanstd_list.count(i)) /
                                  float(len(totalmeanstd_list)))
                # cumulative distribution: keep adding the current value to
                # the previously cumulated value (last entry sums to 1)
                real_y_axis.append(y_axis[0])
                for i in range(len(y_axis)):
                    real_y_axis.append(float(real_y_axis[i - 1]) +
                                       float(y_axis[i]))
                for value in totalmeanstd_list:
                    graph_cumulative.append(value)
            if len(totalmeanstd_list) > 0:
                totalmeanstd_list.append(numpy.mean(totalmeanstd_list))
                totalmeanstd_list.append(numpy.mean(totalmeanstd_list) +
                                         2 * numpy.std(totalmeanstd_list))

            for i in range(config.MAX_RESPONSE_CONVERSATIONS):
                if conversations[i]:
                    meanstd_list = build_mean_list(conversations, i,
                                                   meanstd_list)
                    conversations[i].append(numpy.mean(meanstd_list))
                    conversations[i].append(numpy.mean(meanstd_list) +
                                            (2 * numpy.std(meanstd_list)))
                    meanstd_list[:] = []

    graph_cumulative.sort()
    truncated_rt = None
    rt_cutoff_time = None
    if graph_cumulative:
        # frequency of every RT value from 0 to the maximum observed
        for i in range(graph_cumulative[-1] + 1):
            # problem when ti=0: count is unexpectedly large
            graph_y_axis.append(graph_cumulative.count(i))
            graph_x_axis.append(i)
        # store the RT values along with their frequencies; no need to
        # invoke build_stat_dist()
        rows_rt = zip(graph_x_axis, graph_y_axis)
        truncated_rt, rt_cutoff_time = truncate_table(rows_rt,
                                                      cutoff_percentile)
        if config.CUTOFF_TIME_STRATEGY == "TWO_SIGMA":
            resp_time, resp_frequency_tuple = zip(*truncated_rt)
            resp_frequency = list(resp_frequency_tuple)
            rt_cutoff_time_frac = (numpy.mean(resp_frequency) +
                                   2 * numpy.std(resp_frequency))
            rt_cutoff_time = int(numpy.ceil(rt_cutoff_time_frac))
    return truncated_rt, rt_cutoff_time
def nick_tracker(log_dict, track_users_on_channels = False): """ Tracks all nicks and the identifies nicks which point to same user Args: log_dict(dictionary): with key as dateTime.date object and value as {"data":datalist,"channel_name":channels name} Returns: nicks(list): all nicks nick_same_list(list): list of lists with each list corresponding to nicks of same user """ nicks = [] # list of all the nicknames nick_same_list = [[] for i in range(config.MAX_EXPECTED_DIFF_NICKS)] nick_channel_dict = [] channels_for_user = [] nicks_hash = [] channels_hash = [] #Getting all the nicknames in a list def nick_append(nick, nicks, nicks_today_on_this_channel, track_users_on_channels): if track_users_on_channels and (nick not in nicks_today_on_this_channel): nicks_today_on_this_channel.append(nick) #not nicks as there are same nicks spread across multiple channels nicks.append(nick) elif nick not in nicks: nicks.append(nick) return nicks, nicks_today_on_this_channel for day_content_all_channels in log_dict.values(): #traverse over data of different channels for that day channels_for_user_day = {}#empty for next day usage for day_content in day_content_all_channels: day_log = day_content["log_data"] channel_name = day_content["auxiliary_data"]["channel"] nicks_today_on_this_channel = [] for i in day_log: # use regex to get the string between <> and appended it to the nicks list if(util.check_if_msg_line (i)): m = re.search(r"\<(.*?)\>", i) nick = util.correctLastCharCR(m.group(0)[1:-1]) nicks, nicks_today_on_this_channel = nick_append(nick, nicks, nicks_today_on_this_channel, track_users_on_channels) ''' Forming list of lists for avoiding nickname duplicacy ''' for line in day_log: if(line[0] == '=' and "changed the topic of" not in line): old_nick = util.splice_find(line, "=", " is", 3) new_nick = util.splice_find(line, "wn as", "\n", 5) nicks, nicks_today_on_this_channel = nick_append(old_nick, nicks, nicks_today_on_this_channel, track_users_on_channels) nicks, 
nicks_today_on_this_channel = nick_append(new_nick, nicks, nicks_today_on_this_channel, track_users_on_channels) #nicks.append(new_nick) for i in range(config.MAX_EXPECTED_DIFF_NICKS): if old_nick in nick_same_list[i] or new_nick in nick_same_list[i]: if old_nick not in nick_same_list[i]: nick_same_list[i].append(old_nick) if new_nick not in nick_same_list[i]: nick_same_list[i].append(new_nick) break if not nick_same_list[i]: if old_nick not in nick_same_list[i]: nick_same_list[i].append(old_nick) if new_nick not in nick_same_list[i]: nick_same_list[i].append(new_nick) break if track_users_on_channels: ''' Creating list of dictionaries nick_channel_dict of the format : [{'nickname':'rohan', 'channels':['[#abc', 0],['#bcd', 0]]},{}] ''' considered_nicks = [] if config.DEBUGGER: print "Analysis on", (str(day_content["auxiliary_data"]["day"]) + "-" + str(day_content["auxiliary_data"]["month"])), channel_name for user in nicks_today_on_this_channel: f = 1 for nick_tuple in nick_same_list: if user in nick_tuple: user_nick = nick_tuple[0] f = 0 break if f: user_nick = user '''for channels of user on a day''' if channels_for_user_day.has_key(user_nick) and channel_name not in channels_for_user_day[user_nick]: channels_for_user_day[user_nick].append(channel_name) else: channels_for_user_day[user_nick] = [channel_name] flag = 1 for dictionary in nick_channel_dict: if dictionary['nickname'] == user_nick and user_nick not in considered_nicks: index = searchChannel(channel_name, dictionary['channels']) if index == -1: dictionary['channels'].append([channel_name,1]) else: dictionary['channels'][index][1]+=1 flag = 0 considered_nicks.append(user_nick) break if flag: nick_channel_dict.append({'nickname':user_nick, 'channels': [[channel_name, 1]]}) considered_nicks.append(user_nick) channels_for_user.append(channels_for_user_day) for nick in nicks: for index in range(config.MAX_EXPECTED_DIFF_NICKS): if nick in nick_same_list[index]: break if not nick_same_list[index]: 
nick_same_list[index].append(nick) break if config.DEBUGGER: print "========> 30 on " + str(len(nicks)) + " nicks" print nicks[:30] print "========> 30 on " + str(len(nick_same_list)) + " nick_same_list" print nick_same_list[:30] if not track_users_on_channels: return [nicks, nick_same_list] else: for dicts in nick_channel_dict: nick = dicts['nickname'] if nick not in nicks_hash: nicks_hash.append(nick) for channel in dicts['channels']: if channel[0] not in channels_hash: channels_hash.append(channel[0]) return [nicks, nick_same_list, channels_for_user, nick_channel_dict, nicks_hash, channels_hash]
def test_splice_find(self, line, search_param1, search_param2, splice_index,
                     expected_result, mock_correctLastChar):
    """Checks util.splice_find with correctLastCharCR patched by a mock."""
    mock_correctLastChar.side_effect = mock_correctLastCharCR
    actual = util.splice_find(line, search_param1, search_param2,
                              splice_index)
    self.assertEqual(actual, expected_result)
def conv_len_conv_refr_time(log_dict, nicks, nick_same_list):
    """Calculates the conversation length (CL) — the length of time for
    which two users communicate — and the conversation refresh time (CRT).

    If a message is not replied to within the response time (RT), it is
    considered part of another conversation. For a pair of users the CRT is
    the time between one conversation ending and the next one starting.

    Args:
        log_dict (dict): dictionary of log data created using reader.py
        nicks (list): list of nicknames created using nickTracker.py
        nick_same_list (list): list of lists of nicks of the same user

    Returns:
        row_cl (list): (value, count) pairs for conversation length
        row_crt (list): (value, count) pairs for conversation refresh time
    """
    conv = []
    conv_diff = []
    # Connected components group nick clusters sharing at least one nick, so
    # all aliases of one user end up in a single cluster. E.g. clusters
    # {nick1..nick4} and {nick5, nick6, nick2, nick7} merge into one user.
    G = util.to_graph(nick_same_list)
    conn_comp_list = list(connected_components(G))
    util.create_connected_nick_list(conn_comp_list)
    # conversations[i] holds [sender, receiver, t0, t1, ...] for one user
    # pair; MAX_CONVERSATIONS bounds the number of pairs.
    conversations = [[] for i in range(config.MAX_CONVERSATIONS)]
    graphx1 = []
    graphy1 = []
    graphx2 = []
    graphy2 = []
    dateadd = -1  # day offset used for response time calculation (0-365)

    def build_conversation(rec_list, nick, send_time, nick_to_search,
                           nick_receiver, nick_sender, dateadd, conversations,
                           conn_comp_list):
        # Try every token in rec_list as a potential receiver nick.
        for names in rec_list:
            conversations, nick_receiver, send_time = conv_helper(
                names, nick, send_time, nick_to_search, nick_receiver,
                nick_sender, dateadd, conversations, conn_comp_list)
        return conversations, nick_receiver, send_time

    def conv_helper(rec, nick, send_time, nick_to_search, nick_receiver,
                    nick_sender, dateadd, conversations, conn_comp_list):
        # `line` is the current log line of the enclosing loop (closure).
        if rec == nick:
            send_time.append(line[1:6])
            if nick_to_search != nick:
                nick_receiver = util.get_nick_sen_rec(
                    len(nicks), nick, conn_comp_list, nick_receiver)
            for i in range(config.MAX_CONVERSATIONS):
                if (nick_sender in conversations[i] and
                        nick_receiver in conversations[i]):
                    conversations = conv_append(conversations, i, dateadd,
                                                line)
                    break
                if len(conversations[i]) == 0:
                    conversations[i].append(nick_sender)
                    conversations[i].append(nick_receiver)
                    conversations = conv_append(conversations, i, dateadd,
                                                line)
                    break
        return conversations, nick_receiver, send_time

    def conv_mat_diff(i, j, conversations):
        """Difference between consecutive time entries of conversation i.

        Args:
            i (int): matrix index for row
            j (int): matrix index for column
        """
        return conversations[i][j] - conversations[i][j - 1]

    def conv_append(conversations, index, dateadd, line):
        # Store absolute time in minutes since the start of the period.
        conversations[index].append(
            config.HOURS_PER_DAY * config.MINS_PER_HOUR * dateadd +
            int(line[1:6][0:2]) * config.MINS_PER_HOUR +
            int(line[1:6][3:5]))
        return conversations

    for day_content_all_channels in list(log_dict.values()):
        for day_content in day_content_all_channels:
            day_log = day_content["log_data"]
            dateadd = dateadd + 1
            send_time = []  # times at which a user messaged another user
            # build the relation map between clients; handles addressing of
            # the forms nick1:nick2, nick1,nick2 and nick1,nick2:
            for line in day_log:
                flag_comma = 0
                if util.check_if_msg_line(line):
                    nick_sender = ""
                    nick_receiver = ""
                    m = re.search(r"\<(.*?)\>", line)
                    nick_to_search = util.correctLastCharCR(m.group(0)[1:-1])
                    nick_sender = util.get_nick_sen_rec(
                        len(nicks), nick_to_search, conn_comp_list,
                        nick_sender)
                    for nick in nicks:
                        rec_list = [e.strip() for e in line.split(':')]
                        util.rec_list_splice(rec_list)
                        if not rec_list[1]:
                            break
                        rec_list = util.correct_last_char_list(rec_list)
                        conversations, nick_receiver, send_time = \
                            build_conversation(rec_list, nick, send_time,
                                               nick_to_search, nick_receiver,
                                               nick_sender, dateadd,
                                               conversations, conn_comp_list)
                        if "," in rec_list[1]:
                            flag_comma = 1
                            rec_list_2 = [e.strip()
                                          for e in rec_list[1].split(',')]
                            rec_list_2 = util.correct_last_char_list(
                                rec_list_2)
                            conversations, nick_receiver, send_time = \
                                build_conversation(rec_list_2, nick,
                                                   send_time, nick_to_search,
                                                   nick_receiver, nick_sender,
                                                   dateadd, conversations,
                                                   conn_comp_list)
                        if flag_comma == 0:
                            rec = util.splice_find(line, ">", ", ", 1)
                            conversations, nick_receiver, send_time = \
                                conv_helper(rec, nick, send_time,
                                            nick_to_search, nick_receiver,
                                            nick_sender, dateadd,
                                            conversations, conn_comp_list)

    # Drop the sender/receiver UIDs at positions 0-1; the remaining entries
    # are the message times of that user pair.
    for i in range(len(conversations)):
        if len(conversations[i]) != 0:
            del conversations[i][0:2]

    # Split each pair's time series into conversations: a gap > 9 minutes
    # (the average response time found before, see parser-RT.py; this value
    # differs per channel) ends a conversation. CLs go to conv, CRTs to
    # conv_diff.
    for i in range(len(conversations)):
        if len(conversations[i]) != 0:
            first = conversations[i][0]
            for j in range(1, len(conversations[i])):
                if conv_mat_diff(i, j, conversations) > 9:
                    conv.append(conversations[i][j - 1] - first)
                    conv_diff.append(conv_mat_diff(i, j, conversations))
                    first = conversations[i][j]
                if j == (len(conversations[i]) - 1):
                    conv.append(conversations[i][j] - first)
                    break

    def build_conv_csv(conv_list, graph_x, graph_y):
        # Histogram of conv_list values from 0 up to and including the max.
        # BUG FIX: guard against an empty list (max([]) raises ValueError)
        # and include the maximum value itself — the old range(max(...))
        # excluded it, unlike the analogous loop in response_time().
        if conv_list:
            for value in range(max(conv_list) + 1):
                graph_x.append(value)
                graph_y.append(conv_list.count(value))
        return graph_x, graph_y

    graphx1, graphy1 = build_conv_csv(conv, graphx1, graphy1)
    graphx2, graphy2 = build_conv_csv(conv_diff, graphx2, graphy2)
    # store CL and CRT values with their occurrence counts (for CDF plots)
    row_cl = list(zip(graphx1, graphy1))
    row_crt = list(zip(graphx2, graphy2))
    return row_cl, row_crt
def conv_len_conv_refr_time(log_dict, nicks, nick_same_list):
    """Calculates the conversation length (CL) — the length of time for
    which two users communicate — and the conversation refresh time (CRT).

    If a message is not replied to within the response time (RT), it is
    considered part of another conversation. For a pair of users the CRT is
    the time between one conversation ending and the next one starting.

    Args:
        log_dict (dict): dictionary of log data created using reader.py
        nicks (list): list of nicknames created using nickTracker.py
        nick_same_list (list): list of lists of nicks of the same user

    Returns:
        row_cl (zip): (value, count) pairs for conversation length
        row_crt (zip): (value, count) pairs for conversation refresh time
    """
    conv = []
    conv_diff = []
    # Connected components group nick clusters sharing at least one nick, so
    # all aliases of one user end up in a single cluster. E.g. clusters
    # {nick1..nick4} and {nick5, nick6, nick2, nick7} merge into one user.
    G = util.to_graph(nick_same_list)
    conn_comp_list = list(connected_components(G))
    util.create_connected_nick_list(conn_comp_list)
    # conversations[i] holds [sender, receiver, t0, t1, ...] for one user
    # pair; MAX_CONVERSATIONS bounds the number of pairs.
    conversations = [[] for i in range(config.MAX_CONVERSATIONS)]
    graphx1 = []
    graphy1 = []
    graphx2 = []
    graphy2 = []
    dateadd = -1  # day offset used for response time calculation (0-365)

    def build_conversation(rec_list, nick, send_time, nick_to_search,
                           nick_receiver, nick_sender, dateadd, conversations,
                           conn_comp_list):
        # Try every token in rec_list as a potential receiver nick.
        for names in rec_list:
            conversations, nick_receiver, send_time = conv_helper(
                names, nick, send_time, nick_to_search, nick_receiver,
                nick_sender, dateadd, conversations, conn_comp_list)
        return conversations, nick_receiver, send_time

    def conv_helper(rec, nick, send_time, nick_to_search, nick_receiver,
                    nick_sender, dateadd, conversations, conn_comp_list):
        # `line` is the current log line of the enclosing loop (closure).
        if rec == nick:
            send_time.append(line[1:6])
            if nick_to_search != nick:
                nick_receiver = util.get_nick_sen_rec(
                    len(nicks), nick, conn_comp_list, nick_receiver)
            for i in range(config.MAX_CONVERSATIONS):
                if (nick_sender in conversations[i] and
                        nick_receiver in conversations[i]):
                    conversations = conv_append(conversations, i, dateadd,
                                                line)
                    break
                if len(conversations[i]) == 0:
                    conversations[i].append(nick_sender)
                    conversations[i].append(nick_receiver)
                    conversations = conv_append(conversations, i, dateadd,
                                                line)
                    break
        return conversations, nick_receiver, send_time

    def conv_mat_diff(i, j, conversations):
        """Difference between consecutive time entries of conversation i.

        Args:
            i (int): matrix index for row
            j (int): matrix index for column
        """
        return conversations[i][j] - conversations[i][j - 1]

    def conv_append(conversations, index, dateadd, line):
        # Store absolute time in minutes since the start of the period.
        conversations[index].append(
            config.HOURS_PER_DAY * config.MINS_PER_HOUR * dateadd +
            int(line[1:6][0:2]) * config.MINS_PER_HOUR +
            int(line[1:6][3:5]))
        return conversations

    for day_content_all_channels in log_dict.values():
        for day_content in day_content_all_channels:
            day_log = day_content["log_data"]
            dateadd = dateadd + 1
            send_time = []  # times at which a user messaged another user
            # build the relation map between clients; handles addressing of
            # the forms nick1:nick2, nick1,nick2 and nick1,nick2:
            for line in day_log:
                flag_comma = 0
                if util.check_if_msg_line(line):
                    nick_sender = ""
                    nick_receiver = ""
                    m = re.search(r"\<(.*?)\>", line)
                    nick_to_search = util.correctLastCharCR(m.group(0)[1:-1])
                    nick_sender = util.get_nick_sen_rec(
                        len(nicks), nick_to_search, conn_comp_list,
                        nick_sender)
                    for nick in nicks:
                        rec_list = [e.strip() for e in line.split(':')]
                        util.rec_list_splice(rec_list)
                        if not rec_list[1]:
                            break
                        rec_list = util.correct_last_char_list(rec_list)
                        conversations, nick_receiver, send_time = \
                            build_conversation(rec_list, nick, send_time,
                                               nick_to_search, nick_receiver,
                                               nick_sender, dateadd,
                                               conversations, conn_comp_list)
                        if "," in rec_list[1]:
                            flag_comma = 1
                            rec_list_2 = [e.strip()
                                          for e in rec_list[1].split(',')]
                            rec_list_2 = util.correct_last_char_list(
                                rec_list_2)
                            conversations, nick_receiver, send_time = \
                                build_conversation(rec_list_2, nick,
                                                   send_time, nick_to_search,
                                                   nick_receiver, nick_sender,
                                                   dateadd, conversations,
                                                   conn_comp_list)
                        if flag_comma == 0:
                            rec = util.splice_find(line, ">", ", ", 1)
                            conversations, nick_receiver, send_time = \
                                conv_helper(rec, nick, send_time,
                                            nick_to_search, nick_receiver,
                                            nick_sender, dateadd,
                                            conversations, conn_comp_list)

    # Drop the sender/receiver UIDs at positions 0-1; the remaining entries
    # are the message times of that user pair.
    for i in range(len(conversations)):
        if len(conversations[i]) != 0:
            del conversations[i][0:2]

    # Split each pair's time series into conversations: a gap > 9 minutes
    # (the average response time found before, see parser-RT.py; this value
    # differs per channel) ends a conversation. CLs go to conv, CRTs to
    # conv_diff.
    for i in range(len(conversations)):
        if len(conversations[i]) != 0:
            first = conversations[i][0]
            for j in range(1, len(conversations[i])):
                if conv_mat_diff(i, j, conversations) > 9:
                    conv.append(conversations[i][j - 1] - first)
                    conv_diff.append(conv_mat_diff(i, j, conversations))
                    first = conversations[i][j]
                if j == (len(conversations[i]) - 1):
                    conv.append(conversations[i][j] - first)
                    break

    def build_conv_csv(conv_list, graph_x, graph_y):
        # Histogram of conv_list values from 0 up to and including the max.
        # BUG FIX: guard against an empty list (max([]) raises ValueError)
        # and include the maximum value itself — the old range(max(...))
        # excluded it, unlike the analogous loop in response_time().
        if conv_list:
            for value in range(max(conv_list) + 1):
                graph_x.append(value)
                graph_y.append(conv_list.count(value))
        return graph_x, graph_y

    graphx1, graphy1 = build_conv_csv(conv, graphx1, graphy1)
    graphx2, graphy2 = build_conv_csv(conv_diff, graphx2, graphy2)
    # store CL and CRT values with their occurrence counts (for CDF plots)
    row_cl = zip(graphx1, graphy1)
    row_crt = zip(graphx2, graphy2)
    return row_cl, row_crt
def response_time(log_dict, nicks, nick_same_list):
    """ finds the response time of a message i.e. the best guess for the
    time at which one can expect a reply for his/her message.

    Args:
        log_dict (dict): Dictionary of logs data created using reader.py
        nicks(List) : List of nickname created using nickTracker.py
        nick_same_list : List of same_nick names created using nickTracker.py

    Returns:
        rows_RT(zip List): pairs (response time in minutes, number of
            occurrences) aggregated over the whole period covered by
            log_dict.
    """
    # Group aliases of the same user into connected components so that any
    # alias can be resolved to one canonical nick.
    G = util.to_graph(nick_same_list)
    conn_comp_list = list(connected_components(G))
    util.create_connected_nick_list(conn_comp_list)

    graph_cumulative = []  # all response-time samples across every day
    graph_x_axis = []      # distinct RT values (0..max)
    graph_y_axis = []      # frequency of each RT value

    def build_mean_list(conversations, index, mean_list):
        # Copies the response-time entries of conversations[index] (which
        # start at position 2, after the sender/receiver UIDs) into mean_list.
        for j in range(2, len(conversations[index])):
            mean_list.append(conversations[index][j])
        return mean_list

    def resp_helper(rec, nick, send_time, nick_to_search, nick_receiver, nick_sender, conversations, conn_comp_list):
        # Records a message timestamp into the conversation slot shared by
        # nick_sender and nick_receiver.
        # NOTE(review): `line` is read from the enclosing loop below at call
        # time (closure), not passed as a parameter — callers must only
        # invoke this while iterating day_log.
        if(rec == nick):
            # line[1:6] is the "HH:MM" timestamp of the message — assumes the
            # log line starts with a bracketed time; TODO confirm log format.
            send_time.append(line[1:6])
            if(nick_to_search != nick):
                nick_receiver = util.get_nick_sen_rec(len(nicks), nick, conn_comp_list, nick_receiver)
            for i in range(config.MAX_RESPONSE_CONVERSATIONS):
                # Existing slot for this sender/receiver pair: append the time.
                if (nick_sender in conversations[i] and nick_receiver in conversations[i]):
                    conversations[i].append(line[1:6])
                    break
                # First empty slot: claim it with [sender, receiver, time].
                if(len(conversations[i]) == 0):
                    conversations[i].append(nick_sender)
                    conversations[i].append(nick_receiver)
                    conversations[i].append(line[1:6])
                    break
        return conversations, nick_receiver, send_time

    for day_content_all_channels in log_dict.values():
        for day_content in day_content_all_channels:
            day_log = day_content["log_data"]
            send_time = []  # list of all the times a user sends a message to another user
            meanstd_list = []
            totalmeanstd_list = []
            x_axis = []
            y_axis = []
            real_y_axis = []
            # One slot per sender/receiver pair; each slot will hold
            # [sender_uid, receiver_uid, t1, t2, ...].
            conversations = [[] for i in range(config.MAX_RESPONSE_CONVERSATIONS)]
            # code for making relation map between clients
            for line in day_log:
                flag_comma = 0
                if(util.check_if_msg_line (line)):
                    nick_sender = ""
                    nick_receiver = ""
                    # Sender nick is the text between angle brackets, e.g. "<dhruv>".
                    m = re.search(r"\<(.*?)\>", line)
                    nick_to_search = util.correctLastCharCR(m.group(0)[1:-1])
                    nick_sender = util.get_nick_sen_rec(len(nicks), nick_to_search, conn_comp_list, nick_sender)
                    for nick in nicks:
                        # Receivers may be addressed as "nick1:nick2" — split on ':'.
                        rec_list = [e.strip() for e in line.split(':')]
                        util.rec_list_splice(rec_list)
                        # Index 0 holds the time prefix; an empty index 1 means
                        # no addressed receiver on this line.
                        if not rec_list[1]:
                            break
                        rec_list = util.correct_last_char_list(rec_list)
                        for name in rec_list:
                            conversations, nick_receiver, send_time = resp_helper(name, nick, send_time, nick_to_search, nick_receiver, nick_sender, conversations, conn_comp_list)
                        # Comma-separated receivers, e.g. "<Dhruv> Rohan, Ram :".
                        if "," in rec_list[1]:
                            flag_comma = 1
                            rec_list_2 = [e.strip() for e in rec_list[1].split(',')]
                            rec_list_2 = util.correct_last_char_list(rec_list_2)
                            for name in rec_list_2:
                                conversations, nick_receiver, send_time = resp_helper(name, nick, send_time, nick_to_search, nick_receiver, nick_sender, conversations, conn_comp_list)
                        # No comma form: receiver is the word right after ">".
                        if(flag_comma == 0):
                            rec = util.splice_find(line, ">", ", ",1)
                            conversations, nick_receiver, send_time = resp_helper(rec, nick, send_time, nick_to_search, nick_receiver, nick_sender, conversations, conn_comp_list)

            # Convert the stored "HH:MM" strings (from index 2 onward) into
            # response times in minutes: entry j becomes time[j+1] - time[j].
            for i in range(config.MAX_RESPONSE_CONVERSATIONS):
                if(len(conversations[i]) != 0):
                    for j in range(2, len(conversations[i]) - 1):
                        conversations[i][j]=(int(conversations[i][j+1][0:2])*config.MINS_PER_HOUR+int(conversations[i][j+1][3:5])) - (int(conversations[i][j][0:2])*config.MINS_PER_HOUR+int(conversations[i][j][3:5]))

            for i in range(config.MAX_RESPONSE_CONVERSATIONS):
                if(len(conversations[i]) != 0):
                    if(len(conversations[i]) == 3):
                        # Single-message conversation: keep the absolute time
                        # in minutes (no pair to diff against).
                        conversations[i][2] = int(conversations[i][2][0:2])*config.MINS_PER_HOUR+int(conversations[i][2][3:5])
                    else:
                        # Last entry is still a raw "HH:MM" string (the loop
                        # above stops one short) — drop it.
                        del conversations[i][-1]  # Explanation provided in parser-CL+CRT.py

            # Pool every conversation's response times for this day.
            for i in range(config.MAX_RESPONSE_CONVERSATIONS):
                if(len(conversations[i]) != 0):
                    totalmeanstd_list = build_mean_list(conversations, i, totalmeanstd_list)

            if(len(totalmeanstd_list) != 0):
                for i in range(max(totalmeanstd_list) + 1):
                    x_axis.append(i)
                for i in x_axis:
                    # finding the probability of each RT to occur = No. of
                    # occurrence / total occurrences.
                    y_axis.append(float(totalmeanstd_list.count(i)) / float(len(totalmeanstd_list)))
                # to find cumulative just go on adding the current value to
                # previously cumulated value till sum becomes 1 for last entry.
                # NOTE(review): at i=0 this reads real_y_axis[-1] (== y_axis[0]
                # just appended), so the first probability is counted twice and
                # the list gains an extra element; real_y_axis is also never
                # used afterwards — looks like dead/buggy code, confirm before
                # removing.
                real_y_axis.append(y_axis[0])
                for i in range(len(y_axis)):
                    real_y_axis.append(float(real_y_axis[i-1]) + float(y_axis[i]))
                for i in range(len(totalmeanstd_list)):
                    graph_cumulative.append(totalmeanstd_list[i])

            if len(totalmeanstd_list) > 0:
                # Append mean and mean+2*std as summary entries.
                # NOTE(review): the second mean is computed AFTER the first
                # append, so it includes the mean itself — presumably
                # intentional to match parser-RT.py; verify.
                totalmeanstd_list.append(numpy.mean(totalmeanstd_list))
                totalmeanstd_list.append(numpy.mean(totalmeanstd_list)+2*numpy.std(totalmeanstd_list))

            # Per-conversation mean and mean+2*std appended to each slot.
            for i in range(config.MAX_RESPONSE_CONVERSATIONS):
                if(len(conversations[i]) != 0):
                    meanstd_list = build_mean_list(conversations, i, meanstd_list)
                    conversations[i].append(numpy.mean(meanstd_list))
                    conversations[i].append(numpy.mean(meanstd_list)+(2*numpy.std(meanstd_list)))
                    meanstd_list[:] = []  # reuse the scratch list per slot

    # Histogram of all RT samples over the whole period.
    graph_cumulative.sort()
    for i in range(graph_cumulative[len(graph_cumulative)-1] + 1):
        graph_y_axis.append(graph_cumulative.count(i))  # problem when ti=0 count is unexpectedly large
        graph_x_axis.append(i)
    # Finally storing the RT values along with their frequencies in a csv file.
    rows_rt = zip(graph_x_axis, graph_y_axis)
    return rows_rt
def keywords(log_dict, nicks, nick_same_list):
    """ Returns keywods for all users

    Args:
        log_dict (dict): Dictionary of logs data created using reader.py
        nicks(List) : list of nickname created using nickTracker.py
        nick_same_list : List of same_nick names created using nickTracker.py

    Returns:
        keywords_filtered: filtered keywords for user
        user_keyword_freq_dict: dictionary for each user having keywords and their frequency
        user_words_dict: keywods for user
        nicks_for_stop_words: stop words
    """
    user_words_dict = []        # [{'sender': nick, 'words': [...]}, ...]
    user_keyword_freq_dict = [] # [{'nick': nick, 'keywords': [[word, freq, norm], ...]}, ...]
    keywords_filtered = []
    no_messages = 0  # NOTE(review): incremented below but never read or returned

    def get_nick_receiver(nick_receiver, rec, nick_to_compare, nick_name, nicks, nick_same_list):
        # Resolves rec to a canonical receiver nick when it matches nick_name
        # and is not the sender; otherwise returns nick_receiver unchanged.
        if(rec == nick_name):
            if(nick_to_compare != nick_name):
                nick_receiver = iter_nicks(nick_receiver, nicks, nick_same_list, nick_name)
        return nick_receiver

    def iter_nicks(nick_sender_receiver, nicks, nick_same_list, nick_comp):
        # Maps nick_comp to the first alias in its same-nick group; if it is
        # in no group, the nick itself is used.
        for i in range(len(nicks)):
            if nick_comp in nick_same_list[i]:
                nick_sender_receiver = nick_same_list[i][0]
                break
            else:
                nick_sender_receiver = nick_comp
        return nick_sender_receiver

    for day_content_all_channels in log_dict.values():
        for day_content in day_content_all_channels:
            day_log = day_content["log_data"]
            for line in day_log:
                flag_comma = 0
                if(util.check_if_msg_line(line)):
                    # Sender is the text between angle brackets, e.g. "<dhruv>".
                    m = re.search(r"\<(.*?)\>", line)
                    nick_to_compare = util.correctLastCharCR((m.group(0)[1:-1]))
                    nick_sender = ''
                    nick_sender = iter_nicks(nick_sender, nicks, nick_same_list, nick_to_compare)

                    nick_receiver = ''
                    for nick_name in nicks:
                        rec_list = [e.strip() for e in line.split(':')]  # receiver list splited about :
                        util.rec_list_splice(rec_list)
                        if not rec_list[1]:  # index 0 will contain time 14:02
                            break
                        rec_list = util.correct_last_char_list(rec_list)
                        for rec in rec_list:
                            nick_receiver = get_nick_receiver(nick_receiver, rec, nick_to_compare, nick_name, nicks, nick_same_list)
                        if "," in rec_list[1]:  # receiver list may of the form <Dhruv> Rohan, Ram :
                            flag_comma = 1
                            rec_list_2 = [e.strip() for e in rec_list[1].split(',')]
                            rec_list_2 = util.correct_last_char_list(rec_list_2)
                            for rec in rec_list_2:
                                nick_receiver = get_nick_receiver(nick_receiver, rec, nick_to_compare, nick_name, nicks, nick_same_list)
                        if(flag_comma == 0):  # receiver list can be <Dhruv> Rohan, Hi!
                            rec = util.splice_find(line, ">", ", ", 1)
                            nick_receiver = get_nick_receiver(nick_receiver, rec, nick_to_compare, nick_name, nicks, nick_same_list)

                    # generating the words written by the sender
                    # NOTE(review): rec_list here is whatever the LAST
                    # iteration of the nicks loop (or its break) left behind —
                    # confirm this is the intended message text source.
                    message = rec_list[1:]
                    no_messages += 1
                    correctedNickReciever = util.correct_nick_for_(nick_receiver)
                    if correctedNickReciever in message:
                        message.remove(correctedNickReciever)

                    lmtzr = WordNetLemmatizer()
                    # limit word size = 3, drop numbers.
                    word_list_temp = re.sub(r'\d+', '', " ".join(re.findall(r'\w{3,}', ":".join(message).replace(","," ")))).split(" ")
                    word_list = []
                    # remove punctuations (lower-case and strip apostrophes)
                    for word in word_list_temp:
                        word = word.lower()
                        word_list.append(word.replace("'",""))
                    word_list_lemmatized = []
                    try:
                        # Lemmatize as a verb first, then with the default
                        # (noun) POS. Python 2 map returns a list here.
                        word_list_lemmatized = map(lmtzr.lemmatize, map(lambda x: lmtzr.lemmatize(x, 'v'), word_list))
                    except UnicodeDecodeError:
                        # Non-ASCII input — skip lemmatization, keep [].
                        pass

                    # Merge words into the sender's existing entry, else add a
                    # new one. fr acts as a "not found" flag.
                    fr = 1
                    for dic in user_words_dict:
                        if dic['sender'] == nick_sender:
                            dic['words'].extend(word_list_lemmatized)
                            fr = 0
                    if fr:
                        user_words_dict.append({'sender':nick_sender, 'words':word_list_lemmatized })

    # Build the stop-word list: every alias, every sender, and their
    # lower-cased variants, plus common English words without apostrophes.
    nicks_for_stop_words = []
    stop_word_without_apostrophe = []
    for l in nick_same_list:
        nicks_for_stop_words.extend(l)
    for dictonary in user_words_dict:
        nicks_for_stop_words.append(dictonary['sender'])
    nicks_for_stop_words.extend([x.lower() for x in nicks_for_stop_words])
    for words in common_english_words.words:
        stop_word_without_apostrophe.append(words.replace("'",""))
    stop_words_extended = extended_stop_words(nicks_for_stop_words, stop_word_without_apostrophe)

    count_vect = CountVectorizer(analyzer = 'word', stop_words=stop_words_extended, min_df = 1)

    # Per-user keyword frequencies: [word, count, normalised_count].
    for dictonary in user_words_dict:
        try:
            matrix = count_vect.fit_transform(dictonary['words'])
            freqs = [[word, matrix.getcol(idx).sum()] for word, idx in count_vect.vocabulary_.items()]
            keywords = sorted(freqs, key = lambda x: -x[1])
            total_freq = 0.0
            for freq_tuple in keywords:
                total_freq += freq_tuple[1]
            for freq_tuple in keywords:
                freq_tuple.append(round(freq_tuple[1]/float(total_freq), 5))
            user_keyword_freq_dict.append({'nick':dictonary['sender'], 'keywords': keywords })
        except ValueError:
            # CountVectorizer raises when every word is a stop word / empty
            # vocabulary — skip this user.
            pass

    # Keep only each user's top keywords above the configured threshold.
    for data in user_keyword_freq_dict:
        keywords, normal_scores = top_keywords_for_nick(user_keyword_freq_dict, data['nick'], config.KEYWORDS_THRESHOLD, config.KEYWORDS_MIN_WORDS)
        if config.DEBUGGER:
            print "Nick:", data['nick']
            print "Keywords with normalised score > 0.01\n", keywords
            print "Their Normal scores\n", normal_scores
            print "\n"
        if keywords:
            keywords_filtered.append({'nick': data['nick'], 'keywords': keywords})

    return keywords_filtered, user_keyword_freq_dict, user_words_dict, nicks_for_stop_words