def nick_change_graph(log_dict, DAY_BY_DAY_ANALYSIS=False):
    """Creates a graph which tracks the nick changes of the users, where each
    edge has a time stamp denoting the time at which the nick was changed.

    Args:
        log_dict (dict): dictionary of logs created using reader.py
        DAY_BY_DAY_ANALYSIS (bool): if True return one graph per day.

    Returns:
        list of day-to-day nick-change graphs if DAY_BY_DAY_ANALYSIS=True,
        otherwise a single aggregate nick-change graph for the whole period.
    """
    rem_time = None  # time of the last message of the previously parsed file
    nick_change_day_list = []
    # graph for nick changes over the whole time span (not day to day)
    aggregate_nick_change_graph = nx.MultiDiGraph()

    for day_content_all_channels in log_dict.values():
        for day_content in day_content_all_channels:
            day_log = day_content["log_data"]
            today_nick_change_graph = nx.MultiDiGraph()  # using networkx

            for current_line_no, line in enumerate(day_log):
                # nick-change lines start with '='; exclude topic changes
                if line[0] == '=' and "changed the topic of" not in line:
                    nick1 = util.splice_find(line, "=", " is", 3)
                    nick2 = util.splice_find(line, "wn as", "\n", 5)
                    # search backwards for the nearest real message line so
                    # its time stamp can serve as the time of the nick change
                    found_earlier_msg = False
                    for earlier_line_no in range(current_line_no - 1, -1, -1):
                        if day_log[earlier_line_no][0] != '=':
                            year, month, day = util.get_year_month_day(day_content)
                            util.build_graphs(nick1, nick2,
                                              day_log[earlier_line_no][1:6],
                                              year, month, day,
                                              today_nick_change_graph,
                                              aggregate_nick_change_graph)
                            found_earlier_msg = True
                            break
                    if not found_earlier_msg:
                        # BUG FIX: the old while-loop decremented the index to
                        # -1 and read day_log[-1] (Python wraps to the *last*
                        # line of the day), picking up a wrong, later time.
                        # When no earlier message exists, fall back to the
                        # last message time of the previous file instead.
                        today_nick_change_graph.add_edge(nick1, nick2,
                                                         weight=rem_time)
                        aggregate_nick_change_graph.add_edge(nick1, nick2,
                                                             weight=rem_time)

            # remember the time of the last message of this file so the next
            # file can use it as a fallback time stamp
            for count in range(len(day_log) - 1, -1, -1):
                if day_log[count][0] != '=':
                    rem_time = day_log[count][1:6]
                    break

            nick_change_day_list.append(today_nick_change_graph)

    if DAY_BY_DAY_ANALYSIS:
        return nick_change_day_list
    else:
        return aggregate_nick_change_graph
def parse_log_lines_for_conv(log_dict, nicks, conn_comp_list, conversations):
    """Scan all log lines and record sender/receiver message exchanges.

    Args:
        log_dict (dict): dictionary of logs created using reader.py
        nicks (list): list of all nicknames
        conn_comp_list (list): connected components grouping aliases of a user
        conversations (list): per-user-pair conversation accumulator

    Returns:
        tuple: (conversations, nick_receiver, send_time) reflecting the state
        after the final processed message line.
    """
    dateadd = -1  # day offset used for response time calculation (0-365)
    for day_content_all_channels in log_dict.values():
        for day_content in day_content_all_channels:
            day_log = day_content["log_data"]
            dateadd += 1
            send_time = []  # times at which a user messaged another user
            # build the relation map between clients
            for line in day_log:
                if not util.check_if_msg_line(line):
                    continue
                comma_seen = False
                nick_sender = ""
                nick_receiver = ""
                match = re.search(r"\<(.*?)\>", line)
                nick_to_search = util.correctLastCharCR(match.group(0)[1:-1])
                nick_sender = util.get_nick_sen_rec(
                    len(nicks), nick_to_search, conn_comp_list, nick_sender)
                for nick in nicks:
                    rec_list = [token.strip() for token in line.split(':')]
                    util.rec_list_splice(rec_list)
                    if not rec_list[1]:  # index 0 holds the time stamp
                        break
                    rec_list = util.correct_last_char_list(rec_list)
                    conversations, nick_receiver, send_time = \
                        build_conversation(rec_list, nick, send_time,
                                           nick_to_search, nick_receiver,
                                           nick_sender, dateadd,
                                           conversations, conn_comp_list,
                                           line)
                    if "," in rec_list[1]:
                        comma_seen = True
                        rec_list_2 = [token.strip()
                                      for token in rec_list[1].split(',')]
                        rec_list_2 = util.correct_last_char_list(rec_list_2)
                        conversations, nick_receiver, send_time = \
                            build_conversation(rec_list_2, nick, send_time,
                                               nick_to_search, nick_receiver,
                                               nick_sender, dateadd,
                                               conversations, conn_comp_list,
                                               line)
                    if not comma_seen:
                        rec = util.splice_find(line, ">", ", ", 1)
                        conversations, nick_receiver, send_time = \
                            conv_helper(rec, nick, send_time, nick_to_search,
                                        nick_receiver, nick_sender, dateadd,
                                        conversations, conn_comp_list, line)
    return conversations, nick_receiver, send_time
def nick_change_graph(log_dict, DAY_BY_DAY_ANALYSIS=False):
    """Creates a graph which tracks the nick changes of the users, where each
    edge has a time stamp denoting the time at which the nick was changed.

    Args:
        log_dict (dict): dictionary of logs created using reader.py
        DAY_BY_DAY_ANALYSIS (bool): if True return one graph per day.

    Returns:
        list of day-to-day nick-change graphs if DAY_BY_DAY_ANALYSIS=True,
        otherwise a single aggregate nick-change graph for the whole period.
    """
    rem_time = None  # time of the last message of the previously parsed file
    nick_change_day_list = []
    # graph for nick changes over the whole time span (not day to day)
    aggregate_nick_change_graph = nx.MultiDiGraph()

    for day_content_all_channels in log_dict.values():
        for day_content in day_content_all_channels:
            day_log = day_content["log_data"]
            today_nick_change_graph = nx.MultiDiGraph()  # using networkx

            for current_line_no, line in enumerate(day_log):
                # nick-change lines start with '='; exclude topic changes
                if line[0] == '=' and "changed the topic of" not in line:
                    nick1 = util.splice_find(line, "=", " is", 3)
                    nick2 = util.splice_find(line, "wn as", "\n", 5)
                    # search backwards for the nearest real message line so
                    # its time stamp can serve as the time of the nick change
                    found_earlier_msg = False
                    for earlier_line_no in range(current_line_no - 1, -1, -1):
                        if day_log[earlier_line_no][0] != '=':
                            year, month, day = util.get_year_month_day(day_content)
                            util.build_graphs(nick1, nick2,
                                              day_log[earlier_line_no][1:6],
                                              year, month, day,
                                              today_nick_change_graph,
                                              aggregate_nick_change_graph)
                            found_earlier_msg = True
                            break
                    if not found_earlier_msg:
                        # BUG FIX: the old while-loop decremented the index to
                        # -1 and read day_log[-1] (Python wraps to the *last*
                        # line of the day), picking up a wrong, later time.
                        # When no earlier message exists, fall back to the
                        # last message time of the previous file instead.
                        today_nick_change_graph.add_edge(nick1, nick2,
                                                         weight=rem_time)
                        aggregate_nick_change_graph.add_edge(nick1, nick2,
                                                             weight=rem_time)

            # remember the time of the last message of this file so the next
            # file can use it as a fallback time stamp
            for count in range(len(day_log) - 1, -1, -1):
                if day_log[count][0] != '=':
                    rem_time = day_log[count][1:6]
                    break

            nick_change_day_list.append(today_nick_change_graph)

    if DAY_BY_DAY_ANALYSIS:
        return nick_change_day_list
    else:
        return aggregate_nick_change_graph
def keywords(log_dict, nicks, nick_same_list): """ Returns keywods for all users Args: log_dict (str): Dictionary of logs data created using reader.py nicks(List) : list of nickname created using nickTracker.py nick_same_list :List of same_nick names created using nickTracker.py Returns keywords_filtered: filtered keywords for user user_keyword_freq_dict: dictionary for each user having keywords and their frequency user_words_dict: keywods for user nicks_for_stop_words: stop words """ user_words_dict = [] user_keyword_freq_dict = [] keywords_filtered = [] no_messages = 0 def get_nick_receiver(nick_receiver, rec, nick_to_compare, nick_name, nicks, nick_same_list): if (rec == nick_name): if (nick_to_compare != nick_name): nick_receiver = iter_nicks(nick_receiver, nicks, nick_same_list, nick_name) return nick_receiver def iter_nicks(nick_sender_receiver, nicks, nick_same_list, nick_comp): for i in range(len(nicks)): if nick_comp in nick_same_list[i]: nick_sender_receiver = nick_same_list[i][0] break else: nick_sender_receiver = nick_comp return nick_sender_receiver for day_content_all_channels in log_dict.values(): for day_content in day_content_all_channels: day_log = day_content["log_data"] for line in day_log: flag_comma = 0 if (util.check_if_msg_line(line)): m = re.search(r"\<(.*?)\>", line) nick_to_compare = util.correctLastCharCR( (m.group(0)[1:-1])) nick_sender = '' nick_sender = iter_nicks(nick_sender, nicks, nick_same_list, nick_to_compare) nick_receiver = '' for nick_name in nicks: rec_list = [e.strip() for e in line.split(':') ] #receiver list splited about : util.rec_list_splice(rec_list) if not rec_list[1]: #index 0 will contain time 14:02 break rec_list = util.correct_last_char_list(rec_list) for rec in rec_list: nick_receiver = get_nick_receiver( nick_receiver, rec, nick_to_compare, nick_name, nicks, nick_same_list) if "," in rec_list[ 1]: #receiver list may of the form <Dhruv> Rohan, Ram : flag_comma = 1 rec_list_2 = [ e.strip() for e in 
rec_list[1].split(',') ] rec_list_2 = util.correct_last_char_list( rec_list_2) for rec in rec_list_2: nick_receiver = get_nick_receiver( nick_receiver, rec, nick_to_compare, nick_name, nicks, nick_same_list) if (flag_comma == 0 ): #receiver list can be <Dhruv> Rohan, Hi! rec = util.splice_find(line, ">", ", ", 1) nick_receiver = get_nick_receiver( nick_receiver, rec, nick_to_compare, nick_name, nicks, nick_same_list) #generating the words written by the sender message = rec_list[1:] no_messages += 1 correctedNickReciever = util.correct_nick_for_( nick_receiver) if correctedNickReciever in message: message.remove(correctedNickReciever) lmtzr = WordNetLemmatizer() #limit word size = 3, drop numbers. word_list_temp = re.sub( r'\d+', '', " ".join( re.findall(r'\w{3,}', ":".join(message).replace( ",", " ")))).split(" ") word_list = [] #remove punctuations for word in word_list_temp: word = word.lower() word_list.append(word.replace("'", "")) word_list_lemmatized = [] try: word_list_lemmatized = map( lmtzr.lemmatize, map(lambda x: lmtzr.lemmatize(x, 'v'), word_list)) except UnicodeDecodeError: pass fr = 1 for dic in user_words_dict: if dic['sender'] == nick_sender: dic['words'].extend(word_list_lemmatized) fr = 0 if fr: user_words_dict.append({ 'sender': nick_sender, 'words': word_list_lemmatized }) nicks_for_stop_words = [] stop_word_without_apostrophe = [] for l in nick_same_list: nicks_for_stop_words.extend(l) for dictonary in user_words_dict: nicks_for_stop_words.append(dictonary['sender']) nicks_for_stop_words.extend([x.lower() for x in nicks_for_stop_words]) for words in common_english_words.words: stop_word_without_apostrophe.append(words.replace("'", "")) stop_words_extended = extended_stop_words(nicks_for_stop_words, stop_word_without_apostrophe) count_vect = CountVectorizer(analyzer='word', stop_words=stop_words_extended, min_df=1) for dictonary in user_words_dict: try: matrix = count_vect.fit_transform(dictonary['words']) freqs = [[word, 
matrix.getcol(idx).sum()] for word, idx in count_vect.vocabulary_.items()] keywords = sorted(freqs, key=lambda x: -x[1]) total_freq = 0.0 for freq_tuple in keywords: total_freq += freq_tuple[1] for freq_tuple in keywords: freq_tuple.append(round(freq_tuple[1] / float(total_freq), 5)) user_keyword_freq_dict.append({ 'nick': dictonary['sender'], 'keywords': keywords }) except ValueError: pass for data in user_keyword_freq_dict: keywords, normal_scores = top_keywords_for_nick( user_keyword_freq_dict, data['nick'], config.KEYWORDS_THRESHOLD, config.KEYWORDS_MIN_WORDS) if config.DEBUGGER: print "Nick:", data['nick'] print "Keywords with normalised score > 0.01\n", keywords print "Their Normal scores\n", normal_scores print "\n" if keywords: keywords_filtered.append({ 'nick': data['nick'], 'keywords': keywords }) return keywords_filtered, user_keyword_freq_dict, user_words_dict, nicks_for_stop_words
def test_splice_find(self, line, search_param1, search_param2, splice_index,
                     expected_result):
    """Checks util.splice_find against the expected spliced substring."""
    actual = util.splice_find(line, search_param1, search_param2,
                              splice_index)
    self.assertEqual(actual, expected_result)
def response_time(log_dict, nicks, nick_same_list, cutoff_percentile):
    """Finds the response time of a message, i.e. the best guess for the
    time at which one can expect a reply to a message.

    Args:
        log_dict (dict): dictionary of log data created using reader.py
        nicks (list): list of nicknames created using nickTracker.py
        nick_same_list (list): list of lists of nicks of the same user
        cutoff_percentile (int): cutoff percentile indicating statistical
            significance

    Returns:
        tuple: (truncated_rt, rt_cutoff_time); both None when no response
        times were observed.
    """
    G = util.to_graph(nick_same_list)
    conn_comp_list = list(connected_components(G))
    util.create_connected_nick_list(conn_comp_list)
    graph_cumulative = []
    graph_x_axis = []
    graph_y_axis = []

    def build_mean_list(conversations, index, mean_list):
        # Entries from position 2 onwards are response times (positions 0
        # and 1 hold the sender and receiver nicks).
        mean_list.extend(conversations[index][2:])
        return mean_list

    def resp_helper(rec, nick, send_time, nick_to_search, nick_receiver,
                    nick_sender, conversations, conn_comp_list):
        # `line` is the current log line of the enclosing loop (closure).
        if rec == nick:
            send_time.append(line[1:6])
            if nick_to_search != nick:
                nick_receiver = util.get_nick_sen_rec(
                    len(nicks), nick, conn_comp_list, nick_receiver)
            for slot in range(config.MAX_RESPONSE_CONVERSATIONS):
                if (nick_sender in conversations[slot]
                        and nick_receiver in conversations[slot]):
                    conversations[slot].append(line[1:6])
                    break
                if not conversations[slot]:
                    conversations[slot].append(nick_sender)
                    conversations[slot].append(nick_receiver)
                    conversations[slot].append(line[1:6])
                    break
        return conversations, nick_receiver, send_time

    for day_content_all_channels in log_dict.values():
        for day_content in day_content_all_channels:
            day_log = day_content["log_data"]
            send_time = []  # times at which a user messaged another user
            meanstd_list = []
            totalmeanstd_list = []
            x_axis = []
            y_axis = []
            real_y_axis = []
            conversations = [
                [] for _ in range(config.MAX_RESPONSE_CONVERSATIONS)]
            # build the relation map between clients
            for line in day_log:
                flag_comma = 0
                if util.check_if_msg_line(line):
                    nick_sender = ""
                    nick_receiver = ""
                    match = re.search(r"\<(.*?)\>", line)
                    nick_to_search = util.correctLastCharCR(
                        match.group(0)[1:-1])
                    nick_sender = util.get_nick_sen_rec(
                        len(nicks), nick_to_search, conn_comp_list,
                        nick_sender)
                    for nick in nicks:
                        rec_list = [e.strip() for e in line.split(':')]
                        util.rec_list_splice(rec_list)
                        if not rec_list[1]:
                            break
                        rec_list = util.correct_last_char_list(rec_list)
                        for name in rec_list:
                            conversations, nick_receiver, send_time = \
                                resp_helper(name, nick, send_time,
                                            nick_to_search, nick_receiver,
                                            nick_sender, conversations,
                                            conn_comp_list)
                        if "," in rec_list[1]:
                            flag_comma = 1
                            rec_list_2 = [e.strip()
                                          for e in rec_list[1].split(',')]
                            rec_list_2 = util.correct_last_char_list(
                                rec_list_2)
                            for name in rec_list_2:
                                conversations, nick_receiver, send_time = \
                                    resp_helper(name, nick, send_time,
                                                nick_to_search,
                                                nick_receiver, nick_sender,
                                                conversations,
                                                conn_comp_list)
                        if flag_comma == 0:
                            rec = util.splice_find(line, ">", ", ", 1)
                            conversations, nick_receiver, send_time = \
                                resp_helper(rec, nick, send_time,
                                            nick_to_search, nick_receiver,
                                            nick_sender, conversations,
                                            conn_comp_list)

            # convert consecutive "HH:MM" stamps into deltas in minutes
            for i in range(config.MAX_RESPONSE_CONVERSATIONS):
                if conversations[i]:
                    for j in range(2, len(conversations[i]) - 1):
                        conversations[i][j] = (
                            int(conversations[i][j + 1][0:2]) *
                            config.MINS_PER_HOUR +
                            int(conversations[i][j + 1][3:5])) - (
                            int(conversations[i][j][0:2]) *
                            config.MINS_PER_HOUR +
                            int(conversations[i][j][3:5]))

            for i in range(config.MAX_RESPONSE_CONVERSATIONS):
                if conversations[i]:
                    if len(conversations[i]) == 3:
                        # a single time stamp: convert it in place
                        conversations[i][2] = (
                            int(conversations[i][2][0:2]) *
                            config.MINS_PER_HOUR +
                            int(conversations[i][2][3:5]))
                    else:
                        # drop the last, still unconverted, time stamp
                        del conversations[i][-1]

            for i in range(config.MAX_RESPONSE_CONVERSATIONS):
                if conversations[i]:
                    totalmeanstd_list = build_mean_list(
                        conversations, i, totalmeanstd_list)

            if totalmeanstd_list:
                for i in range(max(totalmeanstd_list) + 1):
                    x_axis.append(i)
                for i in x_axis:
                    # probability of each RT = occurrences / total occurrences
                    y_axis.append(float(totalmeanstd_list.count(i)) /
                                  float(len(totalmeanstd_list)))
                # cumulative distribution: keep adding the current value to
                # the previously cumulated value (last entry sums to 1)
                real_y_axis.append(y_axis[0])
                for i in range(len(y_axis)):
                    real_y_axis.append(float(real_y_axis[i - 1]) +
                                       float(y_axis[i]))
                for value in totalmeanstd_list:
                    graph_cumulative.append(value)
            if len(totalmeanstd_list) > 0:
                totalmeanstd_list.append(numpy.mean(totalmeanstd_list))
                totalmeanstd_list.append(numpy.mean(totalmeanstd_list) +
                                         2 * numpy.std(totalmeanstd_list))

            for i in range(config.MAX_RESPONSE_CONVERSATIONS):
                if conversations[i]:
                    meanstd_list = build_mean_list(conversations, i,
                                                   meanstd_list)
                    conversations[i].append(numpy.mean(meanstd_list))
                    conversations[i].append(numpy.mean(meanstd_list) +
                                            (2 * numpy.std(meanstd_list)))
                    meanstd_list[:] = []

    graph_cumulative.sort()
    truncated_rt = None
    rt_cutoff_time = None
    if graph_cumulative:
        # frequency of every RT value from 0 to the maximum observed
        for i in range(graph_cumulative[-1] + 1):
            # problem when ti=0: count is unexpectedly large
            graph_y_axis.append(graph_cumulative.count(i))
            graph_x_axis.append(i)
        # store the RT values along with their frequencies; no need to
        # invoke build_stat_dist()
        rows_rt = zip(graph_x_axis, graph_y_axis)
        truncated_rt, rt_cutoff_time = truncate_table(rows_rt,
                                                      cutoff_percentile)
        if config.CUTOFF_TIME_STRATEGY == "TWO_SIGMA":
            resp_time, resp_frequency_tuple = zip(*truncated_rt)
            resp_frequency = list(resp_frequency_tuple)
            rt_cutoff_time_frac = (numpy.mean(resp_frequency) +
                                   2 * numpy.std(resp_frequency))
            rt_cutoff_time = int(numpy.ceil(rt_cutoff_time_frac))
    return truncated_rt, rt_cutoff_time
def nick_tracker(log_dict, track_users_on_channels = False): """ Tracks all nicks and the identifies nicks which point to same user Args: log_dict(dictionary): with key as dateTime.date object and value as {"data":datalist,"channel_name":channels name} Returns: nicks(list): all nicks nick_same_list(list): list of lists with each list corresponding to nicks of same user """ nicks = [] # list of all the nicknames nick_same_list = [[] for i in range(config.MAX_EXPECTED_DIFF_NICKS)] nick_channel_dict = [] channels_for_user = [] nicks_hash = [] channels_hash = [] #Getting all the nicknames in a list def nick_append(nick, nicks, nicks_today_on_this_channel, track_users_on_channels): if track_users_on_channels and (nick not in nicks_today_on_this_channel): nicks_today_on_this_channel.append(nick) #not nicks as there are same nicks spread across multiple channels nicks.append(nick) elif nick not in nicks: nicks.append(nick) return nicks, nicks_today_on_this_channel for day_content_all_channels in log_dict.values(): #traverse over data of different channels for that day channels_for_user_day = {}#empty for next day usage for day_content in day_content_all_channels: day_log = day_content["log_data"] channel_name = day_content["auxiliary_data"]["channel"] nicks_today_on_this_channel = [] for i in day_log: # use regex to get the string between <> and appended it to the nicks list if(util.check_if_msg_line (i)): m = re.search(r"\<(.*?)\>", i) nick = util.correctLastCharCR(m.group(0)[1:-1]) nicks, nicks_today_on_this_channel = nick_append(nick, nicks, nicks_today_on_this_channel, track_users_on_channels) ''' Forming list of lists for avoiding nickname duplicacy ''' for line in day_log: if(line[0] == '=' and "changed the topic of" not in line): old_nick = util.splice_find(line, "=", " is", 3) new_nick = util.splice_find(line, "wn as", "\n", 5) nicks, nicks_today_on_this_channel = nick_append(old_nick, nicks, nicks_today_on_this_channel, track_users_on_channels) nicks, 
nicks_today_on_this_channel = nick_append(new_nick, nicks, nicks_today_on_this_channel, track_users_on_channels) #nicks.append(new_nick) for i in range(config.MAX_EXPECTED_DIFF_NICKS): if old_nick in nick_same_list[i] or new_nick in nick_same_list[i]: if old_nick not in nick_same_list[i]: nick_same_list[i].append(old_nick) if new_nick not in nick_same_list[i]: nick_same_list[i].append(new_nick) break if not nick_same_list[i]: if old_nick not in nick_same_list[i]: nick_same_list[i].append(old_nick) if new_nick not in nick_same_list[i]: nick_same_list[i].append(new_nick) break if track_users_on_channels: ''' Creating list of dictionaries nick_channel_dict of the format : [{'nickname':'rohan', 'channels':['[#abc', 0],['#bcd', 0]]},{}] ''' considered_nicks = [] if config.DEBUGGER: print "Analysis on", (str(day_content["auxiliary_data"]["day"]) + "-" + str(day_content["auxiliary_data"]["month"])), channel_name for user in nicks_today_on_this_channel: f = 1 for nick_tuple in nick_same_list: if user in nick_tuple: user_nick = nick_tuple[0] f = 0 break if f: user_nick = user '''for channels of user on a day''' if channels_for_user_day.has_key(user_nick) and channel_name not in channels_for_user_day[user_nick]: channels_for_user_day[user_nick].append(channel_name) else: channels_for_user_day[user_nick] = [channel_name] flag = 1 for dictionary in nick_channel_dict: if dictionary['nickname'] == user_nick and user_nick not in considered_nicks: index = searchChannel(channel_name, dictionary['channels']) if index == -1: dictionary['channels'].append([channel_name,1]) else: dictionary['channels'][index][1]+=1 flag = 0 considered_nicks.append(user_nick) break if flag: nick_channel_dict.append({'nickname':user_nick, 'channels': [[channel_name, 1]]}) considered_nicks.append(user_nick) channels_for_user.append(channels_for_user_day) for nick in nicks: for index in range(config.MAX_EXPECTED_DIFF_NICKS): if nick in nick_same_list[index]: break if not nick_same_list[index]: 
nick_same_list[index].append(nick) break if config.DEBUGGER: print "========> 30 on " + str(len(nicks)) + " nicks" print nicks[:30] print "========> 30 on " + str(len(nick_same_list)) + " nick_same_list" print nick_same_list[:30] if not track_users_on_channels: return [nicks, nick_same_list] else: for dicts in nick_channel_dict: nick = dicts['nickname'] if nick not in nicks_hash: nicks_hash.append(nick) for channel in dicts['channels']: if channel[0] not in channels_hash: channels_hash.append(channel[0]) return [nicks, nick_same_list, channels_for_user, nick_channel_dict, nicks_hash, channels_hash]
def test_splice_find(self, line, search_param1, search_param2, splice_index,
                     expected_result, mock_correctLastChar):
    """Checks util.splice_find with correctLastCharCR patched by a mock."""
    mock_correctLastChar.side_effect = mock_correctLastCharCR
    actual = util.splice_find(line, search_param1, search_param2,
                              splice_index)
    self.assertEqual(actual, expected_result)
def conv_len_conv_refr_time(log_dict, nicks, nick_same_list):
    """Calculates the conversation length (CL) — the length of time for
    which two users communicate — and the conversation refresh time (CRT).

    If a message is not replied to within the response time (RT), it is
    considered part of another conversation. For a pair of users the CRT is
    the time between one conversation ending and the next one starting.

    Args:
        log_dict (dict): dictionary of log data created using reader.py
        nicks (list): list of nicknames created using nickTracker.py
        nick_same_list (list): list of lists of nicks of the same user

    Returns:
        row_cl (list): (value, count) pairs for conversation length
        row_crt (list): (value, count) pairs for conversation refresh time
    """
    conv = []
    conv_diff = []
    # Connected components group nick clusters sharing at least one nick, so
    # all aliases of one user end up in a single cluster. E.g. clusters
    # {nick1..nick4} and {nick5, nick6, nick2, nick7} merge into one user.
    G = util.to_graph(nick_same_list)
    conn_comp_list = list(connected_components(G))
    util.create_connected_nick_list(conn_comp_list)
    # conversations[i] holds [sender, receiver, t0, t1, ...] for one user
    # pair; MAX_CONVERSATIONS bounds the number of pairs.
    conversations = [[] for i in range(config.MAX_CONVERSATIONS)]
    graphx1 = []
    graphy1 = []
    graphx2 = []
    graphy2 = []
    dateadd = -1  # day offset used for response time calculation (0-365)

    def build_conversation(rec_list, nick, send_time, nick_to_search,
                           nick_receiver, nick_sender, dateadd, conversations,
                           conn_comp_list):
        # Try every token in rec_list as a potential receiver nick.
        for names in rec_list:
            conversations, nick_receiver, send_time = conv_helper(
                names, nick, send_time, nick_to_search, nick_receiver,
                nick_sender, dateadd, conversations, conn_comp_list)
        return conversations, nick_receiver, send_time

    def conv_helper(rec, nick, send_time, nick_to_search, nick_receiver,
                    nick_sender, dateadd, conversations, conn_comp_list):
        # `line` is the current log line of the enclosing loop (closure).
        if rec == nick:
            send_time.append(line[1:6])
            if nick_to_search != nick:
                nick_receiver = util.get_nick_sen_rec(
                    len(nicks), nick, conn_comp_list, nick_receiver)
            for i in range(config.MAX_CONVERSATIONS):
                if (nick_sender in conversations[i] and
                        nick_receiver in conversations[i]):
                    conversations = conv_append(conversations, i, dateadd,
                                                line)
                    break
                if len(conversations[i]) == 0:
                    conversations[i].append(nick_sender)
                    conversations[i].append(nick_receiver)
                    conversations = conv_append(conversations, i, dateadd,
                                                line)
                    break
        return conversations, nick_receiver, send_time

    def conv_mat_diff(i, j, conversations):
        """Difference between consecutive time entries of conversation i.

        Args:
            i (int): matrix index for row
            j (int): matrix index for column
        """
        return conversations[i][j] - conversations[i][j - 1]

    def conv_append(conversations, index, dateadd, line):
        # Store absolute time in minutes since the start of the period.
        conversations[index].append(
            config.HOURS_PER_DAY * config.MINS_PER_HOUR * dateadd +
            int(line[1:6][0:2]) * config.MINS_PER_HOUR +
            int(line[1:6][3:5]))
        return conversations

    for day_content_all_channels in list(log_dict.values()):
        for day_content in day_content_all_channels:
            day_log = day_content["log_data"]
            dateadd = dateadd + 1
            send_time = []  # times at which a user messaged another user
            # build the relation map between clients; handles addressing of
            # the forms nick1:nick2, nick1,nick2 and nick1,nick2:
            for line in day_log:
                flag_comma = 0
                if util.check_if_msg_line(line):
                    nick_sender = ""
                    nick_receiver = ""
                    m = re.search(r"\<(.*?)\>", line)
                    nick_to_search = util.correctLastCharCR(m.group(0)[1:-1])
                    nick_sender = util.get_nick_sen_rec(
                        len(nicks), nick_to_search, conn_comp_list,
                        nick_sender)
                    for nick in nicks:
                        rec_list = [e.strip() for e in line.split(':')]
                        util.rec_list_splice(rec_list)
                        if not rec_list[1]:
                            break
                        rec_list = util.correct_last_char_list(rec_list)
                        conversations, nick_receiver, send_time = \
                            build_conversation(rec_list, nick, send_time,
                                               nick_to_search, nick_receiver,
                                               nick_sender, dateadd,
                                               conversations, conn_comp_list)
                        if "," in rec_list[1]:
                            flag_comma = 1
                            rec_list_2 = [e.strip()
                                          for e in rec_list[1].split(',')]
                            rec_list_2 = util.correct_last_char_list(
                                rec_list_2)
                            conversations, nick_receiver, send_time = \
                                build_conversation(rec_list_2, nick,
                                                   send_time, nick_to_search,
                                                   nick_receiver, nick_sender,
                                                   dateadd, conversations,
                                                   conn_comp_list)
                        if flag_comma == 0:
                            rec = util.splice_find(line, ">", ", ", 1)
                            conversations, nick_receiver, send_time = \
                                conv_helper(rec, nick, send_time,
                                            nick_to_search, nick_receiver,
                                            nick_sender, dateadd,
                                            conversations, conn_comp_list)

    # Drop the sender/receiver UIDs at positions 0-1; the remaining entries
    # are the message times of that user pair.
    for i in range(len(conversations)):
        if len(conversations[i]) != 0:
            del conversations[i][0:2]

    # Split each pair's time series into conversations: a gap > 9 minutes
    # (the average response time found before, see parser-RT.py; this value
    # differs per channel) ends a conversation. CLs go to conv, CRTs to
    # conv_diff.
    for i in range(len(conversations)):
        if len(conversations[i]) != 0:
            first = conversations[i][0]
            for j in range(1, len(conversations[i])):
                if conv_mat_diff(i, j, conversations) > 9:
                    conv.append(conversations[i][j - 1] - first)
                    conv_diff.append(conv_mat_diff(i, j, conversations))
                    first = conversations[i][j]
                if j == (len(conversations[i]) - 1):
                    conv.append(conversations[i][j] - first)
                    break

    def build_conv_csv(conv_list, graph_x, graph_y):
        # Histogram of conv_list values from 0 up to and including the max.
        # BUG FIX: guard against an empty list (max([]) raises ValueError)
        # and include the maximum value itself — the old range(max(...))
        # excluded it, unlike the analogous loop in response_time().
        if conv_list:
            for value in range(max(conv_list) + 1):
                graph_x.append(value)
                graph_y.append(conv_list.count(value))
        return graph_x, graph_y

    graphx1, graphy1 = build_conv_csv(conv, graphx1, graphy1)
    graphx2, graphy2 = build_conv_csv(conv_diff, graphx2, graphy2)
    # store CL and CRT values with their occurrence counts (for CDF plots)
    row_cl = list(zip(graphx1, graphy1))
    row_crt = list(zip(graphx2, graphy2))
    return row_cl, row_crt
def conv_len_conv_refr_time(log_dict, nicks, nick_same_list):
    """Calculates the conversation length (CL) — the length of time for
    which two users communicate — and the conversation refresh time (CRT).

    If a message is not replied to within the response time (RT), it is
    considered part of another conversation. For a pair of users the CRT is
    the time between one conversation ending and the next one starting.

    Args:
        log_dict (dict): dictionary of log data created using reader.py
        nicks (list): list of nicknames created using nickTracker.py
        nick_same_list (list): list of lists of nicks of the same user

    Returns:
        row_cl (zip): (value, count) pairs for conversation length
        row_crt (zip): (value, count) pairs for conversation refresh time
    """
    conv = []
    conv_diff = []
    # Connected components group nick clusters sharing at least one nick, so
    # all aliases of one user end up in a single cluster. E.g. clusters
    # {nick1..nick4} and {nick5, nick6, nick2, nick7} merge into one user.
    G = util.to_graph(nick_same_list)
    conn_comp_list = list(connected_components(G))
    util.create_connected_nick_list(conn_comp_list)
    # conversations[i] holds [sender, receiver, t0, t1, ...] for one user
    # pair; MAX_CONVERSATIONS bounds the number of pairs.
    conversations = [[] for i in range(config.MAX_CONVERSATIONS)]
    graphx1 = []
    graphy1 = []
    graphx2 = []
    graphy2 = []
    dateadd = -1  # day offset used for response time calculation (0-365)

    def build_conversation(rec_list, nick, send_time, nick_to_search,
                           nick_receiver, nick_sender, dateadd, conversations,
                           conn_comp_list):
        # Try every token in rec_list as a potential receiver nick.
        for names in rec_list:
            conversations, nick_receiver, send_time = conv_helper(
                names, nick, send_time, nick_to_search, nick_receiver,
                nick_sender, dateadd, conversations, conn_comp_list)
        return conversations, nick_receiver, send_time

    def conv_helper(rec, nick, send_time, nick_to_search, nick_receiver,
                    nick_sender, dateadd, conversations, conn_comp_list):
        # `line` is the current log line of the enclosing loop (closure).
        if rec == nick:
            send_time.append(line[1:6])
            if nick_to_search != nick:
                nick_receiver = util.get_nick_sen_rec(
                    len(nicks), nick, conn_comp_list, nick_receiver)
            for i in range(config.MAX_CONVERSATIONS):
                if (nick_sender in conversations[i] and
                        nick_receiver in conversations[i]):
                    conversations = conv_append(conversations, i, dateadd,
                                                line)
                    break
                if len(conversations[i]) == 0:
                    conversations[i].append(nick_sender)
                    conversations[i].append(nick_receiver)
                    conversations = conv_append(conversations, i, dateadd,
                                                line)
                    break
        return conversations, nick_receiver, send_time

    def conv_mat_diff(i, j, conversations):
        """Difference between consecutive time entries of conversation i.

        Args:
            i (int): matrix index for row
            j (int): matrix index for column
        """
        return conversations[i][j] - conversations[i][j - 1]

    def conv_append(conversations, index, dateadd, line):
        # Store absolute time in minutes since the start of the period.
        conversations[index].append(
            config.HOURS_PER_DAY * config.MINS_PER_HOUR * dateadd +
            int(line[1:6][0:2]) * config.MINS_PER_HOUR +
            int(line[1:6][3:5]))
        return conversations

    for day_content_all_channels in log_dict.values():
        for day_content in day_content_all_channels:
            day_log = day_content["log_data"]
            dateadd = dateadd + 1
            send_time = []  # times at which a user messaged another user
            # build the relation map between clients; handles addressing of
            # the forms nick1:nick2, nick1,nick2 and nick1,nick2:
            for line in day_log:
                flag_comma = 0
                if util.check_if_msg_line(line):
                    nick_sender = ""
                    nick_receiver = ""
                    m = re.search(r"\<(.*?)\>", line)
                    nick_to_search = util.correctLastCharCR(m.group(0)[1:-1])
                    nick_sender = util.get_nick_sen_rec(
                        len(nicks), nick_to_search, conn_comp_list,
                        nick_sender)
                    for nick in nicks:
                        rec_list = [e.strip() for e in line.split(':')]
                        util.rec_list_splice(rec_list)
                        if not rec_list[1]:
                            break
                        rec_list = util.correct_last_char_list(rec_list)
                        conversations, nick_receiver, send_time = \
                            build_conversation(rec_list, nick, send_time,
                                               nick_to_search, nick_receiver,
                                               nick_sender, dateadd,
                                               conversations, conn_comp_list)
                        if "," in rec_list[1]:
                            flag_comma = 1
                            rec_list_2 = [e.strip()
                                          for e in rec_list[1].split(',')]
                            rec_list_2 = util.correct_last_char_list(
                                rec_list_2)
                            conversations, nick_receiver, send_time = \
                                build_conversation(rec_list_2, nick,
                                                   send_time, nick_to_search,
                                                   nick_receiver, nick_sender,
                                                   dateadd, conversations,
                                                   conn_comp_list)
                        if flag_comma == 0:
                            rec = util.splice_find(line, ">", ", ", 1)
                            conversations, nick_receiver, send_time = \
                                conv_helper(rec, nick, send_time,
                                            nick_to_search, nick_receiver,
                                            nick_sender, dateadd,
                                            conversations, conn_comp_list)

    # Drop the sender/receiver UIDs at positions 0-1; the remaining entries
    # are the message times of that user pair.
    for i in range(len(conversations)):
        if len(conversations[i]) != 0:
            del conversations[i][0:2]

    # Split each pair's time series into conversations: a gap > 9 minutes
    # (the average response time found before, see parser-RT.py; this value
    # differs per channel) ends a conversation. CLs go to conv, CRTs to
    # conv_diff.
    for i in range(len(conversations)):
        if len(conversations[i]) != 0:
            first = conversations[i][0]
            for j in range(1, len(conversations[i])):
                if conv_mat_diff(i, j, conversations) > 9:
                    conv.append(conversations[i][j - 1] - first)
                    conv_diff.append(conv_mat_diff(i, j, conversations))
                    first = conversations[i][j]
                if j == (len(conversations[i]) - 1):
                    conv.append(conversations[i][j] - first)
                    break

    def build_conv_csv(conv_list, graph_x, graph_y):
        # Histogram of conv_list values from 0 up to and including the max.
        # BUG FIX: guard against an empty list (max([]) raises ValueError)
        # and include the maximum value itself — the old range(max(...))
        # excluded it, unlike the analogous loop in response_time().
        if conv_list:
            for value in range(max(conv_list) + 1):
                graph_x.append(value)
                graph_y.append(conv_list.count(value))
        return graph_x, graph_y

    graphx1, graphy1 = build_conv_csv(conv, graphx1, graphy1)
    graphx2, graphy2 = build_conv_csv(conv_diff, graphx2, graphy2)
    # store CL and CRT values with their occurrence counts (for CDF plots)
    row_cl = zip(graphx1, graphy1)
    row_crt = zip(graphx2, graphy2)
    return row_cl, row_crt
def response_time(log_dict, nicks, nick_same_list):
    """ finds the response time of a message i.e. the best guess for the
    time at which one can expect a reply for his/her message.

    Args:
        log_dict (dict): Dictionary of logs data created using reader.py
        nicks(List) : List of nickname created using nickTracker.py
        nick_same_list : List of same_nick names created using nickTracker.py

    Returns:
        rows_RT(zip List): pairs (response time in minutes, number of
            occurrences) aggregated over the whole period covered by
            log_dict.
    """
    # Group aliases of the same user into connected components so that any
    # alias can be resolved to one canonical nick.
    G = util.to_graph(nick_same_list)
    conn_comp_list = list(connected_components(G))
    util.create_connected_nick_list(conn_comp_list)

    graph_cumulative = []  # all response-time samples across every day
    graph_x_axis = []      # distinct RT values (0..max)
    graph_y_axis = []      # frequency of each RT value

    def build_mean_list(conversations, index, mean_list):
        # Copies the response-time entries of conversations[index] (which
        # start at position 2, after the sender/receiver UIDs) into mean_list.
        for j in range(2, len(conversations[index])):
            mean_list.append(conversations[index][j])
        return mean_list

    def resp_helper(rec, nick, send_time, nick_to_search, nick_receiver, nick_sender, conversations, conn_comp_list):
        # Records a message timestamp into the conversation slot shared by
        # nick_sender and nick_receiver.
        # NOTE(review): `line` is read from the enclosing loop below at call
        # time (closure), not passed as a parameter — callers must only
        # invoke this while iterating day_log.
        if(rec == nick):
            # line[1:6] is the "HH:MM" timestamp of the message — assumes the
            # log line starts with a bracketed time; TODO confirm log format.
            send_time.append(line[1:6])
            if(nick_to_search != nick):
                nick_receiver = util.get_nick_sen_rec(len(nicks), nick, conn_comp_list, nick_receiver)
            for i in range(config.MAX_RESPONSE_CONVERSATIONS):
                # Existing slot for this sender/receiver pair: append the time.
                if (nick_sender in conversations[i] and nick_receiver in conversations[i]):
                    conversations[i].append(line[1:6])
                    break
                # First empty slot: claim it with [sender, receiver, time].
                if(len(conversations[i]) == 0):
                    conversations[i].append(nick_sender)
                    conversations[i].append(nick_receiver)
                    conversations[i].append(line[1:6])
                    break
        return conversations, nick_receiver, send_time

    for day_content_all_channels in log_dict.values():
        for day_content in day_content_all_channels:
            day_log = day_content["log_data"]
            send_time = []  # list of all the times a user sends a message to another user
            meanstd_list = []
            totalmeanstd_list = []
            x_axis = []
            y_axis = []
            real_y_axis = []
            # One slot per sender/receiver pair; each slot will hold
            # [sender_uid, receiver_uid, t1, t2, ...].
            conversations = [[] for i in range(config.MAX_RESPONSE_CONVERSATIONS)]
            # code for making relation map between clients
            for line in day_log:
                flag_comma = 0
                if(util.check_if_msg_line (line)):
                    nick_sender = ""
                    nick_receiver = ""
                    # Sender nick is the text between angle brackets, e.g. "<dhruv>".
                    m = re.search(r"\<(.*?)\>", line)
                    nick_to_search = util.correctLastCharCR(m.group(0)[1:-1])
                    nick_sender = util.get_nick_sen_rec(len(nicks), nick_to_search, conn_comp_list, nick_sender)
                    for nick in nicks:
                        # Receivers may be addressed as "nick1:nick2" — split on ':'.
                        rec_list = [e.strip() for e in line.split(':')]
                        util.rec_list_splice(rec_list)
                        # Index 0 holds the time prefix; an empty index 1 means
                        # no addressed receiver on this line.
                        if not rec_list[1]:
                            break
                        rec_list = util.correct_last_char_list(rec_list)
                        for name in rec_list:
                            conversations, nick_receiver, send_time = resp_helper(name, nick, send_time, nick_to_search, nick_receiver, nick_sender, conversations, conn_comp_list)
                        # Comma-separated receivers, e.g. "<Dhruv> Rohan, Ram :".
                        if "," in rec_list[1]:
                            flag_comma = 1
                            rec_list_2 = [e.strip() for e in rec_list[1].split(',')]
                            rec_list_2 = util.correct_last_char_list(rec_list_2)
                            for name in rec_list_2:
                                conversations, nick_receiver, send_time = resp_helper(name, nick, send_time, nick_to_search, nick_receiver, nick_sender, conversations, conn_comp_list)
                        # No comma form: receiver is the word right after ">".
                        if(flag_comma == 0):
                            rec = util.splice_find(line, ">", ", ",1)
                            conversations, nick_receiver, send_time = resp_helper(rec, nick, send_time, nick_to_search, nick_receiver, nick_sender, conversations, conn_comp_list)

            # Convert the stored "HH:MM" strings (from index 2 onward) into
            # response times in minutes: entry j becomes time[j+1] - time[j].
            for i in range(config.MAX_RESPONSE_CONVERSATIONS):
                if(len(conversations[i]) != 0):
                    for j in range(2, len(conversations[i]) - 1):
                        conversations[i][j]=(int(conversations[i][j+1][0:2])*config.MINS_PER_HOUR+int(conversations[i][j+1][3:5])) - (int(conversations[i][j][0:2])*config.MINS_PER_HOUR+int(conversations[i][j][3:5]))

            for i in range(config.MAX_RESPONSE_CONVERSATIONS):
                if(len(conversations[i]) != 0):
                    if(len(conversations[i]) == 3):
                        # Single-message conversation: keep the absolute time
                        # in minutes (no pair to diff against).
                        conversations[i][2] = int(conversations[i][2][0:2])*config.MINS_PER_HOUR+int(conversations[i][2][3:5])
                    else:
                        # Last entry is still a raw "HH:MM" string (the loop
                        # above stops one short) — drop it.
                        del conversations[i][-1]  # Explanation provided in parser-CL+CRT.py

            # Pool every conversation's response times for this day.
            for i in range(config.MAX_RESPONSE_CONVERSATIONS):
                if(len(conversations[i]) != 0):
                    totalmeanstd_list = build_mean_list(conversations, i, totalmeanstd_list)

            if(len(totalmeanstd_list) != 0):
                for i in range(max(totalmeanstd_list) + 1):
                    x_axis.append(i)
                for i in x_axis:
                    # finding the probability of each RT to occur = No. of
                    # occurrence / total occurrences.
                    y_axis.append(float(totalmeanstd_list.count(i)) / float(len(totalmeanstd_list)))
                # to find cumulative just go on adding the current value to
                # previously cumulated value till sum becomes 1 for last entry.
                # NOTE(review): at i=0 this reads real_y_axis[-1] (== y_axis[0]
                # just appended), so the first probability is counted twice and
                # the list gains an extra element; real_y_axis is also never
                # used afterwards — looks like dead/buggy code, confirm before
                # removing.
                real_y_axis.append(y_axis[0])
                for i in range(len(y_axis)):
                    real_y_axis.append(float(real_y_axis[i-1]) + float(y_axis[i]))
                for i in range(len(totalmeanstd_list)):
                    graph_cumulative.append(totalmeanstd_list[i])

            if len(totalmeanstd_list) > 0:
                # Append mean and mean+2*std as summary entries.
                # NOTE(review): the second mean is computed AFTER the first
                # append, so it includes the mean itself — presumably
                # intentional to match parser-RT.py; verify.
                totalmeanstd_list.append(numpy.mean(totalmeanstd_list))
                totalmeanstd_list.append(numpy.mean(totalmeanstd_list)+2*numpy.std(totalmeanstd_list))

            # Per-conversation mean and mean+2*std appended to each slot.
            for i in range(config.MAX_RESPONSE_CONVERSATIONS):
                if(len(conversations[i]) != 0):
                    meanstd_list = build_mean_list(conversations, i, meanstd_list)
                    conversations[i].append(numpy.mean(meanstd_list))
                    conversations[i].append(numpy.mean(meanstd_list)+(2*numpy.std(meanstd_list)))
                    meanstd_list[:] = []  # reuse the scratch list per slot

    # Histogram of all RT samples over the whole period.
    graph_cumulative.sort()
    for i in range(graph_cumulative[len(graph_cumulative)-1] + 1):
        graph_y_axis.append(graph_cumulative.count(i))  # problem when ti=0 count is unexpectedly large
        graph_x_axis.append(i)
    # Finally storing the RT values along with their frequencies in a csv file.
    rows_rt = zip(graph_x_axis, graph_y_axis)
    return rows_rt
def keywords(log_dict, nicks, nick_same_list):
    """ Returns keywods for all users

    Args:
        log_dict (dict): Dictionary of logs data created using reader.py
        nicks(List) : list of nickname created using nickTracker.py
        nick_same_list : List of same_nick names created using nickTracker.py

    Returns:
        keywords_filtered: filtered keywords for user
        user_keyword_freq_dict: dictionary for each user having keywords and their frequency
        user_words_dict: keywods for user
        nicks_for_stop_words: stop words
    """
    user_words_dict = []        # [{'sender': nick, 'words': [...]}, ...]
    user_keyword_freq_dict = [] # [{'nick': nick, 'keywords': [[word, freq, norm], ...]}, ...]
    keywords_filtered = []
    no_messages = 0  # NOTE(review): incremented below but never read or returned

    def get_nick_receiver(nick_receiver, rec, nick_to_compare, nick_name, nicks, nick_same_list):
        # Resolves rec to a canonical receiver nick when it matches nick_name
        # and is not the sender; otherwise returns nick_receiver unchanged.
        if(rec == nick_name):
            if(nick_to_compare != nick_name):
                nick_receiver = iter_nicks(nick_receiver, nicks, nick_same_list, nick_name)
        return nick_receiver

    def iter_nicks(nick_sender_receiver, nicks, nick_same_list, nick_comp):
        # Maps nick_comp to the first alias in its same-nick group; if it is
        # in no group, the nick itself is used.
        for i in range(len(nicks)):
            if nick_comp in nick_same_list[i]:
                nick_sender_receiver = nick_same_list[i][0]
                break
            else:
                nick_sender_receiver = nick_comp
        return nick_sender_receiver

    for day_content_all_channels in log_dict.values():
        for day_content in day_content_all_channels:
            day_log = day_content["log_data"]
            for line in day_log:
                flag_comma = 0
                if(util.check_if_msg_line(line)):
                    # Sender is the text between angle brackets, e.g. "<dhruv>".
                    m = re.search(r"\<(.*?)\>", line)
                    nick_to_compare = util.correctLastCharCR((m.group(0)[1:-1]))
                    nick_sender = ''
                    nick_sender = iter_nicks(nick_sender, nicks, nick_same_list, nick_to_compare)

                    nick_receiver = ''
                    for nick_name in nicks:
                        rec_list = [e.strip() for e in line.split(':')]  # receiver list splited about :
                        util.rec_list_splice(rec_list)
                        if not rec_list[1]:  # index 0 will contain time 14:02
                            break
                        rec_list = util.correct_last_char_list(rec_list)
                        for rec in rec_list:
                            nick_receiver = get_nick_receiver(nick_receiver, rec, nick_to_compare, nick_name, nicks, nick_same_list)
                        if "," in rec_list[1]:  # receiver list may of the form <Dhruv> Rohan, Ram :
                            flag_comma = 1
                            rec_list_2 = [e.strip() for e in rec_list[1].split(',')]
                            rec_list_2 = util.correct_last_char_list(rec_list_2)
                            for rec in rec_list_2:
                                nick_receiver = get_nick_receiver(nick_receiver, rec, nick_to_compare, nick_name, nicks, nick_same_list)
                        if(flag_comma == 0):  # receiver list can be <Dhruv> Rohan, Hi!
                            rec = util.splice_find(line, ">", ", ", 1)
                            nick_receiver = get_nick_receiver(nick_receiver, rec, nick_to_compare, nick_name, nicks, nick_same_list)

                    # generating the words written by the sender
                    # NOTE(review): rec_list here is whatever the LAST
                    # iteration of the nicks loop (or its break) left behind —
                    # confirm this is the intended message text source.
                    message = rec_list[1:]
                    no_messages += 1
                    correctedNickReciever = util.correct_nick_for_(nick_receiver)
                    if correctedNickReciever in message:
                        message.remove(correctedNickReciever)

                    lmtzr = WordNetLemmatizer()
                    # limit word size = 3, drop numbers.
                    word_list_temp = re.sub(r'\d+', '', " ".join(re.findall(r'\w{3,}', ":".join(message).replace(","," ")))).split(" ")
                    word_list = []
                    # remove punctuations (lower-case and strip apostrophes)
                    for word in word_list_temp:
                        word = word.lower()
                        word_list.append(word.replace("'",""))
                    word_list_lemmatized = []
                    try:
                        # Lemmatize as a verb first, then with the default
                        # (noun) POS. Python 2 map returns a list here.
                        word_list_lemmatized = map(lmtzr.lemmatize, map(lambda x: lmtzr.lemmatize(x, 'v'), word_list))
                    except UnicodeDecodeError:
                        # Non-ASCII input — skip lemmatization, keep [].
                        pass

                    # Merge words into the sender's existing entry, else add a
                    # new one. fr acts as a "not found" flag.
                    fr = 1
                    for dic in user_words_dict:
                        if dic['sender'] == nick_sender:
                            dic['words'].extend(word_list_lemmatized)
                            fr = 0
                    if fr:
                        user_words_dict.append({'sender':nick_sender, 'words':word_list_lemmatized })

    # Build the stop-word list: every alias, every sender, and their
    # lower-cased variants, plus common English words without apostrophes.
    nicks_for_stop_words = []
    stop_word_without_apostrophe = []
    for l in nick_same_list:
        nicks_for_stop_words.extend(l)
    for dictonary in user_words_dict:
        nicks_for_stop_words.append(dictonary['sender'])
    nicks_for_stop_words.extend([x.lower() for x in nicks_for_stop_words])
    for words in common_english_words.words:
        stop_word_without_apostrophe.append(words.replace("'",""))
    stop_words_extended = extended_stop_words(nicks_for_stop_words, stop_word_without_apostrophe)

    count_vect = CountVectorizer(analyzer = 'word', stop_words=stop_words_extended, min_df = 1)

    # Per-user keyword frequencies: [word, count, normalised_count].
    for dictonary in user_words_dict:
        try:
            matrix = count_vect.fit_transform(dictonary['words'])
            freqs = [[word, matrix.getcol(idx).sum()] for word, idx in count_vect.vocabulary_.items()]
            keywords = sorted(freqs, key = lambda x: -x[1])
            total_freq = 0.0
            for freq_tuple in keywords:
                total_freq += freq_tuple[1]
            for freq_tuple in keywords:
                freq_tuple.append(round(freq_tuple[1]/float(total_freq), 5))
            user_keyword_freq_dict.append({'nick':dictonary['sender'], 'keywords': keywords })
        except ValueError:
            # CountVectorizer raises when every word is a stop word / empty
            # vocabulary — skip this user.
            pass

    # Keep only each user's top keywords above the configured threshold.
    for data in user_keyword_freq_dict:
        keywords, normal_scores = top_keywords_for_nick(user_keyword_freq_dict, data['nick'], config.KEYWORDS_THRESHOLD, config.KEYWORDS_MIN_WORDS)
        if config.DEBUGGER:
            print "Nick:", data['nick']
            print "Keywords with normalised score > 0.01\n", keywords
            print "Their Normal scores\n", normal_scores
            print "\n"
        if keywords:
            keywords_filtered.append({'nick': data['nick'], 'keywords': keywords})

    return keywords_filtered, user_keyword_freq_dict, user_words_dict, nicks_for_stop_words