def add_name_and_alias(cls, alias_dict, element):
    """
    Record a buddy's name and aliases from a pidgin *.xml element.

    Extracts the name (the text of the ``name`` child, truncated at the
    first '/') and the simplified text of every ``alias`` child from the
    given buddy/contact element, appending the aliases to
    ``alias_dict[name]`` (a mapping of names -> list of aliases).
    """
    buddy_name = element.find('name').text.split('/')[0]
    simplified_aliases = [util.simplify_SN(alias_elem.text)
                          for alias_elem in element.findall('alias')]
    alias_dict[buddy_name].extend(simplified_aliases)
def generate_everything(sn_lists, logdir, trillian_logdirs,
                        pidgin_self_aliases=None, pidgin_buddy_aliases=None,
                        ngram_N=1, names=('me', 'other')):
    """
    Convenience function for aggregating across protocols/programs

    Inputs
    ------
    sn_lists : a list of things that look like
               ('aim',['me1','me2',...],['other1','other2',...])
    logdir : where pidgin logs (if any) are stored
    trillian_logdirs : where trillian logs (if any) are stored
    pidgin_self_aliases : the result of calling PidginReader.make_my_aliases
    pidgin_buddy_aliases : the result of calling PidginReader.make_buddy_aliases
    ngram_N : 1 for unigrams, 2 for bigrams, etc.
    names : your name and the other person's name

    Returns
    -------
    date_lists : a list of lists of datetimes. 1 list of dates per person.
    counter_lists : a list of lists of counters. 1 list of counters per person.
    words : 2 lists, each with all words used by that person
    intervals : 2 lists of intervals, giving the spacing between words (above)
    """
    import numpy as np
    # defaultdict(list) is the idiomatic spelling of defaultdict(lambda: []).
    # (names now defaults to a tuple: the previous list default was a shared
    # mutable default argument.)
    pidgin_self_aliases = pidgin_self_aliases or collections.defaultdict(list)
    pidgin_buddy_aliases = pidgin_buddy_aliases or collections.defaultdict(list)

    readers = []
    for (protocol, mes, others) in sn_lists:
        if protocol != 'trillian':
            assert protocol in ['aim', 'jabber', 'msn'], \
                "Unknown protocol %s" % protocol
            # One reader per (me, other) screen-name pair.
            for (me, other) in itertools.product(mes, others):
                reader = PidginReader(logdir,
                                      [pidgin_self_aliases[me] + [me],
                                       pidgin_buddy_aliases[other] + [other]],
                                      [me, other], ngram_N, protocol)
                reader.read_all_files()
                readers.append(reader)
        else:
            for other in others:
                for trillian_logdir in trillian_logdirs:
                    # I don't know how trillian generates its aliases...
                    reader = TrillianReader(trillian_logdir,
                                            [mes, [util.simplify_SN(other)]],
                                            [None, other], ngram_N)
                    reader.read_all_files()
                    readers.append(reader)

    combined = LogReader.combined_results(True, *readers)
    keys = ['line_dates', 'line_counters', 'words', 'word_dates']
    (date_lists, counter_lists, words, word_date_lists) = \
        [[r[key] for r in combined] for key in keys]

    # np.diff on a sequence of datetimes yields timedeltas between
    # consecutive word timestamps.
    interval_tds = [np.diff(word_dates) for word_dates in word_date_lists]
    # List comprehension instead of map() so each entry is a real list under
    # Python 3 (map() there returns a one-shot iterator).
    intervals = [[td.total_seconds() for td in tds] for tds in interval_tds]
    return (date_lists, counter_lists, words, intervals)
def read_line(self, line, allow_session_start=True):
    """
    Reads one line from an IM log.

    If it's the next IM in the sequence, adds the timestamp+words to the
    appropriate lists. Lines that don't match the IM regex may still
    update ``self.previous`` when they are session-start markers (and
    ``allow_session_start`` is true). Returns None in every case.
    """
    matcher = my_im.match(line)
    if matcher is None:
        if allow_session_start:
            ##### Check for session start
            session_start_match = self.session_start_matcher.match(line)
            if session_start_match:
                # A session header carries a full date; it re-anchors the
                # running date used to interpret later time-only stamps.
                self.previous = dateutil.parser.parse(
                    session_start_match.group(1), ignoretz=True)
        return None

    sender = util.simplify_SN(matcher.group(2))
    storage = None  # which person's list to store this line (IM) in
    ##### determine sender
    for (aliases, lines) in zip(self.both_aliases, self.both_lines):
        if sender in aliases:
            storage = lines
            break
    else:
        return None  # didn't match either set of aliases

    ##### make sure it's not empty
    processed = util.strip_punctuation(matcher.group(3).lower()).strip()
    if processed == '':
        return None

    ##### parse timestamp. strptime works the vast majority of the time
    raw_timestamp = matcher.group(1)
    try:
        line_time = datetime.datetime.strptime(raw_timestamp,
                                               STANDARD_TIMESTAMP)
    except ValueError:
        try:
            line_time = dateutil.parser.parse(raw_timestamp, ignoretz=True)
        except ValueError:
            return None  # rare edge case from weird pastes, etc

    ##### Compute datetime from timestamp (time) and previous (date)
    timestamp = datetime.datetime.combine(self.previous.date(),
                                          line_time.time())
    # handle conversations that span multiple days
    if util.crossed_day(self.previous, line_time):
        timestamp += ONE_DAY
    if timestamp < self.previous:
        return None

    ##### Store the timestamp and word counters for the message text
    storage['line_dates'].append(timestamp)
    storage['line_counters'].append(
        util.make_ngram_counter(self.ngram_N, matcher.group(3)))
    # Reuse the already-computed punctuation-stripped text instead of calling
    # strip_punctuation a second time; str.split() with no argument ignores
    # leading/trailing whitespace, so the result is identical.
    words = processed.split()
    word_dates = [timestamp] * len(words)
    storage['words'].extend(words)
    storage['word_dates'].extend(word_dates)
    self.previous = timestamp