Example #1
0
 def add_name_and_alias(cls, alias_dict, element):
     """
     Extract the screen name and its aliases from a pidgin *.xml
     buddy/contact element and record them in alias_dict.

     alias_dict maps names -> list of aliases; the key is the
     element's <name> text with any trailing '/resource' suffix
     stripped, and each <alias> child is simplified via
     util.simplify_SN before being appended.
     """
     screen_name = element.find('name').text.split('/')[0]
     simplified = [util.simplify_SN(alias_elem.text)
                   for alias_elem in element.findall('alias')]
     alias_dict[screen_name].extend(simplified)
Example #2
0
def generate_everything(sn_lists, logdir, trillian_logdirs,
                        pidgin_self_aliases=None, pidgin_buddy_aliases=None,
                        ngram_N=1, names=None):
    """
    Convenience function for aggregating across protocols/programs

    Inputs
    ------
    sn_lists : a list of things that look like 
               ('aim',['me1','me2',...],['other1','other2',...])
    logdir : where pidgin logs (if any) are stored
    trillian_logdirs : where trillian logs (if any) are stored
    pidgin_self_aliases : the result of calling PidginReader.make_my_aliases
    pidgin_buddy_aliases : the result of calling PidginReader.make_buddy_aliases
    ngram_N : 1 for unigrams, 2 for bigrams, etc.
    names : your name and the other person's name (defaults to
            ['me', 'other']; currently unused in this body -- kept for
            interface compatibility)

    Returns
    -------
    date_lists  : a list of lists of datetimes. 1 list of dates per person.
    counter_lists : a list of lists of counters. 1 list of counters per person.
    words : 2 lists, each with all words used by that person
    intervals : 2 lists of intervals, giving the spacing between words (above)
    """
    import numpy as np

    # Construct fallbacks inside the call: a mutable default argument
    # (the old names=['me','other']) is shared across all invocations.
    if names is None:
        names = ['me', 'other']
    if not pidgin_self_aliases:
        pidgin_self_aliases = collections.defaultdict(list)
    if not pidgin_buddy_aliases:
        pidgin_buddy_aliases = collections.defaultdict(list)

    readers = []
    for (protocol, mes, others) in sn_lists:
        if protocol != 'trillian':
            assert protocol in ['aim', 'jabber', 'msn'] , "Unknown protocol %s"%protocol
            # One reader per (me, other) screen-name pair; the alias lists
            # always include the screen name itself.
            for (me, other) in itertools.product(mes, others):
                reader = PidginReader(logdir,
                                      [pidgin_self_aliases[me]+[me],
                                       pidgin_buddy_aliases[other]+[other]],
                                      [me, other],
                                      ngram_N,
                                      protocol)
                reader.read_all_files()
                readers.append(reader)
        else:
            for other in others:
                for trillian_logdir in trillian_logdirs:
                    # I don't know how trillian generates its aliases, so
                    # fall back to all of `mes` plus the simplified SN.
                    reader = TrillianReader(trillian_logdir,
                                            [mes, [util.simplify_SN(other)]],
                                            [None, other],
                                            ngram_N)
                    reader.read_all_files()
                    readers.append(reader)
    combined = LogReader.combined_results(True, *readers)

    keys = ['line_dates', 'line_counters', 'words', 'word_dates']
    (date_lists, counter_lists, words, word_date_lists) = \
        [[r[key] for r in combined] for key in keys]
    # np.diff gives per-person arrays of timedeltas; materialize them as
    # plain lists of float seconds.  (Under Python 3 the previous
    # map(methodcaller('total_seconds'), ...) returned lazy iterators,
    # contradicting the docstring's promise of lists.)
    interval_tds = [np.diff(word_dates) for word_dates in word_date_lists]
    intervals = [[td.total_seconds() for td in tds] for tds in interval_tds]

    return (date_lists, counter_lists, words, intervals)
Example #3
0
    def read_line(self, line, allow_session_start=True):
        """
        Reads one line from an IM log. If it's the next IM in the sequence, adds
        the timestamp+words to the appropriate lists.

        Parameters
        ----------
        line : str
            One raw line of log text.
        allow_session_start : bool
            If True, a non-message line may still update self.previous when
            it matches self.session_start_matcher.

        Returns
        -------
        None on every path; results accumulate as side effects into the
        per-person storage dicts (self.both_lines) and into self.previous.
        """
        # my_im's groups, by use below: 1 = timestamp, 2 = sender SN,
        # 3 = message text.
        matcher = my_im.match(line)
        if matcher is None:
            if allow_session_start:
                ##### Check for session start
                # A session-start header carries a full date; remember it so
                # later message lines (which are combined with
                # self.previous.date() below) can be dated.
                session_start_match = self.session_start_matcher.match(line)
                if session_start_match:
                    self.previous = dateutil.parser.parse(
                                        session_start_match.group(1),
                                        ignoretz=True)
            return None

        sender = util.simplify_SN(matcher.group(2))
        storage = None # which person's list to store this line (IM) in
        ##### determine sender
        for (aliases, lines) in zip(self.both_aliases, self.both_lines):
            if sender in aliases:
                storage = lines
                break
        else:
            return None # didn't match either set of aliases
        ##### make sure it's not empty
        processed = util.strip_punctuation(matcher.group(3).lower()).strip()
        if processed == '':
            return None
        ##### parse timestamp. strptime works the vast majority of the time
        raw_timestamp = matcher.group(1)
        try:
            line_time = datetime.datetime.strptime(raw_timestamp,
                                                   STANDARD_TIMESTAMP)
        except ValueError:
            # Fall back to the slower, more forgiving dateutil parser.
            try:
                line_time = dateutil.parser.parse(raw_timestamp, ignoretz=True)
            except ValueError:
                return None # rare edge case from weird pastes, etc
        ##### Compute datetime from timestamp (time) and previous (date)
        # Only line_time's time-of-day is used; the date comes from the most
        # recently seen datetime (session start or prior message).
        timestamp = datetime.datetime.combine(self.previous.date(),
                                                line_time.time())
        # handle conversations that span multiple days
        if util.crossed_day(self.previous, line_time):
            timestamp += ONE_DAY

        # Reject lines that would move time backwards (out-of-order or
        # misparsed timestamps).
        if timestamp < self.previous:
            return None

        ##### Store the timestamp and word counters for the message text
        storage['line_dates'].append(timestamp)
        storage['line_counters'].append(util.make_ngram_counter(self.ngram_N, matcher.group(3)))

        words = util.strip_punctuation(matcher.group(3).lower()).split()
        # Every word in the message shares the message's timestamp.
        word_dates = [timestamp] * len(words)
        # intervals = [0] * len(words)
        # intervals[0] = (timestamp - self.previous).total_seconds()

        storage['words'].extend(words)
        storage['word_dates'].extend(word_dates)
        #storage['intervals'].extend(intervals)

        self.previous = timestamp