def __init__(self, rel_id=None):
    """Initializes the object for a directed communication relationship.

    rel_id is an optional (sender address, recipient address) tuple.  When
    it is given, setRelationshipID() is called with it; when it is None the
    object is left with no relationship id and a freshly constructed
    MessageCollection.
    """
    self._db = DB()
    self._rel_id = None
    self._messages = MessageCollection.MessageCollection()
    # Identity test: None is a singleton, and "is not" cannot be fooled by
    # an overridden __eq__/__ne__ on the supplied id.
    if rel_id is not None:
        self.setRelationshipID(rel_id)
def __init__(self, rel_id=None):
    """Initializes the object for a directed communication relationship.

    rel_id is an optional (sender address, recipient address) tuple.  If it
    is supplied the relationship is loaded through setRelationshipID();
    otherwise the relationship id stays None and the message collection is
    the one just constructed.
    """
    self._db = DB()
    self._rel_id = None
    self._messages = MessageCollection.MessageCollection()
    # Compare against the None singleton by identity rather than equality.
    if rel_id is not None:
        self.setRelationshipID(rel_id)
def __init__(self, rel_id=None):
    """Initializes the object for an undirected communication relationship.

    rel_id is an optional (participant address 1, participant address 2)
    tuple.  When it is given, setRelationshipID() is called with it.
    """
    self._db = DB()
    self._rel_id = None
    # Starts empty; nothing is stored here by the constructor itself.
    self._directed_relationships = {}
    self._all_messages = MessageCollection.MessageCollection()
    self._thread_factory = MessageThreading.MessageThreadFactory()
    # No threads have been computed at construction time.
    self._threads = None
    # Compare against the None singleton by identity rather than equality.
    if rel_id is not None:
        self.setRelationshipID(rel_id)
def __init__(self, ego_address):
    """Initializes the ego network for the given ego email address.

    The alter list is the de-duplicated combination of the addresses
    returned by getSenders() and getRecipients().
    """
    self._db = DB()
    self._ego_address = ego_address
    contacts = self.getSenders()
    contacts += self.getRecipients()
    # Passing through a set drops addresses that appear in both lists.
    self._alters = list(set(contacts))
class CommEgoNetwork(object):
    """Ego-centred view of the email network for a single ego address.

    The alters are every address that sent email to the ego or received
    email from the ego, as reported by the DB object.
    """

    def __init__(self, ego_address):
        """Initializes the ego network for the given ego email address."""
        self._db = DB()
        self._ego_address = ego_address
        self._alters = self._load_alters()

    def _load_alters(self):
        """Returns the de-duplicated senders and recipients of the current ego."""
        contacts = self.getSenders()
        contacts.extend(self.getRecipients())
        # Passing through a set drops addresses that appear in both lists.
        return list(set(contacts))

    def fullyObserved(self):
        """Returns True if the ego network is fully observed."""
        return self._db.fullyObserved(self._ego_address)

    def setEgoAddress(self, ego_address):
        """Sets the ego network email address to the address given.

        The alter list is rebuilt for the new ego so that getAlters() and
        getCommRelationships() never pair the new ego with the previous
        ego's alters.
        """
        self._ego_address = ego_address
        self._alters = self._load_alters()

    def getSenders(self):
        """Returns a list of email addresses that sent email to the ego."""
        sender_tups = self._db.getSendersForRecipient(self._ego_address)
        # The address is the first element of each returned tuple.
        return [t[0] for t in sender_tups]

    def getRecipients(self):
        """Returns a list of email addresses that received email from the ego."""
        recip_tups = self._db.getRecipientsForSender(self._ego_address)
        # The address is the first element of each returned tuple.
        return [t[0] for t in recip_tups]

    def getAlters(self):
        """Returns a list of all email addresses that exchanged email with the ego."""
        return self._alters

    def getCommRelationships(self):
        """Returns a list of all communication relationships involving the ego.

        One CommRelationship is built per alter, with the id
        (alter address, ego address).
        """
        return [
            CommRelationship.CommRelationship((a, self._ego_address))
            for a in self._alters
        ]

    def getRelationshipEmailCounts(self):
        """Returns a dictionary containing relationship email count data.

        The top level dictionary has two keys: 'ego' and 'data'.  The value
        associated with 'ego' is the ego's email address.  The value
        associated with 'data' is a list of dictionaries, one per
        communication relationship in the ego network.

        Each relationship dictionary has two keys: 'alter' and 'counts'.
        'alter' is the alter's email address.  'counts' is the list
            [# of messages sent by the ego,
             # of direct messages sent by the ego,
             # of indirect messages sent by the ego,
             # of threaded messages sent by the ego,
             # of messages sent by the alter,
             # of direct messages sent by the alter,
             # of indirect messages sent by the alter,
             # of threaded messages sent by the alter].
        """
        ego = self._ego_address
        ego_data = {'ego': ego, 'data': []}
        for rel in self.getCommRelationships():
            # The alter is whichever participant of the relationship id is
            # not in the ego's position.
            rel_id = rel.getRelationshipID()
            alter = rel_id[1] if rel_id[0] == ego else rel_id[0]

            # Same four counters for the ego first, then for the alter.
            counts = []
            for sender in (ego, alter):
                counts.extend([
                    rel.getNumberOfMsgsFromSender(sender),
                    rel.getNumberOfDirectMsgsFromSender(sender),
                    rel.getNumberOfIndirectMsgsFromSender(sender),
                    rel.getNumberOfThreadedMsgsFromSender(sender),
                ])
            ego_data['data'].append({'alter': alter, 'counts': counts})
        return ego_data

    def getNumberOfThreadedMsgs(self):
        """Returns the total number of threaded messages exchanged in the ego network.

        Message ids are collected in a set, so a message that shows up in
        several threads or relationships is counted once.
        """
        mids = set()
        for rel in self.getCommRelationships():
            for thread in rel.getConversationThreads():
                # In-place update avoids building a new set per thread.
                mids.update(thread.getMessageIDs())
        return len(mids)
class DirectedCommRelationship(object):
    """The set of messages sent from one address to another.

    The relationship id is a (sender address, recipient address) tuple.
    """

    def __init__(self, rel_id=None):
        """Initializes the object for the directed communication relationship
        rel_id = (sender address, recipient address) if it is specified.
        """
        self._db = DB()
        self._rel_id = None
        self._messages = MessageCollection.MessageCollection()
        if rel_id is not None:
            self.setRelationshipID(rel_id)

    def __str__(self):
        """Creates a display string for the relationship id."""
        return str(self._rel_id)

    def setRelationshipID(self, rel_id):
        """Sets the message collection to those messages associated with the
        given directed relationship.
        """
        self._rel_id = rel_id

        # List of (message id, epoch secs) tuples for the directed relationship
        tlist = self._db.getDirectedCommRelationship(rel_id)

        # None signals that there is no such relationship.
        # NOTE(review): in that case the message ids loaded earlier are left
        # untouched - confirm that is intended when an object is reused.
        if tlist is not None:
            mids = [tup[0] for tup in tlist]
            self._messages.setMessageIDs(mids)

    def getRelationshipID(self):
        """Returns the relationship ID for the directed relationship."""
        return self._rel_id

    def getAllMessages(self):
        """Returns the message collection associated with the directed relationship."""
        return self._messages

    def _check_time_interval(self, time_interval):
        """Raises ValueError if a time interval is given and it is not valid."""
        if time_interval is not None and not valid_time_interval(time_interval):
            raise ValueError("The specificed time interval is not valid!")

    def _messages_within(self, time_interval):
        """Yields the messages sent within time_interval (all messages if None)."""
        for msg in self._messages:
            if time_interval is None or within_time_interval(
                    msg.Datetime, time_interval):
                yield msg

    def getSenderTokens(self, time_interval=None):
        """Returns a list of all sender tokens from the set of messages in the
        directed relationship.

        If a time interval is specified through a tuple of datetimes
        (interval_begin, interval_end), only the messages sent within the
        interval contribute tokens.  Raises ValueError for an invalid
        interval.
        """
        self._check_time_interval(time_interval)
        tokens = []
        for msg in self._messages_within(time_interval):
            tokens.extend(msg.getSenderTokens())
        return tokens

    def getNumberOfMsgs(self, time_interval=None):
        """Returns the number of messages associated with the directed
        relationship.

        If a time interval is specified through a tuple of datetimes
        (interval_begin, interval_end), the count is restricted to that
        interval.  The counting is delegated to the message collection.
        """
        return self._messages.getNumberOfMsgs(time_interval)

    def getNumberOfDirectMsgs(self, time_interval=None):
        """Returns the number of messages where the recipient is listed in the
        TO field.

        If a time interval is specified through a tuple of datetimes
        (interval_begin, interval_end), only messages sent within it are
        counted.  Returns 0 when no relationship id has been set.  Raises
        ValueError for an invalid interval.
        """
        self._check_time_interval(time_interval)
        # Without a relationship id there is no recipient to look for.
        if self._rel_id is None:
            return 0
        recip = self._rel_id[1]
        return sum(
            1 for msg in self._messages_within(time_interval)
            if recip in msg.TO)

    def getNumberOfIndirectMsgs(self, time_interval=None):
        """Returns the number of messages where the recipient is listed in
        either the CC or BCC field.

        If the recipient is listed in the TO field as well, the TO field
        takes precedence and the message is not counted here.  If a time
        interval is specified through a tuple of datetimes
        (interval_begin, interval_end), only messages sent within it are
        counted.  Returns 0 when no relationship id has been set.  Raises
        ValueError for an invalid interval.
        """
        self._check_time_interval(time_interval)
        # Without a relationship id there is no recipient to look for.
        if self._rel_id is None:
            return 0
        recip = self._rel_id[1]
        return sum(
            1 for msg in self._messages_within(time_interval)
            if recip not in msg.TO and (recip in msg.CC or recip in msg.BCC))
def __init__(self, ego_address):
    """Initializes the ego network for the given ego email address.

    The alters are the addresses returned by getSenders() followed by
    those returned by getRecipients(), with duplicates removed.
    """
    self._db = DB()
    self._ego_address = ego_address
    known_addresses = self.getSenders()
    known_addresses += self.getRecipients()
    # An address can be both a sender and a recipient; keep it once.
    self._alters = list(set(known_addresses))
class CommEgoNetwork(object):
    """Ego-centred view of the email network for a single ego address.

    The alters are every address that sent email to the ego or received
    email from the ego, as reported by the DB object.
    """

    def __init__(self, ego_address):
        """Initializes the ego network for the given ego email address."""
        self._db = DB()
        self._ego_address = ego_address
        self._alters = self._load_alters()

    def _load_alters(self):
        """Returns the de-duplicated senders and recipients of the current ego."""
        contacts = self.getSenders()
        contacts.extend(self.getRecipients())
        # Passing through a set drops addresses that appear in both lists.
        return list(set(contacts))

    def fullyObserved(self):
        """Returns True if the ego network is fully observed."""
        return self._db.fullyObserved(self._ego_address)

    def setEgoAddress(self, ego_address):
        """Sets the ego network email address to the address given.

        The alter list is rebuilt for the new ego so that getAlters() and
        getCommRelationships() never pair the new ego with the previous
        ego's alters.
        """
        self._ego_address = ego_address
        self._alters = self._load_alters()

    def getSenders(self):
        """Returns a list of email addresses that sent email to the ego."""
        sender_tups = self._db.getSendersForRecipient(self._ego_address)
        # The address is the first element of each returned tuple.
        return [t[0] for t in sender_tups]

    def getRecipients(self):
        """Returns a list of email addresses that received email from the ego."""
        recip_tups = self._db.getRecipientsForSender(self._ego_address)
        # The address is the first element of each returned tuple.
        return [t[0] for t in recip_tups]

    def getAlters(self):
        """Returns a list of all email addresses that exchanged email with the ego."""
        return self._alters

    def getCommRelationships(self):
        """Returns a list of all communication relationships involving the ego.

        One CommRelationship is built per alter, with the id
        (alter address, ego address).
        """
        return [
            CommRelationship.CommRelationship((a, self._ego_address))
            for a in self._alters
        ]

    def getRelationshipEmailCounts(self):
        """Returns a dictionary containing relationship email count data.

        The top level dictionary has two keys: 'ego' and 'data'.  The value
        associated with 'ego' is the ego's email address.  The value
        associated with 'data' is a list of dictionaries, one per
        communication relationship in the ego network.

        Each relationship dictionary has two keys: 'alter' and 'counts'.
        'alter' is the alter's email address.  'counts' is the list
            [# of messages sent by the ego,
             # of direct messages sent by the ego,
             # of indirect messages sent by the ego,
             # of threaded messages sent by the ego,
             # of messages sent by the alter,
             # of direct messages sent by the alter,
             # of indirect messages sent by the alter,
             # of threaded messages sent by the alter].
        """
        ego = self._ego_address
        ego_data = {'ego': ego, 'data': []}
        for rel in self.getCommRelationships():
            # The alter is whichever participant of the relationship id is
            # not in the ego's position.
            rel_id = rel.getRelationshipID()
            alter = rel_id[1] if rel_id[0] == ego else rel_id[0]

            # Same four counters for the ego first, then for the alter.
            counts = []
            for sender in (ego, alter):
                counts.extend([
                    rel.getNumberOfMsgsFromSender(sender),
                    rel.getNumberOfDirectMsgsFromSender(sender),
                    rel.getNumberOfIndirectMsgsFromSender(sender),
                    rel.getNumberOfThreadedMsgsFromSender(sender),
                ])
            ego_data['data'].append({'alter': alter, 'counts': counts})
        return ego_data

    def getNumberOfThreadedMsgs(self):
        """Returns the total number of threaded messages exchanged in the ego network.

        Message ids are collected in a set, so a message that shows up in
        several threads or relationships is counted once.
        """
        mids = set()
        for rel in self.getCommRelationships():
            for thread in rel.getConversationThreads():
                # In-place update avoids building a new set per thread.
                mids.update(thread.getMessageIDs())
        return len(mids)
class DirectedCommRelationship(object):
    """The set of messages sent from one address to another.

    The relationship id is a (sender address, recipient address) tuple.
    """

    def __init__(self, rel_id=None):
        """Initializes the object for the directed communication relationship
        rel_id = (sender address, recipient address) if it is specified.
        """
        self._db = DB()
        self._rel_id = None
        self._messages = MessageCollection.MessageCollection()
        if rel_id is not None:
            self.setRelationshipID(rel_id)

    def __str__(self):
        """Creates a display string for the relationship id."""
        return str(self._rel_id)

    def setRelationshipID(self, rel_id):
        """Sets the message collection to those messages associated with the
        given directed relationship.
        """
        self._rel_id = rel_id

        # List of (message id, epoch secs) tuples for the directed relationship
        tlist = self._db.getDirectedCommRelationship(rel_id)

        # None signals that there is no such relationship.
        # NOTE(review): in that case the message ids loaded earlier are left
        # untouched - confirm that is intended when an object is reused.
        if tlist is not None:
            mids = [tup[0] for tup in tlist]
            self._messages.setMessageIDs(mids)

    def getRelationshipID(self):
        """Returns the relationship ID for the directed relationship."""
        return self._rel_id

    def getAllMessages(self):
        """Returns the message collection associated with the directed relationship."""
        return self._messages

    def _check_time_interval(self, time_interval):
        """Raises ValueError if a time interval is given and it is not valid."""
        if time_interval is not None and not valid_time_interval(time_interval):
            raise ValueError("The specificed time interval is not valid!")

    def _messages_within(self, time_interval):
        """Yields the messages sent within time_interval (all messages if None)."""
        for msg in self._messages:
            if time_interval is None or within_time_interval(
                    msg.Datetime, time_interval):
                yield msg

    def getSenderTokens(self, time_interval=None):
        """Returns a list of all sender tokens from the set of messages in the
        directed relationship.

        If a time interval is specified through a tuple of datetimes
        (interval_begin, interval_end), only the messages sent within the
        interval contribute tokens.  Raises ValueError for an invalid
        interval.
        """
        self._check_time_interval(time_interval)
        tokens = []
        for msg in self._messages_within(time_interval):
            tokens.extend(msg.getSenderTokens())
        return tokens

    def getNumberOfMsgs(self, time_interval=None):
        """Returns the number of messages associated with the directed
        relationship.

        If a time interval is specified through a tuple of datetimes
        (interval_begin, interval_end), the count is restricted to that
        interval.  The counting is delegated to the message collection.
        """
        return self._messages.getNumberOfMsgs(time_interval)

    def getNumberOfDirectMsgs(self, time_interval=None):
        """Returns the number of messages where the recipient is listed in the
        TO field.

        If a time interval is specified through a tuple of datetimes
        (interval_begin, interval_end), only messages sent within it are
        counted.  Returns 0 when no relationship id has been set.  Raises
        ValueError for an invalid interval.
        """
        self._check_time_interval(time_interval)
        # Without a relationship id there is no recipient to look for.
        if self._rel_id is None:
            return 0
        recip = self._rel_id[1]
        return sum(
            1 for msg in self._messages_within(time_interval)
            if recip in msg.TO)

    def getNumberOfIndirectMsgs(self, time_interval=None):
        """Returns the number of messages where the recipient is listed in
        either the CC or BCC field.

        If the recipient is listed in the TO field as well, the TO field
        takes precedence and the message is not counted here.  If a time
        interval is specified through a tuple of datetimes
        (interval_begin, interval_end), only messages sent within it are
        counted.  Returns 0 when no relationship id has been set.  Raises
        ValueError for an invalid interval.
        """
        self._check_time_interval(time_interval)
        # Without a relationship id there is no recipient to look for.
        if self._rel_id is None:
            return 0
        recip = self._rel_id[1]
        return sum(
            1 for msg in self._messages_within(time_interval)
            if recip not in msg.TO and (recip in msg.CC or recip in msg.BCC))
def __init__(self, message_id=None):
    """Initializes the message object.

    When message_id is given, setMessageID() is called with it; when it is
    None only the database handle is created.
    """
    self._db = DB()
    # Identity test against the None singleton, so any non-None id
    # (including ones that compare equal to None) is loaded.
    if message_id is not None:
        self.setMessageID(message_id)
class Message(object):
    """A single email message whose attributes are loaded from the database."""

    def __init__(self, message_id=None):
        """Initializes the message, loading its attributes if message_id is given."""
        self._db = DB()
        if message_id is not None:
            self.setMessageID(message_id)

    def __str__(self):
        """Creates a display string for printing the specified message attributes.

        One 'name : value' line is produced per property, in a fixed order.
        Properties that are not set on the object are shown as
        'not defined'; list values are shown comma separated.
        """
        # Define the property order for display
        disp_props = ['MessageID', 'Datetime', 'EpochSecs', 'Sender', 'TO',
                      'CC', 'BCC', 'Subject', 'Body']

        lines = []
        for prop in disp_props:
            attr = getattr(self, prop, 'not defined')
            if isinstance(attr, list):
                attr = ', '.join(str(item) for item in attr)
            lines.append(prop + ' : ' + str(attr) + '\n')
        return ''.join(lines)

    def setMessageID(self, message_id):
        """Sets the message object attributes to those returned by db.getMessage().

        Every key/value pair of the returned mapping becomes an attribute
        of this object.
        """
        msg = self._db.getMessage(message_id)
        for k, v in msg.items():
            setattr(self, k, v)

    def _trim_at_first_substring(self, sub, s):
        """Finds the first occurrence of sub in s.  If sub is present, s is
        trimmed at the starting location of sub and returned; otherwise s is
        returned unchanged.
        """
        idx = s.find(sub)
        if idx > -1:
            s = s[:idx]
        return s

    def getSenderText(self):
        """Returns the filtered message body with text from previous messages removed."""
        body = self.Body

        # The following are heuristics for identifying sender text in the
        # Enron email corpus.  Each marker cuts the body at its first
        # occurrence; the order of the markers is preserved.
        markers = [
            # Original message text
            '-----Original Message-----',
            # Forwarded message text
            '---------------------- Forwarded by',
            'From:',
            'To:',
            # Meeting text
            '-----Original Appointment-----',
            # BlackBerry signature
            '--------------------------\nSent from my BlackBerry Wireless Handheld',
        ]
        for marker in markers:
            body = self._trim_at_first_substring(marker, body)

        # remove random =20 entries in the message body
        body = re.sub(r'=20', '', body)

        # remove random = that appear in the middle, at the beginning and at
        # the end of words
        body = re.sub(r'\b=\b', '', body)
        body = re.sub(r'=\b', '', body)
        body = re.sub(r'\b=', '', body)
        return body

    def getSenderTokens(self, lower=True):
        """Returns a list of tokens derived from the sender's text in the
        message body.  If lower = True, the tokens will be returned in all
        lowercase.
        """
        # Extracts sequences with <one or more letters>'<one or more letters>
        # OR <one or more letters>.  The pattern deliberately has no capturing
        # groups: with groups, findall-based tokenizers return tuples of
        # groups instead of the matched strings.
        regexp = r"[a-zA-Z]+'[a-zA-Z]+|[a-zA-Z]+"
        tokens = re.findall(regexp, self.getSenderText())

        # A list comprehension keeps the documented list return type on
        # every Python version (map() is lazy on Python 3).
        if lower:
            tokens = [tok.lower() for tok in tokens]
        return tokens