def edgelist_from_communications_by_subgraph(db, vertices, source='maildir',
                                             edge_attributes=None, count=50,
                                             for_networkx=True,
                                             start_date="9/24/2001",
                                             end_date="2/4/2002"):
    """Build the edgelist for the subgraph induced by ``vertices``.

    For every vertex, the communications sent by it (``ffrom=vertex``) are
    queried, and only those whose ``'to'`` value is also in ``vertices`` are
    added to the edgelist.

    NB: the data comes from the communications collection, so many
    email-specific fields are missing (e.g. 'body' or 'ldc_knn_topic').
    Requesting ``edge_attributes`` triggers one
    ``add_attributes_to_edgelist`` call per message to fetch them.

    TODO: make this transparent to the API user, so we check which
    attributes are not available with the current return and query for the
    rest. --GAC
    [JD - is there a smarter way to do this with a single query instead of
    one to db.communications and then one to db.email?]

    Args:
        db: Database handle, forwarded to the query helpers.
        vertices: Vertex identifiers. Iterated once and then used for
            membership tests on every message, so it must be re-iterable
            (a list, tuple or set -- not a generator).
        source: Forwarded to ``find_communications`` and
            ``add_attributes_to_edgelist``.
        edge_attributes: Extra attributes to attach to the edges. ``None``
            or an empty sequence skips the per-message lookups.
        count: Forwarded to each per-vertex ``find_communications`` call
            (presumably a result limit -- confirm against that helper).
        for_networkx: When true, the edgelist is passed through
            ``prepare_edgelist_for_nx`` before being returned.
        start_date: Forwarded to ``find_communications``.
        end_date: Forwarded to ``find_communications``.

    Returns:
        The result of ``prepare_edgelist_for_nx(edgelist)`` when
        ``for_networkx`` is true, otherwise the raw edgelist dict, whose
        values each carry a ``'message_ids'`` iterable.
    """
    edgelist = {}

    # Pass 1: set up the edges only; attributes are added in the next step.
    for vertex in vertices:
        cursor = find_communications(db, source=source, ffrom=vertex,
                                     count=count, start_date=start_date,
                                     end_date=end_date)
        for com in cursor:
            # Keep only messages whose recipient is inside the subgraph.
            if com['to'] in vertices:
                add_communication_to_edgelist(edgelist, com)

    # Pass 2: skip the time consuming lookups unless attributes were asked for.
    if edge_attributes:
        # Snapshot the keys: the helper receives the edgelist itself, and a
        # snapshot keeps the iteration stable on both Python 2 and Python 3.
        for edge_key in list(edgelist):
            for message_id in edgelist[edge_key]['message_ids']:
                add_attributes_to_edgelist(db, edgelist, edge_key, message_id,
                                           edge_attributes, source=source)

    if for_networkx:
        # Run the prep command here to be able to feed directly --GAC
        return prepare_edgelist_for_nx(edgelist)
    return edgelist
from arbreapp.mailman.queries import find_emails, find_communications

### Some basics on querying

# We first initialize our database connection.
# NOTE(review): init_dbconn is not imported in the visible code -- confirm it
# is brought into scope elsewhere in this file.
db = init_dbconn()

# count=0 means no limit on query size, e.g.:
#   cursor = find_emails(db, source='lucene', count=0)
cursor = find_emails(db, source='maildir')  # defaults to limit of 50 emails

# Collect (to, ffrom) pairs, skipping emails that lack either field.
senders = [(email['to'], email['ffrom'])
           for email in cursor
           if 'to' in email and 'ffrom' in email]

# Single-argument, parenthesised print produces the same output on Python 2
# and also parses on Python 3. The explicit one-tuple keeps the formatting
# correct whatever type `senders` has.
print('An example list:\n%s\n' % (senders,))

### Generating the list of recipients between 1/1/2001 and 1/5/2001
start_datetext = "January 1, 2001"
end_datetext = "1/5/2001"
cursor = find_communications(db, source='maildir',
                             start_date=start_datetext,
                             end_date=end_datetext)
early_january_recips = [email['to'] for email in cursor]
print('Recipients of email between %s and %s:' % (start_datetext, end_datetext))
print(early_january_recips)
print('')