예제 #1
0
파일: analytics.py 프로젝트: hausdorf/vn
def edgelist_from_communications_by_subgraph(db, vertices, source='maildir',
                                             edge_attributes=None, count=50,
                                             for_networkx=True,
                                             start_date="9/24/2001",
                                             end_date="2/4/2002"):
    """Query the db for the edges of the subgraph induced by ``vertices``.

    For every vertex, fetch the communications whose 'ffrom' is that vertex
    (via ``find_communications``) and keep only those whose 'to' is also in
    ``vertices``. Each kept communication is folded into the edgelist with
    ``add_communication_to_edgelist``.

    If ``edge_attributes`` is non-empty, every message id recorded under an
    edge's 'message_ids' entry is then passed to
    ``add_attributes_to_edgelist`` so the extra attributes can be attached.

    NB: this is off the communications collection so many email-specific
    things are missing (e.g. 'body' or 'ldc_knn_topic'). To get this
    additional information, use add_attributes_to_edgelist.

    TODO: make this transparent to the API user, so we check to see which
    attributes are not available with the current return and query for the
    rest. --GAC

    [JD - is there a smarter way to do this with a single query instead of
    one to db.communications and then one to db.email?]

    Args:
        db: database handle, forwarded untouched to the query helpers.
        vertices: collection of vertex identifiers defining the subgraph.
        source: forwarded as ``source`` to the query/attribute helpers.
        edge_attributes: names of additional attributes to attach to each
            edge; ``None`` (the default) or an empty sequence skips the
            attribute lookups entirely.
        count: forwarded as ``count`` to ``find_communications``. Note that
            it applies to each per-vertex query, not to the overall result.
        for_networkx: when true, return the result of
            ``prepare_edgelist_for_nx``; otherwise return the raw edgelist
            dict.
        start_date: forwarded as ``start_date`` to ``find_communications``.
        end_date: forwarded as ``end_date`` to ``find_communications``.

    Returns:
        A weighted, attributed edgelist, ostensibly for ingestion into
        NetworkX (``for_networkx=True``), or the intermediate edgelist dict
        keyed by edge (``for_networkx=False``).
    """
    # None stands in for "no extra attributes" so that no mutable default
    # object is shared between calls.
    if edge_attributes is None:
        edge_attributes = []

    edgelist = {}

    for vertex in vertices:
        cursor = find_communications(db, source=source, ffrom=vertex,
                                     count=count, start_date=start_date,
                                     end_date=end_date)

        # Sets up the edges only, attributes are added in the next step.
        for com in cursor:
            if com['to'] in vertices:
                add_communication_to_edgelist(edgelist, com)

    # Skip the time consuming per-message lookups unless they were asked for.
    if edge_attributes:
        # Loop over the edges present in the subgraph. The keys are
        # snapshotted first so the loop stays valid even if the helper adds
        # entries to the edgelist.
        for edge_key in list(edgelist):
            for message_id in edgelist[edge_key]['message_ids']:
                add_attributes_to_edgelist(db, edgelist, edge_key, message_id,
                                           edge_attributes, source=source)

    if for_networkx:
        # Run the prep command here to be able to feed directly --GAC
        return prepare_edgelist_for_nx(edgelist)
    return edgelist
예제 #2
0
파일: demo.py 프로젝트: hausdorf/vn
from arbreapp.mailman.queries import find_emails, find_communications

### Some basics on querying

# We first initialize our database connection
# NOTE(review): init_dbconn is not imported in the visible part of this
# script -- confirm it is brought into scope elsewhere.
db = init_dbconn()

# count=0 means no limit on query size
# cursor = find_emails(source='lucene', count=0)
cursor = find_emails(db, source='maildir') # defaults to limit of 50 emails

# Collect ('to', 'ffrom') pairs, skipping records that lack either field.
senders = [(email['to'], email['ffrom'])
           for email in cursor
           if 'to' in email and 'ffrom' in email]
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print('An example list:\n%s\n' % (senders,))

### Generating the list of recipients between 1/1/2001 and 1/5/2001

# The two dates are deliberately written in different textual formats.
# NOTE(review): presumably find_communications parses both -- confirm.
start_datetext = "January 1, 2001"
end_datetext = "1/5/2001"
cursor = find_communications(db, source='maildir', start_date=start_datetext,
                             end_date=end_datetext)

early_january_recips = [email['to'] for email in cursor]
print('Recipients of email between %s and %s:' % (start_datetext, end_datetext))
print(early_january_recips)
print('')