def writeRanks(ids):
    # Writes one 'rank' fact per email address returned by stmt; addresses
    # missing from ids default to rank '0'.
    with newman_connector() as read_cnx1, \
         newman_connector() as read_cnx, \
         newman_connector() as write_cnx:
        with execute_query(read_cnx1.conn(), stmt) as qry:
            txid = Tx(read_cnx.conn()).next()
            print "tx: %s" % txid
            facts = Fact(write_cnx.conn(), autocommit=False)
            print "assigning ranks"
            for mail in qry.cursor():
                facts.addFact(mail[0], "email_addr", "rank",
                              ids.get(mail[0], '0'), txid)
            print "commit"
            write_cnx.commit()
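# --- added sketch, not part of the original script ------------------------
# writeRanks expects ids to map email address -> rank string. A minimal,
# hypothetical driver, assuming the social graph is an igraph Graph whose
# vertices carry the address in their 'name' attribute (rankIds and g are
# illustrative names, not from the project); the rounding is arbitrary.
def rankIds(g):
    # Graph.pagerank() returns one score per vertex, in vertex order
    scores = g.pagerank()
    # keep string values, matching the '0' default used by writeRanks
    return dict((v['name'], str(round(score, 6)))
                for v, score in zip(g.vs, scores))

# writeRanks(rankIds(g))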
for subgraph in clustering.subgraphs():
    # name each community after its first node
    community_name = jsonGet(['name'], head(subgraph.vs['node']), 'n/a')
    for node in subgraph.vs['node']:
        node['community'] = community_name

# output format: NODE\tCOMMUNITY
# for node in nodes:
#     print "{}\t{}".format(node['name'], node['community'])

count = counter(1)
with newman_connector() as read_cnx, newman_connector() as write_cnx:
    txid = Tx(read_cnx.conn()).next()
    print "tx: %s" % txid
    facts = Fact(write_cnx.conn(), autocommit=False)
    print "assigning communities"
    for node in nodes:
        email_addr, community_id = node['name'], node['community']
        facts.addFact(email_addr, "email_addr", "community", community_id, txid)
        facts.addFact(email_addr, "email_addr", "group_id", next(count), txid)
    print "commit"
    write_cnx.commit()

    txid = Tx(read_cnx.conn()).next()
    print "tx: %s" % txid
    print "assign community ids"
    stmt = (
        " insert into facts (subject, schema_name, predicate, obj, tx) "
        " select f.subject, f.schema_name, 'community_id', f2.obj, %s "
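# --- added sketches, not part of the original script ----------------------
# The community-assignment block above relies on small helpers (counter,
# head, jsonGet) that this excerpt does not show. Plausible minimal
# versions; the project's own definitions may differ.
import itertools

def counter(start=0):
    # itertools.count supports both next(c) and c.next() under Python 2
    return itertools.count(start)

def head(seq):
    # first element of a sequence
    return seq[0]

def jsonGet(path, obj, default=None):
    # walk a nested dict along a list of keys, e.g. jsonGet(['name'], node)
    for key in path:
        if not isinstance(obj, dict) or key not in obj:
            return default
        obj = obj[key]
    return obj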
    " where f.schema_name = 'email'"
    " and f2.schema_name = f.schema_name"
    " and f.predicate = 'from'"
    " and f2.predicate in ('to', 'cc', 'bcc')")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Enrich Emails Communications')
    args = parser.parse_args()

    with newman_connector() as read_cnx, newman_connector() as write_cnx:
        txid = Tx(read_cnx.conn()).next()
        print "tx: %s" % txid
        print "enrich sent to recipient communications"
        facts = Fact(write_cnx.conn(), autocommit=False)
        with execute_nonquery(write_cnx.conn(), stmt_sent_to, txid) as qry:
            pass
        write_cnx.commit()

        txid = Tx(read_cnx.conn()).next()
        print "tx: %s" % txid
        print "enrich total sent to"
        with execute_query(write_cnx.conn(), stmt_total_recipients, txid) as qry:
            pass
        write_cnx.commit()

        txid = Tx(read_cnx.conn()).next()
        print "tx: %s" % txid
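# --- added sketch, not part of the original script ------------------------
# stmt_sent_to and stmt_total_recipients are referenced above but defined
# outside this excerpt. Judging from the where-clause fragment at the top
# of this file, stmt_sent_to could be a self-join of this shape; this is a
# guess, not the project's actual SQL.
stmt_sent_to = (
    " insert into facts (subject, schema_name, predicate, obj, tx) "
    " select f.obj, 'email_addr', 'sent_to', f2.obj, %s "
    " from facts f "
    " join facts f2 on f2.subject = f.subject "
    " where f.schema_name = 'email' "
    " and f2.schema_name = f.schema_name "
    " and f.predicate = 'from' "
    " and f2.predicate in ('to', 'cc', 'bcc') ")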
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Ingest Walker Email')
    parser.add_argument("input_tsv", help="input tsv file")
    args = parser.parse_args()

    headers = ["id", "threadid", "dir", "category", "datetime", "from",
               "tos", "ccs", "bccs", "subject", "body", "tosize", "ccsize",
               "attachsize", "attach", "bodysize", "location"]

    # start the counter at -1 so the header row is skipped in the count
    c = counter(-1)
    with newman_connector() as cnx:
        tx = Tx(cnx.conn()).next()
        print "tx: %s" % tx
        fact = Fact(cnx.conn(), autocommit=False)
        for line in slurpA(args.input_tsv):
            try:
                count = c.next()
                if count % 1000 == 0:
                    print "ingested count - %s" % count
                row = line.split('\t')
                # use a distinct loop variable so the counter c is not shadowed
                row = (col.strip() for col in row)
                (num, dir, category, utc_date, importance, fromemail, ip,
                 toemail, ccemail, bccemail, attach, messageid, inreplyto,
                 references, subject, body) = row
                fromemail = lower(fromemail)
                toemail = lower(toemail)
                ccemail = lower(ccemail)
                bccemail = lower(bccemail)
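# --- added sketches, not part of the original script ----------------------
# The ingest loop assumes I/O and string helpers that this excerpt does
# not show. Plausible minimal versions:
def slurpA(path):
    # yield the file one line at a time, newline stripped
    with open(path, 'rb') as f:
        for line in f:
            yield line.rstrip('\r\n')

def lower(s):
    # lowercase, tolerating None/empty values
    return s.lower() if s else s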