# -*- coding: utf-8 -*-
"""Rank the addresses a given sender wrote to most often.

The sender address is taken from the first command-line argument.  Every
item returned by ``slurpA("tmp/exploded.csv")`` is split into three
tab-separated fields (unpacked as ``dt``, ``src``, ``target``); rows whose
``src`` is the sender and whose ``target`` is somebody else are counted per
target.
"""
import sys
sys.path.append("./demail")

from newman.utils.file import slurpA
from newman.utils.functions import head, last, nth

if __name__ == "__main__":
    # Fail with a usage message rather than a bare IndexError.
    if len(sys.argv) < 2:
        sys.exit("usage: %s <source-email>" % sys.argv[0])

    recipients = {}
    SourceEmail = sys.argv[1]
    lines = slurpA("tmp/exploded.csv")
    for line in lines:
        (dt, src, target) = line.strip().split('\t')
        # Only rows sent by SourceEmail to a different address are counted.
        if src != SourceEmail or target == SourceEmail:
            continue
        recipients[target] = recipients.get(target, 0) + 1

    # Highest count first, ties broken by address; keep at most 20 entries.
    ranked = sorted(recipients.items(), key=lambda x: (-x[1], x[0]))[:20]
    if not ranked:
        # Without any counted row there is no top count to divide by.
        sys.exit("no recipients found for %s" % SourceEmail)

    # NOTE(review): presumably head() yields the first (address, count) pair
    # and nth(pair, 1) its count -- confirm against newman.utils.functions.
    top = float(nth(head(ranked), 1))
    step = 1.0 / top
' FROM facts f where schema_name = "email_addr" and predicate = "community"' ) def writeRanks(ids): with newman_connector() as read_cnx1, newman_connector( ) as read_cnx, newman_connector() as write_cnx: with execute_query(read_cnx1.conn(), stmt) as qry: txid = Tx(read_cnx.conn()).next() print "tx: %s" % txid facts = Fact(write_cnx.conn(), autocommit=False) print "assigning ranks" for mail in qry.cursor(): #print mail[0] #, "email_addr", "rank", ids.get(mail,'0'), txid facts.addFact(mail[0], "email_addr", "rank", ids.get(mail[0], '0'), txid) print "commit" write_cnx.commit() if __name__ == "__main__": ids = {} lines = slurpA("tmp/rankings") for line in lines: rank, mails = line.split(':') mails = mails.strip() for mail in mails.split(','): ids[mail] = rank writeRanks(ids)
headers = [ "id", "threadid", "dir", "category", "datetime", "from", "tos", "ccs", "bccs", "subject", "body", "tosize", "ccsize", "attachsize", "attach", "bodysize", "location" ] #skip header row for counting c = counter(-1) with newman_connector() as cnx: tx = Tx(cnx.conn()).next() print "tx: %s" % tx fact = Fact(cnx.conn(), autocommit=False) for line in slurpA(args.input_tsv): try: count = c.next() if count % 1000 == 0: print "ingested count - %s " % count row = line.split('\t') row = (c.strip() for c in row) num, dir, category, utc_date, importance, fromemail, ip, toemail, ccemail, bccemail, attach, messageid, inreplyto, references, subject, body = row fromemail = lower(fromemail) toemail = lower(toemail) ccemail = lower(ccemail) bccemail = lower(bccemail) network = ''
# -*- coding: utf-8 -*-
"""Rank the addresses a given sender wrote to most often.

The sender address is taken from the first command-line argument.  Every
item returned by ``slurpA("tmp/exploded.csv")`` is split into three
tab-separated fields (unpacked as ``dt``, ``src``, ``target``); rows whose
``src`` is the sender and whose ``target`` is somebody else are counted per
target.
"""
import sys
sys.path.append("./demail")

from newman.utils.file import slurpA
from newman.utils.functions import head, last, nth

if __name__ == "__main__":
    # Fail with a usage message rather than a bare IndexError.
    if len(sys.argv) < 2:
        sys.exit("usage: %s <source-email>" % sys.argv[0])

    recipients = {}
    SourceEmail = sys.argv[1]
    lines = slurpA("tmp/exploded.csv")
    for line in lines:
        (dt, src, target) = line.strip().split('\t')
        # Only rows sent by SourceEmail to a different address are counted.
        if src != SourceEmail or target == SourceEmail:
            continue
        recipients[target] = recipients.get(target, 0) + 1

    # Highest count first, ties broken by address; keep at most 20 entries.
    ranked = sorted(recipients.items(), key=lambda x: (-x[1], x[0]))[:20]
    if not ranked:
        # Without any counted row there is no top count to divide by.
        sys.exit("no recipients found for %s" % SourceEmail)

    # NOTE(review): presumably head() yields the first (address, count) pair
    # and nth(pair, 1) its count -- confirm against newman.utils.functions.
    top = float(nth(head(ranked), 1))
    step = 1.0 / top
# Query selecting each distinct subject that has an email_addr "community" fact.
stmt = (
    'SELECT distinct f.subject '
    ' FROM facts f where schema_name = "email_addr" and predicate = "community"'
)


def writeRanks(ids):
    """Add a "rank" fact for every subject returned by ``stmt``.

    ``ids`` is looked up with each subject; subjects it does not contain
    get the rank '0'.  All facts share one transaction id and the write
    connection is committed once after the last row.
    """
    # Three separate connections: one for the query, one to obtain the
    # transaction id, one for the fact writes.
    with newman_connector() as query_cnx, \
            newman_connector() as tx_cnx, \
            newman_connector() as write_cnx:
        with execute_query(query_cnx.conn(), stmt) as result:
            txid = Tx(tx_cnx.conn()).next()
            print("tx: %s" % txid)
            facts = Fact(write_cnx.conn(), autocommit=False)
            print("assigning ranks")
            for row in result.cursor():
                subject = row[0]
                facts.addFact(subject, "email_addr", "rank",
                              ids.get(subject, '0'), txid)
            print("commit")
            write_cnx.commit()


if __name__ == "__main__":
    # Each item of tmp/rankings is split as "<rank>:<mail>,<mail>,...".
    ids = {}
    lines = slurpA("tmp/rankings")
    for entry in lines:
        rank, members = entry.split(':')
        for address in members.strip().split(','):
            ids[address] = rank
    writeRanks(ids)
parser = argparse.ArgumentParser(description='Ingest Walker Email') parser.add_argument("input_tsv", help="input of tsv file") args= parser.parse_args() headers = ["id","threadid", "dir","category","datetime","from","tos","ccs","bccs","subject","body","tosize","ccsize","attachsize","attach","bodysize","location"] #skip header row for counting c = counter(-1) with newman_connector() as cnx: tx = Tx(cnx.conn()).next() print "tx: %s" % tx fact = Fact(cnx.conn(), autocommit=False) for line in slurpA(args.input_tsv): try: count = c.next() if count % 1000 == 0: print "ingested count - %s " % count row = line.split('\t') row = (c.strip() for c in row) num,dir,category,utc_date,importance,fromemail,ip,toemail,ccemail,bccemail,attach,messageid,inreplyto,references,subject,body = row fromemail = lower(fromemail) toemail = lower(toemail) ccemail = lower(ccemail) bccemail = lower(bccemail) network = ''
" values (%s, %s, %s, %s, %s, %s)") with execute_nonquery(conn, stmt, category_id, idx, value, score, purity, docs) as qry: pass if __name__ == "__main__": parser = argparse.ArgumentParser(description='Ingest Walker Email Topics') parser.add_argument("topic_idx", help="topics index file") parser.add_argument("topic_scores", help="topic scores file") args= parser.parse_args() flush = partial(flush_buffer, "tmp/bulk_topic_score.dat") #index topic_score doc_purity percent_docs summary0 summary1 etc... #0 8.09 0.557 14.54 governor state jobs candidate rail ad gubernatorial primary election race scores_items = [line.split('\t') for line in slurpA(args.topic_idx)[1:]] scores_items = [map(lambda s: s.strip(), line) for line in scores_items] scores_items = [(i[0], i[1], i[2], i[3], " ".join(i[4:])) for i in scores_items] topics = {"topic_{0}".format(i[0]):i[1:] for i in scores_items} #topics = {"topic_{0}".format(i):v for i,v in enumerate(slurpA(args.topic_idx)) } c = counter(0) with newman_connector() as cnx: insert_topic = partial(insert_topic_category, cnx.conn(), "all") print "import topics " for k,v in topics.iteritems(): idx = k.replace("topic_", "") #score, purity, docs, summary = v.split(None, 3) score, purity, docs, summary = v
docs) as qry: pass if __name__ == "__main__": parser = argparse.ArgumentParser(description='Ingest Walker Email Topics') parser.add_argument("topic_idx", help="topics index file") parser.add_argument("topic_scores", help="topic scores file") args = parser.parse_args() flush = partial(flush_buffer, "tmp/bulk_topic_score.dat") #index topic_score doc_purity percent_docs summary0 summary1 etc... #0 8.09 0.557 14.54 governor state jobs candidate rail ad gubernatorial primary election race scores_items = [line.split('\t') for line in slurpA(args.topic_idx)[1:]] scores_items = [map(lambda s: s.strip(), line) for line in scores_items] scores_items = [(i[0], i[1], i[2], i[3], " ".join(i[4:])) for i in scores_items] topics = {"topic_{0}".format(i[0]): i[1:] for i in scores_items} #topics = {"topic_{0}".format(i):v for i,v in enumerate(slurpA(args.topic_idx)) } c = counter(0) with newman_connector() as cnx: insert_topic = partial(insert_topic_category, cnx.conn(), "all") print "import topics " for k, v in topics.iteritems(): idx = k.replace("topic_", "") #score, purity, docs, summary = v.split(None, 3)