def extract_thread():
    """Run ./bin/ingest.sh as a subprocess, sending its stdout/stderr to
    ingester_log and recording progress in service_status_log.

    Reads ingest_id, ingest_parent_dir, ingest_file, type, ingester_log,
    service_status_log and base_dir from the enclosing scope.
    """
    try:
        args = ["./bin/ingest.sh", ingest_id, ingest_parent_dir,
                ingest_file, type]
        cherrypy.log("running pst: {}".format(" ".join(args)))
        spit(service_status_log, "[Running] {} \n".format(" ".join(args)))
        with open(ingester_log, 'w') as t:
            kwargs = {'stdout': t, 'stderr': t, 'cwd': base_dir,
                      'bufsize': 1}
            subp = subprocess.Popen(args, **kwargs)
            out, err = subp.communicate()
        # TODO should never see this line - remove this
        cherrypy.log("complete: {}".format(fmtNow()))
        rtn = subp.returncode
        if rtn != 0:
            spit(service_status_log,
                 "[Error] return with non-zero code: {} \n".format(rtn))
        else:
            spit(service_status_log,
                 "[Done Ingesting data. Reloading the email_addr cache.]")
            initialize_email_addr_cache(ingest_id, update=True)
            spit(service_status_log, "[Complete.]")
    except Exception:
        # BUG FIX: sys.exc_info()[0] is the exception *class*; calling
        # .replace() on it raised AttributeError and masked the real error.
        # Use the exception value, stringified, instead.
        error_info = str(sys.exc_info()[1]).replace('\n', ' ')
        spit(service_status_log, "[Error] {}\n".format(error_info))
def download(data):
    """Start a background IMAP download for the requested user.

    data: dict-like request payload with keys "user" (required), "pass",
    and optional "limit" (default "2000", parsed as int by the worker).
    Returns {"id": user} as JSON, or HTTP 400 when "user" is missing.
    """
    user = data.get("user")
    if not user:
        return tangelo.HTTPStatusCode(400,
                                      "invalid service call missing user")
    passwd = data.get("pass")
    limit = data.get("limit", "2000")
    logfile = "{}/{}.log".format(work_dir, user)
    spit(logfile, "[Start] {}\n".format(user), True)
    cherrypy.log("logfile: {}".format(logfile))

    def download_thread():
        # BUG FIX: session must exist before the try/finally below;
        # previously a failed login left it unbound and the finally
        # clause raised NameError.
        session = None
        try:
            cherrypy.log("Thread Start User: {}".format(user))
            try:
                session = newman_email.login(user, passwd, logfile)
                fldr = "{}/emails/{}".format(webroot, user)
                cherrypy.log("Login User: {}".format(user))
                if os.path.exists(fldr):
                    rmrf(fldr)
                mkdir(fldr)
                spit("{}/output.csv".format(fldr),
                     newman_email.headerrow() + "\n")
                mkdir(fldr + "/emails")
                newman_email.download(session, user, fldr, int(limit),
                                      logfile)
                spit(logfile, "[Completed Download] {}\n".format(user))
            except Exception as ex:
                spit(logfile, "[Error] {}\n".format(ex))
                cherrypy.log("[Error] {}\n".format(ex))
            finally:
                if session is not None:
                    newman_email.close_session(session)
        except Exception:
            # BUG FIX: use the exception *value* (sys.exc_info()[1]),
            # not the class, so .replace()/log receive a string.
            error_info = str(sys.exc_info()[1]).replace('\n', ' ')
            cherrypy.log(error_info)
            spit(logfile, "[Error] {}\n".format(error_info))

    thr = threading.Thread(target=download_thread, args=())
    thr.start()
    tangelo.content_type("application/json")
    return {"id": user}
def download(data):
    """Start a background IMAP download for the requested user.

    data: dict-like request payload with keys "user" (required), "pass",
    and optional "limit" (default "2000", parsed as int by the worker).
    Returns {"id": user} as JSON, or HTTP 400 when "user" is missing.
    """
    user = data.get("user")
    if not user:
        return tangelo.HTTPStatusCode(400,
                                      "invalid service call missing user")
    passwd = data.get("pass")
    limit = data.get("limit", "2000")
    logfile = "{}/{}.log".format(work_dir, user)
    spit(logfile, "[Start] {}\n".format(user), True)
    cherrypy.log("logfile: {}".format(logfile))

    def download_thread():
        # BUG FIX: session must exist before the try/finally below;
        # previously a failed login left it unbound and the finally
        # clause raised NameError.
        session = None
        try:
            cherrypy.log("Thread Start User: {}".format(user))
            try:
                session = newman_email.login(user, passwd, logfile)
                fldr = "{}/emails/{}".format(webroot, user)
                cherrypy.log("Login User: {}".format(user))
                if os.path.exists(fldr):
                    rmrf(fldr)
                mkdir(fldr)
                spit("{}/output.csv".format(fldr),
                     newman_email.headerrow() + "\n")
                mkdir(fldr + "/emails")
                newman_email.download(session, user, fldr, int(limit),
                                      logfile)
                spit(logfile, "[Completed Download] {}\n".format(user))
            except Exception as ex:
                spit(logfile, "[Error] {}\n".format(ex))
                cherrypy.log("[Error] {}\n".format(ex))
            finally:
                if session is not None:
                    newman_email.close_session(session)
        except Exception:
            # BUG FIX: use the exception *value* (sys.exc_info()[1]),
            # not the class, so .replace()/log receive a string.
            error_info = str(sys.exc_info()[1]).replace('\n', ' ')
            cherrypy.log(error_info)
            spit(logfile, "[Error] {}\n".format(error_info))

    thr = threading.Thread(target=download_thread, args=())
    thr.start()
    tangelo.content_type("application/json")
    return {"id": user}
def extract_pst(*args, **kwargs):
    """Kick off a background ingest of an uploaded file.

    kwargs: "ingest-id" (required), "file" (required), "type" (default
    "pst").  Returns {"log": <logname>} naming the status log; raises
    TypeError when a required argument is missing.
    """
    cherrypy.log("search.extract_pst(kwargs[%s] %s)" %
                 (len(kwargs), str(kwargs)))
    ingest_id = kwargs.get("ingest-id")
    ingest_file = kwargs.get("file")
    type = kwargs.get("type", "pst")
    # path = "{}/{}".format(ingest_parent_dir, type)
    if not ingest_id or not type or not ingest_file:
        raise TypeError(
            "Encountered a 'None' value for 'email', 'type', or 'ingest_file!'")
    # Add the prefix for the newman indexes
    ingest_id = index_prefix + ingest_id
    logname = "pst_{}".format(fmtNow())
    ingester_log = "{}/{}.ingester.log".format(work_dir, logname)
    # errfile = "{}/{}.err.log".format(work_dir, logname)
    service_status_log = "{}/{}.status.log".format(work_dir, logname)
    spit(service_status_log,
         "[Start] email address={}\n".format(ingest_id), True)

    def extract_thread():
        """Worker: run ./bin/ingest.sh and log status/result."""
        try:
            args = ["./bin/ingest.sh", ingest_id, ingest_parent_dir,
                    ingest_file, type]
            cherrypy.log("running pst: {}".format(" ".join(args)))
            spit(service_status_log,
                 "[Running] {} \n".format(" ".join(args)))
            with open(ingester_log, 'w') as t:
                kwargs = {'stdout': t, 'stderr': t, 'cwd': base_dir,
                          'bufsize': 1}
                subp = subprocess.Popen(args, **kwargs)
                out, err = subp.communicate()
            # TODO should never see this line - remove this
            cherrypy.log("complete: {}".format(fmtNow()))
            rtn = subp.returncode
            if rtn != 0:
                spit(service_status_log,
                     "[Error] return with non-zero code: {} \n".format(rtn))
            else:
                spit(service_status_log,
                     "[Done Ingesting data. Reloading the email_addr cache.]")
                initialize_email_addr_cache(ingest_id, update=True)
                spit(service_status_log, "[Complete.]")
        except Exception:
            # BUG FIX: sys.exc_info()[0] is the exception *class*;
            # .replace() on it raised AttributeError.  Use the value.
            error_info = str(sys.exc_info()[1]).replace('\n', ' ')
            spit(service_status_log, "[Error] {}\n".format(error_info))

    thr = threading.Thread(target=extract_thread, args=())
    thr.start()
    tangelo.content_type("application/json")
    return {'log': logname}
def login(username, passwd, log):
    """Log in to Gmail over IMAP/SSL and return the authenticated session.

    Logs failures via cherrypy and to *log* (via spit) and raises a plain
    Exception on any failure.

    NOTE(review): the original string literals here were scrubbed to
    ``"Failed login:"******`` (a credentials-redaction artifact), leaving
    the function syntactically invalid.  The messages below are
    reconstructed from the surrounding format arguments — confirm against
    upstream history.
    """
    try:
        _session = imaplib.IMAP4_SSL('imap.gmail.com')
        resp, account = _session.login(username, passwd)
        cherrypy.log('login {}'.format(resp))
        if resp != 'OK':
            msg = "Failed login: {} {}".format(resp, account)
            cherrypy.log(msg)
            spit(log, "[Error] {}\n".format(msg))
            raise Exception(msg)
        return _session
    except imaplib.IMAP4.error:
        cherrypy.log("Failed login: {}".format(username))
        spit(log, "[Error] {}\n".format(
            "Failed login: {}".format(username)))
        raise Exception("Exception Failed login: {}".format(username))
def extract_thread():
    """Run ./bin/pstextract.sh, teeing stdout to teefile and stderr to
    errfile, and recording status in logfile.

    Reads email, pst_path, teefile, errfile, logfile and base_dir from
    the enclosing scope.
    """
    args = ["./bin/pstextract.sh", email, pst_path]
    cherrypy.log("running pst: {}".format(" ".join(args)))
    spit(logfile, "[Running] {} \n".format(" ".join(args)))
    try:
        with open(teefile, 'w') as t, open(errfile, 'w') as e:
            kwargs = {'stdout': t, 'stderr': e, 'cwd': base_dir,
                      'bufsize': 1}
            subp = subprocess.Popen(args, **kwargs)
            out, err = subp.communicate()
        cherrypy.log("complete: {}".format(fmtNow()))
        rtn = subp.returncode
        if rtn != 0:
            spit(logfile,
                 "[Error] return with non-zero code: {} \n".format(rtn))
        else:
            spit(logfile, "[Complete]")
    except Exception:
        # BUG FIX: sys.exc_info()[0] is the exception *class*; logging it
        # and calling .replace() on it raised AttributeError.  Use the
        # stringified exception value.
        error_info = str(sys.exc_info()[1]).replace('\n', ' ')
        cherrypy.log(error_info)
        spit(logfile, "[Error] {}\n".format(error_info))
def extract(email_id, buff_mail, out_dir, categories, target_email):
    """Parse one raw RFC-2822 message and persist it.

    Writes <out_dir>/emails/<email_id>/<email_id>.eml (raw), saves each
    attachment part with a sanitized file name, writes the concatenated
    text/plain payloads to <email_id>.txt, and returns the CSV row built
    by createRow().
    """
    _dir = "{}/emails/{}".format(out_dir, email_id)
    mkdirp(_dir)
    #write raw email to new dir
    spit("{}/{}.eml".format(_dir, email_id), buff_mail)
    mail = email.message_from_string(buff_mail)
    attach = []
    msg = ""
    attach_count = counter()
    for part in mail.walk():
        if part.get_content_type() == 'text/plain':
            msg = msg + "\n" + part.get_payload()
        if part.get_content_type() == 'message/delivery-status':
            continue
        if part.get_content_maintype() == 'multipart':
            continue
        if part.get('Content-Disposition') is None:
            continue
        fileName = part.get_filename()
        fileName = fileName if fileName else "Attach_{}".format(
            attach_count.next())
        if fileName == 'rtf-body.rtf':
            continue
        fileName = clean_string(fileName, [EXPR_OPTS['fix_utf8'],
                                           EXPR_OPTS['fix_forwardslash'],
                                           (r' ', '_'), (r'&', '_')])
        attach.append(fileName)
        filePath = "{}/{}".format(_dir, fileName)
        # save attachment; BUG FIX: 'with' guarantees the handle is closed
        # even if the decoded-payload write raises (open/close leaked it)
        with open(filePath, 'wb') as fp:
            fp.write(part.get_payload(decode=True))
    msg = clean_string(msg, [EXPR_OPTS['fix_utf8']])
    spit("{}/{}.txt".format(_dir, email_id), msg)
    row = createRow(email_id, "emails/{}".format(email_id), target_email,
                    mail, categories, attach, msg)
    return row
def extract(email_id, buff_mail, out_dir, categories, target_email):
    """Parse one raw RFC-2822 message and persist it.

    Writes <out_dir>/emails/<email_id>/<email_id>.eml (raw), saves each
    attachment part with a sanitized file name, writes the concatenated
    text/plain payloads to <email_id>.txt, and returns the CSV row built
    by createRow().
    """
    _dir = "{}/emails/{}".format(out_dir, email_id)
    mkdirp(_dir)
    #write raw email to new dir
    spit("{}/{}.eml".format(_dir, email_id), buff_mail)
    mail = email.message_from_string(buff_mail)
    attach = []
    msg = ""
    attach_count = counter()
    for part in mail.walk():
        if part.get_content_type() == 'text/plain':
            msg = msg + "\n" + part.get_payload()
        if part.get_content_type() == 'message/delivery-status':
            continue
        if part.get_content_maintype() == 'multipart':
            continue
        if part.get('Content-Disposition') is None:
            continue
        fileName = part.get_filename()
        fileName = fileName if fileName else "Attach_{}".format(
            attach_count.next())
        if fileName == 'rtf-body.rtf':
            continue
        fileName = clean_string(fileName, [EXPR_OPTS['fix_utf8'],
                                           EXPR_OPTS['fix_forwardslash'],
                                           (r' ', '_'), (r'&', '_')])
        attach.append(fileName)
        filePath = "{}/{}".format(_dir, fileName)
        # save attachment; BUG FIX: 'with' guarantees the handle is closed
        # even if the decoded-payload write raises (open/close leaked it)
        with open(filePath, 'wb') as fp:
            fp.write(part.get_payload(decode=True))
    msg = clean_string(msg, [EXPR_OPTS['fix_utf8']])
    spit("{}/{}.txt".format(_dir, email_id), msg)
    row = createRow(email_id, "emails/{}".format(email_id), target_email,
                    mail, categories, attach, msg)
    return row
def extract_pst(*args, **kwargs):
    """Launch ./bin/pstextract.sh in a background thread.

    kwargs: "email" (target address) and "pst" (file name under pst_dir).
    Returns {"log": <logname>} naming the status log.
    """
    email = kwargs.get("email")
    pst = kwargs.get("pst")
    pst_path = "{}/{}".format(pst_dir, pst)
    logname = "pst_{}".format(fmtNow())
    teefile = "{}/{}.tee.log".format(work_dir, logname)
    errfile = "{}/{}.err.log".format(work_dir, logname)
    logfile = "{}/{}.status.log".format(work_dir, logname)
    spit(logfile, "[Start] {}\n".format(email), True)

    def extract_thread():
        """Worker: run the extractor and log its result."""
        args = ["./bin/pstextract.sh", email, pst_path]
        cherrypy.log("running pst: {}".format(" ".join(args)))
        spit(logfile, "[Running] {} \n".format(" ".join(args)))
        try:
            with open(teefile, 'w') as t, open(errfile, 'w') as e:
                kwargs = {'stdout': t, 'stderr': e, 'cwd': base_dir,
                          'bufsize': 1}
                subp = subprocess.Popen(args, **kwargs)
                out, err = subp.communicate()
            cherrypy.log("complete: {}".format(fmtNow()))
            rtn = subp.returncode
            if rtn != 0:
                spit(logfile,
                     "[Error] return with non-zero code: {} \n".format(rtn))
            else:
                spit(logfile, "[Complete]")
        except Exception:
            # BUG FIX: sys.exc_info()[0] is the exception *class*;
            # .replace() on it raised AttributeError.  Use the value.
            error_info = str(sys.exc_info()[1]).replace('\n', ' ')
            cherrypy.log(error_info)
            spit(logfile, "[Error] {}\n".format(error_info))

    thr = threading.Thread(target=extract_thread, args=())
    thr.start()
    tangelo.content_type("application/json")
    return {'log': logname}
def changeConfig(data):
    """Merge any supplied connection fields into conf/server.conf and
    regenerate conf/<filename>.cfg with the active EMAIL_TARGET.

    data: dict-like with optional keys target/database/host/user/password
    and 'filename' (default 'target').  Returns the active target and the
    generated cfg file name.
    """
    fields = ['target', 'database', 'host', 'user', 'password']
    filename = data.get('filename', 'target')
    fp = "{}/conf/server.conf".format(base_dir)
    conf = json.loads(slurp(fp)) if os.path.isfile(fp) else {}
    # keep only fields the caller actually supplied (truthy values)
    overrides = {k: data.get(k, None) for k in fields if data.get(k, None)}
    config = dict(conf, **overrides)
    out = json.dumps(config, sort_keys=True, indent=4,
                     separators=(',', ': '))
    spit(fp, out, True)
    spit('{}/conf/{}.cfg'.format(base_dir, filename),
         'EMAIL_TARGET="{}"\n'.format(config["target"]), True)
    tangelo.content_type("application/json")
    return {'target': config["target"], 'config': filename + ".cfg"}
def changeConfig(data):
    """Merge any supplied connection fields into conf/server.conf and
    regenerate conf/<filename>.cfg with the active EMAIL_TARGET.

    data: dict-like with optional keys target/database/host/user/password
    and 'filename' (default 'target').  Returns the active target and the
    generated cfg file name.
    """
    fields = ['target', 'database', 'host', 'user', 'password']
    filename = data.get('filename', 'target')
    fp = "{}/conf/server.conf".format(base_dir)
    conf = json.loads(slurp(fp)) if os.path.isfile(fp) else {}
    # keep only fields the caller actually supplied (truthy values)
    overrides = {k: data.get(k, None) for k in fields if data.get(k, None)}
    config = dict(conf, **overrides)
    out = json.dumps(config, sort_keys=True, indent=4,
                     separators=(',', ': '))
    spit(fp, out, True)
    spit('{}/conf/{}.cfg'.format(base_dir, filename),
         'EMAIL_TARGET="{}"\n'.format(config["target"]), True)
    tangelo.content_type("application/json")
    return {'target': config["target"], 'config': filename + ".cfg"}
def extract_thread():
    """Run ./bin/ingest.sh as a subprocess, sending its stdout/stderr to
    ingester_log and recording progress in service_status_log.

    Reads ingest_id, ingest_parent_dir, ingest_file, type, ingester_log,
    service_status_log and base_dir from the enclosing scope.
    """
    try:
        args = ["./bin/ingest.sh", ingest_id, ingest_parent_dir,
                ingest_file, type]
        cherrypy.log("running pst: {}".format(" ".join(args)))
        spit(service_status_log, "[Running] {} \n".format(" ".join(args)))
        with open(ingester_log, 'w') as t:
            kwargs = {'stdout': t, 'stderr': t, 'cwd': base_dir,
                      'bufsize': 1}
            subp = subprocess.Popen(args, **kwargs)
            out, err = subp.communicate()
        # TODO should never see this line - remove this
        cherrypy.log("complete: {}".format(fmtNow()))
        rtn = subp.returncode
        if rtn != 0:
            spit(service_status_log,
                 "[Error] return with non-zero code: {} \n".format(rtn))
        else:
            spit(service_status_log,
                 "[Done Ingesting data. Reloading the email_addr cache.]")
            initialize_email_addr_cache(ingest_id, update=True)
            spit(service_status_log, "[Complete.]")
    except Exception:
        # BUG FIX: sys.exc_info()[0] is the exception *class*; calling
        # .replace() on it raised AttributeError and masked the real error.
        error_info = str(sys.exc_info()[1]).replace('\n', ' ')
        spit(service_status_log, "[Error] {}\n".format(error_info))
def extract_pst(*args, **kwargs):
    """Launch ./bin/pstextract.sh in a background thread.

    kwargs: "email" (target address) and "pst" (file name under pst_dir).
    Returns {"log": <logname>} naming the status log.
    """
    email = kwargs.get("email")
    pst = kwargs.get("pst")
    pst_path = "{}/{}".format(pst_dir, pst)
    logname = "pst_{}".format(fmtNow())
    teefile = "{}/{}.tee.log".format(work_dir, logname)
    errfile = "{}/{}.err.log".format(work_dir, logname)
    logfile = "{}/{}.status.log".format(work_dir, logname)
    spit(logfile, "[Start] {}\n".format(email), True)

    def extract_thread():
        """Worker: run the extractor and log its result."""
        args = ["./bin/pstextract.sh", email, pst_path]
        cherrypy.log("running pst: {}".format(" ".join(args)))
        spit(logfile, "[Running] {} \n".format(" ".join(args)))
        try:
            with open(teefile, 'w') as t, open(errfile, 'w') as e:
                kwargs = {'stdout': t, 'stderr': e, 'cwd': base_dir,
                          'bufsize': 1}
                subp = subprocess.Popen(args, **kwargs)
                out, err = subp.communicate()
            cherrypy.log("complete: {}".format(fmtNow()))
            rtn = subp.returncode
            if rtn != 0:
                spit(logfile,
                     "[Error] return with non-zero code: {} \n".format(rtn))
            else:
                spit(logfile, "[Complete]")
        except Exception:
            # BUG FIX: sys.exc_info()[0] is the exception *class*;
            # .replace() on it raised AttributeError.  Use the value.
            error_info = str(sys.exc_info()[1]).replace('\n', ' ')
            cherrypy.log(error_info)
            spit(logfile, "[Error] {}\n".format(error_info))

    thr = threading.Thread(target=extract_thread, args=())
    thr.start()
    tangelo.content_type("application/json")
    return {'log': logname}
def extract_thread():
    """Run ./bin/pstextract.sh, teeing stdout to teefile and stderr to
    errfile, and recording status in logfile.

    Reads email, pst_path, teefile, errfile, logfile and base_dir from
    the enclosing scope.
    """
    args = ["./bin/pstextract.sh", email, pst_path]
    cherrypy.log("running pst: {}".format(" ".join(args)))
    spit(logfile, "[Running] {} \n".format(" ".join(args)))
    try:
        with open(teefile, 'w') as t, open(errfile, 'w') as e:
            kwargs = {'stdout': t, 'stderr': e, 'cwd': base_dir,
                      'bufsize': 1}
            subp = subprocess.Popen(args, **kwargs)
            out, err = subp.communicate()
        cherrypy.log("complete: {}".format(fmtNow()))
        rtn = subp.returncode
        if rtn != 0:
            spit(logfile,
                 "[Error] return with non-zero code: {} \n".format(rtn))
        else:
            spit(logfile, "[Complete]")
    except Exception:
        # BUG FIX: sys.exc_info()[0] is the exception *class*; logging it
        # and calling .replace() on it raised AttributeError.  Use the
        # stringified exception value.
        error_info = str(sys.exc_info()[1]).replace('\n', ' ')
        cherrypy.log(error_info)
        spit(logfile, "[Error] {}\n".format(error_info))
def flush_buffer(f, buffer):
    """Append the buffered lines to *f* via spit(), newline-joined with a
    trailing newline; a no-op when the buffer is empty.

    NOTE: the parameter name shadows the Python 2 builtin ``buffer`` but
    is kept unchanged for callers using keyword arguments.
    """
    # idiomatic truthiness check instead of len(buffer) > 0
    if buffer:
        spit(f, "\n".join(buffer) + "\n")
def download(srv, target_email, outdir, limit, logfile):
    """Fetch up to *limit* most-recent messages from Gmail "All Mail".

    For each message: writes <uid>.eml (raw), saves attachments, writes
    the ASCII-filtered text to <uid>.txt, and appends one CSV row to
    <outdir>/output.csv.  Per-message failures are logged and skipped.
    Raises on a failed IMAP search or fetch response.
    """
    srv.select("[Gmail]/All Mail", True)
    #resp, data = srv.uid('SEARCH', None, 'ALL')
    resp, data = srv.search(None, 'ALL')
    if resp != 'OK':
        err_msg = "Error searching: %s %s" % (resp, data)
        spit(logfile, "[Error] {}\n".format(err_msg))
        raise Exception(err_msg)
    msgids = data[0].split()
    if limit > 0:
        msgids = msgids[-limit:]
    attach_count = counter()
    c = counter()
    l = len(msgids)
    # BUG FIX: i is referenced in the except handler below; initialize it
    # so a failure before c.next() (e.g. in getUIDForMessage) cannot
    # raise NameError inside the handler on the first iteration.
    i = 0
    for msgid in msgids:
        try:
            uid = getUIDForMessage(srv, msgid)
            fldr = "emails/{}".format(uid)
            mkdir("{}/{}".format(outdir, fldr))
            i = c.next()
            if i % 200 == 0:
                spit(logfile,
                     "[Downloading] Downloaded: {}/{}\n".format(i, l))
            resp, msgParts = srv.fetch(msgid, '(RFC822)')
            if resp != 'OK':
                err_msg = "Bad response: %s %s" % (resp, msgParts)
                spit(logfile, "[Error] {}\n".format(err_msg))
                raise Exception(err_msg)
            emailBody = msgParts[0][1]
            spit("{}/{}/{}.eml".format(outdir, fldr, uid), emailBody)
            mail = email.message_from_string(emailBody)
            attach = []
            msg = ""
            for part in mail.walk():
                if part.get_content_type() == 'text/plain':
                    msg = msg + "\n" + part.get_payload()
                if part.get_content_maintype() == 'multipart':
                    continue
                if part.get('Content-Disposition') is None:
                    continue
                fileName = part.get_filename()
                #escape file name
                fileName = fileName if fileName else "Attach_{}".format(
                    attach_count.next())
                fileName = fileName.replace('/', '_')
                attach.append(fileName)
                filePath = "{}/{}/{}".format(outdir, fldr, fileName)
                # BUG FIX: 'with' closes the handle even if the write fails
                with open(filePath, 'wb') as fp:
                    fp.write(part.get_payload(decode=True))
            msg = re.sub(r'[^\x00-\x7F]', ' ', msg)
            spit("{}/{}/{}.txt".format(outdir, fldr, uid), msg)
            row = createRow(uid, fldr, target_email, mail, attach, msg)
            spit("{}/output.csv".format(outdir), row + "\n")
        except Exception as e:
            spit(logfile,
                 "[Downloading] [Exception]: line {}, msgid {}, except {}\n".format(i, msgid, str(e)))
            continue
parser = argparse.ArgumentParser( description=" ... ", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=desc) parser.add_argument("-a","--header", action='store_true', help="add header to output") parser.add_argument("-s","--start", type=int, default=0, help="start at line #") parser.add_argument("-l", "--limit", type=int, default=0, help="end at line #") parser.add_argument("target_email", help="Target Email") parser.add_argument("out_dir", help="Output Directory") parser.add_argument("infile", nargs='?', type=argparse.FileType('r'), default=sys.stdin, help="Input File") args = parser.parse_args() outfile = "{}/output.csv".format(args.out_dir) mkdirp("{}/emails".format(args.out_dir)) if args.header: spit(outfile, email_extract.headerrow() + "\n") for i, line in enumerate(skip(args.infile, at_start=args.start)): if ((not args.limit == 0) and (i >= args.limit)): break; try: fp = line.strip() guid = email_extract.md5(fp) category = email_extract.categoryList(fp) buff = slurp(fp) row = email_extract.extract(guid, buff, args.out_dir, category, args.target_email) spit(outfile, row + "\n") except Exception as e: print "exception line: {} | {} ".format(i, e.message)
def download(srv, target_email, outdir, limit, logfile):
    """Fetch up to *limit* most-recent messages from Gmail "All Mail".

    For each message: writes <uid>.eml (raw), saves attachments, writes
    the ASCII-filtered text to <uid>.txt, and appends one CSV row to
    <outdir>/output.csv.  Per-message failures are logged and skipped.
    Raises on a failed IMAP search or fetch response.
    """
    srv.select("[Gmail]/All Mail", True)
    #resp, data = srv.uid('SEARCH', None, 'ALL')
    resp, data = srv.search(None, 'ALL')
    if resp != 'OK':
        err_msg = "Error searching: %s %s" % (resp, data)
        spit(logfile, "[Error] {}\n".format(err_msg))
        raise Exception(err_msg)
    msgids = data[0].split()
    if limit > 0:
        msgids = msgids[-limit:]
    attach_count = counter()
    c = counter()
    l = len(msgids)
    # BUG FIX: i is referenced in the except handler below; initialize it
    # so a failure before c.next() (e.g. in getUIDForMessage) cannot
    # raise NameError inside the handler on the first iteration.
    i = 0
    for msgid in msgids:
        try:
            uid = getUIDForMessage(srv, msgid)
            fldr = "emails/{}".format(uid)
            mkdir("{}/{}".format(outdir, fldr))
            i = c.next()
            if i % 200 == 0:
                spit(logfile,
                     "[Downloading] Downloaded: {}/{}\n".format(i, l))
            resp, msgParts = srv.fetch(msgid, '(RFC822)')
            if resp != 'OK':
                err_msg = "Bad response: %s %s" % (resp, msgParts)
                spit(logfile, "[Error] {}\n".format(err_msg))
                raise Exception(err_msg)
            emailBody = msgParts[0][1]
            spit("{}/{}/{}.eml".format(outdir, fldr, uid), emailBody)
            mail = email.message_from_string(emailBody)
            attach = []
            msg = ""
            for part in mail.walk():
                if part.get_content_type() == 'text/plain':
                    msg = msg + "\n" + part.get_payload()
                if part.get_content_maintype() == 'multipart':
                    continue
                if part.get('Content-Disposition') is None:
                    continue
                fileName = part.get_filename()
                #escape file name
                fileName = fileName if fileName else "Attach_{}".format(
                    attach_count.next())
                fileName = fileName.replace('/', '_')
                attach.append(fileName)
                filePath = "{}/{}/{}".format(outdir, fldr, fileName)
                # BUG FIX: 'with' closes the handle even if the write fails
                with open(filePath, 'wb') as fp:
                    fp.write(part.get_payload(decode=True))
            msg = re.sub(r'[^\x00-\x7F]', ' ', msg)
            spit("{}/{}/{}.txt".format(outdir, fldr, uid), msg)
            row = createRow(uid, fldr, target_email, mail, attach, msg)
            spit("{}/output.csv".format(outdir), row + "\n")
        except Exception as e:
            spit(
                logfile,
                "[Downloading] [Exception]: line {}, msgid {}, except {}\n".
                format(i, msgid, str(e)))
            continue
def download_thread():
    """Background worker: log in, (re)create the user's download folder,
    download messages, and always close the IMAP session.

    Reads user, passwd, limit, logfile and webroot from the enclosing
    scope.
    """
    # BUG FIX: session must be defined before the try/finally; a failed
    # login previously left it unbound and the finally clause raised
    # NameError.
    session = None
    try:
        cherrypy.log("Thread Start User: {}".format(user))
        try:
            session = newman_email.login(user, passwd, logfile)
            fldr = "{}/emails/{}".format(webroot, user)
            cherrypy.log("Login User: {}".format(user))
            if os.path.exists(fldr):
                rmrf(fldr)
            mkdir(fldr)
            spit("{}/output.csv".format(fldr),
                 newman_email.headerrow() + "\n")
            mkdir(fldr + "/emails")
            newman_email.download(session, user, fldr, int(limit), logfile)
            spit(logfile, "[Completed Download] {}\n".format(user))
        except Exception as ex:
            spit(logfile, "[Error] {}\n".format(ex))
            cherrypy.log("[Error] {}\n".format(ex))
        finally:
            if session is not None:
                newman_email.close_session(session)
    except Exception:
        # BUG FIX: use the exception *value* (sys.exc_info()[1]), not the
        # class, so .replace()/log receive a string.
        error_info = str(sys.exc_info()[1]).replace('\n', ' ')
        cherrypy.log(error_info)
        spit(logfile, "[Error] {}\n".format(error_info))
# NOTE(review): fragment — this span begins mid-function (it is the interior
# of getTopics(); see the complete version elsewhere in this file) followed
# by a module-level render loop.  The enclosing `def` is not visible here,
# so the code is left byte-identical rather than restructured.
with execute_query(cnx.conn(), topic_stmt, _id) as qry: return [{ 'name': formatName(nth(o, 0)), 'score': formatScore(nth(o, 1)) } for o in qry.cursor()] with newman_connector() as read_cnx: with execute_query(read_cnx.conn(), stmt) as qry: for row in qry.cursor(): _id, _dir, _from, tos, ccs, bccs, subject, _date, attachments, body = row outdir = "demail/emails/{}/{}".format(args.target, _dir) outfile = "{}/{}.html".format(outdir, last(split(_id, "/"))) topics = getTopics(_id) o = { 'doc': { 'topics': topics, 'id': _id, 'from': _from, 'to': "; ".join(split(tos, ';')), 'cc': "; ".join(split(ccs, ';')), 'bcc': "; ".join(split(bccs, ';')), 'subject': subject, 'date': _date, 'attachments': attachments, 'body': body } } html = T.render(o) spit(outfile, html, True)
def ingest_thread():
    """Rebuild the database (./bin/rebuild_all.sh), then run the ingest
    (./bin/ingest.sh <cfg>), logging progress to logfile and teeing
    subprocess output to teefile/errfile.

    Reads logfile, teefile, errfile, cfg and base_dir from the enclosing
    scope.  Aborts (returns) if the rebuild step fails.
    """
    cherrypy.log("Ingest Started:")
    try:
        cherrypy.log("started: {}".format(fmtNow()))
        spit(logfile, "[Started] {} \n".format(fmtNow()))
        args = ["./bin/rebuild_all.sh"]
        cherrypy.log("running: {}".format(" ".join(args)))
        spit(logfile, "[Running] {} \n".format(" ".join(args)))
        with open(teefile, 'w') as t, open(errfile, 'w') as e:
            kwargs = {'stdout': t, 'stderr': e, 'cwd': base_dir}
            rebuildp = subprocess.Popen(args, **kwargs)
            out, err = rebuildp.communicate()
        cherrypy.log("rebuild complete: {}".format(fmtNow()))
        rtn = rebuildp.returncode
        if rtn != 0:
            spit(logfile,
                 "[Error] rebuild return with non-zero code: {} \n".
                 format(rtn))
            return
        args = ["./bin/ingest.sh", cfg]
        cherrypy.log("running ingest: {}".format(" ".join(args)))
        spit(logfile, "[Running] {} \n".format(" ".join(args)))
        with open(teefile, 'w') as t, open(errfile, 'w') as e:
            kwargs = {'stdout': t, 'stderr': e, 'cwd': base_dir,
                      'bufsize': 1}
            subp = subprocess.Popen(args, **kwargs)
            out, err = subp.communicate()
        cherrypy.log("complete: {}".format(fmtNow()))
        rtn = subp.returncode
        if rtn != 0:
            spit(logfile,
                 "[Error] return with non-zero code: {} \n".format(rtn))
        else:
            spit(logfile, "[Complete]")
    except Exception:
        # BUG FIX: sys.exc_info()[0] is the exception *class*; logging it
        # and calling .replace() on it raised AttributeError.  Use the
        # stringified exception value.
        error_info = str(sys.exc_info()[1]).replace('\n', ' ')
        cherrypy.log(error_info)
        spit(logfile, "[Error] {}\n".format(error_info))
def extract_pst(*args, **kwargs):
    """Kick off a background ingest of an uploaded file.

    kwargs: "ingest-id" (required), "file" (required), "type" (default
    "pst").  Returns {"log": <logname>} naming the status log; raises
    TypeError when a required argument is missing.
    """
    cherrypy.log("search.extract_pst(kwargs[%s] %s)" %
                 (len(kwargs), str(kwargs)))
    ingest_id = kwargs.get("ingest-id")
    ingest_file = kwargs.get("file")
    type = kwargs.get("type", "pst")
    # path = "{}/{}".format(ingest_parent_dir, type)
    if not ingest_id or not type or not ingest_file:
        raise TypeError(
            "Encountered a 'None' value for 'email', 'type', or 'ingest_file!'")
    # Add the prefix for the newman indexes
    ingest_id = index_prefix + ingest_id
    logname = "pst_{}".format(fmtNow())
    ingester_log = "{}/{}.ingester.log".format(work_dir, logname)
    # errfile = "{}/{}.err.log".format(work_dir, logname)
    service_status_log = "{}/{}.status.log".format(work_dir, logname)
    spit(service_status_log,
         "[Start] email address={}\n".format(ingest_id), True)

    def extract_thread():
        """Worker: run ./bin/ingest.sh and log status/result."""
        try:
            args = ["./bin/ingest.sh", ingest_id, ingest_parent_dir,
                    ingest_file, type]
            cherrypy.log("running pst: {}".format(" ".join(args)))
            spit(service_status_log,
                 "[Running] {} \n".format(" ".join(args)))
            with open(ingester_log, 'w') as t:
                kwargs = {'stdout': t, 'stderr': t, 'cwd': base_dir,
                          'bufsize': 1}
                subp = subprocess.Popen(args, **kwargs)
                out, err = subp.communicate()
            # TODO should never see this line - remove this
            cherrypy.log("complete: {}".format(fmtNow()))
            rtn = subp.returncode
            if rtn != 0:
                spit(service_status_log,
                     "[Error] return with non-zero code: {} \n".format(rtn))
            else:
                spit(service_status_log,
                     "[Done Ingesting data. Reloading the email_addr cache.]")
                initialize_email_addr_cache(ingest_id, update=True)
                spit(service_status_log, "[Complete.]")
        except Exception:
            # BUG FIX: sys.exc_info()[0] is the exception *class*;
            # .replace() on it raised AttributeError.  Use the value.
            error_info = str(sys.exc_info()[1]).replace('\n', ' ')
            spit(service_status_log, "[Error] {}\n".format(error_info))

    thr = threading.Thread(target=extract_thread, args=())
    thr.start()
    tangelo.content_type("application/json")
    return {'log': logname}
# NOTE(review): fragment — this span begins mid-loop (it is the interior of
# the line-processing loop that appears complete elsewhere in this file);
# i, chunk, part, line and args come from the missing enclosing scope, so
# the code is left byte-identical rather than restructured.
if i % chunk == 0: part = part + 1 outfile = "{}/{}.part_{}".format(args.out_dir, "ingest", str(part).zfill(6)) _id, _dir, attachs, subject, body = processLine(line) attach_array = [] if attachs: for _i, a in enumerate(attachs.split(';')): attach_id = "attach_{}_{}".format(_id, _i) filePath = "demail/emails/{}/{}/{}".format( args.email_addr, _dir, a) either = extractText(filePath) if isRight(either): attach_json = toAttachItem(attach_id, _id, right(either), filePath) attach_array.append(attach_json) index = { 'id': _id, 'subject': subject, 'body': body, 'attachments': attach_array } idx = json.dumps({"index": {"_id": _id}}) doc = json.dumps(index) spit(outfile, "{}\n{}\n".format(idx, doc)) i = i + 1
# NOTE(review): fragment — this span begins inside a parser.add_argument()
# call (the `parser = argparse.ArgumentParser(...)` header is outside this
# view; the complete script appears elsewhere in this file), so the code is
# left byte-identical rather than restructured.
"--limit", type=int, default=0, help="end at line #") parser.add_argument("target_email", help="Target Email") parser.add_argument("out_dir", help="Output Directory") parser.add_argument("infile", nargs='?', type=argparse.FileType('r'), default=sys.stdin, help="Input File") args = parser.parse_args() outfile = "{}/output.csv".format(args.out_dir) mkdirp("{}/emails".format(args.out_dir)) if args.header: spit(outfile, email_extract.headerrow() + "\n") for i, line in enumerate(skip(args.infile, at_start=args.start)): if ((not args.limit == 0) and (i >= args.limit)): break try: fp = line.strip() guid = email_extract.md5(fp) category = email_extract.categoryList(fp) buff = slurp(fp) row = email_extract.extract(guid, buff, args.out_dir, category, args.target_email) spit(outfile, row + "\n") except Exception as e: print "exception line: {} | {} ".format(i, e.message)
def ingest_thread():
    """Rebuild the database (./bin/rebuild_all.sh), then run the ingest
    (./bin/ingest.sh <cfg>), logging progress to logfile and teeing
    subprocess output to teefile/errfile.

    Reads logfile, teefile, errfile, cfg and base_dir from the enclosing
    scope.  Aborts (returns) if the rebuild step fails.
    """
    cherrypy.log("Ingest Started:")
    try:
        cherrypy.log("started: {}".format(fmtNow()))
        spit(logfile, "[Started] {} \n".format(fmtNow()))
        args = ["./bin/rebuild_all.sh"]
        cherrypy.log("running: {}".format(" ".join(args)))
        spit(logfile, "[Running] {} \n".format(" ".join(args)))
        with open(teefile, 'w') as t, open(errfile, 'w') as e:
            kwargs = {'stdout': t, 'stderr': e, 'cwd': base_dir}
            rebuildp = subprocess.Popen(args, **kwargs)
            out, err = rebuildp.communicate()
        cherrypy.log("rebuild complete: {}".format(fmtNow()))
        rtn = rebuildp.returncode
        if rtn != 0:
            spit(logfile,
                 "[Error] rebuild return with non-zero code: {} \n".
                 format(rtn))
            return
        args = ["./bin/ingest.sh", cfg]
        cherrypy.log("running ingest: {}".format(" ".join(args)))
        spit(logfile, "[Running] {} \n".format(" ".join(args)))
        with open(teefile, 'w') as t, open(errfile, 'w') as e:
            kwargs = {'stdout': t, 'stderr': e, 'cwd': base_dir,
                      'bufsize': 1}
            subp = subprocess.Popen(args, **kwargs)
            out, err = subp.communicate()
        cherrypy.log("complete: {}".format(fmtNow()))
        rtn = subp.returncode
        if rtn != 0:
            spit(logfile,
                 "[Error] return with non-zero code: {} \n".format(rtn))
        else:
            spit(logfile, "[Complete]")
    except Exception:
        # BUG FIX: sys.exc_info()[0] is the exception *class*; logging it
        # and calling .replace() on it raised AttributeError.  Use the
        # stringified exception value.
        error_info = str(sys.exc_info()[1]).replace('\n', ' ')
        cherrypy.log(error_info)
        spit(logfile, "[Error] {}\n".format(error_info))
def getTopics(_id):
    """Return the topic rows for one email id as a list of
    {'name': ..., 'score': ...} dicts."""
    with newman_connector() as cnx:
        with execute_query(cnx.conn(), topic_stmt, _id) as qry:
            return [{'name': formatName(nth(o, 0)),
                     'score': formatScore(nth(o, 1))}
                    for o in qry.cursor()]


# Render one HTML page per email row returned by *stmt*.
with newman_connector() as read_cnx:
    with execute_query(read_cnx.conn(), stmt) as qry:
        for row in qry.cursor():
            (_id, _dir, _from, tos, ccs, bccs, subject, _date,
             attachments, body) = row
            outdir = "demail/emails/{}/{}".format(args.target, _dir)
            outfile = "{}/{}.html".format(outdir, last(split(_id, "/")))
            doc = {
                'topics': getTopics(_id),
                'id': _id,
                'from': _from,
                'to': "; ".join(split(tos, ';')),
                'cc': "; ".join(split(ccs, ';')),
                'bcc': "; ".join(split(bccs, ';')),
                'subject': subject,
                'date': _date,
                'attachments': attachments,
                'body': body,
            }
            spit(outfile, T.render({'doc': doc}), True)
for line in args.infile: if i % 100 == 0: print "processed: {} ".format(i) if i % chunk == 0: part= part + 1 outfile = "{}/{}.part_{}".format(args.out_dir, "ingest", str(part).zfill(6)) _id, _dir, attachs, subject, body= processLine(line) attach_array = [] if attachs: for _i, a in enumerate(attachs.split(';')): attach_id= "attach_{}_{}".format(_id, _i) filePath ="demail/emails/{}/{}/{}".format(args.email_addr, _dir, a) either = extractText(filePath) if isRight(either): attach_json = toAttachItem(attach_id, _id, right(either), filePath) attach_array.append(attach_json) index = { 'id' : _id, 'subject': subject, 'body' : body, 'attachments' : attach_array } idx = json.dumps({ "index": { "_id" : _id }}) doc = json.dumps(index) spit(outfile, "{}\n{}\n".format(idx,doc)) i=i+1