def run(self):
    timer_start = time.time()
    self._db.open()
    self._redact()
    self._db.commit()  # final commit just in case
    self._db.close()
    print common.display_elapsed(timer_start, "cc redaction")

import time

import common
import database


def main():
    timer_start = time.time()
    db = database.DbTool()
    db.open()
    _detect_mimetypes(db)
    db.commit()  # final commit (cleanup)
    db.close()
    print common.display_elapsed(timer_start, "detect MIME-types")

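# Illustrative sketch only: _detect_mimetypes is defined elsewhere. One plausible
# per-file check, assuming the Unix file(1) utility and the common.exec_cmd helper
# used elsewhere in this codebase; the helper name below is an assumption.
def _detect_mimetype(path):
    # "file -b --mime-type" prints just the type, e.g. "application/pdf"
    result = common.exec_cmd("file -b --mime-type {0}".format(path))
    return result[0].strip()
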
import time

import common
import database


def main():
    timer_start = time.time()
    db = database.DbTool()
    db.open()
    _extract_containers(db)
    db.commit()
    db.close()
    print common.display_elapsed(timer_start, "extract containers")

import os
import time

import common
import database


def run():
    timer_start = time.time()
    root_path = common.get_path_pstitems_text()
    db = database.DbTool()
    db.open()
    pstitems = db.get_pstitems()
    count = 1
    total = len(pstitems)
    for pi in pstitems:
        item_id = pi[0]
        psttype = pi[1]
        if count % 100 == 0:
            print "[nepi_tofile] Processing pstitem {0} of {1} [item_id {2}]".format(count, total, item_id)
        count += 1
        if psttype != common.PSTTYPE_NOTE and psttype != common.PSTTYPE_ATTACHMENT:
            # notes (emails) and attachments are handled by separate steps
            f_path = os.path.join(root_path, str(item_id))
            sb = []
            ips = db.get_itemproperties_by_item(item_id)
            for ip in ips:
                name = str(ip[0])
                value = str(ip[1])
                sb.append(name)
                sb.append(": ")
                sb.append(value)
                sb.append("\n")
            text = ''  # create empty files if no text
            if len(sb) > 0:
                text = "".join(sb)
            with open(f_path, 'w') as f:
                f.write(text)
    db.close()
    print common.display_elapsed(timer_start, "STEP COMPLETED: generate text files from non-email pstitems")

import time

import common
import database


def main():
    timer_start = time.time()
    mid_dict = {}   # key = messageid, value = item_id
    repl_dict = {}  # key = item_id of reply, value = item_id of original
    db = database.DbTool()
    db.open()

    # map every Message-ID header to the item that carries it
    mid_property_id = db.get_property_id(common.PROPERTYTYPE_PSTITEM_NOTE, "messageid")
    mids = db.get_itemproperties_by_property(mid_property_id)
    total = len(mids)
    count = 1
    for ip in mids:
        item_id = ip[0]
        messageid = str(ip[1]).strip()  # ignore value_text - none there
        mid_dict[messageid] = item_id
        if count % 100 == 0:
            print "Processing item-property {0} of {1} [item_id {2}]".format(count, total, item_id)
        count += 1

    # match each In-Reply-To header against the Message-ID map
    irt_property_id = db.get_property_id(common.PROPERTYTYPE_PSTITEM_NOTE, "in_reply_to")
    irts = db.get_itemproperties_by_property(irt_property_id)
    for ip in irts:
        item_id = ip[0]
        inreplyto = str(ip[1]).strip()  # ignore value_text - none there
        if inreplyto in mid_dict:
            repl_dict[item_id] = mid_dict[inreplyto]

    # store one IS_REPLYTO relationship per matched pair
    count = 1
    total = len(repl_dict)
    for repl in repl_dict:
        item1_id = repl
        item2_id = repl_dict[repl]
        if count % 100 == 0:
            print "Storing reply {0} of {1} [item_ids: {2}, {3}]".format(count, total, item1_id, item2_id)
        count += 1
        db.create_item_relationship(item1_id, common.IS_REPLYTO_RELTYPE_ID, item2_id, "by messageid")
    print "found {0} replies".format(len(repl_dict))

    db.commit()
    db.close()
    print common.display_elapsed(timer_start, "threading email")

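# Illustrative sketch only (toy data, not part of the pipeline): the same
# two-pass threading logic in miniature. The header values below are invented.
def _threading_example():
    messageids = [(1, "<aaa@example.com>"), (2, "<bbb@example.com>")]
    inreplytos = [(3, "<aaa@example.com>"), (4, "<zzz@example.com>")]
    mid_dict = dict((mid, item_id) for item_id, mid in messageids)
    repl_dict = dict((item_id, mid_dict[irt])
                     for item_id, irt in inreplytos if irt in mid_dict)
    # prints {3: 1} - item 3 replies to item 1; item 4's parent is not in the corpus
    print repl_dict
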
import sys
import time

import common
import database


def run():
    timer_start = time.time()
    db = database.DbTool()
    db.open()
    _validate_input(db)
    _create_output_dirs()
    path_in = sys.argv[1]
    _extract_sources(db, path_in)
    db.commit()  # final commit (cleanup)
    db.close()
    print common.display_elapsed(timer_start, "extract sources/top-level pst files")

import os
import time

import common


def run():
    timer_start = time.time()
    rootpath_in = common.get_path_files_processed()
    path_out = common.get_path_files_text()
    log_filename = "paths-" + str(BATCH) + ".log"
    path_log = os.path.join(common.get_path_tikalogs(), log_filename)
    path_firstpathlog = path_log + "-1"
    # initial run: extract all you can, record errors
    process_sources(rootpath_in, path_out, BATCH, path_firstpathlog)
    # recursively process paths for missed files (java io error)
    process_errorlog(path_out, BATCH, path_log)
    # store extracted text in db
    store_text(path_out)
    print common.display_elapsed(timer_start, "tika extract meta data and text")

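# Illustrative sketch only: process_sources and process_errorlog are defined
# elsewhere. One plausible shape for the per-file extraction they perform,
# assuming the standard Tika CLI app (tika-app.jar); the jar path, helper name,
# and shell-out style are assumptions, not the pipeline's actual code.
import subprocess

def _tika_extract_text(path_in, path_out):
    # -t asks tika-app for plain text on stdout
    cmd = "java -jar tika-app.jar -t {0}".format(path_in)
    out, err = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE).communicate()
    f = open(path_out, 'w')
    try:
        f.write(out)
    finally:
        f.close()
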
import os
import time

import common
import database


def run():
    timer_start = time.time()
    md5 = {}  # key = checksum, value = item_id of the canonical (first-seen) file
    rootpath = common.get_path_files_processed()
    count = 1
    total = len(os.listdir(rootpath))
    db = database.DbTool()
    db.open()
    for d in os.listdir(rootpath):
        print "[fi] Processing source {0} of {1} [dir {2}]".format(count, total, d)
        count += 1
        d_path = os.path.join(rootpath, d, "*")
        cmd = "md5sum {0}".format(d_path)
        result = common.exec_cmd(cmd)
        lines = result[0].split("\n")
        for line in lines:
            if len(line) > 0:
                data = line.split()
                checksum = data[0]
                filepath = data[1]
                head, tail = os.path.split(filepath)
                item_id, ext = os.path.splitext(tail)
                if checksum in md5:
                    # same checksum seen before: link this file to the canonical copy
                    canon_id = md5[checksum]
                    db.create_item_relationship(item_id, common.IS_DUPLICATE_RELTYPE_ID, canon_id, "md5sum")
                else:
                    md5[checksum] = item_id
        db.commit()  # commit for each source
    db.close()
    print common.display_elapsed(timer_start, "STEP COMPLETED: dedup fileitems")

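# Illustrative alternative (not what run() above does): shelling out to md5sum
# makes the step Unix-only; hashing in-process with hashlib would be portable.
# The helper name and chunk size below are assumptions.
import hashlib

def _md5_file(path, chunk_size=1 << 20):
    # stream the file in 1 MB chunks so large files never load into memory at once
    h = hashlib.md5()
    f = open(path, 'rb')
    try:
        chunk = f.read(chunk_size)
        while chunk:
            h.update(chunk)
            chunk = f.read(chunk_size)
    finally:
        f.close()
    return h.hexdigest()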