def run(self): db = database.DbTool() db.open() reltype_id = 3 #is duplicate of by MDB count = 0 total = len(self._duplicates) for d in self._duplicates: dupl_id = d[0] orig_id = d[1] dupl_hash_B = self._get_hash(dupl_id, 4) orig_hash_B = self._get_hash(orig_id, 4) dupl_hash_D = self._get_hash(dupl_id, 1) orig_hash_D = self._get_hash(orig_id, 1) if dupl_hash_D == orig_hash_D and dupl_hash_B == orig_hash_B: db.create_item_relationship(dupl_id, reltype_id, orig_id, 'MDB') if count % 100 == 0: print "Processing {0} of {1} [item_id {2}]".format( count, total, dupl_id) db.commit() count += 1 db.commit() db.close()
def run(): rootpath = common.get_path_files_processed() db = database.DbTool() db.open() fileitems = db.get_fileitems() total = len(fileitems) count = 0 for r in fileitems: item_id = r[0] source_id = r[1] extension = r[2] if count % 1000 == 0: print 'Processing item {0} of {1} [id# {2}]'.format( count, total, item_id) db.commit() count += 1 f_path = os.path.join(rootpath, str(source_id), str(item_id) + extension) cmd = "md5sum \"{0}\"".format(f_path) result = common.exec_cmd(cmd) data = result[0].split() checksum = data[0] db.update_file_md5sum(item_id, checksum) db.commit() db.close()
def store_text(path): cnf = config.ConfigReader() p_type = common.PROPERTYTYPE_FILEITEM p_name = cnf.get('EXTRACTED_TEXT_PROPERTY_NAME') db = database.DbTool() db.open() property_id = db.get_property_id(p_type, p_name) total = len(os.listdir(path)) count = 1 for f in os.listdir(path): if count % 100 == 0: print "Processing file {0} of {1} [{2}]".format(count, total, f) db.commit() #commit every 100: extracted texts can be large count += 1 f_path = os.path.join(path, f) if os.path.getsize(f_path) > 0: item_id = int(f) file = open(f_path) value = file.read() file.close() db.create_item_property(item_id, property_id, value, BATCH) db.commit() db.close()
def run(): file_dict = {} pst_dict = {} email_dict = {} db = database.DbTool() db.open() rows = db.get_dedup_data() count = 0 total = len(rows) for r in rows: item_id = r[0] file_checksum = r[1] pst_checksum = r[2] pst_messageid = r[3] if count % 1000 == 0: print "Processing item {0} of {1} [item_id {2}]".format( count, total, item_id) db.commit() count += 1 if file_checksum is not None: process_item(item_id, file_checksum, file_dict, "md5sum-file", db) elif pst_checksum is not None: process_item(item_id, pst_checksum, pst_dict, "md5sum-pst", db) elif pst_messageid is not None: process_item(item_id, pst_messageid, email_dict, "messageid", db) db.commit() db.close()
def run(): md5 = {} db = database.DbTool() db.open() fileitems = db.get_md5sums() total = len(fileitems) count = 0 for r in fileitems: item_id = r[0] md5sum = r[1] if count % 1000 == 0: print 'Processing item {0} of {1} [id# {2}]'.format( count, total, item_id) db.commit() count += 1 if md5sum in md5: primary_id = md5[md5sum] db.create_item_relationship(item_id, common.IS_DUPLICATE_RELTYPE_ID, primary_id, "md5sum-2") else: md5[md5sum] = item_id db.commit() db.close()
def main(): db = database.DbTool() db.open() item_origins = { } #stores origin_id for each item_id. Items with level=0 have themselves as origins items = db.get_items_ordered_by_level() #must be ordered by level! for i in items: item_id = i[0] parent_id = i[1] level = i[2] if parent_id is None: item_origins[ item_id] = item_id #we need this so we can retrieve this origin later else: origin_id = item_origins[parent_id] #retrieve origin for parent item_origins[item_id] = origin_id #assign same origin to item count = 1 total = len(item_origins) for key in item_origins: if (count % 1000 == 0): print "Processing item {0} of {1} [{2}]".format( count, total, item_id) count += 1 if key != item_origins[key]: #ignore top-levels db.set_origin_id(key, item_origins[key]) db.commit() db.close()
def __init__(self): self._dict_redact = { } #dictionary: original_string >> generated_string self._dict_redactid = {} #dictionary: generated_string >> redaction_id self._redactitems = set() #set of items with redacted content self._db = database.DbTool() random.seed(835291) self._init_regex()
def _load(self):
    """Load all cached data from the database, then sort folder items.

    The loads run in a fixed order; _sort_folderitems is called only
    after everything has been loaded.
    """
    db = mod_db.DbTool()
    db.open()
    self._load_items(db)
    self._load_publicids(db)
    self._load_sources(db)
    self._load_source_folders(db)
    self._load_redactions(db)
    self._load_pstfolders(db)
    self._sort_folderitems()
    db.close()
def main():
    """Detect MIME types for all items, commit, and report elapsed time."""
    timer_start = time.time()
    db = database.DbTool()
    db.open()
    _detect_mimetypes(db)
    db.commit()  # final commit (cleanup)
    db.close()
    print common.display_elapsed(timer_start, "detect MIME-types")
def main():
    """Extract container items, commit, and report elapsed time."""
    timer_start = time.time()
    db = database.DbTool()
    db.open()
    _extract_containers(db)
    db.commit()
    db.close()
    print common.display_elapsed(timer_start, "extract containers")
def _load(self):
    """Populate the id<->name lookup dictionaries from the property table."""
    db = database.DbTool()
    db.open()
    for row in db.get_properties():
        prop_id = row[0]
        prop_name = row[2]  # row[1] is the property type id (unused here)
        self._id_by_name_dict[prop_name] = prop_id
        self._name_by_id_dict[prop_id] = prop_name
    db.close()
def run(): timer_start = time.time() md5 = {} root_path = common.get_path_pstitems_text() db = database.DbTool() db.open() pstitems = db.get_pstitems() count = 1 total = len(pstitems) for pi in pstitems: item_id = pi[0] psttype = pi[1] if count % 100 == 0: print "[nepi_tofile] Processing pstitem {0} of {1} [item_id {2}]".format( count, total, item_id) count += 1 if psttype != common.PSTTYPE_NOTE and psttype != common.PSTTYPE_ATTACHMENT: #process emails and attachments separately f_path = os.path.join(root_path, str(item_id)) sb = [] ips = db.get_itemproperties_by_item(item_id) for ip in ips: name = str(ip[0]) value = str(ip[1]) sb.append(name) sb.append(": ") sb.append(value) sb.append("\n") text = '' #create empty files if no text if len(sb) > 0: text = "".join(sb) f = open(f_path, 'w') f.write(text) f.close() db.close() print common.display_elapsed( timer_start, "STEP COMPLETED: generate text files from non-email pstitems")
def main(): db = database.DbTool() db.open() slist = db.get_source_list() # source list s_count = 0 s_total = len(slist) for s in slist: s_count += 1 source_id = s[0] children = {} #key = parent_id; val = list of children ids psttypes = {} top_item_ids = [] ilist = db.get_items_by_source(source_id) # item list i_count = 1 i_total = len(ilist) for i in ilist: if (i_count % 10 == 0): print "\t/* Processing item {0} of {1} / source {2} of {3} */".format( i_count, i_total, s_count, s_total) i_count += 1 item_id = i[0] parent_id = i[1] psttype_id = i[2] psttypes[item_id] = psttype_id if parent_id is None: top_item_ids.append(item_id) #found top-level item! else: if parent_id in children: # if child list exists - fetch it! child_list = children[parent_id] else: # otherwise - create and load it! child_list = [] children[parent_id] = child_list child_list.append(item_id) # append child to list prefix = "{0}-".format(str(source_id).zfill(4)) process_items(db, psttypes, children, top_item_ids, prefix, 6) db.commit() # commit for each source db.commit() db.close()
def main(): timer_start = time.time() mid_dict = {} #dictionary: key = messageid, value = item_id repl_dict = {} #dictionary: key = item_id-reply, value = item_id-original db = database.DbTool() db.open() mid_property_id = db.get_property_id(common.PROPERTYTYPE_PSTITEM_NOTE, "messageid") mids = db.get_itemproperties_by_property(mid_property_id) total = len(mids) count = 1 for ip in mids: item_id = ip[0] messageid = str(ip[1]).strip() #ignore value_text - none there mid_dict[messageid] = item_id if count % 100 == 0: print "Processing item-property {0} of {1} [item_id {2}]".format(count, total, item_id) count += 1 irt_property_id = db.get_property_id(common.PROPERTYTYPE_PSTITEM_NOTE, "in_reply_to") irts = db.get_itemproperties_by_property(irt_property_id) for ip in irts: item_id = ip[0] inreplyto = str(ip[1]).strip() #ignore value_text - none there if inreplyto in mid_dict: repl_dict[item_id] = mid_dict[inreplyto] count = 1 total = len(repl_dict) for repl in repl_dict: if count % 100 == 0: print "Storing reply {0} of {1} [item_ids: {2}, {3}]".format(count, total, item1_id, item2_id) count += 1 item1_id = repl item2_id = repl_dict[repl] db.create_item_relationship(item1_id, common.IS_REPLYTO_RELTYPE_ID, item2_id, "by messageid") print "found {0} replies".format(len(repl_dict)) db.commit() db.close() print common.display_elapsed(timer_start, "threading email")
def __init__(self, item_id):
    """Load everything about one item from the database.

    Base data (item, properties, relationships, children) is always
    loaded; file/pst/email data is loaded conditionally afterwards,
    based on what the base data says the item is.
    """
    db = database.DbTool()
    db.open()
    self._load_item(item_id, db)
    self._load_properties(item_id, db)
    self._load_relationships(item_id, db)
    self._load_children(item_id, db)
    db.close()
    # Conditional loads run after the db is closed — presumably they work
    # from the already-loaded state or the filesystem; confirm in helpers.
    if self.is_file():
        self._load_file_data()
    if self.is_pstitem():
        self._load_common_pst_data()
    if self.is_email():
        self._load_email_data()
def run():
    """Extract sources/top-level pst files from the input path (argv[1]).

    Validates input, creates output directories, runs the extraction,
    then commits and reports elapsed time.
    """
    timer_start = time.time()
    db = database.DbTool()
    db.open()
    _validate_input(db)
    _create_output_dirs()
    path_in = sys.argv[1]  # input path comes from the command line
    _extract_sources(db, path_in)
    db.commit()  # final commit (cleanup)
    db.close()
    print common.display_elapsed(timer_start, "extract sources/top-level pst files")
def run(): timer_start = time.time() md5 = {} rootpath = common.get_path_files_processed() count = 1 total = len(os.listdir(rootpath)) db = database.DbTool() db.open() for d in os.listdir(rootpath): print "[fi] Processing source {0} of {1} [dir {2}]".format(count, total, d) count += 1 d_path = os.path.join(rootpath, d, "*") cmd = "md5sum {0}".format(d_path); result = common.exec_cmd(cmd) lines = result[0].split("\n") for line in lines: if len(line) > 0: data = line.split() checksum = data[0] filepath = data[1] head, tail = os.path.split(filepath) item_id, ext = os.path.splitext(tail) if checksum in md5: canon_id = md5[checksum] db.create_item_relationship(item_id, common.IS_DUPLICATE_RELTYPE_ID, canon_id, "md5sum") else: md5[checksum] = item_id db.commit() #commit for each source db.close() print common.display_elapsed(timer_start, "STEP COMPLETED: dedup fileitems")
def run(self): db = database.DbTool() db.open() duplicates = db.get_records("select item1_id, item2_id from item_relationship where reltype_id = 3") count = 0 total = len(duplicates) for d in duplicates: dupl_id = d[0] orig_id = d[1] if count % 1000 == 0: print "Processing {0} of {1} [item_id {2}]".format(count, total, dupl_id) count += 1 attch_dupl = self._get_attachment_hashes(dupl_id, db) attch_orig = self._get_attachment_hashes(orig_id, db) if len(attch_dupl) != len(attch_orig): raise Exception(str(dupl_id) + " --- " + str(orig_id)) db.close()
def run(): db = database.DbTool() db.open() rows = db.get_pstitems() count = 0 total = len(rows) for r in rows: item_id = r[0] psttype = r[1] if count % 1000 == 0: print "Processing pstitem {0} of {1} [item_id {2}]".format( count, total, item_id) db.commit() count += 1 #process emails and attachments separately; attachments are processed as fileitems if psttype != common.PSTTYPE_ATTACHMENT and psttype != common.PSTTYPE_NOTE: all_text = [] ips = db.get_itemproperties_by_item(item_id) for ip in ips: prop_id = ip[0] prop_value = ip[1] all_text.append(str(prop_id)) all_text.append(' : ') all_text.append(str(prop_value)) all_text.append('\n') md5sum = hashlib.md5(''.join(all_text)).hexdigest() db.create_data_dedup_nonemailpst(item_id, md5sum) db.commit() db.close()
def main():
    """Build denormalized text/metadata rows for every item.

    Emails (pstitems of type NOTE) get structured from/to/cc/bcc/date/
    subject/header fields; non-emails get their extracted text.  Every
    item gets a row in item_all_data, plus either item_email_data or
    item_nonemail_data.
    """
    db = database.DbTool()
    db.open()
    delim = ' ||| '
    # init property_ids for properties we'll need
    # NOTE(review): these ids are hard-coded — presumed stable for this
    # database instance; verify against the property table if the schema
    # ever changes.
    pid_body = 1
    pid_header = 18
    pid_outlook_sender_name = 35
    pid_sender_address = 54
    pid_sentto_address = 57
    pid_cc_address = 14
    pid_bcc_address = 13
    pid_date = 65
    pid_subject = 9
    pid_message_id = 24
    pid_importance = 60
    pid_in_reply_to = 20
    pid_extr_text = 188
    # load email item ids
    emails = set()
    pstitems = db.get_pstitems()
    for pi in pstitems:
        item_id = pi[0]
        psttype_id = pi[1]
        if psttype_id == common.PSTTYPE_NOTE:
            emails.add(item_id)
    # get all items
    items = db.get_items()
    is_email = False
    count = 1
    total = len(items)
    for i in items:
        item_id = i[0]
        if (count % 100 == 0):
            print "Processing item {0} of {1} [{2}]".format(count, total, item_id)
            db.commit()  # commit every 100 items
        count += 1
        is_email = item_id in emails  # check if item is an email
        # init empty dictionary
        prop_dict = {}  # key = property_id, value = item_property value
        # init empty named properties
        email_from = ""
        email_to = ""
        email_cc = ""
        email_bcc = ""
        email_date = ""
        email_subject = ""
        email_header = ""
        pst_body = ""
        extr_text = ""
        metadata = []  # all properties excluding body and extr_text
        # get all properties for each item and load into dictionary
        # (we don't care for prop_name at ip[0])
        ips = db.get_itemproperties_by_item(item_id);
        for ip in ips:
            prop_value = ip[1]
            prop_id = ip[0]
            prop_dict[prop_id] = prop_value
        # build the values: load extr text if not email, otherwise - load
        # email values.  Load body and meta data for all
        if not is_email:
            if pid_extr_text in prop_dict:
                extr_text = prop_dict[pid_extr_text]
        else:
            if pid_outlook_sender_name in prop_dict:
                email_from = prop_dict[pid_outlook_sender_name]
            if pid_sender_address in prop_dict:
                email_from = email_from + ", " + prop_dict[pid_sender_address]
            if pid_sentto_address in prop_dict:
                email_to = prop_dict[pid_sentto_address]
            if pid_cc_address in prop_dict:
                email_cc = prop_dict[pid_cc_address]
            if pid_bcc_address in prop_dict:
                email_bcc = prop_dict[pid_bcc_address]
            if pid_date in prop_dict:
                email_date = make_date(prop_dict[pid_date])  # convert to date
            if pid_subject in prop_dict:
                email_subject = prop_dict[pid_subject]
            if pid_header in prop_dict:
                email_header = prop_dict[pid_header]
            # NOTE(review): reconstructed indentation — the build_header
            # fallback is placed in the email branch; confirm it was not
            # intended to run for non-emails as well.
            if email_header == "":
                email_header = build_header(prop_dict)
        # load metadata
        for pid in prop_dict:
            if pid != pid_body and pid != pid_extr_text:
                if prop_dict[pid] is not None:
                    metadata.append(prop_dict[pid])
        # load body (loads for pst and nonpst items - not a problem)
        if pid_body in prop_dict:
            pst_body = prop_dict[pid_body]
        # load into db for this item
        text = email_header + delim + pst_body + delim + extr_text
        meta = ""
        if len(metadata) > 0:
            meta = delim.join(metadata)
        db.create_item_all_data(item_id, text, meta)
        if not is_email:
            db.create_item_nonemail_data(item_id, text, meta)
        else:
            db.create_item_email_data(
                item_id, email_from, email_to, email_cc, email_bcc,
                email_date, email_subject, email_header, pst_body, meta)
        # MUST RESET THESE!!!
        # (both are also reinitialized at the top of each iteration)
        text = ""
        meta = ""
    db.commit()
    db.close()
def __init__(self):
    """Open the database just long enough to load hashes and duplicates."""
    db = database.DbTool()
    db.open()
    self._load_hashes(db)
    self._load_duplicates(db)
    db.close()