Example 1
    def run(self):
        db = database.DbTool()
        db.open()

        reltype_id = 3  #reltype 3 = "is duplicate of", detected by MDB

        count = 0
        total = len(self._duplicates)
        for d in self._duplicates:
            dupl_id = d[0]
            orig_id = d[1]

            dupl_hash_B = self._get_hash(dupl_id, 4)
            orig_hash_B = self._get_hash(orig_id, 4)

            dupl_hash_D = self._get_hash(dupl_id, 1)
            orig_hash_D = self._get_hash(orig_id, 1)

            if dupl_hash_D == orig_hash_D and dupl_hash_B == orig_hash_B:
                db.create_item_relationship(dupl_id, reltype_id, orig_id,
                                            'MDB')

            if count % 100 == 0:
                print "Processing {0} of {1} [item_id {2}]".format(
                    count, total, dupl_id)
                db.commit()
            count += 1

        db.commit()
        db.close()
Example 2
def run():

    rootpath = common.get_path_files_processed()

    db = database.DbTool()
    db.open()

    fileitems = db.get_fileitems()
    total = len(fileitems)
    count = 0
    for r in fileitems:
        item_id = r[0]
        source_id = r[1]
        extension = r[2]

        if count % 1000 == 0:
            print 'Processing item {0} of {1} [id# {2}]'.format(
                count, total, item_id)
            db.commit()
        count += 1

        f_path = os.path.join(rootpath, str(source_id),
                              str(item_id) + extension)

        cmd = "md5sum \"{0}\"".format(f_path)
        result = common.exec_cmd(cmd)
        data = result[0].split()
        checksum = data[0]

        db.update_file_md5sum(item_id, checksum)

    db.commit()
    db.close()
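
A note on Example 2: the checksum is obtained by shelling out to md5sum through common.exec_cmd and parsing its stdout. As a purely illustrative alternative (not part of the original pipeline), the same digest can be computed in-process with hashlib, which also sidesteps shell quoting of f_path; md5_of_file below is a hypothetical helper.

import hashlib

def md5_of_file(f_path, block_size=1 << 20):
    #hypothetical helper: stream the file in 1 MB blocks to keep memory use flat
    h = hashlib.md5()
    with open(f_path, 'rb') as fh:
        while True:
            block = fh.read(block_size)
            if not block:
                break
            h.update(block)
    return h.hexdigest()

#usage sketch: checksum = md5_of_file(f_path); db.update_file_md5sum(item_id, checksum)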
Example 3
def store_text(path):
    cnf = config.ConfigReader()
    p_type = common.PROPERTYTYPE_FILEITEM
    p_name = cnf.get('EXTRACTED_TEXT_PROPERTY_NAME')

    db = database.DbTool()
    db.open()

    property_id = db.get_property_id(p_type, p_name)

    total = len(os.listdir(path))
    count = 1

    for f in os.listdir(path):
        if count % 100 == 0:
            print "Processing file {0} of {1} [{2}]".format(count, total, f)
            db.commit()  #commit every 100: extracted texts can be large
        count += 1

        f_path = os.path.join(path, f)
        if os.path.getsize(f_path) > 0:
            item_id = int(f)
            with open(f_path) as fh:
                value = fh.read()
            db.create_item_property(item_id, property_id, value, BATCH)

    db.commit()
    db.close()
Example 4
def run():
    file_dict = {}
    pst_dict = {}
    email_dict = {}

    db = database.DbTool()
    db.open()

    rows = db.get_dedup_data()
    count = 0
    total = len(rows)
    for r in rows:
        item_id = r[0]
        file_checksum = r[1]
        pst_checksum = r[2]
        pst_messageid = r[3]

        if count % 1000 == 0:
            print "Processing item {0} of {1} [item_id {2}]".format(
                count, total, item_id)
            db.commit()
        count += 1

        if file_checksum is not None:
            process_item(item_id, file_checksum, file_dict, "md5sum-file", db)
        elif pst_checksum is not None:
            process_item(item_id, pst_checksum, pst_dict, "md5sum-pst", db)
        elif pst_messageid is not None:
            process_item(item_id, pst_messageid, email_dict, "messageid", db)

    db.commit()
    db.close()
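
process_item is not shown in this listing. Judging from the first-seen-wins loop in Example 5, it presumably records the first item carrying a given key as the primary and links later items to it as duplicates; the version below is a hypothetical reconstruction, not the original function.

def process_item(item_id, key, seen_dict, reason, db):
    #hypothetical sketch modelled on Example 5 -- not the original code
    if key in seen_dict:
        primary_id = seen_dict[key]
        db.create_item_relationship(item_id, common.IS_DUPLICATE_RELTYPE_ID,
                                    primary_id, reason)
    else:
        seen_dict[key] = item_id  #first item seen with this key becomes the primary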
Example 5
def run():
    md5 = {}

    db = database.DbTool()
    db.open()

    fileitems = db.get_md5sums()
    total = len(fileitems)
    count = 0
    for r in fileitems:
        item_id = r[0]
        md5sum = r[1]

        if count % 1000 == 0:
            print 'Processing item {0} of {1} [id# {2}]'.format(
                count, total, item_id)
            db.commit()
        count += 1

        if md5sum in md5:
            primary_id = md5[md5sum]
            db.create_item_relationship(item_id,
                                        common.IS_DUPLICATE_RELTYPE_ID,
                                        primary_id, "md5sum-2")
        else:
            md5[md5sum] = item_id

    db.commit()
    db.close()
Example 6
def main():
    db = database.DbTool()
    db.open()

    item_origins = {}  #stores origin_id for each item_id; items with level=0 have themselves as origins

    items = db.get_items_ordered_by_level()  #must be ordered by level!
    for i in items:
        item_id = i[0]
        parent_id = i[1]
        level = i[2]

        if parent_id is None:
            item_origins[item_id] = item_id  #we need this so we can retrieve this origin later
        else:
            origin_id = item_origins[parent_id]  #retrieve origin for parent
            item_origins[item_id] = origin_id  #assign same origin to item

    count = 1
    total = len(item_origins)
    for key in item_origins:
        if count % 1000 == 0:
            print "Processing item {0} of {1} [{2}]".format(count, total, key)
        count += 1

        if key != item_origins[key]:  #ignore top-levels
            db.set_origin_id(key, item_origins[key])

    db.commit()
    db.close()
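
The origin assignment in Example 6 only works because get_items_ordered_by_level returns parents before children, so a child's parent_id is already present in item_origins when the child is visited. A minimal sketch with hypothetical (item_id, parent_id, level) rows:

rows = [(1, None, 0), (2, 1, 1), (3, 2, 2)]  #hypothetical rows, sorted by level

item_origins = {}
for item_id, parent_id, level in rows:
    if parent_id is None:
        item_origins[item_id] = item_id  #top-level item is its own origin
    else:
        item_origins[item_id] = item_origins[parent_id]  #parent already resolved

print item_origins  #{1: 1, 2: 1, 3: 1} -- all three items trace back to item 1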
Example 7
    def __init__(self):
        self._dict_redact = {}  #dictionary: original_string >> generated_string
        self._dict_redactid = {}  #dictionary: generated_string >> redaction_id
        self._redactitems = set()  #set of items with redacted content
        self._db = database.DbTool()
        random.seed(835291)
        self._init_regex()
Example 8
    def _load(self):
        db = mod_db.DbTool()
        db.open()
        self._load_items(db)
        self._load_publicids(db)
        self._load_sources(db)
        self._load_source_folders(db)
        self._load_redactions(db)
        self._load_pstfolders(db)
        self._sort_folderitems()
        db.close()
Example 9
def main():
    timer_start = time.time()

    db = database.DbTool()
    db.open()

    _detect_mimetypes(db)

    db.commit()  #final commit (cleanup)
    db.close()

    print common.display_elapsed(timer_start, "detect MIME-types")
Example 10
def main():
    timer_start = time.time()
    
    db = database.DbTool()
    db.open()

    _extract_containers(db)

    db.commit() 
    db.close()

    print common.display_elapsed(timer_start, "extract containers")
Example 11
    def _load(self):
        db = database.DbTool()
        db.open()

        rows = db.get_properties()
        for r in rows:
            prop_id   = r[0]
            type_id   = r[1]
            prop_name = r[2]
            self._id_by_name_dict[prop_name] = prop_id
            self._name_by_id_dict[prop_id] = prop_name

        db.close()
Example 12
def run():
    timer_start = time.time()

    root_path = common.get_path_pstitems_text()

    db = database.DbTool()
    db.open()

    pstitems = db.get_pstitems()

    count = 1
    total = len(pstitems)

    for pi in pstitems:
        item_id = pi[0]
        psttype = pi[1]

        if count % 100 == 0:
            print "[nepi_tofile] Processing pstitem {0} of {1} [item_id {2}]".format(
                count, total, item_id)
        count += 1

        if psttype != common.PSTTYPE_NOTE and psttype != common.PSTTYPE_ATTACHMENT:  #process emails and attachments separately
            f_path = os.path.join(root_path, str(item_id))
            sb = []

            ips = db.get_itemproperties_by_item(item_id)
            for ip in ips:
                name = str(ip[0])
                value = str(ip[1])

                sb.append(name)
                sb.append(": ")
                sb.append(value)
                sb.append("\n")

            text = ''  #create empty files if no text
            if len(sb) > 0:
                text = "".join(sb)

            f = open(f_path, 'w')
            f.write(text)
            f.close()

    db.close()

    print common.display_elapsed(
        timer_start,
        "STEP COMPLETED: generate text files from non-email pstitems")
Example 13
def main():
    db = database.DbTool()
    db.open()

    slist = db.get_source_list()  # source list
    s_count = 0
    s_total = len(slist)
    for s in slist:
        s_count += 1

        source_id = s[0]

        children = {}  #key = parent_id; val = list of children ids
        psttypes = {}
        top_item_ids = []

        ilist = db.get_items_by_source(source_id)  # item list

        i_count = 1
        i_total = len(ilist)
        for i in ilist:
            if (i_count % 10 == 0):
                print "\t/* Processing item {0} of {1} / source {2} of {3} */".format(
                    i_count, i_total, s_count, s_total)
            i_count += 1

            item_id = i[0]
            parent_id = i[1]
            psttype_id = i[2]

            psttypes[item_id] = psttype_id

            if parent_id is None:
                top_item_ids.append(item_id)  #found top-level item!
            else:

                if parent_id in children:  # if child list exists - fetch it!
                    child_list = children[parent_id]
                else:  # otherwise - create and load it!
                    child_list = []
                    children[parent_id] = child_list

                child_list.append(item_id)  # append child to list

        prefix = "{0}-".format(str(source_id).zfill(4))
        process_items(db, psttypes, children, top_item_ids, prefix, 6)

        db.commit()  # commit for each source

    db.commit()
    db.close()
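
The inner-loop bookkeeping in Example 13 (fetch the child list if it exists, otherwise create it) can be written more compactly with collections.defaultdict; a purely illustrative sketch with hypothetical rows:

from collections import defaultdict

children = defaultdict(list)  #key = parent_id; val = list of child item_ids
top_item_ids = []
for item_id, parent_id in [(10, None), (11, 10), (12, 10)]:  #hypothetical (item_id, parent_id) rows
    if parent_id is None:
        top_item_ids.append(item_id)  #found top-level item
    else:
        children[parent_id].append(item_id)  #list is created on first access

print dict(children), top_item_ids  #{10: [11, 12]} [10]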
Example 14
def main():
    timer_start = time.time()

    mid_dict = {} #dictionary: key = messageid, value = item_id
    repl_dict = {}  #dictionary: key = item_id-reply, value = item_id-original
    
    db = database.DbTool()
    db.open()

    mid_property_id = db.get_property_id(common.PROPERTYTYPE_PSTITEM_NOTE, "messageid")
    mids = db.get_itemproperties_by_property(mid_property_id)

    total = len(mids)
    count = 1
    for ip in mids:
        item_id = ip[0]
        messageid = str(ip[1]).strip() #ignore value_text - none there
        mid_dict[messageid] = item_id

        if count % 100 == 0:
            print "Processing item-property {0} of {1} [item_id {2}]".format(count, total, item_id)            
        count += 1 

    irt_property_id = db.get_property_id(common.PROPERTYTYPE_PSTITEM_NOTE, "in_reply_to")
    irts = db.get_itemproperties_by_property(irt_property_id)

    for ip in irts:
        item_id = ip[0]
        inreplyto = str(ip[1]).strip() #ignore value_text - none there

        if inreplyto in mid_dict:
            repl_dict[item_id] = mid_dict[inreplyto]

    count = 1
    total = len(repl_dict)
    for repl in repl_dict:
        item1_id = repl
        item2_id = repl_dict[repl]

        if count % 100 == 0:
            print "Storing reply {0} of {1} [item_ids: {2}, {3}]".format(count, total, item1_id, item2_id)
        count += 1

        db.create_item_relationship(item1_id, common.IS_REPLYTO_RELTYPE_ID, item2_id, "by messageid")

    print "found {0} replies".format(len(repl_dict))

    db.commit()
    db.close()

    print common.display_elapsed(timer_start, "threading email")
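
Example 14 threads emails from messageid and in_reply_to values already extracted into item properties. If only raw RFC 2822 headers were available, the same two values could be pulled out with the standard library email parser; this is a generic sketch, not part of the original code.

from email.parser import HeaderParser

def extract_threading_ids(raw_header):
    #sketch: returns (message_id, in_reply_to); either may be None if the header is absent
    msg = HeaderParser().parsestr(raw_header)
    return msg.get('Message-ID'), msg.get('In-Reply-To')

#usage sketch: mid, irt = extract_threading_ids(email_header)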
Example 15
    def __init__(self, item_id):
        db = database.DbTool()
        db.open()
        self._load_item(item_id, db)
        self._load_properties(item_id, db)
        self._load_relationships(item_id, db)
        self._load_children(item_id, db)
        db.close()

        if self.is_file():
            self._load_file_data()
        if self.is_pstitem():
            self._load_common_pst_data()
        if self.is_email():
            self._load_email_data()
Example 16
def run():
    timer_start = time.time()

    db = database.DbTool()
    db.open()

    _validate_input(db)

    _create_output_dirs()

    path_in = sys.argv[1]
    _extract_sources(db, path_in)

    db.commit()  #final commit (cleanup)
    db.close()

    print common.display_elapsed(timer_start,
                                 "extract sources/top-level pst files")
Example 17
def run():
    timer_start = time.time()

    md5 = {}

    rootpath = common.get_path_files_processed()
    count = 1
    total = len(os.listdir(rootpath))

    db = database.DbTool()
    db.open()
    
    for d in os.listdir(rootpath):
        print "[fi] Processing source {0} of {1} [dir {2}]".format(count, total, d)
        count += 1
                
        d_path = os.path.join(rootpath, d, "*")
        
        cmd = "md5sum {0}".format(d_path)
        result = common.exec_cmd(cmd)

        lines = result[0].split("\n")
        for line in lines:
            if len(line) > 0:
                data = line.split()
                checksum = data[0]
                filepath = data[1]
                head, tail = os.path.split(filepath)
                item_id, ext = os.path.splitext(tail)

                if checksum in md5:
                    canon_id = md5[checksum]
                    db.create_item_relationship(item_id, common.IS_DUPLICATE_RELTYPE_ID, canon_id, "md5sum")
                else:
                    md5[checksum] = item_id
        
        db.commit() #commit for each source

    db.close()
    print common.display_elapsed(timer_start, "STEP COMPLETED: dedup fileitems")
Example 18
    def run(self):
        db = database.DbTool()
        db.open()  

        duplicates = db.get_records("select item1_id, item2_id from item_relationship where reltype_id = 3")
        count = 0
        total = len(duplicates)
        for d in duplicates:
            dupl_id = d[0]
            orig_id = d[1]

            if count % 1000 == 0:
                print "Processing {0} of {1} [item_id {2}]".format(count, total, dupl_id)
            count += 1

            attch_dupl = self._get_attachment_hashes(dupl_id, db)
            attch_orig = self._get_attachment_hashes(orig_id, db)

            if len(attch_dupl) != len(attch_orig):
                raise Exception(str(dupl_id) + " --- " + str(orig_id))

        db.close()
Example 19
def run():

    db = database.DbTool()
    db.open()

    rows = db.get_pstitems()
    count = 0
    total = len(rows)
    for r in rows:
        item_id = r[0]
        psttype = r[1]

        if count % 1000 == 0:
            print "Processing pstitem {0} of {1} [item_id {2}]".format(
                count, total, item_id)
            db.commit()
        count += 1

        #process emails and attachments separately; attachments are processed as fileitems
        if psttype != common.PSTTYPE_ATTACHMENT and psttype != common.PSTTYPE_NOTE:

            all_text = []

            ips = db.get_itemproperties_by_item(item_id)
            for ip in ips:
                prop_id = ip[0]
                prop_value = ip[1]

                all_text.append(str(prop_id))
                all_text.append(' : ')
                all_text.append(str(prop_value))
                all_text.append('\n')

            md5sum = hashlib.md5(''.join(all_text)).hexdigest()

            db.create_data_dedup_nonemailpst(item_id, md5sum)

    db.commit()
    db.close()
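
A note on the digest in Example 19: hashlib.md5(''.join(all_text)) works because the joined text is a byte string under Python 2; under Python 3 the text would have to be encoded first, roughly as sketched here.

md5sum = hashlib.md5(''.join(all_text).encode('utf-8')).hexdigest()  #Python 3 equivalent (sketch)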
Example 20
def main():
    db = database.DbTool()
    db.open()

    delim = ' ||| '

    #init property_ids for properties we'll need
    pid_body = 1 
    pid_header = 18 
    pid_outlook_sender_name = 35
    pid_sender_address = 54
    pid_sentto_address = 57
    pid_cc_address = 14
    pid_bcc_address = 13
    pid_date = 65
    pid_subject = 9
    pid_message_id = 24
    pid_importance = 60 
    pid_in_reply_to = 20
    pid_extr_text = 188 
    
    #load email item ids
    emails = set()
    pstitems = db.get_pstitems()
    for pi in pstitems:
        item_id = pi[0]
        psttype_id = pi[1]
        if psttype_id == common.PSTTYPE_NOTE:
            emails.add(item_id)

    #get all items
    items = db.get_items()
    is_email = False
    count = 1
    total = len(items)
    for i in items:
        item_id = i[0]

        if (count % 100 == 0):
            print "Processing item {0} of {1} [{2}]".format(count, total, item_id)
            db.commit()  #commit every 100 items
        count += 1

        is_email = item_id in emails #check if item is an email

        #init empty dictionary
        prop_dict = {} #key = property_id, value = item_property value

        #init empty named properties
        email_from = ""
        email_to = ""
        email_cc = ""
        email_bcc = ""
        email_date = ""
        email_subject = ""
        email_header = ""
        
        pst_body = ""
        extr_text = ""
        metadata = [] #all properties excluding body and extr_text       
        
        #get all properties for the item and load them into a dictionary keyed by property_id (the property name is not needed here)
        ips = db.get_itemproperties_by_item(item_id)
        for ip in ips:
            prop_value = ip[1]
            prop_id = ip[0]
            prop_dict[prop_id] = prop_value

        #build the values: load extr text if not email, otherwise - load email values. Load body and meta data for all
        if not is_email: 
            if pid_extr_text in prop_dict: 
                extr_text = prop_dict[pid_extr_text]
        else:
            if pid_outlook_sender_name in prop_dict: 
                email_from = prop_dict[pid_outlook_sender_name]
            if pid_sender_address in prop_dict:
                if email_from:
                    email_from = email_from + ", " + prop_dict[pid_sender_address]
                else:
                    email_from = prop_dict[pid_sender_address]
    
            if pid_sentto_address in prop_dict: 
                email_to = prop_dict[pid_sentto_address]

            if pid_cc_address in prop_dict: 
                email_cc = prop_dict[pid_cc_address]
        
            if pid_bcc_address in prop_dict: 
                email_bcc = prop_dict[pid_bcc_address]
                
            if pid_date in prop_dict: 
                email_date = make_date(prop_dict[pid_date]) #convert to date

            if pid_subject in prop_dict: 
                email_subject = prop_dict[pid_subject]
        
            if pid_header in prop_dict: 
                email_header = prop_dict[pid_header]

            if email_header == "":
                email_header = build_header(prop_dict)
            
        #load metadata            
        for pid in prop_dict:
            if pid != pid_body and pid != pid_extr_text:
                if prop_dict[pid] is not None:
                    metadata.append(prop_dict[pid])

        #load body (loads for pst and nonpst items - not a problem)
        if pid_body in prop_dict: 
            pst_body = prop_dict[pid_body]


        #load into db for this item
        text = email_header + delim + pst_body + delim + extr_text

        meta = ""
        if len(metadata) > 0:
            meta = delim.join(metadata)

        db.create_item_all_data(item_id, text, meta)

        if not is_email:
            db.create_item_nonemail_data(item_id, text, meta)
        else:
            db.create_item_email_data(
                item_id, email_from, email_to, email_cc, email_bcc, email_date, email_subject, email_header, pst_body, meta)

        #MUST RESET THESE!!!
        text = "" 
        meta = ""

    db.commit() 
    db.close()
Example 21
    def __init__(self):
        db = database.DbTool()
        db.open()
        self._load_hashes(db)
        self._load_duplicates(db)
        db.close()