Example #1
def run(self):
    timer_start = time.time()
    self._db.open()
    self._redact()
    self._db.commit()  #final commit just in case
    self._db.close()
    print common.display_elapsed(timer_start, "cc redaction")
Example #2
def main():
    timer_start = time.time()

    db = database.DbTool()
    db.open()

    _detect_mimetypes(db)

    db.commit()  #final commit (cleanup)
    db.close()

    print common.display_elapsed(timer_start, "detect MIME-types")
Example #3
def main():
    timer_start = time.time()
    
    db = database.DbTool()
    db.open()

    _extract_containers(db)

    db.commit() 
    db.close()

    print common.display_elapsed(timer_start, "extract containers")
Example #4
def run():
    timer_start = time.time()

    md5 = {}

    root_path = common.get_path_pstitems_text()

    db = database.DbTool()
    db.open()

    pstitems = db.get_pstitems()

    count = 1
    total = len(pstitems)

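    # write one text file per non-note, non-attachment pstitem, named after its item_id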
    for pi in pstitems:
        item_id = pi[0]
        psttype = pi[1]

        if count % 100 == 0:
            print "[nepi_tofile] Processing pstitem {0} of {1} [item_id {2}]".format(
                count, total, item_id)
        count += 1

        if psttype not in (common.PSTTYPE_NOTE, common.PSTTYPE_ATTACHMENT):  #emails and attachments are processed separately
            f_path = os.path.join(root_path, str(item_id))
            sb = []

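            # collect the item's properties as "name: value" lines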
            ips = db.get_itemproperties_by_item(item_id)
            for ip in ips:
                name = str(ip[0])
                value = str(ip[1])

                sb.append(name)
                sb.append(": ")
                sb.append(value)
                sb.append("\n")

            text = ''  #create empty files if no text
            if len(sb) > 0:
                text = "".join(sb)

            with open(f_path, 'w') as f:
                f.write(text)

    db.close()

    print common.display_elapsed(
        timer_start,
        "STEP COMPLETED: generate text files from non-email pstitems")
Example #5
def main():
    timer_start = time.time()

    mid_dict = {} #dictionary: key = messageid, value = item_id
    repl_dict = {}  #dictionary: key = item_id-reply, value = item_id-original
    
    db = database.DbTool()
    db.open()

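    # first pass: map each messageid value to the item_id that carries it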
    mid_property_id = db.get_property_id(common.PROPERTYTYPE_PSTITEM_NOTE, "messageid")
    mids = db.get_itemproperties_by_property(mid_property_id)

    total = len(mids)
    count = 1
    for ip in mids:
        item_id = ip[0]
        messageid = str(ip[1]).strip() #ignore value_text - none there
        mid_dict[messageid] = item_id

        if count % 100 == 0:
            print "Processing item-property {0} of {1} [item_id {2}]".format(count, total, item_id)            
        count += 1 

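    # second pass: match each in_reply_to value against the messageid map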
    irt_property_id = db.get_property_id(common.PROPERTYTYPE_PSTITEM_NOTE, "in_reply_to")
    irts = db.get_itemproperties_by_property(irt_property_id)

    for ip in irts:
        item_id = ip[0]
        inreplyto = str(ip[1]).strip() #ignore value_text - none there

        if inreplyto in mid_dict:
            repl_dict[item_id] = mid_dict[inreplyto]

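    # third pass: store each reply -> original pair as an is-reply-to relationship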
    count = 1
    total = len(repl_dict)
    for repl in repl_dict:
        item1_id = repl
        item2_id = repl_dict[repl]

        if count % 100 == 0:
            print "Storing reply {0} of {1} [item_ids: {2}, {3}]".format(count, total, item1_id, item2_id)
        count += 1

        db.create_item_relationship(item1_id, common.IS_REPLYTO_RELTYPE_ID, item2_id, "by messageid")

    print "found {0} replies".format(len(repl_dict))

    db.commit()
    db.close()

    print common.display_elapsed(timer_start, "threading email")
Example #6
def run():
    timer_start = time.time()

    db = database.DbTool()
    db.open()

    _validate_input(db)

    _create_output_dirs()

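    # the source input path is taken from the first command-line argument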
    path_in = sys.argv[1]
    _extract_sources(db, path_in)

    db.commit()  #final commit (cleanup)
    db.close()

    print common.display_elapsed(timer_start,
                                 "extract sources/top-level pst files")
Example #7
def run():
    timer_start = time.time()

    rootpath_in = common.get_path_files_processed()
    path_out = common.get_path_files_text()
    log_filename = "paths-" + str(BATCH) + ".log"
    path_log = os.path.join(common.get_path_tikalogs(), log_filename)
    path_firstpathlog = path_log + "-1"

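    # three phases: run Tika over all processed files, retry the paths recorded in the error log, then store the extracted text in the database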
    process_sources(
        rootpath_in, path_out, BATCH,
        path_firstpathlog)  #initial run: extract all you can, record errors
    process_errorlog(
        path_out, BATCH,
        path_log)  #recursively process paths for missed files (java io error)
    store_text(path_out)  #store extracted text in db

    print common.display_elapsed(timer_start,
                                 "tika extract meta data and text")
Example #8
def run():
    timer_start = time.time()

    md5 = {}

    rootpath = common.get_path_files_processed()
    count = 1
    total = len(os.listdir(rootpath))

    db = database.DbTool()
    db.open()
    
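    # md5sum every file in each source directory; the first item seen for a checksum is canonical, later matches are linked as duplicates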
    for d in os.listdir(rootpath):
        print "[fi] Processing source {0} of {1} [dir {2}]".format(count, total, d)
        count += 1
                
        d_path = os.path.join(rootpath, d, "*")
        
        cmd = "md5sum {0}".format(d_path);
        result = common.exec_cmd(cmd)

        lines = result[0].split("\n")
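        # each md5sum output line is "<checksum>  <path>"; the base filename (without extension) is the item_id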
        for line in lines:
            if len(line) > 0:
                data = line.split()
                checksum = data[0]
                filepath = data[1]
                head, tail = os.path.split(filepath)
                item_id, ext = os.path.splitext(tail)

                if checksum in md5:
                    canon_id = md5[checksum]
                    db.create_item_relationship(item_id, common.IS_DUPLICATE_RELTYPE_ID, canon_id, "md5sum")
                else:
                    md5[checksum] = item_id
        
        db.commit() #commit for each source

    db.close()
    print common.display_elapsed(timer_start, "STEP COMPLETED: dedup fileitems")