Example #1
0
def _extract_zipjar(path_in, path_out):
    message = "No errors detected"
    cmd_test = """unzip -qtP '
' {0}""".format(path_in)
    result_test = common.exec_cmd(cmd_test)

    if result_test[1] is not None and result_test[1] != "":
        print "ERROR: container-extraction-3: " + result_test[1]

    if result_test[0].startswith(message):
        cmd = """unzip -q {0} -d {1}""".format(path_in, path_out)
        result = common.exec_cmd(cmd)

        if result[1] is not None and result[1] != "":
            print "ERROR: container-extraction-4: " + result[1]
Example #2
0
def process_errorlog(path_out, BATCH, path_log):
    cnf = config.ConfigReader()
    max = 20
    iteration = 1
    while True:
        path_thislog = path_log + "-" + str(iteration)
        # continue only if current log exists (i.e. new log file has been created at previous iteration) or you iterations < max
        if not os.path.exists(path_thislog) or iteration > max: break

        print "PROCESSING iteration " + str(iteration)
        iteration += 1
        path_nextlog = path_log + "-" + str(iteration)

        cmd = "java -cp {0} pluto.TikaExtractor {1} {2} 2 {3} {4} {5}".format(
            cnf.get('JAVA_CLASSPATH'), path_thislog, path_out, BATCH,
            path_nextlog, common.CONFIG_FILE_NAME)
        result = common.exec_cmd(cmd)

        if len(result[0]) > 0:
            print "\nRESULTS OF JVM EXECUTION"
            print "{0}\n".format(result[0])

        if len(
                result[1]
        ) > 0:  #we only print out errors in execution; the rest is logged in the java file
            print "\nERRORS FROM JVM EXECUTION"
            print "{0}\n".format(result[1])

    print "COMPLETED at iteration " + str(iteration - 1)
Example #3
0
def process_sources(rootpath_in, path_out, BATCH, path_pathlog):
    cnf = config.ConfigReader()
    count = 1
    total = len(os.listdir(rootpath_in))

    dirs = os.listdir(rootpath_in)
    p = Pool(PARALLEL_PROC)

    pool_args = itertools.izip(dirs, itertools.repeat(rootpath_in),
                               itertools.repeat(path_out),
                               itertools.repeat(BATCH),
                               itertools.repeat(path_pathlog))

    p.map(process_dir_star, pool_args)

    for d in os.listdir(rootpath_in):
        print "Processing source {0} of {1} [dir {2}]".format(count, total, d)
        count += 1

        path_in = os.path.join(rootpath_in, d)
        print "From %s to %s" % (path_in, path_out)
        #cmd = "java -cp {0} pluto.TikaExtractor {1} {2} 1 {3} {4}".format(self._cnf.JAVA_CLASSPATH, path_in, path_out, BATCH, path_pathlog)

        cmd = "java -cp {0} pluto.TikaExtractor {1} {2} 1 {3} {4} {5}".format(
            cnf.get('JAVA_CLASSPATH'), path_in, path_out, BATCH, path_pathlog,
            common.CONFIG_FILE_NAME)

        result = common.exec_cmd(cmd)

        if len(
                result[1]
        ) > 0:  #we only print out errors in execution; the rest is logged in the java file
            print "\nERRORS FROM JVM EXECUTION from dir {0}:".format(d)
            print "{0}\n".format(result[1])
Example #4
0
def run():

    rootpath = common.get_path_files_processed()

    db = database.DbTool()
    db.open()

    fileitems = db.get_fileitems()
    total = len(fileitems)
    count = 0
    for r in fileitems:
        item_id = r[0]
        source_id = r[1]
        extension = r[2]

        if count % 1000 == 0:
            print 'Processing item {0} of {1} [id# {2}]'.format(
                count, total, item_id)
            db.commit()
        count += 1

        f_path = os.path.join(rootpath, str(source_id),
                              str(item_id) + extension)

        cmd = "md5sum \"{0}\"".format(f_path)
        result = common.exec_cmd(cmd)
        data = result[0].split()
        checksum = data[0]

        db.update_file_md5sum(item_id, checksum)

    db.commit()
    db.close()
Example #5
0
def process_non_pst(db, path_in, item_id, source_id, level, type_id, extension,
                    path_in_dir):
    """Extract a non-PST container item and register the outcome in the db.

    Creates a per-item output directory, runs the type-specific extractor,
    then either marks the item as not-extracted (empty result, directory
    removed) or records it as extracted and recurses into the extracted
    files at level + 1.
    """
    # directory that receives the extracted files
    path_out = common.get_path_files_original_container(item_id)
    os.mkdir(path_out)

    _extract_files(type_id, path_in, path_out)

    # change permissions for further program execution
    common.exec_cmd("chmod 770 -R {0}".format(path_out))

    # check if any files were extracted
    extracted_files = sum(len(files) for _root, _dirs, files in os.walk(path_out))

    if extracted_files == 0:
        db.update_container(item_id, 0)  # mark as NOT extracted
        os.rmdir(path_out)               # remove the now-useless container dir
    else:
        db.update_container(item_id, 1)  # mark as extracted
        # add the extracted files to the db and copy them to the output location
        _process_non_pst_extracted(db, item_id, source_id, level + 1,
                                   path_out, path_in_dir)
Example #6
0
def _extract_gzip(path_in, path_out):
    """Decompress a gzip file into path_out, named after the input file.

    e.g. foo/bar.txt.gz -> <path_out>/bar.txt (last extension dropped).
    """
    _root, basename = os.path.split(path_in)   # renamed: `file` shadowed the builtin
    stem, _ext = os.path.splitext(basename)
    path_out_file = os.path.join(path_out, stem)

    # NOTE(review): paths are not shell-quoted; this fails on names with
    # spaces — consistent with the other extractors here, confirm inputs
    cmd = """gunzip -c {0} > {1}""".format(path_in, path_out_file)
    result = common.exec_cmd(cmd)
    if result[1] is not None and result[1] != "":
        print "ERROR: container-extraction-1: " + result[1]
Example #7
0
def _create_output_dirs():
    """Wipe the configured output root and (re)create the directory layout.

    WARNING: deletes everything under OUTPUT_ROOT before rebuilding the
    expected sub-directory tree.
    """
    cnf = config.ConfigReader()
    outputRoot = cnf.get('OUTPUT_ROOT')

    #this wipes out the contents of the output directory
    if os.path.exists(outputRoot):
        # chmod first so read-only leftovers from earlier runs are removable
        common.exec_cmd("chmod -R 777 {0}/*".format(outputRoot))
        common.exec_cmd("rm {0}/* -rf".format(outputRoot))

    # relative sub-directories, in creation order (parents before children);
    # replaces six copy-pasted exists/mkdir stanzas
    subdirs = [
        "files/",
        "files/original_by_container/",
        "files/processed/",
        "files/text/",
        "pstitems/",
        "pstitems/text/",
        "{0}/".format(cnf.get('TIKA_LOG_RELPATH')),
    ]
    for rel in subdirs:
        path = "{0}{1}".format(outputRoot, rel)
        if not os.path.exists(path):
            os.mkdir(path)
Example #8
0
def process_dir(d, rootpath_in, path_out, BATCH, path_pathlog):
    cnf = config.ConfigReader()
    path_in = os.path.join(rootpath_in, d)
    print "From %s to %s" % (path_in, path_out)
    cmd = "java -cp {0} pluto.TikaExtractor {1} {2} 1 {3} {4} {5}".format(
        cnf.get('JAVA_CLASSPATH'), path_in, path_out, BATCH, path_pathlog,
        common.CONFIG_FILE_NAME)
    print cmd
    result = common.exec_cmd(cmd)

    if len(
            result[1]
    ) > 0:  #we only print out errors in execution; the rest is logged in the java file
        print "\nERRORS FROM JVM EXECUTION from dir {0}:".format(d)
        print "{0}\n".format(result[1])
Example #9
0
def run():
    timer_start = time.time()

    md5 = {}

    rootpath = common.get_path_files_processed()
    count = 1
    total = len(os.listdir(rootpath))

    db = database.DbTool()
    db.open()
    
    for d in os.listdir(rootpath):
        print "[fi] Processing source {0} of {1} [dir {2}]".format(count, total, d)
        count += 1
                
        d_path = os.path.join(rootpath, d, "*")
        
        cmd = "md5sum {0}".format(d_path);
        result = common.exec_cmd(cmd)

        lines = result[0].split("\n")
        for line in lines:
            if len(line) > 0:
                data = line.split()
                checksum = data[0]
                filepath = data[1]
                head, tail = os.path.split(filepath)
                item_id, ext = os.path.splitext(tail)

                if checksum in md5:
                    canon_id = md5[checksum]
                    db.create_item_relationship(item_id, common.IS_DUPLICATE_RELTYPE_ID, canon_id, "md5sum")
                else:
                    md5[checksum] = item_id
        
        db.commit() #commit for each source

    db.close()
    print common.display_elapsed(timer_start, "STEP COMPLETED: dedup fileitems")
Example #10
0
def _extract_tar(path_in, path_out):
    """Extract the tar archive path_in into directory path_out."""
    # removed a leftover no-op `pass` statement
    cmd = """tar -xf {0} -C {1}""".format(path_in, path_out)
    result = common.exec_cmd(cmd)
    if result[1] is not None and result[1] != "":
        print "ERROR: container-extraction-2: " + result[1]