def _write_collection_xml_files(self):
    """Write ``collection.xml`` and one custodian XML file per source.

    Reads the collection name from the ``COLLECTION_NAME`` config entry and
    iterates ``self._sources``. Each source row is indexed positionally:
    ``[0]`` id, ``[1]`` name, ``[2]`` size, ``[3]`` item count. For every
    row the custodian file is written via ``_write_custodian_xml_file`` and
    an ``xi:include`` reference (preceded by a descriptive XML comment) is
    added to the collection document.

    The document is accumulated as a list of string fragments and handed to
    ``self._write_file`` together with ``<self.path>/collection.xml``.
    """
    # Parenthesised single-argument print behaves identically as a
    # Python 2 print statement.
    print("Writing collection xml files")

    collection_name = config.ConfigReader().get('COLLECTION_NAME')
    stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

    fragments = [
        '<?xml version="1.0" ?>',
        '\n<collection xmlns:xi="http://www.w3.org/2001/XInclude">',
        '\n\t<name>{0}</name>'.format(collection_name),
        '\n\t<timestamp>{0}</timestamp>'.format(stamp),
        '\n\t<custodians>',
    ]

    for row in self._sources:
        source_id = row[0]
        item_total = row[3]
        dir_name = mod_comm.get_source_dir(source_id)
        # The name is filtered again before being embedded in XML, purely
        # as a safeguard.
        safe_name = mod_comm.xml_filter_encoding(row[1])
        pretty_size = mod_comm.format_size(row[2])

        self._write_custodian_xml_file(
            source_id, dir_name, safe_name, pretty_size, item_total)

        fragments.extend([
            '\n\t\t<!-- id="{0}" items="{1}" source_file_name="{2}" source_file_size="{3}" -->'.format(
                source_id, item_total, safe_name, pretty_size),
            '\n\t\t<xi:include href="custodians/{0}.xml" />'.format(dir_name),
        ])

    fragments.extend([
        '\n\t</custodians>',
        '\n</collection>',
        '\n\n',
    ])

    self._write_file(os.path.join(self.path, 'collection.xml'), fragments)
def process_errorlog(path_out, BATCH, path_log):
    """Repeatedly run the Tika extractor over numbered error logs.

    Starting at ``<path_log>-1``, each pass invokes ``pluto.TikaExtractor``
    with the current log as input and ``<path_log>-<n+1>`` as the log for
    the next pass. Processing stops as soon as the log for the current
    pass does not exist, or after 20 passes.

    Args:
        path_out: output path forwarded to the extractor command.
        BATCH: batch value forwarded to the extractor command.
        path_log: base path of the log files; ``-<n>`` is appended per pass.
    """
    cnf = config.ConfigReader()
    pass_limit = 20
    pass_no = 1
    path_thislog = path_log + "-" + str(pass_no)

    # Keep going only while the log for this pass exists (i.e. the previous
    # pass produced one) and the pass limit has not been exceeded.
    while os.path.exists(path_thislog) and pass_no <= pass_limit:
        print("PROCESSING iteration " + str(pass_no))
        pass_no += 1
        path_nextlog = path_log + "-" + str(pass_no)

        cmd = "java -cp {0} pluto.TikaExtractor {1} {2} 2 {3} {4} {5}".format(
            cnf.get('JAVA_CLASSPATH'),
            path_thislog,
            path_out,
            BATCH,
            path_nextlog,
            common.CONFIG_FILE_NAME)
        result = common.exec_cmd(cmd)

        # result[0] / result[1] are presumably the command's stdout and
        # stderr -- confirm against common.exec_cmd.
        if len(result[0]) > 0:
            print("\nRESULTS OF JVM EXECUTION")
            print("{0}\n".format(result[0]))
        if len(result[1]) > 0:
            # Only execution errors are echoed here; everything else is
            # logged on the Java side.
            print("\nERRORS FROM JVM EXECUTION")
            print("{0}\n".format(result[1]))

        # The log written by this pass is the input of the next one.
        path_thislog = path_nextlog

    print("COMPLETED at iteration " + str(pass_no - 1))
def process_sources(rootpath_in, path_out, BATCH, path_pathlog):
    """Run the Tika extractor over every entry of ``rootpath_in``.

    The entries are first dispatched to a ``Pool`` of ``PARALLEL_PROC``
    workers through ``process_dir_star``, then processed again one by one
    in this process (see the review note below).

    Args:
        rootpath_in: directory whose entries are the sources to process.
        path_out: output path forwarded to the extractor command.
        BATCH: batch value forwarded to the extractor command.
        path_pathlog: log path forwarded to the extractor command.
    """
    cnf = config.ConfigReader()

    # List the directory once so the total, the pool input and the
    # sequential pass all agree on the same snapshot.
    dirs = os.listdir(rootpath_in)
    total = len(dirs)

    pool = Pool(PARALLEL_PROC)
    try:
        pool_args = itertools.izip(
            dirs,
            itertools.repeat(rootpath_in),
            itertools.repeat(path_out),
            itertools.repeat(BATCH),
            itertools.repeat(path_pathlog))
        pool.map(process_dir_star, pool_args)
    finally:
        # Release the worker processes; previously the pool was never
        # closed, leaving the workers alive after map() returned.
        pool.close()
        pool.join()

    # NOTE(review): this loop issues the same extractor command for every
    # directory that the pool has (presumably, via process_dir_star) already
    # handled, so each source appears to be processed twice. It looks like a
    # leftover from before the parallel version -- confirm and remove if so.
    for count, d in enumerate(dirs, 1):
        print("Processing source {0} of {1} [dir {2}]".format(count, total, d))
        path_in = os.path.join(rootpath_in, d)
        print("From %s to %s" % (path_in, path_out))
        cmd = "java -cp {0} pluto.TikaExtractor {1} {2} 1 {3} {4} {5}".format(
            cnf.get('JAVA_CLASSPATH'),
            path_in,
            path_out,
            BATCH,
            path_pathlog,
            common.CONFIG_FILE_NAME)
        result = common.exec_cmd(cmd)
        if len(result[1]) > 0:
            # Only execution errors are echoed here; everything else is
            # logged on the Java side.
            print("\nERRORS FROM JVM EXECUTION from dir {0}:".format(d))
            print("{0}\n".format(result[1]))
def store_text(path):
    """Store the contents of every non-empty file in ``path`` as an item property.

    Each file name is parsed with ``int()`` and used as the item id; the
    file's contents become the value of the property named by the
    ``EXTRACTED_TEXT_PROPERTY_NAME`` config entry. Work is committed every
    100 files (extracted texts can be large) and once more at the end.

    Args:
        path: directory containing the text files to store.

    Raises:
        ValueError: if a non-empty file's name is not an integer.
    """
    cnf = config.ConfigReader()
    p_type = common.PROPERTYTYPE_FILEITEM
    p_name = cnf.get('EXTRACTED_TEXT_PROPERTY_NAME')

    db = database.DbTool()
    db.open()
    try:
        property_id = db.get_property_id(p_type, p_name)

        # Single listing: the reported total matches what is iterated.
        names = os.listdir(path)
        total = len(names)

        for count, name in enumerate(names, 1):
            if count % 100 == 0:
                print("Processing file {0} of {1} [{2}]".format(count, total, name))
                db.commit()  # commit every 100: extracted texts can be large

            f_path = os.path.join(path, name)
            if os.path.getsize(f_path) > 0:
                item_id = int(name)
                # Context manager closes the handle even if read() fails.
                with open(f_path) as handle:
                    value = handle.read()
                # NOTE(review): BATCH is not a parameter of this function; it
                # must resolve at module scope -- confirm it is defined there.
                db.create_item_property(item_id, property_id, value, BATCH)

        db.commit()
    finally:
        # Always release the connection, including on an error mid-run.
        db.close()
def process_dir(d, rootpath_in, path_out, BATCH, path_pathlog):
    """Run the Tika extractor command for a single source directory.

    Builds the ``pluto.TikaExtractor`` command line for
    ``<rootpath_in>/<d>``, prints it, executes it through
    ``common.exec_cmd`` and echoes the second element of the result when
    it is non-empty.

    Args:
        d: name of the entry inside ``rootpath_in`` to process.
        rootpath_in: directory containing the source entries.
        path_out: output path forwarded to the extractor command.
        BATCH: batch value forwarded to the extractor command.
        path_pathlog: log path forwarded to the extractor command.
    """
    cnf = config.ConfigReader()
    path_in = os.path.join(rootpath_in, d)
    print("From %s to %s" % (path_in, path_out))

    template = "java -cp {0} pluto.TikaExtractor {1} {2} 1 {3} {4} {5}"
    cmd = template.format(
        cnf.get('JAVA_CLASSPATH'),
        path_in,
        path_out,
        BATCH,
        path_pathlog,
        common.CONFIG_FILE_NAME)
    print(cmd)

    outcome = common.exec_cmd(cmd)
    errors = outcome[1]
    if len(errors) == 0:
        return

    # Only execution errors are echoed here; everything else is logged on
    # the Java side.
    print("\nERRORS FROM JVM EXECUTION from dir {0}:".format(d))
    print("{0}\n".format(errors))
def _create_output_dirs():
    """Empty the configured output root and (re)create its directory layout.

    If the ``OUTPUT_ROOT`` directory exists, its contents are made
    removable (``chmod -R 777``) and deleted (``rm -rf``) through
    ``common.exec_cmd``. The fixed set of sub-directories, plus the one
    named by the ``TIKA_LOG_RELPATH`` config entry, is then created where
    missing.
    """
    cnf = config.ConfigReader()
    outputRoot = cnf.get('OUTPUT_ROOT')

    # This wipes out the contents of the output directory.
    if os.path.exists(outputRoot):
        # chmod first so that everything becomes removable
        common.exec_cmd("chmod -R 777 {0}/*".format(outputRoot))
        common.exec_cmd("rm {0}/* -rf".format(outputRoot))

    # Guarantee a trailing separator so that a root configured without one
    # does not yield sibling paths such as "<root>files/". A root that
    # already ends with the separator is left unchanged.
    base = os.path.join(outputRoot, '')

    relative_dirs = (
        'files/',
        'files/original_by_container/',
        'files/processed/',
        'files/text/',
        'pstitems/',
        'pstitems/text/',
        '{0}/'.format(cnf.get('TIKA_LOG_RELPATH')),
    )
    for relative in relative_dirs:
        target = base + relative
        if not os.path.exists(target):
            # makedirs also creates missing parents: the root itself when it
            # does not exist yet, or intermediate levels of a nested
            # TIKA_LOG_RELPATH.
            os.makedirs(target)