def unzip_and_index_files(self, zipfilename, parameters=None, verbose=False):
    """Unzip an archive into a temp directory and index every extracted file.

    Each extracted file is handed to ``Connector_File.index_file`` with a
    virtual id built from ``parameters['id']`` plus the file's path inside
    the archive, and is deleted afterwards. The temp directory is removed
    when the method returns or raises.

    Args:
        zipfilename: Path of the ZIP archive to extract.
        parameters: Connector configuration. Must contain ``'id'``. An
            optional ``'tmp'`` entry selects the base directory for the
            temp directory; an existing ``'container'`` entry is kept
            (archive nested inside another container).
        verbose: Print scanned directories/files and pass the flag on to
            the connector.

    Raises:
        KeyError: If ``parameters`` has no ``'id'`` entry.
        OSError, zipfile.BadZipFile: If the archive cannot be opened or
            extracted.
    """
    if parameters is None:
        parameters = {}

    # create temp dir where to unzip the archive
    if 'tmp' in parameters:
        system_temp_dirname = parameters['tmp']
        if not os.path.exists(system_temp_dirname):
            os.mkdir(system_temp_dirname)
    else:
        system_temp_dirname = tempfile.gettempdir()

    # we build the temp dirname ourselves (derived from the document id)
    # so configurable / external tempdirs can be used
    h = hashlib.md5(parameters['id'].encode('UTF-8'))
    temp_dirname = (system_temp_dirname + os.path.sep
                    + "opensemanticetl_enhancer_zip_" + h.hexdigest())

    if not os.path.exists(temp_dirname):
        os.mkdir(temp_dirname)

    try:
        # unzip the files; the context manager closes the archive even if
        # extraction fails
        with zipfile.ZipFile(zipfilename) as my_zip:
            my_zip.extractall(temp_dirname)

        # prepare document processing
        connector = Connector_File()
        connector.verbose = verbose
        connector.config = parameters.copy()

        # only set container if not yet set by a zip before
        # (if this zip is inside another zip)
        if 'container' not in connector.config:
            connector.config['container'] = zipfilename

        # walk through all unzipped directories / files and index all files
        for dirName, subdirList, fileList in os.walk(temp_dirname):
            if verbose:
                print('Scanning directory: %s' % dirName)

            for fileName in fileList:
                if verbose:
                    print('Scanning file: %s' % fileName)

                # Catch Exception (not BaseException) so KeyboardInterrupt
                # and SystemExit abort the run instead of being logged as
                # a per-file error.
                try:
                    # strip the temp dirname to get the path inside the zip
                    zipped_dirname = dirName.replace(temp_dirname, '', 1)

                    # build a virtual filename pointing to original zip file
                    if zipped_dirname:
                        zipped_dirname = zipped_dirname + os.path.sep
                    else:
                        zipped_dirname = os.path.sep

                    connector.config['id'] = (
                        parameters['id'] + zipped_dirname + fileName)

                    unziped_filename = dirName + os.path.sep + fileName

                    try:
                        connector.index_file(filename=unziped_filename)
                    except Exception as e:
                        sys.stderr.write(
                            "Exception while indexing zipped content {} from {} : {}\n"
                            .format(fileName, connector.config['container'], e))

                    # remove the file even if indexing it failed
                    os.remove(unziped_filename)

                except Exception as e:
                    sys.stderr.write(
                        "Exception while indexing file {} : {}\n".format(
                            fileName, e))
    finally:
        # always clean up, also on a broken archive or an interrupt
        shutil.rmtree(temp_dirname)
def unwarc_and_index_files(self, warcfilename, parameters=None, verbose=False):
    """Extract the response records of a WARC file and index each of them.

    Every record whose ``rec_type`` is ``'response'`` is written to a temp
    file, the file's modification time is set from the ``WARC-Date``
    header, and the file is handed to ``Connector_File.index_file`` under
    the id ``<WARC-Target-URI>/<WARC-Record-ID>``. The temp file is deleted
    after indexing and the temp directory is removed when the method
    returns or raises.

    Args:
        warcfilename: Path of the WARC file to read.
        parameters: Connector configuration. Must contain ``'id'``. An
            optional ``'tmp'`` entry selects the base directory for the
            temp directory; an existing ``'container'`` entry is kept.
        verbose: Passed on to the connector.

    Raises:
        KeyError: If ``parameters`` has no ``'id'`` entry.
        OSError: If the WARC file cannot be opened or a temp file cannot
            be written.
    """
    # local import: only needed for the UTC conversion of WARC-Date
    import calendar

    if parameters is None:
        parameters = {}

    # create temp dir where to unwarc the archive
    if 'tmp' in parameters:
        system_temp_dirname = parameters['tmp']
        if not os.path.exists(system_temp_dirname):
            os.mkdir(system_temp_dirname)
    else:
        system_temp_dirname = tempfile.gettempdir()

    # we build the temp dirname ourselves (derived from the document id)
    # so configurable / external tempdirs can be used
    h = hashlib.md5(parameters['id'].encode('UTF-8'))
    temp_dirname = (system_temp_dirname + os.path.sep
                    + "opensemanticetl_enhancer_warc_" + h.hexdigest())

    if not os.path.exists(temp_dirname):
        os.mkdir(temp_dirname)

    try:
        # prepare document processing
        connector = Connector_File()
        connector.verbose = verbose
        connector.config = parameters.copy()

        # only set container if not yet set by an enclosing archive
        if 'container' not in connector.config:
            connector.config['container'] = warcfilename

        with open(warcfilename, 'rb') as stream:
            # the counter covers all records (not only responses), so temp
            # file names carry the record's position in the archive
            for i, record in enumerate(ArchiveIterator(stream), 1):
                if record.rec_type != 'response':
                    continue

                # NOTE(review): printed regardless of ``verbose`` - kept
                # as is; confirm whether this should be gated.
                print(record.rec_headers)

                # write WARC record content to tempfile
                tempfilename = (temp_dirname + os.path.sep
                                + 'warcrecord' + str(i))
                with open(tempfilename, 'wb') as tmpfile:
                    tmpfile.write(record.content_stream().read())

                # set last modification time of the file to WARC-Date.
                # The parsed format ends in a literal "Z" (UTC), so convert
                # with calendar.timegm; time.mktime would treat the value
                # as local time and shift it by the UTC offset.
                try:
                    last_modified = calendar.timegm(time.strptime(
                        record.rec_headers.get_header('WARC-Date'),
                        '%Y-%m-%dT%H:%M:%SZ'))
                    os.utime(tempfilename, (last_modified, last_modified))
                except Exception as e:
                    sys.stderr.write("Exception while reading filedate to warc content {} from {} : {}\n".format(
                        tempfilename, connector.config['container'], e))

                # set id (URL and WARC Record ID)
                connector.config['id'] = record.rec_headers.get_header(
                    'WARC-Target-URI') + '/' + record.rec_headers.get_header('WARC-Record-ID')

                # index the extracted file; KeyboardInterrupt is not an
                # Exception subclass and therefore still propagates
                try:
                    connector.index_file(filename=tempfilename)
                except Exception as e:
                    sys.stderr.write("Exception while indexing warc content {} from {} : {}\n".format(
                        tempfilename, connector.config['container'], e))

                os.remove(tempfilename)
    finally:
        # always clean up, also on an unreadable archive or an interrupt
        shutil.rmtree(temp_dirname)
def pst2email(self, pstfilename, parameters=None, verbose=False):
    """Convert a PST mailbox with ``readpst`` and index every extracted file.

    ``readpst`` is run as an external process writing into a temp
    directory. Every file found there is handed to
    ``Connector_File.index_file`` with a virtual id built from
    ``parameters['id']`` plus the file's path below the temp directory,
    and is deleted afterwards. Files without ``-`` in their name are
    renamed to ``*.eml`` first. The temp directory is removed when the
    method returns or raises.

    Args:
        pstfilename: Path of the PST file to convert.
        parameters: Connector configuration. Must contain ``'id'``. An
            optional ``'tmp'`` entry selects the base directory for the
            temp directory; an existing ``'container'`` entry is kept
            (PST nested inside another ZIP or PST).
        verbose: Print scanned directories/files and pass the flag on to
            the connector.

    Raises:
        KeyError: If ``parameters`` has no ``'id'`` entry.
        OSError: If the ``readpst`` executable cannot be started.
    """
    if parameters is None:
        parameters = {}

    # we build the temp dirname ourselves (derived from the document id)
    # so configurable / external tempdirs can be used
    if 'tmp' in parameters:
        system_temp_dirname = parameters['tmp']
        if not os.path.exists(system_temp_dirname):
            os.mkdir(system_temp_dirname)
    else:
        system_temp_dirname = tempfile.gettempdir()

    # hashlib needs bytes: hashing the str id directly raises TypeError
    h = hashlib.md5(parameters['id'].encode('UTF-8'))
    temp_dirname = (system_temp_dirname + os.path.sep
                    + "opensemanticetl_enhancer_pst_" + h.hexdigest())

    if not os.path.exists(temp_dirname):
        os.mkdir(temp_dirname)

    try:
        # start external PST extractor / converter
        result = subprocess.call(
            ['readpst', '-S', '-D', '-o', temp_dirname, pstfilename])

        # a failed conversion is only reported; whatever readpst managed
        # to write is still indexed below
        if result != 0:
            sys.stderr.write(
                "Error: readpst failed for {}\n".format(pstfilename))

        # prepare document processing
        connector = Connector_File()
        connector.verbose = verbose
        connector.config = parameters.copy()

        # only set container if not yet set by a ZIP or PST before
        # (if this PST is inside another ZIP or PST)
        if 'container' not in connector.config:
            connector.config['container'] = pstfilename

        for dirName, subdirList, fileList in os.walk(temp_dirname):
            if verbose:
                print('Scanning directory: %s' % dirName)

            for fileName in fileList:
                if verbose:
                    print('Scanning file: %s' % fileName)

                # Catch Exception (not BaseException) so KeyboardInterrupt
                # and SystemExit abort the run instead of being logged as
                # a per-file error.
                try:
                    # strip the temp dirname to get the path inside the PST
                    contained_dirname = dirName.replace(temp_dirname, '', 1)

                    # build a virtual filename pointing to original PST file
                    if contained_dirname:
                        contained_dirname = contained_dirname + os.path.sep
                    else:
                        contained_dirname = os.path.sep

                    connector.config['id'] = (
                        parameters['id'] + contained_dirname + fileName)

                    contained_filename = dirName + os.path.sep + fileName

                    # E-mail filenames are pure numbers, attachment file
                    # names are number-filename. A name without "-" is a
                    # mail file: rename it to suffix .eml so Tika will
                    # extract more metadata like from and to
                    if '-' not in fileName:
                        os.rename(contained_filename,
                                  contained_filename + '.eml')
                        contained_filename += '.eml'
                        connector.config['id'] += '.eml'

                    try:
                        connector.index_file(filename=contained_filename)
                    except Exception as e:
                        # format the exception itself: Python 3 exceptions
                        # have no ``message`` attribute
                        sys.stderr.write(
                            "Exception while indexing contained content {} from {} : {}\n"
                            .format(fileName, connector.config['container'], e))

                    # remove the file even if indexing it failed
                    os.remove(contained_filename)

                except Exception as e:
                    sys.stderr.write(
                        "Exception while indexing file {} : {}\n".format(
                            fileName, e))
    finally:
        # always clean up, also if readpst is missing or on an interrupt
        shutil.rmtree(temp_dirname)
def unwarc_and_index_files(self, warcfilename, parameters=None, verbose=False):
    """Index the response records contained in a WARC archive.

    For each record with ``rec_type == 'response'`` the payload is written
    to a temp file whose modification time is taken from the ``WARC-Date``
    header; the file is then passed to ``Connector_File.index_file`` with
    the id ``<WARC-Target-URI>/<WARC-Record-ID>`` and deleted. The temp
    directory is removed when the method returns or raises.

    Args:
        warcfilename: Path of the WARC file to read.
        parameters: Connector configuration. Must contain ``'id'``. An
            optional ``'tmp'`` entry selects the base directory for the
            temp directory; an existing ``'container'`` entry is kept.
        verbose: Passed on to the connector.

    Raises:
        KeyError: If ``parameters`` has no ``'id'`` entry.
        OSError: If the WARC file cannot be opened or a temp file cannot
            be written.
    """
    # local import: only needed for the UTC conversion of WARC-Date
    import calendar

    # avoid a shared mutable default argument
    if parameters is None:
        parameters = {}

    # create temp dir where to unwarc the archive
    if 'tmp' in parameters:
        system_temp_dirname = parameters['tmp']
        if not os.path.exists(system_temp_dirname):
            os.mkdir(system_temp_dirname)
    else:
        system_temp_dirname = tempfile.gettempdir()

    # we build the temp dirname ourselves (derived from the document id)
    # so configurable / external tempdirs can be used
    h = hashlib.md5(parameters['id'].encode('UTF-8'))
    temp_dirname = (system_temp_dirname + os.path.sep
                    + "opensemanticetl_enhancer_warc_" + h.hexdigest())

    if not os.path.exists(temp_dirname):
        os.mkdir(temp_dirname)

    try:
        # prepare document processing
        connector = Connector_File()
        connector.verbose = verbose
        connector.config = parameters.copy()

        # only set container if not yet set by an enclosing archive
        if 'container' not in connector.config:
            connector.config['container'] = warcfilename

        with open(warcfilename, 'rb') as stream:
            # count every record (not only responses) so temp file names
            # carry the record's position in the archive
            for i, record in enumerate(ArchiveIterator(stream), 1):
                if record.rec_type != 'response':
                    continue

                # NOTE(review): printed regardless of ``verbose`` - kept
                # as is; confirm whether this should be gated.
                print(record.rec_headers)

                # write WARC record content to tempfile
                tempfilename = (temp_dirname + os.path.sep
                                + 'warcrecord' + str(i))
                with open(tempfilename, 'wb') as tmpfile:
                    tmpfile.write(record.content_stream().read())

                # set last modification time of the file to WARC-Date.
                # The parsed format ends in a literal "Z" (UTC), so use
                # calendar.timegm; time.mktime would interpret the value
                # as local time and shift it by the UTC offset.
                try:
                    last_modified = calendar.timegm(time.strptime(
                        record.rec_headers.get_header('WARC-Date'),
                        '%Y-%m-%dT%H:%M:%SZ'))
                    os.utime(tempfilename, (last_modified, last_modified))
                except Exception as e:
                    sys.stderr.write( "Exception while reading filedate to warc content {} from {} : {}\n".format(
                        tempfilename, connector.config['container'], e))

                # set id (URL and WARC Record ID)
                connector.config['id'] = record.rec_headers.get_header(
                    'WARC-Target-URI') + '/' + record.rec_headers.get_header('WARC-Record-ID')

                # index the extracted file; KeyboardInterrupt is not an
                # Exception subclass and therefore still propagates
                try:
                    connector.index_file(filename=tempfilename)
                except Exception as e:
                    sys.stderr.write( "Exception while indexing warc content {} from {} : {}\n".format(
                        tempfilename, connector.config['container'], e))

                os.remove(tempfilename)
    finally:
        # always clean up, also on an unreadable archive or an interrupt
        shutil.rmtree(temp_dirname)
if __name__ == "__main__":
    # Script entry point: parse the command line flags, copy them onto the
    # task modules and start the worker.
    from optparse import OptionParser

    parser = OptionParser("etl-tasks [options]")
    parser.add_option(
        "-q",
        "--quiet",
        dest="quiet",
        action="store_true",
        default=False,
        help="Don\'t print status (filenames) while indexing",
    )
    parser.add_option(
        "-v",
        "--verbose",
        dest="verbose",
        action="store_true",
        default=False,
        help="Print debug messages",
    )

    (options, args) = parser.parse_args()

    # propagate the verbose flag to this module and all task modules
    if options.verbose in (False, True):
        verbose = etl_delete.verbose = etl_file.verbose = \
            etl_web.verbose = etl_rss.verbose = options.verbose

    # the quiet flag only applies to the file task
    if options.quiet in (False, True):
        etl_file.quiet = options.quiet

    app.worker_main()
def pst2email(self, pstfilename, parameters=None, verbose=False):
    """Extract a PST mailbox with ``readpst`` and index the resulting files.

    ``readpst`` is run as an external process writing into a temp
    directory whose name contains the current process id and a hash of
    ``parameters['id']``. Every file found there is passed to
    ``Connector_File.index_file`` with a virtual id built from
    ``parameters['id']`` plus the file's path below the temp directory,
    and is deleted afterwards. Files without ``-`` in their name are
    renamed to ``*.eml`` first. The temp directory is removed when the
    method returns or raises.

    Args:
        pstfilename: Path of the PST file to convert.
        parameters: Connector configuration. Must contain ``'id'``. An
            optional ``'tmp'`` entry selects the base directory for the
            temp directory; an existing ``'container'`` entry is kept
            (PST nested inside another ZIP or PST).
        verbose: Print scanned directories/files and pass the flag on to
            the connector.

    Raises:
        KeyError: If ``parameters`` has no ``'id'`` entry.
        OSError: If the ``readpst`` executable cannot be started.
    """
    # avoid a shared mutable default argument
    if parameters is None:
        parameters = {}

    # we build the temp dirname ourselves (derived from pid and document
    # id) so configurable / external tempdirs can be used
    if 'tmp' in parameters:
        system_temp_dirname = parameters['tmp']
        if not os.path.exists(system_temp_dirname):
            os.mkdir(system_temp_dirname)
    else:
        system_temp_dirname = tempfile.gettempdir()

    h = hashlib.md5(parameters['id'].encode('UTF-8'))
    temp_dirname = (system_temp_dirname + os.path.sep
                    + "opensemanticetl_enhancer_pst_" + str(os.getpid())
                    + "_" + h.hexdigest())

    if not os.path.exists(temp_dirname):
        os.mkdir(temp_dirname)

    try:
        # start external PST extractor / converter
        result = subprocess.call(
            ['readpst', '-S', '-D', '-o', temp_dirname, pstfilename])

        # a failed conversion is only reported; whatever readpst managed
        # to write is still indexed below
        if result != 0:
            sys.stderr.write(
                "Error: readpst failed for {}\n".format(pstfilename))

        # prepare document processing
        connector = Connector_File()
        connector.verbose = verbose
        connector.config = parameters.copy()

        # only set container if not yet set by a ZIP or PST before
        # (if this PST is inside another ZIP or PST)
        if 'container' not in connector.config:
            connector.config['container'] = pstfilename

        for dirName, subdirList, fileList in os.walk(temp_dirname):
            if verbose:
                print('Scanning directory: %s' % dirName)

            for fileName in fileList:
                if verbose:
                    print('Scanning file: %s' % fileName)

                # Catch Exception (not BaseException) so KeyboardInterrupt
                # and SystemExit abort the run instead of being logged as
                # a per-file error.
                try:
                    # strip the temp dirname to get the path inside the PST
                    contained_dirname = dirName.replace(temp_dirname, '', 1)

                    # build a virtual filename pointing to original PST file
                    if contained_dirname:
                        contained_dirname = contained_dirname + os.path.sep
                    else:
                        contained_dirname = os.path.sep

                    connector.config['id'] = (
                        parameters['id'] + contained_dirname + fileName)

                    contained_filename = dirName + os.path.sep + fileName

                    # E-mail filenames are pure numbers, attachment file
                    # names are number-filename. A name without "-" is a
                    # mail file: rename it to suffix .eml so Tika will
                    # extract more metadata like from and to
                    if '-' not in fileName:
                        os.rename(contained_filename,
                                  contained_filename + '.eml')
                        contained_filename += '.eml'
                        connector.config['id'] += '.eml'

                    try:
                        connector.index_file(filename=contained_filename)
                    except Exception as e:
                        # format the exception itself: Python 3 exceptions
                        # have no ``message`` attribute
                        sys.stderr.write( "Exception while indexing contained content {} from {} : {}\n".format(
                            fileName, connector.config['container'], e))

                    # remove the file even if indexing it failed
                    os.remove(contained_filename)

                except Exception as e:
                    sys.stderr.write( "Exception while indexing file {} : {}\n".format(fileName, e))
    finally:
        # always clean up, also if readpst is missing or on an interrupt
        shutil.rmtree(temp_dirname)