def test_file_mod_wait_time(self):
    """
    Test that the file mod wait time is actually waited out before a newly
    created file is reported as found.

    Raises an Exception if no file is found within roughly 60 seconds, and
    fails the test if the file is found sooner than the configured
    FILE_MOD_WAIT_TIME.
    """
    memento = None
    file_harvester = SingleDirectoryHarvester(CONFIG, memento,
                                              self.new_file_found_callback,
                                              self.modified_files_found_callback,
                                              self.file_exception_callback)
    file_harvester.start()

    # the finally clause guarantees the harvester is stopped on every exit
    # path, including the timeout exception and the early-find failure
    try:
        # put a file in the directory, the mod time will be the create time
        self.fill_directory_with_files(CONFIG[DataSetDriverConfigKeys.DIRECTORY],
                                       CONFIG[DataSetDriverConfigKeys.PATTERN],
                                       0, 1, 0)

        # poll once a second, keeping track of approximately how long it
        # takes to find the file
        file_found_time = 0
        while self.found_file_count == 0:
            time.sleep(1)
            file_found_time += 1
            if file_found_time > 60:
                raise Exception("Timeout waiting to find file")

        if file_found_time < CONFIG.get(DataSetDriverConfigKeys.FILE_MOD_WAIT_TIME):
            # we found the file before the mod time, this is bad!
            self.fail('Files found in %s seconds' % file_found_time)

        log.debug('File found in %s seconds', file_found_time)
    finally:
        file_harvester.shutdown()
def test_harvester_without_frequency(self):
    """
    Test that we can use a default frequency: the config passed to the
    harvester only contains the directory and the pattern.
    """
    config = {'directory': TESTDIR, 'pattern': CONFIG['pattern']}

    # start the harvester from scratch
    memento = None
    file_harvester = SingleDirectoryHarvester(config, memento,
                                              self.new_file_found_callback,
                                              self.file_exception_callback)
    file_harvester.start()

    # make sure the harvester is stopped even if one of the waits fails
    try:
        # start a new event which will copy the first file and increase the
        # file index into data directory with a delay in between
        self.directory_filler = gevent.spawn(self.fill_directory_with_files,
                                             CONFIG['directory'],
                                             CONFIG['pattern'], 2)

        # Wait for three sets of new files to be discovered
        self.wait_for_file(0)
        for _ in range(2):
            self.wait_for_file(self.found_file_count)
    finally:
        file_harvester.shutdown()
def test_harvester_without_mod_time(self):
    """
    Test that the harvester works when the config does not specify a file
    mod wait time (directory, storage directory, pattern and frequency are
    the only keys supplied).
    """
    config = {DataSetDriverConfigKeys.DIRECTORY: TESTDIR,
              DataSetDriverConfigKeys.STORAGE_DIRECTORY: TESTDIR,
              DataSetDriverConfigKeys.PATTERN: CONFIG[DataSetDriverConfigKeys.PATTERN],
              DataSetDriverConfigKeys.FREQUENCY: 5}

    # start the harvester from scratch
    memento = None
    file_harvester = SingleDirectoryHarvester(config, memento,
                                              self.new_file_found_callback,
                                              self.modified_files_found_callback,
                                              self.file_exception_callback)
    file_harvester.start()

    # make sure the harvester is stopped even if one of the waits fails
    try:
        # start a new event which will increase the file index using INDICIES
        self.directory_filler = gevent.spawn(self.fill_directory_with_files,
                                             CONFIG[DataSetDriverConfigKeys.DIRECTORY],
                                             CONFIG[DataSetDriverConfigKeys.PATTERN],
                                             0, 2)

        # Wait for two sets of new files to be discovered
        self.wait_for_file(0, 2)
        self.wait_for_file(self.found_file_count, 2)
    finally:
        file_harvester.shutdown()
def test_harvester_from_scratch(self):
    """
    Test that the harvester can find files as they are added to a directory,
    starting with just the base file in the directory
    """
    # start the harvester from scratch
    memento = None
    file_harvester = SingleDirectoryHarvester(CONFIG, memento,
                                              self.new_file_found_callback,
                                              self.file_exception_callback)
    file_harvester.start()

    # make sure the harvester is stopped even if one of the waits fails
    try:
        # start a new event which will increase the file index using INDICIES
        self.directory_filler = gevent.spawn(self.fill_directory_with_files,
                                             CONFIG['directory'],
                                             CONFIG['pattern'], 0, 6)

        # Wait for seven sets of new files to be discovered: the first one
        # from a count of zero, then six more from the current count
        self.wait_for_file(0)
        for _ in range(6):
            self.wait_for_file(self.found_file_count)
    finally:
        file_harvester.shutdown()
def test_harvester_with_memento(self):
    """
    Test that the harvester can find files as they are added to a directory,
    using a memento to start partway through the indices
    """
    # make sure we have 2 files already in the directory
    self.fill_directory_with_files(CONFIG['directory'], CONFIG['pattern'], 0, 2, 0)

    # the memento is the path of the file built from INDICIES[1]
    memento = CONFIG['directory'] + '/' + 'unit_' + INDICIES[1] + CONFIG['pattern'].replace('*', '')
    log.debug("starting with memento %s", memento)
    file_harvester = SingleDirectoryHarvester(CONFIG, memento,
                                              self.new_file_found_callback,
                                              self.file_exception_callback)
    file_harvester.start()

    # make sure the harvester is stopped even if one of the waits fails
    try:
        # start a new event which will increase the file index using INDICIES
        # with a delay in between
        self.directory_filler = gevent.spawn(self.fill_directory_with_files,
                                             CONFIG['directory'],
                                             CONFIG['pattern'], 2, 9)

        # Wait for nine sets of new files to be discovered: the first one
        # from a count of zero, then eight more from the current count
        self.wait_for_file(0)
        for _ in range(8):
            self.wait_for_file(self.found_file_count)
    finally:
        file_harvester.shutdown()
def test_missing_directory(self):
    """
    Test that the harvester finds files in a directory that has been
    removed and re-created just before the harvester is started.
    """
    config = {'directory': TESTDIR, 'pattern': CONFIG['pattern']}

    # remove the test directory entirely and confirm that it is gone
    self.clean_directory(TESTDIR)
    os.rmdir(TESTDIR)
    self.assertFalse(os.path.exists(TESTDIR))

    # start the harvester from scratch, on a freshly created directory
    memento = None
    os.mkdir(TESTDIR)
    file_harvester = SingleDirectoryHarvester(config, memento,
                                              self.new_file_found_callback,
                                              self.file_exception_callback)
    file_harvester.start()

    # make sure the harvester is stopped even if one of the waits fails
    try:
        # start a new event which will increase the file index using INDICIES
        self.directory_filler = gevent.spawn(self.fill_directory_with_files,
                                             CONFIG['directory'],
                                             CONFIG['pattern'], 0, 2)

        # Wait for two sets of new files to be discovered
        for _ in range(2):
            self.wait_for_file(self.found_file_count)
    finally:
        file_harvester.shutdown()
def test_harvester_from_scratch(self):
    """
    Test that the harvester can find files as they are added to a directory,
    starting with just the base file in the directory
    """
    # start the harvester from scratch
    memento = None

    # override the file mod wait time on a copy so CONFIG itself is untouched;
    # the copy (not CONFIG) must be handed to the harvester for the override
    # to have any effect
    config = CONFIG.copy()
    config[DataSetDriverConfigKeys.FILE_MOD_WAIT_TIME] = 10
    file_harvester = SingleDirectoryHarvester(config, memento,
                                              self.new_file_found_callback,
                                              self.modified_files_found_callback,
                                              self.file_exception_callback)
    file_harvester.start()

    # make sure the harvester is stopped even if one of the waits fails
    try:
        # start a new event which will increase the file index using INDICIES
        self.directory_filler = gevent.spawn(self.fill_directory_with_files,
                                             CONFIG[DataSetDriverConfigKeys.DIRECTORY],
                                             CONFIG[DataSetDriverConfigKeys.PATTERN],
                                             0, 5, 10)

        # Wait for new files to be discovered: once from a count of zero,
        # then four more times from the current count
        self.wait_for_file(0, 5)
        for _ in range(4):
            self.wait_for_file(self.found_file_count, 5)
    finally:
        file_harvester.shutdown()
def test_harvester_multi_file(self):
    """
    Set the timing so the harvester finds multiple new files at once
    """
    # work on a copy so the shared CONFIG is not modified
    config = CONFIG.copy()
    config[DataSetDriverConfigKeys.FREQUENCY] = 1
    config[DataSetDriverConfigKeys.FILE_MOD_WAIT_TIME] = 15

    # start the harvester from scratch
    memento = None
    file_harvester = SingleDirectoryHarvester(config, memento,
                                              self.new_file_found_callback,
                                              self.modified_files_found_callback,
                                              self.file_exception_callback)
    file_harvester.start()

    # make sure the harvester is stopped even if one of the waits fails
    try:
        # set the file filler to generate files with only .5 secs between,
        # meaning 2 files will appear in the 1 seconds between the
        # harvester checking
        self.directory_filler = gevent.spawn(self.fill_directory_with_files,
                                             CONFIG[DataSetDriverConfigKeys.DIRECTORY],
                                             CONFIG[DataSetDriverConfigKeys.PATTERN],
                                             0, 12, .5)

        # Wait for sets of new files to be discovered: once from a count of
        # zero, then four more times from the current count
        self.wait_for_file(0)
        for _ in range(4):
            self.wait_for_file(self.found_file_count)
    finally:
        file_harvester.shutdown()
def test_harvester_with_memento(self):
    """
    Test that the harvester can find files as they are added to a directory,
    using a memento to start partway through the indices
    """
    # make sure we have 2 files already in the directory
    self.fill_directory_with_files(CONFIG['directory'], CONFIG['pattern'], 2, 0)

    # build the memento from the first matching file in the directory, with
    # its file index replaced by 2
    dir_files = glob.glob(CONFIG['directory'] + '/' + CONFIG['pattern'])
    memento = self.replace_file_index(dir_files[0], 2)
    file_harvester = SingleDirectoryHarvester(CONFIG, memento,
                                              self.new_file_found_callback,
                                              self.file_exception_callback)
    file_harvester.start()

    # make sure the harvester is stopped even if one of the waits fails
    try:
        # start a new event which will copy the first file and increase the
        # file index into data directory with a delay in between
        self.directory_filler = gevent.spawn(self.fill_directory_with_files,
                                             CONFIG['directory'],
                                             CONFIG['pattern'], 3)

        # Wait for three sets of new files to be discovered
        self.wait_for_file(0)
        for _ in range(2):
            self.wait_for_file(self.found_file_count)
    finally:
        file_harvester.shutdown()
def test_harvester_multi_file(self):
    """
    Set the timing so the harvester finds multiple new files at once
    """
    # start the harvester from scratch
    memento = None
    file_harvester = SingleDirectoryHarvester(CONFIG, memento,
                                              self.new_file_found_callback,
                                              self.file_exception_callback)
    file_harvester.start()

    # make sure the harvester is stopped even if one of the waits fails
    try:
        # set the file filler to generate files with only .5 secs between,
        # meaning 2 files will appear in the 1 seconds between the
        # harvester checking
        self.directory_filler = gevent.spawn(self.fill_directory_with_files,
                                             CONFIG['directory'],
                                             CONFIG['pattern'], 0, 12, .5)

        # Wait for sets of new files to be discovered: once from a count of
        # zero, then four more times from the current count
        self.wait_for_file(0)
        for _ in range(4):
            self.wait_for_file(self.found_file_count)
    finally:
        file_harvester.shutdown()
def test_init(self):
    """
    Test initialize: build a harvester with no memento, start it and shut it
    straight down again.
    """
    # no memento, so the harvester starts from scratch
    harvester = SingleDirectoryHarvester(
        CONFIG,
        None,
        self.new_file_found_callback,
        self.modified_files_found_callback,
        self.file_exception_callback,
    )
    harvester.start()
    harvester.shutdown()
def test_harvester_with_memento(self):
    """
    Test that the harvester can find files as they are added to a directory,
    using a memento to start partway through the indices
    """
    # make sure we have 2 files already in the directory
    self.fill_directory_with_files(CONFIG[DataSetDriverConfigKeys.DIRECTORY],
                                   CONFIG[DataSetDriverConfigKeys.PATTERN],
                                   0, 2, 0)

    # file names are 'unit_' + index + the pattern with its wildcard removed
    suffix = CONFIG[DataSetDriverConfigKeys.PATTERN].replace('*', '')
    filename_1 = 'unit_' + INDICIES[0] + suffix
    filename_2 = 'unit_' + INDICIES[1] + suffix

    # get metadata for the files and mark both as already ingested
    metadata_1 = self.get_file_metadata(filename_1)
    metadata_1[DriverStateKey.INGESTED] = True
    metadata_1[DriverStateKey.PARSER_STATE] = None
    metadata_2 = self.get_file_metadata(filename_2)
    metadata_2[DriverStateKey.INGESTED] = True
    metadata_2[DriverStateKey.PARSER_STATE] = None

    # generate memento with two files ingested (parser state is not looked at)
    memento = {DriverStateKey.VERSION: 0.1,
               filename_1: metadata_1,
               filename_2: metadata_2}
    log.debug("starting with memento %s", memento)

    # override the file mod wait time on a copy so CONFIG itself is untouched;
    # the copy (not CONFIG) must be handed to the harvester for the override
    # to have any effect
    config = CONFIG.copy()
    config[DataSetDriverConfigKeys.FILE_MOD_WAIT_TIME] = 15
    file_harvester = SingleDirectoryHarvester(config, memento,
                                              self.new_file_found_callback,
                                              self.modified_files_found_callback,
                                              self.file_exception_callback)
    file_harvester.start()

    # make sure the harvester is stopped even if one of the waits fails
    try:
        # start a new event which will increase the file index using INDICIES
        # with a delay in between
        self.directory_filler = gevent.spawn(self.fill_directory_with_files,
                                             CONFIG[DataSetDriverConfigKeys.DIRECTORY],
                                             CONFIG[DataSetDriverConfigKeys.PATTERN],
                                             2, 9, 5)

        # Wait for six sets of new files to be discovered: the first one
        # from a count of zero, then five more from the current count
        self.wait_for_file(0, 2)
        for _ in range(5):
            self.wait_for_file(self.found_file_count, 2)
    finally:
        file_harvester.shutdown()
def test_init(self):
    """
    Test initialize: build a harvester with a minimal config, exercise
    sort_files once, then start it and shut it down.
    """
    minimal_config = dict(directory=TESTDIR, pattern=CONFIG['pattern'])

    # no memento, so the harvester starts from scratch
    harvester = SingleDirectoryHarvester(
        minimal_config,
        None,
        self.new_file_found_callback,
        self.file_exception_callback,
    )

    sample_names = ['a_1_2.bla', 'a_2_2.bla']
    harvester.sort_files(sample_names)

    harvester.start()
    harvester.shutdown()
def test_harvester_with_modified(self):
    """
    Test that the harvester reports a modified file: start from a memento
    that marks two existing files as ingested, append to the first file and
    wait for the modified count to become non-zero.

    Raises an Exception if no modified file is reported within roughly
    60 seconds.
    """
    # make sure we have 2 files already in the directory
    self.fill_directory_with_files(CONFIG[DataSetDriverConfigKeys.DIRECTORY],
                                   CONFIG[DataSetDriverConfigKeys.PATTERN],
                                   0, 2, 0)

    # file names are 'unit_' + index + the pattern with its wildcard removed
    suffix = CONFIG[DataSetDriverConfigKeys.PATTERN].replace('*', '')
    filename_1 = 'unit_' + INDICIES[0] + suffix
    filename_2 = 'unit_' + INDICIES[1] + suffix

    # get metadata for the files and mark both as already ingested
    metadata_1 = self.get_file_metadata(filename_1)
    metadata_1[DriverStateKey.INGESTED] = True
    metadata_1[DriverStateKey.PARSER_STATE] = None
    metadata_2 = self.get_file_metadata(filename_2)
    metadata_2[DriverStateKey.INGESTED] = True
    metadata_2[DriverStateKey.PARSER_STATE] = None

    # generate memento with two files ingested (parser state is not looked at)
    memento = {DriverStateKey.VERSION: 0.1,
               filename_1: metadata_1,
               filename_2: metadata_2}
    log.debug("starting with memento %s", memento)

    # override the file mod wait time on a copy so CONFIG itself is untouched;
    # the copy (not CONFIG) must be handed to the harvester for the override
    # to have any effect
    config = CONFIG.copy()
    config[DataSetDriverConfigKeys.FILE_MOD_WAIT_TIME] = 15
    file_harvester = SingleDirectoryHarvester(config, memento,
                                              self.new_file_found_callback,
                                              self.modified_files_found_callback,
                                              self.file_exception_callback)
    file_harvester.start()

    # the finally clause guarantees the harvester is stopped on every exit
    # path, including the timeout exception
    try:
        # append to the first (already ingested) file so that it changes
        file_path = os.path.join(CONFIG[DataSetDriverConfigKeys.DIRECTORY], filename_1)
        with open(file_path, 'a') as filehandle:
            filehandle.write('a b c d')

        # poll every two seconds until a modified file has been counted
        end_time = 0
        while self.found_modified_count == 0:
            log.debug("Waiting for modified file...")
            time.sleep(2)
            end_time += 2
            if end_time > 60:
                raise Exception("Timeout waiting to find modified files")
    finally:
        file_harvester.shutdown()