def rename_and_handle_fileexists(old_filepath, new_filepath):
    """Move ``old_filepath`` to ``new_filepath``, resolving name collisions.

    If the destination already exists (``os.rename`` raises
    ``FileExistsError``, e.g. on Windows), the two files are hashed with
    ``hash_file``. An identical duplicate is resolved by deleting the
    source file; differing contents re-raise ``FileExistsError``.

    Args:
        old_filepath: Path of the file to move.
        new_filepath: Destination path.

    Raises:
        FileExistsError: Destination exists and its hash differs from
            the source file's hash.
    """
    try:
        os.rename(old_filepath, new_filepath)
    except FileExistsError:
        # Hash before printing so any hashing error surfaces first,
        # matching the original side-effect order.
        duplicate_is_identical = (
            hash_file(old_filepath) == hash_file(new_filepath))
        print(" File already exists at {}".format(new_filepath))
        if not duplicate_is_identical:
            raise FileExistsError(
                "File already exists and the hash does not match")
        os.remove(old_filepath)
def file_already_in_index(indexed_filepath, to_be_indexed_filepath, filehash):
    """Verify an already-indexed logfile and drop the incoming duplicate.

    Re-hashes the file recorded in the index, checks it still matches the
    indexed hash, then removes the newly received duplicate.

    Args:
        indexed_filepath: Path where the index says this logfile lives.
        to_be_indexed_filepath: Path of the incoming duplicate to delete.
        filehash: Hash recorded in the index for this logfile.

    Raises:
        FileNotFoundError: The indexed file is missing from its declared
            location.
        AssertionError: The file on disk no longer matches the indexed
            hash (index corruption or file tampering).
    """
    try:
        new_hash = hash_file(indexed_filepath)
    except FileNotFoundError:
        raise FileNotFoundError(
            "Indexed logfile can't be found in its declared location.")
    # Explicit raise instead of `assert`: an assert statement is stripped
    # under `python -O`, which would silently skip this integrity check.
    # AssertionError is kept so existing callers still catch the same type.
    if new_hash != filehash:
        raise AssertionError(
            "The located file doesn't agree with the index hash.")
    print('Already exists in index')
    os.remove(to_be_indexed_filepath)
def index_logfiles(centre_map, machine_map, logfile_data_directory):
    """Index new ``.trf`` logfiles against their Mosaiq records.

    Loads ``index.json`` from the data directory, hashes the files waiting
    in ``to_be_indexed/`` in chunks of 50, discards hashes already present
    in the index (via ``file_already_in_index``), and hands the remainder
    to ``file_ready_to_be_indexed`` together with open Mosaiq SQL cursors.

    Args:
        centre_map: Mapping of centre id -> centre config; each value is
            expected to contain a ``'mosaiq_sql_server'`` entry
            (presumably ``"host:port"`` — TODO confirm against config).
        machine_map: Machine lookup passed through to
            ``file_ready_to_be_indexed``.
        logfile_data_directory: Root directory containing ``index.json``
            and the ``to_be_indexed``/``indexed``/error subdirectories.
    """
    data_directory = logfile_data_directory
    index_filepath = os.path.abspath(
        os.path.join(data_directory, 'index.json'))
    to_be_indexed_directory = os.path.abspath(
        os.path.join(data_directory, 'to_be_indexed'))
    indexed_directory = os.path.abspath(
        os.path.join(data_directory, 'indexed'))
    no_mosaiq_record_found = os.path.abspath(
        os.path.join(data_directory, 'no_mosaiq_record_found'))
    unknown_error_in_logfile = os.path.abspath(
        os.path.join(data_directory, 'unknown_error_in_logfile'))
    no_field_label_in_logfile = os.path.abspath(
        os.path.join(data_directory, 'no_field_label_in_logfile'))

    centre_details = centre_map
    centre_server_map = {
        centre: centre_lookup['mosaiq_sql_server']
        for centre, centre_lookup in centre_map.items()
    }
    sql_server_and_ports = [
        "{}".format(details['mosaiq_sql_server'])
        for _, details in centre_details.items()
    ]

    with open(index_filepath, 'r') as json_data_file:
        index = json.load(json_data_file)
    indexset = set(index.keys())

    print('\nConnecting to Mosaiq SQL servers...')
    with multi_mosaiq_connect(sql_server_and_ports) as cursors:
        print('Globbing index directory...')
        to_be_indexed = glob(
            os.path.join(to_be_indexed_directory, '**/*.trf'),
            recursive=True)

        # Hash in chunks so progress is visible on large backlogs.
        chunk_size = 50
        number_to_be_indexed = len(to_be_indexed)
        to_be_indexed_chunked = [
            to_be_indexed[i:i + chunk_size]
            for i in range(0, number_to_be_indexed, chunk_size)
        ]

        for i, a_to_be_indexed_chunk in enumerate(to_be_indexed_chunked):
            print('\nHashing a chunk of logfiles ({}/{})'.format(
                i + 1, len(to_be_indexed_chunked)))
            hashlist = [
                hash_file(filename, dot_feedback=True)
                for filename in a_to_be_indexed_chunk
            ]
            print(' ')
            # NOTE(review): duplicate hashes within a chunk silently
            # collapse to one entry here — presumably intended, as the
            # duplicates are byte-identical files; verify.
            to_be_indexed_dict = dict(zip(hashlist, a_to_be_indexed_chunk))
            hashset = set(hashlist)

            # Already indexed: verify and remove the incoming duplicate.
            for filehash in hashset.intersection(indexset):
                file_already_in_index(
                    os.path.join(
                        indexed_directory, index[filehash]['filepath']),
                    to_be_indexed_dict[filehash], filehash)

            # New hashes: look up Mosaiq records and move files into place.
            file_ready_to_be_indexed(
                cursors, list(hashset.difference(indexset)),
                to_be_indexed_dict, unknown_error_in_logfile,
                no_mosaiq_record_found, no_field_label_in_logfile,
                indexed_directory, index_filepath, index, machine_map,
                centre_details, centre_server_map)

    print('Complete')