def rename_and_handle_fileexists(old_filepath, new_filepath):
    """Rename *old_filepath* to *new_filepath*, tolerating exact duplicates.

    If the destination already exists, the two files' hashes are compared:
    an identical duplicate is simply removed from the source side, while a
    mismatch re-raises ``FileExistsError``.

    Raises:
        FileExistsError: destination exists and its content differs.
    """
    try:
        os.rename(old_filepath, new_filepath)
    except FileExistsError:
        # Hash first so a missing/unreadable file surfaces before we print.
        is_duplicate = hash_file(old_filepath) == hash_file(new_filepath)
        print(" File already exists at {}".format(new_filepath))
        if not is_duplicate:
            raise FileExistsError(
                "File already exists and the hash does not match")
        os.remove(old_filepath)
def rename_and_handle_fileexists(old_filepath, new_filepath):
    """Move *old_filepath* to *new_filepath*, creating parent directories.

    Any missing directories above the destination are created first.  If
    the destination already exists with identical content, the source is
    deleted as a duplicate; a content mismatch re-raises ``FileExistsError``.

    Raises:
        FileExistsError: destination exists and its content differs.
    """
    destination = pathlib.Path(new_filepath)
    destination.parent.mkdir(parents=True, exist_ok=True)
    try:
        os.rename(old_filepath, new_filepath)
    except FileExistsError:
        # Compare before printing so hashing errors propagate unmasked.
        is_duplicate = hash_file(old_filepath) == hash_file(new_filepath)
        print(" File already exists at {}".format(new_filepath))
        if is_duplicate:
            os.remove(old_filepath)
        else:
            raise FileExistsError(
                "File already exists and the hash does not match")
def file_already_in_index(indexed_filepath, to_be_indexed_filepath, filehash):
    """Verify an already-indexed copy matches, then drop the incoming file.

    Re-hashes the file recorded in the index and, if it agrees with
    *filehash*, deletes *to_be_indexed_filepath* as a confirmed duplicate.

    Args:
        indexed_filepath: path where the index says the file already lives.
        to_be_indexed_filepath: freshly received duplicate to be removed.
        filehash: hash recorded in the index for this entry.

    Raises:
        FileNotFoundError: the indexed file is missing from its declared
            location.
        AssertionError: the indexed file's current hash disagrees with the
            index entry.
    """
    try:
        new_hash = hash_file(indexed_filepath)
    except FileNotFoundError:
        raise FileNotFoundError(
            "Indexed logfile can't be found in its declared location.")
    # Raise directly instead of `assert` so the integrity check still runs
    # under `python -O` (asserts are stripped with optimisation enabled).
    if new_hash != filehash:
        raise AssertionError(
            "The located file doesn't agree with the index hash.")
    print("Already exists in index")
    os.remove(to_be_indexed_filepath)
def index_logfiles(centre_map, machine_map, logfile_data_directory):
    """Index pending ``.trf`` logfiles against their Mosaiq records.

    Hashes everything under ``<data>/to_be_indexed`` in chunks of 50,
    removes files whose hash already appears in ``index.json`` (after
    verifying the indexed copy), and hands the genuinely new hashes to
    ``file_ready_to_be_indexed`` for Mosaiq lookup and filing into the
    appropriate sorting directory.

    Args:
        centre_map: mapping of centre name -> centre details; each entry
            must contain a ``"mosaiq_sql_server"`` value.
        machine_map: machine lookup passed through to
            ``file_ready_to_be_indexed``.
        logfile_data_directory: root directory holding ``index.json`` and
            the ``to_be_indexed`` / ``indexed`` / error subdirectories.
    """
    data_directory = logfile_data_directory
    index_filepath = os.path.abspath(os.path.join(data_directory, "index.json"))
    to_be_indexed_directory = os.path.abspath(
        os.path.join(data_directory, "to_be_indexed"))
    indexed_directory = os.path.abspath(os.path.join(data_directory, "indexed"))
    no_mosaiq_record_found = os.path.abspath(
        os.path.join(data_directory, "no_mosaiq_record_found"))
    unknown_error_in_logfile = os.path.abspath(
        os.path.join(data_directory, "unknown_error_in_logfile"))
    no_field_label_in_logfile = os.path.abspath(
        os.path.join(data_directory, "no_field_label_in_logfile"))

    centre_details = centre_map
    centre_server_map = {
        centre: centre_lookup["mosaiq_sql_server"]
        for centre, centre_lookup in centre_map.items()
    }

    sql_server_and_ports = [
        "{}".format(details["mosaiq_sql_server"])
        for _, details in centre_details.items()
    ]

    with open(index_filepath, "r") as json_data_file:
        index = json.load(json_data_file)

    indexset = set(index.keys())

    print("\nConnecting to Mosaiq SQL servers...")
    with multi_mosaiq_connect(sql_server_and_ports) as cursors:
        print("Globbing index directory...")
        to_be_indexed = glob(
            os.path.join(to_be_indexed_directory, "**/*.trf"), recursive=True)

        # Hash in chunks so progress is visible on large backlogs.
        chunk_size = 50
        number_to_be_indexed = len(to_be_indexed)
        to_be_indexed_chunked = [
            to_be_indexed[i:i + chunk_size]
            for i in range(0, number_to_be_indexed, chunk_size)
        ]

        for i, a_to_be_indexed_chunk in enumerate(to_be_indexed_chunked):
            print("\nHashing a chunk of logfiles ({}/{})".format(
                i + 1, len(to_be_indexed_chunked)))
            hashlist = [
                hash_file(filename, dot_feedback=True)
                for filename in a_to_be_indexed_chunk
            ]
            print(" ")

            to_be_indexed_dict = dict(zip(hashlist, a_to_be_indexed_chunk))
            hashset = set(hashlist)

            # Hashes already in the index: verify the indexed copy and
            # delete the duplicate.  Nothing in the loop mutates either
            # set, so no defensive list() copy is needed.
            for filehash in hashset.intersection(indexset):
                file_already_in_index(
                    os.path.join(indexed_directory,
                                 index[filehash]["filepath"]),
                    to_be_indexed_dict[filehash],
                    filehash,
                )

            # Genuinely new hashes: look up in Mosaiq and file accordingly.
            file_ready_to_be_indexed(
                cursors,
                list(hashset.difference(indexset)),
                to_be_indexed_dict,
                unknown_error_in_logfile,
                no_mosaiq_record_found,
                no_field_label_in_logfile,
                indexed_directory,
                index_filepath,
                index,
                machine_map,
                centre_details,
                centre_server_map,
            )

    print("Complete")