def configure_trial_metadata(self):
    """
    This method configures the trial metadata file. This file contains information on the training job,
    including the data, model and task used, as well as things like hyperparameters, a training timestamp,
    and so on.

    Returns
    -----------
    void - stores the trial metadata, trial folder name and log file contents on the instance
    """

    metadata = {}
    metadata['model_name'] = self.model_name
    metadata['data_name'] = self.dataset_name
    metadata['model_hash'] = self.model_hash
    metadata['data_hash'] = self.data_hash
    metadata['argument_cmds'] = self.arg_str

    if self.task_name is not None:
        metadata['task_name'] = self.task_name
        metadata['task_hash'] = self.task_hash
    else:
        metadata['task_name'] = 'none'
        metadata['task_hash'] = 'none'

    metadata['hyperparameters'] = self.get_hyperparameter_dict(metadata_dict=metadata)
    metadata['start_timestamp'] = int((datetime.datetime.utcnow() - datetime.datetime(1970, 1, 1)).total_seconds())
    metadata['trial_group_hash'] = MantraHashed.get_256_hash_from_string(
        metadata['model_hash'] + metadata['data_hash'] + metadata['task_hash'])
    metadata['trial_hash'] = MantraHashed.get_256_hash_from_string(
        metadata['model_hash'] + metadata['data_hash'] + metadata['task_hash'] + str(metadata['start_timestamp']))

    self.trial_folder_name = '%s_%s_%s_%s' % (
        metadata['start_timestamp'], metadata['model_name'], metadata['data_name'],
        metadata['trial_hash'][:SHORT_HASH_INT])
    self.yaml_content = yaml.dump(metadata, default_flow_style=False)
    self.log_file_contents = '%s %s %s %s %s %s %s %s %s %s\n' % (
        metadata['start_timestamp'], self.trial_folder_name, metadata['trial_hash'],
        metadata['trial_group_hash'], metadata['model_name'], metadata['model_hash'],
        metadata['data_name'], metadata['data_hash'], metadata['task_name'], metadata['task_hash'])

    if self.name:
        trial_location = os.getcwd() + '/.mantra/TRIAL_GROUP_NAMES'

        with open(trial_location, 'r') as stream:
            yaml_content = yaml.load(stream, Loader=yaml.SafeLoader)

        if not yaml_content:
            yaml_content = {}

        yaml_content[metadata['trial_group_hash']] = self.name
        new_yaml_content = yaml.dump(yaml_content, default_flow_style=False)

        with open(trial_location, 'w') as yaml_file:
            yaml_file.write(new_yaml_content)
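# A minimal standalone sketch of the timestamp/hash logic used above, with hashlib standing in
# for MantraHashed and an assumed SHORT_HASH_INT of 6 (the real value lives in the project settings).
import datetime
import hashlib

SHORT_HASH_INT = 6  # assumption for illustration only

start_timestamp = int((datetime.datetime.utcnow() - datetime.datetime(1970, 1, 1)).total_seconds())
trial_hash = hashlib.sha256(('model_hash' + 'data_hash' + 'task_hash' + str(start_timestamp)).encode('utf-8')).hexdigest()
trial_folder_name = '%s_%s_%s_%s' % (start_timestamp, 'my_model', 'my_data', trial_hash[:SHORT_HASH_INT])
print(trial_folder_name)  # e.g. 1528371202_my_model_my_data_3f2a1b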
def test_create_file_hash_dict():

    file_info = MantraHashed.create_file_hash_dict(__file__, __file__)

    assert isinstance(file_info, dict)
    assert file_info['path'] == __file__
    assert file_info['hash'] == MantraHashed.get_256_hash_from_file(__file__)
    assert file_info['type'] == 'file'
    assert file_info['name'] == __file__
    assert isinstance(file_info['perm'], int)
    assert len(str(file_info['perm'])) == 3
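# Hedged sketch of what a file hash dict like the one tested above could look like, built with
# the standard library only; the real MantraHashed.create_file_hash_dict may differ in detail.
import hashlib
import os
import stat

def sketch_file_hash_dict(path):
    with open(path, 'rb') as f:
        file_hash = hashlib.sha256(f.read()).hexdigest()
    perm = int(oct(stat.S_IMODE(os.stat(path).st_mode))[-3:])  # e.g. 644
    return {'path': path, 'name': path, 'type': 'file', 'hash': file_hash, 'perm': perm}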
def version_artefact(self, artefact_type='MODELS', **kwargs):
    """
    This method versions an artefact used in training: data, models and tasks. We store it to the .mantra
    folder; this means we retrieve a hash for each artefact that we can record for the user in the UI, and it
    also allows the user to retrieve old model or dataset versions locally at any time.

    Parameters
    -----------
    artefact_type : str
        Specifies the type of DMT artefact (data-model-task)

    Returns
    -----------
    str - string to display to the user containing the hashed artefact
    """

    if artefact_type == 'MODELS':
        folder_name = 'models'
        artefact_name = self.model_name
    elif artefact_type == 'DATA':
        folder_name = 'data'
        artefact_name = self.dataset_name
    elif artefact_type == 'TASKS':
        folder_name = 'tasks'
        artefact_name = self.task_name

    artefact_dir = '%s/%s/%s' % (os.getcwd(), folder_name, artefact_name)
    artefact_hash, artefact_hash_dict = MantraHashed.get_folder_hash(folder_dir=artefact_dir)
    is_new_artefact = MantraHashed.save_artefact(
        cwd=os.getcwd(),
        hash=artefact_hash,
        objects=artefact_hash_dict,
        trial=self,
        artefact_type=artefact_type,
        **kwargs)

    if artefact_type == 'MODELS':
        self.model_hash = artefact_hash
        artefact_hash_text = colored(' \033[1m ...', 'white') + colored(' Model hash: %s' % self.model_hash, 'blue')
    elif artefact_type == 'DATA':
        self.data_hash = artefact_hash
        artefact_hash_text = colored(' \033[1m ...', 'white') + colored(' Data hash: %s' % self.data_hash, 'blue')
    elif artefact_type == 'TASKS':
        self.task_hash = artefact_hash
        artefact_hash_text = colored(' \033[1m ...', 'white') + colored(' Task hash: %s' % self.task_hash, 'blue')

    if is_new_artefact:
        artefact_hash_text += colored(' (new)', 'white')

    return artefact_hash_text
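# Hedged sketch of how a folder can be reduced to a single hash, in the spirit of get_folder_hash
# above: hash every file, then hash the sorted concatenation. The real implementation (which also
# tracks permissions and tree objects) may differ; this is illustrative only.
import hashlib
import os

def sketch_folder_hash(folder_dir):
    file_hashes = {}
    for root, _, files in os.walk(folder_dir):
        for name in sorted(files):
            path = os.path.join(root, name)
            with open(path, 'rb') as f:
                file_hashes[path] = hashlib.sha256(f.read()).hexdigest()
    folder_hash = hashlib.sha256(''.join(file_hashes[p] for p in sorted(file_hashes)).encode('utf-8')).hexdigest()
    return folder_hash, file_hashes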
def export_data_to_s3(self, args):
    """
    This method exports data module dependencies to the user's S3.

    - We check for the existence of a Mantra bucket. If it doesn't exist we create one.
    - Next we check the files class variable in the dataset class...
    - We calculate the (concatenated) hash of the files; if it differs from what we have on S3, we reupload.
    - In other words, we are only storing the data dependencies on S3 for convenience: the rest we treat as
      small code files that we can transfer easily between local/instances/S3 (e.g. data processing code)

    Parameters
    -----------
    args : Namespace
        Optional arguments that were used for training

    Returns
    -----------
    void - uploads the data dependencies to the user's S3 bucket
    """

    data_dir = '%s/data/%s/' % (os.getcwd(), args.dataset)
    config_dir = '%sconfig.yml' % data_dir
    s3_data_bucket_dir = 'data/%s/raw/' % args.dataset
    s3_data_hash_location = '%shash' % s3_data_bucket_dir

    # Hashing details
    local_data_dependency_hash = MantraHashed.get_data_dependency_hash(
        data_dir=data_dir, dataset_class=self.dataset_class)

    s3_client = boto3.client('s3')
    s3_resource = boto3.resource('s3')
    s3_buckets = [bucket['Name'] for bucket in s3_client.list_buckets()['Buckets']]

    self.create_s3_bucket(s3_client, s3_buckets)

    try:
        bucket_contents = [obj['Key'] for obj in s3_client.list_objects(Bucket=self.settings.S3_BUCKET_NAME)['Contents']]
    except KeyError:
        bucket_contents = []

    hash_object = s3_resource.Object(self.settings.S3_BUCKET_NAME, s3_data_hash_location)

    if s3_data_hash_location in bucket_contents:
        s3_hash = hash_object.get()['Body'].read().decode('utf-8')

        if s3_hash == local_data_dependency_hash:
            print(colored(' \033[1m [+]', 'green') + colored(' Data exported to S3', 'white'))
            return

    # If the hash is different, or we don't have the files in S3, then upload the dataset dependencies to S3
    for file in self.dataset_class.files:
        s3_client.upload_file(
            '%sraw/%s' % (data_dir, file),
            self.settings.S3_BUCKET_NAME,
            '%s%s' % (s3_data_bucket_dir, file),
            Callback=S3ProgressPercentage('%sraw/%s' % (data_dir, file)))

    hash_object.put(Body=local_data_dependency_hash)
    print(colored('\n \033[1m [+]', 'green') + colored(' Data exported to S3', 'white'))
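# Isolated, hedged sketch of the S3 hash check above using standard boto3 calls; the bucket and
# key names are placeholders rather than the real Mantra settings.
import boto3

def s3_hash_matches(bucket_name, hash_key, local_hash):
    s3_client = boto3.client('s3')
    s3_resource = boto3.resource('s3')
    keys = [obj['Key'] for obj in s3_client.list_objects(Bucket=bucket_name).get('Contents', [])]
    if hash_key not in keys:
        return False
    remote_hash = s3_resource.Object(bucket_name, hash_key).get()['Body'].read().decode('utf-8')
    return remote_hash == local_hash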
def extract_file_data(self):
    """
    This method extracts data from the files list, and checks hashes based on old extractions

    Returns
    --------
    void - extracts files and stores extract location
    """

    self.hash_location = '%s%s' % (self.data_dir, 'raw/hash')
    self.raw_data_path = '%s%s' % (self.data_dir, 'raw')
    self.extracted_data_path = '%s%s' % (self.raw_data_path, '/.extract')

    is_hash = os.path.isfile(self.hash_location)
    is_extract_folder = os.path.exists(self.extracted_data_path)
    is_data_folder = os.path.exists(self.data_dir)

    if not is_extract_folder:
        os.mkdir(self.extracted_data_path)

    file_hashes = self.get_data_dependency_hashes(is_extract_folder=is_extract_folder, is_hash=is_hash)
    final_hash = MantraHashed.get_256_hash_from_string(''.join(file_hashes))

    # If there is no hash then we store the hash
    if not is_hash:
        with open(self.hash_location, 'w') as hash_file:
            hash_file.write(final_hash)

    # If there is no extract folder then we create one and copy the files over
    if not is_extract_folder:
        for file in self.files:
            file_path = '%s/%s' % (self.raw_data_path, file)
            shutil.copy(file_path, self.extracted_data_path)
        return

    with open(self.hash_location, 'r') as hash_file:
        old_hash = hash_file.read()

    # If the hash of dependency files hasn't changed, we are good to go; else we copy the new files over
    if old_hash == final_hash:
        return
    else:
        for file in self.files:
            file_path = '%s/%s' % (self.raw_data_path, file)
            shutil.copy(file_path, self.extracted_data_path)
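# Small standalone sketch of the stored-hash bookkeeping above: write the hash on the first run,
# and on later runs report whether the freshly computed hash differs from the stored one.
import os

def hash_changed(hash_location, new_hash):
    if not os.path.isfile(hash_location):
        with open(hash_location, 'w') as f:
            f.write(new_hash)
        return True
    with open(hash_location, 'r') as f:
        old_hash = f.read()
    return old_hash != new_hash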
def test_get_tree_contents():

    tree_path = '/home/ubuntu'
    dirs = ['folder1', 'folder2']
    files = ['file1', 'file2', 'file3']

    ref_table = {}
    ref_table[tree_path] = {}

    ref_table['%s/%s' % (tree_path, 'folder1')] = {}
    ref_table['%s/%s' % (tree_path, 'folder1')]['perm'] = 700
    ref_table['%s/%s' % (tree_path, 'folder1')]['hash'] = 'hash1'

    ref_table['%s/%s' % (tree_path, 'folder2')] = {}
    ref_table['%s/%s' % (tree_path, 'folder2')]['perm'] = 700
    ref_table['%s/%s' % (tree_path, 'folder2')]['hash'] = 'hash2'

    ref_table['%s/%s' % (tree_path, 'folder3')] = {}
    ref_table['%s/%s' % (tree_path, 'folder3')]['perm'] = 700
    ref_table['%s/%s' % (tree_path, 'folder3')]['hash'] = 'hash3'

    ref_table['%s/%s' % (tree_path, 'file1')] = {}
    ref_table['%s/%s' % (tree_path, 'file1')]['perm'] = 700
    ref_table['%s/%s' % (tree_path, 'file1')]['hash'] = 'hash4'

    ref_table['%s/%s' % (tree_path, 'file2')] = {}
    ref_table['%s/%s' % (tree_path, 'file2')]['perm'] = 700
    ref_table['%s/%s' % (tree_path, 'file2')]['hash'] = 'hash5'

    ref_table['%s/%s' % (tree_path, 'file3')] = {}
    ref_table['%s/%s' % (tree_path, 'file3')]['perm'] = 700
    ref_table['%s/%s' % (tree_path, 'file3')]['hash'] = 'hash6'

    tree_str, tree_hash = MantraHashed.get_tree_contents(tree_path, dirs, files, ref_table)
    tree_lines = tree_str.split('\n')

    assert tree_lines[0] == '700 tree hash1 folder1 '
    assert tree_lines[1] == '700 tree hash2 folder2 '
    assert tree_lines[2] == '700 file hash4 file1 '
    assert tree_lines[3] == '700 file hash5 file2 '
    assert tree_lines[4] == '700 file hash6 file3 '
    assert tree_hash == 'b258eeaf5c932c3b57a0e1f955f11331df5b66f6a1dfb470686397f6c3726c4c'
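# Sketch of how a tree string like the one asserted above can be reduced to a single digest;
# the exact whitespace and ordering the real get_tree_contents uses may differ.
import hashlib

tree_lines = [
    '700 tree hash1 folder1 ',
    '700 tree hash2 folder2 ',
    '700 file hash4 file1 ',
    '700 file hash5 file2 ',
    '700 file hash6 file3 ',
]
tree_str = '\n'.join(tree_lines)
tree_hash = hashlib.sha256(tree_str.encode('utf-8')).hexdigest()
print(tree_hash)  # a 64-character hex digest identifying this directory snapshot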
def get_data_dependency_hashes(self, is_extract_folder, is_hash):
    """
    This method obtains a list of hashes of the file dependencies (Dataset.files) specified in the dataset.

    Parameters
    --------
    is_extract_folder - bool
        Whether an .extract folder currently exists within the data project folder

    is_hash - bool
        Whether a concatenated hash file exists for the raw data dependencies

    Returns
    --------
    list of strs - containing the hashes of the files in Dataset.files, hash of tar if exists
    """

    file_hashes = []

    for file in sorted(self.files):
        file_path = '%s/%s' % (self.raw_data_path, file)

        if not os.path.isfile(file_path):
            raise IOError('The following file does not exist: %s' % file)

        tar_hash = MantraHashed.get_256_hash_from_file(file_path)
        file_hashes.append(tar_hash)

        if not is_extract_folder or not is_hash:
            if self.extract_file_dict[file]:
                self.extract_tar_file(file_path)
            else:
                shutil.copy(file_path, self.extracted_data_path)

    return file_hashes
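# Minimal sketch of the extract-or-copy branch above, assuming tar archives are handled with the
# standard library's tarfile module; the real extract_tar_file helper may differ.
import shutil
import tarfile

def extract_or_copy(file_path, destination, is_tar):
    if is_tar:
        with tarfile.open(file_path) as tar:
            tar.extractall(path=destination)
    else:
        shutil.copy(file_path, destination)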
def test_get_256_hash_from_string():

    string_to_hash = 'E pluribus unum'
    file_hash = MantraHashed.get_256_hash_from_string(string_to_hash)

    assert isinstance(file_hash, str)
    assert len(file_hash) == 64
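# The 64-character length asserted above is simply the hex digest length of SHA-256, as a plain
# hashlib call shows.
import hashlib

print(len(hashlib.sha256('E pluribus unum'.encode('utf-8')).hexdigest()))  # 64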
def handle(self, args, unknown):

    if not Path("mantra.yml").exists():
        print("ERROR: Please run this command from your mantra project directory (i.e. the directory containing `mantra.yml`)")
        sys.exit(1)

    # collect the artefacts to upload
    if len(args.artefacts) == 0:
        # get all the datasets, models and results
        print("Uploading all datasets, models, tasks and results...")
        all_models = find_artefacts("", "models", "model.py")
        all_datasets = find_artefacts("", "data", "data.py")
        all_tasks = find_artefacts("", "tasks", "task.py")

        if Path("results").exists():
            all_results = [str(p) for p in Path("results").iterdir() if p.is_dir()]
        else:
            all_results = []

        all_artefacts = list(itertools.chain(all_models, all_datasets, all_tasks, all_results))
    else:
        all_artefacts = args.artefacts

    missing_artefacts = [a for a in all_artefacts if not Path(a).exists()]
    if len(missing_artefacts) > 0:
        print("ERROR: The following artefact(s) are missing: `%s`" % missing_artefacts)
        sys.exit(1)

    # TODO: Results will have dependencies, make sure those are taken into account

    # 1) Get the hashes for all the files and dependencies
    all_hashes = []
    for artefact_dir in all_artefacts:
        artefact_hash, file_hashes = MantraHashed.get_folder_hash(artefact_dir)
        all_hashes.append({
            "artefact_dir": artefact_dir,
            "artefact_hash": artefact_hash,
            "file_hashes": file_hashes,
        })

    # 2) Get the credentials: prompt for username and password
    mantrahub_user = input("Your mantrahub username: ")
    if mantrahub_user == "":
        print("ERROR: The username cannot be empty, quitting...")
        sys.exit(1)

    mantrahub_pass = getpass.getpass("Your mantrahub password: ")

    # 3) Get the diff of the local artefact hashes against the remote
    full_url = urljoin(args.remote, "/api/artefacts_diff")
    json_payload = json.dumps({"all_hashes": all_hashes})
    diff_response = requests.post(full_url, json=json_payload, auth=(mantrahub_user, mantrahub_pass))
    diff = json.loads(diff_response.json())["diff_hashes"]

    # 4) Upload any files the remote doesn't already have
    if diff:
        upload_url_base = urljoin(args.remote, "api/upload_file/")
        for artefact in diff:
            for k, v in artefact["file_hashes"].items():
                print("Uploading `%s`..." % v["path"])
                h = {"Content-Disposition": "attachment; filename=%s" % v["path"]}
                with open(v["path"], 'rb') as file_obj:
                    r = requests.put(upload_url_base + v["path"], files={'file': file_obj},
                                     headers=h, auth=(mantrahub_user, mantrahub_pass))
    else:
        print("No new files to upload...")

    # Finally, commit all the results
    commit_url = urljoin(args.remote, "api/artefacts_diff_commit")
    json_payload = json.dumps({"all_hashes": all_hashes, "diff_hashes": diff})
    commit_response = requests.post(commit_url, json=json_payload, auth=(mantrahub_user, mantrahub_pass))

    if commit_response.status_code != requests.codes.ok:
        print("ERROR: Commit not successful: %s" % commit_response.text)
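# Hedged sketch of the upload step above in isolation: PUT a single file to the remote with basic
# auth. The endpoint path mirrors the one used above; remote, user and password are placeholders.
import requests
from urllib.parse import urljoin

def upload_artefact_file(remote, path, user, password):
    url = urljoin(remote, 'api/upload_file/') + path
    headers = {'Content-Disposition': 'attachment; filename=%s' % path}
    with open(path, 'rb') as f:
        return requests.put(url, files={'file': f}, headers=headers, auth=(user, password))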