Example #1
    def configure_trial_metadata(self):
        """
        This method configures the trial metadata file. This file contains information on the training job, including the data, model and task used, 
        as well as things like hyperparameters, a training timestamp, and so on.

        Parameters
        -----------
        arg_dict : dict
            Containing arguments that were used for training
        """

        metadata = {}
        metadata['model_name'] = self.model_name
        metadata['data_name'] = self.dataset_name 
        metadata['model_hash'] = self.model_hash
        metadata['data_hash'] = self.data_hash
        metadata['argument_cmds'] = self.arg_str

        if self.task_name is not None:
            metadata['task_name'] = self.task_name
            metadata['task_hash'] = self.task_hash
        else:
            metadata['task_name'] = 'none'
            metadata['task_hash'] = 'none'

        metadata['hyperparameters'] = self.get_hyperparameter_dict(metadata_dict=metadata)

        metadata['start_timestamp'] = int((datetime.datetime.utcnow() - datetime.datetime(1970, 1, 1)).total_seconds())
        metadata['trial_group_hash'] = MantraHashed.get_256_hash_from_string(metadata['model_hash'] + metadata['data_hash'] + metadata['task_hash'])
        metadata['trial_hash'] = MantraHashed.get_256_hash_from_string(metadata['model_hash'] + metadata['data_hash'] + metadata['task_hash'] + str(metadata['start_timestamp']))

        self.trial_folder_name = '%s_%s_%s_%s' % (metadata['start_timestamp'], metadata['model_name'], metadata['data_name'], metadata['trial_hash'][:SHORT_HASH_INT])
        self.yaml_content = yaml.dump(metadata, default_flow_style=False)
        self.log_file_contents = '%s %s %s %s %s %s %s %s %s %s\n' % (metadata['start_timestamp'], self.trial_folder_name, metadata['trial_hash'],
            metadata['trial_group_hash'],
            metadata['model_name'],
            metadata['model_hash'],
            metadata['data_name'],
            metadata['data_hash'],
            metadata['task_name'],
            metadata['task_hash'])

        if self.name:
            trial_location = os.getcwd() + '/.mantra/TRIAL_GROUP_NAMES'
            
            with open(trial_location, 'r') as stream:
                yaml_content = yaml.safe_load(stream)

                if not yaml_content:
                    yaml_content = {}

            yaml_content[metadata['trial_group_hash']] = self.name

            new_yaml_content = yaml.dump(yaml_content, default_flow_style=False)
            
            with open(trial_location, 'w') as yaml_file:
                yaml_file.write(new_yaml_content)
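The method above also maintains a .mantra/TRIAL_GROUP_NAMES lookup from trial group hashes to user-assigned names. Below is a minimal sketch of reading that lookup back, assuming only the path and YAML format shown above; the helper name is hypothetical and the missing-file handling is an assumption.

import os
import yaml

def lookup_trial_group_name(trial_group_hash):
    """Return the user-assigned name for a trial group hash, or None if unnamed."""
    trial_location = os.path.join(os.getcwd(), '.mantra', 'TRIAL_GROUP_NAMES')

    if not os.path.isfile(trial_location):
        return None  # assumption: no trial groups have been named yet

    with open(trial_location, 'r') as stream:
        yaml_content = yaml.safe_load(stream) or {}

    return yaml_content.get(trial_group_hash)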
Example #2
def test_create_file_hash_dict():
    file_info = MantraHashed.create_file_hash_dict(__file__, __file__)
    assert (isinstance(file_info, dict))
    assert (file_info['path'] == __file__)
    assert (file_info['hash'] == MantraHashed.get_256_hash_from_file(__file__))
    assert (file_info['type'] == 'file')
    assert (file_info['name'] == __file__)
    assert (isinstance(file_info['perm'], int))
    assert (len(str(file_info['perm'])) == 3)
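The assertions above pin down the shape of the dictionary that create_file_hash_dict returns. The following is a hedged reconstruction consistent with those assertions, not the library's actual implementation; hashlib stands in for MantraHashed.get_256_hash_from_file, and the permission handling is an assumption.

import hashlib
import os
import stat

def create_file_hash_dict_sketch(path, name):
    """Build a file-info dict with the keys the test above asserts."""
    with open(path, 'rb') as f:
        file_hash = hashlib.sha256(f.read()).hexdigest()
    return {
        'path': path,
        'name': name,
        'type': 'file',
        'hash': file_hash,  # stand-in for MantraHashed.get_256_hash_from_file
        'perm': int(oct(stat.S_IMODE(os.stat(path).st_mode))[-3:]),  # e.g. 644
    }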
Example #3
    def version_artefact(self, artefact_type='MODELS', **kwargs):
        """
        This method versions an artefacts used in the training: data, models tasks. We store to the .mantra folder - it means that we retrieve
        a hash for each artefact that we can record for the user in the UI; and also allows the user to retrieve old model
        or dataset versions on their local at any time.

        Parameters
        -----------
        artefact_type : str
            Specifies the type of DMT artefact (data-model-task)

        Returns
        -----------
        str - string to display to user containing the hashed artefact
        """

        if artefact_type == 'MODELS':
            folder_name = 'models'
            artefact_name = self.model_name
        elif artefact_type == 'DATA':
            folder_name = 'data'
            artefact_name = self.dataset_name
        elif artefact_type == 'TASKS':
            folder_name = 'tasks'
            artefact_name = self.task_name
        else:
            raise ValueError("Unknown artefact_type: '%s'" % artefact_type)

        artefact_dir = '%s/%s/%s' % (os.getcwd(), folder_name, artefact_name)
        artefact_hash, artefact_hash_dict = MantraHashed.get_folder_hash(
            folder_dir=artefact_dir)

        is_new_artefact = MantraHashed.save_artefact(
            cwd=os.getcwd(),
            hash=artefact_hash,
            objects=artefact_hash_dict,
            trial=self,
            artefact_type=artefact_type,
            **kwargs)

        if artefact_type == 'MODELS':
            self.model_hash = artefact_hash
            artefact_hash_text = colored(' \033[1m ...', 'white') + colored(
                ' Model hash:        %s' % self.model_hash, 'blue')
        elif artefact_type == 'DATA':
            self.data_hash = artefact_hash
            artefact_hash_text = colored(' \033[1m ...', 'white') + colored(
                ' Data hash:         %s' % self.data_hash, 'blue')
        elif artefact_type == 'TASKS':
            self.task_hash = artefact_hash
            artefact_hash_text = colored(' \033[1m ...', 'white') + colored(
                ' Task hash:         %s' % self.task_hash, 'blue')

        if is_new_artefact:
            artefact_hash_text += colored(' (new)', 'white')

        return artefact_hash_text
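In a training run the method above would plausibly be called once per artefact type. The calling context is not shown in the source, so the driver below is an assumption; it only uses the attributes and return value visible above.

def version_all_artefacts(trainer):
    """Version the model, data and (optional) task folders, printing each hash line."""
    for artefact_type in ('MODELS', 'DATA', 'TASKS'):
        if artefact_type == 'TASKS' and trainer.task_name is None:
            continue  # tasks are optional, mirroring configure_trial_metadata
        print(trainer.version_artefact(artefact_type=artefact_type))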
Example #4
    def export_data_to_s3(self, args):
        """
        This method exports data module dependencies to the user's S3. 

        - We check for the existence of an Mantra bucket. If it doesn't exist we create one. 
        - Next we check the files class variable in the dataset class....
        - We calculate the (concatenated) hash of the files; if it differs from what we have on S3, we reupload.
        - In other words, we are only storing the data dependencies on S3 for convenience: the rest we treat as small code files that 
            we can transfer easily between local/instances/S3 (e.g. data processing code)
    
        Parameters
        -----------

        args : list
            Optional arguments that were used for training

        Returns
        -----------
        void - setups the instance with the appropriate environment and files
        """

        data_dir = '%s/data/%s/' % (os.getcwd(), args.dataset)
        config_dir = '%sconfig.yml' % data_dir
        s3_data_bucket_dir = 'data/%s/raw/' % args.dataset
        s3_data_hash_location = '%shash' % s3_data_bucket_dir

        # Hashing details
        local_data_dependency_hash = MantraHashed.get_data_dependency_hash(data_dir=data_dir, dataset_class=self.dataset_class)

        s3_client = boto3.client('s3')
        s3_resource = boto3.resource('s3')
        s3_buckets = [bucket['Name'] for bucket in s3_client.list_buckets()['Buckets']]

        self.create_s3_bucket(s3_client, s3_buckets)

        try:
            bucket_contents = [obj['Key'] for obj in s3_client.list_objects(Bucket=self.settings.S3_BUCKET_NAME)['Contents']]
        except KeyError:
            bucket_contents = []

        hash_object = s3_resource.Object(self.settings.S3_BUCKET_NAME, s3_data_hash_location)

        if s3_data_hash_location in bucket_contents:

            s3_hash = hash_object.get()['Body'].read().decode('utf-8')

            if s3_hash == local_data_dependency_hash:
                print(colored(' \033[1m [+]', 'green') + colored(' Data exported to S3', 'white'))
                return

        # If the hash is different, or we don't have the files in S3, then upload the dataset dependencies to S3
        for file in self.dataset_class.files:
            s3_client.upload_file('%sraw/%s' % (data_dir, file), self.settings.S3_BUCKET_NAME, '%s%s' % (s3_data_bucket_dir, file), 
                Callback=S3ProgressPercentage('%sraw/%s' % (data_dir, file)))

        hash_object.put(Body=local_data_dependency_hash)

        print(colored('\n \033[1m [+]', 'green') + colored(' Data exported to S3', 'white'))
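The upload-skip decision above reduces to comparing one remote hash object against a locally computed hash. Here is a minimal sketch of that check in isolation; the function name is hypothetical and the bucket/key arguments are placeholders rather than project settings.

import boto3
import botocore.exceptions

def s3_hash_matches(bucket_name, hash_key, local_hash):
    """Return True if the hash stored at s3://bucket_name/hash_key equals local_hash."""
    s3_resource = boto3.resource('s3')
    try:
        body = s3_resource.Object(bucket_name, hash_key).get()['Body']
        return body.read().decode('utf-8') == local_hash
    except botocore.exceptions.ClientError:
        return False  # no hash object stored yet, so treat the remote copy as stale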
Example #5
    def extract_file_data(self):
        """
        This method extracts data from the files list and checks hashes against previous extractions.

        Returns
        --------
        void - extracts files and stores extract location
        """

        self.hash_location = '%s%s' % (self.data_dir, 'raw/hash')
        self.raw_data_path = '%s%s' % (self.data_dir, 'raw')
        self.extracted_data_path = '%s%s' % (self.raw_data_path, '/.extract')

        is_hash = os.path.isfile(self.hash_location)
        is_extract_folder = os.path.exists(self.extracted_data_path)
        is_data_folder = os.path.exists(self.data_dir)

        if not is_extract_folder:
            os.mkdir(self.extracted_data_path)

        file_hashes = self.get_data_dependency_hashes(
            is_extract_folder=is_extract_folder, is_hash=is_hash)
        final_hash = MantraHashed.get_256_hash_from_string(
            ''.join(file_hashes))

        # If there is no hash then we store the hash

        if not is_hash:
            with open(self.hash_location, 'w') as hash_file:
                hash_file.write(final_hash)

        # If there is no extract folder then we create one and copy the files over

        if not is_extract_folder:
            for file in self.files:
                file_path = '%s/%s' % (self.raw_data_path, file)
                shutil.copy(file_path, self.extracted_data_path)
            return

        with open(self.hash_location, 'r') as hash_file:
            old_hash = hash_file.read()

        # If the hash of dependency files hasn't changed, we are good to go; else we copy the new files over

        if old_hash == final_hash:
            return

        for file in self.files:
            file_path = '%s/%s' % (self.raw_data_path, file)
            shutil.copy(file_path, self.extracted_data_path)
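The method above implements a cache-by-hash pattern: compute a fingerprint, compare it to the one stored on disk, and only re-copy on a mismatch. Below is a minimal standalone sketch of that pattern, assuming plain hashlib in place of MantraHashed; the function name and paths are illustrative.

import hashlib
import os

def needs_refresh(hash_location, file_hashes):
    """True if the concatenated hash differs from the cached one (or no cache exists)."""
    final_hash = hashlib.sha256(''.join(file_hashes).encode('utf-8')).hexdigest()

    if not os.path.isfile(hash_location):
        with open(hash_location, 'w') as f:
            f.write(final_hash)  # first run: cache the hash
        return True

    with open(hash_location, 'r') as f:
        return f.read() != final_hash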
Example #6
def test_get_tree_contents():

    tree_path = '/home/ubuntu'
    dirs = ['folder1', 'folder2']
    files = ['file1', 'file2', 'file3']
    ref_table = {tree_path: {}}
    entries = {'folder1': 'hash1', 'folder2': 'hash2', 'folder3': 'hash3',
               'file1': 'hash4', 'file2': 'hash5', 'file3': 'hash6'}
    for name, entry_hash in entries.items():
        ref_table['%s/%s' % (tree_path, name)] = {'perm': 700, 'hash': entry_hash}

    tree_str, tree_hash = MantraHashed.get_tree_contents(
        tree_path, dirs, files, ref_table)

    tree_lines = tree_str.split('\n')
    assert (tree_lines[0] == '700 tree hash1 folder1 ')
    assert (tree_lines[1] == '700 tree hash2 folder2 ')
    assert (tree_lines[2] == '700 file hash4 file1 ')
    assert (tree_lines[3] == '700 file hash5 file2 ')
    assert (tree_lines[4] == '700 file hash6 file3 ')

    assert (tree_hash ==
            'b258eeaf5c932c3b57a0e1f955f11331df5b66f6a1dfb470686397f6c3726c4c')
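The assertions imply a git-like entry format of '<perm> <tree|file> <hash> <name> ' per line. The helpers below are a hedged sketch of that format and of the final hash assertion, which is plausibly SHA-256 over the serialised tree; the real get_tree_contents may differ in details.

import hashlib

def format_tree_entry(perm, entry_type, entry_hash, name):
    """One tree line in the '<perm> <tree|file> <hash> <name> ' format asserted above."""
    return '%s %s %s %s ' % (perm, entry_type, entry_hash, name)

def hash_tree_string(tree_str):
    """SHA-256 hex digest over the serialised tree string (an assumption)."""
    return hashlib.sha256(tree_str.encode('utf-8')).hexdigest()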
Example #7
    def get_data_dependency_hashes(self, is_extract_folder, is_hash):
        """
        This method obtains a list of hashes of the file dependencies (Dataset.files) specified in the dataset.

        Parameters
        --------

        is_extract_folder - bool
            Whether an .extract folder currently exists within the data project folder

        is_hash - bool
            Whether a concatenated hash file exists for the raw data dependencies

        Returns
        --------
        list of strs - containing the hashes of the files in Dataset.files, hash of tar if exists
        """

        file_hashes = []

        for file in sorted(self.files):

            file_path = '%s/%s' % (self.raw_data_path, file)

            if not os.path.isfile(file_path):
                raise IOError('The following file does not exist: %s' % file)

            file_hash = MantraHashed.get_256_hash_from_file(file_path)
            file_hashes.append(file_hash)

            if not is_extract_folder or not is_hash:
                if self.extract_file_dict[file]:
                    self.extract_tar_file(file_path)
                else:
                    shutil.copy(file_path, self.extracted_data_path)

        return file_hashes
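This method leans on two class attributes of the dataset: files, and extract_file_dict, which maps each file to whether it should be untarred. The class below is a hedged sketch of what a dataset might declare, inferred purely from the attribute names used above; the file names are invented.

class ExampleDataset:
    # raw data dependencies, hashed in sorted order for a stable fingerprint
    files = ['images.tar.gz', 'labels.csv']

    # True: extract the archive; False: copy the file into .extract as-is
    extract_file_dict = {'images.tar.gz': True, 'labels.csv': False}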
Example #8
def test_get_256_hash_from_string():
    string_to_hash = 'E pluribus unum'
    file_hash = MantraHashed.get_256_hash_from_string(string_to_hash)
    assert (isinstance(file_hash, str))
    assert (len(file_hash) == 64)
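The 64-character assertion is consistent with a SHA-256 hex digest, so get_256_hash_from_string is presumably a thin wrapper along these lines; the actual implementation is not shown here, so this is an assumption.

import hashlib

def get_256_hash_from_string_sketch(string_to_hash):
    """SHA-256 hex digest of a UTF-8 encoded string: always 64 hex characters."""
    return hashlib.sha256(string_to_hash.encode('utf-8')).hexdigest()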
Example #9
    def handle(self, args, unknown):
        if not Path("mantra.yml").exists():
            print("ERROR: Please run this command from your mantra project directory (i.e. the directory containing `mantra.yml`)")
            sys.exit(1)

        # collect the artefacts to upload
        if len(args.artefacts) == 0:
            # get all the datasets, models and results
            print("Uploading all datasets, models, tasks and results...")

            all_models = find_artefacts("", "models", "model.py")
            all_datasets = find_artefacts("", "data", "data.py")
            all_tasks = find_artefacts("", "tasks", "task.py")
            if Path("results").exists():
                all_results = [str(p) for p in Path("results").iterdir() if p.is_dir()]
            else:
                all_results = []
            all_artefacts = list(itertools.chain(all_models, all_datasets, all_tasks, all_results))

        else:
            all_artefacts = args.artefacts
            missing_artefacts = [a for a in all_artefacts if not Path(a).exists()]
            if len(missing_artefacts) > 0:
                print("ERROR: The following artefact(s) are missing: `%s`" % missing_artefacts)
                sys.exit(1)

        # TODO: Results will have dependencies, make sure those are taken into account

        # 1) Get the hashes for all the files and dependencies

        all_hashes = []
        for artefact_dir in all_artefacts:
            artefact_hash, file_hashes = MantraHashed.get_folder_hash(artefact_dir)
            all_hashes.append({
                "artefact_dir": artefact_dir,
                "artefact_hash": artefact_hash,
                "file_hashes": file_hashes,
            })

        # 2) Get the credentials

        # prompt for username and password
        mantrahub_user = input("Your mantrahub username: ")
        if mantrahub_user == "":
            print("ERROR: The username cannot be empty, quitting...")
            sys.exit(1)

        mantrahub_pass = getpass.getpass("Your mantrahub password: ")

        # 3) Get the artefacts diff from the remote

        full_url = urljoin(args.remote, "/api/artefacts_diff")
        json_payload = json.dumps({"all_hashes": all_hashes})
        diff_response = requests.post(full_url, json=json_payload, auth=(mantrahub_user, mantrahub_pass))

        diff = json.loads(diff_response.json())["diff_hashes"]

        if diff:
            upload_url_base = urljoin(args.remote, "api/upload_file/")
            for artefact in diff:
                for k, v in artefact["file_hashes"].items():
                    print("Uploading `%s`..." % v["path"])

                    h = {"Content-Disposition": "attachment; filename=%s" % v["path"]}
                    with open(v["path"], 'rb') as f:
                        r = requests.put(upload_url_base + v["path"], files={'file': f},
                                         headers=h, auth=(mantrahub_user, mantrahub_pass))
        else:
            print("No new files to upload...")

        # Finally, commit all the results
        commit_url = urljoin(args.remote, "api/artefacts_diff_commit")
        json_payload = json.dumps({"all_hashes": all_hashes, "diff_hashes": diff})
        commit_response = requests.post(commit_url, json=json_payload, auth=(mantrahub_user, mantrahub_pass))

        if commit_response.status_code != requests.codes.ok:
            print("ERROR: Commit not successful: %s" % commit_response.text)