def test_invalid_dataset_path():
    """
    Test that count_datapoints rejects an invalid dataset path with the
    expected assertion message.
    """
    dataset_path = "bad/dataset/path"
    try:
        count_datapoints(dataset_path)
    except AssertionError as e:
        assert str(e) == "Dataset path is invalid."
    else:
        # Kept outside the try body: inside it, this sentinel would be caught
        # by the handler above and reported as a confusing message mismatch.
        assert False, "Assertion for dataset path should have failed."
Beispiel #2
0
    def _split_data(self, job):
        """
        Shuffle the job's raw CSV dataset and split it into train/test files.

        Steps:
            1. Load the single '.csv' file found in job.raw_filepath.
            2. Make sure a 'transformed' folder exists inside job.raw_filepath.
            3. Create a uniquely named (UUID4) session folder inside it.
            4. Shuffle the rows and cut them at job.hyperparams['split'],
               the fraction of rows that goes to the training set.
            5. Write 'train.csv' and 'test.csv' into the session folder.
            6. Count the datapoints in the session folder.

        Args:
            job: job whose raw_filepath is set and whose hyperparams contain
                a 'split' entry.

        Returns:
            A DMLResult with status 'successful' whose results hold
            'session_filepath' and 'datapoint_count'.

        Raises:
            AssertionError: if job.raw_filepath is not set, or the folder does
                not contain exactly one '.csv' file.
        """

        # 1. Load the raw data; exactly one CSV file is supported per folder.
        assert job.raw_filepath, \
            "Raw data filepath has not been set!"
        files = [name for name in os.listdir(job.raw_filepath)
                 if name.endswith('.csv')]
        assert len(files) == 1, \
            "Only supporting one file per dataset folder!"
        data_filepath = os.path.join(job.raw_filepath, files[0])
        raw_data = pd.read_csv(data_filepath)

        # 2. Create the transformed folder if it doesn't exist. exist_ok avoids
        #    the check-then-create race of a separate isdir() test.
        transformed_filepath = os.path.join(job.raw_filepath, "transformed")
        os.makedirs(transformed_filepath, exist_ok=True)

        # 3. Create the session folder, named with a random UUID4. A name
        #    collision is not expected, so this call is left strict.
        session_filepath = os.path.join(transformed_filepath, str(uuid.uuid4()))
        os.makedirs(session_filepath)

        # 4. Shuffle the rows (sample(frac=1) returns all of them in random
        #    order), then cut at the requested training fraction.
        split = job.hyperparams['split']
        raw_data = raw_data.sample(frac=1)
        split_index = int(len(raw_data) * split)
        train = raw_data.iloc[:split_index]
        test = raw_data.iloc[split_index:]

        # 5. Write train.csv and test.csv into the session folder.
        train.to_csv(os.path.join(session_filepath, 'train.csv'), index=False)
        test.to_csv(os.path.join(session_filepath, 'test.csv'), index=False)

        # 6. Get the datapoint count to be used in future jobs.
        datapoint_count = count_datapoints(session_filepath)

        # Return the job together with the session folder and its count.
        results = DMLResult(status='successful',
                            job=job,
                            results={
                                'session_filepath': session_filepath,
                                'datapoint_count': datapoint_count
                            },
                            error_message="")
        return results
def test_invalid_batch_size(dataset_path, train_dataset_path, test_dataset_path):
    """
    Test that iterating with an invalid (negative) batch size fails with the
    expected assertion message.
    """
    # The return value is not needed here; the call itself is kept.
    count_datapoints(dataset_path)

    # Set up iterator for training set.
    random_train_iterator = create_random_train_dataset_iterator(
            train_dataset_path,
            batch_size=-1,
            labeler='label',
            infinite=False
        )

    # Set up iterator for test set. It is constructed with the same invalid
    # batch size but is not iterated by this test.
    random_test_iterator = create_random_test_dataset_iterator(
            test_dataset_path,
            batch_size=-1,
            labeler='label',
            infinite=False
        )

    # The assertion is expected to fire while consuming the training iterator.
    try:
        train_data = []
        for X, y in random_train_iterator:
            for datapoint in X:
                train_data.append(datapoint[-1])
    except AssertionError as e:
        assert str(e) == "Invalid batch size provided."
    else:
        # Kept outside the try body: inside it, this sentinel would be caught
        # by the handler above and reported as a confusing message mismatch.
        assert False, "Assertion for batch size should have failed"
def test_invalid_labeler(dataset_path, train_dataset_path, test_dataset_path):
    """
    Test that iterating with a labeler naming a non-existent column fails
    with the expected assertion message.
    """
    # The return value is not needed here; the call itself is kept.
    count_datapoints(dataset_path)

    # Set up iterator for training set.
    random_train_iterator = create_random_train_dataset_iterator(
            train_dataset_path,
            batch_size=4,
            labeler='bad_column',
            infinite=True
        )

    # Set up iterator for test set. It is constructed with the same invalid
    # labeler but is not iterated by this test.
    random_test_iterator = create_random_test_dataset_iterator(
            test_dataset_path,
            batch_size=4,
            labeler='bad_column',
            infinite=True
        )

    # The assertion is expected to fire while consuming the training iterator.
    # NOTE(review): with infinite=True this loop only terminates if the
    # assertion fires -- presumably on the first batch; confirm.
    try:
        train_data = []
        for X, y in random_train_iterator:
            for datapoint in X:
                train_data.append(datapoint[-1])
    except AssertionError as e:
        assert str(e) == "Labeler is invalid."
    else:
        # Kept outside the try body: inside it, this sentinel would be caught
        # by the handler above and reported as a confusing message mismatch.
        assert False, "Assertion for labeler should have failed."
def test_labeler_out_of_bounds(dataset_path):
    """
    Test that iterating with a labeler index beyond the number of columns
    fails with the expected assertion message.
    """
    count = count_datapoints(dataset_path)

    # Twice the column count is used as the out-of-range labeler index.
    out_of_bounds_labeler = get_num_columns(dataset_path) * 2

    # Set up iterator for training set.
    train_iterator = create_sequential_train_dataset_iterator(
            dataset_path,
            count,
            batch_size=4,
            labeler=out_of_bounds_labeler,
            infinite=True
        )

    # Set up iterator for test set. It is constructed with the same invalid
    # labeler but is not iterated by this test.
    test_iterator = create_sequential_test_dataset_iterator(
            dataset_path,
            count,
            batch_size=4,
            labeler=out_of_bounds_labeler,
            infinite=True
        )

    # The assertion is expected to fire while consuming the training iterator.
    # NOTE(review): with infinite=True this loop only terminates if the
    # assertion fires -- presumably on the first batch; confirm.
    try:
        train_data = []
        for X, y in train_iterator:
            for datapoint in X:
                train_data.append(datapoint[-1])
    except AssertionError as e:
        assert str(e) == "Labeler is out of bounds."
    else:
        # Kept outside the try body: inside it, this sentinel would be caught
        # by the handler above and reported as a confusing message mismatch.
        assert False, "Assertion for labeler should have failed."
def test_infinite_works(dataset_path):
    """
    Check that iterators created with infinite=True keep yielding data.

    Each iterator is consumed batch by batch until its share (80% for train,
    20% for test) of 3 * count datapoints has been collected. The collected
    sizes must match those shares exactly, and no collected value may appear
    in both the training and the test collection.
    """
    count = count_datapoints(dataset_path)

    # Training-side iterator.
    train_iterator = create_sequential_train_dataset_iterator(
        dataset_path, count, batch_size=4, labeler=0, infinite=True
    )

    # Test-side iterator.
    test_iterator = create_sequential_test_dataset_iterator(
        dataset_path, count, batch_size=4, labeler=0, infinite=True
    )

    # Total number of datapoints to draw across both iterators.
    limit = 3 * count

    # Only the last element of every datapoint is kept; it is used as the
    # datapoint's index so overlaps can be detected afterwards.
    train_data = []
    for features, _labels in train_iterator:
        train_data.extend(point[-1] for point in features)
        if len(train_data) >= limit * 0.8:
            break

    test_data = []
    for features, _labels in test_iterator:
        test_data.extend(point[-1] for point in features)
        if len(test_data) >= limit * 0.2:
            break

    # Each collection must hold exactly its share of the datapoints.
    assert len(train_data) == 0.8*count*3
    assert len(test_data) == 0.2*count*3

    # An inner join on the collected indices must come back empty.
    train_df = pd.DataFrame(data={"index": train_data})
    test_df = pd.DataFrame(data={"index": test_data})
    overlap = pd.merge(train_df, test_df, on='index')
    assert len(overlap.index) == 0
def test_train_test_split(dataset_path):
    """
    Verify the sequential train-test split:
        1. The training and test iterators yield 80% and 20% of all
           datapoints respectively.
        2. No datapoint shows up in both sets.
    """
    count = count_datapoints(dataset_path)

    # Training-side iterator (single pass).
    train_iterator = create_sequential_train_dataset_iterator(
        dataset_path, count, batch_size=7, labeler=0, infinite=False
    )

    # Test-side iterator (single pass).
    test_iterator = create_sequential_test_dataset_iterator(
        dataset_path, count, batch_size=7, labeler=0, infinite=False
    )

    # Only the last element of every datapoint is kept; it is used as the
    # datapoint's index so overlaps can be detected afterwards.
    train_data = [
        point[-1]
        for features, _labels in train_iterator
        for point in features
    ]
    test_data = [
        point[-1]
        for features, _labels in test_iterator
        for point in features
    ]

    # Each set must hold exactly its share of the datapoints.
    assert len(train_data) == 0.8*count
    assert len(test_data) == 0.2*count

    # An inner join on the collected indices must come back empty.
    train_df = pd.DataFrame(data={"index": train_data})
    test_df = pd.DataFrame(data={"index": test_data})
    overlap = pd.merge(train_df, test_df, on='index')
    assert len(overlap.index) == 0
def test_large_batch_size(dataset_path):
    """
    Verify the sequential train-test split when the batch size is a random
    multiple (1x to 5x) of the total datapoint count: the 80%/20% sizes must
    still hold and the two sets must not share any datapoint.
    """
    count = count_datapoints(dataset_path)
    batch_size = count * random.randint(1, 5)

    # Training-side iterator (single pass).
    train_iterator = create_sequential_train_dataset_iterator(
        dataset_path, count, batch_size=batch_size, labeler=0, infinite=False
    )

    # Test-side iterator (single pass).
    test_iterator = create_sequential_test_dataset_iterator(
        dataset_path, count, batch_size=batch_size, labeler=0, infinite=False
    )

    # Only the last element of every datapoint is kept; it is used as the
    # datapoint's index so overlaps can be detected afterwards.
    train_data = [
        point[-1]
        for features, _labels in train_iterator
        for point in features
    ]
    test_data = [
        point[-1]
        for features, _labels in test_iterator
        for point in features
    ]

    # Each set must hold exactly its share of the datapoints.
    assert len(train_data) == 0.8*count
    assert len(test_data) == 0.2*count

    # An inner join on the collected indices must come back empty.
    train_df = pd.DataFrame(data={"index": train_data})
    test_df = pd.DataFrame(data={"index": test_data})
    overlap = pd.merge(train_df, test_df, on='index')
    assert len(overlap.index) == 0
Beispiel #9
0
    def _train(self, job):
        """
        Train the job's model on the local training data and compute the
        weighting factor omega.

        The averaging type (job.hyperparams['averaging_type'], default
        'data_size') decides how omega is obtained:
            - 'data_size': omega is the datapoint count for 'train.csv'.
            - 'val_acc': omega is the 'acc' entry of the validation metric
              produced by self._validate on the test iterator.

        NOTE: For 'val_acc', the same batch size is used for the training
        and the validation iterator.

        Args:
            job: job providing hyperparams (must contain 'batch_size'),
                framework_type (only 'keras' is supported) and model.

        Returns:
            A DMLResult with status 'successful' whose results hold 'omega'
            and 'gradients' (the second value returned by
            train_keras_model).

        Raises:
            AssertionError: if the averaging type or the framework type is
                not supported.
        """

        train_dataset_path, test_dataset_path = self._set_up()
        data_count_mappings = count_datapoints(self._dataset_path)

        # Get the right dataset iterator based on the averaging type.
        avg_type = job.hyperparams.get('averaging_type', 'data_size')
        batch_size = job.hyperparams['batch_size']
        assert avg_type in ['data_size', 'val_acc'], \
            "Averaging type '{0}' is not supported.".format(avg_type)
        self.logger.info("Training model...")
        if avg_type == 'data_size':
            dataset_iterator = create_random_train_dataset_iterator(
                train_dataset_path,
                batch_size=batch_size,
                infinite=False,
                num_epochs=job.hyperparams.get('epochs'))
        elif avg_type == 'val_acc':
            dataset_iterator = create_random_train_dataset_iterator(
                train_dataset_path,
                batch_size=batch_size,
            )
            test_dataset_iterator = create_random_test_dataset_iterator(
                test_dataset_path,
                batch_size=batch_size,
            )

        # Train the model the right way based on the model type.
        assert job.framework_type in ['keras'], \
            "Model type '{0}' is not supported.".format(job.framework_type)

        if job.framework_type == 'keras':
            trained_model, result_val = train_keras_model(
                job.model,
                dataset_iterator,
                data_count_mappings['train.csv'],
                job.hyperparams,
                self.config,
            )

        # Get the right omega based on the averaging type.
        if avg_type == 'data_size':
            omega = data_count_mappings['train.csv']
        elif avg_type == 'val_acc':
            val_stats = self._validate(
                job,
                custom_iterator=test_dataset_iterator).results['val_stats']
            omega = val_stats['val_metric']['acc']
            # NOTE(review): the validation stats are only used to derive
            # omega; they are not part of the returned results. Confirm
            # whether callers expect them to be surfaced.

        # Persist the trained model at a fixed path. This is a side effect
        # only: the result payload below does not include the model itself.
        trained_model.save("sessions/my_model.h5")

        train_results = {
            'omega': omega,
            'gradients': result_val,
        }

        # Return the results.
        results = DMLResult(
            status='successful',
            job=job,
            results=train_results,
            error_message="",
        )
        return results