Example #1
def __run_all__(algorithm, data, exp, tuned, directory):
    """
    Objective: Run all
    :param algorithm:
    :param data:
    :param exp:
    :param tuned:
    :param directory:
    :return:
    """
    if directory:
        with misc.cd('dataFiles/testdata'):
            model1 = models.MlModel(algorithm=algorithm,
                                    dataset=data,
                                    target=exp,
                                    tune=tuned,
                                    feat_meth=[0],
                                    cv=2,
                                    opt_iter=2)
    else:
        model1 = models.MlModel(algorithm=algorithm,
                                dataset=data,
                                target=exp,
                                tune=tuned,
                                feat_meth=[0],
                                cv=2,
                                opt_iter=2)
    model1.featurize()
    model1.data_split(val=0.1)
    model1.reg()
    model1.run()
    model1.analyze()
    return model1
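The helpers in these examples rely on `misc.cd` to step into the test-data folder and back out afterwards. A minimal sketch of what such a context manager usually looks like, assuming the standard `contextlib` pattern (the real implementation in `misc` may differ):

import os
from contextlib import contextmanager

@contextmanager
def cd(new_dir):
    """Temporarily change the working directory, restoring the old one on exit."""
    previous_dir = os.getcwd()  # Remember where we started
    os.chdir(os.path.expanduser(new_dir))  # Step into the requested folder
    try:
        yield
    finally:
        os.chdir(previous_dir)  # Always step back, even if the body raised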
Example #2
def __data_split_model__(algorithm, data, exp, tuned, directory):
    """
    Objective: Return MLModel class object after the featurization and data split process
    Intention: Most of the process starting from choosing algorithm in the `core` pipeline requires the data to be split
                This fixture was built so that I don't have to initialize featurization and data splitting in the
                test functions
    :param algorithm:
    :param data:
    :param exp:
    :param tuned:
    :param directory: Whether to get data from the test directory or not. This was created in case I use data from
                        somewhere else.
    :return: MLModel class object after the featurization and data split process
    """
    if directory:
        with misc.cd('dataFiles/testdata'):
            model1 = models.MlModel(algorithm=algorithm,
                                    dataset=data,
                                    target=exp,
                                    tune=tuned,
                                    feat_meth=[0],
                                    cv=2,
                                    opt_iter=2)
    else:
        model1 = models.MlModel(algorithm=algorithm,
                                dataset=data,
                                target=exp,
                                tune=tuned,
                                feat_meth=[0],
                                cv=2,
                                opt_iter=2)
    model1.featurize()
    model1.data_split(val=0.1)

    return model1
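A sketch of how a test might consume this fixture to exercise only the later pipeline steps; the algorithm name, file name, and target column below are illustrative placeholders, not values taken from the project:

def test_later_pipeline_steps():
    # Hypothetical inputs; real tests would parametrize these
    model = __data_split_model__(algorithm='rf', data='example.csv',
                                 exp='target', tuned=False, directory=True)
    model.reg()  # Featurization and splitting were already done by the fixture
    model.run()
    model.analyze()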
Example #3
def __model_object__(algorithm, data, exp, tuned, directory):
    """
    Objective: return MLModel class object with custom test inputs for experimentation
    Intention: I want to have access to a model object to test for earlier functions in the `core` pipeline such as
                featurize, data_split
    :param algorithm:
    :param data:
    :param exp:
    :param tuned:
    :param directory: Whether to get data from the test directory or not. This was created in case I use data from
                        somewhere else.
    :return: model class object with only initialized model instances (instances in __init__)
    """
    if directory:
        with misc.cd('dataFiles/testdata'):
            model1 = models.MlModel(algorithm=algorithm,
                                    dataset=data,
                                    target=exp,
                                    tune=tuned,
                                    feat_meth=[0],
                                    cv=2,
                                    opt_iter=2)
    else:
        model1 = models.MlModel(algorithm=algorithm,
                                dataset=data,
                                target=exp,
                                tune=tuned,
                                feat_meth=[0],
                                cv=2,
                                opt_iter=2)

    return model1
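Conversely, this object lets a test drive the early steps by hand. A minimal sketch with the same placeholder inputs (checking `.data` is an assumption based on Example #4, where the class keeps its frame in `self.data`):

def test_featurize_step():
    model = __model_object__(algorithm='rf', data='example.csv',
                             exp='target', tuned=False, directory=True)
    model.featurize()  # Exercise just the featurization step
    assert model.data is not None  # The featurized frame is assumed to live in .data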
Example #4
    def insert_dataset_feat_mysql(self, dataset, feat_id):
        """Featurize the given dataset with the feature set feat_id points to and insert it into MySQL.
        If that feature set is None, insert the raw dataset (minus unparseable SMILES) instead."""

        feat_name = self.feat_sets[feat_id]

        # Digest data and smiles_series
        with cd(str(Path(__file__).parent.parent.parent.absolute()) + '/dataFiles/'):
            if dataset in list(self.rds.keys()):
                self.target_name = self.rds[dataset]
                self.data, smiles_series = ingest.load_smiles(self, dataset)
            elif dataset in self.cds:
                self.data, smiles_series = ingest.load_smiles(self, dataset, drop=False)
            else:
                raise Exception(f"Dataset {dataset} not found in rds or cds. Please add it, or list it in baddies")

        # Insert just the raw dataset that can be featurized (drop smiles that return None Mol objects)
        if feat_name is None:

            # Drop misbehaving rows (SMILES that RDKit cannot parse into Mol objects)
            issue_row_list = []
            for issue_row, smiles in enumerate(self.data['smiles']):
                if MolFromSmiles(smiles) is None:
                    issue_row_list.append(issue_row)
            self.data.drop(self.data.index[issue_row_list], inplace=True)

            # Send data to MySql
            self.data.to_sql(f'{dataset}', self.conn, if_exists='fail')

        # Otherwise featurize data like normal
        else:
            self.feat_meth = [feat_id]
            self.featurize(not_silent=False)
            self.data = compress_fingerprint(self.data)
            self.data.to_sql(f'{dataset}_{feat_name}', self.conn, if_exists='fail')
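The row-dropping step above hinges on RDKit returning None for any SMILES it cannot parse. A self-contained sketch of the same filter on a plain DataFrame (the helper name is made up for illustration):

import pandas as pd
from rdkit.Chem import MolFromSmiles

def drop_bad_smiles(df, column='smiles'):
    """Drop rows whose SMILES string RDKit cannot parse into a Mol object."""
    bad_rows = [i for i, smi in enumerate(df[column]) if MolFromSmiles(smi) is None]
    return df.drop(df.index[bad_rows])

df = pd.DataFrame({'smiles': ['CCO', 'not-a-smiles', 'c1ccccc1']})
print(drop_bad_smiles(df))  # Keeps ethanol and benzene, drops the unparseable row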
Example #5
def represent_dataset():
    """
    Get string representation for all current dataset in dataFiles
    This representation follows the "Letters and Numbers" rule explained in naming_schemes_v2.pptx in the naming branch
    :return: Dictionary of dataset as key and their representation as value
    """
    with cd(str(pathlib.Path(__file__).parent.parent.absolute()) + '/dataFiles/'):  # Access folder with all datasets
        data_dict = {}  # Dictionary of datasets and their first character(s)
        for roots, dirs, files in os.walk(os.getcwd()):
            for dataset in files:  # Loop through the list of files
                if not dataset[0].isdigit():  # If the first character is not a number
                    data_dict[dataset] = dataset[0]  # Key is the dataset, its first character is the value
                else:  # If the name starts with a number
                    newstring = ''  # Accumulate the leading digits here
                    for letter in dataset:  # Loop through every character in the name
                        newstring += letter  # Add the character to the string
                        if not letter.isdigit():  # Stop once the first non-digit letter has been included
                            data_dict[dataset] = newstring
                            break  # Stop at the first letter
    compare = []  # Representations that have already been claimed by a dataset
    duplicate_dict = {}  # Dictionary of clashing representations and the datasets that share them
    for key in data_dict:  # For each dataset in the dictionary
        value = data_dict[key]  # Its current representation
        if value not in compare:  # If the representation has not been claimed yet
            compare.append(value)  # Claim it
        else:
            duplicate_dict.setdefault(value, []).append(key)  # Record the dataset under the clashing representation
    unique_list = []
    for key in duplicate_dict:  # For every key in duplicate dictionary
        count = 1  # Counter starts at 1
        unique = {}  # Final dictionary with all unique string representation for their respective dataset
        for duplicate in duplicate_dict[key]:  # For each dataset that clashes on this representation
            data_dict.pop(duplicate, None)  # Remove the clashing dataset; it is re-added below with a unique string
            dataset_string = ''.join([duplicate[:-4][0], duplicate[:-4][-count:]])  # First + last character; [:-4] strips the extension
            if dataset_string not in unique.values():  # Check to see if the newly created string has duplicate
                unique[duplicate] = dataset_string   # Key as the dataset and the newly created string as value
            else:  # If the string still has duplicate
                count += 1  # Increase counter by 1
                dataset_string = ''.join([duplicate[:-4][0], duplicate[:-4][-count:]])  # First, last and second to last
                unique[duplicate] = dataset_string  # Key as the dataset and the newly created string as value
                break  # Break the loop
        unique_list.append(unique)  # Get all dictionary for a situation that has multiple matching first character
    for dictionary in unique_list:  # Loop through all dictionaries
        data_dict.update(dictionary)  # Update the original dictionary
    return data_dict
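A worked example of the rule implemented above, on three hypothetical file names (the [:-4] slices strip the .csv extension):

#   logp.csv      -> 'l'    (plain first character; first to claim it)
#   lipo.csv      -> 'lo'   (also starts with 'l', so it gets the first + last character of 'lipo')
#   18k-logP.csv  -> '18k'  (leading digits are kept up to and including the first letter)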
Example #6
def test_cd(mock_cd):
    """
    :param mock_cd: mocked object. In this case, we will mock the module cd
    As a reminder, mocking is used to test the functionality of the modules imported in our script. Instead of testing
    the script of its result, we want to make sure the main components of the script is working perfectly and those are
    usually the main python modules that the script uses.
    In this case, we want to make sure that the module "os" is doing what its supposed to do and that is to change the
    working directory
    """
    # Change the working directory to the project root
    os.chdir(ROOT_DIR)
    # move to dataFiles
    with misc.cd('dataFiles'):
        # Test to see if getcwd gets called
        mock_cd.getcwd.assert_called_once()
        # Test to see if os.path.expanduser gets called with a string(Folder name)
        mock_cd.path.expanduser.assert_called_with('dataFiles')
        # Test to see if os.chdir gets called
        mock_cd.chdir.assert_called_once()
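For reference, a sketch of how mock_cd could be wired up, assuming the test uses unittest.mock.patch; the patch target 'misc.os' is an assumption about the project layout:

from unittest.mock import patch

import misc  # Import path for the module under test is assumed

@patch('misc.os')  # Inject a mock in place of the os module that misc.cd calls into
def test_cd(mock_cd):
    ...  # The body above then asserts against mock_cd.getcwd, mock_cd.path, mock_cd.chdir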