def __run_all__(algorithm, data, exp, tuned, directory):
    """
    Objective: Build an MlModel and run the entire pipeline
    (featurize -> data_split -> reg -> run -> analyze).

    :param algorithm: learning algorithm name forwarded to models.MlModel
    :param data: dataset file name forwarded to models.MlModel
    :param exp: target (experimental) column name
    :param tuned: whether hyper-parameter tuning is enabled
    :param directory: if truthy, load the dataset from 'dataFiles/testdata'
                      instead of the current working directory
    :return: the fully-run MlModel instance
    """
    # Single source of truth for the model configuration; the if/else only
    # decides which working directory the dataset is loaded from (the
    # original duplicated the whole constructor call in both branches).
    model_kwargs = dict(algorithm=algorithm, dataset=data, target=exp,
                        tune=tuned, feat_meth=[0], cv=2, opt_iter=2)
    if directory:
        with misc.cd('dataFiles/testdata'):
            model1 = models.MlModel(**model_kwargs)
    else:
        model1 = models.MlModel(**model_kwargs)
    model1.featurize()
    model1.data_split(val=0.1)
    model1.reg()
    model1.run()
    model1.analyze()
    return model1
def __data_split_model__(algorithm, data, exp, tuned, directory):
    """
    Objective: Return MLModel class object after the featurization and data split process

    Intention: Most of the process starting from choosing algorithm in the `core` pipeline
    requires the data to be split. This fixture was built so that I don't have to initialize
    featurization and data splitting in the test functions.

    :param algorithm: learning algorithm name forwarded to models.MlModel
    :param data: dataset file name forwarded to models.MlModel
    :param exp: target (experimental) column name
    :param tuned: whether hyper-parameter tuning is enabled
    :param directory: Whether to get data from the test directory or not.
                      This was created in case I use data from somewhere else.
    :return: MLModel class object after the featurization and data split process
    """
    # Build the constructor arguments once; only the working directory differs
    # between the two branches (the original repeated the full call twice).
    model_kwargs = dict(algorithm=algorithm, dataset=data, target=exp,
                        tune=tuned, feat_meth=[0], cv=2, opt_iter=2)
    if directory:
        with misc.cd('dataFiles/testdata'):
            model1 = models.MlModel(**model_kwargs)
    else:
        model1 = models.MlModel(**model_kwargs)
    model1.featurize()
    model1.data_split(val=0.1)
    return model1
def __model_object__(algorithm, data, exp, tuned, directory):
    """
    Objective: Return MLModel class object with custom test inputs for experimentation

    Intention: I want to have access to a model object to test earlier functions in the
    `core` pipeline such as featurize and data_split.

    :param algorithm: learning algorithm name forwarded to models.MlModel
    :param data: dataset file name forwarded to models.MlModel
    :param exp: target (experimental) column name
    :param tuned: whether hyper-parameter tuning is enabled
    :param directory: Whether to get data from the test directory or not.
                      This was created in case I use data from somewhere else.
    :return: model class object with only initialized model instances (instances in __init__)
    """
    # One kwargs dict instead of two identical constructor calls; the branch
    # only controls which directory the dataset is read from.
    model_kwargs = dict(algorithm=algorithm, dataset=data, target=exp,
                        tune=tuned, feat_meth=[0], cv=2, opt_iter=2)
    if directory:
        with misc.cd('dataFiles/testdata'):
            model1 = models.MlModel(**model_kwargs)
    else:
        model1 = models.MlModel(**model_kwargs)
    return model1
def insert_dataset_feat_mysql(self, dataset, feat_id):
    """
    Insert a dataset into MySQL, either raw (feat_id maps to None) or featurized.

    :param dataset: dataset file name; must be listed in self.rds or self.cds
    :param feat_id: index into self.feat_sets selecting the featurization method
                    (a None entry means "store the raw, parseable data only")
    :raises Exception: if the dataset is in neither self.rds nor self.cds
    :return: None — writes a table into the database behind self.conn
    """
    feat_name = self.feat_sets[feat_id]
    # Digest data and smiles_series from the repository's dataFiles directory
    with cd(str(Path(__file__).parent.parent.parent.absolute()) + '/dataFiles/'):
        if dataset in self.rds:
            self.target_name = self.rds[dataset]
            self.data, smiles_series = ingest.load_smiles(self, dataset)
        elif dataset in self.cds:
            self.data, smiles_series = ingest.load_smiles(self, dataset, drop=False)
        else:
            raise Exception(f"Dataset {dataset} not found in rds or cds. Please list in baddies or add")

    if feat_name is None:
        # Insert just the raw dataset that can be featurized: drop rows whose
        # SMILES string does not parse into an RDKit Mol object.
        issue_row_list = [row for row, smiles in enumerate(self.data['smiles'])
                          if MolFromSmiles(smiles) is None]
        # Bug fix: index with the flat list directly. The original wrote
        # self.data.index[[issue_row_list]] — a nested list, which pandas
        # treats as deprecated multi-dimensional indexing.
        self.data.drop(self.data.index[issue_row_list], inplace=True)
        # Send data to MySql
        self.data.to_sql(f'{dataset}', self.conn, if_exists='fail')
    else:
        # Otherwise featurize data like normal before inserting
        self.feat_meth = [feat_id]
        self.featurize(not_silent=False)
        self.data = compress_fingerprint(self.data)
        self.data.to_sql(f'{dataset}_{feat_name}', self.conn, if_exists='fail')
def represent_dataset():
    """
    Get string representation for all current dataset in dataFiles.

    This representation follows the "Letters and Numbers" rule explained in
    naming_schemes_v2.pptx in the naming branch.

    :return: Dictionary of dataset file name as key and its short string
             representation as value
    """
    # Access the folder holding all datasets (path is relative to this file).
    with cd(str(pathlib.Path(__file__).parent.parent.absolute()) + '/dataFiles/'):
        for roots, dirs, files in os.walk(os.getcwd()):
            # NOTE(review): data_dict is re-created on every os.walk iteration
            # and the function returns inside this loop, so effectively only
            # the first directory visited (dataFiles itself) is processed —
            # confirm this is the intended behavior.
            data_dict = {}  # Dictionary of dataset and their first character(s)
            for dataset in files:  # Loop through list of files
                if not dataset[0].isdigit():  # If first character is not a number
                    data_dict[dataset] = dataset[0]  # Dataset as key, first character as value
                else:  # If first character is a number
                    newstring = ''  # Accumulates leading digits plus the first letter
                    for letter in dataset:  # Loop through every character in the name
                        if letter.isdigit():  # If the character is a digit
                            newstring += letter  # Add the digit to the string
                        else:  # First non-digit character reached
                            newstring += letter  # Add the letter to the string
                            data_dict[dataset] = newstring
                            break  # Stop the sequence at the first letter
            compare = []  # First characters seen so far (datasets with unique first character)
            repeat = []  # Datasets whose first character matched an earlier one
            duplicate_dict = {}  # Maps a clashing character to the list of clashing datasets
            for key in data_dict:  # For every dataset in the dictionary
                for value in data_dict[key]:  # For each character of its representation
                    if value not in compare:  # Character not seen before
                        compare.append(value)
                    else:
                        # NOTE(review): every duplicate_dict entry stores a
                        # reference to the SAME `repeat` list, so all keys see
                        # all repeated datasets — verify this aliasing is intended.
                        repeat.append(key)
                        duplicate_dict[value] = repeat
            unique_list = []
            for key in duplicate_dict:  # For every clashing first character
                count = 1  # How many trailing characters to append for disambiguation
                unique = {}  # Final mapping of dataset -> unique string for this group
                for duplicate in duplicate_dict[key]:  # For each dataset in the clash group
                    data_dict.pop(duplicate, None)  # Remove the non-unique entry
                    # Combine the first character with the last `count` characters,
                    # after stripping the 4-character file extension (e.g. '.csv').
                    dataset_string = ''.join([duplicate[:-4][0], duplicate[:-4][-count:]])
                    if dataset_string not in unique.values():  # New string is unique so far
                        unique[duplicate] = dataset_string
                    else:  # The string still clashes
                        count += 1  # Take one more trailing character
                        dataset_string = ''.join([duplicate[:-4][0], duplicate[:-4][-count:]])
                        unique[duplicate] = dataset_string
                        # NOTE(review): this break abandons any remaining
                        # datasets in the group after the first re-clash —
                        # confirm that is acceptable.
                        break
                unique_list.append(unique)
            for dictionary in unique_list:  # Merge the disambiguated names back in
                data_dict.update(dictionary)
            return data_dict
def test_cd(mock_cd): """ :param mock_cd: mocked object. In this case, we will mock the module cd As a reminder, mocking is used to test the functionality of the modules imported in our script. Instead of testing the script of its result, we want to make sure the main components of the script is working perfectly and those are usually the main python modules that the script uses. In this case, we want to make sure that the module "os" is doing what its supposed to do and that is to change the working directory """ # change working directory to os.chdir(ROOT_DIR) # move to dataFiles with misc.cd('dataFiles'): # Test to see if getcwd gets called mock_cd.getcwd.assert_called_once() # Test to see if os.path.expanduser gets called with a string(Folder name) mock_cd.path.expanduser.assert_called_with('dataFiles') # Test to see if os.chdir gets called mock_cd.chdir.assert_called_once()