def remove_outliers(df, leaf_size=40, num_processes=1):
    '''Unsupervised outlier detection using local outlier factor

    Args:
        df (ecnet.utils.data_utils.DataFrame): loaded data
        leaf_size (int): used by nearest-neighbor algorithm as the number of
            points at which to switch to brute force
        num_processes (int): number of parallel jobs for LOF algorithm

    Returns:
        ecnet.utils.data_utils.DataFrame: data w/o outliers
    '''

    # Mirror the main logger's configuration on the ditto_lib logger
    ditto_logger.stream_level = logger.stream_level
    if logger.file_level != 'disable':
        ditto_logger.log_dir = logger.log_dir
        ditto_logger.file_level = logger.file_level
    ditto_logger.default_call_loc('OUTLIERS')
    item_collection = ItemCollection(df._filename)
    for inp_name in df.input_names:
        item_collection.add_attribute(Attribute(inp_name))
    for pt in df.data_points:
        # deepcopy so ditto cannot mutate the DataFrame's own input lists
        item_collection.add_item(pt.id, deepcopy(pt.inputs))
    item_collection.strip()
    outliers = local_outlier_factor(item_collection.dataframe,
                                    leaf_size=leaf_size,
                                    n_jobs=num_processes)
    logger.log('debug', 'Outliers: {}'.format(outliers), call_loc='OUTLIERS')
    # O(n) set-based filter instead of the previous O(n*m) nested
    # search-and-delete loop; slice-assign so existing references to
    # df.data_points remain valid
    outlier_ids = set(outliers)
    df.data_points[:] = [pt for pt in df.data_points
                         if pt.id not in outlier_ids]
    return df
Beispiel #2
0
    def errors(self,
               *args,
               dset: str = None,
               model_filename: str = 'model.h5') -> dict:
        '''Computes the requested error metrics for a data set

        Args:
            *args (str): one or more error functions; `rmse`, `mean_abs_error`,
                `med_abs_error`, `r2`
            dset (str): set to obtain errors for; `learn`, `valid`, `train`,
                `test`, None (all sets)
            model_filename (str): if specified, uses .h5 model file for error
                calculations

        Returns:
            dict: {'error_fn', value ...} with supplied errors
        '''

        for err_fn in args:
            logger.log('debug',
                       'Calculating {} for {} set'.format(err_fn, dset),
                       call_loc='ERRORS')
        predictions = self.use(dset, model_filename=model_filename)
        targets = get_y(self._sets, dset)
        computed = {err_fn: get_error(predictions, targets, err_fn)
                    for err_fn in args}
        logger.log('debug', 'Errors: {}'.format(computed), call_loc='ERRORS')
        return computed
Beispiel #3
0
    def use(self,
            dset: str = None,
            output_filename: str = None,
            model_filename: str = 'model.h5') -> list:
        '''Predicts values for the given set with the trained network(s);
        a single NN when no project exists, otherwise the best candidate
        from each pool

        Args:
            dset (str): set to predict for; `learn`, `valid`, `train`, `test`,
                None (all sets)
            output_filename (str): if supplied, saves results to this CSV file
            model_filename (str): if supplied, use specified .h5 model file

        Returns:
            list: list of results for specified set
        '''

        if self._prj_name is not None:
            # Project mode: aggregate the chosen candidates from each pool
            results = use_project(self._prj_name, self._num_pools, dset,
                                  self._sets)
        else:
            # No project: predict with the single saved model
            results = use_model(self._sets, dset, model_filename)
        if output_filename is not None:
            save_results(results, dset, self._df, output_filename)
            logger.log('info',
                       'Results saved to {}'.format(output_filename),
                       call_loc='USE')
        return results
Beispiel #4
0
    def limit_inputs(self,
                     limit_num: int,
                     num_estimators: int = None,
                     eval_set: str = 'learn',
                     output_filename: str = None,
                     **kwargs) -> list:
        '''Reduces the loaded data to the `limit_num` most influential input
        parameters, ranked by random forest regression

        Args:
            limit_num (int): desired number of inputs
            num_estimators (int): number of trees in the RFR algorithm;
                defaults to the total number of inputs
            output_filename (str): if not None, new limited database is saved
                here
            eval_set (str): set to perform RFR on (`learn`, `valid`, `train`,
                `test`, None (all)) (default: `learn`)
            **kwargs: any argument accepted by
                sklearn.ensemble.RandomForestRegressor

        Returns:
            list: [(feature, importance), ..., (feature, importance)]
        '''

        selection = limit_rforest(self._df, limit_num, num_estimators,
                                  self._num_processes, eval_set, **kwargs)
        # Keep only the selected feature columns, then re-package the sets
        self._df.set_inputs([feature for feature, _ in selection])
        self._sets = self._df.package_sets()
        if output_filename is not None:
            self._df.save(output_filename)
            logger.log(
                'info',
                'Resulting database saved to {}'.format(output_filename),
                call_loc='LIMIT')
        return selection
def stream_logging(s_level):
    '''Sets the logger's stream level, then emits one message at each
    severity to demonstrate which levels pass the filter'''

    logger.stream_level = s_level
    for level, message in (('debug', 'Debug message'),
                           ('info', 'Info message'),
                           ('warn', 'Warning message'),
                           ('error', 'Error message'),
                           ('crit', 'Critical message')):
        logger.log(level, message)
def file_logging(f_level, log_dir='logs'):
    '''Sets file and stream logging to `f_level` with output in `log_dir`,
    then emits one message at each severity'''

    logger.file_level = f_level
    logger.stream_level = f_level
    logger.log_dir = log_dir
    for level, message in (('debug', 'Debug message'),
                           ('info', 'Info message'),
                           ('warn', 'Warning message'),
                           ('error', 'Error message'),
                           ('crit', 'Critical message')):
        logger.log(level, message)
Beispiel #7
0
    def train(self,
              shuffle: str = None,
              split: list = None,
              retrain: bool = False,
              validate: bool = False,
              selection_set: str = None,
              selection_fn: str = 'rmse',
              model_filename: str = 'model.h5',
              verbose: int = 0) -> tuple:
        '''Trains on the currently-loaded data: one NN when no project
        exists, otherwise every candidate in every pool

        Args:
            shuffle (str): `all` to shuffle all sets for each candidate,
                `train` to shuffle learning/validation data for each candidate
            split (list): if shuffle == `all`||`train`, [learn%, valid%, test%]
            retrain (bool): if True, uses existing project models for
                additional training
            validate (bool): if True, uses a validation set to determine
                learning cutoff
            selection_set (str): best candidates/pool are selected using this
                set; `learn`, `valid`, `train`, `test`, None (all data)
            selection_fn (str): candidates are selected based on this error
                metric; `rmse`, `mean_abs_error`, `med_abs_error`
            model_filename (str): if project not created, saves `.h5` file
                here
            verbose (int): 1 to display loss at each epoch, 0 otherwise (single
                model only)

        Returns:
            tuple: if training single model, returns tuple of learn/valid
                losses, else None
        '''

        if self._prj_name is not None:
            # Project mode: train all candidates across all pools
            train_project(self._prj_name, self._num_pools,
                          self._num_candidates, self._df, self._sets,
                          self._vars, shuffle, split, retrain, validate,
                          selection_set, selection_fn, self._num_processes)
            return None

        # Single-model mode
        logger.log('info', 'Training single model', call_loc='TRAIN')
        _, losses = train_model(self._sets,
                                self._vars,
                                selection_set,
                                selection_fn,
                                retrain,
                                model_filename,
                                validate,
                                verbose=verbose)
        return losses
Beispiel #8
0
    def load(self, filename=None):
        '''Restores a neural network from a .h5 file

        Args:
            filename (str): path to .h5 model file; defaults to the path
                supplied at object creation
        '''

        source = self._filename if filename is None else filename
        self._model = load_model(source)
        logger.log('debug',
                   'Model loaded from {}'.format(source),
                   call_loc='MLP')
Beispiel #9
0
    def create_sorted_sets(self, sort_string, split=[0.65, 0.25, 0.1]):
        '''Creates random learn, validate and test sets, ensuring data points with
        the supplied sort string are split proportionally between the sets

        Args:
            sort_string (str): database STRING value used to sort data points
            split (list): [learn%, valid%, test%] for set assignments

        Raises:
            ValueError: if `sort_string` is not a known STRING name
        '''

        logger.log('debug',
                   'Creating sorted sets using {} STRING'.format(sort_string),
                   call_loc='DF')

        try:
            string_idx = self.string_names.index(sort_string)
        except ValueError:
            # Raise ValueError (was a bare Exception) for consistency with
            # the other validation errors in this class; still caught by any
            # caller handling Exception
            raise ValueError('{} not found in STRING names'.format(sort_string))
        # Sort so points sharing a STRING value are contiguous; the grouping
        # loop below depends on this ordering
        self.data_points.sort(key=lambda x: x.strings[string_idx])

        string_vals = []
        string_groups = []

        for point in self.data_points:
            if point.strings[string_idx] not in string_vals:
                string_vals.append(point.strings[string_idx])
                string_groups.append([point])
            else:
                # Appending to the last group is safe because data_points is
                # sorted: equal STRING values are adjacent
                string_groups[-1].append(point)

        self.learn_set = []
        self.valid_set = []
        self.test_set = []

        # Split each group proportionally so every STRING value appears in
        # each set (int() truncation biases remainders toward the test set)
        for group in string_groups:
            split_locs = [
                int(len(group) * split[0]),
                int(len(group) * (split[0] + split[1])),
            ]
            for point in group[0:split_locs[0]]:
                point.assignment = 'L'
                self.learn_set.append(point)
            for point in group[split_locs[0]:split_locs[1]]:
                point.assignment = 'V'
                self.valid_set.append(point)
            for point in group[split_locs[1]:]:
                point.assignment = 'T'
                self.test_set.append(point)

        logger.log('debug',
                   'Number of entries in learn set: {}'.format(
                       len(self.learn_set)),
                   call_loc='DF')
        logger.log('debug',
                   'Number of entries in validation set: {}'.format(
                       len(self.valid_set)),
                   call_loc='DF')
        logger.log('debug',
                   'Number of entries in test set: {}'.format(
                       len(self.test_set)),
                   call_loc='DF')
Beispiel #10
0
    def create_sorted_sets(self, sort_str: str, split: list = [0.7, 0.2, 0.1]):
        '''Creates random learn, validate and test sets, ensuring data points
        with the supplied sort string are split proportionally between the sets

        Args:
            sort_str (str): database STRING value used to sort data points
            split (list): [learn%, valid%, test%] for set assignments
        '''

        logger.log('debug',
                   'Creating sorted sets using {} STRING'.format(sort_str),
                   call_loc='DF')

        if sort_str not in self._string_names:
            raise ValueError('{} not found in STRING names'.format(sort_str))

        # Bucket points by their STRING value, preserving first-seen order
        groups = {}
        for point in self.data_points:
            groups.setdefault(getattr(point, sort_str), []).append(point)

        self.learn_set = []
        self.valid_set = []
        self.test_set = []

        # Split every bucket proportionally so each STRING value is
        # represented in each set
        for members in groups.values():
            learn_end = int(len(members) * split[0])
            valid_end = int(len(members) * (split[0] + split[1]))
            for point in members[:learn_end]:
                point.assignment = 'L'
                self.learn_set.append(point)
            for point in members[learn_end:valid_end]:
                point.assignment = 'V'
                self.valid_set.append(point)
            for point in members[valid_end:]:
                point.assignment = 'T'
                self.test_set.append(point)

        logger.log('debug',
                   'Number of entries in learn set: {}'.format(
                       len(self.learn_set)),
                   call_loc='DF')
        logger.log('debug',
                   'Number of entries in validation set: {}'.format(
                       len(self.valid_set)),
                   call_loc='DF')
        logger.log('debug',
                   'Number of entries in test set: {}'.format(
                       len(self.test_set)),
                   call_loc='DF')
def limit_rforest(df, limit_num, num_estimators=1000, num_processes=1):
    '''Uses random forest regression to select input parameters

    Args:
        df (ecnet.utils.data_utils.DataFrame): loaded data
        limit_num (int): desired number of input parameters
        num_estimators (int): number of trees used by RFR algorithm
        num_processes (int): number of parallel jobs for RFR algorithm

    Returns:
        ecnet.utils.data_utils.DataFrame: limited data
    '''

    # Mirror the main logger's configuration on the ditto_lib logger
    ditto_logger.stream_level = logger.stream_level
    if logger.file_level != 'disable':
        ditto_logger.log_dir = logger.log_dir
        ditto_logger.file_level = logger.file_level
    ditto_logger.default_call_loc('LIMIT')
    # Build a ditto ItemCollection carrying inputs (descriptors) and targets
    item_collection = ItemCollection(df._filename)
    for inp_name in df.input_names:
        item_collection.add_attribute(Attribute(inp_name))
    for pt in df.data_points:
        item_collection.add_item(pt.id, deepcopy(pt.inputs))
    for tar_name in df.target_names:
        item_collection.add_attribute(Attribute(tar_name, is_descriptor=False))
    for pt in df.data_points:
        for idx, tar in enumerate(pt.targets):
            item_collection.set_item_attribute(pt.id, tar,
                                               df.target_names[idx])
    item_collection.strip()
    params = [
        param[0] for param in random_forest_regressor(
            item_collection.dataframe,
            target_attribute=df.target_names[0],
            n_components=limit_num,
            n_estimators=num_estimators,
            n_jobs=num_processes)
    ]
    # Drop any target columns the selector returned; the previous
    # delete-while-iterating loop shifted indices after each deletion, so it
    # could delete the wrong element or skip elements entirely
    params = [p for p in params if p not in df.target_names]

    logger.log('debug',
               'Selected parameters: {}'.format(params),
               call_loc='LIMIT')
    df.set_inputs(params)
    return df
Beispiel #12
0
    def save(self, filename: str = None):
        ''' save: persists model weights and architecture, either to the
        filename/path given at object creation or to a newly supplied one

        Args:
            filename (str): new filepath if different than init filename/path
        '''

        target = self._filename if filename is None else filename
        check_h5(target)
        # Optimizer state is intentionally not persisted
        self._model.save(target, include_optimizer=False)
        logger.log('debug',
                   'Model saved to {}'.format(target),
                   call_loc='MLP')
Beispiel #13
0
    def load(self, filename: str = None):
        ''' load: restores a saved model's architecture/weights from the
        filename/path given at object initialization, unless a new
        filename/path is supplied

        Args:
            filename (str): new filepath if different than init filename/path
        '''

        source = self._filename if filename is None else filename
        # compile=False: model is loaded for inference, no optimizer needed
        self._model = load_model(source, compile=False)
        logger.log('debug',
                   'Model loaded from {}'.format(source),
                   call_loc='MLP')
Beispiel #14
0
    def save(self, filename: str = None):
        '''Saves the current state of the DataFrame to a new CSV database

        Args:
            filename (str): path to location where database is saved; if not
                supplied, saves to CSV file where data was loaded from
        '''

        # `filename` now defaults to None: the body and docstring already
        # treated it as optional, but the old signature required it
        if filename is None:
            filename = self._filename

        if '.csv' not in filename:
            filename += '.csv'

        # Row 1: column types, row 2: column titles, then one row per point
        rows = []
        type_row = ['DATAID', 'ASSIGNMENT']
        type_row.extend(['STRING' for _ in range(len(self._string_names))])
        type_row.extend(['GROUP' for _ in range(len(self._group_names))])
        type_row.extend(['TARGET' for _ in range(len(self._target_names))])
        type_row.extend(['INPUT' for _ in range(len(self._input_names))])
        rows.append(type_row)

        title_row = ['DATAID', 'ASSIGNMENT']
        title_row.extend(self._string_names)
        title_row.extend(self._group_names)
        title_row.extend(self._target_names)
        title_row.extend(self._input_names)
        rows.append(title_row)

        data_rows = []
        for point in self.data_points:
            data_row = [point.id, point.assignment]
            data_row.extend([getattr(point, s) for s in self._string_names])
            data_row.extend([getattr(point, g) for g in self._group_names])
            data_row.extend([getattr(point, t) for t in self._target_names])
            data_row.extend([getattr(point, i) for i in self._input_names])
            data_rows.append(data_row)
        # Deterministic output: data rows sorted by DATAID
        rows.extend(sorted(data_rows, key=lambda x: x[0]))

        with open(filename, 'w', encoding='utf8') as csv_file:
            wr = writer(csv_file, quoting=QUOTE_ALL, lineterminator='\n')
            for row in rows:
                wr.writerow(row)

        logger.log('debug',
                   'DataFrame saved to {}'.format(filename),
                   call_loc='DF')
Beispiel #15
0
    def save(self, filename=None):
        '''Saves neural network to .h5 file

        filename (str): if None, uses MultilayerPerceptron._filename;
            otherwise, saves to this file
        '''

        target = self._filename if filename is None else filename
        if H5_EXT.match(target) is None:
            raise ValueError(
                'Invalid filename/extension, must be `.h5`: {}'.format(
                    target))
        self._model.save(target)
        logger.log('debug',
                   'Model saved to {}'.format(target),
                   call_loc='MLP')
Beispiel #16
0
    def set_inputs(self, inputs: list):
        '''Keeps only the supplied input variables and rebuilds the sets

        Args:
            inputs (list): input variable names, str
        '''

        logger.log('debug',
                   'Setting input parameters to {}'.format(inputs),
                   call_loc='DF')

        # Every requested name must already be a known input column
        unknown = [inp for inp in inputs if inp not in self._input_names]
        if unknown:
            raise ValueError('{} not found in existing inputs'.format(
                unknown[0]))

        self._input_names = inputs
        self.create_sets()
Beispiel #17
0
def main(db_name: str):
    '''End-to-end ECNet workflow for one database: set splitting, input
    selection, hyperparameter tuning, project training, evaluation, and
    project export'''

    # Console logging at info; full debug log written to a per-DB directory
    logger.stream_level = 'info'
    logger.log_dir = db_name.replace('.csv', '') + '_logs'
    logger.file_level = 'debug'

    # 70/20/10 learn/validate/test split, proportional across the property
    # value range
    prop_range_from_split(db_name, [0.7, 0.2, 0.1])

    # Determine the optimal input-variable count, evaluated on the
    # train (learn + valid) set
    num_desc = len(find_optimal_num_inputs(db_name, 'train', _NUM_PROC)[1])
    logger.log('info', 'Optimal number of input variables: {}'.format(num_desc))

    # Server with a config file named after the database
    server = Server(model_config=db_name.replace('.csv', '.yml'),
                    num_processes=_NUM_PROC)

    server.load_data(db_name)

    # Reduce to the `num_desc` most influential inputs (train set), writing
    # the reduced database next to the original
    server.limit_inputs(
        num_desc, eval_set='train',
        output_filename=db_name.replace('.csv', '.{}.csv'.format(num_desc))
    )

    # ABC tuning: 20 employer bees, 10 search cycles, solutions scored by
    # validation-set median absolute error
    server.tune_hyperparameters(20, 10, eval_set='valid',
                                eval_fn='med_abs_error')

    # Project layout: 5 pools, 75 candidate networks per pool
    server.create_project(db_name.replace('.csv', ''), 5, 75)

    # Train with periodic validation; pool winners chosen by validation-set
    # median absolute error
    server.train(validate=True, selection_set='valid',
                 selection_fn='med_abs_error')

    # Report median absolute error and r-squared per set
    learn_errors = server.errors('med_abs_error', 'r2', dset='learn')
    valid_errors = server.errors('med_abs_error', 'r2', dset='valid')
    test_errors = server.errors('med_abs_error', 'r2', dset='test')
    logger.log('info', 'Learning set performance: {}'.format(learn_errors))
    logger.log('info', 'Validation set performance: {}'.format(valid_errors))
    logger.log('info', 'Testing set performance: {}'.format(test_errors))

    # Archive to a .prj file, discarding the candidates that were not chosen
    server.save_project(del_candidates=True)
Beispiel #18
0
    def save(self, filename):
        '''Saves the current state of the DataFrame to a new CSV database

        Args:
            filename (str): path to location where database is saved
        '''

        if '.csv' not in filename:
            filename += '.csv'

        # Row 1: column types, row 2: column titles, then one row per point
        header = ['DATAID', 'ASSIGNMENT']
        type_row = (header
                    + ['STRING'] * self.num_strings
                    + ['GROUP'] * self.num_groups
                    + ['TARGET'] * self.num_targets
                    + ['INPUT'] * self.num_inputs)
        title_row = (header
                     + list(self.string_names)
                     + list(self.group_names)
                     + list(self.target_names)
                     + list(self.input_names))

        # Deterministic output: data rows sorted by DATAID
        point_rows = sorted(
            ([point.id, point.assignment]
             + list(point.strings)
             + list(point.groups)
             + list(point.targets)
             + list(point.inputs)
             for point in self.data_points),
            key=lambda row: row[0])

        with open(filename, 'w') as csv_file:
            wr = writer(csv_file, quoting=QUOTE_ALL, lineterminator='\n')
            wr.writerows([type_row, title_row] + point_rows)

        logger.log('debug',
                   'DataFrame saved to {}'.format(filename),
                   call_loc='DF')
Beispiel #19
0
    def create_project(self,
                       project_name: str,
                       num_pools: int = 1,
                       num_candidates: int = 1):
        '''Builds the folder hierarchy for a new project and records its
        configuration on the server

        Args:
            project_name (str): name of the project, and top-level dir name
            num_pools (int): number of candidate pools for the project
            num_candidates (int): number of candidates per pool
        '''

        self._prj_name = project_name
        self._num_pools = num_pools
        self._num_candidates = num_candidates
        # Module-level helper creates the on-disk structure
        create_project(project_name, num_pools, num_candidates)
        logger.log('info',
                   'Created project: {}'.format(project_name),
                   call_loc='PROJECT')
        for detail in ('Number of pools: {}'.format(num_pools),
                       'Number of candidates/pool: {}'.format(num_candidates)):
            logger.log('debug', detail, call_loc='PROJECT')
Beispiel #20
0
    def save_project(self,
                     filename: str = None,
                     clean_up: bool = True,
                     del_candidates: bool = False):
        '''Archives the current project state to a .prj file

        Args:
            filename (str): if None, uses name supplied in project creation;
                else, saves the project here
            clean_up (bool): if True, removes project folder structure after
                .prj file created
            del_candidates (bool): if True, deletes all non-chosen candidate
                neural networks

        Raises:
            RuntimeError: if no project has been created on this server
        '''

        # Guard: saving requires create_project (or a loaded .prj) first
        if self._prj_name is None:
            raise RuntimeError('A project has not been created')
        prj_path = save_project(self._prj_name, filename, self._cf_file,
                                self._df, self._vars, clean_up,
                                del_candidates)
        logger.log('info',
                   'Project saved to {}'.format(prj_path),
                   call_loc='PROJECT')
Beispiel #21
0
    def load_data(self,
                  filename: str,
                  random: bool = False,
                  split: list = None,
                  normalize: bool = False):
        '''Reads an ECNet-formatted CSV database into the server

        Args:
            filename (str): path to CSV database
            random (bool): if True, random set assignments (learn, validate,
                test); if False, uses DB-specified assignmenets
            split (list): if random is True, [learn%, valid%, test%]
            normalize (bool): if true, uses min-max normalization to normalize
                input parameters between 0 and 1
        '''

        logger.log('info',
                   'Loading data from {}'.format(filename),
                   call_loc='LOAD')
        self._df = DataFrame(filename)
        if normalize:
            # Min-max scale input parameters into [0, 1]
            self._df.normalize()
        self._df.create_sets(random, split)
        self._sets = self._df.package_sets()
Beispiel #22
0
    def set_inputs(self, inputs):
        '''Removes all input variables except those supplied

        Args:
            inputs (list): input variable names, str
        '''

        logger.log('debug',
                   'Setting input parameters to {}'.format(inputs),
                   call_loc='DF')

        # Map each supplied name to every matching column index, in `inputs`
        # order (names not present are silently skipped, as before); also
        # avoids shadowing the `input` builtin
        idxs = [cidx
                for name in inputs
                for cidx, current_input in enumerate(self.input_names)
                if name == current_input]
        for point in self.data_points:
            point.inputs = [point.inputs[i] for i in idxs]
        self.input_names = inputs
        self.num_inputs = len(inputs)
        self.create_sets()
Beispiel #23
0
def limit_rforest(df: DataFrame,
                  limit_num: int,
                  num_estimators: int = None,
                  num_processes: int = 1,
                  eval_set: str = 'learn',
                  **kwargs) -> list:
    '''Ranks input parameters by random forest regression importance and
    returns the top `limit_num`

    Args:
        df (ecnet.utils.data_utils.DataFrame): loaded data
        limit_num (int): desired number of input parameters
        num_estimators (int): number of trees used by RFR algorithm
        num_processes (int): number of parallel jobs for RFR algorithm
        eval_set (str): set to perform RFR on (`learn`, `valid`, `train`,
            `test`, None (all)) (default: `learn`)
        **kwargs: any argument accepted by
            sklearn.ensemble.RandomForestRegressor

    Returns:
        list: [(feature, importance), ..., (feature, importance)]
    '''

    logger.log(
        'info',
        'Finding {} most influential input parameters'.format(limit_num),
        call_loc='LIMIT')

    packaged = df.package_sets()
    features = get_x(packaged, eval_set)
    targets = ravel(get_y(packaged, eval_set))

    # Default the forest size to the number of input columns
    if num_estimators is None:
        num_estimators = len(features[0])

    logger.log('debug',
               'Number of estimators: {}'.format(num_estimators),
               call_loc='LIMIT')

    regr = RandomForestRegressor(n_jobs=num_processes,
                                 n_estimators=num_estimators,
                                 **kwargs)
    regr.fit(features, targets)
    # Pair each input name with its importance, keep the `limit_num` largest
    ranked = sorted(zip(df._input_names, regr.feature_importances_),
                    key=lambda pair: pair[1], reverse=True)[:limit_num]
    logger.log('debug',
               'Selected parameters: {}'.format([name for name, _ in ranked]),
               call_loc='LIMIT')
    return ranked
Beispiel #24
0
    def __init__(self,
                 model_config: str = 'config.yml',
                 prj_file: str = None,
                 num_processes: int = 1):
        '''Server object: handles data loading, model creation, data-to-model
        hand-off, data input parameter selection, hyperparameter tuning

        Args:
            model_config (str): path to multilayer perceptron .yml config file;
                if not found, default config is generated
            prj_file (str): path to pre-existing ECNet .prj file, if using for
                retraining/new predictions
            num_processes (int): number of parallel processes to utilize for
                training and tuning processes
        '''

        logger.log('debug',
                   'Arguments:\n\t| model_config:\t\t{}\n\t|'
                   ' prj_file:\t\t{}\n\t| num_processes:\t{}'.format(
                       model_config, prj_file, num_processes),
                   call_loc='INIT')

        self._num_processes = num_processes

        # Opening an existing project restores all state (name, pools,
        # candidates, data, config) and skips the fresh-config path below
        if prj_file is not None:
            self._prj_name, self._num_pools, self._num_candidates, self._df,\
                self._cf_file, self._vars = open_project(prj_file)
            check_config(self._vars)
            self._sets = self._df.package_sets()
            logger.log('info',
                       'Opened project {}'.format(prj_file),
                       call_loc='INIT')
            return

        # Fresh server: no project yet; configuration comes from the file
        self._cf_file = model_config
        self._prj_name = None

        self._vars = {}
        try:
            self._vars.update(open_config(self._cf_file))
            check_config(self._vars)
        except FileNotFoundError:
            # A missing config file is not fatal: generate defaults and
            # persist them so the next run finds the file
            logger.log(
                'warn',
                '{} not found, generating default config'.format(model_config),
                call_loc='INIT')
            self._vars = default_config()
            save_config(self._vars, self._cf_file)
Beispiel #25
0
    def shuffle(self, sets: str = 'all', split: list = [0.7, 0.2, 0.1]):
        '''Shuffles learning, validation and test sets or learning and
        validation sets

        Args:
            sets (str): 'all' or 'train' (learning + validation)
            split (list): [learn%, valid%, test%] used for new assignments

        Raises:
            ValueError: if `sets` is not 'all' or 'train'
        '''

        logger.log('debug', 'Shuffling {} sets'.format(sets), call_loc='DF')

        if sets == 'all':
            self.create_sets(random=True, split=split)
        elif sets == 'train':
            lv_set = []
            lv_set.extend([p for p in self.learn_set])
            lv_set.extend([p for p in self.valid_set])
            rand_index = sample(
                range(len(self.learn_set) + len(self.valid_set)),
                (len(self.learn_set) + len(self.valid_set)))
            # BUG FIX: apply the sampled permutation; previously rand_index
            # was generated but never used, so the learn/valid pool was only
            # re-split in its existing order, never actually shuffled
            lv_set = [lv_set[i] for i in rand_index]
            # Re-split learn/valid proportionally, excluding the test share
            cutoff = int(len(rand_index) * (split[0] / (1 - split[2]))) + 1
            self.learn_set = lv_set[0:cutoff]
            self.valid_set = lv_set[cutoff:]
            logger.log('debug',
                       'Number of entries in learn set: {}'.format(
                           len(self.learn_set)),
                       call_loc='DF')
            logger.log('debug',
                       'Number of entries in validation set: {}'.format(
                           len(self.valid_set)),
                       call_loc='DF')
            logger.log('debug',
                       'Number of entries in test set: {}'.format(
                           len(self.test_set)),
                       call_loc='DF')
        else:
            raise ValueError('Invalid sets argument: {}'.format(sets))
Beispiel #26
0
def tune_hyperparameters(df, vars, num_employers, num_iterations,
                         num_processes=1, shuffle=None, split=None,
                         validate=True, eval_set=None, eval_fn='rmse',
                         epochs=300):
    '''Tunes neural network learning/architecture hyperparameters

    Args:
        df (ecnet.utils.data_utils.DataFrame): currently loaded data
        vars (dict): ecnet.Server._vars variables
        num_employers (int): number of employer bees
        num_iterations (int): number of search cycles for the colony
        num_processes (int): number of parallel processes to utilize
        shuffle (bool): if True, shuffles L/V/T data for all evals
        split (list): if shuffle is True, [learn%, valid%, test%]
        validate (bool): if True, uses periodic validation; otherwise, no
        eval_set (str): set used to evaluate bee performance; `learn`, `valid`,
            `train`, `test`, None (all sets)
        eval_fn (str): error function used to evaluate bee performance; `rmse`,
            `mean_abs_error`, `med_abs_error`
        epochs (int): number of training epochs per bee evaluation

    Returns:
        dict: tuned hyperparameters
    '''

    # Static arguments forwarded to the fitness function for every bee;
    # `epochs` was previously omitted, so callers supplying it (e.g. the
    # create_model workflow) raised a TypeError
    fit_fn_args = {
        'df': df,
        'shuffle': shuffle,
        'num_processes': num_processes,
        'split': split,
        'validate': validate,
        'eval_set': eval_set,
        'eval_fn': eval_fn,
        'epochs': epochs,
        'hidden_layers': vars['hidden_layers']
    }

    # Search ranges: beta_1, beta_2, decay, epsilon, learning rate
    value_ranges = [
        ('float', (0.0, 1.0)),
        ('float', (0.0, 1.0)),
        ('float', (0.0, 1.0)),
        ('float', (0.0, 1.0)),
        ('float', (0.0, 1.0))
    ]

    # One integer range (number of neurons) per hidden layer
    for _ in range(len(vars['hidden_layers'])):
        value_ranges.append(('int', (1, 50)))

    abc = ABC(
        tune_fitness_function,
        num_employers=num_employers,
        value_ranges=value_ranges,
        args=fit_fn_args,
        processes=num_processes
    )

    # Mirror ECNet's logger configuration in the colony's internal logger
    abc._logger.stream_level = logger.stream_level
    if logger.file_level != 'disable':
        abc._logger.log_dir = logger.log_dir
        abc._logger.file_level = logger.file_level
    abc._logger.default_call_loc('TUNE')
    abc.create_employers()
    for i in range(num_iterations):
        logger.log('info', 'Iteration {}'.format(i + 1), call_loc='TUNE')
        abc.run_iteration()
        logger.log('info', 'Best Performer: {}, {}'.format(
            abc.best_performer[2], {
                'beta_1': abc.best_performer[1][0],
                'beta_2': abc.best_performer[1][1],
                'decay': abc.best_performer[1][2],
                'epsilon': abc.best_performer[1][3],
                'learning_rate': abc.best_performer[1][4],
                'hidden_layers': abc.best_performer[1][5:]
            }
        ), call_loc='TUNE')
    # Write the best-found values back into the supplied config dict
    params = abc.best_performer[1]
    vars['beta_1'] = params[0]
    vars['beta_2'] = params[1]
    vars['decay'] = params[2]
    vars['epsilon'] = params[3]
    vars['learning_rate'] = params[4]
    for l_idx in range(len(vars['hidden_layers'])):
        vars['hidden_layers'][l_idx][0] = params[5 + l_idx]
    return vars
Beispiel #27
0
def create_model(prop_abvr: str, smiles: list = None, targets: list = None,
                 db_name: str = None, qspr_backend: str = 'padel',
                 create_plots: bool = True, data_split: list = None,
                 log_level: str = 'info', log_to_file: bool = True,
                 num_processes: int = 1):
    ''' create_model: ECRL's database/model creation workflow for all
    publications

    Args:
        prop_abvr (str): abbreviation for the property name (e.g. CN)
        smiles (list): if supplied with targets, creates a new database
        targets (list): if supplied with smiles, creates a new database
        db_name (str): you may supply an existing ECNet-formatted database
        qspr_backend (str): if creating new database, generation software to
            use (`padel`, `alvadesc`)
        create_plots (bool): if True, creates plots for median absolute error
            vs. number of descriptors as inputs, parity plot for all sets
        data_split (list): [learn %, valid %, test %] for all supplied data;
            defaults to [0.7, 0.2, 0.1]
        log_level (str): `debug`, `info`, `warn`, `error`, `crit`
        log_to_file (bool): if True, saves workflow logs to a file in `logs`
            directory
        num_processes (int): number of concurrent processes to use for various
            tasks

    Raises:
        ValueError: if neither an existing database nor SMILES + targets are
            supplied
    '''

    # Sentinel default avoids a shared mutable default argument
    if data_split is None:
        data_split = [0.7, 0.2, 0.1]

    # Initialize logging
    logger.stream_level = log_level
    if log_to_file:
        logger.file_level = log_level

    # If database not supplied, create database from supplied SMILES, targets
    if db_name is None:
        if smiles is None or targets is None:
            raise ValueError('Must supply SMILES and target values')
        db_name = datetime.now().strftime('{}_model_%Y%m%d.csv'.format(
            prop_abvr
        ))
        logger.log('info', 'Creating database {}...'.format(db_name),
                   'WORKFLOW')
        create_db(smiles, db_name, targets, prop_abvr, backend=qspr_backend)
        logger.log('info', 'Created database {}'.format(db_name), 'WORKFLOW')

    # Create database split, each subset has proportionally equal number of
    #   compounds based on range of experimental/target values
    logger.log('info', 'Creating optimal data split...', 'WORKFLOW')
    prop_range_from_split(db_name, data_split)
    logger.log('info', 'Created optimal data split', 'WORKFLOW')
    df = DataFrame(db_name)
    df.create_sets()
    logger.log('info', '\tLearning set: {}'.format(len(df.learn_set)),
               'WORKFLOW')
    logger.log('info', '\tValidation set: {}'.format(len(df.valid_set)),
               'WORKFLOW')
    logger.log('info', '\tTest set: {}'.format(len(df.test_set)), 'WORKFLOW')

    # Find optimal number of QSPR input variables
    logger.log('info', 'Finding optimal number of inputs...', 'WORKFLOW')
    errors, desc = find_optimal_num_inputs(db_name, 'valid', num_processes)
    df = DataFrame(db_name)
    df.set_inputs(desc)
    df.save(db_name.replace('.csv', '_opt.csv'))
    logger.log('info', 'Found optimal number of inputs', 'WORKFLOW')
    logger.log('info', '\tNumber of inputs: {}'.format(len(df._input_names)),
               'WORKFLOW')

    # Plot the curve of MAE vs. num. desc. added, if desired
    if create_plots:
        logger.log('info', 'Creating plot of MAE vs. descriptors...',
                   'WORKFLOW')
        num_add = [e[0] for e in errors]
        maes = [e[1] for e in errors]
        opt_num = len(desc)
        plt.clf()
        plt.rcParams['font.family'] = 'Times New Roman'
        plt.plot(num_add, maes, c='blue')
        # Red dashed line marks the chosen (optimal) descriptor count
        plt.axvline(x=opt_num, c='red', linestyle='--')
        plt.xlabel('Number of Descriptors as ANN Input Variables')
        plt.ylabel('Median Absolute Error of {} Predictions'.format(prop_abvr))
        plt.savefig(db_name.replace('.csv', '_desc_curve.png'))
        logger.log('info', 'Created plot of MAE vs. descriptors', 'WORKFLOW')

    # Tune ANN hyperparameters according to validation set performance
    # NOTE(review): requires tune_hyperparameters to accept an `epochs`
    #   keyword argument
    logger.log('info', 'Tuning ANN hyperparameters...', 'WORKFLOW')
    config = default_config()
    config = tune_hyperparameters(df, config, 25, 10, num_processes,
                                  shuffle='train', split=[0.7, 0.2, 0.1],
                                  validate=True, eval_set='valid',
                                  eval_fn='med_abs_error', epochs=300)
    config['epochs'] = default_config()['epochs']
    config_filename = db_name.replace('.csv', '.yml')
    save_config(config, config_filename)
    logger.log('info', 'Tuned ANN hyperparameters', 'WORKFLOW')
    logger.log('info', '\tLearning rate: {}'.format(config['learning_rate']),
               'WORKFLOW')
    logger.log('info', '\tLR decay: {}'.format(config['decay']), 'WORKFLOW')
    logger.log('info', '\tBatch size: {}'.format(config['batch_size']),
               'WORKFLOW')
    logger.log('info', '\tPatience: {}'.format(config['patience']), 'WORKFLOW')
    logger.log('info', '\tHidden layers: {}'.format(config['hidden_layers']),
               'WORKFLOW')

    # Create Model
    logger.log('info', 'Generating ANN...', 'WORKFLOW')
    sv = Server(db_name.replace('.csv', '.yml'), num_processes=num_processes)
    sv.load_data(db_name.replace('.csv', '_opt.csv'))
    sv.create_project(db_name.replace('.csv', ''), 5, 75)
    sv.train(validate=True, selection_set='valid', shuffle='train',
             split=[0.7, 0.2, 0.1], selection_fn='med_abs_error')
    logger.log('info', 'ANN Generated', 'WORKFLOW')
    logger.log('info', 'Measuring ANN performance...', 'WORKFLOW')
    preds_test = sv.use(dset='test')
    preds_train = sv.use(dset='train')
    test_errors = sv.errors('r2', 'med_abs_error', dset='test')
    train_errors = sv.errors('r2', 'med_abs_error', dset='train')
    logger.log('info', 'Measured ANN performance', 'WORKFLOW')
    logger.log('info', '\tTraining set:\t R2: {}\t MAE: {}'.format(
        train_errors['r2'], train_errors['med_abs_error']), 'WORKFLOW')
    logger.log('info', '\tTesting set:\t R2: {}\t MAE: {}'.format(
        test_errors['r2'], test_errors['med_abs_error']), 'WORKFLOW')
    sv.save_project(del_candidates=True)

    if create_plots:
        logger.log('info', 'Creating parity plot...', 'WORKFLOW')
        plt.clf()
        parity_plot = ParityPlot(
            '',
            'Experimental {} Value'.format(prop_abvr),
            'Predicted {} Value'.format(prop_abvr)
        )
        # Training series = learning + validation targets vs. predictions
        parity_plot.add_series(concatenate(
            (sv._sets.learn_y, sv._sets.valid_y)
        ), preds_train, 'Training Set', 'blue')
        parity_plot.add_series(sv._sets.test_y, preds_test, 'Test Set', 'red')
        parity_plot.add_error_bars(test_errors['med_abs_error'], 'Test MAE')
        parity_plot._add_label('Test $R^2$', test_errors['r2'])
        parity_plot._add_label('Training MAE', train_errors['med_abs_error'])
        parity_plot._add_label('Training $R^2$', train_errors['r2'])
        parity_plot.save(db_name.replace('.csv', '_parity.png'))
        logger.log('info', 'Created parity plot', 'WORKFLOW')
Beispiel #28
0
    def __init__(self, filename: str):
        '''DataFrame object: handles data importing, set splitting, shuffling,
        packaging

        Args:
            filename (str): path to ECNet-formatted CSV database

        Raises:
            FileNotFoundError: if `filename` does not exist
        '''

        if '.csv' not in filename:
            filename += '.csv'
        try:
            with open(filename, newline='', encoding='utf8') as file:
                rows = list(reader(file))
        except FileNotFoundError:
            # Re-raise with the resolved filename; previously raised a
            # generic Exception, hiding the specific error type from callers
            raise FileNotFoundError(
                'CSV database not found: {}'.format(filename)) from None

        self._filename = filename

        # Parsed entries, one DataPoint per CSV data row
        self.data_points = []

        # Column names grouped by the type tag in the CSV's first row
        self._string_names = []
        self._group_names = []
        self._target_names = []
        self._input_names = []

        # ECNet CSV layout: row 0 = column type tags, row 1 = column names,
        # rows 2+ = data entries
        for p_idx, row in enumerate(rows[2:]):

            new_point = DataPoint()

            for h_idx, header in enumerate(rows[0]):
                if header == 'DATAID':
                    new_point.id = row[h_idx]
                elif header == 'ASSIGNMENT':
                    new_point.assignment = row[h_idx]
                elif header == 'STRING':
                    # Column names are collected once, on the first data row
                    if p_idx == 0:
                        self._string_names.append(rows[1][h_idx])
                    setattr(new_point, rows[1][h_idx], row[h_idx])
                elif header == 'GROUP':
                    if p_idx == 0:
                        self._group_names.append(rows[1][h_idx])
                    setattr(new_point, rows[1][h_idx], row[h_idx])
                elif header == 'TARGET':
                    if p_idx == 0:
                        self._target_names.append(rows[1][h_idx])
                    setattr(new_point, rows[1][h_idx], row[h_idx])
                elif header == 'INPUT':
                    if p_idx == 0:
                        self._input_names.append(rows[1][h_idx])
                    setattr(new_point, rows[1][h_idx], row[h_idx])

            self.data_points.append(new_point)

        logger.log('debug',
                   'Found {} data entries'.format(len(self.data_points)),
                   call_loc='DF')
        logger.log('debug',
                   'Input parameters/entry: {}'.format(len(self._input_names)),
                   call_loc='DF')
        logger.log('debug',
                   'Target values/entry: {}'.format(len(self._target_names)),
                   call_loc='DF')
Beispiel #29
0
    def create_sets(self, random: bool = False, split: list = None):
        '''Creates learning, validation and test sets

        Args:
            random (bool): if True, use random assignments for learn, validate,
                test sets
            split (list): [learn%, valid%, test%] if random == True; defaults
                to [0.7, 0.2, 0.1]

        Raises:
            ValueError: if `random` is not a boolean
        '''

        # Sentinel default avoids a shared mutable default argument
        if split is None:
            split = [0.7, 0.2, 0.1]

        self.learn_set = []
        self.valid_set = []
        self.test_set = []

        if random is True:
            logger.log('debug',
                       'Assigning entries to random sets',
                       call_loc='DF')
            # Random permutation of entry indices, then partition by split
            rand_index = sample(range(len(self)), len(self))
            split_locs = [
                int(len(rand_index) * split[0]),
                int(len(rand_index) * (split[0] + split[1])),
            ]
            learn_index = rand_index[0:split_locs[0]]
            valid_index = rand_index[split_locs[0]:split_locs[1]]
            test_index = rand_index[split_locs[1]:]
            for idx in learn_index:
                self.data_points[idx].assignment = 'L'
                self.learn_set.append(self.data_points[idx])
            for idx in valid_index:
                self.data_points[idx].assignment = 'V'
                self.valid_set.append(self.data_points[idx])
            for idx in test_index:
                self.data_points[idx].assignment = 'T'
                self.test_set.append(self.data_points[idx])

        elif random is False:
            # Use the L/V/T assignments already stored on each entry
            logger.log('debug',
                       'Assigning entries to explicit sets',
                       call_loc='DF')
            for point in self.data_points:
                if point.assignment == 'L':
                    self.learn_set.append(point)
                elif point.assignment == 'V':
                    self.valid_set.append(point)
                elif point.assignment == 'T':
                    self.test_set.append(point)

        else:
            # Identity checks above reject truthy/falsy non-booleans
            raise ValueError('Unknown random boolean: {}'.format(random))

        logger.log('debug',
                   'Number of entries in learn set: {}'.format(
                       len(self.learn_set)),
                   call_loc='DF')
        logger.log('debug',
                   'Number of entries in validation set: {}'.format(
                       len(self.valid_set)),
                   call_loc='DF')
        logger.log('debug',
                   'Number of entries in test set: {}'.format(
                       len(self.test_set)),
                   call_loc='DF')
Beispiel #30
0
    def fit(self,
            l_x,
            l_y,
            v_x=None,
            v_y=None,
            epochs=1500,
            lr=0.001,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=None,
            decay=0.0,
            v=0):
        '''Fits neural network to supplied inputs and targets

        Args:
            l_x (numpy.array): learning input data
            l_y (numpy.array): learning target data
            v_x (numpy.array): if not None, periodic validation is performed w/
                these inputs
            v_y (numpy.array): if not None, periodic validation is performed w/
                these targets
            epochs (int): number of learning epochs if not validating, maximum
                number of learning epochs if performing periodic validation
            lr (float): learning rate for Adam optimizer
            beta_1 (float): beta_1 value for Adam optimizer
            beta_2 (float): beta_2 value for Adam optimizer
            epsilon (float): epsilon value for Adam optimizer
            decay (float): learning rate decay for Adam optimizer
            v (int): verbose training, `0` for no printing, `1` for printing
        '''

        self._model.compile(loss=mean_squared_error,
                            optimizer=Adam(lr=lr,
                                           beta_1=beta_1,
                                           beta_2=beta_2,
                                           epsilon=epsilon,
                                           decay=decay),
                            metrics=[mae])

        if v_x is not None and v_y is not None:
            # Baseline validation MAE of the untrained/current model
            valid_mae_lowest = self._model.evaluate(v_x, v_y, verbose=v)[1]
            # Train in chunks of 250 epochs, checking validation MAE between
            # chunks; NOTE(review): epochs < 250 yields steps == 0, i.e. no
            # training at all in validation mode -- confirm intended
            steps = int(epochs / 250)
            for e in range(steps):
                h = self._model.fit(l_x,
                                    l_y,
                                    validation_data=(v_x, v_y),
                                    epochs=250,
                                    verbose=v)
                valid_mae = h.history['val_mean_absolute_error'][-1]
                if valid_mae < valid_mae_lowest:
                    valid_mae_lowest = valid_mae
                # Stop early if validation MAE worsens by more than 5%
                # relative to the best value seen so far
                elif valid_mae > (valid_mae_lowest + 0.05 * valid_mae_lowest):
                    # BUG FIX: after chunk index `e`, (e + 1) * 250 epochs
                    # have been trained; previously logged e * 250
                    logger.log('debug',
                               'Validation cutoff after {} epochs'.format(
                                   (e + 1) * 250),
                               call_loc='MLP')
                    return

        else:
            self._model.fit(l_x, l_y, epochs=epochs, verbose=v)
        logger.log('debug',
                   'Training complete after {} epochs'.format(epochs),
                   call_loc='MLP')