Example #1
def process_and_save_multidomain_datasets(data_dict, dataset_name_pairs,
                                          validation_splits, output_root_dir):

    '''
    Generate CSV datasets for a multi-domain (source/target) experiment, one per (domain, dataset) pair.
    '''
    domain_names = list(validation_splits.keys())

    for dataset_name_pair in dataset_name_pairs:
        # Fresh encoder for each dataset pair so label encodings do not leak across experiments
        encoder = LabelEncoder()
        experiment = '_'.join(dataset_name_pair)
        experiment_dir = os.path.join(output_root_dir, experiment)

        for i, dataset_name in enumerate(dataset_name_pair):
            domain = domain_names[i]
            if domain == 'source':
                merge_new_labels = True
            else:
                merge_new_labels = False

            process_and_save_dataset(
                data_dict[dataset_name],
                '_'.join([domain, dataset_name]),
                encoder=encoder,
                validation_splits=validation_splits[domain],
                experiment_dir=experiment_dir,
                merge_new_labels=merge_new_labels)
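Below is a minimal, self-contained sketch of the domain/dataset pairing used in the loop above; the domain keys and dataset names are illustrative placeholders, not values from the source.

# Illustrative sketch of the domain/dataset pairing; all values are assumed placeholders
validation_splits = {'source': {'val_size': 0.2}, 'target': {'val_size': 0.2}}
dataset_name_pair = ('PNAS', 'Fossil')

domain_names = list(validation_splits.keys())
for domain, dataset_name in zip(domain_names, dataset_name_pair):
    merge_new_labels = (domain == 'source')  # only the source domain merges new labels into the encoder
    print('_'.join([domain, dataset_name]), merge_new_labels)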
Example #2
def process_and_save_multidataset_singledomain_datasets(
        data_dict: dict,
        dataset_names: list,
        validation_splits: dict,
        output_root_dir: str,
        merge_new_labels: bool = True):
    '''
    Generate CSV datasets for single-domain experiments, one for each ordered pair of datasets to be merged.

    Arguments:
        data_dict: dict
        dataset_names: list
            e.g. ['PNAS','Fossil','Leaves']. Pairs will be created within the function.
        validation_splits: dict
        output_root_dir: str
        merge_new_labels: bool = True
    '''
    for i, dataset_1 in enumerate(dataset_names):
        for j, dataset_2 in enumerate(dataset_names):
            if j == i:
                continue
            dataset_name = '+'.join([dataset_1, dataset_2])

            encoder = LabelEncoder()
            experiment_dir = os.path.join(output_root_dir, dataset_name)

            input_data = pd.concat(
                [data_dict[dataset_1], data_dict[dataset_2]])
            process_and_save_dataset(input_data,
                                     name=dataset_name,
                                     encoder=encoder,
                                     validation_splits=validation_splits,
                                     experiment_dir=experiment_dir,
                                     merge_new_labels=merge_new_labels,
                                     other_data_keys=['dataset'])
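The nested loops above enumerate every ordered pair of distinct dataset names. A minimal sketch of the same enumeration with itertools.permutations, using the illustrative names from the docstring:

from itertools import permutations

dataset_names = ['PNAS', 'Fossil', 'Leaves']
for dataset_1, dataset_2 in permutations(dataset_names, 2):
    # Same ordered pairs as the i/j loops above, skipping i == j
    print('+'.join([dataset_1, dataset_2]))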
Example #3
def main(experiment_config, experiment_results_dir):

    ############################################
    #TODO: Moving towards defining most or all run parameters in separate config files
    ############################################

    domain = experiment_config.domain
    label_mapping_filepath = experiment_config['label_mappings']

    label_encoder = LabelEncoder(filepath=label_mapping_filepath)
    print(label_encoder)
    trainer = CSVTrainer(experiment_config, label_encoder=label_encoder)

    trainer.init_model_builder()

    model_filepath = os.path.join(
        trainer.model_manager.model_dir,
        trainer.model_name + '_' + domain + '_model.h5')

    train_data = trainer.get_data_loader(subset='train')
    val_data = trainer.get_data_loader(subset='val')
    test_data = trainer.get_data_loader(subset='test')

    #Get parameters for fitting and callbacks
    fit_params = trainer.get_fit_params()
    callbacks = get_callbacks(weights_best=os.path.join(
        trainer.model_manager.model_dir,
        trainer.model_name + '_' + domain + '_model_weights_best.h5'),
                              logs_dir=os.path.join(experiment_results_dir,
                                                    'tensorboard_logs'),
                              restore_best_weights=True)

    history = trainer.fit(train_data,
                          steps_per_epoch=fit_params['steps_per_epoch'],
                          epochs=fit_params['epochs'],
                          validation_data=val_data,
                          validation_steps=fit_params['validation_steps'],
                          callbacks=callbacks)
    trainer.histories[domain] = history

    trainer.save_model(filepath=model_filepath)
    #######################################################################
    # EVALUATE ON THE TEST SET

    num_test_samples = trainer.metadata_splits['test']['num_samples']
    num_steps = num_test_samples // trainer.config['batch_size']
    test_results = [
        trainer.evaluate(test_data, steps=num_steps, log_name='test')
    ]

    trainer.test_results = test_results

    return trainer
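Note that num_steps = num_test_samples // trainer.config['batch_size'] floor-divides, so any final partial batch is skipped during evaluation. A small standalone sketch of the difference; the sample and batch counts are illustrative:

import math

num_test_samples = 1050   # illustrative values
batch_size = 32

steps_floor = num_test_samples // batch_size            # 32 steps -> 1024 samples evaluated
steps_ceil = math.ceil(num_test_samples / batch_size)   # 33 steps -> covers all 1050 samples
print(steps_floor, steps_ceil)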
Example #4
    def init_params(self, label_encoder):
        self.tfrecord_root_dir = self.config['tfrecord_root_dir']
        self.model_dir = self.config['model_dir']
        self.data_db_path = self.config['data_db_path']
        self.db = None
        if label_encoder is None:
            self.encoder = LabelEncoder()
        else:
            self.encoder = label_encoder

        if 'label_encodings_filepath' in self.config:
            assert validate_filepath(self.config['label_encodings_filepath'],
                                     file_type='json')
            self.label_encodings_filepath = self.config[
                'label_encodings_filepath']
        else:
            self.label_encodings_filepath = os.path.join(
                self.model_dir, f'{self.name}-label_encodings.json')
        self.config['label_encodings_filepath'] = self.label_encodings_filepath
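The fallback above derives label_encodings_filepath from model_dir and the manager's name when the config does not supply one. A minimal sketch of the same pattern with a plain dict; the paths and names are illustrative, and the filepath validation step is omitted:

import os

config = {'model_dir': '/tmp/models'}   # illustrative config without the optional key
name = 'experiment'

label_encodings_filepath = config.get(
    'label_encodings_filepath',
    os.path.join(config['model_dir'], f'{name}-label_encodings.json'))
config['label_encodings_filepath'] = label_encodings_filepath
print(label_encodings_filepath)         # /tmp/models/experiment-label_encodings.json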
Example #5
def process_and_save_singledomain_datasets(data_dict: dict,
                                           dataset_names: list,
                                           validation_splits: dict,
                                           output_root_dir: str,
                                           merge_new_labels: bool = True):
    '''
    Generate CSV datasets for single-domain experiments, one for each individual dataset
    '''
    for dataset_name in dataset_names:
        encoder = LabelEncoder()
        experiment_dir = os.path.join(output_root_dir, dataset_name)
        process_and_save_dataset(data_dict[dataset_name],
                                 name=dataset_name,
                                 encoder=encoder,
                                 validation_splits=validation_splits,
                                 experiment_dir=experiment_dir,
                                 merge_new_labels=merge_new_labels)
Example #6
def main(experiment_configs, experiment_results_dir):

    ############################################
    #TODO: Moving towards defining most or all run parameters in separate config files
    ############################################
    label_encoders = {}
    for i, domain in enumerate(['source', 'target']):
        label_mapping_filepath = experiment_configs[i]['label_mappings']
        label_encoders.update(
            {domain: LabelEncoder(filepath=label_mapping_filepath)})
        print(domain, len(label_encoders[domain]))

    trainer = TransferTrainer(experiment_configs,
                              trainer_constructor=CSVTrainer,
                              label_encoders=label_encoders)

    trainer.init_model_builder(domain='source')

    source_model_filepath = os.path.join(
        trainer.model_manager.model_dir,
        trainer.model_name + '_source_model.h5')
    target_model_filepath = os.path.join(
        trainer.model_manager.model_dir,
        trainer.model_name + '_target_model.h5')

    source_train_data = trainer.get_data_loader(domain='source',
                                                subset='train')
    source_val_data = trainer.get_data_loader(domain='source', subset='val')

    #Get parameters for fitting and callbacks
    fit_params = trainer.get_fit_params(domain='source')
    callbacks = get_callbacks(weights_best=os.path.join(
        trainer.model_manager.model_dir, 'source_domain_weights_best.h5'),
                              logs_dir=os.path.join(experiment_results_dir,
                                                    'tensorboard_logs'),
                              restore_best_weights=True)

    # TRAIN ON SOURCE DOMAIN

    history = trainer.fit(source_train_data,
                          steps_per_epoch=fit_params['steps_per_epoch'],
                          epochs=fit_params['epochs'],
                          validation_data=source_val_data,
                          validation_steps=fit_params['validation_steps'],
                          callbacks=callbacks,
                          history_name='source')
    trainer.histories['source'] = history

    trainer.save_model(filepath=source_model_filepath)
    #######################################################################
    # TARGET DOMAIN

    trainer.load_model(filepath=source_model_filepath)

    target_train_data = trainer.get_data_loader(domain='target',
                                                subset='train')
    target_val_data = trainer.get_data_loader(domain='target', subset='val')
    target_test_data = trainer.get_data_loader(domain='target', subset='test')

    fit_params = trainer.get_fit_params(domain='target')
    callbacks = get_callbacks(weights_best=os.path.join(
        trainer.model_manager.model_dir, 'target_domain_weights_best.h5'),
                              logs_dir=os.path.join(experiment_results_dir,
                                                    'tensorboard_logs'),
                              restore_best_weights=True)

    num_test_samples = trainer.domains['target'].metadata_splits['test'][
        'num_samples']
    num_steps = num_test_samples // trainer.domains['target'].config[
        'batch_size']
    test_results = []
    test_results += [
        trainer.evaluate(target_test_data,
                         steps=num_steps,
                         log_name='0-shot_test')
    ]

    # FINETUNE ON TARGET DOMAIN

    history = trainer.fit(target_train_data,
                          steps_per_epoch=fit_params['steps_per_epoch'],
                          epochs=fit_params['epochs'],
                          validation_data=target_val_data,
                          validation_steps=fit_params['validation_steps'],
                          callbacks=callbacks,
                          history_name='target')

    trainer.histories['target'] = history

    test_results += [
        trainer.evaluate(target_test_data,
                         steps=num_steps,
                         log_name='test_acc')
    ]
    trainer.test_results = test_results

    return trainer
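For reference, the zero-shot-evaluate / fine-tune / re-evaluate sequence above can be reduced to a small standalone Keras sketch; the toy model, random data, and file name below are illustrative stand-ins for the project's trainers and data loaders.

import numpy as np
import tensorflow as tf

# Toy stand-ins for the source and target domains
x_src = np.random.rand(64, 8).astype('float32')
y_src = np.random.randint(0, 2, size=(64,))
x_tgt = np.random.rand(64, 8).astype('float32')
y_tgt = np.random.randint(0, 2, size=(64,))

# Train a small model on the "source" domain and save it
model = tf.keras.Sequential([
    tf.keras.Input(shape=(8,)),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(2, activation='softmax'),
])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(x_src, y_src, epochs=1, verbose=0)
model.save('source_model.h5')

# Zero-shot evaluation on the target domain, then fine-tune and re-evaluate
model = tf.keras.models.load_model('source_model.h5')
zero_shot = model.evaluate(x_tgt, y_tgt, verbose=0)
model.fit(x_tgt, y_tgt, epochs=1, verbose=0)
finetuned = model.evaluate(x_tgt, y_tgt, verbose=0)
print('zero-shot:', zero_shot, 'after fine-tuning:', finetuned)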
Example #7
class SQLManager:
    '''
    ETL pipeline for preparing data from the Leavesdb SQLite database and staging TFRecords to feed into data loaders.

    Meant to be subclassed for use with BaseTrainer and future Trainer classes.
    '''
    def __init__(self, experiment_config, label_encoder=None):

        self.config = experiment_config
        self.configs = {'experiment_config': self.config}
        self.name = ''
        print('In SQLManager.__init__')
        self.init_params(label_encoder=label_encoder)

    def init_params(self, label_encoder):
        self.tfrecord_root_dir = self.config['tfrecord_root_dir']
        self.model_dir = self.config['model_dir']
        self.data_db_path = self.config['data_db_path']
        self.db = None
        if label_encoder is None:
            self.encoder = LabelEncoder()
        else:
            self.encoder = label_encoder

        if 'label_encodings_filepath' in self.config:
            assert validate_filepath(self.config['label_encodings_filepath'],
                                     file_type='json')
            self.label_encodings_filepath = self.config[
                'label_encodings_filepath']
        else:
            self.label_encodings_filepath = os.path.join(
                self.model_dir, f'{self.name}-label_encodings.json')
        self.config['label_encodings_filepath'] = self.label_encodings_filepath

    def extract(self, dataset_names=''):
        '''
        Query all filenames and labels associated with dataset_names

        Arguments:
            dataset_names, str:
                '+'-delimited string of individual dataset names to load into one dataframe

        Return:
            data, pd.DataFrame:
                DataFrame containing columns ['path','label','dataset']
        '''
        dataset_names = dataset_names.split('+')
        self.db_df = self.db_query(dataset_names=dataset_names)
        self.target_size = self.config.target_size
        self.num_channels = self.config.num_channels
        return self.db_df

    def transform(self, verbose=False):
        self.x, self.y = self.db_filter(self.db_df, verbose=verbose)
        self.data_splits, self.metadata_splits = self.split_data(
            self.x, self.y)
        self.num_classes = self.metadata_splits['train']['num_classes']
        self.config.num_classes = self.num_classes
        self.label_encodings = self.encoder.get_encodings()
        return self.data_splits

    def load(self):
        self.dataset_builder = DatasetBuilder(root_dir=self.tfrecord_root_dir,
                                              num_classes=self.num_classes)

        self.coder, self.tfrecord_files = self.stage_tfrecords()
        return self.tfrecord_files

    # TODO: Refactor starting from db_query() to accept an arbitrary number of datasets to be queried and concatenated together

    def open_db_connection(self):
        '''
        Returns an open connection to the db, creating it if it doesn't yet exist
        '''
        if not self.db:
            self.local_db = leavesdb.init_local_db(src_db=self.data_db_path)
            self.db = dataset.connect(f'sqlite:///{self.local_db}',
                                      row_type=stuf)
        return self.db

    def load_data(self,
                  db,
                  datasets=['Fossil', 'Leaves'],
                  x_col='path',
                  y_col='family',
                  keep_cols=['dataset']):

        data_df = pd.DataFrame(db['dataset'].all())
        data = []
        columns = [x_col, y_col, *keep_cols]
        for name in datasets:
            data += [data_df[data_df.loc[:, 'dataset'] == name]]
        data = pd.concat(data)
        data = data.loc[:, columns]

        return data

    def db_query(self, dataset_names=['Fossil'], label_col='family'):
        '''
        Query all filenames and labels associated with dataset_name

        Arguments:
            dataset_names, list(str):
                list of individual dataset names to load into one dataframe

        Return:
            data, pd.DataFrame:
                DataFrame containing columns ['path','label','dataset']
        '''
        db = self.open_db_connection()
        data = self.load_data(db,
                              datasets=dataset_names,
                              x_col='path',
                              y_col=label_col)
        return data

    def db_filter(self, db_df, label_col='family', verbose=False):
        '''
        Function to apply preprocessing to output of db_query, prior to conversion of images to TFRecord.

        '''
        threshold = self.config.low_class_count_thresh
        db_df = filter_low_count_labels(db_df,
                                        threshold=threshold,
                                        verbose=verbose)

        if len(self.encoder) == 0:
            self.encoder.merge_labels(labels=list(db_df[label_col]))
        self.encoder.save_labels(self.config['label_encodings_filepath'])

        db_df = self.encoder.filter(db_df, label_col=label_col)

        self.x = db_df['path'].values.reshape((-1, 1))
        self.y = np.array(self.encoder.transform(db_df[label_col]))

        return self.x, self.y

    def split_data(self, x, y, verbose=False):
        '''
        Function to split data into k splits. Currently, the default is to simply split into train/val/test sets
        '''
        val_size = self.config.data_splits['val_size']
        test_size = self.config.data_splits['test_size']

        self.data_splits = train_val_test_split(x,
                                                y,
                                                val_size=val_size,
                                                test_size=test_size)

        self.metadata_splits = get_data_splits_metadata(self.data_splits,
                                                        self.db_df,
                                                        encoder=self.encoder,
                                                        verbose=verbose)
        return self.data_splits, self.metadata_splits

    def get_class_counts(self):
        class_count_splits = {}
        for subset, subset_data in self.data_splits.items():
            print(subset)
            if type(subset_data['path']) == np.ndarray:
                subset_data['path'] = subset_data['path'].flatten().tolist()
            labels, label_counts = get_class_counts(
                pd.DataFrame.from_dict(subset_data))
            class_count_splits[subset] = {
                l: c
                for l, c in zip(labels, label_counts)
            }
        return class_count_splits

    def stage_tfrecords(self, verbose=False):
        '''
        Looks for TFRecords corresponding to the DatasetConfig parameters; if they don't exist, proceeds to create them.
        '''
        self.root_dir = self.tfrecord_root_dir
        dataset_name = self.config.dataset_name
        val_size = self.config.data_splits['val_size']
        test_size = self.config.data_splits['test_size']

        #Store records in subdirectories labeled with relevant metadata
        record_subdirs = [
            dataset_name,
            f'num_channels-3_thresh-{self.config.low_class_count_thresh}',
            f'val_size={val_size}-test_size={test_size}'
        ]

        tfrecords = self.dataset_builder.recursive_search(
            self.root_dir, subdirs=record_subdirs, verbose=verbose)
        if tfrecords is None:
            return create_tfrecords(self.config,
                                    record_subdirs,
                                    data_splits=self.data_splits,
                                    metadata_splits=self.metadata_splits)
        else:
            coder = TFRecordCoder(self.data_splits['train'],
                                  self.root_dir,
                                  record_subdirs=record_subdirs,
                                  target_size=self.target_size,
                                  num_channels=self.num_channels,
                                  num_classes=self.num_classes)
            return coder, tfrecords
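db_filter and split_data together drop rare classes, encode the remaining labels, and split the data into train/val/test. A self-contained pandas/scikit-learn sketch of that sequence; the threshold, split sizes, and toy data are illustrative, and sklearn's LabelEncoder stands in for the project's encoder.

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Toy data standing in for the output of db_query()
db_df = pd.DataFrame({
    'path': [f'img_{i}.jpg' for i in range(10)],
    'family': ['A'] * 5 + ['B'] * 4 + ['C'] * 1,
})

# Drop classes with fewer samples than the threshold (here 'C')
threshold = 2
counts = db_df['family'].value_counts()
db_df = db_df[db_df['family'].isin(counts[counts >= threshold].index)]

# Encode the remaining labels as integers
encoder = LabelEncoder()
y = encoder.fit_transform(db_df['family'])
x = db_df['path'].values.reshape((-1, 1))

# Two-stage split: carve out a test set, then a val set from what remains
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25, random_state=0)
print(len(x_train), len(x_val), len(x_test))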