Example 1
def run_model(DecisionTree_params, category):
    """Full-scale training, validation and testing using all amines.
    Args:
        DecisionTree_params:         A dictionary of the parameters for the decision tree model.
                                        See initialize() for more information.
        category:                    A string representing the category the model is classified under.
    """

    # Feature names hard-coded for decision tree visualization
    features = [
        '_rxn_M_acid', '_rxn_M_inorganic', '_rxn_M_organic', '_solv_GBL',
        '_solv_DMSO', '_solv_DMF', '_stoich_mmol_org', '_stoich_mmol_inorg',
        '_stoich_mmol_acid', '_stoich_mmol_solv', '_stoich_org/solv',
        '_stoich_inorg/solv', '_stoich_acid/solv', '_stoich_org+inorg/solv',
        '_stoich_org+inorg+acid/solv', '_stoich_org/liq', '_stoich_inorg/liq',
        '_stoich_org+inorg/liq', '_stoich_org/inorg', '_stoich_acid/inorg',
        '_rxn_Temperature_C', '_rxn_Reactiontime_s', '_feat_AvgPol',
        '_feat_Refractivity', '_feat_MaximalProjectionArea',
        '_feat_MaximalProjectionRadius', '_feat_maximalprojectionsize',
        '_feat_MinimalProjectionArea', '_feat_MinimalProjectionRadius',
        '_feat_minimalprojectionsize', '_feat_MolPol',
        '_feat_VanderWaalsSurfaceArea', '_feat_ASA', '_feat_ASA_H',
        '_feat_ASA_P', '_feat_ASA-', '_feat_ASA+',
        '_feat_ProtPolarSurfaceArea', '_feat_Hacceptorcount',
        '_feat_Hdonorcount', '_feat_RotatableBondCount',
        '_raw_standard_molweight', '_feat_AtomCount_N', '_feat_BondCount',
        '_feat_ChainAtomCount', '_feat_RingAtomCount', '_feat_primaryAmine',
        '_feat_secondaryAmine', '_rxn_plateEdgeQ', '_feat_maxproj_per_N',
        '_raw_RelativeHumidity'
    ]
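    # export_graphviz below expects one feature name per input column;
    # the list above holds all 51 of them.
    assert len(features) == 51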

    # Unload common parameters
    config = DecisionTree_params['configs'][category] if DecisionTree_params[
        'configs'] else None
    verbose = DecisionTree_params['verbose']
    warning = DecisionTree_params['warning']
    stats_path = DecisionTree_params['stats_path']
    result_dict = DecisionTree_params['result_dict']

    model_name = DecisionTree_params['model_name']
    print(f'Running model {model_name}')

    # Unload the training data specific parameters
    num_draws = DecisionTree_params['num_draws']
    train_size = DecisionTree_params['train_size']
    active_learning_iter = DecisionTree_params['active_learning_iter']
    cross_validation = DecisionTree_params['cross_validate']
    full = DecisionTree_params['full_dataset']
    active_learning = DecisionTree_params['active_learning']
    w_hx = DecisionTree_params['with_historical_data']
    w_k = DecisionTree_params['with_k']
    draw_success = DecisionTree_params['draw_success']

    # Specify the desired operation
    fine_tuning = DecisionTree_params['fine_tuning']
    save_model = DecisionTree_params['save_model']
    visualize = DecisionTree_params['visualize']
    to_file = True

    if fine_tuning:
        class_weights = [{
            0: i,
            1: 1.0 - i
        } for i in np.linspace(.05, .95, num=50)]
        class_weights.append('balanced')
        class_weights.append(None)

        max_depths = [i for i in range(9, 26)]
        max_depths.append(None)

        ft_params = {
            'criterion': ['gini', 'entropy'],
            'splitter': ['best', 'random'],
            'max_depth': max_depths,
            'min_samples_split': [i for i in range(2, 11)],
            'min_samples_leaf': [i for i in range(1, 4)],
            'class_weight': class_weights
        }

        result_path = './results/ft_{}.pkl'.format(model_name)

        grid_search(ActiveDecisionTree,
                    ft_params,
                    result_path,
                    num_draws,
                    train_size,
                    active_learning_iter,
                    active_learning=active_learning,
                    w_hx=w_hx,
                    w_k=w_k,
                    draw_success=draw_success,
                    result_dict=result_dict,
                    model_name=model_name)

    else:
        # Load the desired sized dataset under desired option
        dataset = process_dataset(num_draw=num_draws,
                                  train_size=train_size,
                                  active_learning_iter=active_learning_iter,
                                  verbose=verbose,
                                  cross_validation=cross_validation,
                                  full=full,
                                  active_learning=active_learning,
                                  w_hx=w_hx,
                                  w_k=w_k,
                                  success=draw_success)

        draws = list(dataset.keys())
        amine_list = list(dataset[0]['x_t'].keys())

        for amine in amine_list:
            # Create the decision tree model instance for the specific amine
            ADT = ActiveDecisionTree(amine=amine,
                                     config=config,
                                     verbose=verbose,
                                     stats_path=stats_path,
                                     result_dict=result_dict,
                                     model_name=model_name)
            for set_id in draws:
                # Unload the randomly drawn dataset values
                x_t, y_t, x_v, y_v, all_data, all_labels = dataset[set_id]['x_t'], \
                                                           dataset[set_id]['y_t'], \
                                                           dataset[set_id]['x_v'], \
                                                           dataset[set_id]['y_v'], \
                                                           dataset[set_id]['all_data'], \
                                                           dataset[set_id]['all_labels']

                # Load the training and validation set into the model
                ADT.load_dataset(set_id, x_t[amine], y_t[amine], x_v[amine],
                                 y_v[amine], all_data[amine],
                                 all_labels[amine])

                # Train the data on the training set
                ADT.train(warning=warning)

                # Conduct active learning with all the observations available in the pool
                if active_learning:
                    ADT.active_learning(num_iter=active_learning_iter,
                                        warning=warning)

                if visualize:
                    # Plot the decision tree
                    # To compile the graph, use the following command in terminal
                    # dot -Tpng "{dt_file_name}.dot" -o "{desired file name}.png"
                    # If using Jupyter Notebook, add ! in front to run command lines
                    file_name = './results/{0:s}_dt_{1:s}_{2:d}.dot'.format(
                        model_name, amine, set_id)
                    export_graphviz(ADT.model,
                                    feature_names=features,
                                    class_names=['FAILURE', 'SUCCESS'],
                                    out_file=file_name,
                                    filled=True,
                                    rounded=True,
                                    special_characters=True)

            if to_file:
                ADT.store_metrics_to_file()

            # Save the model for future reproducibility
            if save_model:
                ADT.save_model(model_name)
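
The visualize branch above only exports .dot files; compiling them into images is left to the dot CLI mentioned in the inline comments. A minimal sketch of batch-compiling every exported tree, assuming Graphviz is installed and the files live under ./results/:

import glob
import subprocess

# Compile each exported decision tree to a PNG with the same base name.
for dot_file in glob.glob('./results/*_dt_*.dot'):
    png_file = dot_file[:-len('.dot')] + '.png'
    subprocess.run(['dot', '-Tpng', dot_file, '-o', png_file], check=True)
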
Example 2
def grid_search(clf,
                combinations,
                path,
                num_draws,
                train_size,
                active_learning_iter,
                active_learning=True,
                w_hx=True,
                w_k=True,
                draw_success=False,
                model_name=''):
    """Fine tune the model based on average bcr performance to find the best model hyper-parameters.

    Similar to GridSearchCV in scikit-learn package, we try out all the combinations and evaluate performance
        across all amine-specific models under different categories.

    Args:
        clf:                        A class object representing the classifier being fine tuned.
        combinations:               A list of dictionaries representing the possible hyper-parameter values to try out.
        path:                       A string representing the file path used to store the statistics of all
                                        combinations tried during one stage of fine tuning.
        num_draws:                  An integer representing the number of random draws used to create the dataset.
        train_size:                 An integer representing the number of amine-specific experiments used for training.
                                        Corresponds to the k in the category description.
        active_learning_iter:       An integer representing the number of iterations in an active learning loop.
                                        Corresponds to the x in the category description.
        active_learning:            A boolean representing if active learning will be involved in testing or not.
        w_hx:                       A boolean representing if the models are trained with historical data or not.
        w_k:                        A boolean representing if the models are trained with amine-specific experiments.
        draw_success:               A boolean representing if the models are trained on regular randomly-drawn datasets
                                        or random datasets with at least one success for each amine.
        model_name:                 A string representing the name of the model being fine tuned.

    Returns:
        None. The averaged performance statistics of every configuration tried are logged in ft_log and saved to
            the pickle file at path.
    """

    # Load or initialize dictionary to keep all configurations' performances
    if os.path.exists(path):
        with open(path, 'rb') as f:
            ft_log = pickle.load(f)
    else:
        ft_log = defaultdict(dict)

    if model_name not in ft_log:
        ft_log[model_name] = defaultdict(dict)

    # Load the full dataset under specific categorical option
    dataset = process_dataset(num_draw=num_draws,
                              train_size=train_size,
                              active_learning_iter=active_learning_iter,
                              verbose=False,
                              cross_validation=True,
                              full=True,
                              active_learning=active_learning,
                              w_hx=w_hx,
                              w_k=w_k,
                              success=draw_success)

    draws = list(dataset.keys())
    amine_list = list(dataset[0]['x_t'].keys())

    if 'Default' not in ft_log[model_name]:
        # Set baseline performance
        base_accuracies = []
        base_precisions = []
        base_recalls = []
        base_bcrs = []

        for amine in amine_list:
            if amine == 'XZUCBFLUEBDNSJ-UHFFFAOYSA-N' and draw_success:
                # Skipping the amine with only 1 successful experiment overall
                # Can't run 4-ii and 5-ii models on this amine
                continue
            else:
                ACLF = clf(amine=amine, verbose=False)

                for set_id in draws:
                    # Unload the randomly drawn dataset values
                    x_t, y_t, x_v, y_v, all_data, all_labels = dataset[set_id]['x_t'], \
                                                               dataset[set_id]['y_t'], \
                                                               dataset[set_id]['x_v'], \
                                                               dataset[set_id]['y_v'], \
                                                               dataset[set_id]['all_data'], \
                                                               dataset[set_id]['all_labels']

                    # Load the training and validation set into the model
                    ACLF.load_dataset(set_id, x_t[amine], y_t[amine],
                                      x_v[amine], y_v[amine], all_data[amine],
                                      all_labels[amine])

                    # Train the data on the training set
                    ACLF.train(warning=False)

                ACLF.find_inner_avg()

                base_accuracies.append(
                    ACLF.metrics['average']['accuracies'][-1])
                base_precisions.append(
                    ACLF.metrics['average']['precisions'][-1])
                base_recalls.append(ACLF.metrics['average']['recalls'][-1])
                base_bcrs.append(ACLF.metrics['average']['bcrs'][-1])

        # Calculate the average baseline performances
        ft_log[model_name]['Default']['accuracies'] = sum(
            base_accuracies) / len(base_accuracies)
        ft_log[model_name]['Default']['precisions'] = sum(
            base_precisions) / len(base_precisions)
        ft_log[model_name]['Default']['recalls'] = sum(base_recalls) / len(
            base_recalls)
        ft_log[model_name]['Default']['bcrs'] = sum(base_bcrs) / len(base_bcrs)

    # Try out each possible combination of hyper-parameters
    for option in combinations:
        accuracies = []
        precisions = []
        recalls = []
        bcrs = []

        for amine in amine_list:
            if amine == 'XZUCBFLUEBDNSJ-UHFFFAOYSA-N' and draw_success:
                # Skipping the amine with only 1 successful experiment overall
                # Can't run 4-ii and 5-ii models on this amine
                continue
            else:
                # print("Training and cross validation on {} amine.".format(amine))
                ACLF = clf(amine=amine, config=option, verbose=False)

                for set_id in draws:
                    # Unload the randomly drawn dataset values
                    x_t, y_t, x_v, y_v, all_data, all_labels = dataset[set_id]['x_t'], \
                                                               dataset[set_id]['y_t'], \
                                                               dataset[set_id]['x_v'], \
                                                               dataset[set_id]['y_v'], \
                                                               dataset[set_id]['all_data'], \
                                                               dataset[set_id]['all_labels']

                    # Load the training and validation set into the model
                    ACLF.load_dataset(set_id, x_t[amine], y_t[amine],
                                      x_v[amine], y_v[amine], all_data[amine],
                                      all_labels[amine])

                    # Train the data on the training set
                    ACLF.train(warning=False)

                ACLF.find_inner_avg()

                accuracies.append(ACLF.metrics['average']['accuracies'][-1])
                precisions.append(ACLF.metrics['average']['precisions'][-1])
                recalls.append(ACLF.metrics['average']['recalls'][-1])
                bcrs.append(ACLF.metrics['average']['bcrs'][-1])

        ft_log[model_name][str(
            option)]['accuracies'] = sum(accuracies) / len(accuracies)
        ft_log[model_name][str(
            option)]['precisions'] = sum(precisions) / len(precisions)
        ft_log[model_name][str(
            option)]['recalls'] = sum(recalls) / len(recalls)
        ft_log[model_name][str(option)]['bcrs'] = sum(bcrs) / len(bcrs)

    # Save the fine tuning performances to pkl
    with open(path, 'wb') as f:
        pickle.dump(ft_log, f)
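
This grid_search variant expects combinations to be passed in already expanded (the variant in Example 3 below builds them internally). A minimal sketch of building that list from a parameter grid with itertools.product; the grid values here are illustrative only:

import itertools

ft_params = {
    'n_neighbors': [1, 3, 5],  # illustrative values, not the repo's actual grid
    'p': [1, 2],
}

keys, values = zip(*ft_params.items())
combinations = [dict(zip(keys, bundle)) for bundle in itertools.product(*values)]
# -> 6 dicts: {'n_neighbors': 1, 'p': 1}, {'n_neighbors': 1, 'p': 2}, ...
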
Example 3
def grid_search(clf,
                ft_params,
                path,
                num_draws,
                train_size,
                active_learning_iter,
                active_learning=True,
                w_hx=True,
                w_k=True,
                draw_success=False,
                random=False,
                random_size=10,
                result_dict=None,
                model_name=''):
    """Fine tune the model based on average bcr performance to find the best model hyper-parameters.

    Similar to GridSearchCV in scikit-learn package, we try out all the combinations and evaluate performance
        across all amine-specific models under different categories.

    Args:
        clf:                        A class object representing the classifier being fine tuned.
        ft_params:                  A dictionary representing the possible hyper-parameter values to try out.
        path:                       A string representing the file path used to store the statistics of all
                                        combinations tried during one stage of fine tuning.
        num_draws:                  An integer representing the number of random draws used to create the dataset.
        train_size:                 An integer representing the number of amine-specific experiments used for training.
                                        Corresponds to the k in the category description.
        active_learning_iter:       An integer representing the number of iterations in an active learning loop.
                                        Corresponds to the x in the category description.
        active_learning:            A boolean representing if active learning will be involved in testing or not.
        w_hx:                       A boolean representing if the models are trained with historical data or not.
        w_k:                        A boolean representing if the models are trained with amine-specific experiments.
        draw_success:               A boolean representing if the models are trained on regular randomly-drawn datasets
                                        or random datasets with at least one success for each amine.
        random:                     A boolean representing if we want to do random search or not.
        random_size:                An integer representing the number of random combinations to try and compare.
        result_dict:                A dictionary representing the result dictionary used during multi-thread processing.
        model_name:                 A string representing the name of the model being fine tuned.

    Returns:
        None. The averaged performance statistics of every configuration tried are logged in ft_log, which is
            saved to the pickle file at path (or kept in result_dict when multi-processing).
    """

    # Load or initialize dictionary to keep all configurations' performances
    if result_dict:
        ft_log = result_dict
    elif os.path.exists(path):
        with open(path, 'rb') as f:
            ft_log = pickle.load(f)
    else:
        ft_log = defaultdict(dict)

    if model_name not in ft_log:
        ft_log[model_name] = defaultdict(dict)

    # Set all possible combinations
    combinations = []

    keys, values = zip(*ft_params.items())
    for bundle in itertools.product(*values):
        combinations.append(dict(zip(keys, bundle)))

    # Random search if we are not searching through the whole grid
    # (np.random.choice samples with replacement by default)
    if random:
        combinations = list(np.random.choice(combinations, size=random_size))

    # Load the full dataset under specific categorical option
    dataset = process_dataset(num_draw=num_draws,
                              train_size=train_size,
                              active_learning_iter=active_learning_iter,
                              verbose=False,
                              cross_validation=True,
                              full=True,
                              active_learning=active_learning,
                              w_hx=w_hx,
                              w_k=w_k,
                              success=draw_success)

    draws = list(dataset.keys())
    amine_list = list(dataset[0]['x_t'].keys())

    # Set baseline performance
    base_accuracies = []
    base_precisions = []
    base_recalls = []
    base_bcrs = []

    # Log the starting time of fine tuning
    start_time = time.time()

    for amine in amine_list:
        if amine == 'XZUCBFLUEBDNSJ-UHFFFAOYSA-N' and draw_success:
            # Skipping the amine with only 1 successful experiment overall
            # Can't run 4-ii and 5-ii models on this amine
            continue
        else:
            ACLF = clf(amine=amine, verbose=False)

            for set_id in draws:
                # Unload the randomly drawn dataset values
                x_t, y_t, x_v, y_v, all_data, all_labels = dataset[set_id]['x_t'], \
                                                           dataset[set_id]['y_t'], \
                                                           dataset[set_id]['x_v'], \
                                                           dataset[set_id]['y_v'], \
                                                           dataset[set_id]['all_data'], \
                                                           dataset[set_id]['all_labels']

                # Load the training and validation set into the model
                ACLF.load_dataset(set_id, x_t[amine], y_t[amine], x_v[amine],
                                  y_v[amine], all_data[amine],
                                  all_labels[amine])

                # Train the data on the training set
                ACLF.train(warning=False)

            ACLF.find_inner_avg()

            base_accuracies.append(ACLF.metrics['average']['accuracies'][-1])
            base_precisions.append(ACLF.metrics['average']['precisions'][-1])
            base_recalls.append(ACLF.metrics['average']['recalls'][-1])
            base_bcrs.append(ACLF.metrics['average']['bcrs'][-1])

    # Calculate the average baseline performances
    ft_log[model_name]['Default']['accuracies'] = sum(base_accuracies) / len(
        base_accuracies)
    ft_log[model_name]['Default']['precisions'] = sum(base_precisions) / len(
        base_precisions)
    ft_log[model_name]['Default']['recalls'] = sum(base_recalls) / len(
        base_recalls)
    ft_log[model_name]['Default']['bcrs'] = sum(base_bcrs) / len(base_bcrs)

    # Try out each possible combination of hyper-parameters
    print(f'There are {len(combinations)} combinations to try.')
    for option in combinations:
        accuracies = []
        precisions = []
        recalls = []
        bcrs = []

        for amine in amine_list:
            if amine == 'XZUCBFLUEBDNSJ-UHFFFAOYSA-N' and draw_success:
                # Skipping the amine with only 1 successful experiment overall
                # Can't run 4-ii and 5-ii models on this amine
                continue
            else:
                # print("Training and cross validation on {} amine.".format(amine))
                ACLF = clf(amine=amine, config=option, verbose=False)

                for set_id in draws:
                    # Unload the randomly drawn dataset values
                    x_t, y_t, x_v, y_v, all_data, all_labels = dataset[set_id]['x_t'], \
                                                               dataset[set_id]['y_t'], \
                                                               dataset[set_id]['x_v'], \
                                                               dataset[set_id]['y_v'], \
                                                               dataset[set_id]['all_data'], \
                                                               dataset[set_id]['all_labels']

                    # Load the training and validation set into the model
                    ACLF.load_dataset(set_id, x_t[amine], y_t[amine],
                                      x_v[amine], y_v[amine], all_data[amine],
                                      all_labels[amine])

                    # Train the data on the training set
                    ACLF.train(warning=False)

                ACLF.find_inner_avg()

                accuracies.append(ACLF.metrics['average']['accuracies'][-1])
                precisions.append(ACLF.metrics['average']['precisions'][-1])
                recalls.append(ACLF.metrics['average']['recalls'][-1])
                bcrs.append(ACLF.metrics['average']['bcrs'][-1])

        ft_log[model_name][str(
            option)]['accuracies'] = sum(accuracies) / len(accuracies)
        ft_log[model_name][str(
            option)]['precisions'] = sum(precisions) / len(precisions)
        ft_log[model_name][str(
            option)]['recalls'] = sum(recalls) / len(recalls)
        ft_log[model_name][str(option)]['bcrs'] = sum(bcrs) / len(bcrs)

    # Find the total time used for fine tuning
    end_time = time.time()
    time_lapsed = end_time - start_time

    # Make time used more readable
    days = int(time_lapsed / 86400)
    hours = int((time_lapsed - (86400 * days)) / 3600)
    minutes = int((time_lapsed - (86400 * days) - (3600 * hours)) / 60)
    seconds = round(
        time_lapsed - (86400 * days) - (3600 * hours) - (minutes * 60), 2)
    per_combo = round(time_lapsed / (len(combinations)), 4)

    print(f'Fine tuning for {model_name} completed.')
    print(
        f'Total time used: {days} days {hours} hours {minutes} minutes {seconds} seconds.'
    )
    print(f'Or about {per_combo} seconds per combination.')

    # Save the fine tuning performances to pkl if not multi-processing
    if not result_dict:
        with open(path, 'wb') as f:
            pickle.dump(ft_log, f)
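
Note that neither this grid_search nor the one in Example 2 actually returns best_option; the statistics are persisted to the pickle at path. A minimal sketch (a hypothetical helper, not part of the repo) of recovering the best configuration by average bcr from that file afterwards:

import pickle

def best_option_by_bcr(path, model_name):
    """Hypothetical helper: return the str(option) key with the highest average bcr."""
    with open(path, 'rb') as f:
        ft_log = pickle.load(f)
    # Each entry maps str(option) (plus 'Default') to its averaged metrics.
    return max(ft_log[model_name], key=lambda opt: ft_log[model_name][opt]['bcrs'])
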
Example 4
def run_model(RandomForest_params, category):
    """Full-scale training, validation and testing using all amines.
    Args:
        RandomForest_params:         A dictionary of the parameters for the random forest model.
                                        See initialize() for more information.
        category:                    A string representing the category the model is classified under.
    """

    # Unload common parameters
    config = RandomForest_params['config'][category] if RandomForest_params[
        'config'] else None
    verbose = RandomForest_params['verbose']
    warning = RandomForest_params['warning']
    stats_path = RandomForest_params['stats_path']
    result_dict = RandomForest_params['result_dict']

    model_name = RandomForest_params['model_name']
    print(f'Running model {model_name}')

    # Unload the training data specific parameters
    num_draws = RandomForest_params['num_draws']
    train_size = RandomForest_params['train_size']
    cross_validation = RandomForest_params['cross_validate']
    active_learning = RandomForest_params['active_learning']
    w_hx = RandomForest_params['with_historical_data']
    w_k = RandomForest_params['with_k']
    active_learning_iter = RandomForest_params['active_learning_iter']
    full = RandomForest_params['full_dataset']
    draw_success = RandomForest_params['draw_success']

    # Specify the desired operation
    fine_tuning = RandomForest_params['fine_tuning']
    save_model = RandomForest_params['save_model']
    to_file = True

    if fine_tuning:
        class_weights = [{
            0: i,
            1: 1.0 - i
        } for i in np.linspace(.05, .95, num=50)]
        class_weights.append('balanced')
        class_weights.append(None)

        ft_params = {
            'n_estimators': [100, 200, 500, 1000],
            'criterion': ['gini', 'entropy'],
            'max_depth': [i for i in range(1, 9)],
            'max_features': ['auto', 'sqrt', 'log2', None],
            'bootstrap': [True],
            'min_samples_leaf': [i for i in range(1, 6)],
            'min_samples_split': [i for i in range(2, 11)],
            'ccp_alpha': [.1 * i for i in range(1)],  # range(1) -> [0.0], i.e. no pruning
            'class_weight': class_weights
        }

        result_path = './results/ft_{}.pkl'.format(model_name)

        grid_search(ActiveRandomForest,
                    ft_params,
                    result_path,
                    num_draws,
                    train_size,
                    active_learning_iter,
                    active_learning=active_learning,
                    w_hx=w_hx,
                    w_k=w_k,
                    draw_success=draw_success,
                    result_dict=result_dict,
                    model_name=model_name)

    else:
        # Load the desired sized dataset under desired option
        dataset = process_dataset(num_draw=num_draws,
                                  train_size=train_size,
                                  active_learning_iter=active_learning_iter,
                                  verbose=verbose,
                                  cross_validation=cross_validation,
                                  full=full,
                                  active_learning=active_learning,
                                  w_hx=w_hx,
                                  w_k=w_k,
                                  success=draw_success)

        draws = list(dataset.keys())
        amine_list = list(dataset[0]['x_t'].keys())

        for amine in amine_list:

            # Create the RandomForest model instance for the specific amine
            ARF = ActiveRandomForest(amine=amine,
                                     config=config,
                                     verbose=verbose,
                                     stats_path=stats_path,
                                     result_dict=result_dict,
                                     model_name=model_name)

            for set_id in draws:
                # Unload the randomly drawn dataset values
                x_t, y_t, x_v, y_v, all_data, all_labels = dataset[set_id]['x_t'], \
                                                           dataset[set_id]['y_t'], \
                                                           dataset[set_id]['x_v'], \
                                                           dataset[set_id]['y_v'], \
                                                           dataset[set_id]['all_data'], \
                                                           dataset[set_id]['all_labels']

                # Load the training and validation set into the model
                ARF.load_dataset(set_id, x_t[amine], y_t[amine], x_v[amine],
                                 y_v[amine], all_data[amine],
                                 all_labels[amine])

                # Train the data on the training set
                ARF.train(warning=warning)

                # Conduct active learning with all the observations available in the pool
                if active_learning:
                    ARF.active_learning(num_iter=active_learning_iter,
                                        warning=warning)

            if to_file:
                ARF.store_metrics_to_file()

            # Save the model for future reproducibility
            if save_model:
                ARF.save_model(model_name)
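
The grid above is large: 52 class_weight options (50 weight dicts plus 'balanced' and None) multiplied across the other lists. A quick way to count the combinations before committing to an exhaustive search (assumes Python 3.8+ for math.prod):

import math

# 4 * 2 * 8 * 4 * 1 * 5 * 9 * 1 * 52 = 599040 combinations for the grid above
n_combinations = math.prod(len(v) for v in ft_params.values())
print(n_combinations)
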
Example 5
def run_model(LinearSVM_params, category):
    """Full-scale training, validation and testing using all amines.

    Args:
        LinearSVM_params:         A dictionary of the parameters for the LinearSVM model.
                                See initialize() for more information.
        category:           A string representing the category the model is classified under.
     """

    # Unload common parameters
    config = LinearSVM_params['configs'][category] if LinearSVM_params[
        'configs'] else None
    verbose = LinearSVM_params['verbose']
    warning = LinearSVM_params['warning']
    stats_path = LinearSVM_params['stats_path']

    model_name = LinearSVM_params['model_name']
    print(f'Running model {model_name}')

    # Unload the training data specific parameters
    train_size = LinearSVM_params['train_size']
    active_learning_iter = LinearSVM_params['active_learning_iter']
    cross_validation = LinearSVM_params['cross_validate']
    full = LinearSVM_params['full_dataset']
    active_learning = LinearSVM_params['active_learning']
    w_hx = LinearSVM_params['with_historical_data']
    w_k = LinearSVM_params['with_k']

    # Specify the desired operation
    fine_tuning = LinearSVM_params['fine_tuning']
    save_model = LinearSVM_params['save_model']
    to_params = True

    if fine_tuning:
        class_weights = [{
            0: i,
            1: 1.0 - i
        } for i in np.linspace(.1, .9, num=9)]
        class_weights.append('balanced')
        class_weights.append(None)

        ft_params = {
            # 'penalty': ['l1', 'l2'],
            'penalty': ['l1'],
            # 'loss': ['hinge', 'squared_hinge'],
            'loss': ['squared_hinge'],
            'dual': [False],
            # 'C': [.001, .01, .1, 1, 10],
            'C': [i for i in np.linspace(0.001, 0.01, num=10)],
            # 'tol': [.0001, .001, .01, .1, 1],
            'tol': [i for i in np.linspace(0.01, 0.1, num=10)],
            'fit_intercept': [True],
            'class_weight': class_weights,
        }

        _ = grid_search(ActiveLinearSVM,
                        ft_params,
                        train_size,
                        active_learning_iter,
                        active_learning=active_learning,
                        w_hx=w_hx,
                        w_k=w_k,
                        info=True)
    else:
        # Load the desired sized dataset under desired option
        amine_list, x_t, y_t, x_v, y_v, all_data, all_labels = process_dataset(
            train_size=train_size,
            active_learning_iter=active_learning_iter,
            verbose=verbose,
            cross_validation=cross_validation,
            full=full,
            active_learning=active_learning,
            w_hx=w_hx,
            w_k=w_k)

        # print(amine_list)
        for amine in amine_list:
            if cross_validation:
                # print("Training and cross validation on {} amine.".format(amine))

                # Create the LinearSVM model instance for the specific amine
                ALSVM = ActiveLinearSVM(amine=amine,
                                        config=config,
                                        verbose=verbose,
                                        stats_path=stats_path,
                                        model_name=model_name)

                # Load the training and validation set into the model
                ALSVM.load_dataset(x_t[amine], y_t[amine], x_v[amine],
                                   y_v[amine], all_data[amine],
                                   all_labels[amine])

                # Train the data on the training set
                ALSVM.train(warning=warning)

                # Conduct active learning with all the observations available in the pool
                if active_learning:
                    ALSVM.active_learning(num_iter=active_learning_iter,
                                          warning=warning,
                                          to_params=to_params)
                else:
                    ALSVM.store_metrics_to_params()

                # Save the model for future reproducibility
                if save_model:
                    ALSVM.save_model(model_name)
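
The class_weight grids used throughout these examples all follow the pattern above. Expanded, the list looks like this (a quick illustration, assuming numpy is imported as np):

import numpy as np

class_weights = [{0: i, 1: 1.0 - i} for i in np.linspace(.1, .9, num=9)]
class_weights.append('balanced')
class_weights.append(None)

print(len(class_weights))  # 11 options: 9 weight dicts, 'balanced', and None
print(class_weights[0])    # weights class 0 (failure) at 0.1 and class 1 (success) at 0.9
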
Example 6
def run_model(SVM_params, category):
    """Full-scale training, validation and testing using all amines.

    Args:
        SVM_params:         A dictionary of the parameters for the SVM model.
                                See initialize() for more information.
        category:           A string representing the category the model is classified under.
     """

    # Unload common parameters
    config = SVM_params['configs'][category] if SVM_params['configs'] else None
    verbose = SVM_params['verbose']
    stats_path = SVM_params['stats_path']

    model_name = SVM_params['model_name']
    print(f'Running model {model_name}')

    # Unload the training data specific parameters
    train_size = SVM_params['train_size']
    active_learning_iter = SVM_params['active_learning_iter']
    cross_validation = SVM_params['cross_validate']
    full = SVM_params['full_dataset']
    active_learning = SVM_params['active_learning']
    w_hx = SVM_params['with_historical_data']
    w_k = SVM_params['with_k']

    # Specify the desired operation
    fine_tuning = SVM_params['fine_tuning']
    save_model = SVM_params['save_model']
    to_params = True

    if fine_tuning:
        w0 = [i for i in np.linspace(.1, .9, num=9)]
        w0.append(1)

        ft_params = {
            '-t': [0, 1, 2, 3],  # kernel type: 0 linear, 1 polynomial, 2 RBF, 3 sigmoid
            '-d': [i for i in range(1, 6)],  # degree of the polynomial kernel
            '-g': [.0001, .001, .01, 1 / 51, .1, 1],  # gamma; 1/51 is 1/num_features, the libsvm default
            '-c': [.0001, .001, .01, .1, 1, 10],  # cost parameter C
            '-m': [4000],  # kernel cache size in MB
            '-w0': w0,  # weight on class 0 (the failure class)
        }

        _ = grid_search(ActiveSVC,
                        ft_params,
                        train_size,
                        active_learning_iter,
                        active_learning=active_learning,
                        w_hx=w_hx,
                        w_k=w_k,
                        info=True)
    else:
        # Load the desired sized dataset under desired option
        amine_list, x_t, y_t, x_v, y_v, all_data, all_labels = process_dataset(
            train_size=train_size,
            active_learning_iter=active_learning_iter,
            verbose=verbose,
            cross_validation=cross_validation,
            full=full,
            active_learning=active_learning,
            w_hx=w_hx,
            w_k=w_k)

        # print(amine_list)
        for amine in amine_list:
            if cross_validation:
                # print("Training and cross validation on {} amine.".format(amine))

                # Create the SVM model instance for the specific amine
                ASVM = ActiveSVC(amine=amine,
                                 config=config,
                                 verbose=verbose,
                                 stats_path=stats_path,
                                 model_name=model_name)

                # Load the training and validation set into the model
                ASVM.load_dataset(x_t[amine], y_t[amine], x_v[amine],
                                  y_v[amine], all_data[amine],
                                  all_labels[amine])

                # Train the data on the training set
                ASVM.train()

                # Conduct active learning with all the observations available in the pool
                if active_learning:
                    ASVM.active_learning(num_iter=active_learning_iter,
                                         to_params=to_params)
                else:
                    ASVM.store_metrics_to_params()

                # Save the model for future reproducibility
                if save_model:
                    ASVM.save_model(model_name)
Example 7
def grid_search(clf,
                params,
                train_size,
                active_learning_iter,
                active_learning=True,
                w_hx=True,
                w_k=True,
                info=False):
    """Fine tune the model based on average bcr performance to find the best model hyper-parameters.

    Similar to GridSearchCV in scikit-learn package, we try out all the combinations and evaluate performance
        across all amine-specific models under different categories.

    Args:
        clf:                        A class object representing the classifier being fine tuned.
        params:                     A dictionary representing the possible hyper-parameter values to try out.
        train_size:                 An integer representing the number of amine-specific experiments used for training.
                                        Corresponds to the k in the category description.
        active_learning_iter:       An integer representing the number of iterations in an active learning loop.
                                        Corresponds to the x in the category description.
        active_learning:            A boolean representing if active learning will be involved in testing or not.
        w_hx:                       A boolean representing if the models are trained with historical data or not.
        w_k:                        A boolean representing if the models are trained with amine-specific experiments.
        info:                       A boolean. Setting it to True will make the function print out additional
                                        information during the fine-tuning stage.
                                        Defaults to False.
    Returns:
        best_option:                A dictionary representing the hyper-parameters that yield the best performance
                                        on average. The keys may vary across models.
    """

    # Set all possible combinations
    combinations = []

    keys, values = zip(*params.items())
    for bundle in itertools.product(*values):
        config_dict = dict(zip(keys, bundle))
        # Skip duplicate configs: the degree flag '-d' only matters for the
        # polynomial kernel ('-t' 1), so keep non-poly kernels only once (with '-d' 1)
        if not (config_dict['-t'] != 1 and config_dict['-d'] != 1):
            config = dict_to_str_config(config_dict)
            combinations.append(config)

    # Load the full dataset under specific categorical option
    amine_list, train_data, train_labels, val_data, val_labels, all_data, all_labels = process_dataset(
        train_size=train_size,
        active_learning_iter=active_learning_iter,
        verbose=False,
        cross_validation=True,
        full=True,
        active_learning=active_learning,
        w_hx=w_hx,
        w_k=w_k)

    # Set baseline performance
    base_accuracies = []
    base_precisions = []
    base_recalls = []
    base_bcrs = []
    base_aucs = []

    for amine in amine_list:
        ACLF = clf(amine=amine, verbose=False)

        # Extract and load the training and validation set into the model
        x_t, y_t = train_data[amine], train_labels[amine]
        x_v, y_v = val_data[amine], val_labels[amine]
        all_task_data, all_task_labels = all_data[amine], all_labels[amine]
        ACLF.load_dataset(x_t, y_t, x_v, y_v, all_task_data, all_task_labels)

        ACLF.train(warning=False)

        # Calculate AUC
        auc = roc_auc_score(ACLF.all_labels, ACLF.y_preds)

        base_accuracies.append(ACLF.metrics['accuracies'][-1])
        base_precisions.append(ACLF.metrics['precisions'][-1])
        base_recalls.append(ACLF.metrics['recalls'][-1])
        base_bcrs.append(ACLF.metrics['bcrs'][-1])
        base_aucs.append(auc)

    # Calculate the average baseline performances
    base_avg_accuracy = sum(base_accuracies) / len(base_accuracies)
    base_avg_precision = sum(base_precisions) / len(base_precisions)
    base_avg_recall = sum(base_recalls) / len(base_recalls)
    base_avg_bcr = sum(base_bcrs) / len(base_bcrs)
    base_avg_auc = sum(base_aucs) / len(base_aucs)

    best_metric = base_avg_auc
    previous_recall = base_avg_recall

    if info:
        print(f'Baseline average accuracy is {base_avg_accuracy}')
        print(f'Baseline average precision is {base_avg_precision}')
        print(f'Baseline average recall is {base_avg_recall}')
        print(f'Baseline average bcr is {base_avg_bcr}')
        print(f'Baseline average auc is {base_avg_auc}')

    best_option = {}

    option_no = 1

    # Try out each possible combination of hyper-parameters
    print(f'There are {len(combinations)} combinations to try.')
    for option in combinations:
        accuracies = []
        precisions = []
        recalls = []
        bcrs = []
        aucs = []

        print(f'Trying option {option_no}')
        option_no += 1
        for amine in amine_list:
            # print("Training and cross validation on {} amine.".format(amine))
            ACLF = clf(amine=amine, config=option, verbose=False)

            # Extract and load the training and validation set into the model
            x_t, y_t = train_data[amine], train_labels[amine]
            x_v, y_v = val_data[amine], val_labels[amine]
            all_task_data, all_task_labels = all_data[amine], all_labels[amine]

            ACLF.load_dataset(x_t, y_t, x_v, y_v, all_task_data,
                              all_task_labels)
            ACLF.train(warning=False)

            # Calculate AUC
            auc = roc_auc_score(ACLF.all_labels, ACLF.y_preds)

            accuracies.append(ACLF.metrics['accuracies'][-1])
            precisions.append(ACLF.metrics['precisions'][-1])
            recalls.append(ACLF.metrics['recalls'][-1])
            bcrs.append(ACLF.metrics['bcrs'][-1])
            aucs.append(auc)

        avg_accuracy = sum(accuracies) / len(accuracies)
        avg_precision = sum(precisions) / len(precisions)
        avg_recall = sum(recalls) / len(recalls)
        avg_bcr = sum(bcrs) / len(bcrs)
        avg_auc = sum(aucs) / len(aucs)

        # Accept an option if its AUC is within .01 of the best seen so far and it improves on recall
        if best_metric - avg_auc < .01 and avg_recall > previous_recall:
            if info:
                print(f'The previous best option is {best_option}')
                print(f'The current best setting is {option}')
                print(
                    f'The fine-tuned average accuracy is {avg_accuracy} vs. the base accuracy {base_avg_accuracy}'
                )
                print(
                    f'The fine-tuned average precision is {avg_precision} vs. the base precision {base_avg_precision}'
                )
                print(
                    f'The fine-tuned average recall rate is {avg_recall} vs. the base recall rate {base_avg_recall}'
                )
                print(
                    f'The fine-tuned average bcr is {avg_bcr} vs. the base bcr {base_avg_bcr}'
                )
                print(
                    f'The fine-tuned average auc is {avg_auc} vs. the base auc {base_avg_auc}'
                )
                print()

            best_metric = avg_auc
            previous_recall = avg_recall
            best_option = option

    if info:
        print()
        print(f'The best setting for all amines is {best_option}')
        print(f'With an average auc of {best_metric}')

    return best_option
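
dict_to_str_config is imported from elsewhere in the repo and is not shown in these examples. Since the keys are libsvm-style flags, one plausible implementation (an assumption, not the repo's actual code) simply joins the flag/value pairs into an option string:

def dict_to_str_config(config_dict):
    # Hypothetical sketch: {'-t': 1, '-d': 3} -> '-t 1 -d 3'
    return ' '.join(f'{flag} {value}' for flag, value in config_dict.items())
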
Example 8
def run_model(GradientBoosting_params, category):
    """Full-scale training, validation and testing using all amines.
    Args:
        GradientBoosting_params:         A dictionary of the parameters for the Gradient Boosting model.
                                            See initialize() for more information.
        category:                        A string representing the category the model is classified under.
    """

    # Unload common parameters
    config = GradientBoosting_params['config'][
        category] if GradientBoosting_params['config'] else None
    verbose = GradientBoosting_params['verbose']
    warning = GradientBoosting_params['warning']
    stats_path = GradientBoosting_params['stats_path']
    result_dict = GradientBoosting_params['result_dict']

    model_name = GradientBoosting_params['model_name']
    print(f'Running model {model_name}')

    # Unload the training data specific parameters
    num_draws = GradientBoosting_params['num_draws']
    train_size = GradientBoosting_params['train_size']
    active_learning_iter = GradientBoosting_params['active_learning_iter']
    active_learning = GradientBoosting_params['active_learning']
    cross_validation = GradientBoosting_params['cross_validate']
    full = GradientBoosting_params['full_dataset']
    w_hx = GradientBoosting_params['with_historical_data']
    w_k = GradientBoosting_params['with_k']
    draw_success = GradientBoosting_params['draw_success']

    # Specify the desired operation
    fine_tuning = GradientBoosting_params['fine_tuning']
    save_model = GradientBoosting_params['save_model']
    to_file = True

    if fine_tuning:
        ft_params = {
            'loss': ['deviance', 'exponential'],
            'learning_rate': [0.1, 0.01, 0.001],
            'n_estimators': [100, 200, 500, 1000],
            'criterion': ['friedman_mse', 'mse', 'mae'],
            'max_depth': [i for i in range(1, 9)],
            'max_features': ['auto', 'sqrt', 'log2', None],
            'min_samples_leaf': [1, 2, 3],
            'min_samples_split': [2, 5, 10],
            'ccp_alpha': [.1 * i for i in range(1)]  # range(1) -> [0.0], i.e. no pruning
        }

        result_path = './results/ft_{}.pkl'.format(model_name)

        grid_search(ActiveGradientBoosting,
                    ft_params,
                    result_path,
                    num_draws,
                    train_size,
                    active_learning_iter,
                    active_learning=active_learning,
                    w_hx=w_hx,
                    w_k=w_k,
                    draw_success=draw_success,
                    result_dict=result_dict,
                    model_name=model_name)

    else:
        # Load the desired sized dataset under desired option
        dataset = process_dataset(num_draw=num_draws,
                                  train_size=train_size,
                                  active_learning_iter=active_learning_iter,
                                  verbose=verbose,
                                  cross_validation=cross_validation,
                                  full=full,
                                  active_learning=active_learning,
                                  w_hx=w_hx,
                                  w_k=w_k,
                                  success=draw_success)

        draws = list(dataset.keys())
        amine_list = list(dataset[0]['x_t'].keys())

        for amine in amine_list:
            if amine == 'XZUCBFLUEBDNSJ-UHFFFAOYSA-N' and draw_success:
                # Skipping the amine with only 1 successful experiment overall
                # Can't run 4-ii and 5-ii models on this amine
                continue
            else:
                # Create the GradientBoosting model instance for the specific amine
                AGB = ActiveGradientBoosting(amine=amine,
                                             config=config,
                                             verbose=verbose,
                                             stats_path=stats_path,
                                             result_dict=result_dict,
                                             model_name=model_name)
                for set_id in draws:
                    # Unload the randomly drawn dataset values
                    x_t, y_t, x_v, y_v, all_data, all_labels = dataset[set_id]['x_t'], \
                                                               dataset[set_id]['y_t'], \
                                                               dataset[set_id]['x_v'], \
                                                               dataset[set_id]['y_v'], \
                                                               dataset[set_id]['all_data'], \
                                                               dataset[set_id]['all_labels']
                    # Load the training and validation set into the model
                    AGB.load_dataset(set_id, x_t[amine], y_t[amine],
                                     x_v[amine], y_v[amine], all_data[amine],
                                     all_labels[amine])

                    # Train the data on the training set
                    AGB.train(warning=warning)

                    # Conduct active learning with all the observations available in the pool
                    if active_learning:
                        AGB.active_learning(num_iter=active_learning_iter,
                                            warning=warning)

                if to_file:
                    AGB.store_metrics_to_file()

                # Save the model for future reproducibility
                if save_model:
                    AGB.save_model(model_name)
Example 9
def run_model(KNN_params, category):
    """Full-scale training, validation and testing using all amines.

    Args:
        KNN_params:         A dictionary of the parameters for the KNN model.
                                See initialize() for more information.
        category:           A string representing the category the model is classified under.
    """

    # Unload common parameters
    config = KNN_params['configs'][category] if KNN_params['configs'] else None
    verbose = KNN_params['verbose']
    warning = KNN_params['warning']
    stats_path = KNN_params['stats_path']
    result_dict = KNN_params['result_dict']

    model_name = KNN_params['model_name']
    print(f'Running model {model_name}')

    # Unload the training data specific parameters
    num_draws = KNN_params['num_draws']
    train_size = KNN_params['train_size']
    active_learning_iter = KNN_params['active_learning_iter']
    cross_validation = KNN_params['cross_validate']
    full = KNN_params['full_dataset']
    active_learning = KNN_params['active_learning']
    w_hx = KNN_params['with_historical_data']
    w_k = KNN_params['with_k']
    draw_success = KNN_params['draw_success']

    # Specify the desired operation
    fine_tuning = KNN_params['fine_tuning']
    save_model = KNN_params['save_model']
    to_file = True

    if fine_tuning:
        # Set all possible combinations
        ft_params = {
            'n_neighbors': [i for i in range(1, 10)],  # the k in k-nearest neighbors
            'leaf_size': [i for i in range(1, 51)],  # leaf size of the underlying neighbor-search tree
            'p': [i for i in range(1, 4)]  # Minkowski power: 1 = Manhattan, 2 = Euclidean
        }

        result_path = './results/ft_{}.pkl'.format(model_name)

        grid_search(
            ActiveKNN,
            ft_params,
            result_path,
            num_draws,
            train_size,
            active_learning_iter,
            active_learning=active_learning,
            w_hx=w_hx,
            w_k=w_k,
            draw_success=draw_success,
            result_dict=result_dict,
            model_name=model_name,
        )

    else:
        # Load the desired sized dataset under desired option
        dataset = process_dataset(
            num_draw=num_draws,
            train_size=train_size,
            active_learning_iter=active_learning_iter,
            verbose=verbose,
            cross_validation=cross_validation,
            full=full,
            active_learning=active_learning,
            w_hx=w_hx,
            w_k=w_k,
            success=draw_success,
        )

        draws = list(dataset.keys())
        amine_list = list(dataset[0]['x_t'].keys())

        for amine in amine_list:
            # Create the KNN model instance for the specific amine
            KNN = ActiveKNN(amine=amine, config=config, verbose=verbose, stats_path=stats_path, result_dict=result_dict,
                            model_name=model_name)
            for set_id in draws:
                # Unload the randomly drawn dataset values
                x_t, y_t, x_v, y_v, all_data, all_labels = dataset[set_id]['x_t'], \
                                                           dataset[set_id]['y_t'], \
                                                           dataset[set_id]['x_v'], \
                                                           dataset[set_id]['y_v'], \
                                                           dataset[set_id]['all_data'], \
                                                           dataset[set_id]['all_labels']

                # Load the training and validation set into the model
                KNN.load_dataset(set_id, x_t[amine], y_t[amine], x_v[amine], y_v[amine], all_data[amine],
                                 all_labels[amine])

                # Train the data on the training set
                KNN.train(warning=warning)

                # Conduct active learning with all the observations available in the pool
                if active_learning:
                    KNN.active_learning(num_iter=active_learning_iter, warning=warning)

            if to_file:
                KNN.store_metrics_to_file()

            # Save the model for future reproducibility
            if save_model:
                KNN.save_model(model_name)
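
For reference, a minimal sketch of invoking this run_model directly. The real parameter dictionary comes from the repo's initialize() (not shown), so every value below is illustrative only:

KNN_params = {
    'configs': None,                       # None -> use the model's default hyper-parameters
    'verbose': True,
    'warning': False,
    'stats_path': './results/stats.pkl',   # hypothetical location
    'result_dict': None,
    'model_name': 'KNN',
    'num_draws': 5,
    'train_size': 10,
    'active_learning_iter': 10,
    'cross_validate': True,
    'full_dataset': True,
    'active_learning': True,
    'with_historical_data': True,
    'with_k': True,
    'draw_success': False,
    'fine_tuning': False,
    'save_model': False,
}

run_model(KNN_params, 'category_3')  # category label is illustrative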