Example #1
class PurposeFieldModel:
    def __init__(self, config):
        """ Combines the purpose and field vectorizers and prediction models into a single "composite" model.

        Takes configurations for building the vectorizers and classification models. The 'purpose' and 'field' categories each get their own
        vectorizer and model. The vectorizer configs can be retrieved from HerdVectorizer class, the model configs may be retrieved from the
        ClassificationModel class.
    
        Args:
            config (dictionary): Dictionary containing the following configuration: 'purpose_vectorizer', 'field_vectorizer', 'purpose_model', 'field_model'.
    
        """
        self.purpose_vectorizer = HerdVectorizer(config['purpose_vectorizer'])
        self.field_vectorizer = HerdVectorizer(config['field_vectorizer'])

        self.purpose_model = ClassificationModel(config['purpose_model'])
        self.field_model = ClassificationModel(config['field_model'])

    def fit(self, abstracts, Y_purpose, Y_field):
        """ Trains the model.

        All input arguments must have the same length.

        Args:
            abstracts (list): A list of documents; each document is represented as a list of words.
            Y_purpose (list): A list of labels of the 'purpose' variety, one per document.
            Y_field (list): A list of labels of the 'field' variety, one per document.
        """
        self.purpose_vectorizer.train(abstracts, Y_purpose, 'purpose')
        self.field_vectorizer.train(abstracts, Y_field, 'field')

        X_purpose = self.purpose_vectorizer.transform_data(abstracts)
        X_field = self.field_vectorizer.transform_data(abstracts)

        # mat2vec presumably collapses the label matrices into single-label vectors
        Y_purpose = self.purpose_model.labelset.mat2vec(Y_purpose)
        Y_field = self.field_model.labelset.mat2vec(Y_field)

        self.purpose_model.fit(X_purpose, Y_purpose)
        self.field_model.fit(X_field, Y_field)

    def predict(self, abstracts):
        """ Make predictions on the input data.

        The input documents are vectorized and fed to the prediction models, which generate label predictions.
        This is done separately for the 'purpose' and 'field' predictions.

        Args:
            abstracts (list): A list of documents; each document is represented as a list of words.

        Returns:
            dict: Dictionary with two lists of predictions under the keys 'purpose' and 'field'.
        """
        X_purpose = self.purpose_vectorizer.transform_data(abstracts)
        X_field = self.field_vectorizer.transform_data(abstracts)

        predictions = {
            'purpose': self.purpose_model.predict(X_purpose),
            'field': self.field_model.predict(X_field),
        }

        return predictions

    def get_config(self):
        """ Returns the configuration used to build this model.

        Returns:
            dict: Dictionary with the keys 'purpose_vectorizer', 'field_vectorizer', 'purpose_model', and
                'field_model'; each entry is the configuration needed to rebuild the corresponding component.
        """
        return {
            'purpose_vectorizer': self.purpose_vectorizer.get_config(),
            'field_vectorizer': self.field_vectorizer.get_config(),
            'purpose_model': self.purpose_model.get_config(),
            'field_model': self.field_model.get_config(),
        }
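
A minimal usage sketch for the class above (hedged: the sub-config contents are project-specific; `saved_config` is assumed to come from an earlier model's get_config(), and `abstracts`, `Y_purpose`, `Y_field` to be prepared training data):

    # hypothetical round trip: rebuild a composite model from a saved config
    model = PurposeFieldModel(saved_config)
    model.fit(abstracts, Y_purpose, Y_field)
    predictions = model.predict(abstracts)
    print(predictions['purpose'][:3], predictions['field'][:3])
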
Example #2
import argparse
import os

import numpy as np
import pandas
import yaml
from sklearn import metrics as skmetrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# HerdVectorizer and LabelTransformer are project-local classes; they are
# assumed to be importable from wherever this project defines them.


def MultiLR():
    """ Program for running an experiment using a Logistic Regression classifier."""
    parser = argparse.ArgumentParser(
        description='Multi-class Logistic Regression experiment')
    parser.add_argument('-o', dest='fout', type=str, help='output file name')
    parser.add_argument('-p',
                        dest='parameters',
                        type=str,
                        help='parameters to exercise')
    args = parser.parse_args()

    # second-stage parser for the preprocessor/classifier options passed via -p
    clf_parser = argparse.ArgumentParser(
        description='representation testing options')
    clf_parser.add_argument('-g', action='store_true')
    clf_parser.add_argument('-d',
                            dest='dataset',
                            type=str,
                            help='dataset (.xlsx or .hdf5)')

    clf_parser.add_argument(
        '--target_set',
        dest='target_set',
        type=str,
        default='purpose',
        help='target set for prediction, {purpose, field, custom}')
    clf_parser.add_argument('--stemmer',
                            dest='stemmer',
                            type=str,
                            default=None)
    clf_parser.add_argument('--tok_min_df',
                            dest='token_min_df',
                            type=int,
                            default=10,
                            help='Min. doc. freq. of tokens')
    clf_parser.add_argument('--tok_max_df',
                            dest='token_max_df',
                            type=int,
                            default=200,
                            help='Max. doc. freq. of tokens')
    clf_parser.add_argument('--bigrams', dest='bigrams', action='store_true')
    clf_parser.add_argument('--bigram_window_size',
                            dest='bigram_window_size',
                            type=int,
                            default=2,
                            help='bigram window size')
    clf_parser.add_argument('--bigram_filter_size',
                            dest='bigram_filter_size',
                            type=int,
                            default=5,
                            help='bigram filter size')
    clf_parser.add_argument('--bigram_nbest',
                            dest='bigram_nbest',
                            type=int,
                            default=2000,
                            help='bigram N best')
    clf_parser.add_argument('--trigrams', dest='trigrams', action='store_true')
    clf_parser.add_argument('--trigram_window_size',
                            dest='trigram_window_size',
                            type=int,
                            default=2,
                            help='trigram window size')
    clf_parser.add_argument('--trigram_filter_size',
                            dest='trigram_filter_size',
                            type=int,
                            default=10,
                            help='trigram filter size')
    clf_parser.add_argument('--trigram_nbest',
                            dest='trigram_nbest',
                            type=int,
                            default=1000,
                            help='trigram N best')
    clf_parser.add_argument('--fselect',
                            dest='fselect',
                            type=str,
                            default='chi2',
                            help='feature selection scoring function')
    clf_parser.add_argument(
        '--kbest',
        dest='kbest',
        type=int,
        default=500,
        help='feature selection option: Kbest feature to keep')
    clf_parser.add_argument(
        '--selectfunc',
        dest='selectfunc',
        type=str,
        default='mean',
        help='function to integrate selected features in multi-class scenario')

    clf_args = clf_parser.parse_args(args.parameters.split())

    # load data
    _, file_ext = os.path.splitext(clf_args.dataset)
    if file_ext == '.hdf5':
        df = pandas.read_hdf(clf_args.dataset)
    elif file_ext == '.xlsx':
        df = pandas.read_excel(clf_args.dataset)
    else:
        raise ValueError('unsupported dataset format: {}'.format(file_ext))

    # retrieve the label set
    if clf_args.target_set == 'purpose':
        label_set = LabelTransformer(LabelTransformer.default_labels('purpose'))
    elif clf_args.target_set == 'field':
        label_set = LabelTransformer(LabelTransformer.default_labels('field'))
    elif clf_args.target_set == 'custom':
        raise NotImplementedError('TODO: implement custom label set here!')
    else:
        raise ValueError('invalid target_set: {}'.format(clf_args.target_set))

    # split into train/test sets (the held-out test split is prepared below
    # but never evaluated by this script; only CV on the training set is run)
    df_train = df[df.cv_index != 0]
    df_test = df[df.cv_index == 0]

    Y_train = list(df_train[clf_args.target_set])
    Y_test = list(df_test[clf_args.target_set])

    Y_train = label_set.label2mat(Y_train)
    Y_test = label_set.label2mat(Y_test)

    # prepare the vectorizer and vectorize the data
    myVec = HerdVectorizer()
    if clf_args.bigrams and clf_args.bigram_nbest != 0:
        myVec.set_bigrams(True, clf_args.bigram_window_size,
                          clf_args.bigram_filter_size, clf_args.bigram_nbest)

    if clf_args.trigrams and clf_args.trigram_nbest != 0:
        myVec.set_trigrams(True, clf_args.trigram_window_size,
                           clf_args.trigram_filter_size,
                           clf_args.trigram_nbest)

    myVec.set_stemmer(clf_args.stemmer)

    if clf_args.fselect and clf_args.kbest and clf_args.selectfunc:
        myVec.set_feature_selector(clf_args.fselect, clf_args.kbest,
                                   clf_args.selectfunc)

    myVec.train(df_train['sow'], Y_train, label_set)
    X_train_validate = myVec.transform_data(df_train['sow'])

    # df_train['cv_index'] holds fold assignments 1-5 as a single vector;
    # convert it into an iterable of (train_indices, test_indices) pairs
    cv_folds = df_train['cv_index'].values
    train_folds = []
    test_folds = []
    for i in [1, 2, 3, 4, 5]:
        train_fold = [ind for ind, val in enumerate(cv_folds) if val != i]
        test_fold = [ind for ind, val in enumerate(cv_folds) if val == i]
        train_folds.append(train_fold)
        test_folds.append(test_fold)

    cv_folds = zip(train_folds, test_folds)
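    # worked example (illustrative): with cv_index values [1, 1, 2, 2] and
    # i = 1 the pair is (train=[2, 3], test=[0, 1]); this sequence of
    # (train_indices, test_indices) pairs is the format GridSearchCV accepts
    # for its `cv` argument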

    # three sub-grids: liblinear supports l1/l2 penalties in one-vs-rest mode,
    # while the multinomial formulation requires newton-cg (l2 only)
    params = [{
        'penalty': ['l1'],
        'C': [0.0001, 0.001, 0.01, 0.1, 10, 100, 1000, 10000],
        'class_weight': ['balanced'],
        'max_iter': [300],
        'multi_class': ['ovr'],
        'solver': ['liblinear']
    }, {
        'penalty': ['l2'],
        'C': [0.0001, 0.001, 0.01, 0.1, 10, 100, 1000, 10000],
        'class_weight': ['balanced'],
        'max_iter': [300],
        'multi_class': ['ovr'],
        'solver': ['liblinear']
    }, {
        'penalty': ['l2'],
        'C': [0.0001, 0.001, 0.01, 0.1, 10, 100, 1000, 10000],
        'class_weight': ['balanced'],
        'max_iter': [300],
        'multi_class': ['multinomial'],
        'solver': ['newton-cg']
    }]

    y_train = label_set.mat2vec(Y_train)

    # this quick fit is presumably a sanity check that the design matrix and
    # label vector are compatible; the fitted model itself is unused
    test_model = LogisticRegression()
    test_model.fit(X_train_validate, y_train)

    grid = GridSearchCV(estimator=LogisticRegression(),
                        param_grid=params,
                        cv=cv_folds,
                        n_jobs=1,
                        scoring='f1_micro')
    grid.fit(X_train_validate, y_train)
    best_lr_model = LogisticRegression(**grid.best_params_)

    # zip() above produced a one-shot iterator that GridSearchCV consumed;
    # rebuild it so we can compute our own per-fold metrics
    cv_folds = zip(train_folds, test_folds)

    # initialize nested dictionary to store CV metrics
    metrics = {}
    for label in label_set:
        metrics[label] = {'accuracy': [], 'recall': [], 'precision': [],
                          'f1': [], 'mcc': []}
    metrics['f1-micro'] = []
    metrics['f1-macro'] = []

    for train_ind, test_ind in cv_folds:
        best_lr_model.fit(X_train_validate[train_ind, :], y_train[train_ind])
        y_predict = best_lr_model.predict(X_train_validate[test_ind, :])
        y_test = y_train[test_ind]

        # vec2mat presumably expands the label vectors back into indicator
        # matrices so per-label (column-wise) metrics can be computed
        y_multi_test = label_set.vec2mat(y_test)
        y_multi_predict = label_set.vec2mat(y_predict)

        for i, label in enumerate(label_set):
            metrics[label]['accuracy'].append(
                skmetrics.accuracy_score(y_multi_test[:, i], y_multi_predict[:, i]))
            metrics[label]['recall'].append(
                skmetrics.recall_score(y_multi_test[:, i], y_multi_predict[:, i]))
            metrics[label]['precision'].append(
                skmetrics.precision_score(y_multi_test[:, i], y_multi_predict[:, i]))
            metrics[label]['f1'].append(
                skmetrics.f1_score(y_multi_test[:, i], y_multi_predict[:, i]))
            metrics[label]['mcc'].append(
                skmetrics.matthews_corrcoef(y_multi_test[:, i], y_multi_predict[:, i]))
        metrics['f1-micro'].append(
            skmetrics.f1_score(y_multi_test, y_multi_predict, average='micro'))
        metrics['f1-macro'].append(
            skmetrics.f1_score(y_multi_test, y_multi_predict, average='macro'))

    # average each metric across the CV folds
    for label in label_set:
        for metric_name in ('accuracy', 'recall', 'precision', 'f1', 'mcc'):
            metrics[label][metric_name] = np.mean(metrics[label][metric_name])

    metrics['f1-micro'] = np.mean(metrics['f1-micro'])
    metrics['f1-macro'] = np.mean(metrics['f1-macro'])

    # combine all stats and scores and save them as a YAML file
    results = {}
    results['classifier'] = 'Logistic Regression'
    results['metrics'] = metrics  # metrics as calculated by this script
    results['best_params'] = grid.best_params_  # best parameters found via grid search
    results['clf_parameters'] = vars(clf_args)  # parameters passed into this script
    print(results)

    with open(args.fout, 'w') as fout:
        yaml.dump(results, fout, explicit_start=True, default_flow_style=False)
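
A hedged invocation sketch for the script above. The file name `multi_lr.py` and the dataset path are hypothetical; `-p` bundles the second-stage options into one quoted string that MultiLR re-parses internally:

    if __name__ == '__main__':
        MultiLR()

    # example shell invocation (hypothetical paths):
    #   python multi_lr.py -o results.yaml \
    #       -p "-d data/abstracts.hdf5 --target_set purpose --bigrams --kbest 500"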