Example 1: train (or load) an ImpartialTextClassifier and report F1-macro on the test set.
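These examples omit their imports. A minimal preamble that would make the names below resolvable might look like the following; the module path for ImpartialTextClassifier and the helper functions (load_data, read_csv, read_dstc2_data, read_snips2017_data, parse_hidden_layers_description, and so on) are assumptions, not confirmed by the source.

import os
import pickle
import random
import time
from argparse import ArgumentParser

import numpy as np
from sklearn.metrics import classification_report
from skopt import gp_minimize  # used by Example 4 for hyperparameter search
from skopt.space import Integer, Real

# Assumed package layout; adjust to wherever ImpartialTextClassifier actually lives.
from impartial_text_cls.impartial_text_cls import ImpartialTextClassifier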
def main():
    parser = ArgumentParser()
    parser.add_argument('-m', '--model', dest='model_name', type=str, required=True,
                        help='The binary file with the text classifier.')
    parser.add_argument('--conv1', dest='size_of_conv1', type=int, required=False, default=20,
                        help='Size of the Bayesian convolution layer with kernel size 1.')
    parser.add_argument('--conv2', dest='size_of_conv2', type=int, required=False, default=20,
                        help='Size of the Bayesian convolution layer with kernel size 2.')
    parser.add_argument('--conv3', dest='size_of_conv3', type=int, required=False, default=20,
                        help='Size of the Bayesian convolution layer with kernel size 3.')
    parser.add_argument('--conv4', dest='size_of_conv4', type=int, required=False, default=20,
                        help='Size of the Bayesian convolution layer with kernel size 4.')
    parser.add_argument('--conv5', dest='size_of_conv5', type=int, required=False, default=20,
                        help='Size of the Bayesian convolution layer with kernel size 5.')
    parser.add_argument('--hidden', dest='hidden_layer_size', type=str, required=False, default='500',
                        help='Size of each hidden layer and the total number of hidden layers (separate them with a colon).')
    parser.add_argument('--num_monte_carlo', dest='num_monte_carlo', type=int, required=False, default=10,
                        help='Number of generated Monte Carlo samples for each data sample.')
    parser.add_argument('--batch_size', dest='batch_size', type=int, required=False, default=16,
                        help='Size of mini-batch.')
    parser.add_argument('--gpu_frac', dest='gpu_memory_frac', type=float, required=False, default=0.9,
                        help='Fraction of GPU memory to allocate for the classifier.')
    parser.add_argument('--nn_type', dest='nn_type', type=str, choices=['bayesian', 'usual'],
                        required=False, default='bayesian', help='Neural network type: `bayesian` or `usual`.')
    args = parser.parse_args()

    model_name = os.path.normpath(args.model_name)
    if os.path.isfile(model_name):
        with open(model_name, 'rb') as fp:
            nn = pickle.load(fp)
    else:
        hidden_layer_size, n_hidden_layers = parse_hidden_layers_description(args.hidden_layer_size)
        train_texts, train_labels = load_data('train')
        print('Number of samples for training is {0}.'.format(len(train_texts)))
        nn = ImpartialTextClassifier(filters_for_conv1=args.size_of_conv1, filters_for_conv2=args.size_of_conv2,
                                    filters_for_conv3=args.size_of_conv3, filters_for_conv4=args.size_of_conv4,
                                    filters_for_conv5=args.size_of_conv5, hidden_layer_size=hidden_layer_size,
                                    n_hidden_layers=n_hidden_layers, batch_size=args.batch_size,
                                    num_monte_carlo=args.num_monte_carlo, gpu_memory_frac=args.gpu_memory_frac,
                                    verbose=True, multioutput=False, random_seed=42, validation_fraction=0.15,
                                    max_epochs=100, patience=5, bayesian=(args.nn_type == 'bayesian'),
                                    kl_weight_init=0.05, kl_weight_fin=0.05)
        nn.fit(train_texts, train_labels)
        print('')
        with open(model_name, 'wb') as fp:
            pickle.dump(nn, fp)
    test_texts, test_labels = load_data('test')
    print('')
    print('Number of samples for final testing is {0}.'.format(len(test_texts)))
    print('Test F1-macro is {0:.2%}.'.format(nn.score(test_texts, test_labels)))
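parse_hidden_layers_description is referenced but never shown. Given the --hidden help text (layer size and number of layers separated by a colon) and the zero-collapsing logic in the later examples, a plausible sketch of it follows; this is an assumption, not the confirmed implementation.

def parse_hidden_layers_description(description: str):
    # '500' -> one hidden layer of size 500; '500:3' -> three hidden layers of size 500.
    parts = description.split(':')
    layer_size = int(parts[0])
    number_of_layers = int(parts[1]) if len(parts) > 1 else 1
    if (layer_size == 0) or (number_of_layers == 0):
        return 0, 0  # no hidden layers at all
    return layer_size, number_of_layers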
Example 2: the train helper, which fits a final classifier on labeled (and optional unlabeled) texts with a held-out validation split.
def train(args) -> ImpartialTextClassifier:
    conv1_ = int(args[0])
    conv2_ = int(args[1])
    conv3_ = int(args[2])
    conv4_ = int(args[3])
    conv5_ = int(args[4])
    hidden_layer_size_ = int(args[5])
    n_hidden_layers_ = int(args[6])
    if (n_hidden_layers_ == 0) or (hidden_layer_size_ == 0):
        hidden_layer_size_ = 0
        n_hidden_layers_ = 0
    if nn_type == 'bayesian':
        init_kl_weight = float(args[7])
        fin_kl_weight = float(args[8])
    else:
        init_kl_weight = 1.0
        fin_kl_weight = 1.0
    train_index, val_index = ImpartialTextClassifier.train_test_split(labels, 0.1)
    if unlabeled_texts_for_training is None:
        train_texts = labeled_texts[train_index]
        train_labels = labels[train_index]
    else:
        train_texts = np.concatenate(
            (
                labeled_texts[train_index],
                unlabeled_texts_for_training
            )
        )
        train_labels = np.concatenate(
            (
                labels[train_index],
                np.full(shape=(len(unlabeled_texts_for_training),), fill_value=-1, dtype=np.int32)
            )
        )
    val_texts = labeled_texts[val_index]
    val_labels = labels[val_index]
    cls = ImpartialTextClassifier(bert_hub_module_handle=(None if os.path.exists(os.path.normpath(bert_handle))
                                                          else bert_handle),
                                  filters_for_conv1=conv1_, filters_for_conv2=conv2_, filters_for_conv3=conv3_,
                                  filters_for_conv4=conv4_, filters_for_conv5=conv5_,
                                  hidden_layer_size=hidden_layer_size_, n_hidden_layers=n_hidden_layers_,
                                  batch_size=16, gpu_memory_frac=gpu_memory_frac, num_monte_carlo=num_monte_carlo,
                                  verbose=True, random_seed=42, max_epochs=100, patience=5, multioutput=multioutput,
                                  bayesian=(nn_type == 'bayesian'),
                                  kl_weight_init=init_kl_weight, kl_weight_fin=fin_kl_weight)
    if os.path.exists(os.path.normpath(bert_handle)):
        cls.PATH_TO_BERT = os.path.normpath(bert_handle)
    cls.fit(train_texts, train_labels, validation_data=(val_texts, val_labels))
    del train_texts, train_labels, val_texts, val_labels
    return cls
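train expects the same flat hyperparameter vector the optimizer produces: five filter counts, hidden layer size, number of hidden layers, and (for the bayesian type) the initial and final KL weights. A hypothetical call, mirroring the serialization step in Example 4:

best_classifier = train([200, 200, 200, 200, 200, 500, 1, 0.1, 0.01])
with open('classifier.pkl', 'wb') as fp:
    pickle.dump(best_classifier, fp)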
Example 3: the func objective, which cross-validates one hyperparameter vector and returns the negated mean score for the minimizer.
def func(args):
    conv1_ = int(args[0])
    conv2_ = int(args[1])
    conv3_ = int(args[2])
    conv4_ = int(args[3])
    conv5_ = int(args[4])
    hidden_layer_size_ = int(args[5])
    n_hidden_layers_ = int(args[6])
    if (n_hidden_layers_ == 0) or (hidden_layer_size_ == 0):
        hidden_layer_size_ = 0
        n_hidden_layers_ = 0
    quality = 0.0
    print('Filters number for different convolution kernels: ({0}, {1}, {2}, {3}, {4})'.format(
        conv1_, conv2_, conv3_, conv4_, conv5_))
    if n_hidden_layers_ > 0:
        print('Hidden layer size is {0}.'.format(hidden_layer_size_))
        print('Number of hidden layers is {0}.'.format(n_hidden_layers_))
    if nn_type == 'bayesian':
        init_kl_weight = float(args[7])
        fin_kl_weight = float(args[8])
        print('Optimal value of initial KL weight is {0:.6f}.'.format(init_kl_weight))
        print('Optimal value of final KL weight is {0:.6f}.'.format(fin_kl_weight))
    else:
        init_kl_weight = 1.0
        fin_kl_weight = 1.0
    if sum(args) == 0:
        return 1.0
    for fold_idx, (train_index, test_index) in enumerate(indices_for_cv):
        cls = ImpartialTextClassifier(bert_hub_module_handle=(None if os.path.exists(os.path.normpath(bert_handle))
                                                              else bert_handle),
                                      filters_for_conv1=conv1_, filters_for_conv2=conv2_, filters_for_conv3=conv3_,
                                      filters_for_conv4=conv4_, filters_for_conv5=conv5_,
                                      hidden_layer_size=hidden_layer_size_, n_hidden_layers=n_hidden_layers_,
                                      multioutput=multioutput, gpu_memory_frac=gpu_memory_frac,
                                      num_monte_carlo=num_monte_carlo, verbose=False, random_seed=42, max_epochs=100,
                                      patience=5, batch_size=16, bayesian=(nn_type == 'bayesian'),
                                      kl_weight_init=init_kl_weight, kl_weight_fin=fin_kl_weight)
        if os.path.exists(os.path.normpath(bert_handle)):
            cls.PATH_TO_BERT = os.path.normpath(bert_handle)
        train_texts = labeled_texts[train_index]
        train_labels = labels[train_index]
        train_index_, val_index = cls.train_test_split(train_labels, 0.1)
        val_texts = train_texts[val_index]
        val_labels = train_labels[val_index]
        if unlabeled_texts_for_training is None:
            train_texts = train_texts[train_index_]
            train_labels = train_labels[train_index_]
        else:
            train_texts = np.concatenate(
                (
                    train_texts[train_index_],
                    unlabeled_texts_for_training
                )
            )
            train_labels = np.concatenate(
                (
                    train_labels[train_index_],
                    np.full(shape=(len(unlabeled_texts_for_training),), fill_value=-1, dtype=np.int32)
                )
            )
        cls.fit(train_texts, train_labels, validation_data=(val_texts, val_labels))
        del train_texts, train_labels, val_texts, val_labels, train_index_, val_index
        if unlabeled_texts_for_testing is None:
            texts_for_final_testing = labeled_texts[test_index]
            labels_for_final_testing = labels[test_index]
        else:
            texts_for_final_testing = np.concatenate(
                (
                    labeled_texts[test_index],
                    unlabeled_texts_for_testing
                )
            )
            labels_for_final_testing = np.concatenate(
                (
                    labels[test_index],
                    np.full(shape=(len(unlabeled_texts_for_testing),), fill_value=-1, dtype=np.int32)
                )
            )
        instant_quality = cls.score(texts_for_final_testing, labels_for_final_testing)
        quality += instant_quality
        print('Fold {0}: {1:.6f}.'.format(fold_idx + 1, instant_quality))
        del cls, texts_for_final_testing, labels_for_final_testing
    quality /= float(len(indices_for_cv))
    print('Total quality = {0:.6f}.'.format(quality))
    print('')
    return -quality
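func returns the negated mean score because skopt's gp_minimize minimizes its objective, so maximizing the cross-validated quality means minimizing its negation. Example 4 wires it up like this (repeated here for reference; the two KL-weight dimensions are appended only for the bayesian network type):

dimensions = [Integer(0, 300) for _ in range(5)] + [Integer(100, 2000), Integer(0, 3)]
if nn_type == 'bayesian':
    dimensions += [Real(1e-5, 1.0, prior='log-uniform'), Real(1e-5, 1.0, prior='log-uniform')]
optimal_res = gp_minimize(func, dimensions=dimensions, n_calls=100, n_random_starts=5,
                          random_state=42, verbose=False, n_jobs=1)
best_hyperparameters = optimal_res.x  # the vector that maximized the cross-validated score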
Example 4: the full pipeline, with optional Bayesian hyperparameter search, cross-validated evaluation, and model serialization.
def main():

    def func(args):
        conv1_ = int(args[0])
        conv2_ = int(args[1])
        conv3_ = int(args[2])
        conv4_ = int(args[3])
        conv5_ = int(args[4])
        hidden_layer_size_ = int(args[5])
        n_hidden_layers_ = int(args[6])
        if (n_hidden_layers_ == 0) or (hidden_layer_size_ == 0):
            hidden_layer_size_ = 0
            n_hidden_layers_ = 0
        quality = 0.0
        print('Filters number for different convolution kernels: ({0}, {1}, {2}, {3}, {4})'.format(
            conv1_, conv2_, conv3_, conv4_, conv5_))
        if n_hidden_layers_ > 0:
            print('Hidden layer size is {0}.'.format(hidden_layer_size_))
            print('Number of hidden layers is {0}.'.format(n_hidden_layers_))
        if nn_type == 'bayesian':
            init_kl_weight = float(args[7])
            fin_kl_weight = float(args[8])
            print('Optimal value of initial KL weight is {0:.6f}.'.format(init_kl_weight))
            print('Optimal value of final KL weight is {0:.6f}.'.format(fin_kl_weight))
        else:
            init_kl_weight = 1.0
            fin_kl_weight = 1.0
        if sum(args) == 0:
            return 1.0
        for fold_idx, (train_index, test_index) in enumerate(indices_for_cv):
            cls = ImpartialTextClassifier(bert_hub_module_handle=(None if os.path.exists(os.path.normpath(bert_handle))
                                                                 else bert_handle),
                                         filters_for_conv1=conv1_, filters_for_conv2=conv2_, filters_for_conv3=conv3_,
                                         filters_for_conv4=conv4_, filters_for_conv5=conv5_,
                                         hidden_layer_size=hidden_layer_size_, n_hidden_layers=n_hidden_layers_,
                                         multioutput=multioutput, gpu_memory_frac=gpu_memory_frac,
                                         num_monte_carlo=num_monte_carlo, verbose=False, random_seed=42, max_epochs=100,
                                         patience=5, batch_size=16, bayesian=(nn_type == 'bayesian'),
                                         kl_weight_init=init_kl_weight, kl_weight_fin=fin_kl_weight)
            if os.path.exists(os.path.normpath(bert_handle)):
                cls.PATH_TO_BERT = os.path.normpath(bert_handle)
            train_texts = labeled_texts[train_index]
            train_labels = labels[train_index]
            train_index_, val_index = cls.train_test_split(train_labels, 0.1)
            val_texts = train_texts[val_index]
            val_labels = train_labels[val_index]
            if unlabeled_texts_for_training is None:
                train_texts = train_texts[train_index_]
                train_labels = train_labels[train_index_]
            else:
                train_texts = np.concatenate(
                    (
                        train_texts[train_index_],
                        unlabeled_texts_for_training
                    )
                )
                train_labels = np.concatenate(
                    (
                        train_labels[train_index_],
                        np.full(shape=(len(unlabeled_texts_for_training),), fill_value=-1, dtype=np.int32)
                    )
                )
            cls.fit(train_texts, train_labels, validation_data=(val_texts, val_labels))
            del train_texts, train_labels, val_texts, val_labels, train_index_, val_index
            if unlabeled_texts_for_testing is None:
                texts_for_final_testing = labeled_texts[test_index]
                labels_for_final_testing = labels[test_index]
            else:
                texts_for_final_testing = np.concatenate(
                    (
                        labeled_texts[test_index],
                        unlabeled_texts_for_testing
                    )
                )
                labels_for_final_testing = np.concatenate(
                    (
                        labels[test_index],
                        np.full(shape=(len(unlabeled_texts_for_testing),), fill_value=-1, dtype=np.int32)
                    )
                )
            instant_quality = cls.score(texts_for_final_testing, labels_for_final_testing)
            quality += instant_quality
            print('Fold {0}: {1:.6f}.'.format(fold_idx + 1, instant_quality))
            del cls, texts_for_final_testing, labels_for_final_testing
        quality /= float(len(indices_for_cv))
        print('Total quality = {0:.6f}.'.format(quality))
        print('')
        return -quality

    def score(args):
        conv1_ = int(args[0])
        conv2_ = int(args[1])
        conv3_ = int(args[2])
        conv4_ = int(args[3])
        conv5_ = int(args[4])
        hidden_layer_size_ = int(args[5])
        n_hidden_layers_ = int(args[6])
        if (n_hidden_layers_ == 0) or (hidden_layer_size_ == 0):
            hidden_layer_size_ = 0
            n_hidden_layers_ = 0
        print('Optimal filters number for different convolution kernels: ({0}, {1}, {2}, {3}, {4})'.format(
            conv1_, conv2_, conv3_, conv4_, conv5_))
        if n_hidden_layers_ > 0:
            print('Optimal size of the hidden layer is {0}.'.format(hidden_layer_size_))
            print('Optimal number of hidden layers is {0}.'.format(n_hidden_layers_))
        if nn_type == 'bayesian':
            init_kl_weight = float(args[7])
            fin_kl_weight = float(args[8])
            print('Optimal value of initial KL weight is {0:.6f}.'.format(init_kl_weight))
            print('Optimal value of final KL weight is {0:.6f}.'.format(fin_kl_weight))
        else:
            init_kl_weight = 1.0
            fin_kl_weight = 1.0
        print('')
        y_pred = []
        y_true = []
        unlabeled_is_added = False
        for train_index, test_index in indices_for_cv:
            cls = ImpartialTextClassifier(bert_hub_module_handle=(None if os.path.exists(os.path.normpath(bert_handle))
                                                                 else bert_handle),
                                         filters_for_conv1=conv1_, filters_for_conv2=conv2_, filters_for_conv3=conv3_,
                                         filters_for_conv4=conv4_, filters_for_conv5=conv5_,
                                         hidden_layer_size=hidden_layer_size_, n_hidden_layers=n_hidden_layers_,
                                         batch_size=16, gpu_memory_frac=gpu_memory_frac, verbose=True, random_seed=42,
                                         num_monte_carlo=num_monte_carlo, max_epochs=100, patience=5,
                                         multioutput=multioutput, bayesian=(nn_type == 'bayesian'),
                                         kl_weight_init=init_kl_weight, kl_weight_fin=fin_kl_weight)
            if os.path.exists(os.path.normpath(bert_handle)):
                cls.PATH_TO_BERT = os.path.normpath(bert_handle)
            train_texts = labeled_texts[train_index]
            train_labels = labels[train_index]
            train_index_, val_index = cls.train_test_split(train_labels, 0.1)
            val_texts = train_texts[val_index]
            val_labels = train_labels[val_index]
            if unlabeled_texts_for_training is None:
                train_texts = train_texts[train_index_]
                train_labels = train_labels[train_index_]
            else:
                train_texts = np.concatenate(
                    (
                        train_texts[train_index_],
                        unlabeled_texts_for_training
                    )
                )
                train_labels = np.concatenate(
                    (
                        train_labels[train_index_],
                        np.full(shape=(len(unlabeled_texts_for_training),), fill_value=-1, dtype=np.int32)
                    )
                )
            cls.fit(train_texts, train_labels, validation_data=(val_texts, val_labels))
            print('')
            del train_texts, train_labels, val_texts, val_labels, train_index_, val_index
            if (not unlabeled_is_added) and (unlabeled_texts_for_testing is not None):
                y_pred.append(cls.predict(unlabeled_texts_for_testing))
                unlabeled_is_added = True
                y_true.append(np.full(shape=(len(unlabeled_texts_for_testing),), fill_value=-1, dtype=np.int32))
            y_pred.append(cls.predict(labeled_texts[test_index]))
            y_true.append(labels[test_index])
            del cls
        y_pred = np.concatenate(y_pred)
        y_true = np.concatenate(y_true)
        print('')
        if multioutput:
            for class_idx in range(len(classes_list)):
                y_true_ = np.zeros((len(y_true),), dtype=np.int32)
                y_pred_ = np.zeros((len(y_pred),), dtype=np.int32)
                for sample_idx in range(len(y_true)):
                    if isinstance(y_true[sample_idx], set):
                        if class_idx in y_true[sample_idx]:
                            y_true_[sample_idx] = 1
                    elif class_idx == y_true[sample_idx]:
                        y_true_[sample_idx] = 1
                    if isinstance(y_pred[sample_idx], set):
                        if class_idx in y_pred[sample_idx]:
                            y_pred_[sample_idx] = 1
                    elif class_idx == y_pred[sample_idx]:
                        y_pred_[sample_idx] = 1
                print(classification_report(y_true_, y_pred_, target_names=['OTHER', classes_list[class_idx]], digits=4))
        else:
            for sample_idx in range(len(y_true)):
                if y_true[sample_idx] < 0:
                    y_true[sample_idx] = len(classes_list)
                if y_pred[sample_idx] < 0:
                    y_pred[sample_idx] = len(classes_list)
            print(classification_report(y_true, y_pred, target_names=classes_list + ['UNKNOWN'], digits=4))
            print('')

    def train(args) -> ImpartialTextClassifier:
        conv1_ = int(args[0])
        conv2_ = int(args[1])
        conv3_ = int(args[2])
        conv4_ = int(args[3])
        conv5_ = int(args[4])
        hidden_layer_size_ = int(args[5])
        n_hidden_layers_ = int(args[6])
        if (n_hidden_layers_ == 0) or (hidden_layer_size_ == 0):
            hidden_layer_size_ = 0
            n_hidden_layers_ = 0
        if nn_type == 'bayesian':
            init_kl_weight = float(args[7])
            fin_kl_weight = float(args[8])
        else:
            init_kl_weight = 1.0
            fin_kl_weight = 1.0
        train_index, val_index = ImpartialTextClassifier.train_test_split(labels, 0.1)
        if unlabeled_texts_for_training is None:
            train_texts = labeled_texts[train_index]
            train_labels = labels[train_index]
        else:
            train_texts = np.concatenate(
                (
                    labeled_texts[train_index],
                    unlabeled_texts_for_training
                )
            )
            train_labels = np.concatenate(
                (
                    labels[train_index],
                    np.full(shape=(len(unlabeled_texts_for_training),), fill_value=-1, dtype=np.int32)
                )
            )
        val_texts = labeled_texts[val_index]
        val_labels = labels[val_index]
        cls = ImpartialTextClassifier(bert_hub_module_handle=(None if os.path.exists(os.path.normpath(bert_handle))
                                                             else bert_handle),
                                     filters_for_conv1=conv1_, filters_for_conv2=conv2_, filters_for_conv3=conv3_,
                                     filters_for_conv4=conv4_, filters_for_conv5=conv5_,
                                     hidden_layer_size=hidden_layer_size_, n_hidden_layers=n_hidden_layers_,
                                     batch_size=16, gpu_memory_frac=gpu_memory_frac, num_monte_carlo=num_monte_carlo,
                                     verbose=True, random_seed=42, max_epochs=100, patience=5, multioutput=multioutput,
                                     bayesian=(nn_type == 'bayesian'),
                                     kl_weight_init=init_kl_weight, kl_weight_fin=fin_kl_weight)
        if os.path.exists(os.path.normpath(bert_handle)):
            cls.PATH_TO_BERT = os.path.normpath(bert_handle)
        cls.fit(train_texts, train_labels, validation_data=(val_texts, val_labels))
        del train_texts, train_labels, val_texts, val_labels
        return cls

    parser = ArgumentParser()
    parser.add_argument('-m', '--model', dest='model_name', type=str, required=True,
                        help='The binary file with the text classifier.')
    parser.add_argument('-b', '--bert', dest='bert', type=str, required=False,
                        default='https://tfhub.dev/google/bert_multi_cased_L-12_H-768_A-12/1',
                        help='URL of the TF-Hub BERT model to use (or a path to the BERT model on the local drive).')
    parser.add_argument('-c', '--csv', dest='csv_data_file', type=str, required=True,
                        help='Path to the CSV file with labeled data.')
    parser.add_argument('-t', '--train', dest='train_file_name', type=str, required=False, default='',
                        help='Path to the text file with unlabeled data for training.')
    parser.add_argument('-e', '--test', dest='test_file_name', type=str, required=False, default='',
                        help='Path to the text file with unlabeled data for evaluation.')
    parser.add_argument('--gpu_frac', dest='gpu_memory_frac', type=float, required=False, default=0.9,
                        help='Fraction of GPU memory to allocate for the classifier.')
    parser.add_argument('--nn_type', dest='nn_type', type=str, choices=['bayesian', 'usual'], required=False,
                        default='bayesian', help='Neural network type: `bayesian` or `usual`.')
    parser.add_argument('--num_monte_carlo', dest='num_monte_carlo', type=int, required=False, default=100,
                        help='Number of generated Monte Carlo samples for each data sample.')
    parser.add_argument('--conv1', dest='size_of_conv1', type=int, required=False, default=20,
                        help='Size of the Bayesian convolution layer with kernel size 1.')
    parser.add_argument('--conv2', dest='size_of_conv2', type=int, required=False, default=20,
                        help='Size of the Bayesian convolution layer with kernel size 2.')
    parser.add_argument('--conv3', dest='size_of_conv3', type=int, required=False, default=20,
                        help='Size of the Bayesian convolution layer with kernel size 3.')
    parser.add_argument('--conv4', dest='size_of_conv4', type=int, required=False, default=20,
                        help='Size of the Bayesian convolution layer with kernel size 4.')
    parser.add_argument('--conv5', dest='size_of_conv5', type=int, required=False, default=20,
                        help='Size of the Bayesian convolution layer with kernel size 5.')
    parser.add_argument('--hidden', dest='hidden_layer_size', type=str, required=False, default='500',
                        help='Size of each hidden layer and the total number of hidden layers (separate them with a colon).')
    parser.add_argument('--init_kl_weight', dest='init_kl_weight', type=float, required=False, default=1e-1,
                        help='Initial value of KL weight.')
    parser.add_argument('--fin_kl_weight', dest='fin_kl_weight', type=float, required=False, default=1e-2,
                        help='Final value of KL weight.')
    parser.add_argument('--search', dest='search_hyperparameters', required=False, action='store_true',
                        default=False, help='Search for optimal hyperparameters with Bayesian optimization.')
    cmd_args = parser.parse_args()

    num_monte_carlo = cmd_args.num_monte_carlo
    gpu_memory_frac = cmd_args.gpu_memory_frac
    bert_handle = cmd_args.bert
    nn_type = cmd_args.nn_type
    model_name = os.path.normpath(cmd_args.model_name)
    labeled_data_name = os.path.normpath(cmd_args.csv_data_file)
    unlabeled_train_data_name = cmd_args.train_file_name.strip()
    hidden_layer_size, n_hidden_layers = parse_hidden_layers_description(cmd_args.hidden_layer_size)
    if len(unlabeled_train_data_name) > 0:
        unlabeled_train_data_name = os.path.normpath(unlabeled_train_data_name)
        unlabeled_texts_for_training = load_unlabeled_texts(unlabeled_train_data_name)
        assert len(unlabeled_texts_for_training) > 0, 'File `{0}` is empty!'.format(unlabeled_train_data_name)
    else:
        unlabeled_texts_for_training = None
    unlabeled_test_data_name = cmd_args.test_file_name.strip()
    if len(unlabeled_test_data_name) > 0:
        unlabeled_test_data_name = os.path.normpath(unlabeled_test_data_name)
        unlabeled_texts_for_testing = load_unlabeled_texts(unlabeled_test_data_name)
        assert len(unlabeled_texts_for_testing) > 0, 'File `{0}` is empty!'.format(unlabeled_test_data_name)
    else:
        unlabeled_texts_for_testing = None
    labeled_texts, labels, classes_list = read_csv(labeled_data_name, 7)
    print('Number of labeled texts is {0}.'.format(len(labeled_texts)))
    print('Number of classes is {0}.'.format(len(classes_list)))
    if any(map(lambda it: isinstance(it, set), labels)):
        print('Some data samples are associated with several labels at once.')
        multioutput = True
    else:
        multioutput = False
    print('')
    print_classes_distribution(labels, classes_list)
    np.random.seed(42)
    indices_for_cv = ImpartialTextClassifier.cv_split(labels, 5)
    if cmd_args.search_hyperparameters:
        dimensions = [Integer(0, 300), Integer(0, 300), Integer(0, 300), Integer(0, 300), Integer(0, 300),
                      Integer(100, 2000), Integer(0, 3)]
        if nn_type == 'bayesian':
            dimensions += [Real(1e-5, 1.0, prior='log-uniform'), Real(1e-5, 1.0, prior='log-uniform')]
        optimal_res = gp_minimize(
            func, dimensions=dimensions,
            n_calls=100, n_random_starts=5, random_state=42, verbose=False, n_jobs=1
        )
        print('')
        hyperparameters = optimal_res.x
    else:
        hyperparameters = [cmd_args.size_of_conv1, cmd_args.size_of_conv2, cmd_args.size_of_conv3,
                           cmd_args.size_of_conv4, cmd_args.size_of_conv5, hidden_layer_size, n_hidden_layers,
                           cmd_args.init_kl_weight, cmd_args.fin_kl_weight]
    score(hyperparameters)
    with open(model_name, 'wb') as fp:
        pickle.dump(train(hyperparameters), fp)
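load_unlabeled_texts is not shown in these excerpts. Since the --train and --test arguments point at plain text files and the result is concatenated with numpy arrays, here is a minimal sketch, assuming one text per line with blank lines skipped (the real helper may differ):

def load_unlabeled_texts(file_name: str) -> np.ndarray:
    with open(file_name, 'r', encoding='utf-8') as fp:
        texts = [cur_line.strip() for cur_line in fp]
    return np.array([cur_text for cur_text in texts if len(cur_text) > 0], dtype=object)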
Example 5: the score helper, which prints cross-validated classification reports, mapping rejected samples to an UNKNOWN class.
def score(args):
    conv1_ = int(args[0])
    conv2_ = int(args[1])
    conv3_ = int(args[2])
    conv4_ = int(args[3])
    conv5_ = int(args[4])
    hidden_layer_size_ = int(args[5])
    n_hidden_layers_ = int(args[6])
    if (n_hidden_layers_ == 0) or (hidden_layer_size_ == 0):
        hidden_layer_size_ = 0
        n_hidden_layers_ = 0
    print('Optimal filters number for different convolution kernels: ({0}, {1}, {2}, {3}, {4})'.format(
        conv1_, conv2_, conv3_, conv4_, conv5_))
    if n_hidden_layers_ > 0:
        print('Optimal size of the hidden layer is {0}.'.format(hidden_layer_size_))
        print('Optimal number of hidden layers is {0}.'.format(n_hidden_layers_))
    if nn_type == 'bayesian':
        init_kl_weight = float(args[7])
        fin_kl_weight = float(args[8])
        print('Optimal value of initial KL weight is {0:.6f}.'.format(init_kl_weight))
        print('Optimal value of final KL weight is {0:.6f}.'.format(fin_kl_weight))
    else:
        init_kl_weight = 1.0
        fin_kl_weight = 1.0
    print('')
    y_pred = []
    y_true = []
    unlabeled_is_added = False
    for train_index, test_index in indices_for_cv:
        cls = ImpartialTextClassifier(bert_hub_module_handle=(None if os.path.exists(os.path.normpath(bert_handle))
                                                              else bert_handle),
                                      filters_for_conv1=conv1_, filters_for_conv2=conv2_, filters_for_conv3=conv3_,
                                      filters_for_conv4=conv4_, filters_for_conv5=conv5_,
                                      hidden_layer_size=hidden_layer_size_, n_hidden_layers=n_hidden_layers_,
                                      batch_size=16, gpu_memory_frac=gpu_memory_frac, verbose=True, random_seed=42,
                                      num_monte_carlo=num_monte_carlo, max_epochs=100, patience=5,
                                      multioutput=multioutput, bayesian=(nn_type == 'bayesian'),
                                      kl_weight_init=init_kl_weight, kl_weight_fin=fin_kl_weight)
        if os.path.exists(os.path.normpath(bert_handle)):
            cls.PATH_TO_BERT = os.path.normpath(bert_handle)
        train_texts = labeled_texts[train_index]
        train_labels = labels[train_index]
        train_index_, val_index = cls.train_test_split(train_labels, 0.1)
        val_texts = train_texts[val_index]
        val_labels = train_labels[val_index]
        if unlabeled_texts_for_training is None:
            train_texts = train_texts[train_index_]
            train_labels = train_labels[train_index_]
        else:
            train_texts = np.concatenate(
                (
                    train_texts[train_index_],
                    unlabeled_texts_for_training
                )
            )
            train_labels = np.concatenate(
                (
                    train_labels[train_index_],
                    np.full(shape=(len(unlabeled_texts_for_training),), fill_value=-1, dtype=np.int32)
                )
            )
        cls.fit(train_texts, train_labels, validation_data=(val_texts, val_labels))
        print('')
        del train_texts, train_labels, val_texts, val_labels, train_index_, val_index
        if (not unlabeled_is_added) and (unlabeled_texts_for_testing is not None):
            y_pred.append(cls.predict(unlabeled_texts_for_testing))
            unlabeled_is_added = True
            y_true.append(np.full(shape=(len(unlabeled_texts_for_testing),), fill_value=-1, dtype=np.int32))
        y_pred.append(cls.predict(labeled_texts[test_index]))
        y_true.append(labels[test_index])
        del cls
    y_pred = np.concatenate(y_pred)
    y_true = np.concatenate(y_true)
    print('')
    if multioutput:
        for class_idx in range(len(classes_list)):
            y_true_ = np.zeros((len(y_true),), dtype=np.int32)
            y_pred_ = np.zeros((len(y_pred),), dtype=np.int32)
            for sample_idx in range(len(y_true)):
                if isinstance(y_true[sample_idx], set):
                    if class_idx in y_true[sample_idx]:
                        y_true_[sample_idx] = 1
                elif class_idx == y_true[sample_idx]:
                    y_true_[sample_idx] = 1
                if isinstance(y_pred[sample_idx], set):
                    if class_idx in y_pred[sample_idx]:
                        y_pred_[sample_idx] = 1
                elif class_idx == y_pred[sample_idx]:
                    y_pred_[sample_idx] = 1
            print(classification_report(y_true_, y_pred_, target_names=['OTHER', classes_list[class_idx]], digits=4))
    else:
        for sample_idx in range(len(y_true)):
            if y_true[sample_idx] < 0:
                y_true[sample_idx] = len(classes_list)
            if y_pred[sample_idx] < 0:
                y_pred[sample_idx] = len(classes_list)
        print(classification_report(y_true, y_pred, target_names=classes_list + ['UNKNOWN'], digits=4))
        print('')
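The per-class binarization loop in the multioutput branch can also be expressed with scikit-learn's MultiLabelBinarizer instead of the hand-rolled set/int checks. A sketch, assuming each label is either a class index or a set of class indices (negative indices mark unlabeled samples and are dropped, matching the loop above, which never counts them as a positive for any real class):

from sklearn.preprocessing import MultiLabelBinarizer

def multioutput_reports(y_true, y_pred, classes_list):
    def to_sets(y):
        # Wrap scalar labels into one-element sets and drop the -1 'unlabeled' marker.
        return [{c for c in (it if isinstance(it, set) else {it}) if c >= 0} for it in y]
    binarizer = MultiLabelBinarizer(classes=list(range(len(classes_list))))
    y_true_bin = binarizer.fit_transform(to_sets(y_true))
    y_pred_bin = binarizer.transform(to_sets(y_pred))
    for class_idx in range(len(classes_list)):
        print(classification_report(y_true_bin[:, class_idx], y_pred_bin[:, class_idx],
                                    target_names=['OTHER', classes_list[class_idx]], digits=4))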
Example 6: the SNIPS-2017 benchmark, with out-of-domain texts drawn from the Reuters and Brown corpora.
def main():
    random_seed = 42
    parser = ArgumentParser()
    parser.add_argument('-m',
                        '--model',
                        dest='model_name',
                        type=str,
                        required=True,
                        help='The binary file with the text classifier.')
    parser.add_argument(
        '-d',
        '--data_dir',
        dest='data_dir',
        type=str,
        required=True,
        help=
        'Path to the directory with SNIPS-2017 data (see `2017-06-custom-intent-engines` subfolder'
        ' of the repository https://github.com/snipsco/nlu-benchmark).')
    parser.add_argument(
        '--conv1',
        dest='size_of_conv1',
        type=int,
        required=False,
        default=200,
        help='Size of the Bayesian convolution layer with kernel size 1.')
    parser.add_argument(
        '--conv2',
        dest='size_of_conv2',
        type=int,
        required=False,
        default=200,
        help='Size of the Bayesian convolution layer with kernel size 2.')
    parser.add_argument(
        '--conv3',
        dest='size_of_conv3',
        type=int,
        required=False,
        default=200,
        help='Size of the Bayesian convolution layer with kernel size 3.')
    parser.add_argument(
        '--conv4',
        dest='size_of_conv4',
        type=int,
        required=False,
        default=200,
        help='Size of the Bayesian convolution layer with kernel size 4.')
    parser.add_argument(
        '--conv5',
        dest='size_of_conv5',
        type=int,
        required=False,
        default=200,
        help='Size of the Bayesian convolution layer with kernel size 5.')
    parser.add_argument(
        '--hidden',
        dest='hidden_layer_size',
        type=str,
        required=False,
        default='500',
        help=
        'Size of each hidden layer and the total number of hidden layers (separate them with a colon).'
    )
    parser.add_argument(
        '--num_monte_carlo',
        dest='num_monte_carlo',
        type=int,
        required=False,
        default=100,
        help='Number of generated Monte Carlo samples for each data sample.')
    parser.add_argument('--batch_size',
                        dest='batch_size',
                        type=int,
                        required=False,
                        default=64,
                        help='Size of mini-batch.')
    parser.add_argument(
        '--gpu_frac',
        dest='gpu_memory_frac',
        type=float,
        required=False,
        default=0.9,
        help='Fraction of GPU memory to allocate for the classifier.')
    parser.add_argument(
        '--nn_type',
        dest='nn_type',
        type=str,
        choices=['bayesian', 'usual', 'additional_class'],
        required=False,
        default='bayesian',
        help=
        'Neural network type: `bayesian`, `usual` or `additional_class` (the same as `usual`, '
        'but unlabeled samples are modeled as an additional class).')
    args = parser.parse_args()

    model_name = os.path.normpath(args.model_name)
    data_dir = os.path.normpath(args.data_dir)
    hidden_layer_size, n_hidden_layers = parse_hidden_layers_description(
        args.hidden_layer_size)

    train_data, val_data, test_data = read_snips2017_data(data_dir)
    print('Classes list: {0}'.format(sorted(list(set(train_data[1])))))
    print('Number of samples for training is {0}.'.format(len(train_data[0])))
    print('Number of samples for validation is {0}.'.format(len(val_data[0])))
    print('Number of samples for final testing is {0}.'.format(
        len(test_data[0])))
    generate_random_samples(train_data[0], train_data[1])
    print('')
    unlabeled_texts_for_training = load_reuters_corpus()
    unlabeled_texts_for_testing = load_brown_corpus()
    random.seed(random_seed)
    print(
        'Number of unlabeled (unknown) samples for training is {0}. For example:'
        .format(len(unlabeled_texts_for_training)))
    for it in random.sample(unlabeled_texts_for_training, 5):
        print('  {0}'.format(it))
    print(
        'Number of unlabeled (unknown) samples for final testing is {0}. For example:'
        .format(len(unlabeled_texts_for_testing)))
    for it in random.sample(unlabeled_texts_for_testing, 5):
        print('  {0}'.format(it))
    print('')

    if os.path.isfile(model_name):
        with open(model_name, 'rb') as fp:
            nn = pickle.load(fp)
    else:
        if args.nn_type == 'additional_class':
            random.shuffle(unlabeled_texts_for_training)
            n = int(round(0.15 * len(unlabeled_texts_for_training)))
            train_texts = train_data[0] + unlabeled_texts_for_training[n:]
            train_labels = train_data[1] + [
                'UNKNOWN'
                for _ in range(len(unlabeled_texts_for_training) - n)
            ]
            val_texts = val_data[0] + unlabeled_texts_for_training[:n]
            val_labels = val_data[1] + ['UNKNOWN' for _ in range(n)]
        else:
            train_texts = train_data[0] + unlabeled_texts_for_training
            train_labels = train_data[1] + [
                -1 for _ in range(len(unlabeled_texts_for_training))
            ]
            val_texts = val_data[0]
            val_labels = val_data[1]
        nn = ImpartialTextClassifier(filters_for_conv1=args.size_of_conv1,
                                     filters_for_conv2=args.size_of_conv2,
                                     filters_for_conv3=args.size_of_conv3,
                                     filters_for_conv4=args.size_of_conv4,
                                     filters_for_conv5=args.size_of_conv5,
                                     batch_size=args.batch_size,
                                     hidden_layer_size=hidden_layer_size,
                                     n_hidden_layers=n_hidden_layers,
                                     num_monte_carlo=args.num_monte_carlo,
                                     gpu_memory_frac=args.gpu_memory_frac,
                                     verbose=True,
                                     multioutput=False,
                                     random_seed=random_seed,
                                     validation_fraction=0.15,
                                     max_epochs=50,
                                     patience=5,
                                     bayesian=(args.nn_type == 'bayesian'),
                                     kl_weight_init=1.0,
                                     kl_weight_fin=0.001)
        nn.fit(train_texts,
               train_labels,
               validation_data=(val_texts, val_labels))
        print('')
        with open(model_name, 'wb') as fp:
            pickle.dump(nn, fp)
    test_texts = test_data[0] + unlabeled_texts_for_testing
    test_labels = test_data[1] + [
        'UNKNOWN' for _ in range(len(unlabeled_texts_for_testing))
    ]
    start_time = time.time()
    if args.nn_type == 'additional_class':
        y_pred = [
            nn.classes_reverse_index_[class_idx]
            for class_idx in nn.predict_proba(test_texts).argmax(axis=1)
        ]
    else:
        y_pred_ = nn.predict(test_texts)
        y_pred = []
        for sample_idx in range(len(y_pred_)):
            if is_string(y_pred_[sample_idx]):
                y_pred.append(y_pred_[sample_idx])
            else:
                if y_pred_[sample_idx] < 0:
                    y_pred.append('UNKNOWN')
                else:
                    y_pred.append(y_pred_[sample_idx])
    end_time = time.time()
    print('Duration of testing is {0:.3f} seconds.'.format(end_time -
                                                           start_time))
    print(
        'Mean duration of a single test sample recognition is {0:.3f} seconds.'
        .format((end_time - start_time) / float(len(test_texts))))
    print('Results of {0}:'.format(
        'bayesian neural network' if args.nn_type == 'bayesian' else (
            'usual neural network' if args.nn_type ==
            'usual' else 'usual neural network with additional class')))
    print(classification_report(test_labels, y_pred, digits=4))
    if args.nn_type != 'additional_class':
        print('')
        print('Results of {0} without UNKNOWN class:'.format(
            'bayesian neural network' if args.nn_type ==
            'bayesian' else 'usual neural network'))
        y_pred = [
            nn.classes_reverse_index_[class_idx]
            for class_idx in nn.predict_proba(test_data[0]).argmax(axis=1)
        ]
        print(classification_report(test_data[1], y_pred, digits=4))
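load_reuters_corpus and load_brown_corpus are not shown either. A plausible sketch using NLTK's bundled corpora follows; this is only an assumption about how the out-of-domain texts are produced, and the real helpers may clean or filter the sentences differently:

import nltk
from nltk.corpus import brown, reuters

def load_brown_corpus():
    nltk.download('brown', quiet=True)  # fetch the corpus on first use
    return [' '.join(cur_sent) for cur_sent in brown.sents()]

def load_reuters_corpus():
    nltk.download('reuters', quiet=True)
    return [' '.join(cur_sent) for cur_sent in reuters.sents()]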
Example 7: DSTC-2 multi-intent classification with per-class accuracy reporting.
def main():
    parser = ArgumentParser()
    parser.add_argument('-m', '--model', dest='model_name', type=str, required=True,
                        help='The binary file with the text classifier.')
    parser.add_argument('-t', '--train', dest='train_file_name', type=str, required=True,
                        help='Path to the archive with DSTC-2 training data.')
    parser.add_argument('-e', '--test', dest='test_file_name', type=str, required=True,
                        help='Path to the archive with DSTC-2 data for final testing.')
    parser.add_argument('--conv1', dest='size_of_conv1', type=int, required=False, default=20,
                        help='Size of the Bayesian convolution layer with kernel size 1.')
    parser.add_argument('--conv2', dest='size_of_conv2', type=int, required=False, default=20,
                        help='Size of the Bayesian convolution layer with kernel size 2.')
    parser.add_argument('--conv3', dest='size_of_conv3', type=int, required=False, default=20,
                        help='Size of the Bayesian convolution layer with kernel size 3.')
    parser.add_argument('--conv4', dest='size_of_conv4', type=int, required=False, default=20,
                        help='Size of the Bayesian convolution layer with kernel size 4.')
    parser.add_argument('--conv5', dest='size_of_conv5', type=int, required=False, default=20,
                        help='Size of the Bayesian convolution layer with kernel size 5.')
    parser.add_argument('--hidden', dest='hidden_layer_size', type=str, required=False, default='500',
                        help='Size of each hidden layer and the total number of hidden layers (separate them with a colon).')
    parser.add_argument('--num_monte_carlo', dest='num_monte_carlo', type=int, required=False, default=10,
                        help='Number of generated Monte Carlo samples for each data sample.')
    parser.add_argument('--batch_size', dest='batch_size', type=int, required=False, default=16,
                        help='Size of mini-batch.')
    parser.add_argument('--gpu_frac', dest='gpu_memory_frac', type=float, required=False, default=0.9,
                        help='Fraction of GPU memory to allocate for the classifier.')
    parser.add_argument('--nn_type', dest='nn_type', type=str, choices=['bayesian', 'usual'],
                        required=False, default='bayesian', help='Neural network type: `bayesian` or `usual`.')
    args = parser.parse_args()

    model_name = os.path.normpath(args.model_name)
    train_file_name = os.path.normpath(args.train_file_name)
    test_file_name = os.path.normpath(args.test_file_name)
    hidden_layer_size, n_hidden_layers = parse_hidden_layers_description(args.hidden_layer_size)

    if os.path.isfile(model_name):
        with open(model_name, 'rb') as fp:
            nn, train_classes = pickle.load(fp)
        print('Classes list: {0}'.format(train_classes))
        print('')
    else:
        train_texts, train_labels, train_classes = read_dstc2_data(train_file_name)
        print('Classes list: {0}'.format(train_classes))
        print('Number of samples for training is {0}.'.format(len(train_texts)))
        nn = ImpartialTextClassifier(filters_for_conv1=args.size_of_conv1, filters_for_conv2=args.size_of_conv2,
                                    filters_for_conv3=args.size_of_conv3, filters_for_conv4=args.size_of_conv4,
                                    filters_for_conv5=args.size_of_conv5,
                                    hidden_layer_size=hidden_layer_size, n_hidden_layers=n_hidden_layers,
                                    batch_size=args.batch_size, num_monte_carlo=args.num_monte_carlo,
                                    gpu_memory_frac=args.gpu_memory_frac, verbose=True, multioutput=True,
                                    random_seed=42, validation_fraction=0.15, max_epochs=100, patience=5,
                                    bayesian=(args.nn_type == 'bayesian'))
        nn.fit(train_texts, train_labels)
        print('')
        with open(model_name, 'wb') as fp:
            pickle.dump((nn, train_classes), fp)
    test_texts, test_labels, test_classes = read_dstc2_data(test_file_name, train_classes)
    assert test_classes == train_classes, 'Classes in the test set do not correspond to classes in the train set! ' \
                                          '{0}'.format(test_classes)
    print('')
    print('Number of samples for final testing is {0}.'.format(len(test_texts)))
    y_pred = nn.predict(test_texts)
    accuracy_by_classes = dict()
    for class_idx in range(nn.n_classes_):
        n_total = 0
        n_correct = 0
        for sample_idx in range(len(test_texts)):
            if isinstance(test_labels[sample_idx], set):
                if class_idx in test_labels[sample_idx]:
                    y_true_ = 1
                else:
                    y_true_ = 0
            else:
                if class_idx == test_labels[sample_idx]:
                    y_true_ = 1
                else:
                    y_true_ = 0
            if isinstance(y_pred[sample_idx], set):
                if class_idx in y_pred[sample_idx]:
                    y_pred_ = 1
                else:
                    y_pred_ = 0
            else:
                if class_idx == y_pred[sample_idx]:
                    y_pred_ = 1
                else:
                    y_pred_ = 0
            if y_true_ == y_pred_:
                n_correct += 1
            if y_true_ > 0:
                n_total += 1
        if n_total > 0:
            accuracy_by_classes[class_idx] = float(n_correct) / float(len(test_texts))
    total_accuracy = 0.0
    name_width = 0
    for class_idx in accuracy_by_classes.keys():
        total_accuracy += accuracy_by_classes[class_idx]
        if len(test_classes[class_idx]) > name_width:
            name_width = len(test_classes[class_idx])
    total_accuracy /= float(len(accuracy_by_classes))
    print('Total accuracy: {0:6.2%}'.format(total_accuracy))
    print('By classes:')
    for class_idx in sorted(list(accuracy_by_classes.keys())):
        print('  {0:<{1}} {2:6.2%}'.format(test_classes[class_idx], name_width, accuracy_by_classes[class_idx]))
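Because the script pickles the (classifier, classes) pair together, reusing the trained model later reduces to a few lines. A hypothetical snippet (the file name is illustrative) that follows the set/int conventions used above:

with open('dstc2_classifier.pkl', 'rb') as fp:
    nn, train_classes = pickle.load(fp)
prediction = nn.predict(['i want a moderately priced restaurant'])[0]
if isinstance(prediction, set):  # a multi-intent sample: a set of class indices
    print(sorted(train_classes[class_idx] for class_idx in prediction if class_idx >= 0))
elif prediction >= 0:
    print(train_classes[prediction])
else:
    print('UNKNOWN')  # the classifier abstained from every known intent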