def train_and_eval(embeddings_name, fold_count, use_ELMo=False):
    batch_size = 256
    if use_ELMo:
        batch_size = 20
    # NOTE: list_classes and class_weights here are module-level globals
    # defined elsewhere in the original script
    model = Classifier('citations',
                       "gru",
                       list_classes=list_classes,
                       max_epoch=70,
                       fold_number=fold_count,
                       use_roc_auc=True,
                       embeddings_name=embeddings_name,
                       use_ELMo=use_ELMo,
                       batch_size=batch_size,
                       class_weights=class_weights)

    print('loading citation sentiment corpus...')
    xtr, y = load_citation_sentiment_corpus(
        "data/textClassification/citations/citation_sentiment_corpus.txt")

    # segment train and eval sets
    x_train, y_train, x_test, y_test = split_data_and_labels(xtr, y, 0.9)

    if fold_count == 1:
        model.train(x_train, y_train)
    else:
        model.train_nfold(x_train, y_train)
    model.eval(x_test, y_test)

    # saving the model
    model.save()
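# ---------------------------------------------------------------------------
# All of these examples rely on split_data_and_labels() for a simple holdout
# split. A minimal sketch of such a helper, assuming it shuffles the data and
# cuts at the given ratio (the actual DeLFT utility may differ in details):
import numpy as np

def split_data_and_labels_sketch(x, y, ratio):
    # shuffle samples and labels with the same permutation
    indices = np.random.permutation(len(x))
    x, y = np.asarray(x)[indices], np.asarray(y)[indices]
    cut = int(len(x) * ratio)
    # first `ratio` of the data is for training, the remainder for evaluation
    return x[:cut], y[:cut], x[cut:], y[cut:]
# ---------------------------------------------------------------------------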
Example #2
def train_and_eval_primary(embeddings_name,
                           fold_count,
                           use_ELMo=False,
                           use_BERT=False,
                           architecture="gru"):
    print('loading dataset type corpus...')
    xtr, y, _, _, list_classes, _, _ = load_dataseer_corpus_csv(
        "data/textClassification/dataseer/all-1.csv")

    # distinct values of classes
    print(list_classes)
    print(len(list_classes), "classes")

    print(len(xtr), "texts")
    print(len(y), "labels")

    class_weights = None
    batch_size = 256
    maxlen = 300
    if use_ELMo:
        batch_size = 20
    elif use_BERT:
        batch_size = 50

    # default bert model parameters
    if "bert" in architecture:
        batch_size = 32
        maxlen = 100

    model = Classifier('dataseer',
                       model_type=architecture,
                       list_classes=list_classes,
                       max_epoch=100,
                       fold_number=fold_count,
                       patience=10,
                       use_roc_auc=True,
                       embeddings_name=embeddings_name,
                       use_ELMo=use_ELMo,
                       use_BERT=use_BERT,
                       batch_size=batch_size,
                       maxlen=maxlen,
                       class_weights=class_weights)

    # segment train and eval sets
    x_train, y_train, x_test, y_test = split_data_and_labels(xtr, y, 0.9)

    print(len(x_train), "train texts")
    print(len(y_train), "train labels")

    print(len(x_test), "eval texts")
    print(len(y_test), "eval labels")

    if fold_count == 1:
        model.train(x_train, y_train)
    else:
        model.train_nfold(x_train, y_train)
    model.eval(x_test, y_test)

    # saving the model
    model.save()
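# ---------------------------------------------------------------------------
# class_weights is left at None above; Example #5 shows the dict form
# ({0: 1.5, 1: 1.}). One common way to derive such weights from one-hot
# labels is inverse class frequency (an illustration, not the original
# script's approach):
import numpy as np

def inverse_frequency_weights(y):
    # per-class counts over the one-hot label matrix
    counts = np.asarray(y).sum(axis=0)
    weights = counts.max() / np.maximum(counts, 1)
    return {i: float(w) for i, w in enumerate(weights)}
# ---------------------------------------------------------------------------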
Example #3
def train_and_eval(embeddings_name,
                   fold_count,
                   use_ELMo=False,
                   use_BERT=False,
                   architecture="gru"):
    print('loading binary software use dataset...')
    xtr, y = load_software_use_corpus_json(
        "data/textClassification/software/software-use.json.gz")

    # count examples labeled as "used" (second column of the one-hot label)
    nb_used = 0
    for the_class in y:
        if the_class[1] == 1.0:
            nb_used += 1
    nb_unused = len(y) - nb_used
    print("\ttotal:", len(y))
    print("\tused:", nb_used)
    print("\tnot used:", nb_unused)

    model_name = 'software_use'
    class_weights = None
    if use_ELMo:
        model_name += '-with_ELMo'
    elif use_BERT:
        model_name += '-with_BERT'

    # segment train and eval sets
    x_train, y_train, x_test, y_test = split_data_and_labels(xtr, y, 0.9)

    batch_size, maxlen = configure(architecture, use_BERT, use_ELMo)

    print(list_classes)

    model = Classifier(model_name,
                       model_type=architecture,
                       list_classes=list_classes,
                       max_epoch=100,
                       fold_number=fold_count,
                       patience=10,
                       use_roc_auc=True,
                       embeddings_name=embeddings_name,
                       use_ELMo=use_ELMo,
                       use_BERT=use_BERT,
                       batch_size=batch_size,
                       maxlen=maxlen,
                       class_weights=class_weights)

    if fold_count == 1:
        model.train(x_train, y_train)
    else:
        model.train_nfold(x_train, y_train)
    model.eval(x_test, y_test)

    # saving the model
    model.save()
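# ---------------------------------------------------------------------------
# The two-value configure() called above returns (batch_size, maxlen). A
# plausible sketch reconstructed from the inline defaults of Example #2; the
# real helper may choose different values:
def configure_sketch(architecture, use_BERT=False, use_ELMo=False):
    batch_size, maxlen = 256, 300
    if use_ELMo:
        batch_size = 20
    elif use_BERT:
        batch_size = 50
    if "bert" in architecture:
        # BERT-style models need a smaller batch and a shorter input length
        batch_size, maxlen = 32, 100
    return batch_size, maxlen
# ---------------------------------------------------------------------------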
Example #4
def train_and_eval_binary(embeddings_name,
                          fold_count,
                          architecture="gru",
                          transformer=None):
    print('loading multiclass software context dataset...')
    xtr, y = load_software_context_corpus_json(
        "data/textClassification/software/software-contexts.json.gz")

    report_training_contexts(y)
    # segment train and eval sets
    x_train, y_train, x_test, y_test = split_data_and_labels(xtr, y, 0.9)

    for class_rank in range(len(list_classes)):
        model_name = 'software_context_' + list_classes[
            class_rank] + '_' + architecture
        class_weights = None

        batch_size, maxlen, patience, early_stop, max_epoch = configure(
            architecture)

        y_train_class_rank = [[1, 0] if y[class_rank] == 1.0 else [0, 1]
                              for y in y_train]
        y_test_class_rank = [[1, 0] if y[class_rank] == 1.0 else [0, 1]
                             for y in y_test]

        y_train_class_rank = np.array(y_train_class_rank)
        y_test_class_rank = np.array(y_test_class_rank)

        list_classes_rank = [
            list_classes[class_rank], "not_" + list_classes[class_rank]
        ]

        model = Classifier(model_name,
                           architecture=architecture,
                           list_classes=list_classes_rank,
                           max_epoch=max_epoch,
                           fold_number=fold_count,
                           patience=patience,
                           use_roc_auc=True,
                           embeddings_name=embeddings_name,
                           batch_size=batch_size,
                           maxlen=maxlen,
                           early_stop=early_stop,
                           class_weights=class_weights,
                           transformer_name=transformer)

        if fold_count == 1:
            model.train(x_train, y_train_class_rank)
        else:
            model.train_nfold(x_train, y_train_class_rank)
        model.eval(x_test, y_test_class_rank)
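# ---------------------------------------------------------------------------
# Note that these snippets mix two Classifier API generations: some pass
# model_type= with use_ELMo/use_BERT flags, others pass architecture= with
# transformer_name=.
#
# The list comprehensions above relabel the multiclass data one-vs-rest. An
# equivalent vectorized form, assuming y is a 2D one-hot NumPy array (an
# illustrative helper, not part of the original script):
import numpy as np

def one_vs_rest_labels(y, class_rank):
    positive = np.asarray(y)[:, class_rank] == 1.0
    # column 0: the target class, column 1: its "not_" complement
    return np.stack([positive, ~positive], axis=1).astype(float)
# ---------------------------------------------------------------------------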
Example #5
def train_and_eval_reuse(embeddings_name,
                         fold_count,
                         architecture="gru",
                         transformer=None):
    print('loading dataset type corpus...')
    xtr, y, _, _, list_classes, _, _ = load_dataseer_corpus_csv(
        "data/textClassification/dataseer/all-reuse.csv")

    # distinct values of classes
    print(list_classes)
    print(len(list_classes), "classes")

    print(len(xtr), "texts")
    print(len(y), "labels")

    batch_size, maxlen, patience, early_stop, max_epoch = configure(
        architecture)

    class_weights = {0: 1.5, 1: 1.}

    model = Classifier('dataseer-reuse_' + architecture,
                       architecture=architecture,
                       list_classes=list_classes,
                       max_epoch=max_epoch,
                       fold_number=fold_count,
                       use_roc_auc=True,
                       embeddings_name=embeddings_name,
                       batch_size=batch_size,
                       maxlen=maxlen,
                       patience=patience,
                       early_stop=early_stop,
                       class_weights=class_weights,
                       transformer_name=transformer)

    # segment train and eval sets
    x_train, y_train, x_test, y_test = split_data_and_labels(xtr, y, 0.9)

    print(len(x_train), "train texts")
    print(len(y_train), "train labels")

    print(len(x_test), "eval texts")
    print(len(y_test), "eval labels")

    if fold_count == 1:
        model.train(x_train, y_train)
    else:
        model.train_nfold(x_train, y_train)
    model.eval(x_test, y_test)

    # saving the model
    model.save()
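# ---------------------------------------------------------------------------
# The newer examples call a one-argument configure() returning five values.
# Only the signature is fixed by the call sites; the concrete numbers below
# are assumptions echoing the defaults seen in the earlier examples:
def configure_sketch_v2(architecture):
    batch_size, maxlen = 256, 300
    patience, early_stop, max_epoch = 10, True, 100
    if "bert" in architecture:
        batch_size, maxlen = 32, 100
    return batch_size, maxlen, patience, early_stop, max_epoch
# ---------------------------------------------------------------------------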
Example #6
def train_and_eval(embeddings_name,
                   fold_count,
                   architecture="gru",
                   transformer=None):
    print('loading binary software use dataset...')
    xtr, y = load_software_use_corpus_json(
        "data/textClassification/software/software-use.json.gz")

    nb_used = 0
    for the_class in y:
        if the_class[1] == 1.0:
            nb_used += 1
    nb_unused = len(y) - nb_used
    print("\ttotal:", len(y))
    print("\tused:", nb_used)
    print("\tnot used:", nb_unused)

    model_name = 'software_use_' + architecture
    class_weights = None

    # segment train and eval sets
    x_train, y_train, x_test, y_test = split_data_and_labels(xtr, y, 0.9)

    batch_size, maxlen, patience, early_stop, max_epoch = configure(
        architecture)

    print(list_classes)

    model = Classifier(model_name,
                       architecture=architecture,
                       list_classes=list_classes,
                       max_epoch=max_epoch,
                       fold_number=fold_count,
                       patience=patience,
                       use_roc_auc=True,
                       embeddings_name=embeddings_name,
                       batch_size=batch_size,
                       maxlen=maxlen,
                       early_stop=early_stop,
                       class_weights=class_weights,
                       transformer_name=transformer)

    if fold_count == 1:
        model.train(x_train, y_train)
    else:
        model.train_nfold(x_train, y_train)
    model.eval(x_test, y_test)

    # saving the model
    model.save()
Example #7
def train_and_eval(embeddings_name,
                   fold_count,
                   architecture="gru",
                   transformer=None):
    print('loading multiclass software context dataset...')
    xtr, y = load_software_context_corpus_json(
        "data/textClassification/software/software-contexts.json.gz")

    report_training_contexts(y)

    model_name = 'software_context_' + architecture
    class_weights = None

    # segment train and eval sets
    x_train, y_train, x_test, y_test = split_data_and_labels(xtr, y, 0.9)

    batch_size, maxlen, patience, early_stop, max_epoch = configure(
        architecture)

    print(list_classes)

    model = Classifier(model_name,
                       architecture=architecture,
                       list_classes=list_classes,
                       max_epoch=max_epoch,
                       fold_number=fold_count,
                       patience=patience,
                       use_roc_auc=True,
                       embeddings_name=embeddings_name,
                       batch_size=batch_size,
                       maxlen=maxlen,
                       early_stop=early_stop,
                       class_weights=class_weights,
                       transformer_name=transformer)

    if fold_count == 1:
        model.train(x_train, y_train)
    else:
        model.train_nfold(x_train, y_train)
    model.eval(x_test, y_test)

    # saving the model
    model.save()
Example #8
def train_and_eval(embeddings_name,
                   fold_count,
                   use_ELMo=False,
                   use_BERT=False,
                   architecture="gru"):
    batch_size, maxlen = configure(architecture, use_BERT, use_ELMo)
    maxlen = 150  # override the configured default for this task

    model = Classifier('citations',
                       model_type=architecture,
                       list_classes=list_classes,
                       max_epoch=100,
                       fold_number=fold_count,
                       patience=10,
                       use_roc_auc=True,
                       embeddings_name=embeddings_name,
                       use_ELMo=use_ELMo,
                       use_BERT=use_BERT,
                       batch_size=batch_size,
                       maxlen=maxlen,
                       class_weights=class_weights)

    print('loading citation sentiment corpus...')
    xtr, y = load_citation_sentiment_corpus(
        "data/textClassification/citations/citation_sentiment_corpus.txt")

    # segment train and eval sets
    x_train, y_train, x_test, y_test = split_data_and_labels(xtr, y, 0.9)

    if fold_count == 1:
        model.train(x_train, y_train)
    else:
        model.train_nfold(x_train, y_train)
    model.eval(x_test, y_test)

    # saving the model
    model.save()
Example #9
def train_and_eval(embeddings_name,
                   fold_count,
                   architecture="gru",
                   transformer=None):
    batch_size, maxlen, patience, early_stop, max_epoch = configure(
        architecture)

    model = Classifier('citations_' + architecture,
                       architecture=architecture,
                       list_classes=list_classes,
                       max_epoch=max_epoch,
                       fold_number=fold_count,
                       use_roc_auc=True,
                       embeddings_name=embeddings_name,
                       batch_size=batch_size,
                       maxlen=maxlen,
                       patience=patience,
                       early_stop=early_stop,
                       class_weights=class_weights,
                       transformer_name=transformer)

    print('loading citation sentiment corpus...')
    xtr, y = load_citation_sentiment_corpus(
        "data/textClassification/citations/citation_sentiment_corpus.txt")

    # segment train and eval sets
    x_train, y_train, x_test, y_test = split_data_and_labels(xtr, y, 0.9)

    if fold_count == 1:
        model.train(x_train, y_train)
    else:
        model.train_nfold(x_train, y_train)

    # saving the model
    model.save()

    model.eval(x_test, y_test)
Example #10
def train_eval_cascaded(embeddings_name,
                        fold_count,
                        use_ELMo=False,
                        use_BERT=False,
                        architecture="gru"):
    # general setting of parameters
    class_weights = None
    batch_size = 256
    maxlen = 300
    if use_ELMo:
        batch_size = 20
    elif use_BERT:
        batch_size = 50

    # default bert model parameters
    if "bert" in architecture:
        batch_size = 32
        maxlen = 100

    # first binary classifier: dataset or no_dataset
    xtr, y, _, _, list_classes, _, _ = load_dataseer_corpus_csv(
        "data/textClassification/dataseer/all-binary.csv")

    print(list_classes)

    model_binary = Classifier('dataseer-binary',
                              model_type=architecture,
                              list_classes=list_classes,
                              max_epoch=100,
                              fold_number=fold_count,
                              patience=10,
                              use_roc_auc=True,
                              embeddings_name=embeddings_name,
                              use_ELMo=use_ELMo,
                              use_BERT=use_BERT,
                              batch_size=batch_size,
                              maxlen=maxlen,
                              class_weights=class_weights)

    # segment train and eval sets
    x_train, y_train, x_test, y_test = split_data_and_labels(xtr, y, 0.9)

    if fold_count == 1:
        model_binary.train(x_train, y_train)
    else:
        model_binary.train_nfold(x_train, y_train)
    model_binary.eval(x_test, y_test)

    x_test_binary = x_test
    y_test_binary = y_test

    # second, the first level datatype taxonomy for sentences classified as dataset
    xtr, y_classes, y_subclasses, y_leafclasses, list_classes, list_subclasses, list_leaf_classes = load_dataseer_corpus_csv(
        "data/textClassification/dataseer/all-1.csv")
    # drop the no_dataset class, ignore the earlier eval split, and build the
    # first-level classifier

    ind = list_classes.index('no_dataset')
    to_remove = vectorizer(ind, len(list_classes))

    x_train, y_train = filter_exclude_class(xtr, y_classes, to_remove)
    # drop the no_dataset column from the one-hot label vectors
    y_train = np.delete(np.asarray(y_train), ind, axis=1)

    list_classes.remove('no_dataset')

    model_first = Classifier('dataseer-first',
                             model_type=architecture,
                             list_classes=list_classes,
                             max_epoch=100,
                             fold_number=fold_count,
                             patience=10,
                             use_roc_auc=True,
                             embeddings_name=embeddings_name,
                             use_ELMo=use_ELMo,
                             use_BERT=use_BERT,
                             batch_size=batch_size,
                             maxlen=maxlen,
                             class_weights=class_weights)

    if fold_count == 1:
        model_first.train(x_train, y_train)
    else:
        model_first.train_nfold(x_train, y_train)
    model_first.eval(x_test, y_test)

    # eval by cascading
    result_binary = model_binary.predict(x_test_binary,
                                         output_format='default')
    result_first = model_first.predict(x_test, output_format='default')

    # select sequences classified as dataset
    result_intermediate = np.asarray(
        [np.argmax(line) for line in result_binary])

    def vectorize(index, size):
        result = np.zeros(size)
        if index < size:
            result[index] = 1
        return result

    result_binary = np.array(
        [vectorize(xi, len(list_classes)) for xi in result_intermediate])
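# ---------------------------------------------------------------------------
# The vectorizer() helper used above and in the following examples is
# presumably the module-level counterpart of the local vectorize(): a one-hot
# encoder. A minimal sketch, assuming that behavior:
import numpy as np

def vectorizer_sketch(index, size):
    one_hot = np.zeros(size)
    one_hot[index] = 1
    return one_hot
# ---------------------------------------------------------------------------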
Example #11
def train_and_eval_secondary(embeddings_name,
                             fold_count,
                             use_ELMo=False,
                             use_BERT=False,
                             architecture="gru"):
    print('training second-level dataset subtype classifiers...')
    xtr, y1, y2, _, list_classes, list_subclasses, _ = load_dataseer_corpus_csv(
        "data/textClassification/dataseer/all-1.csv")
    # aggregate by class, we will have one training set per class

    print(list_classes)
    print(list_subclasses)
    print(len(list_classes), "classes")
    print(len(list_subclasses), "sub-classes")

    class_weights = None
    batch_size = 256
    maxlen = 300
    if use_ELMo:
        batch_size = 20
    elif use_BERT:
        batch_size = 50

    # default bert model parameters
    if "bert" in architecture:
        batch_size = 32
        maxlen = 100

    datatypes_y = {}
    datatypes_xtr = {}
    datatypes_list_subclasses = {}
    for i in range(len(xtr)):
        # recover class and subclass names from the one-hot label vectors
        ind1 = np.where(y1[i] == 1)[0][0]
        ind2 = np.where(y2[i] == 1)[0][0]
        datatype = list_classes[ind1]
        datasubtype = list_subclasses[ind2]
        # group texts and subclass labels by their top-level datatype
        datatypes_y.setdefault(datatype, []).append(datasubtype)
        datatypes_xtr.setdefault(datatype, []).append(xtr[i])
        subclasses = datatypes_list_subclasses.setdefault(datatype, [])
        if datasubtype not in subclasses:
            subclasses.append(datasubtype)

    print(datatypes_list_subclasses)

    for the_class in list_classes:
        print('\ntraining', the_class)
        if the_class not in datatypes_list_subclasses:
            print('no subclass for', the_class)
            continue

        if len(datatypes_list_subclasses[the_class]) <= 1:
            print('only one subclass for', the_class)
            continue

        # skip classes whose only second subclass is the 'nan' placeholder
        if len(datatypes_list_subclasses[the_class]) == 2 \
                and 'nan' in datatypes_list_subclasses[the_class]:
            continue

        if the_class == 'Protein Data':
            continue

        print('subtypes to be classified:',
              datatypes_list_subclasses[the_class])

        model_name = 'dataseer-' + the_class
        if use_ELMo:
            model_name += '-with_ELMo'
        elif use_BERT:
            model_name += '-with_BERT'

        model = Classifier(model_name,
                           model_type=architecture,
                           list_classes=datatypes_list_subclasses[the_class],
                           max_epoch=100,
                           fold_number=fold_count,
                           patience=10,
                           use_roc_auc=True,
                           embeddings_name=embeddings_name,
                           use_ELMo=use_ELMo,
                           use_BERT=use_BERT,
                           batch_size=batch_size,
                           maxlen=maxlen,
                           class_weights=class_weights)

        # we need to vectorize the y according to the actual list of classes
        local_y = []
        for the_y in datatypes_y[the_class]:
            the_ind = datatypes_list_subclasses[the_class].index(the_y)
            local_y.append(
                vectorizer(the_ind, len(datatypes_list_subclasses[the_class])))

        # segment train and eval sets
        x_train, y_train, x_test, y_test = split_data_and_labels(
            np.asarray(datatypes_xtr[the_class]), np.asarray(local_y), 0.9)

        if fold_count == 1:
            model.train(x_train, y_train)
        else:
            model.train_nfold(x_train, y_train)
        model.eval(x_test, y_test)
Example #12
def train_and_eval_secondary(embeddings_name,
                             fold_count,
                             architecture="gru",
                             transformer=None):
    print('training second-level dataset subtype classifiers...')
    xtr, y1, y2, _, list_classes, list_subclasses, _ = load_dataseer_corpus_csv(
        "data/textClassification/dataseer/all-multilevel.csv")
    # aggregate by class, we will have one training set per class

    print(list_classes)
    print(list_subclasses)
    print(len(list_classes), "classes")
    print(len(list_subclasses), "sub-classes")

    class_weights = None
    batch_size, maxlen, patience, early_stop, max_epoch = configure(
        architecture)

    datatypes_y = {}
    datatypes_xtr = {}
    datatypes_list_subclasses = {}
    for i in range(len(xtr)):
        # recover class and subclass names from the one-hot label vectors
        ind1 = np.where(y1[i] == 1)[0][0]
        ind2 = np.where(y2[i] == 1)[0][0]
        datatype = list_classes[ind1]
        datasubtype = list_subclasses[ind2]
        # group texts and subclass labels by their top-level datatype
        datatypes_y.setdefault(datatype, []).append(datasubtype)
        datatypes_xtr.setdefault(datatype, []).append(xtr[i])
        subclasses = datatypes_list_subclasses.setdefault(datatype, [])
        if datasubtype not in subclasses:
            subclasses.append(datasubtype)

    print(datatypes_list_subclasses)

    for the_class in list_classes:
        print('\ntraining', the_class)
        if the_class not in datatypes_list_subclasses:
            print('no subclass for', the_class)
            continue

        if len(datatypes_list_subclasses[the_class]) <= 1:
            print('only one subclass for', the_class)
            continue

        # skip classes whose only second subclass is the 'nan' placeholder
        if len(datatypes_list_subclasses[the_class]) == 2 \
                and 'nan' in datatypes_list_subclasses[the_class]:
            continue

        if the_class == 'Protein Data':
            continue

        print('subtypes to be classified:',
              datatypes_list_subclasses[the_class])

        model_name = 'dataseer-' + the_class + "_" + architecture

        model = Classifier(model_name,
                           architecture=architecture,
                           list_classes=datatypes_list_subclasses[the_class],
                           max_epoch=max_epoch,
                           fold_number=fold_count,
                           use_roc_auc=True,
                           embeddings_name=embeddings_name,
                           batch_size=batch_size,
                           maxlen=maxlen,
                           patience=patience,
                           early_stop=early_stop,
                           class_weights=class_weights,
                           transformer_name=transformer)

        # we need to vectorize the y according to the actual list of classes
        local_y = []
        for the_y in datatypes_y[the_class]:
            the_ind = datatypes_list_subclasses[the_class].index(the_y)
            local_y.append(
                vectorizer(the_ind, len(datatypes_list_subclasses[the_class])))

        # segment train and eval sets
        x_train, y_train, x_test, y_test = split_data_and_labels(
            np.asarray(datatypes_xtr[the_class]), np.asarray(local_y), 0.9)

        if fold_count == 1:
            model.train(x_train, y_train)
        else:
            model.train_nfold(x_train, y_train)
        model.eval(x_test, y_test)
        # saving the model
        model.save()
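# ---------------------------------------------------------------------------
# Hypothetical invocation of the functions above; in the original application
# scripts these arguments come from the command line, and the embeddings name
# shown here is only illustrative:
#
#     train_and_eval_secondary("glove-840B", 1, architecture="gru")
# ---------------------------------------------------------------------------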