def train_and_eval(embeddings_name, fold_count, use_ELMo=False):
    # list_classes and class_weights are assumed to be module-level globals
    batch_size = 256
    if use_ELMo:
        # contextual ELMo embeddings are memory-hungry, hence the much smaller batch
        batch_size = 20

    model = Classifier('citations', "gru", list_classes=list_classes, max_epoch=70,
                       fold_number=fold_count, use_roc_auc=True,
                       embeddings_name=embeddings_name, use_ELMo=use_ELMo,
                       batch_size=batch_size, class_weights=class_weights)

    print('loading citation sentiment corpus...')
    xtr, y = load_citation_sentiment_corpus(
        "data/textClassification/citations/citation_sentiment_corpus.txt")

    # segment train and eval sets
    x_train, y_train, x_test, y_test = split_data_and_labels(xtr, y, 0.9)

    if fold_count == 1:
        model.train(x_train, y_train)
    else:
        model.train_nfold(x_train, y_train)
    model.eval(x_test, y_test)

    # saving the model
    model.save()
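# split_data_and_labels() is imported from the library's reader module and is not
# shown in this file. Below is a minimal sketch of its assumed contract -- a shuffled
# ratio-based split returning (x_train, y_train, x_test, y_test) -- for documentation
# only; it is named _split_data_and_labels_sketch to avoid shadowing the real import.

import numpy as np  # np is also relied on by several functions below


def _split_data_and_labels_sketch(x, y, ratio):
    # shuffle x and y in unison, then cut at the given train ratio (e.g. 0.9)
    indices = np.arange(len(x))
    np.random.shuffle(indices)
    x, y = np.asarray(x)[indices], np.asarray(y)[indices]
    cut = int(len(x) * ratio)
    return x[:cut], y[:cut], x[cut:], y[cut:]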
def train_and_eval_primary(embeddings_name, fold_count, use_ELMo=False, use_BERT=False, architecture="gru"):
    print('loading dataset type corpus...')
    xtr, y, _, _, list_classes, _, _ = load_dataseer_corpus_csv(
        "data/textClassification/dataseer/all-1.csv")

    # distinct values of classes
    print(list_classes)
    print(len(list_classes), "classes")

    print(len(xtr), "texts")
    print(len(y), "classes")

    class_weights = None
    batch_size = 256
    maxlen = 300
    if use_ELMo:
        batch_size = 20
    elif use_BERT:
        batch_size = 50

    # default bert model parameters
    if architecture.find("bert") != -1:
        batch_size = 32
        maxlen = 100

    model = Classifier('dataseer', model_type=architecture, list_classes=list_classes,
                       max_epoch=100, fold_number=fold_count, patience=10,
                       use_roc_auc=True, embeddings_name=embeddings_name,
                       use_ELMo=use_ELMo, use_BERT=use_BERT, batch_size=batch_size,
                       maxlen=maxlen, class_weights=class_weights)

    # segment train and eval sets
    x_train, y_train, x_test, y_test = split_data_and_labels(xtr, y, 0.9)
    print(len(x_train), "train texts")
    print(len(y_train), "train classes")
    print(len(x_test), "eval texts")
    print(len(y_test), "eval classes")

    if fold_count == 1:
        model.train(x_train, y_train)
    else:
        model.train_nfold(x_train, y_train)
    model.eval(x_test, y_test)

    # saving the model
    model.save()
def train_and_eval(embeddings_name, fold_count, use_ELMo=False, use_BERT=False, architecture="gru"):
    print('loading binary software use dataset...')
    xtr, y = load_software_use_corpus_json(
        "data/textClassification/software/software-use.json.gz")

    nb_used = 0
    for the_class in y:
        if the_class[1] == 1.0:
            nb_used += 1
    nb_unused = len(y) - nb_used
    print("\ttotal:", len(y))
    print("\tused:", nb_used)
    print("\tnot used:", nb_unused)

    model_name = 'software_use'
    class_weights = None

    if use_ELMo:
        model_name += '-with_ELMo'
    elif use_BERT:
        model_name += '-with_BERT'

    # segment train and eval sets
    x_train, y_train, x_test, y_test = split_data_and_labels(xtr, y, 0.9)

    batch_size, maxlen = configure(architecture, use_BERT, use_ELMo)

    # list_classes is assumed to be a module-level global; the counting loop above
    # implies that index 1 corresponds to the 'used' class
    print(list_classes)

    model = Classifier(model_name, model_type=architecture, list_classes=list_classes,
                       max_epoch=100, fold_number=fold_count, patience=10,
                       use_roc_auc=True, embeddings_name=embeddings_name,
                       use_ELMo=use_ELMo, use_BERT=use_BERT, batch_size=batch_size,
                       maxlen=maxlen, class_weights=class_weights)

    if fold_count == 1:
        model.train(x_train, y_train)
    else:
        model.train_nfold(x_train, y_train)
    model.eval(x_test, y_test)

    # saving the model
    model.save()
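# The two-value configure(architecture, use_BERT, use_ELMo) called above is not
# defined in this file. A minimal sketch, assuming it simply factors out the inline
# batch-size/maxlen logic that train_and_eval_primary() and train_eval_cascaded()
# in this file repeat verbatim:

def configure(architecture, use_BERT=False, use_ELMo=False):
    batch_size = 256
    maxlen = 300
    if use_ELMo:
        # contextual ELMo embeddings are memory-hungry, hence the small batch
        batch_size = 20
    elif use_BERT:
        batch_size = 50
    # default bert model parameters
    if architecture.find("bert") != -1:
        batch_size = 32
        maxlen = 100
    return batch_size, maxlen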
def train_and_eval_binary(embeddings_name, fold_count, architecture="gru", transformer=None):
    print('loading multiclass software context dataset...')
    xtr, y = load_software_context_corpus_json(
        "data/textClassification/software/software-contexts.json.gz")

    report_training_contexts(y)

    # segment train and eval sets
    x_train, y_train, x_test, y_test = split_data_and_labels(xtr, y, 0.9)

    # train one binary (one-vs-rest) classifier per context class;
    # list_classes is assumed to be a module-level global
    for class_rank in range(len(list_classes)):
        model_name = 'software_context_' + list_classes[class_rank] + '_' + architecture
        class_weights = None

        batch_size, maxlen, patience, early_stop, max_epoch = configure(architecture)

        # binarize the multilabel vectors for the current class (a distinct loop
        # variable is used here to avoid shadowing the outer y)
        y_train_class_rank = [[1, 0] if y_vec[class_rank] == 1.0 else [0, 1] for y_vec in y_train]
        y_test_class_rank = [[1, 0] if y_vec[class_rank] == 1.0 else [0, 1] for y_vec in y_test]

        y_train_class_rank = np.array(y_train_class_rank)
        y_test_class_rank = np.array(y_test_class_rank)

        list_classes_rank = [list_classes[class_rank], "not_" + list_classes[class_rank]]

        model = Classifier(model_name, architecture=architecture,
                           list_classes=list_classes_rank, max_epoch=max_epoch,
                           fold_number=fold_count, patience=patience,
                           use_roc_auc=True, embeddings_name=embeddings_name,
                           batch_size=batch_size, maxlen=maxlen, early_stop=early_stop,
                           class_weights=class_weights, transformer_name=transformer)

        if fold_count == 1:
            model.train(x_train, y_train_class_rank)
        else:
            model.train_nfold(x_train, y_train_class_rank)
        model.eval(x_test, y_test_class_rank)
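# The transformer-era functions above and below call a configure(architecture)
# variant that also returns training-schedule parameters. A minimal sketch follows;
# the concrete numbers are illustrative assumptions, not values from the original
# scripts, and this variant would replace (not coexist with) the two-argument
# sketch earlier in this file.

def configure(architecture):
    batch_size = 256
    maxlen = 300
    patience = 5
    early_stop = True
    max_epoch = 60
    if "bert" in architecture:
        # transformer fine-tuning: small batch, short sequences, few epochs
        batch_size = 32
        maxlen = 100
        early_stop = False
        max_epoch = 3
    return batch_size, maxlen, patience, early_stop, max_epoch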
def train_and_eval_reuse(embeddings_name, fold_count, architecture="gru", transformer=None):
    print('loading dataset type corpus...')
    xtr, y, _, _, list_classes, _, _ = load_dataseer_corpus_csv(
        "data/textClassification/dataseer/all-reuse.csv")

    # distinct values of classes
    print(list_classes)
    print(len(list_classes), "classes")

    print(len(xtr), "texts")
    print(len(y), "classes")

    batch_size, maxlen, patience, early_stop, max_epoch = configure(architecture)

    # weight the first class slightly higher
    class_weights = {0: 1.5, 1: 1.}

    model = Classifier('dataseer-reuse_' + architecture, architecture=architecture,
                       list_classes=list_classes, max_epoch=max_epoch,
                       fold_number=fold_count, use_roc_auc=True,
                       embeddings_name=embeddings_name, batch_size=batch_size,
                       maxlen=maxlen, patience=patience, early_stop=early_stop,
                       class_weights=class_weights, transformer_name=transformer)

    # segment train and eval sets
    x_train, y_train, x_test, y_test = split_data_and_labels(xtr, y, 0.9)
    print(len(x_train), "train texts")
    print(len(y_train), "train classes")
    print(len(x_test), "eval texts")
    print(len(y_test), "eval classes")

    if fold_count == 1:
        model.train(x_train, y_train)
    else:
        model.train_nfold(x_train, y_train)
    model.eval(x_test, y_test)

    # saving the model
    model.save()
def train_and_eval(embeddings_name, fold_count, architecture="gru", transformer=None): print('loading binary software use dataset...') xtr, y = load_software_use_corpus_json( "data/textClassification/software/software-use.json.gz") nb_used = 0 for the_class in y: if the_class[1] == 1.0: nb_used += 1 nb_unused = len(y) - nb_used print("\ttotal:", len(y)) print("\tused:", nb_used) print("\tnot used:", nb_unused) model_name = 'software_use_' + architecture class_weights = None # segment train and eval sets x_train, y_train, x_test, y_test = split_data_and_labels(xtr, y, 0.9) batch_size, maxlen, patience, early_stop, max_epoch = configure( architecture) print(list_classes) model = Classifier(model_name, architecture=architecture, list_classes=list_classes, max_epoch=max_epoch, fold_number=fold_count, patience=patience, use_roc_auc=True, embeddings_name=embeddings_name, batch_size=batch_size, maxlen=maxlen, early_stop=early_stop, class_weights=class_weights, transformer_name=transformer) if fold_count == 1: model.train(x_train, y_train) else: model.train_nfold(x_train, y_train) model.eval(x_test, y_test) # saving the model model.save()
def train_and_eval(embeddings_name, fold_count, architecture="gru", transformer=None): print('loading multiclass software context dataset...') xtr, y = load_software_context_corpus_json( "data/textClassification/software/software-contexts.json.gz") report_training_contexts(y) model_name = 'software_context_' + architecture class_weights = None # segment train and eval sets x_train, y_train, x_test, y_test = split_data_and_labels(xtr, y, 0.9) batch_size, maxlen, patience, early_stop, max_epoch = configure( architecture) print(list_classes) model = Classifier(model_name, architecture=architecture, list_classes=list_classes, max_epoch=max_epoch, fold_number=fold_count, patience=patience, use_roc_auc=True, embeddings_name=embeddings_name, batch_size=batch_size, maxlen=maxlen, early_stop=early_stop, class_weights=class_weights, transformer_name=transformer) if fold_count == 1: model.train(x_train, y_train) else: model.train_nfold(x_train, y_train) model.eval(x_test, y_test) # saving the model model.save()
def train_and_eval(embeddings_name, fold_count, use_ELMo=False, use_BERT=False, architecture="gru"): batch_size, maxlen = configure(architecture, use_BERT, use_ELMo) maxlen = 150 model = Classifier('citations', model_type=architecture, list_classes=list_classes, max_epoch=100, fold_number=fold_count, patience=10, use_roc_auc=True, embeddings_name=embeddings_name, use_ELMo=use_ELMo, use_BERT=use_BERT, batch_size=batch_size, maxlen=maxlen, class_weights=class_weights) print('loading citation sentiment corpus...') xtr, y = load_citation_sentiment_corpus( "data/textClassification/citations/citation_sentiment_corpus.txt") # segment train and eval sets x_train, y_train, x_test, y_test = split_data_and_labels(xtr, y, 0.9) if fold_count == 1: model.train(x_train, y_train) else: model.train_nfold(x_train, y_train) model.eval(x_test, y_test) # saving the model model.save()
def train_and_eval(embeddings_name, fold_count, architecture="gru", transformer=None): batch_size, maxlen, patience, early_stop, max_epoch = configure( architecture) model = Classifier('citations_' + architecture, architecture=architecture, list_classes=list_classes, max_epoch=max_epoch, fold_number=fold_count, use_roc_auc=True, embeddings_name=embeddings_name, batch_size=batch_size, maxlen=maxlen, patience=patience, early_stop=early_stop, class_weights=class_weights, transformer_name=transformer) print('loading citation sentiment corpus...') xtr, y = load_citation_sentiment_corpus( "data/textClassification/citations/citation_sentiment_corpus.txt") # segment train and eval sets x_train, y_train, x_test, y_test = split_data_and_labels(xtr, y, 0.9) if fold_count == 1: model.train(x_train, y_train) else: model.train_nfold(x_train, y_train) # saving the model model.save() model.eval(x_test, y_test)
def train_eval_cascaded(embeddings_name, fold_count, use_ELMo=False, use_BERT=False, architecture="gru"):
    # general setting of parameters
    class_weights = None
    batch_size = 256
    maxlen = 300
    if use_ELMo:
        batch_size = 20
    elif use_BERT:
        batch_size = 50

    # default bert model parameters
    if architecture.find("bert") != -1:
        batch_size = 32
        maxlen = 100

    # first binary classifier: dataset or no_dataset
    xtr, y, _, _, list_classes, _, _ = load_dataseer_corpus_csv(
        "data/textClassification/dataseer/all-binary.csv")

    print(list_classes)

    model_binary = Classifier('dataseer-binary', model_type=architecture,
                              list_classes=list_classes, max_epoch=100,
                              fold_number=fold_count, patience=10, use_roc_auc=True,
                              embeddings_name=embeddings_name, use_ELMo=use_ELMo,
                              use_BERT=use_BERT, batch_size=batch_size, maxlen=maxlen,
                              class_weights=class_weights)

    # segment train and eval sets
    x_train, y_train, x_test, y_test = split_data_and_labels(xtr, y, 0.9)

    if fold_count == 1:
        model_binary.train(x_train, y_train)
    else:
        model_binary.train_nfold(x_train, y_train)
    model_binary.eval(x_test, y_test)

    x_test_binary = x_test
    y_test_binary = y_test

    # second, the first-level datatype taxonomy for sentences classified as dataset
    (xtr, y_classes, y_subclasses, y_leafclasses,
     list_classes, list_subclasses, list_leaf_classes) = load_dataseer_corpus_csv(
        "data/textClassification/dataseer/all-1.csv")

    # ignore the no_dataset class and the first eval set, build the first-level classifier
    ind = list_classes.index('no_dataset')
    to_remove = vectorizer(ind, len(list_classes))

    x_train, y_train = filter_exclude_class(xtr, y_classes, to_remove)

    # drop the no_dataset column from the label vectors
    y_train2 = np.zeros(shape=(len(y_train), len(list_classes) - 1))
    for i in range(0, len(y_train)):
        y_train2[i] = np.delete(y_train[i], ind)
    y_train = y_train2

    list_classes.remove('no_dataset')

    model_first = Classifier('dataseer-first', model_type=architecture,
                             list_classes=list_classes, max_epoch=100,
                             fold_number=fold_count, patience=10, use_roc_auc=True,
                             embeddings_name=embeddings_name, use_ELMo=use_ELMo,
                             use_BERT=use_BERT, batch_size=batch_size, maxlen=maxlen,
                             class_weights=class_weights)

    if fold_count == 1:
        model_first.train(x_train, y_train)
    else:
        model_first.train_nfold(x_train, y_train)
    # note: x_test and y_test here are still the ones from the binary split above
    model_first.eval(x_test, y_test)

    # eval by cascading
    result_binary = model_binary.predict(x_test_binary, output_format='default')
    result_first = model_first.predict(x_test, output_format='default')

    # select sequences classified as dataset
    result_intermediate = np.asarray([np.argmax(line) for line in result_binary])

    def vectorize(index, size):
        result = np.zeros(size)
        if index < size:
            result[index] = 1
        return result

    result_binary = np.array([vectorize(xi, len(list_classes)) for xi in result_intermediate])
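# train_eval_cascaded() above stops right after re-vectorizing the binary gate
# predictions. A minimal sketch of how the cascade could be finished is given below:
# keep 'no_dataset' whenever the gate rejects a sentence, otherwise report the
# first-level argmax. combine_cascade() and its ind_no_dataset parameter are
# hypothetical, not part of the original code.

def combine_cascade(result_binary, result_first, list_classes, ind_no_dataset):
    # result_binary: binary gate predictions; result_first: first-level predictions;
    # list_classes: first-level class names with 'no_dataset' already removed
    final_labels = []
    for gate, first in zip(result_binary, result_first):
        if np.argmax(gate) == ind_no_dataset:
            # gate rejected the sentence: no dataset mentioned
            final_labels.append('no_dataset')
        else:
            # gate accepted it: report the predicted first-level datatype
            final_labels.append(list_classes[np.argmax(first)])
    return final_labels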
def train_and_eval_secondary(embeddings_name, fold_count, use_ELMo=False, use_BERT=False, architecture="gru"):
    print('training second-level dataset subtype corpus...')
    xtr, y1, y2, _, list_classes, list_subclasses, _ = load_dataseer_corpus_csv(
        "data/textClassification/dataseer/all-1.csv")

    # aggregate by class, we will have one training set per class
    print(list_classes)
    print(list_subclasses)
    print(len(list_classes), "classes")
    print(len(list_subclasses), "sub-classes")

    class_weights = None
    batch_size = 256
    maxlen = 300
    if use_ELMo:
        batch_size = 20
    elif use_BERT:
        batch_size = 50

    # default bert model parameters
    if architecture.find("bert") != -1:
        batch_size = 32
        maxlen = 100

    datatypes_y = {}
    datatypes_xtr = {}
    datatypes_list_subclasses = {}
    for i in range(0, len(xtr)):
        #print(np.where(y2[i] == 1))
        ind1 = np.where(y1[i] == 1)[0][0]
        ind2 = np.where(y2[i] == 1)[0][0]
        #print(ind2)
        datatype = list_classes[ind1]
        datasubtype = list_subclasses[ind2]
        #print(str(xtr[i]), datatype, datasubtype)
        if datatype in datatypes_y:
            datatypes_y[datatype].append(datasubtype)
            datatypes_xtr[datatype].append(xtr[i])
            if not datasubtype in datatypes_list_subclasses[datatype]:
                datatypes_list_subclasses[datatype].append(datasubtype)
        else:
            datatypes_y[datatype] = []
            datatypes_y[datatype].append(datasubtype)
            datatypes_xtr[datatype] = []
            datatypes_xtr[datatype].append(xtr[i])
            datatypes_list_subclasses[datatype] = []
            datatypes_list_subclasses[datatype].append(datasubtype)

    print(datatypes_list_subclasses)

    for the_class in list_classes:
        print('\ntraining', the_class)
        if not the_class in datatypes_list_subclasses:
            print('no subclass for', the_class)
            continue
        if len(datatypes_list_subclasses[the_class]) <= 1:
            print('only one subclass for', the_class)
            continue
        if len(datatypes_list_subclasses[the_class]) == 2 and 'nan' in datatypes_list_subclasses[the_class]:
            continue
        if the_class == 'Protein Data':
            continue

        print('subtypes to be classified:', datatypes_list_subclasses[the_class])

        model_name = 'dataseer-' + the_class
        if use_ELMo:
            model_name += '-with_ELMo'
        elif use_BERT:
            model_name += '-with_BERT'

        model = Classifier(model_name, model_type=architecture,
                           list_classes=datatypes_list_subclasses[the_class],
                           max_epoch=100, fold_number=fold_count, patience=10,
                           use_roc_auc=True, embeddings_name=embeddings_name,
                           use_ELMo=use_ELMo, use_BERT=use_BERT, batch_size=batch_size,
                           maxlen=maxlen, class_weights=class_weights)

        # we need to vectorize the y according to the actual list of classes
        local_y = []
        for the_y in datatypes_y[the_class]:
            the_ind = datatypes_list_subclasses[the_class].index(the_y)
            local_y.append(vectorizer(the_ind, len(datatypes_list_subclasses[the_class])))

        # segment train and eval sets
        x_train, y_train, x_test, y_test = split_data_and_labels(
            np.asarray(datatypes_xtr[the_class]), np.asarray(local_y), 0.9)

        if fold_count == 1:
            model.train(x_train, y_train)
        else:
            model.train_nfold(x_train, y_train)
        model.eval(x_test, y_test)
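# vectorizer(index, size), used by the secondary and cascaded training functions,
# is not defined in this file. Judging from the local vectorize() helper inside
# train_eval_cascaded() above, it is a one-hot encoder; a sketch under that assumption:

def vectorizer(index, size):
    # build a one-hot vector of length `size` with a 1 at position `index`
    result = np.zeros(size)
    if index < size:
        result[index] = 1
    return result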
def train_and_eval_secondary(embeddings_name, fold_count, architecture="gru", transformer=None): print('training second-level dataset subtype corpus...') xtr, y1, y2, _, list_classes, list_subclasses, _ = load_dataseer_corpus_csv( "data/textClassification/dataseer/all-multilevel.csv") # aggregate by class, we will have one training set per class print(list_classes) print(list_subclasses) print(len(list_classes), "classes") print(len(list_subclasses), "sub-classes") class_weights = None batch_size, maxlen, patience, early_stop, max_epoch = configure( architecture) datatypes_y = {} datatypes_xtr = {} datatypes_list_subclasses = {} for i in range(0, len(xtr)): #print(np.where(y2[i] == 1)) ind1 = np.where(y1[i] == 1)[0][0] ind2 = np.where(y2[i] == 1)[0][0] #print(ind2) datatype = list_classes[ind1] datasubtype = list_subclasses[ind2] #print(str(xtr[i]), datatype, datasubtype) if datatype in datatypes_y: datatypes_y[datatype].append(datasubtype) datatypes_xtr[datatype].append(xtr[i]) if not datasubtype in datatypes_list_subclasses[datatype]: datatypes_list_subclasses[datatype].append(datasubtype) else: datatypes_y[datatype] = [] datatypes_y[datatype].append(datasubtype) datatypes_xtr[datatype] = [] datatypes_xtr[datatype].append(xtr[i]) datatypes_list_subclasses[datatype] = [] datatypes_list_subclasses[datatype].append(datasubtype) print(datatypes_list_subclasses) for the_class in list_classes: print('\ntraining', the_class) if not the_class in datatypes_list_subclasses: print('no subclass for', the_class) continue if len(datatypes_list_subclasses[the_class]) <= 1: print('only one subclass for', the_class) continue if len(datatypes_list_subclasses[the_class] ) == 2 and 'nan' in datatypes_list_subclasses[the_class]: continue if the_class == 'Protein Data': continue print('subtypes to be classified:', datatypes_list_subclasses[the_class]) model_name = 'dataseer-' + the_class + "_" + architecture model = Classifier(model_name, architecture=architecture, list_classes=datatypes_list_subclasses[the_class], max_epoch=max_epoch, fold_number=fold_count, use_roc_auc=True, embeddings_name=embeddings_name, batch_size=batch_size, maxlen=maxlen, patience=patience, early_stop=early_stop, class_weights=class_weights, transformer_name=transformer) # we need to vectorize the y according to the actual list of classes local_y = [] for the_y in datatypes_y[the_class]: the_ind = datatypes_list_subclasses[the_class].index(the_y) local_y.append( vectorizer(the_ind, len(datatypes_list_subclasses[the_class]))) # segment train and eval sets x_train, y_train, x_test, y_test = split_data_and_labels( np.asarray(datatypes_xtr[the_class]), np.asarray(local_y), 0.9) if fold_count == 1: model.train(x_train, y_train) else: model.train_nfold(x_train, y_train) model.eval(x_test, y_test) # saving the model model.save()