def train(embeddings_name, fold_count, architecture="gru", transformer=None):
    batch_size, maxlen, patience, early_stop, max_epoch = configure(architecture)

    model = Classifier('citations_' + architecture, architecture=architecture, list_classes=list_classes,
                       max_epoch=max_epoch, fold_number=fold_count, use_roc_auc=True,
                       embeddings_name=embeddings_name, batch_size=batch_size, maxlen=maxlen,
                       patience=patience, early_stop=early_stop, class_weights=class_weights,
                       transformer_name=transformer)

    print('loading citation sentiment corpus...')
    xtr, y = load_citation_sentiment_corpus("data/textClassification/citations/citation_sentiment_corpus.txt")

    if fold_count == 1:
        model.train(xtr, y)
    else:
        model.train_nfold(xtr, y)

    # saving the model
    model.save()
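# configure() is called by the newer snippets in this section but never defined
# here. Below is a minimal sketch of a plausible implementation, based only on
# how its five return values are used above; the concrete numbers are
# assumptions, not the library's actual defaults.
def configure(architecture):
    batch_size = 256      # assumed default for RNN-style models
    maxlen = 300
    patience = 5
    early_stop = True
    max_epoch = 60
    if "bert" in architecture.lower():
        # transformer fine-tuning usually wants a smaller batch, shorter
        # sequences and far fewer epochs (assumed values)
        batch_size = 32
        maxlen = 100
        early_stop = False
        max_epoch = 3
    return batch_size, maxlen, patience, early_stop, max_epoch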
def train(embeddings_name, fold_count, architecture="gru", transformer=None):
    print('loading binary software use dataset...')
    xtr, y = load_software_use_corpus_json("data/textClassification/software/software-use.json.gz")

    model_name = 'software_use_' + architecture
    class_weights = None

    batch_size, maxlen, patience, early_stop, max_epoch = configure(architecture)

    model = Classifier(model_name, architecture=architecture, list_classes=list_classes,
                       max_epoch=max_epoch, fold_number=fold_count, patience=patience,
                       use_roc_auc=True, embeddings_name=embeddings_name, batch_size=batch_size,
                       maxlen=maxlen, early_stop=early_stop, class_weights=class_weights,
                       transformer_name=transformer)

    if fold_count == 1:
        model.train(xtr, y)
    else:
        model.train_nfold(xtr, y)

    # saving the model
    model.save()
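# load_software_use_corpus_json() is only called, never shown. A rough sketch
# under the assumption that the gzipped file holds a JSON array of records with
# a "text" field and a boolean "used" flag; the field names and file layout are
# guesses, not the actual corpus schema. The label ordering puts "used" at
# index 1, matching the counting loop in train_and_eval() further below.
import gzip
import json
import numpy as np

def load_software_use_corpus_json(path):
    with gzip.open(path, 'rt', encoding='utf-8') as f:
        records = json.load(f)
    texts = [rec["text"] for rec in records]
    # one-hot over the two classes [not_used, used] (assumed ordering)
    labels = [[0.0, 1.0] if rec["used"] else [1.0, 0.0] for rec in records]
    return np.asarray(texts), np.asarray(labels)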
def train(embeddings_name, fold_count, use_ELMo=False, use_BERT=False, architecture="gru"):
    batch_size, maxlen = configure(architecture, use_BERT, use_ELMo)

    model = Classifier('citations', model_type=architecture, list_classes=list_classes,
                       max_epoch=100, fold_number=fold_count, patience=10, use_roc_auc=True,
                       embeddings_name=embeddings_name, use_ELMo=use_ELMo, use_BERT=use_BERT,
                       batch_size=batch_size, maxlen=maxlen, class_weights=class_weights)

    print('loading citation sentiment corpus...')
    xtr, y = load_citation_sentiment_corpus("data/textClassification/citations/citation_sentiment_corpus.txt")

    if fold_count == 1:
        model.train(xtr, y)
    else:
        model.train_nfold(xtr, y)

    # saving the model
    model.save()
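# The legacy snippets call a three-argument configure() that returns only
# (batch_size, maxlen). A sketch consistent with the hard-coded values visible
# in the other legacy functions of this section; treat it as a reconstruction,
# not the original helper.
def configure(architecture, use_BERT=False, use_ELMo=False):
    batch_size = 256
    maxlen = 300
    if use_ELMo:
        batch_size = 20    # contextual embeddings are memory-hungry
    elif use_BERT:
        batch_size = 50
    # default bert model parameters
    if architecture.lower().find("bert") != -1:
        batch_size = 32
        maxlen = 100
    return batch_size, maxlen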
def train_and_eval_primary(embeddings_name, fold_count, use_ELMo=False, use_BERT=False, architecture="gru"):
    print('loading dataset type corpus...')
    xtr, y, _, _, list_classes, _, _ = load_dataseer_corpus_csv("data/textClassification/dataseer/all-1.csv")

    # distinct values of classes
    print(list_classes)
    print(len(list_classes), "classes")
    print(len(xtr), "texts")
    print(len(y), "classes")

    class_weights = None
    batch_size = 256
    maxlen = 300
    if use_ELMo:
        batch_size = 20
    elif use_BERT:
        batch_size = 50

    # default bert model parameters
    if architecture.find("bert") != -1:
        batch_size = 32
        maxlen = 100

    model = Classifier('dataseer', model_type=architecture, list_classes=list_classes,
                       max_epoch=100, fold_number=fold_count, patience=10, use_roc_auc=True,
                       embeddings_name=embeddings_name, use_ELMo=use_ELMo, use_BERT=use_BERT,
                       batch_size=batch_size, maxlen=maxlen, class_weights=class_weights)

    # segment train and eval sets
    x_train, y_train, x_test, y_test = split_data_and_labels(xtr, y, 0.9)

    print(len(x_train), "train texts")
    print(len(y_train), "train classes")
    print(len(x_test), "eval texts")
    print(len(y_test), "eval classes")

    if fold_count == 1:
        model.train(x_train, y_train)
    else:
        model.train_nfold(x_train, y_train)

    model.eval(x_test, y_test)

    # saving the model
    model.save()
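# split_data_and_labels() provides the holdout split used by all the
# train_and_eval variants. A minimal sketch assuming numpy-array inputs and
# that the third argument is the train fraction; whether the original shuffles
# before splitting is an assumption.
import numpy as np

def split_data_and_labels(x, y, fraction_train):
    assert len(x) == len(y)
    indices = np.arange(len(x))
    np.random.shuffle(indices)
    cutoff = int(len(x) * fraction_train)
    train_idx, test_idx = indices[:cutoff], indices[cutoff:]
    return x[train_idx], y[train_idx], x[test_idx], y[test_idx]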
def train(embeddings_name, fold_count, use_ELMo=False):
    batch_size = 256
    if use_ELMo:
        batch_size = 20

    model = Classifier('citations', "gru", list_classes=list_classes, max_epoch=70,
                       fold_number=fold_count, use_roc_auc=True, embeddings_name=embeddings_name,
                       use_ELMo=use_ELMo, batch_size=batch_size, class_weights=class_weights)

    print('loading citation sentiment corpus...')
    xtr, y = load_citation_sentiment_corpus("data/textClassification/citations/citation_sentiment_corpus.txt")

    if fold_count == 1:
        model.train(xtr, y)
    else:
        model.train_nfold(xtr, y)

    # saving the model
    model.save()
def train_and_eval(embeddings_name, fold_count, use_ELMo=False):
    batch_size = 256
    if use_ELMo:
        batch_size = 20

    model = Classifier('citations', "gru", list_classes=list_classes, max_epoch=70,
                       fold_number=fold_count, use_roc_auc=True, embeddings_name=embeddings_name,
                       use_ELMo=use_ELMo, batch_size=batch_size, class_weights=class_weights)

    print('loading citation sentiment corpus...')
    xtr, y = load_citation_sentiment_corpus("data/textClassification/citations/citation_sentiment_corpus.txt")

    # segment train and eval sets
    x_train, y_train, x_test, y_test = split_data_and_labels(xtr, y, 0.9)

    if fold_count == 1:
        model.train(x_train, y_train)
    else:
        model.train_nfold(x_train, y_train)

    model.eval(x_test, y_test)

    # saving the model
    model.save()
def train(embeddings_name=None, fold_count=1, architecture="gru", transformer=None):
    batch_size, maxlen, patience, early_stop, max_epoch = configure(architecture)

    model = Classifier('toxic_' + architecture, architecture, list_classes=list_classes,
                       max_epoch=max_epoch, fold_number=fold_count, class_weights=class_weights,
                       embeddings_name=embeddings_name, batch_size=batch_size, maxlen=maxlen,
                       patience=patience, early_stop=early_stop, transformer_name=transformer)

    print('loading train dataset...')
    xtr, y = load_texts_and_classes_pandas("data/textClassification/toxic/train.csv")

    if fold_count == 1:
        model.train(xtr, y)
    else:
        model.train_nfold(xtr, y)

    # saving the model
    model.save()
def train_and_eval(embeddings_name, fold_count, use_ELMo=False, use_BERT=False, architecture="gru"):
    print('loading binary software use dataset...')
    xtr, y = load_software_use_corpus_json("data/textClassification/software/software-use.json.gz")

    nb_used = 0
    for the_class in y:
        if the_class[1] == 1.0:
            nb_used += 1
    nb_unused = len(y) - nb_used
    print("\ttotal:", len(y))
    print("\tused:", nb_used)
    print("\tnot used:", nb_unused)

    model_name = 'software_use'
    class_weights = None

    if use_ELMo:
        model_name += '-with_ELMo'
    elif use_BERT:
        model_name += '-with_BERT'

    # segment train and eval sets
    x_train, y_train, x_test, y_test = split_data_and_labels(xtr, y, 0.9)

    batch_size, maxlen = configure(architecture, use_BERT, use_ELMo)

    print(list_classes)

    model = Classifier(model_name, model_type=architecture, list_classes=list_classes,
                       max_epoch=100, fold_number=fold_count, patience=10, use_roc_auc=True,
                       embeddings_name=embeddings_name, use_ELMo=use_ELMo, use_BERT=use_BERT,
                       batch_size=batch_size, maxlen=maxlen, class_weights=class_weights)

    if fold_count == 1:
        model.train(x_train, y_train)
    else:
        model.train_nfold(x_train, y_train)

    model.eval(x_test, y_test)

    # saving the model
    model.save()
def train_and_eval_reuse(embeddings_name, fold_count, architecture="gru", transformer=None):
    print('loading dataset type corpus...')
    xtr, y, _, _, list_classes, _, _ = load_dataseer_corpus_csv("data/textClassification/dataseer/all-reuse.csv")

    # distinct values of classes
    print(list_classes)
    print(len(list_classes), "classes")
    print(len(xtr), "texts")
    print(len(y), "classes")

    batch_size, maxlen, patience, early_stop, max_epoch = configure(architecture)

    class_weights = {0: 1.5, 1: 1.}

    model = Classifier('dataseer-reuse_' + architecture, architecture=architecture,
                       list_classes=list_classes, max_epoch=max_epoch, fold_number=fold_count,
                       use_roc_auc=True, embeddings_name=embeddings_name, batch_size=batch_size,
                       maxlen=maxlen, patience=patience, early_stop=early_stop,
                       class_weights=class_weights, transformer_name=transformer)

    # segment train and eval sets
    x_train, y_train, x_test, y_test = split_data_and_labels(xtr, y, 0.9)

    print(len(x_train), "train texts")
    print(len(y_train), "train classes")
    print(len(x_test), "eval texts")
    print(len(y_test), "eval classes")

    if fold_count == 1:
        model.train(x_train, y_train)
    else:
        model.train_nfold(x_train, y_train)

    model.eval(x_test, y_test)

    # saving the model
    model.save()
def train(embeddings_name="fasttext-crawl", fold_count=1, use_ELMo=False, use_BERT=False, architecture="gru"): batch_size = 256 maxlen = 300 model = Classifier('toxic', architecture, list_classes=list_classes, max_epoch=30, fold_number=fold_count, class_weights=class_weights, embeddings_name=embeddings_name, use_ELMo=use_ELMo, use_BERT=use_BERT, batch_size=batch_size, maxlen=maxlen) print('loading train dataset...') xtr, y = load_texts_and_classes_pandas("data/textClassification/toxic/train.csv") if fold_count == 1: model.train(xtr, y) else: model.train_nfold(xtr, y) # saving the model model.save()
def train_and_eval(embeddings_name, fold_count, architecture="gru", transformer=None):
    print('loading binary software use dataset...')
    xtr, y = load_software_use_corpus_json("data/textClassification/software/software-use.json.gz")

    nb_used = 0
    for the_class in y:
        if the_class[1] == 1.0:
            nb_used += 1
    nb_unused = len(y) - nb_used
    print("\ttotal:", len(y))
    print("\tused:", nb_used)
    print("\tnot used:", nb_unused)

    model_name = 'software_use_' + architecture
    class_weights = None

    # segment train and eval sets
    x_train, y_train, x_test, y_test = split_data_and_labels(xtr, y, 0.9)

    batch_size, maxlen, patience, early_stop, max_epoch = configure(architecture)

    print(list_classes)

    model = Classifier(model_name, architecture=architecture, list_classes=list_classes,
                       max_epoch=max_epoch, fold_number=fold_count, patience=patience,
                       use_roc_auc=True, embeddings_name=embeddings_name, batch_size=batch_size,
                       maxlen=maxlen, early_stop=early_stop, class_weights=class_weights,
                       transformer_name=transformer)

    if fold_count == 1:
        model.train(x_train, y_train)
    else:
        model.train_nfold(x_train, y_train)

    model.eval(x_test, y_test)

    # saving the model
    model.save()
def train(embeddings_name, fold_count):
    model = Classifier('toxic', "gru", list_classes=list_classes, max_epoch=30,
                       fold_number=fold_count, embeddings_name=embeddings_name)

    print('loading train dataset...')
    xtr, y = load_texts_and_classes_pandas("data/textClassification/toxic/train.csv")

    if fold_count == 1:
        model.train(xtr, y)
    else:
        model.train_nfold(xtr, y)

    # saving the model
    model.save()
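# load_texts_and_classes_pandas() is not included in this section. A sketch
# assuming the Jigsaw-style train.csv layout (an id column, a comment_text
# column, then one binary column per toxicity class); the column names are
# assumptions about the data file, not taken from the code above.
import pandas as pd

def load_texts_and_classes_pandas(path):
    df = pd.read_csv(path)
    texts = df["comment_text"].values
    # every column after id and comment_text is a binary class label
    labels = df.iloc[:, 2:].values
    return texts, labels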
def train_binary(embeddings_name, fold_count, architecture="gru", transformer=None):
    print('loading multiclass software context dataset...')
    x_train, y_train = load_software_context_corpus_json("data/textClassification/software/software-contexts.json.gz")

    report_training_contexts(y_train)

    for class_rank in range(len(list_classes)):
        model_name = 'software_context_' + list_classes[class_rank] + '_' + architecture
        class_weights = None

        batch_size, maxlen, patience, early_stop, max_epoch = configure(architecture)

        # collapse the multi-label y into a binary problem for this class
        y_train_class_rank = [[1, 0] if y[class_rank] == 1.0 else [0, 1] for y in y_train]
        y_train_class_rank = np.array(y_train_class_rank)

        list_classes_rank = [list_classes[class_rank], "not_" + list_classes[class_rank]]

        model = Classifier(model_name, architecture=architecture, list_classes=list_classes_rank,
                           max_epoch=max_epoch, fold_number=fold_count, patience=patience,
                           use_roc_auc=True, embeddings_name=embeddings_name, batch_size=batch_size,
                           maxlen=maxlen, early_stop=early_stop, class_weights=class_weights,
                           transformer_name=transformer)

        if fold_count == 1:
            model.train(x_train, y_train_class_rank)
        else:
            model.train_nfold(x_train, y_train_class_rank)

        # saving the model
        model.save()
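# report_training_contexts() only appears as a call. A plausible sketch that
# prints the number of positive examples per context class, mirroring the ad
# hoc counting done inline in the software-use snippets; like them, it reads
# the module-level list_classes. This is purely an assumed reconstruction.
def report_training_contexts(y):
    print("\ttotal:", len(y))
    for class_rank, class_name in enumerate(list_classes):
        nb_positive = sum(1 for the_y in y if the_y[class_rank] == 1.0)
        print("\t%s:" % class_name, nb_positive)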
def train(embeddings_name, fold_count):
    model = Classifier('citations', "gru", list_classes=list_classes, max_epoch=70,
                       fold_number=fold_count, use_roc_auc=True, embeddings_name=embeddings_name)

    print('loading citation sentiment corpus...')
    xtr, y = load_citation_sentiment_corpus("data/textClassification/citations/citation_sentiment_corpus.txt")

    if fold_count == 1:
        model.train(xtr, y)
    else:
        model.train_nfold(xtr, y)

    # saving the model
    model.save()
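# load_citation_sentiment_corpus() is used by every citations snippet but not
# defined here. A sketch assuming a tab-separated citation sentiment corpus
# layout (two paper ids, a polarity code o/p/n, then the citation text); the
# exact column layout and the one-hot class order are assumptions.
import numpy as np

def load_citation_sentiment_corpus(path):
    texts, labels = [], []
    polarity_index = {'n': 0, 'o': 1, 'p': 2}  # assumed order: negative, neutral, positive
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            pieces = line.split('\t')
            polarity, text = pieces[2], pieces[3]
            one_hot = [0.0, 0.0, 0.0]
            one_hot[polarity_index[polarity]] = 1.0
            texts.append(text)
            labels.append(one_hot)
    return np.asarray(texts), np.asarray(labels)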
def train_and_eval(embeddings_name, fold_count, architecture="gru", transformer=None):
    print('loading multiclass software context dataset...')
    xtr, y = load_software_context_corpus_json("data/textClassification/software/software-contexts.json.gz")

    report_training_contexts(y)

    model_name = 'software_context_' + architecture
    class_weights = None

    # segment train and eval sets
    x_train, y_train, x_test, y_test = split_data_and_labels(xtr, y, 0.9)

    batch_size, maxlen, patience, early_stop, max_epoch = configure(architecture)

    print(list_classes)

    model = Classifier(model_name, architecture=architecture, list_classes=list_classes,
                       max_epoch=max_epoch, fold_number=fold_count, patience=patience,
                       use_roc_auc=True, embeddings_name=embeddings_name, batch_size=batch_size,
                       maxlen=maxlen, early_stop=early_stop, class_weights=class_weights,
                       transformer_name=transformer)

    if fold_count == 1:
        model.train(x_train, y_train)
    else:
        model.train_nfold(x_train, y_train)

    model.eval(x_test, y_test)

    # saving the model
    model.save()
def train(embeddings_name, fold_count, use_ELMo=False, use_BERT=False, architecture="gru"): print('loading binary software use dataset...') xtr, y = load_software_use_corpus_json( "data/textClassification/software/software-use.json.gz") model_name = 'software_use' class_weights = None if use_ELMo: model_name += '-with_ELMo' elif use_BERT: model_name += '-with_BERT' batch_size, maxlen = configure(architecture, use_BERT, use_ELMo) model = Classifier(model_name, model_type=architecture, list_classes=list_classes, max_epoch=100, fold_number=fold_count, patience=10, use_roc_auc=True, embeddings_name=embeddings_name, use_ELMo=use_ELMo, use_BERT=use_BERT, batch_size=batch_size, maxlen=maxlen, class_weights=class_weights) if fold_count == 1: model.train(xtr, y) else: model.train_nfold(xtr, y) # saving the model model.save()
def train_and_eval_secondary(embeddings_name, fold_count, architecture="gru", transformer=None):
    print('loading second-level dataset subtype corpus...')
    xtr, y1, y2, _, list_classes, list_subclasses, _ = load_dataseer_corpus_csv(
        "data/textClassification/dataseer/all-multilevel.csv")
    # aggregate by class, we will have one training set per class

    print(list_classes)
    print(list_subclasses)
    print(len(list_classes), "classes")
    print(len(list_subclasses), "sub-classes")

    class_weights = None
    batch_size, maxlen, patience, early_stop, max_epoch = configure(architecture)

    datatypes_y = {}
    datatypes_xtr = {}
    datatypes_list_subclasses = {}

    for i in range(0, len(xtr)):
        #print(np.where(y2[i] == 1))
        ind1 = np.where(y1[i] == 1)[0][0]
        ind2 = np.where(y2[i] == 1)[0][0]
        #print(ind2)
        datatype = list_classes[ind1]
        datasubtype = list_subclasses[ind2]
        #print(str(xtr[i]), datatype, datasubtype)
        if datatype in datatypes_y:
            datatypes_y[datatype].append(datasubtype)
            datatypes_xtr[datatype].append(xtr[i])
            if datasubtype not in datatypes_list_subclasses[datatype]:
                datatypes_list_subclasses[datatype].append(datasubtype)
        else:
            datatypes_y[datatype] = [datasubtype]
            datatypes_xtr[datatype] = [xtr[i]]
            datatypes_list_subclasses[datatype] = [datasubtype]

    print(datatypes_list_subclasses)

    for the_class in list_classes:
        print('\ntraining', the_class)
        if the_class not in datatypes_list_subclasses:
            print('no subclass for', the_class)
            continue
        if len(datatypes_list_subclasses[the_class]) <= 1:
            print('only one subclass for', the_class)
            continue
        if len(datatypes_list_subclasses[the_class]) == 2 and 'nan' in datatypes_list_subclasses[the_class]:
            continue
        if the_class == 'Protein Data':
            continue

        print('subtypes to be classified:', datatypes_list_subclasses[the_class])

        model_name = 'dataseer-' + the_class + "_" + architecture

        model = Classifier(model_name, architecture=architecture,
                           list_classes=datatypes_list_subclasses[the_class],
                           max_epoch=max_epoch, fold_number=fold_count, use_roc_auc=True,
                           embeddings_name=embeddings_name, batch_size=batch_size, maxlen=maxlen,
                           patience=patience, early_stop=early_stop, class_weights=class_weights,
                           transformer_name=transformer)

        # we need to vectorize the y according to the actual list of classes
        local_y = []
        for the_y in datatypes_y[the_class]:
            the_ind = datatypes_list_subclasses[the_class].index(the_y)
            local_y.append(vectorizer(the_ind, len(datatypes_list_subclasses[the_class])))

        # segment train and eval sets
        x_train, y_train, x_test, y_test = split_data_and_labels(
            np.asarray(datatypes_xtr[the_class]), np.asarray(local_y), 0.9)

        if fold_count == 1:
            model.train(x_train, y_train)
        else:
            model.train_nfold(x_train, y_train)

        model.eval(x_test, y_test)

        # saving the model
        model.save()
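# vectorizer() above maps a class index to a one-hot vector. Its behavior is
# fully determined by the call site, so a minimal version would be:
def vectorizer(index, size):
    one_hot = [0.0] * size
    one_hot[index] = 1.0
    return one_hot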
def train(embeddings_name, fold_count, use_ELMo=False, use_BERT=False, architecture="gru", cascaded=False): print('loading binary dataset type corpus...') xtr, y, _, _, list_classes, _, _ = load_dataseer_corpus_csv( "data/textClassification/dataseer/all-binary.csv") model_name = 'dataseer-binary' class_weights = None batch_size = 256 maxlen = 300 if use_ELMo: batch_size = 20 model_name += '-with_ELMo' elif use_BERT: batch_size = 50 model_name += '-with_BERT' # default bert model parameters if architecture.lower().find("bert") != -1: batch_size = 32 maxlen = 100 model = Classifier(model_name, model_type=architecture, list_classes=list_classes, max_epoch=100, fold_number=fold_count, patience=10, use_roc_auc=True, embeddings_name=embeddings_name, use_ELMo=use_ELMo, use_BERT=use_BERT, batch_size=batch_size, maxlen=maxlen, class_weights=class_weights) if fold_count == 1: model.train(xtr, y) else: model.train_nfold(xtr, y) # saving the model model.save() print('loading reuse dataset type corpus...') xtr, y, _, _, list_classes, _, _ = load_dataseer_corpus_csv( "data/textClassification/dataseer/all-reuse.csv") model_name = 'dataseer-reuse' class_weights = None batch_size = 256 maxlen = 300 if use_ELMo: batch_size = 20 model_name += '-with_ELMo' elif use_BERT: batch_size = 50 model_name += '-with_BERT' # default bert model parameters if architecture.lower().find("bert") != -1: batch_size = 32 maxlen = 100 model = Classifier(model_name, model_type=architecture, list_classes=list_classes, max_epoch=100, fold_number=fold_count, patience=10, use_roc_auc=True, embeddings_name=embeddings_name, use_ELMo=use_ELMo, use_BERT=use_BERT, batch_size=batch_size, maxlen=maxlen, class_weights=class_weights) if fold_count == 1: model.train(xtr, y) else: model.train_nfold(xtr, y) # saving the model model.save() print('loading first-level dataset type corpus...') xtr, y, _, _, list_classes, _, _ = load_dataseer_corpus_csv( "data/textClassification/dataseer/all-1.csv") model_name = 'dataseer-first' if use_ELMo: model_name += '-with_ELMo' elif use_BERT: model_name += '-with_BERT' model = Classifier(model_name, model_type=architecture, list_classes=list_classes, max_epoch=100, fold_number=fold_count, patience=10, use_roc_auc=True, embeddings_name=embeddings_name, use_ELMo=use_ELMo, use_BERT=use_BERT, batch_size=batch_size, maxlen=maxlen, class_weights=class_weights) if fold_count == 1: model.train(xtr, y) else: model.train_nfold(xtr, y) # saving the model model.save() '''
def train(embeddings_name, fold_count, architecture="gru", transformer=None, cascaded=False): print('loading binary dataset type corpus...') xtr, y, _, _, list_classes, _, _ = load_dataseer_corpus_csv( "data/textClassification/dataseer/all-binary.csv") model_name = 'dataseer-binary_' + architecture class_weights = None batch_size, maxlen, patience, early_stop, max_epoch = configure( architecture) model = Classifier(model_name, architecture=architecture, list_classes=list_classes, max_epoch=max_epoch, fold_number=fold_count, use_roc_auc=True, embeddings_name=embeddings_name, batch_size=batch_size, maxlen=maxlen, patience=patience, early_stop=early_stop, class_weights=class_weights, transformer_name=transformer) if fold_count == 1: model.train(xtr, y) else: model.train_nfold(xtr, y) # saving the model model.save() print('loading reuse dataset type corpus...') xtr, y, _, _, list_classes, _, _ = load_dataseer_corpus_csv( "data/textClassification/dataseer/all-reuse.csv") model_name = 'dataseer-reuse_' + architecture class_weights = {0: 1.5, 1: 1.} model = Classifier(model_name, architecture=architecture, list_classes=list_classes, max_epoch=max_epoch, fold_number=fold_count, use_roc_auc=True, embeddings_name=embeddings_name, batch_size=batch_size, maxlen=maxlen, patience=patience, early_stop=early_stop, class_weights=class_weights, transformer_name=transformer) if fold_count == 1: model.train(xtr, y) else: model.train_nfold(xtr, y) # saving the model model.save() print('loading first-level dataset type corpus...') xtr, y, _, _, list_classes, _, _ = load_dataseer_corpus_csv( "data/textClassification/dataseer/all-multilevel.csv") model_name = 'dataseer-first_' + architecture class_weights = None model = Classifier(model_name, architecture=architecture, list_classes=list_classes, max_epoch=max_epoch, fold_number=fold_count, use_roc_auc=True, embeddings_name=embeddings_name, batch_size=batch_size, maxlen=maxlen, patience=patience, early_stop=early_stop, class_weights=class_weights, transformer_name=transformer) if fold_count == 1: model.train(xtr, y) else: model.train_nfold(xtr, y) # saving the model model.save() '''