def main():
    method_name = set_method_name()
    logfile = init_logfile(method_name, opt)

    pretrained = load_pretrained(opt)

    dataset = Dataset.load(dataset_name=opt.dataset, pickle_path=opt.pickle_path).show()
    word2index, out_of_vocabulary, unk_index, pad_index, devel_index, test_index = index_dataset(dataset, pretrained)

    # dataset split tr/val/test
    val_size = min(int(len(devel_index) * .2), 20000)
    train_index, val_index, ytr, yval = train_test_split(
        devel_index, dataset.devel_target, test_size=val_size, random_state=opt.seed, shuffle=True
    )
    yte = dataset.test_target

    vocabsize = len(word2index) + len(out_of_vocabulary)

    pretrained_embeddings, sup_range = embedding_matrix(dataset, pretrained, vocabsize, word2index, out_of_vocabulary, opt)

    model = init_Net(dataset.nC, vocabsize, pretrained_embeddings, sup_range, tocuda=True)
    optim = init_optimizer(model, lr=opt.lr)
    criterion = init_loss(dataset.classification_type)

    # train-validate
    tinit = time()
    create_if_not_exist(opt.checkpoint_dir)
    early_stop = EarlyStopping(model, patience=opt.patience,
                               checkpoint=f'{opt.checkpoint_dir}/{opt.net}-{opt.dataset}')

    for epoch in range(1, opt.nepochs + 1):
        train(model, train_index, ytr, pad_index, tinit, logfile, criterion, optim, epoch, method_name)

        # validation
        macrof1 = test(model, val_index, yval, pad_index, dataset.classification_type, tinit, epoch, logfile, criterion, 'va')
        early_stop(macrof1, epoch)

        if opt.test_each > 0:
            if (opt.plotmode and (epoch == 1 or epoch % opt.test_each == 0)) or \
                    (not opt.plotmode and epoch % opt.test_each == 0 and epoch < opt.nepochs):
                test(model, test_index, yte, pad_index, dataset.classification_type, tinit, epoch, logfile, criterion, 'te')

        if early_stop.STOP:
            print('[early-stop]')
            if not opt.plotmode:  # with plotmode activated, early-stop is ignored
                break

    # restores the best model according to the Mf1 of the validation set (only when plotmode==False)
    stoptime = early_stop.stop_time - tinit
    stopepoch = early_stop.best_epoch
    logfile.add_row(epoch=stopepoch, measure='early-stop', value=early_stop.best_score, timelapse=stoptime)

    if not opt.plotmode:
        print('performing final evaluation')
        model = early_stop.restore_checkpoint()

        if opt.val_epochs > 0:
            print(f'last {opt.val_epochs} epochs on the validation set')
            for val_epoch in range(1, opt.val_epochs + 1):
                train(model, val_index, yval, pad_index, tinit, logfile, criterion, optim, epoch + val_epoch, method_name)

        # test
        print('Training complete: testing')
        test(model, test_index, yte, pad_index, dataset.classification_type, tinit, epoch, logfile, criterion, 'final-te')
def fetch_ohsumed50k(data_path=None, subset='train', train_test_split=0.7):
    _dataname = 'ohsumed50k'
    if data_path is None:
        data_path = join(os.path.expanduser('~'), _dataname)
    create_if_not_exist(data_path)

    pickle_file = join(data_path, _dataname + '.' + subset + str(train_test_split) + '.pickle')
    if not os.path.exists(pickle_file):
        # download and untar the corpus on first use
        DOWNLOAD_URL = 'http://disi.unitn.it/moschitti/corpora/ohsumed-all-docs.tar.gz'
        archive_path = os.path.join(data_path, 'ohsumed-all-docs.tar.gz')
        download_file_if_not_exists(DOWNLOAD_URL, archive_path)
        untardir = 'ohsumed-all'
        if not os.path.exists(os.path.join(data_path, untardir)):
            print('untarring ohsumed...')
            tarfile.open(archive_path, 'r:gz').extractall(data_path)

        # parse the corpus: one directory per category, one file per document;
        # a document listed under several categories receives all of them as labels
        target_names = []
        doc_classes = dict()
        class_docs = dict()
        content = dict()
        doc_ids = set()
        for cat_id in os.listdir(join(data_path, untardir)):
            target_names.append(cat_id)
            class_docs[cat_id] = []
            for doc_id in os.listdir(join(data_path, untardir, cat_id)):
                doc_ids.add(doc_id)
                text_content = open(join(data_path, untardir, cat_id, doc_id), 'r').read()
                if doc_id not in doc_classes:
                    doc_classes[doc_id] = []
                doc_classes[doc_id].append(cat_id)
                if doc_id not in content:
                    content[doc_id] = text_content
                class_docs[cat_id].append(doc_id)
        target_names.sort()
        print('Read %d different documents' % len(doc_ids))

        # category-wise split: each document is assigned to train or test only once
        splitdata = dict({'train': [], 'test': []})
        for cat_id in target_names:
            free_docs = [d for d in class_docs[cat_id]
                         if (d not in splitdata['train'] and d not in splitdata['test'])]
            if len(free_docs) > 0:
                split_point = int(math.floor(len(free_docs) * train_test_split))
                splitdata['train'].extend(free_docs[:split_point])
                splitdata['test'].extend(free_docs[split_point:])

        # pickle both splits so subsequent calls only need to load them
        for split in ['train', 'test']:
            dataset = LabelledDocuments([], [], target_names)
            for doc_id in splitdata[split]:
                dataset.data.append(content[doc_id])
                dataset.target.append([target_names.index(cat_id) for cat_id in doc_classes[doc_id]])
            pickle.dump(dataset,
                        open(join(data_path, _dataname + '.' + split + str(train_test_split) + '.pickle'), 'wb'),
                        protocol=pickle.HIGHEST_PROTOCOL)

    print(pickle_file)
    return pickle.load(open(pickle_file, 'rb'))
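# Usage sketch for the fetcher above: the first call downloads, splits, and pickles the
# corpus; later calls just load the pickle. The attribute names (.data, .target,
# .target_names) are assumed to mirror the LabelledDocuments constructor arguments used above.
#
#   train = fetch_ohsumed50k(subset='train', train_test_split=0.7)
#   test = fetch_ohsumed50k(subset='test', train_test_split=0.7)
#   print(f'{len(train.data)} training and {len(test.data)} test documents, '
#         f'{len(train.target_names)} classes')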
def get_embedding_matrix_path(datasetname, dataset, pretrained, supervised, random, vec_matrix_path):
    matrix_name = f'{datasetname}-pretrained{pretrained}-supervised{supervised}-random{random}.vec'
    matrix_path = f'{vec_matrix_path}/{matrix_name}'
    if not os.path.exists(matrix_path):
        vocabulary, matrix = embedding_matrix(dataset, pretrained, supervised, random)
        create_if_not_exist(vec_matrix_path)
        save_vectors(matrix_path, vocabulary, matrix)
        dims = matrix.shape[1]
    else:
        _, dims = load_metadata_from_vectors_file(matrix_path)
    return matrix_path, dims
def main():
    # init the log-file
    method_name = 'fasttext'
    method_name += '-bigrams' if args.bigrams else '-unigrams'
    method_name += '-glove' if args.pretrained else ''
    method_name += '-rand' if args.pretrained and args.learnable > 0 else ''
    method_name += '-sup' if args.supervised else ''
    logfile = CSVLog(args.log_file,
                     ['dataset', 'method', 'lr', 'learnable', 'nepochs', 'seed', 'measure', 'value', 'timelapse'],
                     autoflush=True)
    logfile.set_default('dataset', args.dataset)
    logfile.set_default('method', method_name)
    logfile.set_default('seed', args.seed)
    logfile.set_default('lr', args.lr)
    logfile.set_default('learnable', args.learnable)
    logfile.set_default('nepochs', args.nepochs)
    assert args.force or not logfile.already_calculated(), \
        f'results for dataset {args.dataset} method {method_name} and run {args.seed} already calculated'

    # load dataset
    dataset = Dataset.load(dataset_name=args.dataset, pickle_path=args.pickle_path)

    matrix_path = None
    if args.pretrained or args.supervised:
        matrix_path, dims = get_embedding_matrix_path(
            args.dataset, dataset, args.pretrained, args.supervised, args.learnable, args.vec_matrix_path
        )

    analyzer = dataset.analyzer()
    devel = [' '.join(analyzer(t)) for t in tqdm(dataset.devel_raw, desc='indexing-devel')]
    test = [' '.join(analyzer(t)) for t in tqdm(dataset.test_raw, desc='indexing-test')]

    # dataset split tr/val/test
    val_size = min(int(len(devel) * .2), 20000)
    train, val, ytr, yva = train_test_split(
        devel, dataset.devel_target, test_size=val_size, random_state=args.seed, shuffle=True
    )
    yte = dataset.test_target
    print(f'tr={len(train)} va={len(val)} test={len(test)} docs')

    create_if_not_exist(args.dataset_dir)
    trainpath = get_input_file(train, ytr)
    loss = 'ova' if dataset.classification_type == 'multilabel' else 'softmax'
    ngrams = 2 if args.bigrams else 1

    # train fastText, optionally initialized with the pretrained/supervised embedding matrix
    tinit = time()
    if matrix_path is None:
        model = train_supervised(
            input=trainpath, epoch=args.nepochs, lr=args.lr, wordNgrams=ngrams,
            verbose=2, minCount=1, loss=loss, dim=args.learnable
        )
    else:
        model = train_supervised(
            input=trainpath, epoch=args.nepochs, lr=args.lr, wordNgrams=ngrams,
            verbose=2, minCount=1, loss=loss, pretrainedVectors=matrix_path, dim=dims
        )
    tend = time() - tinit

    predic_and_eval(model, val, yva, 'va', dataset.classification_type, logfile, tend)
    predic_and_eval(model, test, yte, 'te', dataset.classification_type, logfile, tend)
def dump(self, path):
    create_if_not_exist(os.path.dirname(path))
    self.df.to_csv(path, sep='\t')
from time import time
from data.domain import pack_domains
from data.tasks import WebisCLS10_task_generator
from domain_adaptation.dci import DCI
from domain_adaptation.pivotselection import pivot_selection
import os, sys
from quantification.helpers import *
from util.file import create_if_not_exist
from os.path import join
# explicit imports for the grid-searched SVM used below
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC

dcf = 'cosine'
npivots = 450
dataset_home = '../'
vectors = '../vectors'
create_if_not_exist(vectors)


def __fit_predict(Xtr, ytr, Xte, svm):
    svm.fit(Xtr, ytr)
    return svm.predict(Xte)


def svm_fit_predict(Xs, ys, Xt, nfolds=10):
    print('Xtr=', Xs.shape, ys.mean())
    print('Xte=', Xt.shape)
    parameters = {'C': [10**i for i in range(-5, 5)]}
    # the original snippet is truncated here; cv=nfolds and the final return
    # via __fit_predict are assumed completions
    svm = GridSearchCV(LinearSVC(), parameters, n_jobs=-1, cv=nfolds)
    return __fit_predict(Xs, ys, Xt, svm)
def main(opt):
    method_name = set_method_name()
    logfile = init_logfile(method_name, opt)

    dataset = Dataset.load(dataset_name=opt.dataset, pickle_path=opt.pickle_path).show()
    # dataset.devel_raw = dataset.devel_raw[:100]
    # dataset.devel_target = dataset.devel_target[:100]
    # dataset.devel_labelmatrix = dataset.devel_labelmatrix[:100]

    # tokenize and truncate to max_length
    bert = Token2BertEmbeddings('bert-base-uncased', max_length=opt.max_length, device=opt.device)
    tokenize_and_truncate(dataset, bert.tokenizer, opt.max_length)

    # dataset split tr/val/test
    (train_docs, ytr), (val_docs, yval), (test_docs, yte) = train_val_test(dataset)

    wce = None
    if opt.supervised:
        WCE, WCE_range, WCE_vocab = embedding_matrix(opt, dataset)
        wce = Token2WCEmbeddings(
            WCE, WCE_range, WCE_vocab, drop_embedding_prop=opt.sup_drop, device=opt.device, max_length=opt.max_length
        )

    model = init_Net(dataset.nC, bert, wce, opt.device)
    optim = init_optimizer(model, lr=opt.lr, weight_decay=opt.weight_decay)
    criterion = init_loss(dataset.classification_type)

    # train-validate
    tinit = time()
    create_if_not_exist(opt.checkpoint_dir)
    early_stop = EarlyStopping(model, patience=opt.patience,
                               checkpoint=f'{opt.checkpoint_dir}/{opt.net}-{opt.dataset}' if not opt.plotmode else None)
    train_batcher = Batcher(opt.batch_size, opt.max_epoch_length)

    for epoch in range(1, opt.nepochs + 1):
        train(model, train_docs, ytr, tinit, logfile, criterion, optim, epoch, method_name, train_batcher)

        # validation
        macrof1 = test(model, val_docs, yval, dataset.classification_type, tinit, epoch, logfile, criterion, 'va')
        early_stop(macrof1, epoch)

        if opt.test_each > 0:
            if (opt.plotmode and (epoch == 1 or epoch % opt.test_each == 0)) or \
                    (not opt.plotmode and epoch % opt.test_each == 0 and epoch < opt.nepochs):
                test(model, test_docs, yte, dataset.classification_type, tinit, epoch, logfile, criterion, 'te')

        if early_stop.STOP:
            print('[early-stop]')
            if not opt.plotmode:  # with plotmode activated, early-stop is ignored
                break

    # restores the best model according to the Mf1 of the validation set (only when plotmode==False)
    stoptime = early_stop.stop_time - tinit
    stopepoch = early_stop.best_epoch
    logfile.add_row(epoch=stopepoch, measure='early-stop', value=early_stop.best_score, timelapse=stoptime)

    if not opt.plotmode:
        print('performing final evaluation')
        model = early_stop.restore_checkpoint()

        if opt.val_epochs > 0:
            print(f'last {opt.val_epochs} epochs on the validation set')
            for val_epoch in range(1, opt.val_epochs + 1):
                train(model, val_docs, yval, tinit, logfile, criterion, optim, epoch + val_epoch, method_name)

        # test
        print('Training complete: testing')
        test(model, test_docs, yte, dataset.classification_type, tinit, epoch, logfile, criterion, 'final-te')