import numpy as np

# Helpers such as read_info, get_languages, add_new_symbols_to_vocab,
# write_csv and the various get_data* / _run_language* functions are
# assumed to come from this project's own modules.


def run_languages_cv(args):
    print('------------------- Start -------------------')
    _, token_map, data_split, _, _ = read_info()
    languages = get_languages(is_devoicing=args.is_devoicing)
    token_map = add_new_symbols_to_vocab(token_map)
    print('Train %d, Val %d, Test %d' %
          (len(data_split[0]), len(data_split[1]), len(data_split[2])))

    max_order = 3
    results = [[
        'lang', 'artificial', 'full_avg_len', 'avg_len', 'test_loss',
        'val_loss'
    ]]
    for i, lang in enumerate(languages):
        for artificial in [True, False]:
            print()
            print(i, end=' ')

            full_avg_len, avg_len, test_loss, val_loss = run_artificial_language_cv(
                lang, token_map, args, artificial=artificial,
                max_order=max_order)

            results += [[
                lang, artificial, full_avg_len, avg_len, test_loss, val_loss
            ]]
            write_csv(
                results,
                '%s/artificial__%s__results.csv' % (args.rfolder, args.model))

    write_csv(
        results,
        '%s/artificial__%s__results-final.csv' % (args.rfolder, args.model))

def run_languages(args):
    print('------------------- Start -------------------')
    _, token_map, data_split, concept_ids, ipa_to_concepts = read_info()
    languages = get_languages(is_devoicing=args.is_devoicing)
    token_map = add_new_symbols_to_vocab(token_map)
    print('Train %d, Val %d, Test %d' %
          (len(data_split[0]), len(data_split[1]), len(data_split[2])))

    results = [[
        'lang', 'avg_len', 'shannon', 'test_shannon', 'test_loss',
        'test_acc', 'best_epoch', 'val_loss', 'val_acc'
    ]]
    for i, lang in enumerate(languages):
        for artificial in [True, False]:
            print()
            print('%d. %s %s' %
                  (i, lang, 'artificial' if artificial else 'default'))

            avg_len, shannon, test_shannon, test_loss, \
                test_acc, best_epoch, val_loss, val_acc = run_artificial_language(
                    lang, args.is_devoicing, token_map, concept_ids,
                    ipa_to_concepts, args, artificial=artificial)

            results += [[
                '%s %s' % (lang, 'art' if artificial else 'norm'), avg_len,
                shannon, test_shannon, test_loss, test_acc, best_epoch,
                val_loss, val_acc
            ]]
            write_csv(
                results,
                '%s/artificial__%s__results.csv' % (args.rfolder, args.model))

    write_csv(
        results,
        '%s/artificial__%s__results-final.csv' % (args.rfolder, args.model))

def run_languages(args):
    languages, token_map, data_split, concept_ids, ipa_to_concept = read_info()
    print('Train %d, Val %d, Test %d' %
          (len(data_split[0]), len(data_split[1]), len(data_split[2])))

    results = [[
        'lang', 'avg_len', 'shannon', 'test_shannon', 'test_loss',
        'test_acc', 'val_loss', 'val_acc'
    ]]
    for i, lang in enumerate(languages):
        print()
        print('Lang:', i, end=' ')

        if args.opt:
            avg_len, shannon, test_shannon, test_loss, \
                test_acc, val_loss, val_acc = run_opt_language_cv(
                    lang, token_map, concept_ids, ipa_to_concept, args)
        else:
            avg_len, shannon, test_shannon, test_loss, \
                test_acc, val_loss, val_acc = run_language_cv(
                    lang, token_map, concept_ids, ipa_to_concept, args)

        results += [[
            lang, avg_len, shannon, test_shannon, test_loss, test_acc,
            val_loss, val_acc
        ]]
        write_csv(results, '%s/%s__results.csv' % (args.rfolder, args.model))

    write_csv(results, '%s/%s__results-final.csv' % (args.rfolder, args.model))

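# All of the runners in this file checkpoint their tables through a small
# `write_csv(rows, path)` helper. Its real implementation lives elsewhere in
# the project; below is only a minimal stdlib sketch of what it is assumed
# to do (a list of rows, first row being the header, dumped to `path`).
import csv


def write_csv(rows, path):
    """Write a list of rows (first row = header) to `path` as CSV."""
    with open(path, 'w', newline='', encoding='utf8') as f:
        csv.writer(f).writerows(rows)
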
def run_language_cv(lang, token_map, concept_ids, args):
    global full_results, fold
    nfolds = 10
    avg_test_loss, avg_val_loss = 0, 0

    # Run once on the full split to get the average word lengths; the
    # per-fold runs below only contribute their losses.
    train_loader, val_loader, test_loader = get_data(lang)
    full_avg_len, avg_len, _, _ = _run_language(
        lang, train_loader, val_loader, test_loader, token_map, args)

    for fold in range(nfolds):
        print()
        print('Fold:', fold, end=' ')

        train_loader, val_loader, test_loader = get_data_cv(
            args.ffolder, fold, nfolds, lang, token_map, concept_ids)
        _, _, test_loss, val_loss = _run_language(
            lang, train_loader, val_loader, test_loader, token_map, args)

        full_results += [[
            lang, fold, full_avg_len, avg_len, test_loss, val_loss
        ]]  # + opt_params.tolist()]

        avg_test_loss += test_loss / nfolds
        avg_val_loss += val_loss / nfolds

        write_csv(full_results,
                  '%s/%s__full-results.csv' % (args.rfolder, args.model))

    return full_avg_len, avg_len, avg_test_loss, avg_val_loss

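# The `fold` loops in the *_cv runners assume a deterministic k-fold split,
# so every model sees the same partition for a given fold index. A minimal
# sketch of such a split; `kfold_indices` is a hypothetical helper, not the
# project's actual `get_data_cv` / `get_data_loaders_cv`:
def kfold_indices(n_items, fold, nfolds=10):
    # Item i is held out when (i % nfolds) == fold; the rest is training.
    test_idx = [i for i in range(n_items) if i % nfolds == fold]
    train_idx = [i for i in range(n_items) if i % nfolds != fold]
    return train_idx, test_idx
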
def run_languages_cv(args):
    print('------------------- Start -------------------')
    _, token_map, data_split, concept_ids, ipa_to_concepts = read_info()
    languages = get_languages(is_devoicing=args.is_devoicing)
    token_map = add_new_symbols_to_vocab(token_map)
    print('Train %d, Val %d, Test %d' %
          (len(data_split[0]), len(data_split[1]), len(data_split[2])))

    results = [[
        'lang', 'artificial', 'avg_len', 'shannon', 'test_shannon',
        'test_loss', 'test_acc', 'val_loss', 'val_acc'
    ]]
    for i, lang in enumerate(languages):
        for artificial in [True, False]:
            print()
            print('%d. %s %s' %
                  (i, lang, 'artificial' if artificial else 'default'))

            # Reuse the hyperparameters found by the Bayesian search.
            embedding_size, hidden_size, nlayers, dropout = \
                opt_params.get_artificial_opt_params(
                    args.model, lang, artificial, args.artificial_type,
                    args.data)
            avg_len, shannon, test_shannon, test_loss, \
                test_acc, val_loss, val_acc = run_artificial_language_cv(
                    lang, args.is_devoicing, token_map, concept_ids,
                    ipa_to_concepts, args, artificial=artificial,
                    embedding_size=embedding_size, hidden_size=hidden_size,
                    nlayers=nlayers, dropout=dropout)

            results += [[
                lang, artificial, avg_len, shannon, test_shannon, test_loss,
                test_acc, val_loss, val_acc
            ]]
            write_csv(
                results,
                '%s/artificial__%s__results.csv' % (args.rfolder, args.model))

    write_csv(
        results,
        '%s/artificial__%s__results-final.csv' % (args.rfolder, args.model))

def run_artificial_language_cv(lang, token_map, args, artificial=True,
                               max_order=3):
    global full_results, fold
    nfolds = 10
    avg_test_loss, avg_val_loss = 0, 0

    # Optimise the n-gram hyperparameters once on the full split, then
    # reuse the sampled points (xp, yp) for every fold.
    train_loader, val_loader, test_loader = get_data(
        lang, token_map, args, artificial=artificial)
    full_avg_len, avg_len, _, _, _, xp, yp = _run_language_bayesian(
        lang, train_loader, val_loader, test_loader, token_map, args,
        max_order=max_order)

    for fold in range(nfolds):
        print()
        print('Fold:', fold, end=' ')

        train_loader, val_loader, test_loader = get_data_cv(
            fold, nfolds, lang, token_map, args, artificial=artificial)
        full_avg_len_tmp, avg_len_tmp, test_loss, val_loss, opt_params = \
            _run_language_opt(
                lang, train_loader, val_loader, test_loader, token_map,
                xp, yp, args, max_order=max_order)

        full_results += [[
            lang, artificial, fold, full_avg_len_tmp, avg_len_tmp,
            test_loss, val_loss
        ]]  # + opt_params.tolist()]

        avg_test_loss += test_loss / nfolds
        avg_val_loss += val_loss / nfolds

        write_csv(
            full_results,
            '%s/artificial__%s__full-results.csv' % (args.rfolder, args.model))

    return full_avg_len, avg_len, avg_test_loss, avg_val_loss

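# _run_language_opt above reuses the (xp, yp) samples from the Bayesian
# search. A sketch of the simplest way such samples can be turned into a
# single setting (hypothetical helper; assumes xp is an array of sampled
# hyperparameter vectors and yp the matching losses):
def best_sampled_params(xp, yp):
    # Pick the sampled hyperparameter vector with the lowest observed loss.
    return xp[np.argmin(yp)]
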
def run_languages(args):
    print('------------------- Start -------------------')
    languages, token_map, data_split, concept_ids, _ = read_info()
    print('Train %d, Val %d, Test %d' %
          (len(data_split[0]), len(data_split[1]), len(data_split[2])))

    results = [['lang', 'full_avg_len', 'avg_len', 'test_loss', 'val_loss']]
    for i, lang in enumerate(languages):
        print()
        print('%d Language %s' % (i, lang))

        full_avg_len, avg_len, test_loss, val_loss = run_language_cv(
            lang, token_map, concept_ids, args)

        results += [[lang, full_avg_len, avg_len, test_loss, val_loss]]
        write_csv(results, '%s/unigram.csv' % (args.rfolder))

    write_csv(results, '%s/unigram-final.csv' % (args.rfolder))

def run_artificial_language_cv(lang, is_devoicing, token_map, concept_ids,
                               ipa_to_concepts, args, artificial=True,
                               embedding_size=None, hidden_size=256,
                               nlayers=1, dropout=0.2):
    # embedding_size has no usable default: callers (e.g. run_languages_cv)
    # must pass the value found by hyperparameter optimisation.
    global full_results
    nfolds = 10
    avg_shannon, avg_test_shannon, avg_test_loss, avg_test_acc, \
        avg_val_loss, avg_val_acc = 0, 0, 0, 0, 0, 0

    for fold in range(nfolds):
        print()
        print(fold, end=' ')
        print('Best hyperparams emb-hs: %d, hs: %d, nlayers: %d, drop: %.4f' %
              (embedding_size, hidden_size, nlayers, dropout))

        train_loader, val_loader, test_loader = \
            get_data_loaders_cv(args.ffolder, fold, nfolds, lang,
                                is_devoicing, token_map, args,
                                artificial=artificial)
        avg_len, shannon, test_shannon, test_loss, \
            test_acc, best_epoch, val_loss, val_acc = _run_language(
                '%s %s' % (lang, 'art' if artificial else 'norm'),
                train_loader, val_loader, test_loader, token_map,
                ipa_to_concepts, args, embedding_size=embedding_size,
                hidden_size=hidden_size, nlayers=nlayers, dropout=dropout)

        full_results += [[
            lang, artificial, fold, avg_len, test_shannon, test_loss,
            test_acc, val_loss, val_acc, best_epoch
        ]]

        avg_shannon += shannon / nfolds
        avg_test_shannon += test_shannon / nfolds
        avg_test_loss += test_loss / nfolds
        avg_test_acc += test_acc / nfolds
        avg_val_loss += val_loss / nfolds
        avg_val_acc += val_acc / nfolds

        write_csv(
            full_results,
            '%s/artificial__%s__full-results.csv' % (args.rfolder, args.model))

    # avg_len comes from the last fold only; all other returns are
    # averaged across the folds.
    return avg_len, avg_shannon, avg_test_shannon, avg_test_loss, \
        avg_test_acc, avg_val_loss, avg_val_acc

def run_languages(args):
    print('------------------- Start -------------------')
    languages, token_map, data_split, concept_ids, _ = read_info()
    print('Train %d, Val %d, Test %d' %
          (len(data_split[0]), len(data_split[1]), len(data_split[2])))

    max_order = 3
    # The per-order parameter columns ('param_%d' % i for i in
    # range(max_order)) are left out of the header because the rows below
    # never append them (see the commented-out opt_params).
    results = [['lang', 'full_avg_len', 'avg_len', 'test_loss', 'val_loss']]
    for i, lang in enumerate(languages):
        print()
        print('%d Language %s' % (i, lang))

        full_avg_len, avg_len, test_loss, val_loss = \
            run_language_cv(lang, token_map, concept_ids, args,
                            max_order=max_order)

        results += [[lang, full_avg_len, avg_len, test_loss, val_loss]]
        # + opt_params.tolist()]
        write_csv(results, '%s/ngram.csv' % (args.rfolder))

    write_csv(results, '%s/ngram-final.csv' % (args.rfolder))

def run_language_cv(lang, token_map, concept_ids, ipa_to_concept, args,
                    embedding_size=None, hidden_size=256, nlayers=1,
                    dropout=0.2):
    global full_results, fold
    nfolds = 10
    avg_shannon, avg_test_shannon, avg_test_loss, avg_test_acc, \
        avg_val_loss, avg_val_acc = 0, 0, 0, 0, 0, 0

    for fold in range(nfolds):
        print()
        print('Fold:', fold, end=' ')

        train_loader, val_loader, test_loader = get_data_loaders_cv(
            args.ffolder, fold, nfolds, lang, token_map, concept_ids)
        avg_len, shannon, test_shannon, test_loss, \
            test_acc, best_epoch, val_loss, val_acc = _run_language(
                lang, train_loader, val_loader, test_loader, token_map,
                ipa_to_concept, args, embedding_size=embedding_size,
                hidden_size=hidden_size, nlayers=nlayers, dropout=dropout,
                per_word=True)

        full_results += [[
            lang, fold, avg_len, test_shannon, test_loss, test_acc,
            val_loss, val_acc, best_epoch
        ]]

        avg_shannon += shannon / nfolds
        avg_test_shannon += test_shannon / nfolds
        avg_test_loss += test_loss / nfolds
        avg_test_acc += test_acc / nfolds
        avg_val_loss += val_loss / nfolds
        avg_val_acc += val_acc / nfolds

        write_csv(full_results,
                  '%s/%s__full-results.csv' % (args.rfolder, args.model))

    return avg_len, avg_shannon, avg_test_shannon, avg_test_loss, \
        avg_test_acc, avg_val_loss, avg_val_acc

def optimize_languages(args):
    print('------------------- Start -------------------')
    _, token_map, data_split, concept_ids, ipa_to_concepts = read_info()
    languages = get_languages(is_devoicing=args.is_devoicing)
    token_map = add_new_symbols_to_vocab(token_map)
    print('Model %s' % args.model)
    print('Train %d, Val %d, Test %d' %
          (len(data_split[0]), len(data_split[1]), len(data_split[2])))

    n_iters = 45
    # Search bounds, one row per hyperparameter:
    # embedding_size, hidden_size, nlayers, dropout.
    bounds = np.array([[4, 256], [32, 256], [1, 2.95], [0.0, 0.5]])
    n_pre_samples = 5

    # `results` collects the raw samples from the Bayesian search; xp and
    # yp are assumed to be arrays of sampled hyperparameter vectors and
    # their losses, as returned by bayesian_optimisation.
    results = [['lang', 'artificial', 'sampled_params', 'sampled_losses']]
    opt_results = [[
        'lang', 'artificial', 'avg_len', 'shannon', 'test_shannon',
        'test_loss', 'test_acc', 'best_epoch', 'val_loss', 'val_acc',
        'embedding_size', 'hidden_size', 'nlayers', 'dropout'
    ]]
    for i, lang in enumerate(languages):
        for artificial in [True, False]:
            print()
            print('%d. %s %s' %
                  (i, lang, 'artificial' if artificial else 'default'))

            sample_loss = sample_loss_getter(
                lang, args.is_devoicing, token_map, ipa_to_concepts, args,
                artificial=artificial)
            xp, yp = bayesian_optimisation(n_iters, sample_loss, bounds,
                                           n_pre_samples=n_pre_samples)

            results += [[lang, artificial, xp.tolist(), yp.tolist()]]
            opt_results += [
                get_optimal_loss(lang, args.is_devoicing, artificial,
                                 token_map, concept_ids, ipa_to_concepts,
                                 xp, yp, args)
            ]

            write_csv(
                results,
                '%s/artificial__%s__bayesian-results.csv' %
                (args.rfolder, args.model))
            write_csv(
                opt_results,
                '%s/artificial__%s__opt-results.csv' %
                (args.rfolder, args.model))

    write_csv(
        results,
        '%s/artificial__%s__bayesian-results-final.csv' %
        (args.rfolder, args.model))

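# Every runner above reads the same fields off `args`. A minimal argparse
# sketch covering each attribute referenced in this file; only the attribute
# names are taken from the code above, while the flag spellings and defaults
# are assumptions:
import argparse


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', default='lstm')        # args.model
    parser.add_argument('--data', default=None)           # args.data
    parser.add_argument('--rfolder', default='results')   # where CSVs are written
    parser.add_argument('--ffolder', default='datasets')  # where folds are read
    parser.add_argument('--artificial-type', dest='artificial_type',
                        default=None)                     # args.artificial_type
    parser.add_argument('--is-devoicing', dest='is_devoicing',
                        action='store_true')              # args.is_devoicing
    parser.add_argument('--opt', action='store_true')     # args.opt
    return parser.parse_args()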