def run_languages(args):
    """Evaluate every language returned by ``read_info`` and save the results.

    For each language the cross-validation runner is called
    (``run_opt_language_cv`` when ``args.opt`` is truthy, otherwise
    ``run_language_cv``) and its seven returned values are stored as one row.
    The table is written to ``<rfolder>/<model>__results.csv`` after each
    language and to ``<rfolder>/<model>__results-final.csv`` at the end.

    Args:
        args: Options object. This function reads ``args.opt``,
            ``args.rfolder`` and ``args.model`` and forwards ``args`` to the
            per-language runner.
    """
    languages, token_map, data_split, concept_ids, ipa_to_concept = read_info()
    split_sizes = tuple(len(data_split[k]) for k in range(3))
    print('Train %d, Val %d, Test %d' % split_sizes)

    header = [
        'lang', 'avg_len', 'shannon', 'test_shannon',
        'test_loss', 'test_acc', 'val_loss', 'val_acc',
    ]
    results = [header]

    for idx, lang in enumerate(languages):
        print()
        print('Lang:', idx, end=' ')

        # Both runners receive the same arguments and are unpacked into the
        # same seven values, so only the callable differs.
        runner = run_opt_language_cv if args.opt else run_language_cv
        (avg_len, shannon, test_shannon, test_loss,
         test_acc, val_loss, val_acc) = runner(
            lang, token_map, concept_ids, ipa_to_concept, args)

        results.append([
            lang, avg_len, shannon, test_shannon,
            test_loss, test_acc, val_loss, val_acc,
        ])

        # Checkpoint after every language so partial results survive a crash.
        write_csv(results, '%s/%s__results.csv' % (args.rfolder, args.model))

    write_csv(results, '%s/%s__results-final.csv' % (args.rfolder, args.model))
def run_languages_cv(args):
    """Run ``run_artificial_language_cv`` for each language, artificial and not.

    Languages come from ``get_languages(is_devoicing=args.is_devoicing)``.
    Each (language, artificial) pair yields one row. The table is written to
    ``<rfolder>/artificial__<model>__results.csv`` as rows accumulate and to
    ``...__results-final.csv`` once all pairs are done.

    Args:
        args: Options object. This function reads ``args.is_devoicing``,
            ``args.rfolder`` and ``args.model`` and forwards ``args`` to the
            runner.
    """
    print('------------------- Start -------------------')
    _langs, token_map, data_split, _concept_ids, _ipa_map = read_info()
    languages = get_languages(is_devoicing=args.is_devoicing)
    token_map = add_new_symbols_to_vocab(token_map)
    split_sizes = tuple(len(data_split[k]) for k in range(3))
    print('Train %d, Val %d, Test %d' % split_sizes)

    max_order = 3
    header = [
        'lang', 'artificial', 'full_avg_len', 'avg_len',
        'test_loss', 'val_loss',
    ]
    results = [header]

    for idx, lang in enumerate(languages):
        for artificial in (True, False):
            print()
            print(idx, end=' ')

            full_avg_len, avg_len, test_loss, val_loss = \
                run_artificial_language_cv(
                    lang, token_map, args,
                    artificial=artificial, max_order=max_order)
            results.append([
                lang, artificial, full_avg_len, avg_len, test_loss, val_loss,
            ])

            # Checkpoint the partial table after every run.
            write_csv(
                results,
                '%s/artificial__%s__results.csv' % (args.rfolder, args.model))

    write_csv(
        results,
        '%s/artificial__%s__results-final.csv' % (args.rfolder, args.model))
def run_languages(args):
    """Run ``run_artificial_language`` for each language, artificial and not.

    Languages come from ``get_languages(is_devoicing=args.is_devoicing)``.
    Each (language, artificial) pair yields one row whose first cell is
    ``"<lang> art"`` or ``"<lang> norm"``. The table is written to
    ``<rfolder>/artificial__<model>__results.csv`` as rows accumulate and to
    ``...__results-final.csv`` once all pairs are done.

    The header lists all nine values stored per row. It previously omitted
    ``shannon`` and ``best_epoch``, so every column after ``avg_len`` was
    mislabelled in the written CSV.

    Args:
        args: Options object. This function reads ``args.is_devoicing``,
            ``args.rfolder`` and ``args.model`` and forwards ``args`` to the
            runner.
    """
    print('------------------- Start -------------------')
    _, token_map, data_split, concept_ids, ipa_to_concepts = read_info()
    languages = get_languages(is_devoicing=args.is_devoicing)
    token_map = add_new_symbols_to_vocab(token_map)
    print('Train %d, Val %d, Test %d' %
          (len(data_split[0]), len(data_split[1]), len(data_split[2])))

    # One name per value appended below, in the same order.
    results = [[
        'lang', 'avg_len', 'shannon', 'test_shannon', 'test_loss',
        'test_acc', 'best_epoch', 'val_loss', 'val_acc',
    ]]

    for i, lang in enumerate(languages):
        for artificial in [True, False]:
            print()
            print('%d. %s %s' %
                  (i, lang, 'artificial' if artificial else 'default'))

            (avg_len, shannon, test_shannon, test_loss,
             test_acc, best_epoch, val_loss, val_acc) = run_artificial_language(
                lang, args.is_devoicing, token_map, concept_ids,
                ipa_to_concepts, args, artificial=artificial)

            results.append([
                '%s %s' % (lang, 'art' if artificial else 'norm'),
                avg_len, shannon, test_shannon, test_loss,
                test_acc, best_epoch, val_loss, val_acc,
            ])

            # Checkpoint the partial table after every run.
            write_csv(
                results,
                '%s/artificial__%s__results.csv' % (args.rfolder, args.model))

    write_csv(
        results,
        '%s/artificial__%s__results-final.csv' % (args.rfolder, args.model))
def run_languages_cv(args):
    """Cross-validate each language, artificial and not, with tuned params.

    Languages come from ``get_languages(is_devoicing=args.is_devoicing)``.
    For each (language, artificial) pair the hyper-parameters are looked up
    with ``opt_params.get_artificial_opt_params`` and passed to
    ``run_artificial_language_cv``. One row per pair is stored. The table is
    written to ``<rfolder>/artificial__<model>__results.csv`` as rows
    accumulate and to ``...__results-final.csv`` once all pairs are done.

    The header matches the nine values stored per row. It previously also
    listed ``best_epoch``, which the runner's result is not unpacked into, so
    ``val_loss`` and ``val_acc`` were written under the wrong labels.

    Args:
        args: Options object. This function reads ``args.is_devoicing``,
            ``args.model``, ``args.artificial_type``, ``args.data`` and
            ``args.rfolder`` and forwards ``args`` to the runner.
    """
    print('------------------- Start -------------------')
    _, token_map, data_split, concept_ids, ipa_to_concepts = read_info()
    languages = get_languages(is_devoicing=args.is_devoicing)
    token_map = add_new_symbols_to_vocab(token_map)
    print('Train %d, Val %d, Test %d' %
          (len(data_split[0]), len(data_split[1]), len(data_split[2])))

    # One name per value appended below, in the same order.
    results = [[
        'lang', 'artificial', 'avg_len', 'shannon', 'test_shannon',
        'test_loss', 'test_acc', 'val_loss', 'val_acc',
    ]]

    for i, lang in enumerate(languages):
        for artificial in [True, False]:
            print()
            print('%d. %s %s' %
                  (i, lang, 'artificial' if artificial else 'default'))

            embedding_size, hidden_size, nlayers, dropout = \
                opt_params.get_artificial_opt_params(
                    args.model, lang, artificial,
                    args.artificial_type, args.data)

            (avg_len, shannon, test_shannon, test_loss,
             test_acc, val_loss, val_acc) = run_artificial_language_cv(
                lang, args.is_devoicing, token_map, concept_ids,
                ipa_to_concepts, args, artificial=artificial,
                embedding_size=embedding_size, hidden_size=hidden_size,
                nlayers=nlayers, dropout=dropout)

            results.append([
                lang, artificial, avg_len, shannon, test_shannon,
                test_loss, test_acc, val_loss, val_acc,
            ])

            # Checkpoint the partial table after every run.
            write_csv(
                results,
                '%s/artificial__%s__results.csv' % (args.rfolder, args.model))

    write_csv(
        results,
        '%s/artificial__%s__results-final.csv' % (args.rfolder, args.model))
def optimize_languages(args):
    """Run Bayesian hyper-parameter search per language, artificial and not.

    For each (language, artificial) pair a loss function is built with
    ``sample_loss_getter`` and handed to ``bayesian_optimisation``. The
    search output is passed to ``get_optimal_loss``, whose return value is
    stored as one row of the optimal-results table. Both the sampled-results
    table and the optimal-results table are written after every pair; the
    sampled-results table is written once more to a ``-final`` file at the
    end.

    Args:
        args: Options object. This function reads ``args.is_devoicing``,
            ``args.model`` and ``args.rfolder`` and forwards ``args`` to the
            helpers.
    """
    print('------------------- Start -------------------')
    _langs, token_map, data_split, concept_ids, ipa_to_concepts = read_info()
    languages = get_languages(is_devoicing=args.is_devoicing)
    token_map = add_new_symbols_to_vocab(token_map)
    print('Model %s' % args.model)
    split_sizes = tuple(len(data_split[k]) for k in range(3))
    print('Train %d, Val %d, Test %d' % split_sizes)

    # Search budget and box constraints; the four rows presumably map to
    # embedding_size, hidden_size, nlayers and dropout - TODO confirm.
    n_iters = 45
    bounds = np.array([[4, 256], [32, 256], [1, 2.95], [0.0, 0.5]])
    n_pre_samples = 5

    opt_header = [
        'lang', 'artificial', 'avg_len', 'shannon', 'test_shannon',
        'test_loss', 'test_acc', 'best_epoch', 'val_loss', 'val_acc',
        'embedding_size', 'hidden_size', 'nlayers', 'dropout',
    ]
    opt_results = [opt_header]

    for idx, lang in enumerate(languages):
        for artificial in (True, False):
            print()
            variant = 'artificial' if artificial else 'default'
            print('%d. %s %s' % (idx, lang, variant))

            sample_loss = sample_loss_getter(
                lang, args.is_devoicing, token_map, ipa_to_concepts, args,
                artificial=artificial)
            xp, yp = bayesian_optimisation(
                n_iters, sample_loss, bounds, n_pre_samples=n_pre_samples)

            best_row = get_optimal_loss(
                lang, args.is_devoicing, artificial, token_map, concept_ids,
                ipa_to_concepts, xp, yp, args)
            opt_results.append(best_row)

            # NOTE(review): `results` is not bound in this function; it is
            # presumably a module-level list filled while sampling - verify.
            write_csv(
                results,
                '%s/artificial__%s__baysian-results.csv'
                % (args.rfolder, args.model))
            write_csv(
                opt_results,
                '%s/artificial__%s__opt-results.csv'
                % (args.rfolder, args.model))

    write_csv(
        results,
        '%s/artificial__%s__baysian-results-final.csv'
        % (args.rfolder, args.model))
def get_data_loaders(ffolder, lang, is_devoicing, token_map, args, artificial=True):
    """Call ``_get_data_loaders`` with the data split from ``read_info``.

    Only the third item returned by ``read_info`` (the data split) is used.
    All other arguments are forwarded unchanged, and the helper's result is
    returned as-is.
    """
    _langs, _tokens, data_split, _concept_ids, _ipa_map = read_info()
    loaders = _get_data_loaders(
        data_split, ffolder, lang, is_devoicing, token_map, args,
        artificial=artificial)
    return loaders
def run_languages(args):
    """Run ``run_language_cv`` on every language and save the unigram table.

    One row per language is stored. The table is written to
    ``<rfolder>/unigram.csv`` after each language and to
    ``<rfolder>/unigram-final.csv`` once all languages are done.

    Args:
        args: Options object. This function reads ``args.rfolder`` and
            forwards ``args`` to ``run_language_cv``.
    """
    print('------------------- Start -------------------')
    languages, token_map, data_split, concept_ids, _ipa_map = read_info()
    split_sizes = tuple(len(data_split[k]) for k in range(3))
    print('Train %d, Val %d, Test %d' % split_sizes)

    header = ['lang', 'full_avg_len', 'avg_len', 'test_loss', 'val_loss']
    results = [header]

    for idx, lang in enumerate(languages):
        print()
        print('%d Language %s' % (idx, lang))

        full_avg_len, avg_len, test_loss, val_loss = run_language_cv(
            lang, token_map, concept_ids, args)
        results.append([lang, full_avg_len, avg_len, test_loss, val_loss])

        # Checkpoint after every language so partial results survive a crash.
        write_csv(results, '%s/unigram.csv' % (args.rfolder))

    write_csv(results, '%s/unigram-final.csv' % (args.rfolder))
def run_languages(args):
    """Run ``run_language_cv`` on every language and save the n-gram table.

    One row per language is stored. The table is written to
    ``<rfolder>/ngram.csv`` after each language and to
    ``<rfolder>/ngram-final.csv`` once all languages are done.

    Args:
        args: Options object. This function reads ``args.rfolder`` and
            forwards ``args`` to ``run_language_cv``.
    """
    print('------------------- Start -------------------')
    languages, token_map, data_split, concept_ids, _ipa_map = read_info()
    split_sizes = tuple(len(data_split[k]) for k in range(3))
    print('Train %d, Val %d, Test %d' % split_sizes)

    max_order = 3
    # NOTE(review): the header declares `param_*` columns, but the rows below
    # never fill them (the optimal-parameter values are not appended), so
    # those trailing columns stay empty in the CSV.
    param_columns = ['param_%d' % order for order in range(max_order)]
    header = ['lang', 'full_avg_len', 'avg_len', 'test_loss', 'val_loss']
    results = [header + param_columns]

    for idx, lang in enumerate(languages):
        print()
        print('%d Language %s' % (idx, lang))

        full_avg_len, avg_len, test_loss, val_loss = run_language_cv(
            lang, token_map, concept_ids, args, max_order=max_order)
        results.append([lang, full_avg_len, avg_len, test_loss, val_loss])

        # Checkpoint after every language so partial results survive a crash.
        write_csv(results, '%s/ngram.csv' % (args.rfolder))

    write_csv(results, '%s/ngram-final.csv' % (args.rfolder))
def get_data(lang, token_map, args, artificial=True):
    """Call ``_get_data`` with the data split from ``read_info``.

    Only the third item returned by ``read_info`` (the data split) is used.
    All other arguments are forwarded unchanged, and the helper's result is
    returned as-is.
    """
    _langs, _tokens, data_split, _concept_ids, _ipa_map = read_info()
    data = _get_data(data_split, lang, token_map, args, artificial=artificial)
    return data
def get_data_split_cv(fold, nfolds, verbose=True):
    """Call ``_get_data_split_cv`` on the flattened data split.

    The parts of the data split returned by ``read_info`` are concatenated,
    in order, into a single list that is handed to ``_get_data_split_cv``
    together with ``fold``, ``nfolds`` and ``verbose``. The helper's result
    is returned as-is.
    """
    _langs, _tokens, data_split, _concept_ids, _ipa_map = read_info()

    # Flatten the parts of the split into one list, preserving order.
    concepts = []
    for part in data_split:
        concepts.extend(part)

    return _get_data_split_cv(fold, nfolds, concepts, verbose=verbose)