def sgd_1(name = "sgd-1", data_sets = ("metis", "presemt-dev",), lang=None, n_graphs=None, n_jobs=1): remove_exp_dir(name) descriptor = [ ("data", "S16"), ("source", "S8", "source_lang"), ("target", "S8", "target_lang"), ("class_weighting", "b"), ("nist", "f", "scores.NIST"), ("bleu", "f", "scores.BLEU"), ("correct", "i", "accuracy.correct"), ("incorrect", "i", "accuracy.incorrect"), ("ignored", "i", "accuracy.ignored"), ("accuracy", "f", "accuracy.score"), ("exp_name", "S128"), ("models_fname", "S256"), ] result_store = ResultsStore(descriptor, fname_prefix = "_" + name) # best setting found in sgd-cv exps classifier = SGDClassifier(loss = "log", penalty = "l2", alpha = 0.001, n_iter = 5, shuffle=True, n_jobs=n_jobs) # 'data' cannot be expanded implicitly through grid search # because _lang expansion depends on its value :-( for data in data_sets: exps = ex.single_exp( name=name, classifier=classifier, data=data, _lang=lang or config["eval"][data].keys(), write_text=ex.SKIP, draw_graphs=ex.SKIP, n_graphs=n_graphs, # *** input to SGDClassifier must be shuffled! *** shuffle=True, _class_weighting=(True, False), ) for ns in exps: result_store.append(ns)
def fs_2(data_sets=("metis, presemt-dev"), n_graphs=None): name = "fs-2" remove_exp_dir(name) descriptor = [ ("data", "S16"), ("source", "S8", "source_lang"), ("target", "S8", "target_lang"), ("min_count", "f", "MCF__min_count"), ("max_freq", "f", "MFF__max_freq"), ("nist", "f", "scores.NIST"), ("bleu", "f", "scores.BLEU"), ("exp_name", "S128"), ] result_store = ResultsStore(descriptor, fname_prefix = "_" + name) # tricky: 'classifiers' cannot be an iterator # because it is called many times during grid_search classifiers = list(nb_classifier( _min_count=[1, 5, 10, 25, 50, 100, 250, 500], _max_freq=[1.0, 0.5, 0.1, 0.075, 0.05, 0.025, 0.01, 0.005], chi2_alpha=None)) vectorizer=Vectorizer(score_attr="freq_score") # 'data' cannot be expanded implicitly through grid search # because _lang expansion depends on its value :-( for data in data_sets: exps = ex.single_exp( name=name, _classifier=classifiers, data=data, _lang=config["eval"][data].keys(), #_lang=("de-en",), write_text=ex.SKIP, draw_graphs=ex.SKIP, build_models=nb_build_model, vectorizer=vectorizer, thrash_models=ex.thrash_models, n_graphs=n_graphs, ) for ns in exps: result_store.append(ns)
def bounds(data_sets=config["eval"]["data_sets"], lang_pairs=()):
    """
    Compute upper and lower bounds on scores.

    The baseline that serves as the lower bound is the Most Frequent
    Translation (MFT) score, which is obtained by choosing the translation
    with the highest frequency in the target language corpus.

    The upper bound is the Approximated Maximum (AM) score, which is
    obtained by choosing the translation that occurs most often in the
    reference translation(s) of the sentence.

    Probability scores for lempos translations are already in the
    preprocessed graphs. This function just computes the resulting NIST and
    BLEU scores.
    """
    name = "bounds"
    remove_exp_dir(name)

    descriptor = [
        ("data", "S16"),
        ("source", "S8", "source_lang"),
        ("target", "S8", "target_lang"),
        ("score_attr", "S16", "score_attr"),
        ("nist", "f", "scores.NIST"),
        ("bleu", "f", "scores.BLEU"),
        ("exp_name", "S128"),
    ]
    result_store = ResultsStore(descriptor, fname_prefix="_" + name)

    for data in data_sets:
        exps = ex.single_exp(
            name=name,
            classifier=None,
            data=data,
            _lang=lang_pairs or config["eval"][data].keys(),
            _score_attr=("freq_score", "dup_score", "mup_score"),
            build=ex.SKIP,
            compute_classifier_score=ex.SKIP,
            write_text=ex.SKIP,
            write_diff=ex.SKIP,
            draw_graphs=ex.SKIP)

        for ns in exps:
            result_store.append(ns)
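# Illustration of the two bounds (hypothetical counts, not the project's
# graph format): both reduce to a frequency argmax over candidate translations.
def _bounds_demo():
    from collections import Counter
    # MFT lower bound: most frequent translation in the target language corpus
    corpus_counts = Counter({"bank": 120, "bench": 30, "shore": 15})
    mft = corpus_counts.most_common(1)[0][0]  # "bank"
    # AM upper bound: translation occurring most often in the reference
    # translation(s) of this particular sentence
    ref_counts = Counter({"shore": 2, "bank": 1})
    am = ref_counts.most_common(1)[0][0]  # "shore"
    return mft, am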
def lr_1(name = "lr-1", data_sets = ("presemt-dev",), n_graphs=None): remove_exp_dir(name) descriptor = [ ("data", "S16"), ("source", "S8", "source_lang"), ("target", "S8", "target_lang"), ("loss", "S16", "classifier.loss"), ("nist", "f", "scores.NIST"), ("bleu", "f", "scores.BLEU"), ("correct", "i", "accuracy.correct"), ("incorrect", "i", "accuracy.incorrect"), ("ignored", "i", "accuracy.ignored"), ("accuracy", "f", "accuracy.score"), ("exp_name", "S128"), ("models_fname", "S256"), ] result_store = ResultsStore(descriptor, fname_prefix = "_" + name) # tricky: 'classifiers' cannot be an iterator # because it is called many times during grid_search classifiers = list(lr_classifier( )) # 'data' cannot be expanded implicitly through grid search # because _lang expansion depends on its value :-( for data in data_sets: exps = ex.single_exp( name=name, _classifier=classifiers, data=data, _lang=config["eval"][data].keys(), #_lang=("de-en",), write_text=ex.SKIP, draw_graphs=ex.SKIP, #build_models=lr_build_models, n_graphs=n_graphs, ) for ns in exps: result_store.append(ns)
def nc_2(name = "nc-2", n_graphs=None): remove_exp_dir(name) descriptor = [ ("data", "S16"), ("source", "S8", "source_lang"), ("target", "S8", "target_lang"), ("metric", "S16", "NCC__metric"), ("vect_score_attr", "S16", "vectorizer.score_attr"), ("nist", "f", "scores.NIST"), ("bleu", "f", "scores.BLEU"), ("correct", "i", "accuracy.correct"), ("incorrect", "i", "accuracy.incorrect"), ("ignored", "i", "accuracy.ignored"), ("accuracy", "f", "accuracy.score"), ("exp_name", "S128"), ("models_fname", "S256"), ] result_store = ResultsStore(descriptor, fname_prefix = "_" + name) vectorizers= [Vectorizer(score_attr=score_attr) for score_attr in (None, "freq_score", "dup_score")] nc_1_results = np.load("_nc-1.npy") for record in nc_1_results: exps = ex.single_exp( name=name, data=record["data"], lang=record["source"] + "-" + record["target"], classifier=None, write_text=ex.SKIP, draw_graphs=ex.SKIP, build_models=ex.SKIP, trash_models=ex.SKIP, models_fname=record["models_fname"], _vectorizer=vectorizers, n_graphs=n_graphs, ) for ns in exps: # hack, because there is no classifier in exps ns.NCC__metric = record["metric"] result_store.append(ns)
def nb_2(name = "nb-2", n_graphs=None): remove_exp_dir(name) descriptor = [ ("data", "S16"), ("source", "S8", "source_lang"), ("target", "S8", "target_lang"), ("class_weighting", "b"), ("vect_score_attr", "S16", "vectorizer.score_attr"), ("nist", "f", "scores.NIST"), ("bleu", "f", "scores.BLEU"), ("correct", "i", "accuracy.correct"), ("incorrect", "i", "accuracy.incorrect"), ("ignored", "i", "accuracy.ignored"), ("accuracy", "f", "accuracy.score"), ("exp_name", "S128"), ("models_fname", "S256"), ] result_store = ResultsStore(descriptor, fname_prefix = "_" + name) vectorizers=list(vectorizer( _score_attr=(None, "freq_score", "dup_score"))) nb_1_results = np.load("_nb-1.npy") for record in nb_1_results: exps = ex.single_exp( name=name, data=record["data"], lang=record["source"] + "-" + record["target"], classifier=None, class_weighting=record["class_weighting"], write_text=ex.SKIP, draw_graphs=ex.SKIP, build_models=ex.SKIP, trash_models=ex.SKIP, models_fname=record["models_fname"], _vectorizer=vectorizers, n_graphs=n_graphs, ) for ns in exps: result_store.append(ns)
def nc_1(data_sets=config["eval"]["data_sets"], lang_pairs=(), n_graphs=None,
         name="nc-1"):
    remove_exp_dir(name)

    descriptor = [
        ("data", "S16"),
        ("source", "S8", "source_lang"),
        ("target", "S8", "target_lang"),
        ("metric", "S16", "NCC__metric"),
        ("nist", "f", "scores.NIST"),
        ("bleu", "f", "scores.BLEU"),
        ("correct", "i", "accuracy.correct"),
        ("incorrect", "i", "accuracy.incorrect"),
        ("ignored", "i", "accuracy.ignored"),
        ("accuracy", "f", "accuracy.score"),
        ("exp_name", "S128"),
        ("models_fname", "S256"),
    ]
    result_store = ResultsStore(descriptor, fname_prefix="_" + name)

    classifiers = list(nc_classifier(
        # Contrary to docs, l1 distance (manhattan) does NOT support sparse
        _metric=("cosine", "euclidean")))

    # 'data' cannot be expanded implicitly through grid search
    # because _lang expansion depends on its value :-(
    for data in data_sets:
        exps = ex.single_exp(
            name=name,
            _classifier=classifiers,
            data=data,
            _lang=lang_pairs or config["eval"][data].keys(),
            write_text=ex.SKIP,
            draw_graphs=ex.SKIP,
            n_graphs=n_graphs,
        )

        for ns in exps:
            result_store.append(ns)
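# Hypothetical driver (sketch, not in the original module): the experiment
# functions above are typically chained like this; nc_2 depends on the
# "_nc-1.npy" results written by nc_1, so order matters. A small n_graphs
# gives a quick smoke test.
if __name__ == "__main__":
    bounds()
    nc_1(n_graphs=10)
    nc_2(n_graphs=10)  # reads "_nc-1.npy" written by nc_1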