def max_scores():
    """Print per-lemma scores for one fixed parameter setting, best f-score first."""
    # r: module-level structured array of CV results (presumably from run_cv1)
    subset = r[(r["alpha"] == 0.001) &
               (r["loss"] == "log") &
               (r["n_iter"] == 5) &
               (r["penalty"] == "l2")]
    subset.sort(axis=0, order=["f-score"])
    text_table(subset[::-1])
    print

def ambig_dist_report(lang_pairs=config["dict"].keys(), entry="lempos",
                      with_single_word=True, with_multi_word=False,
                      max_trans=1000, outf=sys.stdout):
    """
    Report statistics on translation ambiguity in dictionary

    Parameters
    ----------
    lang_pairs: list of strings
        language pairs to report on (defaults to all pairs in config["dict"])
    entry: 'lempos' or 'lemma'
        count lemma and POS tag combinations or lemmas only
        (sums ambiguity of lemmas with different POS tag)
    with_single_word: bool
        count single words
    with_multi_word: bool
        count multi-word expressions
    max_trans: int
        maximum number of translations considered
    outf: file
        output file (defaults to stdout)
    """
    for lang_pair in lang_pairs:
        pkl_fname = config["dict"][lang_pair]["pkl_fname"]
        outf.write("dictionary file: {}\n".format(pkl_fname))
        outf.write("language pair: {}\n".format(lang_pair))
        outf.write("entries: {}\n".format(entry))
        outf.write("count single word entries: {}\n".format(with_single_word))
        outf.write("count multi-word entries: {}\n".format(with_multi_word))
        outf.write("maximum number of translations: {}\n".format(max_trans))

        trans_dict = cPickle.load(open(pkl_fname))
        dist = ambig_dist(trans_dict, entry=entry,
                          with_single_word=with_single_word,
                          with_multi_word=with_multi_word,
                          max_trans=max_trans)

        outf.write("total number of entries: {0}\n".format(
            dist["count"].sum()))
        outf.write("total number of ambiguous entries: {0} ({1:.2f}%)\n".format(
            dist[2:]["count"].sum(), dist[2:]["percent"].sum()))
        outf.write("total number of non-ambiguous entries: {0} ({1:.2f}%)\n".format(
            dist[:2]["count"].sum(), dist[:2]["percent"].sum()))

        av_ambig = ((dist[2:]["count"] * dist[2:]["#trans"]).sum() /
                    dist[2:]["count"].sum().astype("float"))
        outf.write("average ambiguity (over ambiguous entries only): "
                   "{0:.2f} translations\n\n".format(av_ambig))

        text_table(dist, outf)
        outf.write("\n\n")

    print "\n"

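
def example_ambig_dist_report(lang_pair="en-de"):
    # Usage sketch, not part of the original experiments: write a lemma-level
    # ambiguity report for a single language pair to a text file instead of
    # stdout. The "en-de" pair and the output file name are assumptions; any
    # key of config["dict"] works.
    outf = open("_ambig_dist_{}.txt".format(lang_pair), "w")
    ambig_dist_report(lang_pairs=[lang_pair], entry="lemma",
                      with_multi_word=True, outf=outf)
    outf.close()
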
def summary():
    """Print mean scores per (alpha, loss, n_iter, penalty) combination, best f-score first."""
    params = ["alpha", "loss", "n_iter", "penalty"]
    scores = ["prec", "rec", "f-score", "accuracy"]
    # unique parameter combinations occurring in the results array r
    keys = np.unique(r[params])
    summary = np.zeros(len(keys), r.dtype.descr[3:])

    for i, k in enumerate(keys):
        subset = r[r[params] == k]
        subset_scores = subset[scores]
        # view the score fields as a plain float array to average per column
        view = subset_scores.view(("f", len(subset_scores.dtype.names)))
        means = view.mean(axis=0)
        summary[i] = tuple(k) + tuple(means)

    summary.sort(axis=0, order=["f-score"])
    text_table(summary[::-1])
    print

def nb_exp(data_sets=config["eval"]["data_sets"],
           lang_pairs=(),
           text=False,
           draw=False,
           diff=False,
           trash_models=False):
    n_components = 10

    descriptor = [("data", "S16"),
                  ("lang", "S8"),
                  ("nist", "f"),
                  ("blue", "f"),
                  ("name", "S256")]
    results = np.zeros(9999, dtype=descriptor)
    exp_count = 0

    script_fname = os.path.splitext(os.path.basename(__file__))[0]
    results_fname = "_" + script_fname + "_results.txt"
    results_outf = open(results_fname, "w")

    for data in data_sets:
        for lang in lang_pairs or config["eval"][data].keys():
            ambig_fname = config["sample"][lang]["ambig_fname"]
            try:
                samples_fname = config["sample"][lang]["samples_filt_fname"]
            except KeyError:
                samples_fname = config["sample"][lang]["samples_fname"]
                log.warn("backing off to unfiltered samples from " +
                         samples_fname)
            graphs_fname = config["eval"][data][lang]["graphs_fname"]

            name = "{}_{}_{}".format(script_fname, data, lang)
            exp_dir = "_" + name
            if not os.path.exists(exp_dir):
                os.makedirs(exp_dir)
            models_fname = exp_dir + "/" + name + ".hdf5"

            classifier = Pipeline(
                [("MCF", MinCountFilter(5)),
                 ("MFF", MaxFreqFilter(0.05)),
                 ("CHI2", SelectFpr(chi2, alpha=0.001)),
                 ("NMF", NMF(n_components=n_components)),
                 ("MNB", MultinomialNB()),
                 ])

            # get ambiguity map
            ambig_map = AmbiguityMap(ambig_fname, graphs_fname=graphs_fname)
            #ambig_map = AmbiguityMap(ambig_fname, subset={"klar/adj"})

            # train classifier
            model_builder = ModelBuilder(ambig_map, samples_fname,
                                         models_fname, classifier)
                                         #, with_vocab_mask=True)
            model_builder.run()

            # apply classifier
            model = TranslationClassifier(models_fname)
            score_attr = "nb_score"
            source_lang = lang.split("-")[0]
            scorer = ClassifierScore(model,
                                     score_attr=score_attr,
                                     filter=filter_functions(source_lang),
                                     vectorizer="mft")
            graph_list = cPickle.load(open(graphs_fname))
            scorer(graph_list)

            best_scorer = BestScorer(["nb_score", "freq_score"])
            best_scorer(graph_list)

            scored_graphs_fname = exp_dir + "/" + name + "_graphs.pkl"
            log.info("saving scored graphs to " + scored_graphs_fname)
            cPickle.dump(graph_list, open(scored_graphs_fname, "w"))
            #graph_list = cPickle.load(open(scored_graphs_fname))

            nist_score, bleu_score = postprocess(
                name, data, lang, graph_list,
                best_score_attr="best_score",
                base_score_attrs=["nb_score", "freq_score"],
                out_dir=exp_dir,
                base_fname=name,
                text=text,
                draw=draw,
                diff=diff)

            results[exp_count] = (data, lang, nist_score, bleu_score, name)
            results_fname = exp_dir + "/" + name + ".npy"
            log.info("saving result to " + results_fname)
            np.save(results_fname, results[exp_count])
            exp_count += 1

            if trash_models:
                log.info("Trashing models file " + models_fname)
                os.remove(models_fname)

            # add to table of results per data set & language pair
            sub_results = results[(results["lang"] == lang) &
                                  (results["data"] == data)]
            sub_results = np.sort(sub_results, axis=0,
                                  order=("lang", "blue"))[::-1]
            text_table(sub_results, results_outf)
            results_outf.write("\n\n")

    results_outf.close()

    results = results[:exp_count]
    results_fname = "_" + script_fname + "_results.npy"
    log.info("saving pickled results to " + results_fname)
    np.save(results_fname, results)
    text_table(results)

    return results

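
def example_nb_exp(lang_pair="en-de"):
    # Usage sketch, not part of the original experiments: run the Naive Bayes
    # experiment for a single (hypothetical) language pair on the default
    # evaluation data sets, writing text output and removing the HDF5 model
    # files afterwards to save disk space.
    return nb_exp(lang_pairs=[lang_pair], text=True, trash_models=True)
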
def run_cv1(lang_pair, results_fname, subset=None):
    ambig_fname = config["sample"][lang_pair]["ambig_fname"]
    ambig_map = AmbiguityMap(ambig_fname, subset=subset)

    samples_fname = config["sample"][lang_pair]["samples_filt_fname"]
    sample_hdfile = h5py.File(samples_fname, "r")

    data_gen = DataSetGenerator(ambig_map, sample_hdfile)

    classifiers = list(sgd_classifier(
        _alpha=(0.00001, 0.0001, 0.001),
        _loss=("hinge", "log"),
        _n_iter=(5, 10),
        _penalty=("l1", "l2"),
        shuffle=True,               # shuffle seems always beneficial
        random_state=73761232569,   # but needs to be repeatable
        n_jobs=10,
    ))

    descriptor = [("lemma", "U32"),
                  ("pos", "U32"),
                  ("#cand", "i"),
                  ("alpha", "f"),
                  ("loss", "S16"),
                  ("n_iter", "i"),
                  ("penalty", "S16"),
                  ("prec", "f"),
                  ("rec", "f"),
                  ("f-score", "f"),
                  ("accuracy", "f")]
    results = np.zeros(9999, dtype=descriptor)
    i = 0

    for n, data in enumerate(data_gen):
        if not data.target_lempos:
            log.error(data.source_lempos + u" has no samples")
            continue

        log.info(u"{}/{} {}".format(n + 1, len(ambig_map), data.source_lempos))

        lemma, pos = data.source_lempos.rsplit("/", 1)
        n_cand = len(data.target_lempos)

        # *** shuffling is essential for SGD! ***
        samples, targets = shuffle(data.samples, data.targets)

        for classifier in classifiers:
            scorer = Scorer()
            cross_val_score(classifier, samples, targets, scoring=scorer)
            params = (lemma, pos, n_cand, classifier.alpha, classifier.loss,
                      classifier.n_iter, classifier.penalty)
            results[i] = params + tuple(scorer.mean_scores())
            i += 1

    np.save(results_fname, results[:i])
    text_table(results[:i], results_fname.replace(".npy", ".txt"))

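
def example_cv1_report(lang_pair="en-de"):
    # Usage sketch, not part of the original experiments: run the SGD
    # cross-validation grid for a single (hypothetical) language pair, then
    # load the saved results into the module-level array ``r`` read by
    # summary() and max_scores() above. The file name and language pair are
    # assumptions.
    global r
    results_fname = "_run_cv1_{}.npy".format(lang_pair)
    run_cv1(lang_pair, results_fname)
    r = np.load(results_fname)
    summary()     # mean scores per parameter combination
    max_scores()  # per-lemma scores for one fixed parameter setting
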
def centroid_exp(data_sets=config["eval"]["data_sets"],
                 lang_pairs=(),
                 text=False,
                 draw=False,
                 diff=False,
                 trash_models=False,
                 dump_centroids=False):
    descriptor = [("data", "S16"),
                  ("lang", "S8"),
                  ("min_count", "f"),
                  ("max_freq", "f"),
                  ("nist", "f"),
                  ("blue", "f"),
                  ("name", "S256")]
    results = np.zeros(9999, dtype=descriptor)
    exp_count = 0

    script_fname = os.path.splitext(os.path.basename(__file__))[0]
    results_fname = "_" + script_fname + "_results.txt"
    results_outf = open(results_fname, "w")

    for data in data_sets:
        for lang in lang_pairs or config["eval"][data].keys():
            ambig_fname = config["sample"][lang]["ambig_fname"]
            try:
                samples_fname = config["sample"][lang]["samples_filt_fname"]
            except KeyError:
                samples_fname = config["sample"][lang]["samples_fname"]
                log.warn("backing off to unfiltered samples from " +
                         samples_fname)
            graphs_fname = config["eval"][data][lang]["graphs_fname"]

            #for min_count in (1, 5, 10, 25, 50, 100, 250, 1000, 2500, 5000):
            #    for max_freq in (0.0001, 0.001, 0.005, 0.01, 0.05, 0.10, 0.25, 0.5, 1.0):
            for min_count in (5,):
                for max_freq in (0.01,):
                    name = "{}_{}_{}_min_count={:d}_max_freq={:f}".format(
                        script_fname, data, lang, min_count, max_freq)
                    exp_dir = "_" + name
                    if not os.path.exists(exp_dir):
                        os.makedirs(exp_dir)
                    models_fname = exp_dir + "/" + name + ".hdf5"

                    classifier = Pipeline(
                        [("MCF", MinCountFilter(min_count)),
                         ("MFF", MaxFreqFilter(max_freq)),
                         ("CHI2", SelectFpr()),
                         #("TFIDF", TfidfTransformer()),
                         ("CNC", CosNearestCentroid())
                         #("NC", NearestCentroidProb())
                         ])

                    # train classifier
                    model_builder = ModelBuilder(ambig_fname, samples_fname,
                                                 models_fname, classifier,
                                                 graphs_fname,
                                                 with_vocab_mask=True)
                    model_builder.run()

                    # print the centroids to a file, only the 50 best features
                    if dump_centroids:
                        print_fname = exp_dir + "/" + name + "_centroids.txt"
                        print_centroids(models_fname, n=50, outf=print_fname)

                    # apply classifier
                    model = TranslationClassifier(models_fname)
                    score_attr = "centroid_score"
                    source_lang = lang.split("-")[0]
                    scorer = ClassifierScore(
                        model,
                        score_attr=score_attr,
                        filter=filter_functions(source_lang))
                    graph_list = cPickle.load(open(graphs_fname))
                    scorer(graph_list)

                    best_scorer = BestScorer(["centroid_score", "freq_score"])
                    best_scorer(graph_list)

                    scored_graphs_fname = exp_dir + "/" + name + "_graphs.pkl"
                    log.info("saving scored graphs to " + scored_graphs_fname)
                    cPickle.dump(graph_list, open(scored_graphs_fname, "w"))
                    #graph_list = cPickle.load(open(scored_graphs_fname))

                    nist_score, bleu_score = postprocess(
                        name, data, lang, graph_list,
                        best_score_attr="best_score",
                        base_score_attrs=["centroid_score", "freq_score"],
                        out_dir=exp_dir,
                        base_fname=name,
                        text=text,
                        draw=draw,
                        diff=diff)

                    results[exp_count] = (data, lang, min_count, max_freq,
                                          nist_score, bleu_score, name)
                    results_fname = exp_dir + "/" + name + ".npy"
                    log.info("saving result to " + results_fname)
                    np.save(results_fname, results[exp_count])
                    exp_count += 1

                    if trash_models:
                        log.info("Trashing models file " + models_fname)
                        os.remove(models_fname)

            sub_results = results[(results["lang"] == lang) &
                                  (results["data"] == data)]
            sub_results = np.sort(sub_results, axis=0,
                                  order=("lang", "blue"))[::-1]
            text_table(sub_results, results_outf)
            results_outf.write("\n\n")

    results_outf.close()

    results = results[:exp_count]
    results_fname = "_" + script_fname + "_results.npy"
    log.info("saving pickled results to " + results_fname)
    np.save(results_fname, results)
    text_table(results)

    return results

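
def example_centroid_exp(lang_pair="en-de"):
    # Usage sketch, not part of the original experiments: same driver pattern
    # as nb_exp() above, but with the cosine nearest-centroid classifier and
    # a dump of the 50 best features per centroid. The language pair is an
    # assumption.
    return centroid_exp(lang_pairs=[lang_pair], text=True, trash_models=True,
                        dump_centroids=True)
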
("bleu", "f"), ("exp_name", "S128"), ] new_results = np.zeros(len(old_results), descriptor) for i, exp in enumerate(old_results): ref_fname = config["eval"][exp["data"]][exp["source"] + "-" + exp["target"]]["lemma_ref_fname"] graphs_fname = "_{}/{}_graphs.pkl".format(name, exp["exp_name"]) graphs = cPickle.load(open(graphs_fname)) accuracy = accuracy_score(graphs, ref_fname, name + "_score") new_results[i]["graphs"] = len(graphs) new_results[i]["data"] = exp["data"] new_results[i]["source"] = exp["source"] new_results[i]["target"] = exp["target"] new_results[i]["min_count"] = exp["min_count"] new_results[i]["max_freq"] = exp["max_freq"] new_results[i]["correct"] = accuracy.correct new_results[i]["incorrect"] = accuracy.incorrect new_results[i]["accuracy"] = accuracy.score new_results[i]["ignored"] = accuracy.ignored new_results[i]["nist"] = exp["nist"] new_results[i]["bleu"] = exp["bleu"] new_results[i]["exp_name"] = exp["exp_name"] np.save("_" + name + "-acc.npy", new_results) text_table(new_results, "_" + name + "-acc.txt")
def print_results(results_fname, out_fname=None):
    """Print a saved results array as a text table."""
    table = np.load(results_fname)
    text_table(table, out_fname)