("nist", "f", "scores.NIST"), ("bleu", "f", "scores.BLEU"), ("exp_name", "S128"), ] result_store = ResultsStore(descriptor, fname_prefix = "_" + name) for data in data_sets: exps = ex.single_exp( name=name, classifier=None, data=data, _lang=lang_pairs or config["eval"][data].keys(), _score_attr=("freq_score","dup_score", "mup_score"), build=ex.SKIP, compute_classifier_score=ex.SKIP, write_text=ex.SKIP, write_diff=ex.SKIP, draw_graphs=ex.SKIP) for ns in exps: result_store.append(ns) if __name__ == "__main__": set_default_log(log_fname="_bounds.log") bounds()
# continue classifier = MultinomialNB() models_fname = join(exp_dir, "nb_models.hdf5") builder = NBModelBuilder(tab_fname, samp_fname, models_fname, classifier, graphs_pkl_fname=graphs_pkl_fname, counts_pkl_fname=counts_pkl_fname, feat_selector=SelectKBest(chi2, k)) builder.run() nist, blue = score_model(lang_pair, exp_dir, draw=False) results[exp_count]["NIST"] = nist results[exp_count]["BLUE"] = blue exp_count += 1 results = results[:exp_count] print results results.dump("nb_feat_select_bst_results_2.pkl") # for logging to stderr in utf-8 use: set_default_log(level=logging.INFO) import logging logging.getLogger("model").setLevel(logging.DEBUG) run_all()
default=True, action="store_false", help="count single word entries (default is True)") parser.add_argument( "--with-multi-word", default=False, action="store_true", help="cont multi word entries (default is False)") parser.add_argument( "-v", "--verbose", action="store_true") args = parser.parse_args() if args.verbose: set_default_log() ambig_dist_report(lang_pairs=args.lang_pairs, entry=args.entry, with_single_word=args.with_single_word, with_multi_word=args.with_multi_word)
fname_prefix = "_" + name) # tricky: 'classifiers' cannot be an iterator # because it is called many times during grid_search classifiers = list(lr_classifier( )) # 'data' cannot be expanded implicitly through grid search # because _lang expansion depends on its value :-( for data in data_sets: exps = ex.single_exp( name=name, _classifier=classifiers, data=data, _lang=config["eval"][data].keys(), #_lang=("de-en",), write_text=ex.SKIP, draw_graphs=ex.SKIP, #build_models=lr_build_models, n_graphs=n_graphs, ) for ns in exps: result_store.append(ns) if __name__ == "__main__": set_default_log(log_fname="_lr-1.log") lr_1( n_graphs=1 )
# NOTE(review): fragment — def run_cv1(...) and the enclosing loop(s) that bind
# lemma, pos, n_cand, data, classifiers and the counter i are above this
# excerpt; indentation reconstructed from the collapsed source line.
        samples, targets = shuffle(data.samples, data.targets)

        for classifier in classifiers:
            scorer = Scorer()
            # the Scorer instance accumulates scores via the 'scoring' callback
            cross_val_score(classifier, samples, targets, scoring=scorer)
            params = (lemma, pos, n_cand,
                      classifier.alpha, classifier.loss,
                      classifier.n_iter, classifier.penalty)
            results[i] = params + tuple(scorer.mean_scores())
            i += 1

    # keep only the rows that were actually filled in
    np.save(results_fname, results[:i])
    text_table(results[:i], results_fname.replace(".npy", ".txt"))

if __name__ == "__main__":
    lang_pair = "de-en"
    set_default_log(log_fname="_sgd-cv-1_results_{}.log".format(lang_pair))
    results_fname = "_sgd-cv-1_results_{}.npy".format(lang_pair)
    run_cv1(lang_pair, results_fname,
            #subset = {"anmelden/v*.full", "Magazin/n"}
            )
# NOTE(review): fragment — def sgd_1(...) and the call this keyword argument
# closes are above this excerpt; indentation reconstructed from the collapsed
# source line.
                          n_jobs=n_jobs)

    # 'data' cannot be expanded implicitly through grid search
    # because _lang expansion depends on its value :-(
    for data in data_sets:
        exps = ex.single_exp(
            name=name,
            classifier=classifier,
            data=data,
            # fall back to all configured language pairs when 'lang' is falsy
            _lang=lang or config["eval"][data].keys(),
            write_text=ex.SKIP,
            draw_graphs=ex.SKIP,
            n_graphs=n_graphs,
            # *** input to SGDClassifier must be shuffled! ***
            shuffle=True,
            _class_weighting=(True, False),
            )
        for ns in exps:
            result_store.append(ns)

if __name__ == "__main__":
    set_default_log(log_fname="_sgd-1.log")
    sgd_1( name = "sgd-1",
           #n_graphs=2,
           #lang=("de-en",),
           n_jobs=10
           )
# NOTE(review): fragment — def nc_1(...) and the opening of the 'descriptor'
# list are above this excerpt; indentation reconstructed from the collapsed
# source line.
    ]
    result_store = ResultsStore(descriptor, fname_prefix = "_" + name)

    classifiers = list(nc_classifier(
        # Contrary to docs, l1 distance (manhattan) does NOT support sparse
        _metric=("cosine", "euclidean")))

    # 'data' cannot be expanded implicitly through grid search
    # because _lang expansion depends on its value :-(
    for data in data_sets:
        exps = ex.single_exp(
            name=name,
            _classifier=classifiers,
            data=data,
            _lang=lang_pairs or config["eval"][data].keys(),
            write_text=ex.SKIP,
            draw_graphs=ex.SKIP,
            n_graphs=n_graphs,
            )
        for ns in exps:
            result_store.append(ns)

if __name__ == "__main__":
    set_default_log(log_fname="_nc_1.log")
    nc_1( data_sets = ("metis","presemt-dev"),
          #n_graphs=2,
          )
# 'data' cannot be expanded implicitly through grid search # because _lang expansion depends on its value :-( for data in data_sets: exps = ex.single_exp( name=name, _classifier=classifiers, data=data, _lang=config["eval"][data].keys(), #_lang=("de-en",), write_text=ex.SKIP, draw_graphs=ex.SKIP, build_models=nb_build_model, vectorizer=vectorizer, thrash_models=ex.thrash_models, n_graphs=n_graphs, ) for ns in exps: result_store.append(ns) if __name__ == "__main__": set_default_log(log_fname="_fs-2.log") fs_2(data_sets=( "metis", "presemt-dev" ), #n_graphs=2 )
# so assume boolean em[i,j] = 1 if log.isEnabledFor(logging.DEBUG): log.debug(u"{0} ==> {1}".format( target_lemma, ", ".join([str((reverse_vocab[j], count)) for j, count in zip(em.rows[i], em.data[i])]))) log.info("converting to csr_matrix") return em.tocsr() if __name__ == "__main__": from tg.utils import set_default_log set_default_log(level=logging.DEBUG) from tg.config import config extend_samples(#samp_hdf_fname = "en_samples_filtered.hdf5", samp_hdf_fname = "en_samples_subset_filtered.hdf5", tdict_pkl_fname = config["dict"]["de-en"]["pkl_fname"], reverse_tdict_pkl_fname = config["dict"]["en-de"]["pkl_fname"], #ext_hdf_fname = "en_samples_filtered_extended.hdf5", ##ext_hdf_fname = "en_samples_subset_filtered_extended.hdf5", ext_hdf_fname = "ff.hdf5", max_samp = 1, ) #extend_samples(#samp_hdf_fname = "de_samples_filtered.hdf5", #samp_hdf_fname = "de_samples_subset_filtered.hdf5",