def run_all():
    """
    Run the vocabulary cut-off experiments for both language pairs.

    For each language pair and each (vocab_i, vocab_j) pair yielded by
    cut_vocab, a MultinomialNB classifier is handed to make_models, the
    result is scored with score_model, and the NIST/BLUE scores are stored
    in a structured array. The array is printed and dumped to
    "nb_cut_vocab_results.pkl" once all experiments have run.

    NOTE(review): this source contains a second definition of run_all
    further down, which rebinds the name -- confirm which one is meant to
    be live.
    """
    language_pairs = "en-de", "de-en"
    tab_fnames = (
        join(config["private_data_dir"], "corpmod/en/de/en-de_ambig.tab"),
        join(config["private_data_dir"], "corpmod/de/en/de-en_ambig.tab"),
    )

    # One record per experiment. The field name "BLUE" (sic) is part of the
    # saved record layout, so it is left untouched.
    descriptor = {'names': ('lang_pair', 'vocab_i', 'vocab_j', 'NIST', 'BLUE'),
                  'formats': ('S8', 'i4', 'i4', 'f4', 'f4')}
    # 9999 is an upper bound on the number of experiments; the array is
    # trimmed to the number actually run before it is saved.
    results = np.zeros(9999, dtype=descriptor)
    exp_count = 0

    for lang_pair, tab_fname in zip(language_pairs, tab_fnames):
        target_lang = lang_pair.split("-")[1]
        samp_fname = target_lang + "_samples_subset_filtered.hdf5"
        graphs_pkl_fname = "prep/{}_graphs.pkl".format(lang_pair)
        lempos_subset = extract_source_lempos_subset(graphs_pkl_fname)
        counts_pkl_fname = config["count"]["lemma"][target_lang]["pkl_fname"]

        for vocab_i, vocab_j in cut_vocab(samp_fname, counts_pkl_fname):
            # scores are filled in after the model has been evaluated
            results[exp_count] = (lang_pair, vocab_i, vocab_j, 0, 0)

            # The directory name is derived from the stored record (all
            # fields except the two trailing scores). list() keeps the
            # slice valid where zip returns an iterator.
            # NOTE(review): on Python 3 the 'S8' field likely reads back as
            # bytes, which would change the directory name -- verify before
            # switching interpreters.
            settings = list(zip(results.dtype.names, results[exp_count]))[:-2]
            exp_dir = "exp_" + "_".join(
                "{}={}".format(var, value) for var, value in settings)
            if not os.path.exists(exp_dir):
                os.makedirs(exp_dir)

            classifier = MultinomialNB()
            models_fname = join(exp_dir, "nb_models.hdf5")
            make_models(tab_fname, samp_fname, models_fname, classifier,
                        source_lempos_subset=lempos_subset,
                        counts_pkl_fname=counts_pkl_fname,
                        vocab_i=vocab_i, vocab_j=vocab_j)

            nist, blue = score_model(lang_pair, exp_dir, draw=False)
            results[exp_count]["NIST"] = nist
            results[exp_count]["BLUE"] = blue
            exp_count += 1

    # drop the unused tail of the preallocated array
    results = results[:exp_count]
    # single-argument print(...) behaves the same on Python 2 and 3
    print(results)
    results.dump("nb_cut_vocab_results.pkl")
def run_all():
    """
    Run the Naive Bayes parameter grid for both language pairs.

    The grid covers: language pair, extended vs. plain sample vectors,
    with/without corpus priors (a counts pickle passed to make_models),
    classifier class (MultinomialNB, BernoulliNB) and smoothing alpha.
    Each combination gets its own experiment directory, is passed to
    make_models and scored with score_model; the NIST/BLUE scores are
    collected in a structured array that is printed and dumped to
    "results.pkl".
    """
    language_pairs = "en-de", "de-en"
    tab_fnames = (
        join(config["private_data_dir"], "corpmod/en/de/en-de_ambig.tab"),
        join(config["private_data_dir"], "corpmod/de/en/de-en_ambig.tab"),
    )
    # one-element tuple: only extended vectors are tried (False is disabled)
    extended_vectors = (True,)
    classifier_types = MultinomialNB, BernoulliNB
    alpha_values = 1.0, 0.1, 0.01, np.finfo(np.double).eps
    corpus_prior_values = False, True
    # the fit_prior parameter seems to make absolutely no difference,
    # so it is not part of the grid

    # One record per experiment. The field name "BLUE" (sic) is part of the
    # saved record layout, so it is left untouched.
    descriptor = {'names': ('lang_pair', 'classifier', 'alpha',
                            'corpus_priors', 'extended', 'NIST', 'BLUE'),
                  'formats': ('S8', 'S64', 'f4', 'b', 'b', 'f4', 'f4')}

    # Size the result buffer from the grid itself instead of a fixed upper
    # bound, so growing the grid can never overflow it.
    pair_settings = list(zip(language_pairs, tab_fnames))
    n_experiments = (len(pair_settings) * len(extended_vectors) *
                     len(corpus_prior_values) * len(classifier_types) *
                     len(alpha_values))
    results = np.zeros(n_experiments, dtype=descriptor)
    exp_count = 0

    for lang_pair, tab_fname in pair_settings:
        graphs_pkl_fname = "prep/{}_graphs.pkl".format(lang_pair)
        lempos_subset = extract_source_lempos_subset(graphs_pkl_fname)
        target_lang = lang_pair.split("-")[1]

        for extended in extended_vectors:
            if extended:
                samp_fname = (target_lang +
                              "_samples_subset_filtered_extended.hdf5")
            else:
                samp_fname = target_lang + "_samples_subset_filtered.hdf5"

            for corpus_prior in corpus_prior_values:
                # no counts file means make_models gets counts_pkl_fname=None
                if corpus_prior:
                    counts_pkl_fname = \
                        config["count"]["lemma"][target_lang]["pkl_fname"]
                else:
                    counts_pkl_fname = None

                for classifier_class in classifier_types:
                    for alpha in alpha_values:
                        # scores are filled in after evaluation
                        results[exp_count] = (lang_pair,
                                              classifier_class.__name__,
                                              alpha,
                                              corpus_prior,
                                              extended,
                                              0,
                                              0)

                        # The directory name is derived from the stored
                        # record (all fields except the two trailing
                        # scores), so values appear as the array holds
                        # them. list() keeps the slice valid where zip
                        # returns an iterator.
                        # NOTE(review): on Python 3 the 'S8'/'S64' fields
                        # likely read back as bytes, which would change the
                        # directory name -- verify before switching
                        # interpreters.
                        settings = list(zip(results.dtype.names,
                                            results[exp_count]))[:-2]
                        exp_dir = "exp_" + "_".join(
                            "{}={}".format(var, value)
                            for var, value in settings)
                        if not os.path.exists(exp_dir):
                            os.makedirs(exp_dir)

                        classifier = classifier_class(alpha=alpha)
                        models_fname = join(exp_dir, "nb_models.hdf5")
                        make_models(tab_fname, samp_fname, models_fname,
                                    classifier,
                                    source_lempos_subset=lempos_subset,
                                    counts_pkl_fname=counts_pkl_fname)

                        nist, blue = score_model(lang_pair, exp_dir,
                                                 draw=False)
                        results[exp_count]["NIST"] = nist
                        results[exp_count]["BLUE"] = blue
                        exp_count += 1

    # keep only the rows that were actually filled
    results = results[:exp_count]
    # single-argument print(...) behaves the same on Python 2 and 3
    print(results)
    results.dump("results.pkl")