def print_aggregate(cnts): for k in cnts: print k tot = 0 for v, k2 in dictsort(cnts[k]): tot += v for v, k2 in dictsort(cnts[k]): print "\t", percent(v, tot), k2
def print_aggregate_compare(cnts, cntsmore): """ Compare the hyperparams in the TOP jobs to the hyperparams in the MORE jobs. """ cntscopy = copy.deepcopy(cnts) for k in cnts: print k for k2 in cnts[k].keys(): cntscopy[k][k2] = (1. * cnt[k][k2]/cntsmore[k][k2], cnts[k][k2], cntsmore[k][k2]) maxperc = dictsort(cntscopy[k])[0][0][0] for v, k2 in dictsort(cntscopy[k]): # The second column (v[0]/maxperc) is a score for how good this hyperparam is. print "\t", k2, "\t", "%.2f" % (v[0]/maxperc), "\t", percent(v[1], v[2], rev=True)
def print_aggregate_compare(cnts, cntsmore): """ Compare the hyperparams in the TOP jobs to the hyperparams in the MORE jobs. """ cntscopy = copy.deepcopy(cnts) for k in cnts: print k for k2 in cnts[k].keys(): cntscopy[k][k2] = (1. * cnt[k][k2] / cntsmore[k][k2], cnts[k][k2], cntsmore[k][k2]) maxperc = dictsort(cntscopy[k])[0][0][0] for v, k2 in dictsort(cntscopy[k]): # The second column (v[0]/maxperc) is a score for how good this hyperparam is. print "\t", k2, "\t", "%.2f" % (v[0] / maxperc), "\t", percent( v[1], v[2], rev=True)
# NOTE(review): collapsed Python 2 script chunk that dumps the forward and
# reverse targetmap translation tables — for every source word it prints the
# candidate translations per target language as (percent, wordform) pairs,
# skipping words with no language or no targetmap entry.
# The chunk is TRUNCATED mid-statement: the final
# `for l2 in targetmap(name="reverse")[w1]:` has no loop body, so the code is
# left byte-identical rather than reformatted or rewritten.
from vocabulary import wordmap, wordform, language from targetvocabulary import targetmap for w1 in wordmap().all: w1 = wordmap().id(w1) # Actually, should assert W2W SKIP TRANSLATIONS FROM UNKNOWN WORD assert HYPERPARAMETERS["W2W SKIP TRANSLATIONS TO UNKNOWN WORD"] if language(w1) is None: print >> sys.stderr, "Skipping %s" % `wordmap().str(w1)` continue if w1 not in targetmap(): print >> sys.stderr, "Skipping %s, not a source word in targetmap" % `wordmap().str(w1)` continue for l2 in targetmap()[w1]: totcnt = 0 for cnt, w2 in dictsort(targetmap()[w1][l2]): totcnt += cnt print wordmap().str(w1), l2, [(percent(cnt, totcnt), wordform(w2)) for cnt, w2 in dictsort(targetmap()[w1][l2])] print >> sys.stderr, "REVERSE MAP NOW" for w1 in wordmap().all: w1 = wordmap().id(w1) # Actually, should assert W2W SKIP TRANSLATIONS FROM UNKNOWN WORD assert HYPERPARAMETERS["W2W SKIP TRANSLATIONS TO UNKNOWN WORD"] if language(w1) is None: print >> sys.stderr, "Skipping %s" % `wordmap().str(w1)` continue if w1 not in targetmap(name="reverse"): print >> sys.stderr, "Skipping %s, not a source word in targetmap" % `wordmap().str(w1)` continue for l2 in targetmap(name="reverse")[w1]:
from targetvocabulary import targetmap for w1 in wordmap().all: w1 = wordmap().id(w1) # Actually, should assert W2W SKIP TRANSLATIONS FROM UNKNOWN WORD assert HYPERPARAMETERS["W2W SKIP TRANSLATIONS TO UNKNOWN WORD"] if language(w1) is None: print >> sys.stderr, "Skipping %s" % ` wordmap().str(w1) ` continue if w1 not in targetmap(): print >> sys.stderr, "Skipping %s, not a source word in targetmap" % ` wordmap( ).str(w1) ` continue for l2 in targetmap()[w1]: totcnt = 0 for cnt, w2 in dictsort(targetmap()[w1][l2]): totcnt += cnt print wordmap().str(w1), l2, [ (percent(cnt, totcnt), wordform(w2)) for cnt, w2 in dictsort(targetmap()[w1][l2]) ] print >> sys.stderr, "REVERSE MAP NOW" for w1 in wordmap().all: w1 = wordmap().id(w1) # Actually, should assert W2W SKIP TRANSLATIONS FROM UNKNOWN WORD assert HYPERPARAMETERS["W2W SKIP TRANSLATIONS TO UNKNOWN WORD"] if language(w1) is None: print >> sys.stderr, "Skipping %s" % ` wordmap().str(w1) ` continue
import w2w.corpora
import string
from common.mydict import sort as dictsort
from collections import defaultdict

# Count per-language word frequencies over all bilingual corpora.
wordfreq = defaultdict(int)
for l1, l2, f1, f2, falign in w2w.corpora.bicorpora_filenames():
    for w in readwords(f1):
        wordfreq[(l1, w)] += 1
    for w in readwords(f2):
        wordfreq[(l2, w)] += 1
for l, f in w2w.corpora.monocorpora_filenames():
    assert 0    # monolingual corpora are not supported

# Prune rare words and the unknown-word sentinel.
# (Safe to delete while iterating: in Python 2, .keys() returns a list copy.)
for (l, w) in wordfreq.keys():
    # BUG FIX: the original deleted the key twice when a rare "*UNKNOWN*"
    # entry matched both conditions, raising KeyError on the second `del`.
    # Combining the tests deletes each key at most once.
    if wordfreq[(l, w)] < HYPERPARAMETERS["W2W MINIMUM WORD FREQUENCY"] or w == "*UNKNOWN*":
        del wordfreq[(l, w)]

import w2w.vocabulary
import common.idmap

# Vocabulary IDs are assigned in frequency order, after the two
# boundary sentinels.
wordfreqkeys = [key for cnt, key in dictsort(wordfreq)]
v = common.idmap.IDmap([(None, "*LBOUNDARY*"), (None, "*RBOUNDARY*")] + wordfreqkeys,
                       allow_unknown=HYPERPARAMETERS["INCLUDE_UNKNOWN_WORD"],
                       unknown_key=(None, "*UNKNOWN*"))
w2w.vocabulary.write(v)
import examples, sys import graph import numpy as N from vocabulary import labelmap ODIM = labelmap.len from common.mydict import sort as dictsort for l in sys.stdin: e = examples._example_from_string(l) (x, y) = e if HYPERPARAMETERS["locally normalize"]: targety = N.array([y]) else: targety = N.zeros(ODIM) targety[y] = 1. if HLAYERS == 2: o = graph.validatefn([x.data], targety, w1[x.indices], b1, wh, bh, w2, b2) (kl, softmax, argmax, prehidden1, prehidden2) = o else: o = graph.validatefn([x.data], targety, w1[x.indices], b1, w2, b2) (kl, softmax, argmax, prehidden) = o assert softmax.shape[0] == 1 softmax = softmax[0] prs = {} for i in range(softmax.shape[0]): prs[labelmap.str(i)] = softmax[i] print dictsort(prs)[:3] # print argmax, softmax
from collections import defaultdict

# Count per-language word frequencies over all bilingual corpora.
wordfreq = defaultdict(int)
for l1, l2, f1, f2, falign in w2w.corpora.bicorpora_filenames():
    for w in readwords(f1):
        wordfreq[(l1, w)] += 1
    for w in readwords(f2):
        wordfreq[(l2, w)] += 1
for l, f in w2w.corpora.monocorpora_filenames():
    assert 0    # monolingual corpora are not supported

# Prune rare words and the unknown-word sentinel.
# (Safe to delete while iterating: in Python 2, .keys() returns a list copy.)
for (l, w) in wordfreq.keys():
    # BUG FIX: the original deleted the key twice when a rare "*UNKNOWN*"
    # entry matched both conditions, raising KeyError on the second `del`.
    # Combining the tests deletes each key at most once.
    if wordfreq[(l, w)] < HYPERPARAMETERS["W2W MINIMUM WORD FREQUENCY"] or w == "*UNKNOWN*":
        del wordfreq[(l, w)]

import w2w.vocabulary
import common.idmap

# Vocabulary IDs are assigned in frequency order, after the two
# boundary sentinels.
wordfreqkeys = [key for cnt, key in dictsort(wordfreq)]
v = common.idmap.IDmap(
    [(None, "*LBOUNDARY*"), (None, "*RBOUNDARY*")] + wordfreqkeys,
    allow_unknown=HYPERPARAMETERS["INCLUDE_UNKNOWN_WORD"],
    unknown_key=(None, "*UNKNOWN*"))
w2w.vocabulary.write(v)