def play(*x, **kw):
    """Show the question, gold answer and predicted answer for test examples.

    Call forms:
      * ``play(i)``  -- decode example ``i`` of the module-level ``qmat_x`` /
        ``amat_x`` matrices, run ``encdec.predict`` on it and print the
        question, the gold answer and the prediction.  Returns ``True`` when
        something was printed, ``False`` when the example was suppressed.
      * ``play()``   -- walk over every row of ``qmat_x``; after each example
        that was printed, wait for the user to press enter.  Returns ``None``.

    Keyword arguments:
      hidecorrect -- when truthy, examples whose prediction starts with the
                     gold answer string are not printed (default ``False``).

    Raises:
      Exception -- if more than one positional argument is given.
    """
    hidecorrect = kw.get("hidecorrect", False)

    if len(x) == 1:
        idx = x[0]
        q = wordids2string(qmat_x[idx], rwd=qrwd, maskid=maskid, reverse=True)
        # gold answer: first column of the answer row is skipped
        ga = wordids2string(amat_x[idx, 1:], rwd=arwd, maskid=maskid)
        # predict on a single-row batch; the decoder input drops the last column
        pred = encdec.predict(qmat_x[idx:idx + 1], amati_x[idx:idx + 1, :-1])
        pa = wordids2string(np.argmax(pred[0], axis=1), rwd=arwd, maskid=maskid)
        if hidecorrect and ga == pa[:len(ga)]:
            # prediction begins with the gold answer -> treated as correct
            return False
        print("{}: {}".format(idx, q))
        print(ga)
        print(pa)
        return True
    elif len(x) == 0:
        for i in range(qmat_x.shape[0]):
            # Forward hidecorrect: the return value is only ever False when
            # the flag is set, so without forwarding it the flag was ignored
            # in this mode and every example paused for input.
            if play(i, hidecorrect=hidecorrect):
                raw_input()  # Python 2 builtin, as used by the rest of the file
    else:
        raise Exception("invalid argument to play")
def to_char_level(qmat, amat, qdic, adic, maskid):
    """Convert word-level question/answer matrices to character level.

    Both matrices are passed through ``wordmat2charmat`` (``maxlen=1000``),
    every strictly positive id in the result is shifted up by two, and fresh
    dictionaries are built that map ``chr(c)`` to ``c + 2`` for every ``c``
    below the largest shifted id, plus ``"<RARE>"`` mapped to ``1``.  The
    first row of each converted matrix is printed as a sanity check.

    Returns the tuple ``(qmat, amat, qdic, adic)`` holding the converted
    matrices and the new character dictionaries.
    """
    qchars = wordmat2charmat(qmat, qdic, maxlen=1000, maskid=maskid)
    achars = wordmat2charmat(amat, adic, maxlen=1000, maskid=maskid)

    # make room below the character ids (id 1 is used for "<RARE>" below)
    qchars[qchars > 0] += 2
    achars[achars > 0] += 2

    qchardic = {chr(code): code + 2 for code in range(np.max(qchars))}
    achardic = {chr(code): code + 2 for code in range(np.max(achars))}
    qchardic["<RARE>"] = 1
    achardic["<RARE>"] = 1

    print(wordids2string(qchars[0],
                         dict((idx, ch) for ch, idx in qchardic.items())))
    print(wordids2string(achars[0],
                         dict((idx, ch) for ch, idx in achardic.items())))
    return qchars, achars, qchardic, achardic
def searchwordmat(self, wordmat, wd, top=5):
    """Run ``self.searchsentence`` on every row of a word-id matrix.

    Each row of ``wordmat`` is decoded to a string with ``wordids2string``
    using the inverse of ``wd`` (word -> id), searched with
    ``searchsentence(..., top=top)``, and the ``"fb_id"`` field of every
    result is collected.  Progress is reported through a ``ticktock`` timer.

    Returns a list with one entry per row, each a list of ``fb_id`` values.
    """
    id2word = dict((idx, word) for word, idx in wd.items())
    timer = ticktock("wordmatsearcher")
    timer.tick("started searching")

    nrows = wordmat.shape[0]
    cans = []
    for row in range(nrows):
        sentence = wordids2string(wordmat[row], rwd=id2word)
        hits = self.searchsentence(sentence, top=top)
        cans.append([hit["fb_id"] for hit in hits])
        timer.progress(row, nrows, live=True)

    timer.tock("done searching")
    return cans
def xpp(i):
    """Print row ``i`` of ``qmat_x`` and of ``amat_x`` as decoded strings.

    Rows are decoded with ``wordids2string`` using ``rqdic`` / ``radic`` and
    ``0`` as the third argument.
    """
    question = wordids2string(qmat_x[i], rqdic, 0)
    print(question)
    answer = wordids2string(amat_x[i], radic, 0)
    print(answer)
def tpp(i):
    """Print row ``i`` of ``qmat_t`` and of ``amat_t`` as decoded strings.

    Rows are decoded with ``wordids2string`` using ``rqdic`` / ``radic`` and
    ``0`` as the third argument.
    """
    question = wordids2string(qmat_t[i], rqdic, 0)
    print(question)
    answer = wordids2string(amat_t[i], radic, 0)
    print(answer)
def pp(i):
    """Print row ``i`` of ``qmat_auto`` and of ``amat_auto`` as decoded strings.

    The id -> word mappings are obtained by inverting ``qdic_auto`` and
    ``adic``; ``0`` is passed as the third argument of ``wordids2string``.
    """
    qrow = qmat_auto[i]
    print(wordids2string(
        qrow, dict((idx, word) for word, idx in qdic_auto.items()), 0))
    arow = amat_auto[i]
    print(wordids2string(
        arow, dict((idx, word) for word, idx in adic.items()), 0))
def pp(i):
    """Print row ``i`` of ``newtqmat`` and of ``newtamat`` as decoded strings.

    Rows are decoded with ``wordids2string`` using ``rqdic`` / ``radic`` and
    ``0`` as the third argument.
    """
    question = wordids2string(newtqmat[i], rqdic, 0)
    print(question)
    answer = wordids2string(newtamat[i], radic, 0)
    print(answer)
def pp(x):
    """Print a (question, answer) pair given as whitespace-separated id strings.

    ``x[0]`` and ``x[1]`` are split on whitespace, every token is converted
    with ``int`` and the resulting id lists are decoded with
    ``wordids2string`` using ``rqdic`` / ``radic`` and ``maskid=0``.
    """
    question_ids = list(map(int, x[0].split()))
    print(wordids2string(question_ids, rqdic, maskid=0))
    answer_ids = list(map(int, x[1].split()))
    print(wordids2string(answer_ids, radic, maskid=0))
def pp(i):
    """Print row ``i`` of ``qmat`` and of ``amat`` as decoded strings.

    The id -> word mappings are obtained by inverting ``qdic`` and ``adic``.
    """
    qrow = qmat[i]
    print(wordids2string(
        qrow, dict((idx, word) for word, idx in qdic.items())))
    arow = amat[i]
    print(wordids2string(
        arow, dict((idx, word) for word, idx in adic.items())))
def _reverse_unmasked(row, maskid):
    """Return a copy of *row* with its non-mask ids reversed and packed first.

    The result has the same length as *row*; positions after the reversed
    ids are filled with *maskid*.
    """
    flipped = maskid * np.ones_like(row)
    kept = row[row != maskid][::-1]
    flipped[:kept.shape[0]] = kept
    return flipped


def preprocess(qmat, amat, qdic, adic, qwc, awc, maskid, qreversed=False,
               dorare=True):
    """Replace entity tokens by type placeholders in answers and questions.

    Steps, in order:
      1. Every ``"capital:c"`` id in ``amat`` is rewritten to ``"capital:t"``.
      2. Answer-dictionary keys ending in ``:c :s :r :m :n :lo :co`` (except
         ``"capital:c"``) are collected; for each, a ``"<suffix>-type"``
         entry is added to ``adic`` and ``qdic`` if missing.
      3. Each such id found in ``amat`` is replaced by its ``-type`` id.  The
         surface form (the part of the key before ``:``, split on ``_``) is
         then searched in the matching question row; the first hit is
         collapsed to the ``-type`` id, the rest of the row is shifted left
         and the tail is padded with ``maskid``.  ``"usa"`` is additionally
         tried under a few hard-coded aliases.
      4. The number of rows that have a ``-type`` token in the answer but
         none in the question is printed as ``"<count>/<rows>"``.
      5. If ``dorare`` is true, every question id whose word has a value
         below 2 in ``qwc`` is replaced by the ``"<RARE>"`` id.

    Parameters:
      qmat, amat -- 2-D id arrays (NumPy indexing is used throughout); both
                    are modified in place.
      qdic, adic -- word -> id dictionaries; ``-type`` entries are added in
                    place.
      qwc, awc   -- word -> count mappings; ``awc`` is returned untouched.
      maskid     -- padding id in ``qmat``.
      qreversed  -- if true, question rows are un-reversed before matching
                    and reversed again afterwards.
      dorare     -- enable step 5.

    Returns ``(qmat, amat, qdic, adic, qwc, awc)``.  With ``dorare`` the
    returned ``qmat`` is a new array produced by ``np.vectorize``.
    """
    # TODO: add positional replacement and change other functions accordingly
    amat[amat == adic["capital:c"]] = adic["capital:t"]

    replaceina = set()
    for key in adic:
        if (key[-2:] in (":c", ":s", ":r", ":m", ":n")
                or key[-3:] in (":lo", ":co")) and key != "capital:c":
            replaceina.add(key)

    for key in replaceina:
        typename = key.split(":")[1] + "-type"
        if typename not in adic:
            adic[typename] = max(adic.values()) + 1
        if typename not in qdic:
            qdic[typename] = max(qdic.values()) + 1

    radic = {v: k for k, v in adic.items()}
    rqdic = {v: k for k, v in qdic.items()}
    # Built once: neither adic nor replaceina changes inside the loops below.
    replaceids = {adic[key] for key in replaceina}

    width = qmat.shape[1]
    for i in range(qmat.shape[0]):
        for j in range(amat.shape[1]):
            if amat[i, j] not in replaceids:
                continue
            parts = radic[amat[i, j]].split(":")
            sf = parts[0].split("_")
            sft = parts[1]
            typekey = sft + "-type"
            amat[i, j] = adic[typekey]

            sfs = [sf]
            if sf == ["usa"]:
                sfs.append("united states".split())
                sfs.append("the country".split())
                sfs.append("the states".split())
                sfs.append(["us"])
                sfs.append(["america"])

            qmati = qmat[i]
            if qreversed:
                qmati = _reverse_unmasked(qmati, maskid)

            replaced = False
            for alias in sfs:
                # A match may fall one token short of the full alias, unless
                # the alias starts with "the".
                needed = len(alias) - (1 if alias[0] != "the" else 0)
                k = 0
                while k < width:
                    if qmati[k] != maskid and rqdic[qmati[k]] == alias[0]:
                        l = 0
                        # Same mask guard as the check above, so running into
                        # padding ends the match instead of looking the mask
                        # id up in rqdic.
                        while (l < len(alias) and k + l < width
                               and qmati[k + l] != maskid
                               and rqdic[qmati[k + l]] == alias[l]):
                            l += 1
                        if l >= needed:
                            qmati[k] = qdic[typekey]
                            # Shift the remainder left over the matched span;
                            # copy because source and target overlap.
                            qmati[k + 1:width - l + 1] = qmati[k + l:].copy()
                            qmati[width - l + 1:] = maskid
                            replaced = True
                            break
                    k += 1
                if replaced:
                    break

            if qreversed:
                qmati = _reverse_unmasked(qmati, maskid)
            qmat[i] = qmati

    # test: rows with a type token in the answer but none in the question
    wop = [i for i in range(qmat.shape[0])
           if "-type" in wordids2string(amat[i], radic)
           and "-type" not in wordids2string(qmat[i], rqdic)]
    print("{}/{}".format(len(wop), qmat.shape[0]))

    # rare words
    if dorare:
        rareids = {qdic[word] for word, count in qwc.items() if count < 2}
        qmat = np.vectorize(
            lambda x: qdic["<RARE>"] if x in rareids else x)(qmat)

    return qmat, amat, qdic, adic, qwc, awc