def evalsemrel(l1, l2, pairsfile, l1colnum, l2colnum, l2gpmffile,
               l1l2methpmffile):
    '''Evaluate semantic relatedness.
    l1 and l2 = languages s and t in p(t|s)
    pairsfile = translation pairs in l1 and l2; l1 words in column
                <l1colnum>, l2 words in column <l2colnum>
    l2gpmffile = gold pmf over l2 words, for l2 words (including those
                 in pairsfile)
    l1l2methpmffile = method-induced pmf over l2 words, for l1 words
                      (including those in pairsfile)
    '''
    l2gpmf = L1L2PMF(l2, l2, l2gpmffile)
    l1l2pmf = L1L2PMF(l1, l2, l1l2methpmffile)
    # print "#JSDiv Spearmanr"
    jsds, rhos = [], []
    for line in open(pairsfile):
        line = line.decode('utf-8').rstrip()
        pair = line.split()
        w1, w2 = pair[l1colnum - 1], pair[l2colnum - 1]
        gpmf = l2gpmf.pmf[w2]   # gold pmf
        mpmf = l1l2pmf.pmf[w1]  # method pmf
        # align the two pmfs over the union of their supports
        vecs = [(gpmf[x2], mpmf[x2]) for x2 in gpmf if x2 in mpmf]
        vecs.extend([(gpmf[x2], 0.0) for x2 in gpmf if x2 not in mpmf])
        vecs.extend([(0.0, mpmf[x2]) for x2 in mpmf if x2 not in gpmf])
        gvec, mvec = zip(*vecs)
        jsd = MyUtils.jsd(gvec, mvec, base=2)
        rho, pval = mstats.spearmanr(gvec, mvec, use_ties=True)
        jsds.append(jsd)
        rhos.append(rho)
        print "%f\t%f\t%f" % (jsd, rho, pval)
    print "\t\t\t%f\t%f" % (sum(jsds) / len(jsds), sum(rhos) / len(rhos))

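# A minimal self-contained sketch of the pmf-alignment step in evalsemrel:
# the gold and method pmfs are aligned over the union of their supports
# (missing mass = 0.0) before computing Jensen-Shannon divergence.  The
# local _jsd below is an illustrative stand-in for MyUtils.jsd, not its
# actual implementation.
def _jsd(p, q, base=2):
    from math import log
    m = [(pi + qi) / 2.0 for pi, qi in zip(p, q)]
    def _kl(a, b):
        return sum(ai * log(ai / bi, base) for ai, bi in zip(a, b) if ai > 0)
    return 0.5 * _kl(p, m) + 0.5 * _kl(q, m)

gpmf = {u'casa': 0.7, u'hogar': 0.3}      # toy gold pmf
mpmf = {u'casa': 0.5, u'domicilio': 0.5}  # toy method pmf
vecs = [(gpmf[x], mpmf[x]) for x in gpmf if x in mpmf]
vecs += [(gpmf[x], 0.0) for x in gpmf if x not in mpmf]
vecs += [(0.0, mpmf[x]) for x in mpmf if x not in gpmf]
gvec, mvec = zip(*vecs)
print _jsd(gvec, mvec)  # 0 iff the two pmfs agree exactly
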
def __init__(self, sl, tl, al, at_pmffile, sa_pmffile, at_tvocfile,
             as_avocfile):
    '''Input: source lang, target lang, auxiliary lang, p(t|a), p(a|s),
    |Vat_t|, |Vas_a|.
    '''
    self.sl, self.tl, self.al = sl, tl, al
    self.PtGa = L1L2PMF(al, tl, at_pmffile)  # p(t|a)
    self.PaGs = L1L2PMF(sl, al, sa_pmffile)  # p(a|s)
    # vocab of target in at_pmf, and its uniform back-off 1/|Vat_t|
    self.Vat_t = set(open(at_tvocfile).read().decode('utf-8').split())
    self.OneByVat_t = 1 / float(len(self.Vat_t))
    # vocab of auxiliary in as_pmf, and its uniform back-off 1/|Vas_a|
    self.Vas_a = set(open(as_avocfile).read().decode('utf-8').split())
    self.OneByVas_a = 1 / float(len(self.Vas_a))

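# Paux.p_a itself does not appear in this section; the following is a
# hedged sketch of the auxiliary-language bridge that the fields above
# support: p(t|s) = sum_a p(a|s) * p(t|a), backing off to a uniform term
# when a pmf entry is missing.  Treat it as an illustration, not the
# actual implementation.
def bridge_prob(PaGs, PtGa, OneByVat_t, s, t):
    total = 0.0
    for a, p_a_given_s in PaGs.get(s, {}).iteritems():
        total += p_a_given_s * PtGa.get(a, {}).get(t, OneByVat_t)
    return total

PaGs = {u's1': {u'a1': 0.6, u'a2': 0.4}}            # toy p(a|s)
PtGa = {u'a1': {u't1': 0.9}, u'a2': {u't1': 0.2}}   # toy p(t|a)
print bridge_prob(PaGs, PtGa, 1e-4, u's1', u't1')   # 0.6*0.9 + 0.4*0.2 = 0.62
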
def getpmmc(sl, tl, slvecfile, tlvocfile, st_pmffiles, K,
            tr_ts_cand_par_filelist, truncprob):
    mmc = MixModelClus(sl, tl, slvecfile, st_pmffiles.split(','))
    tlvoc = open(tlvocfile).read().decode('utf-8').split()
    for line in open(tr_ts_cand_par_filelist):
        line = line.rstrip()
        tsfile, colnum, trfile, l1, l2, candfile, paramfile = line.split()
        # trfile = (s,t) pairs that belong to the vocabularies of the st, at,
        # and as corpora (so that the impact of bringing in `a' can be
        # measured accurately)
        # tsfile = should not contain words present in trfile
        colnum = int(colnum)  # column in tsfile from which to read test words
        mmc.settrdata(trfile, l1, l2)
        mmc.train(K)
        mmc.save(paramfile)
        f = gzip.open(candfile, 'wb')
        for s in [line.decode('utf-8').strip().split()[colnum - 1]
                  for line in open(tsfile).readlines()]:
            # start = time.clock()
            cands = [(t, mmc.pmm(t, s)) for t in tlvoc]
            cands.sort(key=lambda x: x[1], reverse=True)
            cands = L1L2PMF.truncate_pmf(cands, truncprob, renormalize=True)
            line = s + u'\t' + u'\t'.join(
                [t + u' ' + unicode(pr) for t, pr in cands])
            print >> f, line.encode('utf-8')
            # print >> sys.stderr, "%d secs, %d words" % (time.clock() - start, len(cands))
        f.close()

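# L1L2PMF.truncate_pmf is project-internal; this is a hedged sketch of its
# apparent semantics given how it is called here: keep top candidates until
# their cumulative mass reaches truncprob, then optionally renormalize.
# Illustrative only, not the actual implementation.
def truncate_pmf_sketch(cands, truncprob, renormalize=True):
    kept, cum = [], 0.0
    for w, pr in cands:  # cands assumed sorted by pr, descending
        kept.append((w, pr))
        cum += pr
        if cum >= truncprob:
            break
    if renormalize:
        tot = sum(pr for w, pr in kept)
        kept = [(w, pr / tot) for w, pr in kept]
    return kept

print truncate_pmf_sketch([(u'a', 0.5), (u'b', 0.3), (u'c', 0.2)], 0.75)
# -> [(u'a', 0.625), (u'b', 0.375)]
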
def getpmm4eps(sl, tl, tlvocfile, st_pmffiles, epsfile,
               tr_ts_cand_lat_filelist, tr_or_ts, truncprob):
    # mm = MixModel(sl, tl, [st_pmffile, sta_pmffile])
    # st_pmffiles = comma-separated list of pmf files, starting with the
    # base dist, followed by the aux lang dists
    mm = MixModel(sl, tl, st_pmffiles.split(','))
    mm.readeps(epsfile)
    tlvoc = open(tlvocfile).read().decode('utf-8').split()
    for line in open(tr_ts_cand_lat_filelist):
        line = line.rstrip()
        tsfile, colnum, trfile, l1, l2, candfile, epsfile1 = line.split()
        # epsfile1 is parsed but unused here; eps is read once, above
        # trfile = (s,t) pairs that belong to the vocabularies of the st, at,
        # and as corpora (so that the impact of bringing in `a' can be
        # measured accurately)
        # tsfile = should not contain words present in trfile
        colnum = int(colnum)
        f = gzip.open(candfile, 'wb')
        qfile = tsfile if tr_or_ts == "TEST" else trfile
        for s in [line.decode('utf-8').strip().split()[colnum - 1]
                  for line in open(qfile).readlines()]:
            cands = [(t, mm.pmm(t, s)) for t in tlvoc]
            cands.sort(key=lambda x: x[1], reverse=True)
            cands = L1L2PMF.truncate_pmf(cands, truncprob, renormalize=True)
            line = s + u'\t' + u'\t'.join(
                [t + u' ' + unicode(pr) for t, pr in cands])
            print >> f, line.encode('utf-8')
        f.close()

def getpmmfine(sl, tl, tlvocfile, st_pmffile, sta_pmffile,
               tr_ts_cand_lat_filelist, st_ptmfile, truncprob):
    stptm = PTM.load(st_ptmfile)
    scd = SourceCatDist()
    scd.set_from_PTM(stptm, sl)
    mmf = MixModelFine(sl, tl, [st_pmffile, sta_pmffile], scd)
    tlvoc = open(tlvocfile).read().decode('utf-8').split()
    for line in open(tr_ts_cand_lat_filelist):
        line = line.rstrip()
        tsfile, colnum, trfile, l1, l2, candfile, latentfile = line.split()
        # trfile = (s,t) pairs that belong to the vocabularies of the st, at,
        # and as corpora (so that the impact of bringing in `a' can be
        # measured accurately)
        # tsfile = should not contain words present in trfile
        colnum = int(colnum)
        mmf.settrdata(trfile, l1, l2)
        mmf.learnEM()
        mmf.savelatent(latentfile)
        f = gzip.open(candfile, 'wb')
        for s in [line.decode('utf-8').strip().split()[colnum - 1]
                  for line in open(tsfile).readlines()]:
            # start = time.clock()
            cands = [(t, mmf.pmm(t, s)) for t in tlvoc]
            cands.sort(key=lambda x: x[1], reverse=True)
            cands = L1L2PMF.truncate_pmf(cands, truncprob, renormalize=True)
            line = s + u'\t' + u'\t'.join(
                [t + u' ' + unicode(pr) for t, pr in cands])
            print >> f, line.encode('utf-8')
            # print >> sys.stderr, "%d secs, %d words" % (time.clock() - start, len(cands))
        f.close()

def savecands(l1, qfile, l2, meth, truncprob, candfile):
    f = gzip.open(candfile, 'wb')
    for w1 in open(qfile).read().decode('utf-8').split():
        cands = meth.get_similar(l1, w1, l2)
        # convert scores to a pmf
        tot = sum([sc for w, sc in cands])
        cands = [(w, sc / tot) for w, sc in cands]
        # truncate and renormalize the pmf
        cands = L1L2PMF.truncate_pmf(cands, truncprob, renormalize=True)
        line = w1 + '\t' + '\t'.join([w2 + ' ' + str(sc) for w2, sc in cands])
        print >> f, line.encode('utf-8')
    f.close()

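# All the candidate writers in this file emit the same gzipped, UTF-8,
# tab-separated format:
#   <source word> \t <cand1> <prob1> \t <cand2> <prob2> ...
# A hedged reader sketch for that format (illustrative; it simply inverts
# the layout produced above):
import gzip

def read_cands(candfile):
    pmf = {}
    for line in gzip.open(candfile):
        fields = line.decode('utf-8').rstrip().split(u'\t')
        s = fields[0]
        pmf[s] = dict((t, float(pr)) for t, pr in
                      (fld.split(u' ') for fld in fields[1:]))
    return pmf
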
def SRptm(l1, l2, l1l2pairsfile, l1l2candfile, l2l1candfile):
    '''
    Input:  p(t|s), p(s|t), {(s,t)} pairs from the human annotation task
    Output: {(s,t,score)} for all the pairs
    Algo:   For each (s,t) pair, get the distributions p(t'|s) and p(s'|t);
            l-inf normalize them to get r(t'|s) and r(s'|t); the score is
            max(r(t|s), r(s|t)).  (The l-inf normalization is currently
            commented out below, so the raw probabilities are used.)
    '''
    l1l2, l2l1 = L1L2PMF(l1, l2, l1l2candfile), L1L2PMF(l2, l1, l2l1candfile)
    for line in open(l1l2pairsfile):
        line = line.decode('utf-8').rstrip()
        w1, w2 = line.split()
        if w2 in l1l2.pmf[w1]:
            # rl1l2 = l1l2.pmf[w1][w2] / max(l1l2.pmf[w1].itervalues())
            rl1l2 = l1l2.pmf[w1][w2]
        else:
            rl1l2 = 0
        if w1 in l2l1.pmf[w2]:
            # rl2l1 = l2l1.pmf[w2][w1] / max(l2l1.pmf[w2].itervalues())
            rl2l1 = l2l1.pmf[w2][w1]
        else:
            rl2l1 = 0
        score = max(rl1l2, rl2l1)
        print (w1 + '\t' + w2 + '\t' + unicode(score)).encode('utf-8')

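# The commented-out normalization in SRptm is l-inf (max) normalization:
# divide every probability in the distribution by its largest entry, so the
# best candidate gets score 1.0.  A tiny illustration:
pmf = {u't1': 0.5, u't2': 0.25, u't3': 0.25}
r = dict((t, p / max(pmf.values())) for t, p in pmf.items())
# r: t1 -> 1.0, t2 -> 0.5, t3 -> 0.5
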
def getpaux(sl, qfile, tl, tlvocfile, al, at_pmffile, sa_pmffile,
            at_tvocfile, as_avocfile, truncprob, candfile):
    pa = Paux(sl, tl, al, at_pmffile, sa_pmffile, at_tvocfile, as_avocfile)
    tlvoc = open(tlvocfile).read().decode('utf-8').split()
    f = gzip.open(candfile, 'wb')
    for s in open(qfile).read().decode('utf-8').split():
        if s in pa.PaGs.pmf:  # ignore source words without data
            # start = time.clock()
            cands = [(t, pa.p_a(t, s)) for t in tlvoc]
            cands.sort(key=lambda x: x[1], reverse=True)
            cands = L1L2PMF.truncate_pmf(cands, truncprob, renormalize=True)
            line = s + '\t' + '\t'.join([t + ' ' + str(pr) for t, pr in cands])
            print >> f, line.encode('utf-8')
            # print >> sys.stderr, "%d secs, %d words" % (time.clock() - start, len(cands))
    f.close()

def getcands4meth(qfile, colnum, candfile, truncprob, getrelmeth):
    f = gzip.open(candfile, 'wb')
    for line in open(qfile):
        line = line.decode('utf-8').rstrip()
        w1 = line.split()[colnum - 1]
        cands = getrelmeth(w1)
        # convert scores to a pmf: shift by the minimum so all scores are
        # non-negative (the lowest-scoring word gets probability 0), then
        # normalize by the total
        minsc = min([sc for w, sc in cands])
        cands = [(w, sc - minsc) for w, sc in cands]
        tot = sum([sc for w, sc in cands])
        cands = [(w, sc / tot) for w, sc in cands]
        # truncate and renormalize the pmf
        cands = L1L2PMF.truncate_pmf(cands, truncprob, renormalize=True)
        line = w1 + u'\t' + u'\t'.join(
            [w + u' ' + unicode(sc) for w, sc in cands])
        print >> f, line.encode('utf-8')
    f.close()

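# Worked example of the min-shift conversion above (note the lowest-scoring
# candidate ends up with probability 0, so it is effectively dropped):
cands = [(u'x', 2.0), (u'y', -1.0), (u'z', 1.0)]
minsc = min(sc for w, sc in cands)              # -1.0
shifted = [(w, sc - minsc) for w, sc in cands]  # x: 3.0, y: 0.0, z: 2.0
tot = sum(sc for w, sc in shifted)              # 5.0
print [(w, sc / tot) for w, sc in shifted]      # x: 0.6, y: 0.0, z: 0.4
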
def __init__(self, sl, tl, st_pmffiles):
    '''Input: source lang, target lang, p(t|s) pmf file names (one per
    expert).'''
    self.sl, self.tl = sl, tl
    self.beta = []     # p(t|s) pmfs for the different experts
    self.tvocs = []    # target vocabularies of the different experts
    self.OneByVt = []  # 1/|V_t| for the different experts
    for fil in st_pmffiles:
        self.beta.append(L1L2PMF(sl, tl, fil).pmf)
        self.tvocs.append(
            set([t for s in self.beta[-1].iterkeys()
                 for t in self.beta[-1][s].iterkeys()]))
        self.OneByVt.append(1.0 / len(self.tvocs[-1]))
    self.E = len(self.beta)  # number of experts
    self.eps = [1.0 / self.E for i in range(self.E)]  # default: uniform expert mixture

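# MixModel.pmm is called throughout this file but not shown in this
# section; the following is a hedged sketch of a mixture-of-experts
# translation probability consistent with the fields initialized above
# (each expert backs off to its uniform 1/|V_t| for pairs missing from its
# pmf).  Illustrative only, not the actual implementation.
def pmm_sketch(beta, OneByVt, eps, t, s):
    total = 0.0
    for e in range(len(beta)):
        p_e = beta[e].get(s, {}).get(t, OneByVt[e])
        total += eps[e] * p_e
    return total

beta = [{u's1': {u't1': 0.8}},                 # toy expert 1: p(t|s)
        {u's1': {u't1': 0.4, u't2': 0.6}}]     # toy expert 2: p(t|s)
print pmm_sketch(beta, [0.5, 0.5], [0.7, 0.3], u't1', u's1')
# -> 0.7*0.8 + 0.3*0.4 = 0.68
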
def __init__(self, sl, tl, slvecfile, st_pmffiles):
    '''Input: source lang, target lang, source word feature vectors, and
    p(t|s) pmf file names (one per expert).  The number of clusters K is
    supplied later, via train(K).'''
    self.sl, self.tl = sl, tl
    self.slfeat = WordFeat.loadfeat(slvecfile)
    self.slfeat = self.scalefeat(self.slfeat)
    self.beta = []     # p(t|s) pmfs for the different experts
    self.tvocs = []    # target vocabularies of the different experts
    self.OneByVt = []  # 1/|V_t| for the different experts
    for fil in st_pmffiles:
        self.beta.append(L1L2PMF(sl, tl, fil).pmf)
        self.tvocs.append(
            set([t for s in self.beta[-1].iterkeys()
                 for t in self.beta[-1][s].iterkeys()]))
        self.OneByVt.append(1.0 / len(self.tvocs[-1]))
    self.E = len(self.beta)  # number of experts

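# MixModelClus.scalefeat is not shown in this section; a common choice, and
# a hedged guess at its intent, is per-dimension min-max scaling of the
# word feature vectors so that clustering is not dominated by any single
# feature.  Sketch only, not the actual implementation.
def scalefeat_sketch(feat):
    dims = len(next(iter(feat.values())))
    lo = [min(v[d] for v in feat.values()) for d in range(dims)]
    hi = [max(v[d] for v in feat.values()) for d in range(dims)]
    return dict((w, [(v[d] - lo[d]) / (hi[d] - lo[d]) if hi[d] > lo[d] else 0.0
                     for d in range(dims)])
                for w, v in feat.items())
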
def __init__(self, sl, tl, st_pmffiles, scd):
    '''Input: source lang, target lang, p(t|s) pmf file names (one per
    expert), source categ dist object'''
    self.sl, self.tl = sl, tl
    self.beta = []     # p(t|s) pmfs for the different experts
    self.tvocs = []    # target vocabularies of the different experts
    self.OneByVt = []  # 1/|V_t| for the different experts
    for fil in st_pmffiles:
        self.beta.append(L1L2PMF(sl, tl, fil).pmf)
        self.tvocs.append(
            set([t for s in self.beta[-1].iterkeys()
                 for t in self.beta[-1][s].iterkeys()]))
        self.OneByVt.append(1.0 / len(self.tvocs[-1]))
    self.E = len(self.beta)  # number of experts
    self.scd = scd
    self.T = len(scd.phi)  # number of categories

def getpmm(sl, tl, tlvocfile, st_pmffiles, tr_ts_cand_lat_filelist,
           truncprob, learnmeth):
    # mm = MixModel(sl, tl, [st_pmffile, sta_pmffile])
    # st_pmffiles = comma-separated list of pmf files, starting with the
    # base dist, followed by the aux lang dists
    mm = MixModel(sl, tl, st_pmffiles.split(','))
    tlvoc = open(tlvocfile).read().decode('utf-8').split()
    for line in open(tr_ts_cand_lat_filelist):
        line = line.rstrip()
        tsfile, colnum, trfile, l1, l2, candfile, epsfile = line.split()
        # trfile = (s,t) pairs that belong to the vocabularies of the st, at,
        # and as corpora (so that the impact of bringing in `a' can be
        # measured accurately)
        # tsfile = should not contain words present in trfile
        colnum = int(colnum)
        mm.settrdata(trfile, l1, l2)
        if learnmeth == "EM":
            mm.learnepsEM()
        else:  # "GRID", or anything else: grid search is the default
            mm.learnepsgrid(tlvoc)
        mm.saveeps(epsfile)
        f = gzip.open(candfile, 'wb')
        for s in [line.decode('utf-8').strip().split()[colnum - 1]
                  for line in open(tsfile).readlines()]:
            # start = time.clock()
            cands = [(t, mm.pmm(t, s)) for t in tlvoc]
            cands.sort(key=lambda x: x[1], reverse=True)
            cands = L1L2PMF.truncate_pmf(cands, truncprob, renormalize=True)
            line = s + u'\t' + u'\t'.join(
                [t + u' ' + unicode(pr) for t, pr in cands])
            print >> f, line.encode('utf-8')
            # print >> sys.stderr, "%d secs, %d words" % (time.clock() - start, len(cands))
        f.close()

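# mm.learnepsgrid is project-internal; this is a hedged sketch of what a
# grid search over the mixture weight might look like for two experts,
# scoring each candidate eps by training-pair log-likelihood.  The names,
# the pmm_fn signature, and the scoring are assumptions, not the actual
# implementation.
import math

def learnepsgrid_sketch(pmm_fn, trpairs, step=0.1):
    best_eps, best_ll = None, float('-inf')
    e1 = 0.0
    while e1 <= 1.0 + 1e-9:
        eps = [e1, 1.0 - e1]
        ll = sum(math.log(max(pmm_fn(t, s, eps), 1e-300))
                 for s, t in trpairs)
        if ll > best_ll:
            best_eps, best_ll = eps, ll
        e1 += step
    return best_eps
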