# NOTE(review): `make` is defined a second time further down in this file with
# the same logic; when both are present Python binds the later definition.
def make(srcSnt, tgtSnt, srcTree, tgtTree, wa, gsuba, base, outDir='/dev/shm'):
    """Write labelled feature lines for the candidate alignments of each sentence pair.

    For sentence pair ``i`` a ``Bead2`` is built from the two parse trees, the
    word-alignment matrix and the gold alignment list.  Every item of
    ``bead.otherSuba`` is written with label ``False`` (negative training
    example) and every item of ``bead.goldSuba`` with label ``True``
    (positive training example).

    Each output line is tab-separated::

        ID<sentID>--<suba>    <label>    <feature> <feature> ...

    Args:
        srcSnt: Source sentences, one string per sentence; only the token
            count (``len(s.split())``) is used here.
        tgtSnt: Target sentences, parallel to ``srcSnt``.
        srcTree: Per-sentence source trees, each passed to
            ``nltk.ParentedTree``.
        tgtTree: Per-sentence target trees, each passed to
            ``nltk.ParentedTree``.
        wa: Per-sentence word-alignment lines, each passed to
            ``oneline2waMatrix``.
        gsuba: Per-sentence gold alignment lines, each passed to
            ``oneline2subaList``.
        base: ID of the first sentence; sentence ``i`` gets ID ``base + i``.
            Also used as the suffix of the output file name.
        outDir: Directory that receives ``subaFeatEx.<base>``.  Defaults to
            ``/dev/shm``, the location that used to be hard-coded.
    """

    def writeExamples(out, bead, subas, sentID, label):
        # One line per candidate; the trailing '\t' before the joined features
        # is kept even when the feature list is empty.
        for suba in subas:
            feats = features(bead, suba)
            exampleID = str(sentID) + '--' + suba.__str__()
            out.write('ID' + exampleID + '\t' + str(label) + '\t' + '\t'.join(feats) + '\n')

    outPath = outDir + '/subaFeatEx.' + str(base)
    # `with` guarantees the file is flushed and closed even when building a
    # bead or extracting features raises part-way through the corpus.
    with codecs.open(outPath, 'w', 'utf-8') as out:
        for i, srcLine in enumerate(srcSnt):
            if i % 1000 == 0:
                # progress indicator on stderr, all on one line
                print >> sys.stderr, i,
            sentID = base + i
            bead = Bead2(nltk.ParentedTree(srcTree[i]),
                         nltk.ParentedTree(tgtTree[i]),
                         oneline2waMatrix(wa[i], len(srcLine.split()), len(tgtSnt[i].split())),
                         oneline2subaList(gsuba[i]))
            writeExamples(out, bead, bead.otherSuba, sentID, False)  # negative training examples
            writeExamples(out, bead, bead.goldSuba, sentID, True)    # positive training examples
def _readUtf8Lines(path):
    """Return all lines of the UTF-8 text file `path`, closing the file afterwards."""
    with codecs.open(path, 'r', 'utf-8') as handle:
        return handle.readlines()


def extractRules(chF, enF, subaF, waF):
    """Extract translation rules from four line-parallel UTF-8 files.

    Args:
        chF: Path to the source-side sentences, one whitespace-tokenised
            sentence per line.
        enF: Path to the target-side sentences, same layout.
        subaF: Path to the alignment "squares": each line holds
            whitespace-separated items, each item being '-'-separated
            integers.  Positions 0/1 are used as a source range and 2/3 as a
            target range (``range(sq[0], sq[1])``, ``range(sq[2], sq[3])``).
        waF: Path to the word alignments: each line holds whitespace-separated
            ``k-j`` integer pairs.

    Returns:
        A list with, for every sentence in order:
          1. the truthy results of ``_extract_`` for each key of
             ``_level_(squares)`` (rules with non-terminal Xs),
          2. one ``Rule`` per square that passes ``_isLegalRule_`` (rules
             without non-terminal Xs),
          3. one ``Rule`` per word pair ``(k, j)`` that is set in the
             alignment matrix, is the only link in row ``k`` and column ``j``,
             and passes ``_isLegalRule_``.

    Raises:
        AssertionError: if the four files do not have the same number of lines.
    """
    chSentList = [line.split() for line in _readUtf8Lines(chF)]
    enSentList = [line.split() for line in _readUtf8Lines(enF)]
    subaList = [[[int(d) for d in item.split('-')] for item in line.split()]
                for line in _readUtf8Lines(subaF)]
    waList = _readUtf8Lines(waF)

    assert len(chSentList) == len(enSentList) == len(subaList) == len(waList), \
        "len(chSentList) == %d, len(enSentList) == %d, len(subaList) == %d, len(waList) == %d" % (
            len(chSentList), len(enSentList), len(subaList), len(waList))

    ruleList = []
    for i, squares in enumerate(subaList):
        chSent, enSent = chSentList[i], enSentList[i]

        # rules with non-terminal Xs
        subaDic = _level_(squares)
        waMatrix = oneline2waMatrix(waList[i], len(chSent), len(enSent))
        for bigSquare in subaDic:
            rule = _extract_(bigSquare, subaDic[bigSquare], chSent, enSent, waMatrix)
            if rule:
                ruleList.append(rule)

        # rules without non-terminal Xs
        for square in squares:
            lhsSrc, lhsTgt = 'X', 'X'
            rhsSrc = range(square[0], square[1])
            rhsTgt = range(square[2], square[3])
            align = []
            if _isLegalRule_(rhsSrc, rhsTgt, chSent, enSent, "complete"):
                ruleList.append(Rule(lhsSrc, lhsTgt, rhsSrc, rhsTgt, align,
                                     waMatrix, chSent, enSent, square))

        # rules that are word alignments (i.e. word pairs) but not
        # corresponding subtree alignments
        # NOTE(review): nothing in this loop checks the pair against `squares`;
        # confirm whether duplicates of the rules above are filtered elsewhere.
        lhsSrc, lhsTgt = 'X', 'X'
        align = []  # alignment of Xs, not word alignment, so it stays empty
        for item in waList[i].split():
            parts = item.split('-')
            k, j = int(parts[0]), int(parts[1])
            if not waMatrix[k][j]:
                continue
            # keep only one-to-one links: a single link in row k and in column j
            if sum(waMatrix[k]) != 1 or sum(row[j] for row in waMatrix) != 1:
                continue
            rhsSrc, rhsTgt = [k], [j]
            if _isLegalRule_(rhsSrc, rhsTgt, chSent, enSent, "complete"):
                ruleList.append(Rule(lhsSrc, lhsTgt, rhsSrc, rhsTgt, align,
                                     waMatrix, chSent, enSent, (k, k + 1, j, j + 1)))
    return ruleList
# NOTE(review): an earlier definition of `make` with the same logic appears
# above in this file; this later definition is the one Python binds.
def make(srcSnt, tgtSnt, srcTree, tgtTree, wa, gsuba, base, outDir='/dev/shm'):
    """Write labelled feature lines for the candidate alignments of each sentence pair.

    For sentence pair ``i`` a ``Bead2`` is built from the two parse trees, the
    word-alignment matrix and the gold alignment list.  Every item of
    ``bead.otherSuba`` is written with label ``False`` (negative training
    example) and every item of ``bead.goldSuba`` with label ``True``
    (positive training example).

    Each output line is tab-separated::

        ID<sentID>--<suba>    <label>    <feature> <feature> ...

    Args:
        srcSnt: Source sentences, one string per sentence; only the token
            count (``len(s.split())``) is used here.
        tgtSnt: Target sentences, parallel to ``srcSnt``.
        srcTree: Per-sentence source trees, each passed to
            ``nltk.ParentedTree``.
        tgtTree: Per-sentence target trees, each passed to
            ``nltk.ParentedTree``.
        wa: Per-sentence word-alignment lines, each passed to
            ``oneline2waMatrix``.
        gsuba: Per-sentence gold alignment lines, each passed to
            ``oneline2subaList``.
        base: ID of the first sentence; sentence ``i`` gets ID ``base + i``.
            Also used as the suffix of the output file name.
        outDir: Directory that receives ``subaFeatEx.<base>``.  Defaults to
            ``/dev/shm``, the location that used to be hard-coded.
    """

    def writeExamples(out, bead, subas, sentID, label):
        # One line per candidate; the trailing '\t' before the joined features
        # is kept even when the feature list is empty.
        for suba in subas:
            feats = features(bead, suba)
            exampleID = str(sentID) + '--' + suba.__str__()
            out.write('ID' + exampleID + '\t' + str(label) + '\t' + '\t'.join(feats) + '\n')

    outPath = outDir + '/subaFeatEx.' + str(base)
    # `with` guarantees the file is flushed and closed even when building a
    # bead or extracting features raises part-way through the corpus.
    with codecs.open(outPath, 'w', 'utf-8') as out:
        for i, srcLine in enumerate(srcSnt):
            if i % 1000 == 0:
                # progress indicator on stderr, all on one line
                print >> sys.stderr, i,
            sentID = base + i
            bead = Bead2(nltk.ParentedTree(srcTree[i]),
                         nltk.ParentedTree(tgtTree[i]),
                         oneline2waMatrix(wa[i], len(srcLine.split()), len(tgtSnt[i].split())),
                         oneline2subaList(gsuba[i]))
            writeExamples(out, bead, bead.otherSuba, sentID, False)  # negative training examples
            writeExamples(out, bead, bead.goldSuba, sentID, True)    # positive training examples