def make(srcSnt, tgtSnt, srcTree, tgtTree, wa, gsuba, base):
    """Write labelled feature lines for every sentence pair to a file.

    Output goes to ``/dev/shm/subaFeatEx.<base>`` (UTF-8).  For sentence
    ``i`` a ``Bead2`` is built, then one line is written for each item of
    ``bead.otherSuba`` (label ``False``, the negative examples) followed by
    one line for each item of ``bead.goldSuba`` (label ``True``, the
    positive examples).  Each line is tab-separated:

        ID<sentID>--<suba> <TAB> <label> <TAB> <feature> <TAB> <feature> ...

    Args:
        srcSnt, tgtSnt: per-sentence strings; only their whitespace token
            counts are used here (as dimensions for ``oneline2waMatrix``).
        srcTree, tgtTree: per-sentence inputs handed to
            ``nltk.ParentedTree``.
        wa: per-sentence alignment lines handed to ``oneline2waMatrix``.
        gsuba: per-sentence lines handed to ``oneline2subaList``.
        base: suffix of the output file name and ID of the first sentence;
            sentence ``i`` is written with ID ``base + i``.

    Returns:
        None.  The result is the file written on disk.
    """
    outPath = '/dev/shm/subaFeatEx.' + str(base)

    # ``with`` guarantees the file is closed even when building a bead or
    # computing features raises part-way through the corpus.
    with codecs.open(outPath, 'w', 'utf-8') as f:

        def writeExamples(bead, sentID, subas, label):
            # One output line per candidate; features() is expected to
            # yield strings because they are joined with tabs.
            for suba in subas:
                exampleID = str(sentID) + '--' + suba.__str__()
                f.write('ID' + exampleID + '\t' + str(label) + '\t' +
                        '\t'.join(features(bead, suba)) + '\n')

        for i, srcLine in enumerate(srcSnt):
            # progress indicator on stderr (Python 2 print statement)
            if i % 1000 == 0: print >> sys.stderr, i,
            srcParented = nltk.ParentedTree(srcTree[i])
            tgtParented = nltk.ParentedTree(tgtTree[i])
            waMatrix = oneline2waMatrix(wa[i], len(srcLine.split()),
                                        len(tgtSnt[i].split()))
            bead = Bead2(srcParented, tgtParented, waMatrix,
                         oneline2subaList(gsuba[i]))

            sentID = base + i
            writeExamples(bead, sentID, bead.otherSuba, False)  # negatives
            writeExamples(bead, sentID, bead.goldSuba, True)    # positives
Exemple #2
0
def extractRules(chF, enF, subaF, waF):
	"""Extract rules from sentence pairs, subtree alignments and word alignments.

	All four files are read as UTF-8 and must have the same number of
	lines; line i of each file describes sentence pair i.

	For every sentence pair, rules are collected in this order:
	  1. rules with non-terminal Xs: one ``_extract_`` call per key of
	     ``_level_(squares)``, kept when the result is truthy;
	  2. rules without non-terminal Xs: one ``Rule`` per subtree-alignment
	     square accepted by ``_isLegalRule_``;
	  3. word-pair rules: one ``Rule`` per word-alignment link (k, j) that
	     is the only link in row k and in column j of the alignment
	     matrix and is accepted by ``_isLegalRule_``.

	Args:
		chF: path of the source-side sentences, whitespace-tokenised.
		enF: path of the target-side sentences, whitespace-tokenised.
		subaF: path of the subtree alignments; each line holds
			whitespace-separated items of '-'-separated integers, whose
			first four values are used as source start, source end,
			target start and target end (ends exclusive, they feed range()).
		waF: path of the word alignments; each line holds
			whitespace-separated "k-j" items.

	Returns:
		The list of collected rules, sentence by sentence.

	Raises:
		AssertionError: if the four files differ in line count.
	"""
	def readLines(path):
		# Close the handle deterministically instead of leaving it to
		# the garbage collector.
		with codecs.open(path, 'r', 'utf-8') as fh:
			return fh.readlines()

	chSentList = [line.split() for line in readLines(chF)]
	enSentList = [line.split() for line in readLines(enF)]
	subaList = [[[int(d) for d in item.split('-')] for item in line.split()]
			for line in readLines(subaF)]
	waList = readLines(waF)

	assert len(chSentList) == len(enSentList) == len(subaList) == len(waList), \
			"len(chSentList) == %d, len(enSentList) == %d, len(subaList) == %d, len(waList) == %d" % (len(chSentList), len(enSentList), len(subaList), len(waList))

	ruleList = []

	for i, squares in enumerate(subaList):
		chSent, enSent, waLine = chSentList[i], enSentList[i], waList[i]

		# rules with non-terminal Xs
		subaDic = _level_(squares)
		waMatrix = oneline2waMatrix(waLine, len(chSent), len(enSent))
		for bigSquare in subaDic:
			rule = _extract_(bigSquare, subaDic[bigSquare], chSent, enSent, waMatrix)
			if rule: ruleList.append(rule)

		# rules without non-terminal Xs
		for square in squares:
			rhsSrc = range(square[0], square[1])
			rhsTgt = range(square[2], square[3])
			if _isLegalRule_(rhsSrc, rhsTgt, chSent, enSent, "complete"):
				ruleList.append(Rule('X', 'X', rhsSrc, rhsTgt, [], waMatrix, chSent, enSent, square))

		# rules that are word alignments (i.e. word pairs) but not
		# corresponding subtree alignments
		for item in waLine.split():
			parts = item.split('-')
			k = int(parts[0])
			j = int(parts[1])
			if not waMatrix[k][j]:
				continue
			# keep only one-to-one links: k and j are aligned to nothing else
			if sum(waMatrix[k]) == 1 and sum(row[j] for row in waMatrix) == 1:
				rhsSrc, rhsTgt = [k], [j]
				if _isLegalRule_(rhsSrc, rhsTgt, chSent, enSent, "complete"):
					# The X-alignment list stays empty (it is for the alignment
					# of Xs, not word alignment); each rule gets its own list
					# so rules never share one mutable object.
					ruleList.append(Rule('X', 'X', rhsSrc, rhsTgt, [], waMatrix, chSent, enSent, (k, k + 1, j, j + 1)))
	return ruleList
def make(srcSnt, tgtSnt, srcTree, tgtTree, wa, gsuba, base):
	"""Write labelled feature lines for every sentence pair to a file.

	Output goes to ``/dev/shm/subaFeatEx.<base>`` (UTF-8).  For sentence
	``i`` a ``Bead2`` is built, then one line is written for each item of
	``bead.otherSuba`` (label ``False``, the negative examples) followed by
	one line for each item of ``bead.goldSuba`` (label ``True``, the
	positive examples).  Each line is tab-separated:

		ID<sentID>--<suba> <TAB> <label> <TAB> <feature> <TAB> <feature> ...

	Args:
		srcSnt, tgtSnt: per-sentence strings; only their whitespace token
			counts are used here (as dimensions for ``oneline2waMatrix``).
		srcTree, tgtTree: per-sentence inputs handed to
			``nltk.ParentedTree``.
		wa: per-sentence alignment lines handed to ``oneline2waMatrix``.
		gsuba: per-sentence lines handed to ``oneline2subaList``.
		base: suffix of the output file name and ID of the first sentence;
			sentence ``i`` is written with ID ``base + i``.

	Returns:
		None.  The result is the file written on disk.
	"""
	outPath = '/dev/shm/subaFeatEx.' + str(base)

	# ``with`` guarantees the file is closed even when building a bead or
	# computing features raises part-way through the corpus.
	with codecs.open(outPath, 'w', 'utf-8') as f:

		def writeExamples(bead, sentID, subas, label):
			# One output line per candidate; features() is expected to
			# yield strings because they are joined with tabs.
			for suba in subas:
				exampleID = str(sentID) + '--' + suba.__str__()
				f.write('ID' + exampleID + '\t' + str(label) + '\t' + '\t'.join(features(bead, suba)) + '\n')

		for i, srcLine in enumerate(srcSnt):
			# progress indicator on stderr (Python 2 print statement)
			if i % 1000 == 0: print >> sys.stderr, i,
			srcParented = nltk.ParentedTree(srcTree[i])
			tgtParented = nltk.ParentedTree(tgtTree[i])
			waMatrix = oneline2waMatrix(wa[i], len(srcLine.split()), len(tgtSnt[i].split()))
			bead = Bead2(srcParented, tgtParented, waMatrix, oneline2subaList(gsuba[i]))

			sentID = base + i
			writeExamples(bead, sentID, bead.otherSuba, False)   # add negative training examples
			writeExamples(bead, sentID, bead.goldSuba, True)     # add positive training examples