def make(srcSnt, tgtSnt, srcTree, tgtTree, wa, gsuba, base):
    """Dump labelled feature lines for every sentence pair to /dev/shm.

    The output file is '/dev/shm/subaFeatEx.<base>', written as UTF-8.
    For sentence pair i a Bead2 is built from the two parse trees, the
    word-alignment matrix and the gold alignment list.  Each entry of
    bead.otherSuba is written as a negative example (label False) and each
    entry of bead.goldSuba as a positive example (label True), one per line:

        ID<sentID>--<suba>\t<label>\t<feat>\t<feat>...

    All list arguments are indexed by the same sentence position.

    Args:
        srcSnt: source sentences; split on whitespace to get token counts.
        tgtSnt: target sentences; split on whitespace to get token counts.
        srcTree: source trees, each passed to nltk.ParentedTree.
        tgtTree: target trees, each passed to nltk.ParentedTree.
        wa: word-alignment lines, each passed to oneline2waMatrix.
        gsuba: gold alignment lines, each passed to oneline2subaList.
        base: ID given to the first sentence (incremented by one per
            sentence); also used as the output file suffix.
    """
    # The 'with' block guarantees the file is flushed and closed even when
    # building a bead or extracting features raises part-way through.
    with codecs.open('/dev/shm/subaFeatEx.' + str(base), 'w', 'utf-8') as f:

        def writeExamples(bead, subas, label, sentID):
            # One output line per candidate; features() must yield strings
            # because they are joined with tabs.
            for suba in subas:
                feats = features(bead, suba)
                exampleID = str(sentID) + '--' + suba.__str__()
                f.write('ID' + exampleID + '\t' + str(label) + '\t' +
                        '\t'.join(feats) + '\n')

        sentID = base
        for i, src in enumerate(srcSnt):
            # Progress indicator on stderr every 1000 sentences.
            if i % 1000 == 0:
                print >> sys.stderr, i,
            bead = Bead2(
                nltk.ParentedTree(srcTree[i]),
                nltk.ParentedTree(tgtTree[i]),
                oneline2waMatrix(wa[i], len(src.split()),
                                 len(tgtSnt[i].split())),
                oneline2subaList(gsuba[i]))

            writeExamples(bead, bead.otherSuba, False, sentID)  # negatives
            writeExamples(bead, bead.goldSuba, True, sentID)    # positives
            sentID += 1
def makeTrainData(rawDataDir):
    """Build labelled training examples from every file in rawDataDir.

    Each directory entry is echoed to stderr and loaded through
    Bead2.loadData.  For every loaded bead, the entries of bead.otherSuba
    yield (features, False) pairs and the entries of bead.goldSuba yield
    (features, True) pairs.

    Returns:
        A list of (features(bead, suba), label) tuples, in bead order with
        the negatives of a bead preceding its positives.
    """
    beads = []
    for name in os.listdir(rawDataDir):
        print >> sys.stderr, name
        beads += Bead2.loadData(os.path.join(rawDataDir, name))

    examples = []
    for bead in beads:
        # negative training examples
        examples.extend((features(bead, s), False) for s in bead.otherSuba)
        # positive training examples
        examples.extend((features(bead, s), True) for s in bead.goldSuba)
    return examples
Esempio n. 3
0
def makeTrainData(rawDataDir):
    """Collect (features, label) training pairs from a directory of raw data.

    Every entry returned by os.listdir(rawDataDir) is printed to stderr and
    handed to Bead2.loadData.  The resulting beads are then expanded: each
    member of bead.otherSuba becomes a pair labelled False, each member of
    bead.goldSuba a pair labelled True.

    Returns:
        The list of (features(bead, suba), bool) tuples.
    """
    loaded = []
    for entry in os.listdir(rawDataDir):
        print >> sys.stderr, entry
        entryPath = os.path.join(rawDataDir, entry)
        loaded.extend(Bead2.loadData(entryPath))

    labelled = []
    for bead in loaded:
        negatives = [(features(bead, cand), False) for cand in bead.otherSuba]
        labelled += negatives  # negative training examples
        positives = [(features(bead, cand), True) for cand in bead.goldSuba]
        labelled += positives  # positive training examples
    return labelled
def make(srcSnt, tgtSnt, srcTree, tgtTree, wa, gsuba, base):
    """Write tab-separated feature lines for all sentence pairs.

    Output goes to '/dev/shm/subaFeatEx.<base>' (UTF-8).  For each sentence
    position a Bead2 is constructed from the source and target trees, the
    word-alignment matrix and the gold alignment list.  Entries of
    bead.otherSuba are emitted with label False (negative examples) and
    entries of bead.goldSuba with label True (positive examples).  A line
    has the form:

        ID<sentID>--<suba>\t<label>\t<feat>\t<feat>...

    Args:
        srcSnt: source sentences; whitespace-split to count tokens.
        tgtSnt: target sentences; whitespace-split to count tokens.
        srcTree: source trees, each given to nltk.ParentedTree.
        tgtTree: target trees, each given to nltk.ParentedTree.
        wa: word-alignment lines for oneline2waMatrix.
        gsuba: gold alignment lines for oneline2subaList.
        base: sentence ID of the first pair (later pairs count up from it)
            and suffix of the output file name.
    """
    outPath = '/dev/shm/subaFeatEx.' + str(base)
    # Context manager: the handle is closed (and buffered output flushed)
    # even if an exception escapes from bead construction or features().
    with codecs.open(outPath, 'w', 'utf-8') as f:
        sentID = base
        for i, src in enumerate(srcSnt):
            # Progress indicator on stderr every 1000 sentences.
            if i % 1000 == 0:
                print >> sys.stderr, i,

            srcParse = nltk.ParentedTree(srcTree[i])
            tgtParse = nltk.ParentedTree(tgtTree[i])
            waMatrix = oneline2waMatrix(wa[i], len(src.split()),
                                        len(tgtSnt[i].split()))
            bead = Bead2(srcParse, tgtParse, waMatrix,
                         oneline2subaList(gsuba[i]))

            # negative training examples
            for suba in bead.otherSuba:
                feats = features(bead, suba)
                f.write('ID' + str(sentID) + '--' + suba.__str__() + '\t' +
                        str(False) + '\t' + '\t'.join(feats) + '\n')
            # positive training examples
            for suba in bead.goldSuba:
                feats = features(bead, suba)
                f.write('ID' + str(sentID) + '--' + suba.__str__() + '\t' +
                        str(True) + '\t' + '\t'.join(feats) + '\n')
            sentID += 1