def make(srcSnt, tgtSnt, srcTree, tgtTree, wa, gsuba, base):
    """Dump labelled sub-alignment feature lines for a batch of sentence pairs.

    For every index ``i`` a ``Bead2`` is built from the two parse trees, the
    word-alignment matrix and the gold sub-alignment list.  Each entry of
    ``bead.otherSuba`` is written with label ``False`` and then each entry of
    ``bead.goldSuba`` with label ``True``.

    Every output line is tab-separated: ``ID<sentID>--<suba>``, the label,
    then the strings returned by ``features(bead, suba)``.  The file is
    ``/dev/shm/subaFeatEx.<base>``, opened for writing as utf-8.

    Args:
        srcSnt: source sentences; only their whitespace token count is used.
        tgtSnt: target sentences; only their whitespace token count is used.
        srcTree: per-sentence source trees, passed to ``nltk.ParentedTree``.
        tgtTree: per-sentence target trees, passed to ``nltk.ParentedTree``.
        wa: per-sentence word alignments, passed to ``oneline2waMatrix``.
        gsuba: per-sentence gold sub-alignments, passed to
            ``oneline2subaList``.
        base: file-name suffix and ID of the first sentence; it is
            incremented by one per sentence.

    Raises:
        IndexError: if any parallel sequence is shorter than ``srcSnt``.
    """
    # NOTE(review): SOURCE contains a second definition of `make`; if both
    # sit at module level the later one replaces the earlier -- consider
    # removing the duplicate.

    def write_examples(out, bead, subas, label, sentID):
        """Write one feature line per sub-alignment in ``subas``."""
        for suba in subas:
            feats = features(bead, suba)
            exampleID = str(sentID) + '--' + suba.__str__()
            out.write('ID' + exampleID + '\t' + str(label) + '\t'
                      + '\t'.join(feats) + '\n')

    sentID = base
    # The context manager closes the file even if something inside the loop
    # raises part-way through the batch.
    with codecs.open('/dev/shm/subaFeatEx.' + str(base), 'w', 'utf-8') as f:
        for i, src in enumerate(srcSnt):
            if i % 1000 == 0:
                print >> sys.stderr, i,  # progress marker every 1000 sentences
            srcParsed = nltk.ParentedTree(srcTree[i])
            tgtParsed = nltk.ParentedTree(tgtTree[i])
            waMatrix = oneline2waMatrix(wa[i], len(src.split()),
                                        len(tgtSnt[i].split()))
            bead = Bead2(srcParsed, tgtParsed, waMatrix,
                         oneline2subaList(gsuba[i]))
            write_examples(f, bead, bead.otherSuba, False, sentID)  # negatives
            write_examples(f, bead, bead.goldSuba, True, sentID)  # positives
            sentID += 1
def makeTrainData(rawDataDir):
    """Build labelled training examples from every file in ``rawDataDir``.

    Each directory entry is echoed to stderr and loaded with
    ``Bead2.loadData``.  Once all files are loaded, every bead contributes
    ``(features(bead, suba), False)`` for each entry of ``bead.otherSuba``
    followed by ``(features(bead, suba), True)`` for each entry of
    ``bead.goldSuba``.

    Args:
        rawDataDir: directory whose entries are all passed to
            ``Bead2.loadData``.

    Returns:
        A list of ``(features, label)`` tuples in load order.
    """
    # NOTE(review): SOURCE contains a second definition of `makeTrainData`;
    # if both sit at module level the later one replaces the earlier.
    loaded = []
    for entry in os.listdir(rawDataDir):
        print >> sys.stderr, entry
        entryPath = os.path.join(rawDataDir, entry)
        loaded.extend(Bead2.loadData(entryPath))

    examples = []
    for bead in loaded:
        # negatives first, then positives, for each bead
        examples.extend((features(bead, cand), False)
                        for cand in bead.otherSuba)
        examples.extend((features(bead, cand), True)
                        for cand in bead.goldSuba)
    return examples
def makeTrainData(rawDataDir):
    """Collect ``(features, label)`` pairs for all data under ``rawDataDir``.

    The directory is listed with ``os.listdir``; each entry name is printed
    to stderr and its beads are loaded through ``Bead2.loadData``.  After
    loading, each bead yields its ``otherSuba`` entries labelled ``False``
    and then its ``goldSuba`` entries labelled ``True``.

    Args:
        rawDataDir: directory containing the raw data files.

    Returns:
        A list of ``(features(bead, suba), bool)`` tuples.
    """
    # NOTE(review): SOURCE contains another definition of `makeTrainData`;
    # if both sit at module level only the later one is effective.

    def labelled(bead):
        """Yield the bead's negative examples, then its positive ones."""
        for suba in bead.otherSuba:
            yield (features(bead, suba), False)
        for suba in bead.goldSuba:
            yield (features(bead, suba), True)

    allBeads = []
    for name in os.listdir(rawDataDir):
        print >> sys.stderr, name
        allBeads.extend(Bead2.loadData(os.path.join(rawDataDir, name)))

    return [pair for bead in allBeads for pair in labelled(bead)]
def make(srcSnt, tgtSnt, srcTree, tgtTree, wa, gsuba, base):
    """Write labelled sub-alignment feature lines for a batch of sentences.

    For every index ``i`` a ``Bead2`` is built from the two parse trees, the
    word-alignment matrix and the gold sub-alignment list.  Entries of
    ``bead.otherSuba`` are written with label ``False``, followed by entries
    of ``bead.goldSuba`` with label ``True``.

    Every output line is tab-separated: ``ID<sentID>--<suba>``, the label,
    then the strings returned by ``features(bead, suba)``.  The file is
    ``/dev/shm/subaFeatEx.<base>``, opened for writing as utf-8.

    Args:
        srcSnt: source sentences; only their whitespace token count is used.
        tgtSnt: target sentences; only their whitespace token count is used.
        srcTree: per-sentence source trees, passed to ``nltk.ParentedTree``.
        tgtTree: per-sentence target trees, passed to ``nltk.ParentedTree``.
        wa: per-sentence word alignments, passed to ``oneline2waMatrix``.
        gsuba: per-sentence gold sub-alignments, passed to
            ``oneline2subaList``.
        base: file-name suffix and ID of the first sentence; it is
            incremented by one per sentence.

    Raises:
        IndexError: if any parallel sequence is shorter than ``srcSnt``.
    """
    # NOTE(review): SOURCE contains another definition of `make`; if both
    # sit at module level only the later one is effective -- consider
    # removing the duplicate.

    def write_examples(out, bead, subas, label, sentID):
        """Write one feature line per sub-alignment in ``subas``."""
        for suba in subas:
            feats = features(bead, suba)
            exampleID = str(sentID) + '--' + suba.__str__()
            out.write('ID' + exampleID + '\t' + str(label) + '\t'
                      + '\t'.join(feats) + '\n')

    sentID = base
    # Using a context manager guarantees the file is closed even when an
    # exception escapes the loop.
    with codecs.open('/dev/shm/subaFeatEx.' + str(base), 'w', 'utf-8') as f:
        for i, src in enumerate(srcSnt):
            if i % 1000 == 0:
                print >> sys.stderr, i,  # progress marker every 1000 sentences
            srcParsed = nltk.ParentedTree(srcTree[i])
            tgtParsed = nltk.ParentedTree(tgtTree[i])
            waMatrix = oneline2waMatrix(wa[i], len(src.split()),
                                        len(tgtSnt[i].split()))
            bead = Bead2(srcParsed, tgtParsed, waMatrix,
                         oneline2subaList(gsuba[i]))
            write_examples(f, bead, bead.otherSuba, False, sentID)  # negatives
            write_examples(f, bead, bead.goldSuba, True, sentID)  # positives
            sentID += 1