def trainPCFG(Directors, Maps, PcfgFile=PcfgFileName, SenseTaggerFile=SenseTaggerFileName, cv=0.1, Starts=[], mapversions='[01]', Lexicon=''): corpus_regexp = constructItemRegexp(Directors, Maps, starts=Starts, mapversions=mapversions) if Lexicon: corpus_regexp = constructSetOrRegexp([corpus_regexp, Lexicon]) Directions = DirectionCorpusReader(corpus_regexp) Pcfg, SenseTagger, TestSet = cvTrainPCFG(Directions, saveParses=0, StartSymbol='S', Group='CorrFullDirTrees', cv=cv, parseTest=doParses) if __debug__: print Pcfg if PcfgFile: cPickle.dump(Pcfg, open(PcfgFile, 'w')) cPickle.dump(SenseTagger, open(SenseTaggerFile, 'w')) return Pcfg, SenseTagger, TestSet
def genUncorrContentFrames(Directors):
    """Run genCorrContentFrame over every tree file listed in Directions/FullTrees.

    Files are those returned by lstail for names of the form FullTree-*.txt.
    Each one is passed to genCorrContentFrame with TreePath='FullTrees/'.
    A file that raises ValueError is skipped and processing continues.
    """
    import re
    # NOTE(review): this reader is built from Directors but is not passed to
    # genCorrContentFrame below -- confirm whether it should be.
    Corpus = DirectionCorpusReader(constructItemRegexp(Directors, mapversions='[01]'))
    # The dot before "txt" is escaped so that only a literal ".txt" suffix
    # matches; unescaped it accepted any character in that position.
    tree_file = re.compile(r'^FullTree-.*\.txt$')
    for filename in lstail('Directions/FullTrees', tree_file):
        try:
            genCorrContentFrame(filename, TreePath='FullTrees/')
        except ValueError:
            # Best effort: a tree file that cannot be converted is skipped.
            pass
def parseInstruction(instructID):
    """Parse the instruction instructID with a freshly built reader and parser.

    The reader covers the module-level Directors and Maps; the parser comes
    from getDirParser with statistics collection switched off.  Parses are
    saved and frames are requested.  Returns whatever the reader's
    parseInstruction method returns.
    """
    reader = DirectionCorpusReader(constructItemRegexp(Directors, Maps))
    # Bind the method before building the parser to keep the original
    # evaluation order (reader, method lookup, then parser).
    parse = reader.parseInstruction
    parser = getDirParser(Directors, Maps, collectStats=False)
    return parse(parser, instructID, saveParses=True, frames=True)
def genUncorrContentFrames(Directors):
    """Run genCorrContentFrame over every tree file listed in Directions/FullTrees.

    Files are those returned by lstail for names of the form FullTree-*.txt.
    Each one is passed to genCorrContentFrame with TreePath="FullTrees/".
    A file that raises ValueError is skipped and processing continues.
    """
    import re
    # NOTE(review): this reader is built from Directors but is not passed to
    # genCorrContentFrame below -- confirm whether it should be.
    Corpus = DirectionCorpusReader(constructItemRegexp(Directors, mapversions="[01]"))
    # The dot before "txt" is escaped so that only a literal ".txt" suffix
    # matches; unescaped it accepted any character in that position.
    tree_file = re.compile(r"^FullTree-.*\.txt$")
    for filename in lstail("Directions/FullTrees", tree_file):
        try:
            genCorrContentFrame(filename, TreePath="FullTrees/")
        except ValueError:
            # Best effort: a tree file that cannot be converted is skipped.
            pass
def trainPCFG(Directors, Maps, PcfgFile=PcfgFileName, SenseTaggerFile=SenseTaggerFileName, cv=0.1, Starts=[], mapversions='[01]', Lexicon=''): corpus_regexp = constructItemRegexp(Directors, Maps, starts=Starts, mapversions=mapversions) if Lexicon: corpus_regexp = constructSetOrRegexp([corpus_regexp, Lexicon]) Directions = DirectionCorpusReader(corpus_regexp) Pcfg,SenseTagger,TestSet = cvTrainPCFG(Directions, saveParses=0, StartSymbol = 'S', Group='CorrFullDirTrees', cv=cv, parseTest=doParses) if __debug__: print Pcfg if PcfgFile: cPickle.dump(Pcfg,open(PcfgFile,'w')) cPickle.dump(SenseTagger,open(SenseTaggerFile,'w')) return Pcfg,SenseTagger,TestSet
def genCorrContentFrame(filename, Corpus=Corpus, TreePath='CorrFullTrees/'): if '-' in filename: instructionID = filename.split('-')[1] else: instructionID = filename print '\n',instructionID if not Corpus: Directors = ['EDA','EMWC','KLS','KXP','TJS','WLH'] Maps = ['Jelly','L','Grid'] Corpus = DirectionCorpusReader(constructItemRegexp(Directors,Maps)) Trees=[tree['TREE'] for tree in Corpus.read(TreePath+'/FullTree-'+instructionID)] Frames = trees2frames(Trees) saveParse(Trees,instructionID,directory='Directions/'+TreePath) saveFrame(Frames,instructionID) for frame in Frames: print `frame`
def genCorrContentFrame(filename, Corpus=Corpus, TreePath="CorrFullTrees/"): if "-" in filename: instructionID = filename.split("-")[1] else: instructionID = filename print "\n", instructionID if not Corpus: Directors = ["EDA", "EMWC", "KLS", "KXP", "TJS", "WLH"] Maps = ["Jelly", "L", "Grid"] Corpus = DirectionCorpusReader(constructItemRegexp(Directors, Maps)) Trees = [tree["TREE"] for tree in Corpus.read(TreePath + "/FullTree-" + instructionID)] Frames = trees2frames(Trees) saveParse(Trees, instructionID, directory="Directions/" + TreePath) saveFrame(Frames, instructionID) for frame in Frames: print ` frame `
def parse3From12():
    """Parse the Directors3 'CleanDirs' items with a parser built for Directors1 + Directors2.

    The parser is created by getDirParser from the grammar file
    'Corpus1+Corpus2-12-Corrected.pcfg', the module-level tagger settings,
    and an enchant spell checker backed by Sense.Lexicon.  Every 'CleanDirs'
    item of the Directors3 corpus (map versions '[01]') is handed to
    parseTestSet.  Returns None.
    """
    PcfgFileName = 'Corpus1+Corpus2-12-Corrected.pcfg'
    Directors = Directors1 + Directors2
    # system_corpora is assigned below, which makes it local to this function
    # and therefore unbound when read here.  The try always fails with
    # UnboundLocalError (a NameError subclass) and the handler records the
    # current basedir.  Catching NameError only keeps that behaviour without
    # also swallowing unrelated errors such as KeyboardInterrupt.
    # NOTE(review): set_basedir is never actually reached here; a
    # `global system_corpora` may have been intended -- confirm.
    try:
        nltk.corpus.set_basedir(system_corpora)
    except NameError:
        system_corpora = nltk.corpus.get_basedir()
    logger.initLogger('ParseDirections', LogDir='MarcoLogs')

    import enchant
    from Sense import Lexicon
    spellchecker = enchant.DictWithPWL('en_US', Lexicon)
    DirParser = getDirParser(Directors, Maps, usePOSTagger, POSTaggerFileName,
                             PcfgFileName, SenseTaggerFileName, collectStats,
                             spellchecker=spellchecker)
    Directions = DirectionCorpusReader(constructItemRegexp(Directors3, Maps, mapversions='[01]'))
    parseTestSet(DirParser, Directions, list(Directions.items('CleanDirs')), 1)
# This span is kept verbatim; it begins inside a definition whose start is not shown.
# 1. Tail of an error handler: logs CaughtErrorTxt, sets CaughtError to 'EOFError'
#    when str(e) starts with "Error parsing field structure" and to 'ValueError'
#    otherwise, then returns (frames, CaughtError, CaughtErrorTxt).
# 2. getSSS(instructID): appends '.txt' unless the ID already ends with 'txt', and
#    returns element 0 of readCorrFrame([], instructID).
# 3. Module set-up: when run as a script, initialises the 'Sense' logger (LogDir
#    'MarcoLogs') and builds Corpus from six directors and three maps; when
#    imported, Corpus is None.
# 4. genCorrContentFrame(filename, Corpus, TreePath): the Corpus default is bound
#    at definition time to the module-level Corpus above.  Takes the second
#    dash-separated field of filename as the instruction ID (or the whole name if
#    it has no dash), builds a reader when Corpus is falsy, reads trees from
#    TreePath + '/FullTree-' + ID, converts them with trees2frames, saves them
#    with saveParse and saveFrame, and prints the repr of each frame.
logger.error("%s.",CaughtErrorTxt) if str(e).startswith("Error parsing field structure"): CaughtError = 'EOFError' else: CaughtError = 'ValueError' return frames,CaughtError,CaughtErrorTxt def getSSS(instructID): if not instructID.endswith('txt'): instructID += '.txt' return readCorrFrame([],instructID)[0] if __name__ == '__main__': logger.initLogger('Sense',LogDir='MarcoLogs') Directors = ['EDA','EMWC','KLS','KXP','TJS','WLH'] Maps = ['Jelly','L','Grid'] Corpus = DirectionCorpusReader(constructItemRegexp(Directors,Maps)) else: Corpus = None def genCorrContentFrame(filename, Corpus=Corpus, TreePath='CorrFullTrees/'): if '-' in filename: instructionID = filename.split('-')[1] else: instructionID = filename print '\n',instructionID if not Corpus: Directors = ['EDA','EMWC','KLS','KXP','TJS','WLH'] Maps = ['Jelly','L','Grid'] Corpus = DirectionCorpusReader(constructItemRegexp(Directors,Maps)) Trees=[tree['TREE'] for tree in Corpus.read(TreePath+'/FullTree-'+instructionID)] Frames = trees2frames(Trees) saveParse(Trees,instructionID,directory='Directions/'+TreePath) saveFrame(Frames,instructionID) for frame in Frames: print `frame`
# This span is kept verbatim; it begins and ends inside definitions that are only
# partly shown.
# 1. Tail of an error handler: the else branch sets CaughtError to "ValueError",
#    then (frames, CaughtError, CaughtErrorTxt) is returned.
# 2. getSSS(instructID): appends ".txt" unless the ID already ends with "txt", and
#    returns element 0 of readCorrFrame([], instructID).
# 3. Module set-up: when run as a script, initialises the "Sense" logger (LogDir
#    "MarcoLogs") and builds Corpus from six directors and three maps; when
#    imported, Corpus is None.
# 4. Start of genCorrContentFrame(filename, Corpus, TreePath): the Corpus default
#    is bound at definition time to the module-level Corpus above.  Takes the
#    second dash-separated field of filename as the instruction ID (or the whole
#    name if it has no dash), builds a reader when Corpus is falsy, and collects
#    the "TREE" entry of every item read from TreePath + "/FullTree-" + ID.
else: CaughtError = "ValueError" return frames, CaughtError, CaughtErrorTxt def getSSS(instructID): if not instructID.endswith("txt"): instructID += ".txt" return readCorrFrame([], instructID)[0] if __name__ == "__main__": logger.initLogger("Sense", LogDir="MarcoLogs") Directors = ["EDA", "EMWC", "KLS", "KXP", "TJS", "WLH"] Maps = ["Jelly", "L", "Grid"] Corpus = DirectionCorpusReader(constructItemRegexp(Directors, Maps)) else: Corpus = None def genCorrContentFrame(filename, Corpus=Corpus, TreePath="CorrFullTrees/"): if "-" in filename: instructionID = filename.split("-")[1] else: instructionID = filename print "\n", instructionID if not Corpus: Directors = ["EDA", "EMWC", "KLS", "KXP", "TJS", "WLH"] Maps = ["Jelly", "L", "Grid"] Corpus = DirectionCorpusReader(constructItemRegexp(Directors, Maps)) Trees = [tree["TREE"] for tree in Corpus.read(TreePath + "/FullTree-" + instructionID)]
import enchant from Sense import Lexicon spellchecker = enchant.DictWithPWL('en_US', Lexicon) DirParser = getDirParser(Directors, Maps, usePOSTagger, POSTaggerFileName, PcfgFileName, SenseTaggerFileName, collectStats, spellchecker=spellchecker) if doParses == 'Profile': import profile profile.run('testParses(DirParser)', 'parse.prof') elif doParses == 'TestSet': Directions = DirectionCorpusReader(constructItemRegexp( Directors, Maps)) parseTestSet(DirParser, Directions, TestSet, 1) elif doParses == 'CommandLine': if len(sys.argv) > 1 and sys.argv[1]: Files = sys.argv[1] else: Files = constructItemRegexp(Directors, Maps, mapversions='[01]') print 'Parsing', Files Directions = DirectionCorpusReader(Files) TestSet = list(Directions.items('CleanDirs')) #random.shuffle(TestSet); #timeSort(TestSet) parseTestSet(DirParser, Directions, TestSet, 1)
def parseInstruction(instructID):
    """Parse a single instruction, saving its parses and requesting frames.

    A new DirectionCorpusReader over the module-level Directors and Maps does
    the parsing, using a parser from getDirParser created with
    collectStats=False.  The reader's parseInstruction result is returned
    unchanged.
    """
    item_regexp = constructItemRegexp(Directors, Maps)
    # Bind the method before building the parser to keep the original
    # evaluation order (reader, method lookup, then parser).
    run_parse = DirectionCorpusReader(item_regexp).parseInstruction
    dir_parser = getDirParser(Directors, Maps, collectStats=False)
    return run_parse(dir_parser, instructID, saveParses=True, frames=True)
try: nltk.corpus.set_basedir(system_corpora) except: system_corpora=nltk.corpus.get_basedir() logger.initLogger('ParseDirections',LogDir='MarcoLogs') import enchant from Sense import Lexicon spellchecker = enchant.DictWithPWL('en_US', Lexicon) DirParser = getDirParser(Directors, Maps, usePOSTagger, POSTaggerFileName, PcfgFileName, SenseTaggerFileName, collectStats, spellchecker=spellchecker) if doParses == 'Profile': import profile profile.run('testParses(DirParser)','parse.prof') elif doParses == 'TestSet': Directions = DirectionCorpusReader(constructItemRegexp(Directors,Maps)) parseTestSet(DirParser,Directions,TestSet,1) elif doParses == 'CommandLine': if len(sys.argv)>1 and sys.argv[1]: Files = sys.argv[1] else: Files = constructItemRegexp(Directors,Maps,mapversions='[01]') print 'Parsing', Files Directions = DirectionCorpusReader(Files) TestSet=list(Directions.items('CleanDirs')) #random.shuffle(TestSet); #timeSort(TestSet) parseTestSet(DirParser,Directions,TestSet,1)