def trainPCFG(Directors, Maps, PcfgFile=PcfgFileName, SenseTaggerFile=SenseTaggerFileName, cv=0.1, Starts=[], mapversions='[01]', Lexicon=''): corpus_regexp = constructItemRegexp(Directors, Maps, starts=Starts, mapversions=mapversions) if Lexicon: corpus_regexp = constructSetOrRegexp([corpus_regexp, Lexicon]) Directions = DirectionCorpusReader(corpus_regexp) Pcfg, SenseTagger, TestSet = cvTrainPCFG(Directions, saveParses=0, StartSymbol='S', Group='CorrFullDirTrees', cv=cv, parseTest=doParses) if __debug__: print Pcfg if PcfgFile: cPickle.dump(Pcfg, open(PcfgFile, 'w')) cPickle.dump(SenseTagger, open(SenseTaggerFile, 'w')) return Pcfg, SenseTagger, TestSet
def genUncorrContentFrames(Directors):
    """Generate content frames for every FullTree-*.txt file under Directions/FullTrees.

    Each file name returned by lstail is handed to genCorrContentFrame with
    TreePath='FullTrees/'.  A ValueError raised for one file is ignored so the
    remaining files are still processed.
    """
    import re
    # NOTE(review): this reader is built from Directors but never passed to
    # genCorrContentFrame, which falls back to its own default Corpus.  It is
    # kept because the constructor's side effects are unknown -- confirm
    # whether Corpus=Corpus was meant to be passed below.
    Corpus = DirectionCorpusReader(constructItemRegexp(Directors, mapversions='[01]'))
    # Escape the dot so only a literal ".txt" extension matches.
    tree_file = re.compile(r'^FullTree-.*\.txt$')
    for filename in lstail('Directions/FullTrees', tree_file):
        try:
            genCorrContentFrame(filename, TreePath='FullTrees/')
        except ValueError:
            # Deliberate best-effort: skip this file and continue with the rest.
            pass
def parseInstruction(instructID):
    """Parse the instruction identified by instructID and return the reader's result.

    Builds a DirectionCorpusReader over the module-level Directors and Maps,
    obtains a parser from getDirParser with statistics collection disabled,
    and calls the reader's parseInstruction with saveParses=True and
    frames=True.
    """
    reader = DirectionCorpusReader(constructItemRegexp(Directors, Maps))
    dir_parser = getDirParser(Directors, Maps, collectStats=False)
    return reader.parseInstruction(dir_parser, instructID, saveParses=True, frames=True)
def genCorrContentFrame(filename, Corpus=Corpus, TreePath='CorrFullTrees/'): if '-' in filename: instructionID = filename.split('-')[1] else: instructionID = filename print '\n',instructionID if not Corpus: Directors = ['EDA','EMWC','KLS','KXP','TJS','WLH'] Maps = ['Jelly','L','Grid'] Corpus = DirectionCorpusReader(constructItemRegexp(Directors,Maps)) Trees=[tree['TREE'] for tree in Corpus.read(TreePath+'/FullTree-'+instructionID)] Frames = trees2frames(Trees) saveParse(Trees,instructionID,directory='Directions/'+TreePath) saveFrame(Frames,instructionID) for frame in Frames: print `frame`
def parse3From12():
    """Parse the Directors3 corpus with the grammar stored in 'Corpus1+Corpus2-12-Corrected.pcfg'.

    Builds a direction parser via getDirParser for Directors1 + Directors2
    with an enchant spell checker backed by Sense.Lexicon, then runs
    parseTestSet over the 'CleanDirs' items of the Directors3 corpus.

    Side effects: may reset NLTK's corpus base directory, may set the
    module-level system_corpora, and initialises the 'ParseDirections' logger.
    """
    # system_corpora must be module-level.  As a local it was always unbound
    # inside the try, so set_basedir could never run and the bare except hid
    # the resulting UnboundLocalError.
    # NOTE(review): assumes the intent was to restore the base directory when
    # system_corpora is already known -- confirm.
    global system_corpora
    PcfgFileName = 'Corpus1+Corpus2-12-Corrected.pcfg'
    Directors = Directors1 + Directors2
    try:
        nltk.corpus.set_basedir(system_corpora)
    except NameError:
        # First use: remember the current base directory.
        system_corpora = nltk.corpus.get_basedir()
    logger.initLogger('ParseDirections', LogDir='MarcoLogs')
    import enchant
    from Sense import Lexicon
    spellchecker = enchant.DictWithPWL('en_US', Lexicon)
    DirParser = getDirParser(Directors, Maps, usePOSTagger, POSTaggerFileName,
                             PcfgFileName, SenseTaggerFileName, collectStats,
                             spellchecker=spellchecker)
    Directions = DirectionCorpusReader(constructItemRegexp(Directors3, Maps, mapversions='[01]'))
    parseTestSet(DirParser, Directions, list(Directions.items('CleanDirs')), 1)
logger.error("%s.",CaughtErrorTxt) if str(e).startswith("Error parsing field structure"): CaughtError = 'EOFError' else: CaughtError = 'ValueError' return frames,CaughtError,CaughtErrorTxt def getSSS(instructID): if not instructID.endswith('txt'): instructID += '.txt' return readCorrFrame([],instructID)[0] if __name__ == '__main__': logger.initLogger('Sense',LogDir='MarcoLogs') Directors = ['EDA','EMWC','KLS','KXP','TJS','WLH'] Maps = ['Jelly','L','Grid'] Corpus = DirectionCorpusReader(constructItemRegexp(Directors,Maps)) else: Corpus = None def genCorrContentFrame(filename, Corpus=Corpus, TreePath='CorrFullTrees/'): if '-' in filename: instructionID = filename.split('-')[1] else: instructionID = filename print '\n',instructionID if not Corpus: Directors = ['EDA','EMWC','KLS','KXP','TJS','WLH'] Maps = ['Jelly','L','Grid'] Corpus = DirectionCorpusReader(constructItemRegexp(Directors,Maps)) Trees=[tree['TREE'] for tree in Corpus.read(TreePath+'/FullTree-'+instructionID)] Frames = trees2frames(Trees) saveParse(Trees,instructionID,directory='Directions/'+TreePath) saveFrame(Frames,instructionID) for frame in Frames: print `frame`