def makeSimFiles(
    language,
    outpath=os.path.join(os.getcwd().split('code')[0], 'maxent2', 'temp'),
    testDataToUse=1 / 5,
    predefault=False,
    ag_disag=False,
):
    """Prepare the input files used by the command line learner.

    The source data is read from ``<root>/data/<language>``, where ``<root>``
    is everything that precedes the first occurrence of ``'code'`` in the
    current working directory. The feature file and the data files are
    converted through ``fix_input_files``. If the language folder contains no
    ``TestingData.txt``, a random sample of the converted corpus is withheld
    as ``test.txt`` and the remaining data replaces ``corpus.txt``.

    Args:
        language: name of the sub-folder of ``data`` holding the input files.
        outpath: directory handed to ``fixFeatureFile`` and in which
            ``corpus.txt``, ``test.txt`` and ``subcorpus.txt`` are looked up.
            The default is computed once, when the function is defined.
        testDataToUse: amount of data to withhold, forwarded unchanged to
            ``datasampler.makeRandomTestFile`` (default 1/5).
        predefault: when true, also calls ``dflt_grammar.makeDefGramFile``
            (the "preselection" option).
        ag_disag: when true, also calls ``agree_disagree.make_gram_file``.
    """
    project_root = os.getcwd().partition('code')[0]
    language_dir = os.path.join(project_root, 'data', language)

    fix_input_files.fixFeatureFile(language_dir, outpath)

    if "TestingData.txt" in os.listdir(language_dir):
        # A dedicated test file was supplied: convert it, then the learning data.
        fix_input_files.fixDataFile(language_dir, typ='test')
        fix_input_files.fixDataFile(language_dir, typ='learning')
    else:
        # No test file: convert LearningData.txt to corpus.txt, then hold
        # back a random portion of it as the test set.
        # NOTE(review): fixDataFile is not given outpath, so this branch
        # presumably relies on it writing corpus.txt into outpath -- confirm
        # for non-default outpath values.
        fix_input_files.fixDataFile(language_dir, typ='learning')

        full_corpus = os.path.join(outpath, 'corpus.txt')
        held_out = os.path.join(outpath, 'test.txt')
        reduced_corpus = os.path.join(outpath, 'subcorpus.txt')

        datasampler.makeRandomTestFile(
            full_corpus, held_out, reduced_corpus, testDataToUse
        )

        # The reduced corpus takes the place of the full one.
        os.remove(full_corpus)
        os.rename(reduced_corpus, full_corpus)

    # Optional extras, each switched on by its own flag.
    if predefault:
        dflt_grammar.makeDefGramFile(language_dir)
    if ag_disag:
        agree_disagree.make_gram_file()
def copyTestFiles(grammarpath, testfilepath):
    """Stage the files needed to test an existing grammar.

    Everything is placed in ``<root>/maxent2/temp``, where ``<root>`` is the
    part of the current working directory that precedes the first occurrence
    of ``'code'``. The feature file and the test data found next to
    ``testfilepath`` are converted through ``fix_input_files``, a
    ``params.txt`` naming the four input files is written, and the grammar
    together with its sibling ``projections.txt`` is copied over.

    Args:
        grammarpath: path to the ``grammar.txt`` file to test; a
            ``projections.txt`` is expected in the same location.
        testfilepath: path to ``TestingData.txt``; the text before that file
            name is used as the directory holding the test and feature files.
    """
    project_root = os.getcwd().partition('code')[0]
    staging_dir = os.path.join(project_root, 'maxent2', 'temp')

    # Text preceding the file name, i.e. the folder with the test data.
    test_dir = testfilepath.partition('TestingData.txt')[0]
    fix_input_files.fixFeatureFile(test_dir, staging_dir)
    fix_input_files.fixDataFile(test_dir, typ='test')

    params_file = os.path.join(staging_dir, 'params.txt')
    with open(params_file, 'w', encoding='utf-8') as handle:
        handle.write(
            '-test\ttest.txt\n-grammar\tgrammar.txt\n-projections\tprojections.txt\n-features\tfeatures.txt'
        )

    shutil.copy(grammarpath, os.path.join(staging_dir, 'grammar.txt'))

    # projections.txt sits alongside grammar.txt.
    grammar_dir = grammarpath.partition('grammar.txt')[0]
    projections_src = os.path.join(grammar_dir, 'projections.txt')
    shutil.copy(projections_src, os.path.join(staging_dir, 'projections.txt'))