def cleanUnigramCountFile(inputfile, outputfile, n):
    '''Filter the unigram count file, and reduce the number of items in it.

    Thin wrapper: forwards all three arguments, unchanged and in order, to
    ngrok.cleanUnigramCountFile. The actual filtering logic lives in ngrok.

    Args:
        inputfile: passed through as the first argument (presumably the path
            of the unigram count file to read -- confirm against ngrok).
        outputfile: passed through as the second argument (presumably the
            path the filtered counts are written to -- confirm against ngrok).
        n: passed through as the third argument (presumably the number of
            items to retain -- confirm against ngrok).

    Returns:
        None. Whatever ngrok.cleanUnigramCountFile returns is discarded.

    NOTE(review): elsewhere in this file ngrok.cleanUnigramCountFile is
    called with a fourth ``language`` argument; this wrapper does not
    forward one -- confirm that the argument is optional in ngrok.
    '''
    ngrok.cleanUnigramCountFile(inputfile, outputfile, n)
# Per-language unigram pipeline: process and combine every language first,
# then run the ngrok steps (count, rearrange, clean, sort, collapse,
# marginalize) on each language's combined file.

# Resolve the directory settings once, so a missing key fails before any
# processing starts rather than part-way through the run.
expand_dir = directories['expandpath']
combined_dir = directories['combinedpath']
intermediate_dir = directories['intermediatecountpath']

# Plain loops instead of list comprehensions: these calls are made for their
# side effects only, so there is no reason to build and discard a list.
for language in languages:
    opus.processLanguage(language, expand_dir, combined_dir)

for language in languages:
    opus.combineLanguage(
        os.path.join(combined_dir, language),
        os.path.join(intermediate_dir, language + '_combined.txt'),
    )

for language in languages:
    # Each step reads the file written by the previous step.
    print('Counting ngrams...')
    countfile = os.path.join(intermediate_dir, language + '_counted.txt')
    ngrok.countNgrams(
        os.path.join(intermediate_dir, language + '_combined.txt'),
        countfile,
        n=1,
    )

    print('Rearranging ngrams...')
    rearrangedFile = os.path.join(
        intermediate_dir, language + '_2013_rearrange_counts.txt')
    ngrok.rearrangeNgramFile(countfile, rearrangedFile, reverse=False)

    print('Cleaning ngrams...')
    cleanedFile = os.path.join(intermediate_dir, language + '_2013_cleaned.txt')
    ngrok.cleanUnigramCountFile(rearrangedFile, cleanedFile, numItems, language)

    print('Sorting ngrams...')
    sortedfile = os.path.join(intermediate_dir, language + '_2013_sorted.txt')
    ngrok.sortNgramFile(cleanedFile, sortedfile)

    print('Collapsing ngrams...')
    collapsedfile = os.path.join(
        intermediate_dir, language + '_2013_collapsed.txt')
    ngrok.collapseNgrams(sortedfile, collapsedfile)

    # The final output goes to slowstoragedir, not the intermediate directory.
    print('Marginalizing ngrams, first pass...')
    ngrok.marginalizeNgramFile(
        collapsedfile,
        os.path.join(slowstoragedir, language + '_2013.txt'),
        n=1,
        sorttype='numeric',
    )