def buildclasser():
    """Count unigrams in the corpus and build a Classer from them.

    Reads the module-level settings ``corpusfile`` (path of the corpus, one
    sentence per line), ``DOTOKENIZE`` (whether to run ``crude_tokenizer`` on
    each line) and ``outputprefix`` (the classer is saved to
    ``outputprefix + '.cls'``). These names are only read here, so no
    ``global`` declaration is needed.

    Every sentence is wrapped in ``<begin>`` / ``<end>`` markers before it is
    added to the frequency list. Progress is logged to stderr every 10000
    lines.

    Returns:
        The Classer built from the frequency list (already saved to disk).
    """
    log("Counting unigrams (for classer) ...", stream=sys.stderr)
    freqlist = FrequencyList()
    # The with-block guarantees the corpus file is closed even if
    # tokenization or counting raises part-way through.
    with open(corpusfile) as f:
        for i, line in enumerate(f):
            if i % 10000 == 0:
                log("\tLine " + str(i+1) + " - (classer construction)", stream=sys.stderr)
            if DOTOKENIZE:
                tokens = crude_tokenizer(line.strip())
                # NOTE(review): crude_tokenizer appears to return a token
                # sequence rather than a string - confirm. The previous code
                # always called .strip().split(' ') on its result, which
                # fails on a list; only split when we still hold a string.
                if hasattr(tokens, 'strip'):
                    tokens = tokens.strip().split(' ')
            else:
                tokens = line.strip().split(' ')
            freqlist.append(['<begin>'] + list(tokens) + ['<end>'])
    log("Building classer ...", stream=sys.stderr)
    classer = Classer(freqlist)
    classer.save(outputprefix + '.cls')
    log("\t" + str(len(classer)) + " classes found", stream=sys.stderr)
    return classer
#!/usr/bin/env python
#-*- coding:utf-8 -*-

# Build a Classer for a corpus and class-encode that corpus.
#
# Usage: <script> CORPUSFILE
#
# Steps:
#   1. Count all tokens in CORPUSFILE (split on single spaces), with every
#      line wrapped in <s> ... </s> markers.
#   2. Build a Classer from the counts and save it to CORPUSFILE.cls
#   3. Encode the corpus with that classer into CORPUSFILE.clsenc
#
# Progress messages go to stderr.

from pynlpl.textprocessors import Classer
from pynlpl.statistics import FrequencyList
import sys

# Fail with a readable usage message instead of a bare IndexError.
if len(sys.argv) < 2:
    sys.stderr.write("Usage: " + sys.argv[0] + " CORPUSFILE\n")
    sys.exit(2)

filename = sys.argv[1]

sys.stderr.write("Counting tokens\n")
freqlist = FrequencyList()
# The with-block closes the corpus even when counting fails half-way.
with open(filename) as f:
    for i, line in enumerate(f):
        if i % 10000 == 0:
            sys.stderr.write("\tLine " + str(i + 1) + "\n")
        freqlist.append(['<s>'] + line.strip().split(' ') + ['</s>'])

sys.stderr.write("Building classer\n")
classer = Classer(freqlist, filesupport=True)
classer.save(filename + '.cls')

sys.stderr.write("Encoding data\n")
classer.encodefile(filename, filename + '.clsenc')
#!/usr/bin/env python
#-*- coding:utf-8 -*-

# Decode a class-encoded file and print it as UTF-8 text.
#
# Arguments:
#   1. the classer file to load
#   2. the encoded file to decode
#
# Each decoded line is written to stdout as space-joined tokens.
# This is a Python 2 script: the UTF-8 encoded byte string is written
# to stdout directly.

from pynlpl.textprocessors import Classer
import sys

classer = Classer(sys.argv[1])

for tokens in classer.decodefile(sys.argv[2]):
    sentence = " ".join(tokens)
    sys.stdout.write(sentence.encode('utf-8') + "\n")
from pynlpl.textprocessors import crude_tokenizer, Classer
# FrequencyList was used below without ever being imported, so the script
# failed with a NameError on its first statement.
from pynlpl.statistics import FrequencyList
import sys
import codecs
import asizeof

# Measure the in-memory size (via asizeof) of:
#   - a FrequencyList built from the tokenized corpus given as argv[1]
#   - a Classer built from that frequency list
#   - a second Classer built with the extra positional flags (False, True)
#   - a FrequencyList built from the class-encoded corpus
# One "LABEL size" line is printed to stdout for each measurement.


def _report(label, obj):
    # Same output as the Python 2 statement `print label, size`:
    # label, one separating space, the size, newline.
    sys.stdout.write(label + " " + str(asizeof.asizeof(obj)) + "\n")


freqlist = FrequencyList()
# The with-block closes the corpus even if tokenization raises.
with codecs.open(sys.argv[1], 'r', 'utf-8') as f:
    for line in f:
        freqlist.append(crude_tokenizer(line.strip()))
_report("FREQLIST: ", freqlist)

classer = Classer(freqlist)
_report("CLASSER: ", classer)

# NOTE(review): the positional flags presumably turn the encoder off and keep
# the decoder (going by the label printed) - confirm against Classer's
# signature.
classer2 = Classer(freqlist, False, True)
_report("CLASSER (ONLY DECODER): ", classer2)

freqlist2 = FrequencyList()
with codecs.open(sys.argv[1], 'r', 'utf-8') as f:
    for line in f:
        freqlist2.append(classer.encodeseq(crude_tokenizer(line.strip())))
_report("FREQLIST-AFTER-CLASSER: ", freqlist2)