def buildclasser():
    global DOTOKENIZE, ENCODING, outputprefix
    log("Counting unigrams (for classer) ...",stream=sys.stderr)
    freqlist = FrequencyList()
    f = open(corpusfile)
    for i, line in enumerate(f):            
        if (i % 10000 == 0): 
            log("\tLine " + str(i+1) + " - (classer construction)", stream=sys.stderr)
        if DOTOKENIZE: 
            line = crude_tokenizer(line.strip())
        line = line.strip().split(' ')
        freqlist.append(['<begin>'] + line + ['<end>'])
    f.close()
    
    log("Building classer ...", stream=sys.stderr)
    classer = Classer(freqlist)
    classer.save(outputprefix + '.cls')
    log("\t" + str(len(classer)) + " classes found", stream=sys.stderr)
    return classer    
Example #2
0
#!/usr/bin/env python
#-*- coding:utf-8 -*-

from pynlpl.textprocessors import Classer
from pynlpl.statistics import FrequencyList
import sys

filename = sys.argv[1]

print >>sys.stderr, "Counting tokens"
f = open(filename)
freqlist = FrequencyList()
for i, line in enumerate(f):            
    if (i % 10000 == 0): 
        print >>sys.stderr, "\tLine " + str(i+1)
    line = ['<s>'] + line.strip().split(' ') + ['</s>']
    
    freqlist.append(line)
f.close()

print >>sys.stderr, "Building classer"
classer = Classer(freqlist, filesupport=True )
classer.save(filename + '.cls')

print >>sys.stderr, "Encoding data"
classer.encodefile(filename, filename + '.clsenc')
Example #3
0
#!/usr/bin/env python
#-*- coding:utf-8 -*-

from pynlpl.textprocessors import Classer
import sys

classer = Classer(sys.argv[1])
for line in classer.decodefile(sys.argv[2]):
    print " ".join(line).encode('utf-8')
Example #4
0
from pynlpl.textprocessors import crude_tokenizer, Classer
import sys
import codecs
import asizeof

freqlist = FrequencyList()
f = codecs.open(sys.argv[1], 'r','utf-8')
for line in f:
    line = crude_tokenizer(line.strip())
    freqlist.append(line)    
f.close()

print "FREQLIST:               " ,asizeof.asizeof(freqlist)




classer = Classer(freqlist)
print "CLASSER:                " ,asizeof.asizeof(classer)

classer2 = Classer(freqlist, False,True)
print "CLASSER (ONLY DECODER): " ,asizeof.asizeof(classer2)

freqlist2 = FrequencyList()
f = codecs.open(sys.argv[1], 'r','utf-8')
for line in f:
    line = crude_tokenizer(line.strip())
    freqlist2.append(classer.encodeseq(line))    
f.close()
print "FREQLIST-AFTER-CLASSER: " ,asizeof.asizeof(freqlist2)