Beispiel #1
0
def main():
    """Build an n-gram frequency list from the files on the command line.

    Options: -n <size> n-gram size (default 1), -i case-insensitive,
    -e <enc> input character encoding (default utf-8).
    Writes one line per type (type, count, probability, information content)
    to stdout and summary statistics to stderr.
    """
    try:
        opts, files = getopt.getopt(sys.argv[1:], "hn:ie:", ["help"])
    except getopt.GetoptError as err:
        # print help information and exit:
        print(str(err), file=sys.stderr)
        usage()
        sys.exit(2)

    testsetsize = devsetsize = 0
    casesensitive = True
    encoding = 'utf-8'
    n = 1

    for o, a in opts:
        if o == "-n":
            n = int(a)
        elif o == "-i":
            casesensitive = False
        elif o == "-e":
            encoding = a
        else:
            print("ERROR: Unknown option:", o, file=sys.stderr)
            sys.exit(1)

    if not files:
        # BUG FIX: was Python 2 print syntax (`print >> sys.stderr, ...`),
        # a SyntaxError under Python 3, which every other print here targets.
        print("No files specified", file=sys.stderr)
        sys.exit(1)

    freqlist = FrequencyList(None, casesensitive)
    for filename in files:
        # context manager guarantees the file is closed even on error
        with codecs.open(filename, 'r', encoding) as f:
            for line in f:
                if n > 1:
                    freqlist.append(Windower(crude_tokenizer(line), n))
                else:
                    freqlist.append(crude_tokenizer(line))

    dist = Distribution(freqlist)
    for type, count in freqlist:
        # n-gram types come back as sequences; flatten to a printable string
        if isinstance(type, (tuple, list)):
            type = " ".join(type)
        s = type + "\t" + str(count) + "\t" + str(dist[type]) + "\t" + str(
            dist.information(type))
        print(s)

    print("Tokens:           ", freqlist.tokens(), file=sys.stderr)
    print("Types:            ", len(freqlist), file=sys.stderr)
    print("Type-token ratio: ", freqlist.typetokenratio(), file=sys.stderr)
    print("Entropy:          ", dist.entropy(), file=sys.stderr)
Beispiel #2
0
def main():
    """Build an n-gram frequency list from the files on the command line.

    Options: -n <size> n-gram size (default 1), -i case-insensitive,
    -e <enc> input character encoding (default utf-8).
    Writes one line per type (type, count, probability, information content)
    to stdout and summary statistics to stderr.
    """
    try:
        opts, files = getopt.getopt(sys.argv[1:], "hn:ie:", ["help"])
    except getopt.GetoptError as err:
        # print help information and exit:
        print(str(err), file=sys.stderr)
        usage()
        sys.exit(2)

    testsetsize = devsetsize = 0
    casesensitive = True
    encoding = 'utf-8'
    n = 1

    for o, a in opts:
        if o == "-n":
            n = int(a)
        elif o == "-i":
            casesensitive = False
        elif o == "-e":
            encoding = a
        else:
            print("ERROR: Unknown option:", o, file=sys.stderr)
            sys.exit(1)

    if not files:
        # BUG FIX: was Python 2 print syntax (`print >>sys.stderr, ...`),
        # a SyntaxError under Python 3, which every other print here targets.
        print("No files specified", file=sys.stderr)
        sys.exit(1)

    freqlist = FrequencyList(None, casesensitive)
    for filename in files:
        # context manager guarantees the file is closed even on error
        with codecs.open(filename, 'r', encoding) as f:
            for line in f:
                if n > 1:
                    freqlist.append(Windower(crude_tokenizer(line), n))
                else:
                    freqlist.append(crude_tokenizer(line))

    dist = Distribution(freqlist)
    for type, count in freqlist:
        # n-gram types come back as sequences; flatten to a printable string
        if isinstance(type, (tuple, list)):
            type = " ".join(type)
        s = type + "\t" + str(count) + "\t" + str(dist[type]) + "\t" + str(dist.information(type))
        print(s)

    print("Tokens:           ", freqlist.tokens(), file=sys.stderr)
    print("Types:            ", len(freqlist), file=sys.stderr)
    print("Type-token ratio: ", freqlist.typetokenratio(), file=sys.stderr)
    print("Entropy:          ", dist.entropy(), file=sys.stderr)
def countngrams(classer, n, freqlist, simpleskipgrams, skips, index, linecount=0):
    """Count n-grams of order `n` over the module-global corpus file `f`.

    Accumulates counts into freqlist[n]; optionally records line indices
    (DOINDEX) and skip-grams for every consecutive gap position (DOSKIPGRAMS).

    Parameters:
        classer  -- encoder applied to each ngram when DOCLASSER is set
        n        -- n-gram order to count
        freqlist -- mapping of order -> frequency list; freqlist[n] is updated
        simpleskipgrams -- mapping of order -> {skipgram: {body: count-or-set}}
        skips    -- unused in this variant (kept for interface compatibility)
        index    -- mapping of ngram -> set of line numbers (when DOINDEX)
        linecount -- total corpus lines for progress percentages (0 = unknown)

    Returns the number of lines processed.
    """
    global DOTOKENIZE, DOCLASSER, DOSKIPGRAMS, DOINDEX, MINLENGTH
    log("Counting "+str(n)+"-grams ...", stream=sys.stderr)
    # NOTE(review): `f` is a module-global file object opened elsewhere;
    # rewind so each n-gram order makes a full fresh pass over the corpus.
    f.seek(0)
    # all (start index, length) placements of a consecutive gap in an n-gram
    gaps = list(consecutivegaps(n))
    for i, line in enumerate(f):
        # periodic progress report every 10000 lines
        if (i % 10000 == 0): 
            if linecount == 0:
                log("\tLine " + str(i+1) + " - (" + str(n) + "-grams)", stream=sys.stderr)
            else:
                log("\tLine " + str(i+1) + " of " + str(linecount) + " - " + str( round(((i+1) / float(linecount)) * 100)) + "% " + " (" + str(n) + "-grams)" , stream=sys.stderr) 
        if DOTOKENIZE: 
            line = crude_tokenizer(line.strip())
        else:
            # pre-tokenized input: split on single spaces, drop empty tokens
            line = [ x for x in line.strip().split(' ') if x ]
        for ngram in Windower(line,n):
            if DOCLASSER: ngram = tuple(classer.encodeseq(ngram))
            if n - 1 in freqlist:
                # apriori pruning: count an n-gram only if both of its
                # (n-1)-gram sub-sequences were themselves counted
                count = (ngram[1:] in freqlist[n-1] and ngram[:-1] in freqlist[n-1])
            else:
                count = True
            if count:
                freqlist[n].count(ngram)
                if DOINDEX:
                    # EAFP: record the line number in the ngram's index set
                    try:
                        index[ngram].add(i)
                    except KeyError:
                        index[ngram] = set((i,))
                if DOSKIPGRAMS and n >= 2 and ngram[0] != '<begin>' and ngram[-1] != '<end>':                    
                    for beginindex, length in gaps:
                        # split the ngram into the context before/after the gap
                        preskip = ngram[:beginindex]
                        postskip = ngram[beginindex+length:]                                                
                        if len(preskip) >= MINLENGTH and not (preskip in freqlist[len(preskip)]):
                            continue #this skip-gram isn't going to make it over the min threshold
                        if len(postskip) >= MINLENGTH and not (postskip in freqlist[len(postskip)]):
                            continue  #this skip-gram isn't going to make it over the min threshold
                    
                        skipgram = (preskip, postskip)                        
                        body = ngram[beginindex:beginindex+length]
                        if not skipgram in simpleskipgrams[n]: #using None key for overall count to save computation time later
                            simpleskipgrams[n][skipgram] = {None: 1}
                        else:
                            simpleskipgrams[n][skipgram][None] += 1
                        # per-body entry: set of line numbers when indexing,
                        # otherwise a plain occurrence count
                        if body in simpleskipgrams[n][skipgram]:
                            if DOINDEX:
                                simpleskipgrams[n][skipgram][body].add(i)
                            else:
                                simpleskipgrams[n][skipgram][body] += 1
                        else:
                            if DOINDEX:
                                simpleskipgrams[n][skipgram][body] = set((i,))
                            else:
                                simpleskipgrams[n][skipgram][body] = 1
                    
    log("Found " + str(len(freqlist[n])) +  " " + str(n) + "-grams and " + str(len(simpleskipgrams[n])) + " skip-grams", stream=sys.stderr)                    
    # NOTE(review): raises NameError on an empty corpus (loop never binds i);
    # assumes at least one line — confirm with callers.
    return i+1
def countngrams(classer, n, freqlist, simpleskipgrams, skips, index, linecount=0):
    """Count n-grams of order `n` over the module-global corpus file `f`.

    Variant that, when DOSKIPGRAMS is set and n >= 3, records one skip-gram
    per ngram: the (first word, last word) pair as context with the whole
    middle section as the gap body.

    Parameters:
        classer  -- encoder applied to each ngram when DOCLASSER is set
        n        -- n-gram order to count
        freqlist -- mapping of order -> frequency list; freqlist[n] is updated
        simpleskipgrams -- mapping of order -> {skipgram: {body: count-or-set}}
        skips    -- unused (only referenced by the commented-out code below)
        index    -- mapping of ngram -> set of line numbers (when DOINDEX)
        linecount -- total corpus lines for progress percentages (0 = unknown)

    Returns the number of lines processed.
    """
    global DOTOKENIZE, DOCLASSER, DOSKIPGRAMS, DOINDEX
    log("Counting "+str(n)+"-grams ...", stream=sys.stderr)
    # NOTE(review): `f` is a module-global file object opened elsewhere;
    # rewind so each n-gram order makes a full fresh pass over the corpus.
    f.seek(0)
    for i, line in enumerate(f):
        # periodic progress report every 10000 lines
        if (i % 10000 == 0): 
            if linecount == 0:
                log("\tLine " + str(i+1) + " - (" + str(n) + "-grams)", stream=sys.stderr)
            else:
                log("\tLine " + str(i+1) + " of " + str(linecount) + " - " + str( round(((i+1) / float(linecount)) * 100)) + "% " + " (" + str(n) + "-grams)" , stream=sys.stderr) 
        if DOTOKENIZE: 
            line = crude_tokenizer(line.strip())
        else:
            # pre-tokenized input: split on single spaces, drop empty tokens
            line = [ x for x in line.strip().split(' ') if x ]
        for ngram in Windower(line,n):
            if DOCLASSER: ngram = tuple(classer.encodeseq(ngram))
            if n - 1 in freqlist:
                # apriori pruning: count an n-gram only if both of its
                # (n-1)-gram sub-sequences were themselves counted
                count = (ngram[1:] in freqlist[n-1] and ngram[:-1] in freqlist[n-1])
            else:
                count = True
            if count:
                freqlist[n].count(ngram)
                if DOINDEX:
                    # EAFP: record the line number in the ngram's index set
                    try:
                        index[ngram].add(i)
                    except KeyError:
                        index[ngram] = set((i,))
                if DOSKIPGRAMS and n >= 3 and ngram[0] != '<begin>' and ngram[-1] != '<end>':
                    # single skip-gram per ngram: outermost words as context,
                    # everything in between as the gap body
                    skipgram =  ( (ngram[0],) , (ngram[-1],) )
                    body = tuple(ngram[1:-1])
                    if not skipgram in simpleskipgrams[n]: #using None key for overall count to save computation time later
                        simpleskipgrams[n][skipgram] = {None: 1}
                    else:
                        simpleskipgrams[n][skipgram][None] += 1
                    # per-body entry: set of line numbers when indexing,
                    # otherwise a plain occurrence count
                    if body in simpleskipgrams[n][skipgram]:
                        if DOINDEX:
                            simpleskipgrams[n][skipgram][body].add(i)
                        else:
                            simpleskipgrams[n][skipgram][body] += 1
                    else:
                        if DOINDEX:
                            simpleskipgrams[n][skipgram][body] = set((i,))
                        else:
                            simpleskipgrams[n][skipgram][body] = 1
                    
                    #simpleskipgrams[n].count( skipgram )                     
                    #try:
                    #    skips[skipgram].append( ngram[1:-1] )
                    #except:
                    #    skips[skipgram] = [ ngram[1:-1] ]
    log("Found " + str(len(freqlist[n])) +  " " + str(n) + "-grams", stream=sys.stderr)                    
    # NOTE(review): raises NameError on an empty corpus (loop never binds i);
    # assumes at least one line — confirm with callers.
    return i+1
Beispiel #5
0
def main():
    """Command-line entry point: generate an n-gram frequency list.

    Parses arguments with argparse, counts (n-gram) types over the given
    files with a FrequencyList, then prints one tab-separated line per type
    (type, count, probability, information content) to stdout and summary
    statistics to stderr.
    """
    parser = argparse.ArgumentParser(description="Generate an n-gram frequency list", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-n','--ngramsize', help="N-gram size", type=int, action='store',default=1)
    parser.add_argument('-i','--caseinsensitive', help="Case insensitive", action="store_true")
    parser.add_argument('-e','--encoding', help="Character encoding", type=str, action='store',default='utf-8')
    parser.add_argument('files', type=str, nargs='+', help="The data sets to sample from, must be of equal size (i.e., same number of lines)")

    args = parser.parse_args()

    # guard clause: nothing to do without input files
    if not args.files:
        print("No files specified", file=sys.stderr)
        sys.exit(1)

    size = args.ngramsize
    freqlist = FrequencyList(None, args.caseinsensitive)
    for path in args.files:
        # context manager closes the file when the pass over it is done
        with io.open(path, 'r', encoding=args.encoding) as infile:
            for line in infile:
                tokens = crude_tokenizer(line)
                freqlist.append(Windower(tokens, size) if size > 1 else tokens)

    dist = Distribution(freqlist)
    for type, count in freqlist:
        # n-gram types come back as sequences; flatten to a printable string
        if isinstance(type, (tuple, list)):
            type = " ".join(type)
        print("\t".join((type, str(count), str(dist[type]), str(dist.information(type)))))

    # summary statistics on stderr so stdout stays machine-readable
    for label, value in (("Tokens:           ", freqlist.tokens()),
                         ("Types:            ", len(freqlist)),
                         ("Type-token ratio: ", freqlist.typetokenratio()),
                         ("Entropy:          ", dist.entropy())):
        print(label, value, file=sys.stderr)
def buildclasser():
    """Build a Classer from the unigram frequencies of the global corpus.

    Reads the module-level `corpusfile` line by line, counts unigrams
    (wrapped in <begin>/<end> markers), constructs a Classer from the
    resulting FrequencyList and saves it to `outputprefix + '.cls'`.

    Returns the constructed Classer.
    """
    global DOTOKENIZE, ENCODING, outputprefix
    log("Counting unigrams (for classer) ...",stream=sys.stderr)
    freqlist = FrequencyList()
    # NOTE(review): opened without an explicit encoding despite the ENCODING
    # global being declared above — platform default applies; confirm intent.
    f = open(corpusfile)
    for i, line in enumerate(f):            
        # periodic progress report every 10000 lines
        if (i % 10000 == 0): 
            log("\tLine " + str(i+1) + " - (classer construction)", stream=sys.stderr)
        if DOTOKENIZE: 
            # NOTE(review): elsewhere in this file crude_tokenizer replaces the
            # split below (if/else), but here the next line still calls
            # .strip().split(' ') on its result — likely a latent bug when
            # DOTOKENIZE is set; confirm crude_tokenizer's return type.
            line = crude_tokenizer(line.strip())
        line = line.strip().split(' ')
        freqlist.append(['<begin>'] + line + ['<end>'])
    f.close()
    
    log("Building classer ...", stream=sys.stderr)
    classer = Classer(freqlist)
    classer.save(outputprefix + '.cls')
    log("\t" + str(len(classer)) + " classes found", stream=sys.stderr)
    return classer    
Beispiel #7
0
    elif o == "-e":
        encoding = a
    else:
        print >>sys.stderr, "ERROR: Unknown option:",o
        sys.exit(1)

# NOTE(review): this example is Python 2 throughout (print statements,
# explicit .encode('utf-8') before printing); it will not run under Python 3.
if not files:
    print >>sys.stderr, "No files specified"
    sys.exit(1)

# build the (n-gram) frequency list over every input file
freqlist = FrequencyList(None, casesensitive)
for filename in files:
    f = codecs.open(filename,'r',encoding)
    for line in f:
        if n > 1:
            freqlist.append(Windower(crude_tokenizer(line),n))
        else:
            freqlist.append(crude_tokenizer(line))

    f.close()

# one tab-separated output line per type:
# type, count, probability, information content
dist = Distribution(freqlist)
for type, count in freqlist:
    if isinstance(type,tuple) or isinstance(type,list):
        type = " ".join(type)
    s =  type + "\t" + str(count) + "\t" + str(dist[type]) + "\t" + str(dist.information(type))
    print s.encode('utf-8')

# summary statistics go to stderr so stdout stays machine-readable
print >>sys.stderr, "Tokens:           ", freqlist.tokens()
print >>sys.stderr, "Types:            ", len(freqlist)
print >>sys.stderr, "Type-token ratio: ", freqlist.typetokenratio()
Beispiel #8
0
    elif o == "-e":
        encoding = a
    else:
        print("ERROR: Unknown option:", o, file=sys.stderr)
        sys.exit(1)

if not files:
    # BUG FIX: was Python 2 print syntax (`print >> sys.stderr, ...`),
    # a SyntaxError under Python 3, which every other print here targets.
    print("No files specified", file=sys.stderr)
    sys.exit(1)

# build the (n-gram) frequency list over every input file
freqlist = FrequencyList(None, casesensitive)
for filename in files:
    # context manager guarantees the file is closed even on error
    with codecs.open(filename, 'r', encoding) as f:
        for line in f:
            if n > 1:
                freqlist.append(Windower(crude_tokenizer(line), n))
            else:
                freqlist.append(crude_tokenizer(line))

# one tab-separated output line per type:
# type, count, probability, information content
dist = Distribution(freqlist)
for type, count in freqlist:
    if isinstance(type, (tuple, list)):
        type = " ".join(type)
    s = type + "\t" + str(count) + "\t" + str(dist[type]) + "\t" + str(
        dist.information(type))
    print(s)

# summary statistics go to stderr so stdout stays machine-readable
print("Tokens:           ", freqlist.tokens(), file=sys.stderr)
print("Types:            ", len(freqlist), file=sys.stderr)
Beispiel #9
0
 def test_tokenize(self):
     """Crude tokeniser: a sample sentence must split into the token list
     held by the module-global `text` fixture."""
     global text
     self.assertEqual(crude_tokenizer("This is a test."),text)
Beispiel #10
0
#!/usr/bin/env python
#-*- coding:utf-8 -*-


from pynlpl.statistics import FrequencyList
from pynlpl.textprocessors import crude_tokenizer, Classer
import sys
import codecs
import asizeof

# Build a frequency list over the corpus named by sys.argv[1] and report the
# memory footprint (via asizeof) of the list and the Classers derived from it.
# NOTE(review): Python 2 syntax (print statements) — not Python 3 compatible.
freqlist = FrequencyList()
f = codecs.open(sys.argv[1], 'r','utf-8')
for line in f:
    line = crude_tokenizer(line.strip())
    freqlist.append(line)    
f.close()

print "FREQLIST:               " ,asizeof.asizeof(freqlist)




classer = Classer(freqlist)
print "CLASSER:                " ,asizeof.asizeof(classer)

# second classer built with the (False, True) flags — presumably
# decoder-only, per the label below; confirm against Classer's signature
classer2 = Classer(freqlist, False,True)
print "CLASSER (ONLY DECODER): " ,asizeof.asizeof(classer2)

freqlist2 = FrequencyList()
f = codecs.open(sys.argv[1], 'r','utf-8')
for line in f: