Example 1
 def test_freqlist_tokencount(self):
     """Frequency List (count tokens)"""
     global sentences
     f = FrequencyList()
     for sentence in sentences:
         f.append(sentence)
     self.assertEqual(f.total, 13)
Example 2
 def test_freqlist_caseinsens(self):
     """Bigram Frequency List (case insensitive)"""
     global sentences
     f = FrequencyList(None, False)
     for sentence in sentences:
         f.append(Windower(sentence, 2))
     self.assertTrue((f[('is', 'a')] == 2 and f[('this', 'is')] == 1))
Example 3
 def test_freqlist_caseinsens(self):
     """Frequency List (case insensitive)"""
     global sentences
     f = FrequencyList(None, False)
     for sentence in sentences:
         f.append(sentence)
     self.assertTrue(f['sentence'] == 2 and f['this'] == 2 and f['Test'] == 1)
Example 4
 def test_freqlist_caseinsens(self):
     """Bigram Frequency List (case insensitive)"""
     global sentences
     f = FrequencyList(None, False)
     for sentence in sentences:
         f.append(Windower(sentence, 2))
     self.assertTrue(f[('is', 'a')] == 2 and f[('this', 'is')] == 1)
Example 5
 def test_freqlist_tokencount(self):
     """Frequency List (count tokens)"""
     global sentences
     f = FrequencyList()
     for sentence in sentences:
         f.append(sentence)
     self.assertEqual(f.total, 13)
Example 6
 def test_freqlist_typecount(self):
     """Frequency List (count types)"""
     global sentences
     f = FrequencyList()
     for sentence in sentences:
         f.append(sentence)
     self.assertEqual(len(f), 9)
Example 7
 def test_freqlist_typecount(self):
     """Frequency List (count types)"""
     global sentences
     f = FrequencyList()
     for sentence in sentences:
         f.append(sentence)
     self.assertEqual(len(f), 9)
Example 8
 def test_freqlist_caseinsens(self):
     """Frequency List (case insensitive)"""
     global sentences
     f = FrequencyList(None, False)
     for sentence in sentences:
         f.append(sentence)
     self.assertTrue((f['sentence'] == 2 and f['this'] == 2
                      and f['Test'] == 1))
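
The tests above all exercise the same small FrequencyList surface: append() to count a list of tokens, total for the token count, len() for the type count, item lookup for per-type counts, the (tokens, casesensitive) constructor arguments for case-insensitive counting, and Windower for n-gram counting. A minimal, self-contained sketch of that usage follows; the sample sentences are made up for illustration and are not the tests' module-level sentences fixture.

from pynlpl.statistics import FrequencyList
from pynlpl.textprocessors import Windower

# Illustrative data only; the tests rely on their own `sentences` fixture.
sample = [
    "This is a test".split(),
    "this is a sentence".split(),
]

# Default: case-sensitive unigram counts
f = FrequencyList()
for sentence in sample:
    f.append(sentence)
print(f.total)      # total number of tokens counted
print(len(f))       # number of distinct types
print(f['This'])    # count for a single type

# Case-insensitive counting: pass casesensitive=False as the second argument
fi = FrequencyList(None, False)
for sentence in sample:
    fi.append(sentence)
print(fi['this'])   # 'This' and 'this' are counted as the same type

# Bigram counts by feeding Windower output to append()
fb = FrequencyList(None, False)
for sentence in sample:
    fb.append(Windower(sentence, 2))
print(fb[('is', 'a')])
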
Example 9
def buildclasser(file):
    freqlist = FrequencyList()
    f = open(file,'r')
    for line in f:
        line = line.strip()
        freqlist.append(line.split(' '))
    f.close()
    return Classer(freqlist)
Example 10
        def buildfromtext(self, files, encoding='utf-8'):
            freqlist = FrequencyList()
            if isinstance(files, str): files = [files]
            for filename in files:
                with open(filename, 'r', encoding=encoding) as f:
                    for line in f:
                        tokens = line.strip().split()
                        freqlist.append(tokens)

            self.buildfromfreqlist(freqlist)
Example 11
        def buildfromtext(self, files, encoding='utf-8'):
            freqlist = FrequencyList()
            if isinstance(files, str): files = [files]
            for filename in files:
                with open(filename, 'r',encoding=encoding) as f:
                    for line in f:
                        tokens = line.strip().split()
                        freqlist.append(tokens)

            self.buildfromfreqlist(freqlist)
Example 12
        def buildfromfolia(self, files, encoding='utf-8'):
            freqlist = FrequencyList()
            if isinstance(files, str): files = [files]
            for filename in files:
                f = folia.Document(file=filename)
                for sentence in f.sentences():
                    tokens = sentence.toktext().split(' ')
                    freqlist.append(tokens)

            self.buildfromfreqlist(freqlist)
Example 13
        def buildfromfolia(self, files, encoding='utf-8'):
            freqlist = FrequencyList()
            if isinstance(files, str): files = [files]
            for filename in files:
                f = folia.Document(file=filename)
                for sentence in f.sentences():
                    tokens = sentence.toktext().split(' ')
                    freqlist.append(tokens)


            self.buildfromfreqlist(freqlist)
Example 14
def main():
    try:
        opts, files = getopt.getopt(sys.argv[1:], "hn:ie:", ["help"])
    except getopt.GetoptError as err:
        # print help information and exit:
        print(str(err), file=sys.stderr)
        usage()
        sys.exit(2)

    testsetsize = devsetsize = 0
    casesensitive = True
    encoding = 'utf-8'
    n = 1

    for o, a in opts:
        if o == "-n":
            n = int(a)
        elif o == "-i":
            casesensitive = False
        elif o == "-e":
            encoding = a
        else:
            print("ERROR: Unknown option:", o, file=sys.stderr)
            sys.exit(1)

    if not files:
        print("No files specified", file=sys.stderr)
        sys.exit(1)

    freqlist = FrequencyList(None, casesensitive)
    for filename in files:
        f = codecs.open(filename, 'r', encoding)
        for line in f:
            if n > 1:
                freqlist.append(Windower(crude_tokenizer(line), n))
            else:
                freqlist.append(crude_tokenizer(line))

        f.close()

    dist = Distribution(freqlist)
    for type, count in freqlist:
        if isinstance(type, tuple) or isinstance(type, list):
            type = " ".join(type)
        s = type + "\t" + str(count) + "\t" + str(dist[type]) + "\t" + str(
            dist.information(type))
        print(s)

    print("Tokens:           ", freqlist.tokens(), file=sys.stderr)
    print("Types:            ", len(freqlist), file=sys.stderr)
    print("Type-token ratio: ", freqlist.typetokenratio(), file=sys.stderr)
    print("Entropy:          ", dist.entropy(), file=sys.stderr)
Example 15
def main():
    try:
        opts, files = getopt.getopt(sys.argv[1:], "hn:ie:", ["help"])
    except getopt.GetoptError as err:
        # print help information and exit:
        print(str(err),file=sys.stderr)
        usage()
        sys.exit(2)

    testsetsize = devsetsize = 0
    casesensitive = True
    encoding = 'utf-8'
    n = 1

    for o, a in opts:
        if o == "-n":
            n = int(a)
        elif o == "-i":
            casesensitive =  False
        elif o == "-e":
            encoding = a
        else:
            print("ERROR: Unknown option:",o,file=sys.stderr)
            sys.exit(1)

    if not files:
        print("No files specified", file=sys.stderr)
        sys.exit(1)

    freqlist = FrequencyList(None, casesensitive)
    for filename in files:
        f = codecs.open(filename,'r',encoding)
        for line in f:
            if n > 1:
                freqlist.append(Windower(crude_tokenizer(line),n))
            else:
                freqlist.append(crude_tokenizer(line))

        f.close()

    dist = Distribution(freqlist)
    for type, count in freqlist:
        if isinstance(type,tuple) or isinstance(type,list):
            type = " ".join(type)
        s =  type + "\t" + str(count) + "\t" + str(dist[type]) + "\t" + str(dist.information(type))
        print(s)

    print("Tokens:           ", freqlist.tokens(),file=sys.stderr)
    print("Types:            ", len(freqlist),file=sys.stderr)
    print("Type-token ratio: ", freqlist.typetokenratio(),file=sys.stderr)
    print("Entropy:          ", dist.entropy(),file=sys.stderr)
Example 16
def buildclasser():
    global DOTOKENIZE, ENCODING, outputprefix
    log("Counting unigrams (for classer) ...",stream=sys.stderr)
    freqlist = FrequencyList()
    f = open(corpusfile)
    for i, line in enumerate(f):            
        if (i % 10000 == 0): 
            log("\tLine " + str(i+1) + " - (classer construction)", stream=sys.stderr)
        if DOTOKENIZE: 
            line = crude_tokenizer(line.strip())
        line = line.strip().split(' ')
        freqlist.append(['<begin>'] + line + ['<end>'])
    f.close()
    
    log("Building classer ...", stream=sys.stderr)
    classer = Classer(freqlist)
    classer.save(outputprefix + '.cls')
    log("\t" + str(len(classer)) + " classes found", stream=sys.stderr)
    return classer    
Example 17
def main():
    parser = argparse.ArgumentParser(description="Generate an n-gram frequency list", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-n','--ngramsize', help="N-gram size", type=int, action='store',default=1)
    parser.add_argument('-i','--caseinsensitive', help="Case insensitive", action="store_true")
    parser.add_argument('-e','--encoding', help="Character encoding", type=str, action='store',default='utf-8')
    parser.add_argument('files', type=str, nargs='+', help="The data sets to sample from, must be of equal size (i.e., same number of lines)")


    args = parser.parse_args()

    if not args.files:
        print("No files specified", file=sys.stderr)
        sys.exit(1)

    freqlist = FrequencyList(None, args.caseinsensitive)
    for filename in args.files:
        f = io.open(filename,'r',encoding=args.encoding)
        for line in f:
            if args.ngramsize > 1:
                freqlist.append(Windower(crude_tokenizer(line),args.ngramsize))
            else:
                freqlist.append(crude_tokenizer(line))

        f.close()

    dist = Distribution(freqlist)
    for type, count in freqlist:
        if isinstance(type,tuple) or isinstance(type,list):
            type = " ".join(type)
        s =  type + "\t" + str(count) + "\t" + str(dist[type]) + "\t" + str(dist.information(type))
        print(s)

    print("Tokens:           ", freqlist.tokens(),file=sys.stderr)
    print("Types:            ", len(freqlist),file=sys.stderr)
    print("Type-token ratio: ", freqlist.typetokenratio(),file=sys.stderr)
    print("Entropy:          ", dist.entropy(),file=sys.stderr)
Example 18
#!/usr/bin/env python
#-*- coding:utf-8 -*-

from pynlpl.textprocessors import Classer
from pynlpl.statistics import FrequencyList
import sys

filename = sys.argv[1]

print >>sys.stderr, "Counting tokens"
f = open(filename)
freqlist = FrequencyList()
for i, line in enumerate(f):            
    if (i % 10000 == 0): 
        print >>sys.stderr, "\tLine " + str(i+1)
    line = ['<s>'] + line.strip().split(' ') + ['</s>']
    
    freqlist.append(line)
f.close()

print >>sys.stderr, "Building classer"
classer = Classer(freqlist, filesupport=True )
classer.save(filename + '.cls')

print >>sys.stderr, "Encoding data"
classer.encodefile(filename, filename + '.clsenc')
Example 19
    elif o == "-e":
        encoding = a
    else:
        print >>sys.stderr, "ERROR: Unknown option:",o
        sys.exit(1)

if not files:
    print >>sys.stderr, "No files specified"
    sys.exit(1)

freqlist = FrequencyList(None, casesensitive)
for filename in files:
    f = codecs.open(filename,'r',encoding)
    for line in f:
        if n > 1:
            freqlist.append(Windower(crude_tokenizer(line),n))
        else:
            freqlist.append(crude_tokenizer(line))

    f.close()

dist = Distribution(freqlist)
for type, count in freqlist:
    if isinstance(type,tuple) or isinstance(type,list):
        type = " ".join(type)
    s =  type + "\t" + str(count) + "\t" + str(dist[type]) + "\t" + str(dist.information(type))
    print s.encode('utf-8')

print >>sys.stderr, "Tokens:           ", freqlist.tokens()
print >>sys.stderr, "Types:            ", len(freqlist)
print >>sys.stderr, "Type-token ratio: ", freqlist.typetokenratio()
Example 20
    elif o == "-e":
        encoding = a
    else:
        print("ERROR: Unknown option:", o, file=sys.stderr)
        sys.exit(1)

if not files:
    print("No files specified", file=sys.stderr)
    sys.exit(1)

freqlist = FrequencyList(None, casesensitive)
for filename in files:
    f = codecs.open(filename, 'r', encoding)
    for line in f:
        if n > 1:
            freqlist.append(Windower(crude_tokenizer(line), n))
        else:
            freqlist.append(crude_tokenizer(line))

    f.close()

dist = Distribution(freqlist)
for type, count in freqlist:
    if isinstance(type, tuple) or isinstance(type, list):
        type = " ".join(type)
    s = type + "\t" + str(count) + "\t" + str(dist[type]) + "\t" + str(
        dist.information(type))
    print(s)

print("Tokens:           ", freqlist.tokens(), file=sys.stderr)
print("Types:            ", len(freqlist), file=sys.stderr)
Example 21
class WordAlignment(object):
    def __init__(self, casesensitive=False):
        self.casesensitive = casesensitive

    def train(self, sourcefile, targetfile):
        sourcefile = open(sourcefile)
        targetfile = open(targetfile)

        self.sourcefreqlist = FrequencyList(None, self.casesensitive)
        self.targetfreqlist = FrequencyList(None, self.casesensitive)

        #frequency lists
        self.source2target = {}
        self.target2source = {}

        for sourceline, targetline in zip(sourcefile, targetfile):
            sourcetokens = sourceline.split()
            targettokens = targetline.split()

            self.sourcefreqlist.append(sourcetokens)
            self.targetfreqlist.append(targettokens)

            for sourcetoken in sourcetokens:
                if not sourcetoken in self.source2target:
                    self.source2target[sourcetoken] = FrequencyList(
                        targettokens, self.casesensitive)
                else:
                    self.source2target[sourcetoken].append(targettokens)

            for targettoken in targettokens:
                if not targettoken in self.target2source:
                    self.target2source[targettoken] = FrequencyList(
                        sourcetokens, self.casesensitive)
                else:
                    self.target2source[targettoken].append(sourcetokens)

        sourcefile.close()
        targetfile.close()

    def test(self, sourcefile, targetfile):
        sourcefile = open(sourcefile)
        targetfile = open(targetfile)

        #stage 2
        for sourceline, targetline in zip(sourcefile, targetfile):
            sourcetokens = sourceline.split()
            targettokens = targetline.split()

            S2Talignment = []
            T2Salignment = []

            for sourcetoken in sourcetokens:
                #which of the target-tokens is most frequent?
                besttoken = None
                bestscore = -1
                for i, targettoken in enumerate(targettokens):
                    if targettoken in self.source2target[sourcetoken]:
                        score = self.source2target[sourcetoken][
                            targettoken] / float(
                                self.targetfreqlist[targettoken])
                        if score > bestscore:
                            bestscore = self.source2target[sourcetoken][
                                targettoken]
                            besttoken = i
                S2Talignment.append(besttoken)  #TODO: multi-alignment?

            for targettoken in targettokens:
                besttoken = None
                bestscore = -1
                for i, sourcetoken in enumerate(sourcetokens):
                    if sourcetoken in self.target2source[targettoken]:
                        score = self.target2source[targettoken][
                            sourcetoken] / float(
                                self.sourcefreqlist[sourcetoken])
                        if score > bestscore:
                            bestscore = self.target2source[targettoken][
                                sourcetoken]
                            besttoken = i
                T2Salignment.append(besttoken)  #TODO: multi-alignment?

            yield sourcetokens, targettokens, S2Talignment, T2Salignment

        sourcefile.close()
        targetfile.close()
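
A possible way to drive the WordAlignment class above, sketched with hypothetical file names for a parallel corpus; note that test() as written will raise a KeyError for any token that was never seen during training.

# Hypothetical usage of WordAlignment; the file names are made up for illustration.
aligner = WordAlignment(casesensitive=False)
aligner.train('train.source.txt', 'train.target.txt')

for sourcetokens, targettokens, s2t, t2s in aligner.test('test.source.txt', 'test.target.txt'):
    # s2t[i] holds the index of the target token aligned to source token i, or None
    for i, j in enumerate(s2t):
        print(sourcetokens[i], '->', targettokens[j] if j is not None else '-')
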
Example 22
#!/usr/bin/env python
#-*- coding:utf-8 -*-


from pynlpl.statistics import FrequencyList
from pynlpl.textprocessors import crude_tokenizer, Classer
import sys
import codecs
import asizeof

freqlist = FrequencyList()
f = codecs.open(sys.argv[1], 'r','utf-8')
for line in f:
    line = crude_tokenizer(line.strip())
    freqlist.append(line)    
f.close()

print "FREQLIST:               " ,asizeof.asizeof(freqlist)




classer = Classer(freqlist)
print "CLASSER:                " ,asizeof.asizeof(classer)

classer2 = Classer(freqlist, False,True)
print "CLASSER (ONLY DECODER): " ,asizeof.asizeof(classer2)

freqlist2 = FrequencyList()
f = codecs.open(sys.argv[1], 'r','utf-8')
for line in f: