def test_freqlist_tokencount(self):
    """Frequency List (count tokens)"""
    global sentences
    freqlist = FrequencyList()
    for tokens in sentences:
        freqlist.append(tokens)
    # 13 tokens across all test sentences
    self.assertEqual(freqlist.total, 13)
def test_freqlist_caseinsens(self):
    """Bigram Frequency List (case insensitive)"""
    global sentences
    freqlist = FrequencyList(None, False)
    for tokens in sentences:
        # feed sliding bigram windows rather than raw tokens
        freqlist.append(Windower(tokens, 2))
    self.assertTrue(freqlist[('is', 'a')] == 2 and freqlist[('this', 'is')] == 1)
def test_freqlist_caseinsens(self):
    """Frequency List (case insensitive)"""
    global sentences
    freqlist = FrequencyList(None, False)
    for tokens in sentences:
        freqlist.append(tokens)
    # case-insensitive counting folds 'This'/'this' together
    self.assertTrue(freqlist['sentence'] == 2 and freqlist['this'] == 2 and freqlist['Test'] == 1)
def test_freqlist_caseinsens(self):
    """Bigram Frequency List (case insensitive)"""
    global sentences
    fl = FrequencyList(None, False)
    for sent in sentences:
        fl.append(Windower(sent, 2))
    bigrams_ok = fl[('is', 'a')] == 2 and fl[('this', 'is')] == 1
    self.assertTrue(bigrams_ok)
def test_freqlist_tokencount(self):
    """Frequency List (count tokens)"""
    global sentences
    fl = FrequencyList()
    for sent in sentences:
        fl.append(sent)
    # total token count over the fixture sentences
    self.assertEqual(fl.total, 13)
def test_freqlist_typecount(self):
    """Frequency List (count types)"""
    global sentences
    freqlist = FrequencyList()
    for tokens in sentences:
        freqlist.append(tokens)
    # len() of a FrequencyList is the number of distinct types
    self.assertEqual(len(freqlist), 9)
def test_freqlist_typecount(self):
    """Frequency List (count types)"""
    global sentences
    fl = FrequencyList()
    for sent in sentences:
        fl.append(sent)
    typecount = len(fl)
    self.assertEqual(typecount, 9)
def test_freqlist_caseinsens(self):
    """Frequency List (case insensitive)"""
    global sentences
    fl = FrequencyList(None, False)
    for sent in sentences:
        fl.append(sent)
    counts_ok = fl['sentence'] == 2 and fl['this'] == 2 and fl['Test'] == 1
    self.assertTrue(counts_ok)
def buildclasser(file):
    """Build a Classer from a whitespace-tokenised text file.

    Each line of *file* is stripped and split on single spaces; the resulting
    token lists feed a FrequencyList, from which the Classer is constructed.

    :param file: path to the input text file (one sentence per line)
    :return: a Classer built over the file's token frequencies

    NOTE(review): the parameter name shadows the ``file`` builtin; it is kept
    unchanged for backward compatibility with existing callers.
    """
    freqlist = FrequencyList()
    # Context manager guarantees the handle is closed even if reading raises;
    # the original open()/close() pair leaked it on exception.
    with open(file, 'r') as f:
        for line in f:
            freqlist.append(line.strip().split(' '))
    return Classer(freqlist)
def buildfromtext(self, files, encoding='utf-8'):
    """Build from one or more plain-text files (whitespace-tokenised, one sentence per line)."""
    if isinstance(files, str):
        # accept a single filename as well as a list of them
        files = [files]
    freqlist = FrequencyList()
    for filename in files:
        with open(filename, 'r', encoding=encoding) as infile:
            for line in infile:
                freqlist.append(line.strip().split())
    self.buildfromfreqlist(freqlist)
def buildfromtext(self, files, encoding='utf-8'):
    """Build from plain-text input; *files* may be one filename or a list of them."""
    freqlist = FrequencyList()
    filenames = [files] if isinstance(files, str) else files
    for filename in filenames:
        with open(filename, 'r', encoding=encoding) as stream:
            for line in stream:
                tokens = line.strip().split()
                freqlist.append(tokens)
    self.buildfromfreqlist(freqlist)
def buildfromfolia(self, files, encoding='utf-8'):
    """Build from one or more FoLiA XML documents; *files* may be a single filename or a list.

    NOTE(review): *encoding* is accepted but unused here (folia.Document handles
    its own decoding) — kept for interface compatibility.
    """
    if isinstance(files, str):
        files = [files]
    freqlist = FrequencyList()
    for filename in files:
        doc = folia.Document(file=filename)
        for sentence in doc.sentences():
            freqlist.append(sentence.toktext().split(' '))
    self.buildfromfreqlist(freqlist)
def main():
    """Command-line entry point: print an n-gram frequency list for the given files.

    Options: -n <size> (n-gram size), -i (case insensitive), -e <encoding>,
    -h/--help (usage). Writes the frequency table to stdout and summary
    statistics to stderr.
    """
    try:
        opts, files = getopt.getopt(sys.argv[1:], "hn:ie:", ["help"])
    except getopt.GetoptError as err:
        # print help information and exit:
        print(str(err), file=sys.stderr)
        usage()
        sys.exit(2)
    casesensitive = True
    encoding = 'utf-8'
    n = 1
    for o, a in opts:
        if o == "-n":
            n = int(a)
        elif o == "-i":
            casesensitive = False
        elif o == "-e":
            encoding = a
        elif o in ("-h", "--help"):
            # -h/--help is accepted by getopt but previously fell through to
            # the "unknown option" error; handle it explicitly.
            usage()
            sys.exit(0)
        else:
            print("ERROR: Unknown option:", o, file=sys.stderr)
            sys.exit(1)
    if not files:
        # BUGFIX: was Python 2 statement syntax (`print >> sys.stderr, ...`),
        # a SyntaxError under Python 3 where the rest of this function lives.
        print("No files specified", file=sys.stderr)
        sys.exit(1)
    freqlist = FrequencyList(None, casesensitive)
    for filename in files:
        # context manager: the original leaked the handle if reading raised
        with codecs.open(filename, 'r', encoding) as f:
            for line in f:
                if n > 1:
                    freqlist.append(Windower(crude_tokenizer(line), n))
                else:
                    freqlist.append(crude_tokenizer(line))
    dist = Distribution(freqlist)
    for wordtype, count in freqlist:
        # renamed from `type` (shadowed the builtin); n-grams print space-joined
        if isinstance(wordtype, (tuple, list)):
            wordtype = " ".join(wordtype)
        s = wordtype + "\t" + str(count) + "\t" + str(dist[wordtype]) + "\t" + str(dist.information(wordtype))
        print(s)
    print("Tokens: ", freqlist.tokens(), file=sys.stderr)
    print("Types: ", len(freqlist), file=sys.stderr)
    print("Type-token ratio: ", freqlist.typetokenratio(), file=sys.stderr)
    print("Entropy: ", dist.entropy(), file=sys.stderr)
def main():
    """CLI entry point: emit an n-gram frequency list for the input files.

    -n sets the n-gram size, -i makes counting case insensitive, -e sets the
    character encoding, -h/--help prints usage. Table goes to stdout, summary
    statistics to stderr.
    """
    try:
        opts, files = getopt.getopt(sys.argv[1:], "hn:ie:", ["help"])
    except getopt.GetoptError as err:
        # print help information and exit:
        print(str(err), file=sys.stderr)
        usage()
        sys.exit(2)
    casesensitive = True
    encoding = 'utf-8'
    n = 1
    for o, a in opts:
        if o == "-n":
            n = int(a)
        elif o == "-i":
            casesensitive = False
        elif o == "-e":
            encoding = a
        elif o in ("-h", "--help"):
            # BUGFIX: -h/--help is in the getopt spec but previously hit the
            # "unknown option" branch; handle it properly.
            usage()
            sys.exit(0)
        else:
            print("ERROR: Unknown option:", o, file=sys.stderr)
            sys.exit(1)
    if not files:
        # BUGFIX: `print >>sys.stderr, ...` is Python 2 statement syntax and a
        # SyntaxError in this otherwise Python 3 function.
        print("No files specified", file=sys.stderr)
        sys.exit(1)
    freqlist = FrequencyList(None, casesensitive)
    for filename in files:
        # with-statement closes the file even when an exception occurs
        with codecs.open(filename, 'r', encoding) as f:
            for line in f:
                if n > 1:
                    freqlist.append(Windower(crude_tokenizer(line), n))
                else:
                    freqlist.append(crude_tokenizer(line))
    dist = Distribution(freqlist)
    for wordtype, count in freqlist:
        # `wordtype` avoids shadowing the `type` builtin
        if isinstance(wordtype, (tuple, list)):
            wordtype = " ".join(wordtype)
        print(wordtype + "\t" + str(count) + "\t" + str(dist[wordtype]) + "\t" + str(dist.information(wordtype)))
    print("Tokens: ", freqlist.tokens(), file=sys.stderr)
    print("Types: ", len(freqlist), file=sys.stderr)
    print("Type-token ratio: ", freqlist.typetokenratio(), file=sys.stderr)
    print("Entropy: ", dist.entropy(), file=sys.stderr)
def buildclasser():
    """Count unigrams over the module-level corpus file, build a Classer, save it.

    Reads the global ``corpusfile`` line by line, wraps each tokenised line in
    <begin>/<end> markers, counts frequencies, then constructs a Classer and
    saves it to ``outputprefix + '.cls'``.

    :return: the constructed Classer
    """
    global DOTOKENIZE, ENCODING, outputprefix
    log("Counting unigrams (for classer) ...", stream=sys.stderr)
    freqlist = FrequencyList()
    # with-statement: the original open()/close() pair leaked the handle if an
    # exception occurred mid-read.
    with open(corpusfile) as f:
        for i, line in enumerate(f):
            if i % 10000 == 0:
                # periodic progress report
                log("\tLine " + str(i + 1) + " - (classer construction)", stream=sys.stderr)
            if DOTOKENIZE:
                # NOTE(review): assumes crude_tokenizer returns a str here,
                # since the next line calls .strip().split() on it — verify.
                line = crude_tokenizer(line.strip())
            line = line.strip().split(' ')
            freqlist.append(['<begin>'] + line + ['<end>'])
    log("Building classer ...", stream=sys.stderr)
    classer = Classer(freqlist)
    classer.save(outputprefix + '.cls')
    log("\t" + str(len(classer)) + " classes found", stream=sys.stderr)
    return classer
def main():
    """Command-line entry point (argparse variant): print an n-gram frequency list.

    Reads each input file, tokenises it, counts (optionally case-insensitive)
    n-grams, and prints type/count/probability/information per line to stdout,
    with summary statistics on stderr.
    """
    parser = argparse.ArgumentParser(description="Generate an n-gram frequency list", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-n', '--ngramsize', help="N-gram size", type=int, action='store', default=1)
    parser.add_argument('-i', '--caseinsensitive', help="Case insensitive", action="store_true")
    parser.add_argument('-e', '--encoding', help="Character encoding", type=str, action='store', default='utf-8')
    parser.add_argument('files', type=str, nargs='+', help="The data sets to sample from, must be of equal size (i.e., same number of lines)")
    args = parser.parse_args()
    if not args.files:
        print("No files specified", file=sys.stderr)
        sys.exit(1)
    freqlist = FrequencyList(None, args.caseinsensitive)
    for filename in args.files:
        # context manager: the original left the file open if reading raised
        with io.open(filename, 'r', encoding=args.encoding) as f:
            for line in f:
                if args.ngramsize > 1:
                    freqlist.append(Windower(crude_tokenizer(line), args.ngramsize))
                else:
                    freqlist.append(crude_tokenizer(line))
    dist = Distribution(freqlist)
    for wordtype, count in freqlist:
        # renamed from `type`, which shadowed the builtin
        if isinstance(wordtype, (tuple, list)):
            wordtype = " ".join(wordtype)
        print(wordtype + "\t" + str(count) + "\t" + str(dist[wordtype]) + "\t" + str(dist.information(wordtype)))
    print("Tokens: ", freqlist.tokens(), file=sys.stderr)
    print("Types: ", len(freqlist), file=sys.stderr)
    print("Type-token ratio: ", freqlist.typetokenratio(), file=sys.stderr)
    print("Entropy: ", dist.entropy(), file=sys.stderr)
#!/usr/bin/env python #-*- coding:utf-8 -*- from pynlpl.textprocessors import Classer from pynlpl.statistics import FrequencyList import sys filename = sys.argv[1] print >>sys.stderr, "Counting tokens" f = open(filename) freqlist = FrequencyList() for i, line in enumerate(f): if (i % 10000 == 0): print >>sys.stderr, "\tLine " + str(i+1) line = ['<s>'] + line.strip().split(' ') + ['</s>'] freqlist.append(line) f.close() print >>sys.stderr, "Building classer" classer = Classer(freqlist, filesupport=True ) classer.save(filename + '.cls') print >>sys.stderr, "Encoding data" classer.encodefile(filename, filename + '.clsenc')
elif o == "-e": encoding = a else: print >>sys.stderr, "ERROR: Unknown option:",o sys.exit(1) if not files: print >>sys.stderr, "No files specified" sys.exit(1) freqlist = FrequencyList(None, casesensitive) for filename in files: f = codecs.open(filename,'r',encoding) for line in f: if n > 1: freqlist.append(Windower(crude_tokenizer(line),n)) else: freqlist.append(crude_tokenizer(line)) f.close() dist = Distribution(freqlist) for type, count in freqlist: if isinstance(type,tuple) or isinstance(type,list): type = " ".join(type) s = type + "\t" + str(count) + "\t" + str(dist[type]) + "\t" + str(dist.information(type)) print s.encode('utf-8') print >>sys.stderr, "Tokens: ", freqlist.tokens() print >>sys.stderr, "Types: ", len(freqlist) print >>sys.stderr, "Type-token ratio: ", freqlist.typetokenratio()
elif o == "-e": encoding = a else: print("ERROR: Unknown option:", o, file=sys.stderr) sys.exit(1) if not files: print >> sys.stderr, "No files specified" sys.exit(1) freqlist = FrequencyList(None, casesensitive) for filename in files: f = codecs.open(filename, 'r', encoding) for line in f: if n > 1: freqlist.append(Windower(crude_tokenizer(line), n)) else: freqlist.append(crude_tokenizer(line)) f.close() dist = Distribution(freqlist) for type, count in freqlist: if isinstance(type, tuple) or isinstance(type, list): type = " ".join(type) s = type + "\t" + str(count) + "\t" + str(dist[type]) + "\t" + str( dist.information(type)) print(s) print("Tokens: ", freqlist.tokens(), file=sys.stderr) print("Types: ", len(freqlist), file=sys.stderr)
class WordAlignment(object):
    """Naive co-occurrence-based word alignment between parallel corpora.

    train() counts, for every source token, the frequencies of the target
    tokens it co-occurs with (and vice versa). test() then aligns each token
    to the co-occurring counterpart with the highest count normalised by the
    counterpart's corpus frequency.
    """

    def __init__(self, casesensitive=False):
        # when False, the FrequencyLists fold case during counting
        self.casesensitive = casesensitive

    def train(self, sourcefile, targetfile):
        """Collect co-occurrence frequency lists from two parallel files (one sentence per line).

        :param sourcefile: path to the source-language file
        :param targetfile: path to the target-language file (line-aligned with sourcefile)
        """
        sourcefile = open(sourcefile)
        targetfile = open(targetfile)

        self.sourcefreqlist = FrequencyList(None, self.casesensitive)
        self.targetfreqlist = FrequencyList(None, self.casesensitive)

        #frequency lists: token -> FrequencyList of co-occurring tokens on the other side
        self.source2target = {}
        self.target2source = {}

        for sourceline, targetline in zip(sourcefile, targetfile):
            sourcetokens = sourceline.split()
            targettokens = targetline.split()

            self.sourcefreqlist.append(sourcetokens)
            self.targetfreqlist.append(targettokens)

            for sourcetoken in sourcetokens:
                if sourcetoken not in self.source2target:
                    # first sighting: seed a new FrequencyList with this sentence's targets
                    self.source2target[sourcetoken] = FrequencyList(targettokens, self.casesensitive)
                else:
                    self.source2target[sourcetoken].append(targettokens)

            for targettoken in targettokens:
                if targettoken not in self.target2source:
                    self.target2source[targettoken] = FrequencyList(sourcetokens, self.casesensitive)
                else:
                    self.target2source[targettoken].append(sourcetokens)

        sourcefile.close()
        targetfile.close()

    def test(self, sourcefile, targetfile):
        """Yield (sourcetokens, targettokens, S2Talignment, T2Salignment) per sentence pair.

        Alignments are lists of indices into the opposite sentence (None when
        no co-occurring token was seen in training).

        NOTE(review): a token absent from training raises KeyError on the
        source2target/target2source lookup — confirm inputs are covered by
        the training data.
        """
        sourcefile = open(sourcefile)
        targetfile = open(targetfile)

        #stage 2
        for sourceline, targetline in zip(sourcefile, targetfile):
            sourcetokens = sourceline.split()
            targettokens = targetline.split()

            S2Talignment = []
            T2Salignment = []

            for sourcetoken in sourcetokens:
                #which of the target-tokens is most frequent?
                besttoken = None
                bestscore = -1
                for i, targettoken in enumerate(targettokens):
                    if targettoken in self.source2target[sourcetoken]:
                        score = self.source2target[sourcetoken][targettoken] / float(self.targetfreqlist[targettoken])
                        if score > bestscore:
                            # BUGFIX: the original stored the raw co-occurrence
                            # count here instead of the normalised score it was
                            # comparing against, mixing units in the max search.
                            bestscore = score
                            besttoken = i
                S2Talignment.append(besttoken) #TODO: multi-alignment?

            for targettoken in targettokens:
                besttoken = None
                bestscore = -1
                for i, sourcetoken in enumerate(sourcetokens):
                    if sourcetoken in self.target2source[targettoken]:
                        score = self.target2source[targettoken][sourcetoken] / float(self.sourcefreqlist[sourcetoken])
                        if score > bestscore:
                            bestscore = score  # BUGFIX: was the raw count (same defect as above)
                            besttoken = i
                T2Salignment.append(besttoken) #TODO: multi-alignment?

            yield sourcetokens, targettokens, S2Talignment, T2Salignment

        sourcefile.close()
        targetfile.close()
#!/usr/bin/env python #-*- coding:utf-8 -*- from pynlpl.statistics import FrequencyList from pynlpl.textprocessors import crude_tokenizer, Classer import sys import codecs import asizeof freqlist = FrequencyList() f = codecs.open(sys.argv[1], 'r','utf-8') for line in f: line = crude_tokenizer(line.strip()) freqlist.append(line) f.close() print "FREQLIST: " ,asizeof.asizeof(freqlist) classer = Classer(freqlist) print "CLASSER: " ,asizeof.asizeof(classer) classer2 = Classer(freqlist, False,True) print "CLASSER (ONLY DECODER): " ,asizeof.asizeof(classer2) freqlist2 = FrequencyList() f = codecs.open(sys.argv[1], 'r','utf-8') for line in f: