def test_freqlist_caseinsens(self):
    """Bigram Frequency List (case insensitive)"""
    global sentences
    f = FrequencyList(None, False)
    for sentence in sentences:
        f.append(Windower(sentence, 2))
    self.assertTrue(f[('is', 'a')] == 2 and f[('this', 'is')] == 1)
def test_freqlist_typecount(self):
    """Frequency List (count types)"""
    global sentences
    f = FrequencyList()
    for sentence in sentences:
        f.append(sentence)
    self.assertEqual(len(f), 9)
def process(filename):
    print >> sys.stderr, "Processing " + filename
    doc = folia.Document(file=filename)
    freqlist = FrequencyList()
    if settings.n == 1:
        for word in doc.words():
            text = word.toktext()
            if not settings.casesensitive:  # lowercase only when counting case-insensitively
                text = text.lower()
            freqlist.count(text)
    elif settings.sentencemarkers:
        for sentence in doc.sentences():
            for ngram in Windower(sentence.words(), settings.n):
                text = ' '.join([x.toktext() for x in ngram])
                if not settings.casesensitive:
                    text = text.lower()
                freqlist.count(text)
    else:
        for sentence in doc.sentences():
            for ngram in Windower(sentence.words(), settings.n, None, None):
                text = ' '.join([x.toktext() for x in ngram])
                if not settings.casesensitive:
                    text = text.lower()
                freqlist.count(text)
    if settings.autooutput:
        if filename[-len(settings.extension) - 1:].lower() == '.' + settings.extension:
            outfilename = filename[:-len(settings.extension) - 1] + '.freqlist'
        else:
            outfilename = filename + '.freqlist'
        freqlist.save(outfilename, True)
    return freqlist
def test_freqlist_caseinsens(self):
    """Frequency List (case insensitive)"""
    global sentences
    f = FrequencyList(None, False)
    for sentence in sentences:
        f.append(sentence)
    self.assertTrue(f['sentence'] == 2 and f['this'] == 2 and f['Test'] == 1)
def test_freqlist_tokencount(self):
    """Frequency List (count tokens)"""
    global sentences
    f = FrequencyList()
    for sentence in sentences:
        f.append(sentence)
    self.assertEqual(f.total, 13)
def buildclasser(file):
    freqlist = FrequencyList()
    f = open(file, 'r')
    for line in f:
        line = line.strip()
        freqlist.append(line.split(' '))
    f.close()
    return Classer(freqlist)
def buildfromtext(self, files, encoding='utf-8'):
    freqlist = FrequencyList()
    if isinstance(files, str):
        files = [files]
    for filename in files:
        with open(filename, 'r', encoding=encoding) as f:
            for line in f:
                tokens = line.strip().split()
                freqlist.append(tokens)
    self.buildfromfreqlist(freqlist)
def buildfromfolia(self, files, encoding='utf-8'):
    freqlist = FrequencyList()
    if isinstance(files, str):
        files = [files]
    for filename in files:
        f = folia.Document(file=filename)
        for sentence in f.sentences():
            tokens = sentence.toktext().split(' ')
            freqlist.append(tokens)
    self.buildfromfreqlist(freqlist)
def main():
    try:
        # "e:" accepts the character encoding handled by -e below
        opts, args = getopt.getopt(sys.argv[1:], "o:Oe:E:htspwrq", ["help"])
    except getopt.GetoptError as err:
        print(str(err), file=sys.stderr)
        usage()
        sys.exit(2)

    outputfile = None

    for o, a in opts:
        if o == '-h' or o == '--help':
            usage()
            sys.exit(0)
        elif o == '-e':
            settings.encoding = a
        elif o == '-E':
            settings.extension = a
        elif o == '-o':
            outputfile = a
        elif o == '-O':
            settings.autooutput = True
        elif o == '-s':
            settings.sentencemarkers = True
        elif o == '-r':
            settings.recurse = True
        elif o == '-q':
            settings.ignoreerrors = True
        else:
            raise Exception("No such option: " + o)

    if outputfile:
        outputfile = io.open(outputfile, 'w', encoding=settings.encoding)

    if args:
        freqlist = FrequencyList()
        for x in args:  # remaining (non-option) arguments are files or directories
            if os.path.isdir(x):
                processdir(x, freqlist)
            elif os.path.isfile(x):
                freqlist += process(x)
            else:
                print("ERROR: File or directory not found: " + x, file=sys.stderr)
                sys.exit(3)
        if outputfile:
            freqlist.save(outputfile, True)
        else:
            for line in freqlist.output("\t", True):
                print(line)
    else:
        print("ERROR: No files specified", file=sys.stderr)
        sys.exit(2)
def load(self, filename):
    self.freqlistN = FrequencyList(None, self.casesensitive)
    self.freqlistNm1 = FrequencyList(None, self.casesensitive)
    f = io.open(filename, 'r', encoding='utf-8')
    mode = False
    for line in f.readlines():
        line = line.strip()
        if line:
            if not mode:
                if line != "[simplelanguagemodel]":
                    raise Exception("File is not a SimpleLanguageModel")
                else:
                    mode = 1
            elif mode == 1:
                if line[:2] == 'n=':
                    self.n = int(line[2:])
                elif line[:12] == 'beginmarker=':
                    self.beginmarker = line[12:]
                elif line[:10] == 'endmarker=':
                    self.endmarker = line[10:]
                elif line[:10] == 'sentences=':
                    self.sentences = int(line[10:])
                elif line[:14] == 'casesensitive=':
                    self.casesensitive = bool(int(line[14:]))
                    self.freqlistN = FrequencyList(None, self.casesensitive)
                    self.freqlistNm1 = FrequencyList(None, self.casesensitive)
                elif line == "[freqlistN]":
                    mode = 2
                else:
                    raise Exception("Syntax error in language model file: ", line)
            elif mode == 2:
                if line == "[freqlistNm1]":
                    mode = 3
                else:
                    try:
                        type, count = line.split("\t")
                        self.freqlistN.count(type.split(' '), int(count))
                    except:
                        print("Warning, could not parse line whilst loading frequency list: ", line, file=stderr)
            elif mode == 3:
                try:
                    type, count = line.split("\t")
                    self.freqlistNm1.count(type.split(' '), int(count))
                except:
                    print("Warning, could not parse line whilst loading frequency list: ", line, file=stderr)

    # store as tuples so they compare equal to the n-gram slices used in __getitem__
    if self.beginmarker:
        self._begingram = tuple([self.beginmarker] * (self.n - 1))
    if self.endmarker:
        self._endgram = tuple([self.endmarker] * (self.n - 1))
def __init__(self, n=2, casesensitive=True, beginmarker="<begin>", endmarker="<end>"):
    self.casesensitive = casesensitive
    self.freqlistN = FrequencyList(None, self.casesensitive)
    self.freqlistNm1 = FrequencyList(None, self.casesensitive)
    assert isinstance(n, int) and n >= 2
    self.n = n
    self.beginmarker = beginmarker
    self.endmarker = endmarker
    self.sentences = 0
    if self.beginmarker:
        self._begingram = tuple([self.beginmarker] * (n - 1))
    if self.endmarker:
        self._endgram = tuple([self.endmarker] * (n - 1))
def processdir(d, freqlist=None):
    if not freqlist:
        freqlist = FrequencyList()
    print("Searching in " + d, file=sys.stderr)
    for f in glob.glob(os.path.join(d, '*')):
        if f[-len(settings.extension) - 1:] == '.' + settings.extension:
            freqlist += process(f)
        elif settings.recurse and os.path.isdir(f):
            processdir(f, freqlist)
    return freqlist
def buildclasser():
    global DOTOKENIZE, ENCODING, outputprefix

    log("Counting unigrams (for classer) ...", stream=sys.stderr)
    freqlist = FrequencyList()
    f = open(corpusfile)
    for i, line in enumerate(f):
        if (i % 10000 == 0):
            log("\tLine " + str(i + 1) + " - (classer construction)", stream=sys.stderr)
        if DOTOKENIZE:
            line = crude_tokenizer(line.strip())
        line = line.strip().split(' ')
        freqlist.append(['<begin>'] + line + ['<end>'])
    f.close()

    log("Building classer ...", stream=sys.stderr)
    classer = Classer(freqlist)
    classer.save(outputprefix + '.cls')
    log("\t" + str(len(classer)) + " classes found", stream=sys.stderr)

    return classer
def process(filename):
    freqlist = FrequencyList()  # created outside the try so it is always defined on return
    try:
        print("Processing " + filename, file=sys.stderr)
        doc = folia.Document(file=filename)
        if settings.n == 1:
            for word in doc.words():
                text = word.toktext()
                if not settings.casesensitive:  # lowercase only when counting case-insensitively
                    text = text.lower()
                freqlist.count(text)
        elif settings.sentencemarkers:
            for sentence in doc.sentences():
                for ngram in Windower(sentence.words(), settings.n):
                    text = ' '.join([x.toktext() for x in ngram])
                    if not settings.casesensitive:
                        text = text.lower()
                    freqlist.count(text)
        else:
            for sentence in doc.sentences():
                for ngram in Windower(sentence.words(), settings.n, None, None):
                    text = ' '.join([x.toktext() for x in ngram])
                    if not settings.casesensitive:
                        text = text.lower()
                    freqlist.count(text)
        if settings.autooutput:
            if filename[-len(settings.extension) - 1:].lower() == '.' + settings.extension:
                outfilename = filename[:-len(settings.extension) - 1] + '.freqlist'
            else:
                outfilename = filename + '.freqlist'
            freqlist.save(outfilename, True)
    except Exception as e:
        if settings.ignoreerrors:
            print("ERROR: An exception was raised whilst processing " + filename, e, file=sys.stderr)
        else:
            raise
    return freqlist
def main():
    try:
        opts, files = getopt.getopt(sys.argv[1:], "hn:ie:", ["help"])
    except getopt.GetoptError as err:
        # print help information and exit:
        print(str(err), file=sys.stderr)
        usage()
        sys.exit(2)

    testsetsize = devsetsize = 0
    casesensitive = True
    encoding = 'utf-8'
    n = 1

    for o, a in opts:
        if o == "-n":
            n = int(a)
        elif o == "-i":
            casesensitive = False
        elif o == "-e":
            encoding = a
        else:
            print("ERROR: Unknown option:", o, file=sys.stderr)
            sys.exit(1)

    if not files:
        print("No files specified", file=sys.stderr)
        sys.exit(1)

    freqlist = FrequencyList(None, casesensitive)
    for filename in files:
        f = codecs.open(filename, 'r', encoding)
        for line in f:
            if n > 1:
                freqlist.append(Windower(crude_tokenizer(line), n))
            else:
                freqlist.append(crude_tokenizer(line))
        f.close()

    dist = Distribution(freqlist)
    for type, count in freqlist:
        if isinstance(type, tuple) or isinstance(type, list):
            type = " ".join(type)
        s = type + "\t" + str(count) + "\t" + str(dist[type]) + "\t" + str(dist.information(type))
        print(s)

    print("Tokens: ", freqlist.tokens(), file=sys.stderr)
    print("Types: ", len(freqlist), file=sys.stderr)
    print("Type-token ratio: ", freqlist.typetokenratio(), file=sys.stderr)
    print("Entropy: ", dist.entropy(), file=sys.stderr)
def train(self, sourcefile, targetfile):
    sourcefile = open(sourcefile)
    targetfile = open(targetfile)

    self.sourcefreqlist = FrequencyList(None, self.casesensitive)
    self.targetfreqlist = FrequencyList(None, self.casesensitive)  #frequency lists

    self.source2target = {}
    self.target2source = {}

    for sourceline, targetline in zip(sourcefile, targetfile):
        sourcetokens = sourceline.split()
        targettokens = targetline.split()

        self.sourcefreqlist.append(sourcetokens)
        self.targetfreqlist.append(targettokens)

        for sourcetoken in sourcetokens:
            if not sourcetoken in self.source2target:
                self.source2target[sourcetoken] = FrequencyList(targettokens, self.casesensitive)
            else:
                self.source2target[sourcetoken].append(targettokens)

        for targettoken in targettokens:
            if not targettoken in self.target2source:
                self.target2source[targettoken] = FrequencyList(sourcetokens, self.casesensitive)
            else:
                self.target2source[targettoken].append(sourcetokens)

    sourcefile.close()
    targetfile.close()
def main():
    parser = argparse.ArgumentParser(description="Generate an n-gram frequency list",
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-n', '--ngramsize', help="N-gram size", type=int, action='store', default=1)
    parser.add_argument('-i', '--caseinsensitive', help="Case insensitive", action="store_true")
    parser.add_argument('-e', '--encoding', help="Character encoding", type=str, action='store', default='utf-8')
    parser.add_argument('files', type=str, nargs='+', help="The input text files to count n-grams in")
    args = parser.parse_args()

    if not args.files:
        print("No files specified", file=sys.stderr)
        sys.exit(1)

    # FrequencyList's second argument is casesensitive, so the command-line flag is inverted
    freqlist = FrequencyList(None, not args.caseinsensitive)
    for filename in args.files:
        f = io.open(filename, 'r', encoding=args.encoding)
        for line in f:
            if args.ngramsize > 1:
                freqlist.append(Windower(crude_tokenizer(line), args.ngramsize))
            else:
                freqlist.append(crude_tokenizer(line))
        f.close()

    dist = Distribution(freqlist)
    for type, count in freqlist:
        if isinstance(type, tuple) or isinstance(type, list):
            type = " ".join(type)
        s = type + "\t" + str(count) + "\t" + str(dist[type]) + "\t" + str(dist.information(type))
        print(s)

    print("Tokens: ", freqlist.tokens(), file=sys.stderr)
    print("Types: ", len(freqlist), file=sys.stderr)
    print("Type-token ratio: ", freqlist.typetokenratio(), file=sys.stderr)
    print("Entropy: ", dist.entropy(), file=sys.stderr)
    elif o == '-o':
        outputfile = a
    elif o == '-O':
        settings.autooutput = True
    elif o == '-s':
        settings.sentencemarkers = True
    elif o == '-r':
        settings.recurse = True
    else:
        raise Exception("No such option: " + o)

if outputfile:
    outputfile = codecs.open(outputfile, 'w', settings.encoding)

if len(sys.argv) >= 2:
    freqlist = FrequencyList()
    for x in sys.argv[1:]:
        if os.path.isdir(x):
            processdir(x, freqlist)
        elif os.path.isfile(x):
            freqlist += process(x)
        else:
            print >>sys.stderr, "ERROR: File or directory not found: " + x
            sys.exit(3)
    if outputfile:
        freqlist.save(outputfile, True)
    else:
        for line in freqlist.output("\t", True):
            print line
else:
    print >>sys.stderr, "ERROR: No files specified"
for o, a in opts:
    if o == "-n":
        n = int(a)
    elif o == "-i":
        casesensitive = False
    elif o == "-e":
        encoding = a
    else:
        print("ERROR: Unknown option:", o, file=sys.stderr)
        sys.exit(1)

if not files:
    print("No files specified", file=sys.stderr)
    sys.exit(1)

freqlist = FrequencyList(None, casesensitive)
for filename in files:
    f = codecs.open(filename, 'r', encoding)
    for line in f:
        if n > 1:
            freqlist.append(Windower(crude_tokenizer(line), n))
        else:
            freqlist.append(crude_tokenizer(line))
    f.close()

dist = Distribution(freqlist)
for type, count in freqlist:
    if isinstance(type, tuple) or isinstance(type, list):
        type = " ".join(type)
    s = type + "\t" + str(count) + "\t" + str(dist[type]) + "\t" + str(dist.information(type))
#!/usr/bin/env python
#-*- coding:utf-8 -*-

from pynlpl.textprocessors import Windower, crude_tokenizer
from pynlpl.statistics import FrequencyList, Distribution
import sys
import codecs

with codecs.open(sys.argv[1], 'r', 'utf-8') as file:
    freqlist = FrequencyList()
    for line in file:
        freqlist.append(Windower(crude_tokenizer(line), 2))

print "Type/Token Ratio: ", freqlist.typetokenratio()

### uncomment if you want to output the full frequency list:
#for line in freqlist.output():
#    print line.encode('utf-8')

dist = Distribution(freqlist)
for line in dist.output():
    print line.encode('utf-8')
class SimpleLanguageModel:
    """This is a simple unsmoothed language model. This class can both hold and compute the model."""

    def __init__(self, n=2, casesensitive=True, beginmarker="<begin>", endmarker="<end>"):
        self.casesensitive = casesensitive
        self.freqlistN = FrequencyList(None, self.casesensitive)
        self.freqlistNm1 = FrequencyList(None, self.casesensitive)
        assert isinstance(n, int) and n >= 2
        self.n = n
        self.beginmarker = beginmarker
        self.endmarker = endmarker
        self.sentences = 0
        if self.beginmarker:
            self._begingram = tuple([self.beginmarker] * (n - 1))
        if self.endmarker:
            self._endgram = tuple([self.endmarker] * (n - 1))

    def append(self, sentence):
        if isinstance(sentence, str):  # accept either a string or a pre-tokenised list
            sentence = sentence.strip().split(' ')
        self.sentences += 1
        for ngram in Windower(sentence, self.n, self.beginmarker, self.endmarker):
            self.freqlistN.count(ngram)
        for ngram in Windower(sentence, self.n - 1, self.beginmarker, self.endmarker):
            self.freqlistNm1.count(ngram)

    def load(self, filename):
        self.freqlistN = FrequencyList(None, self.casesensitive)
        self.freqlistNm1 = FrequencyList(None, self.casesensitive)
        f = io.open(filename, 'r', encoding='utf-8')
        mode = False
        for line in f.readlines():
            line = line.strip()
            if line:
                if not mode:
                    if line != "[simplelanguagemodel]":
                        raise Exception("File is not a SimpleLanguageModel")
                    else:
                        mode = 1
                elif mode == 1:
                    if line[:2] == 'n=':
                        self.n = int(line[2:])
                    elif line[:12] == 'beginmarker=':
                        self.beginmarker = line[12:]
                    elif line[:10] == 'endmarker=':
                        self.endmarker = line[10:]
                    elif line[:10] == 'sentences=':
                        self.sentences = int(line[10:])
                    elif line[:14] == 'casesensitive=':
                        self.casesensitive = bool(int(line[14:]))
                        self.freqlistN = FrequencyList(None, self.casesensitive)
                        self.freqlistNm1 = FrequencyList(None, self.casesensitive)
                    elif line == "[freqlistN]":
                        mode = 2
                    else:
                        raise Exception("Syntax error in language model file: ", line)
                elif mode == 2:
                    if line == "[freqlistNm1]":
                        mode = 3
                    else:
                        try:
                            type, count = line.split("\t")
                            self.freqlistN.count(type.split(' '), int(count))
                        except:
                            print("Warning, could not parse line whilst loading frequency list: ", line, file=stderr)
                elif mode == 3:
                    try:
                        type, count = line.split("\t")
                        self.freqlistNm1.count(type.split(' '), int(count))
                    except:
                        print("Warning, could not parse line whilst loading frequency list: ", line, file=stderr)

        # store as tuples so they compare equal to the n-gram slices used in __getitem__
        if self.beginmarker:
            self._begingram = tuple([self.beginmarker] * (self.n - 1))
        if self.endmarker:
            self._endgram = tuple([self.endmarker] * (self.n - 1))

    def save(self, filename):
        f = io.open(filename, 'w', encoding='utf-8')
        f.write("[simplelanguagemodel]\n")
        f.write("n=" + str(self.n) + "\n")
        f.write("sentences=" + str(self.sentences) + "\n")
        f.write("beginmarker=" + self.beginmarker + "\n")
        f.write("endmarker=" + self.endmarker + "\n")
        f.write("casesensitive=" + str(int(self.casesensitive)) + "\n")
        f.write("\n")
        f.write("[freqlistN]\n")
        for line in self.freqlistN.output():
            f.write(line + "\n")
        f.write("[freqlistNm1]\n")
        for line in self.freqlistNm1.output():
            f.write(line + "\n")
        f.close()

    def scoresentence(self, sentence):
        return product([self[x] for x in Windower(sentence, self.n, self.beginmarker, self.endmarker)])

    def __getitem__(self, ngram):
        assert len(ngram) == self.n
        nm1gram = ngram[:-1]
        if (self.beginmarker and nm1gram == self._begingram) or (self.endmarker and nm1gram == self._endgram):
            return self.freqlistN[ngram] / float(self.sentences)
        else:
            return self.freqlistN[ngram] / float(self.freqlistNm1[nm1gram])
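# A minimal usage sketch for the SimpleLanguageModel class above. It only relies on
# methods shown in that class (append, scoresentence, save); the import path used
# here and the output filename are assumptions, not part of the original snippet.
from pynlpl.lm.lm import SimpleLanguageModel  # assumed import path

lm = SimpleLanguageModel(n=2, casesensitive=False)
# append() accepts either a space-separated string or a pre-tokenised list
lm.append("this is a test sentence")
lm.append(["this", "is", "another", "test"])

# scoresentence() multiplies the unsmoothed bigram probabilities over the sentence,
# including the <begin>/<end> markers; unseen bigrams contribute a zero count
print(lm.scoresentence(["this", "is", "a", "test"]))

lm.save("example.lm")  # hypothetical output filename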
#!/usr/bin/env python3

import sys
from pynlpl.statistics import FrequencyList

for filename in sys.argv[1:]:
    f_in = open(filename, 'rt', encoding='utf-8')
    freqlist = FrequencyList()
    for line in f_in:
        fields = line.strip().split('\t')
        count = int(fields[1])
        for lemma in fields[0].split(' '):
            freqlist.count(lemma, count)
    f_in.close()
    freqlist.save(filename + '.freqlist')
#!/usr/bin/env python
#-*- coding:utf-8 -*-

from pynlpl.statistics import FrequencyList
from pynlpl.textprocessors import crude_tokenizer, Classer
import sys
import codecs
import asizeof

freqlist = FrequencyList()
f = codecs.open(sys.argv[1], 'r', 'utf-8')
for line in f:
    line = crude_tokenizer(line.strip())
    freqlist.append(line)
f.close()

print "FREQLIST: ", asizeof.asizeof(freqlist)

classer = Classer(freqlist)
print "CLASSER: ", asizeof.asizeof(classer)

classer2 = Classer(freqlist, False, True)
print "CLASSER (ONLY DECODER): ", asizeof.asizeof(classer2)

freqlist2 = FrequencyList()
f = codecs.open(sys.argv[1], 'r', 'utf-8')
for line in f:
from __future__ import print_function, unicode_literals, division, absolute_import

import sys
import os

if __name__ == "__main__":
    sys.path.append(sys.path[0] + '/../..')
    os.environ['PYTHONPATH'] = sys.path[0] + '/../..'

from pynlpl.formats.sonar import CorpusFiles, Corpus
from pynlpl.statistics import FrequencyList

sonardir = sys.argv[1]

freqlist = FrequencyList()
lemmapos_freqlist = FrequencyList()
poshead_freqlist = FrequencyList()
pos_freqlist = FrequencyList()

for i, doc in enumerate(Corpus(sonardir)):
    print("#" + str(i) + " Processing " + doc.filename, file=sys.stderr)
    for word, id, pos, lemma in doc:
        freqlist.count(word)
        if lemma and pos:
            poshead = pos.split('(')[0]
            lemmapos_freqlist.count(lemma + '.' + poshead)
            poshead_freqlist.count(poshead)
            pos_freqlist.count(pos)

freqlist.save('sonarfreqlist.txt')
#-*- coding:utf-8 -*-

import sys
import os

if __name__ == "__main__":
    sys.path.append(sys.path[0] + '/../..')
    os.environ['PYTHONPATH'] = sys.path[0] + '/../..'

from pynlpl.formats.sonar import CorpusFiles, Corpus
from pynlpl.statistics import FrequencyList

sonardir = sys.argv[1]

freqlist = FrequencyList()
lemmapos_freqlist = FrequencyList()
poshead_freqlist = FrequencyList()
pos_freqlist = FrequencyList()

for i, doc in enumerate(Corpus(sonardir)):
    print >>sys.stderr, "#" + str(i) + " Processing " + doc.filename
    for word, id, pos, lemma in doc:
        freqlist.count(word)
        if lemma and pos:
            poshead = pos.split('(')[0]
            lemmapos_freqlist.count(lemma + '.' + poshead)
            poshead_freqlist.count(poshead)
            pos_freqlist.count(pos)

freqlist.save('sonarfreqlist.txt')
class WordAlignment(object):
    def __init__(self, casesensitive=False):
        self.casesensitive = casesensitive

    def train(self, sourcefile, targetfile):
        sourcefile = open(sourcefile)
        targetfile = open(targetfile)

        self.sourcefreqlist = FrequencyList(None, self.casesensitive)
        self.targetfreqlist = FrequencyList(None, self.casesensitive)  #frequency lists

        self.source2target = {}
        self.target2source = {}

        for sourceline, targetline in zip(sourcefile, targetfile):
            sourcetokens = sourceline.split()
            targettokens = targetline.split()

            self.sourcefreqlist.append(sourcetokens)
            self.targetfreqlist.append(targettokens)

            for sourcetoken in sourcetokens:
                if not sourcetoken in self.source2target:
                    self.source2target[sourcetoken] = FrequencyList(targettokens, self.casesensitive)
                else:
                    self.source2target[sourcetoken].append(targettokens)

            for targettoken in targettokens:
                if not targettoken in self.target2source:
                    self.target2source[targettoken] = FrequencyList(sourcetokens, self.casesensitive)
                else:
                    self.target2source[targettoken].append(sourcetokens)

        sourcefile.close()
        targetfile.close()

    def test(self, sourcefile, targetfile):
        sourcefile = open(sourcefile)
        targetfile = open(targetfile)

        #stage 2
        for sourceline, targetline in zip(sourcefile, targetfile):
            sourcetokens = sourceline.split()
            targettokens = targetline.split()

            S2Talignment = []
            T2Salignment = []

            for sourcetoken in sourcetokens:
                #which of the target-tokens is most frequent?
                besttoken = None
                bestscore = -1
                for i, targettoken in enumerate(targettokens):
                    if targettoken in self.source2target[sourcetoken]:
                        score = self.source2target[sourcetoken][targettoken] / float(self.targetfreqlist[targettoken])
                        if score > bestscore:
                            bestscore = score  # keep the normalised score so later comparisons are consistent
                            besttoken = i
                S2Talignment.append(besttoken)  #TODO: multi-alignment?

            for targettoken in targettokens:
                besttoken = None
                bestscore = -1
                for i, sourcetoken in enumerate(sourcetokens):
                    if sourcetoken in self.target2source[targettoken]:
                        score = self.target2source[targettoken][sourcetoken] / float(self.sourcefreqlist[sourcetoken])
                        if score > bestscore:
                            bestscore = score
                            besttoken = i
                T2Salignment.append(besttoken)  #TODO: multi-alignment?

            yield sourcetokens, targettokens, S2Talignment, T2Salignment

        sourcefile.close()
        targetfile.close()
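# A minimal usage sketch for the WordAlignment class above, assuming it can be imported
# from the module that defines it (the import path below is a placeholder) and that
# 'source.txt'/'target.txt' are line-aligned plain-text files (hypothetical filenames).
from wordalign import WordAlignment  # placeholder module name

aligner = WordAlignment(casesensitive=False)
aligner.train('source.txt', 'target.txt')

# test() is a generator: for every sentence pair it yields the token lists plus two
# alignment lists holding, per token, the index of the best-scoring counterpart
# (or None when no co-occurring counterpart was seen during training).
for sourcetokens, targettokens, s2t, t2s in aligner.test('source.txt', 'target.txt'):
    for i, j in enumerate(s2t):
        if j is not None:
            print(sourcetokens[i], '->', targettokens[j])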
if __name__ == '__main__':
    try:
        inputdir = sys.argv[1]
        outputdir = sys.argv[2]
        threads = int(sys.argv[3])
    except:
        print >>sys.stderr, "Syntax: sonar_postproc.py inputdir outputdir threads"
        sys.exit(2)

    cat_freqlist_word = FrequencyList()
    cat_freqlist_lemma = FrequencyList()
    cat_freqlist_lemmapos = FrequencyList()

    maxtasksperchild = 10
    preindex = True
    prevcategory = None

    print >>sys.stderr, "Initialising (indexing)..."
    processor = folia.CorpusProcessor(inputdir, process, threads, 'folia.xml', "", lambda x: True, maxtasksperchild, preindex)

    print >>sys.stderr, "Processing..."
    for i, data in enumerate(processor):
        filepath, freqlist_word, freqlist_lemma, freqlist_lemmapos = data
        if filepath:
            category = None
            for e in filepath.split('/'):
                if e[-4:] != '.xml' and e[:3] == 'WR-' or e[:3] == 'WS-':
for o, a in opts:
    if o == "-n":
        n = int(a)
    elif o == "-i":
        casesensitive = False
    elif o == "-e":
        encoding = a
    else:
        print >>sys.stderr, "ERROR: Unknown option:", o
        sys.exit(1)

if not files:
    print >>sys.stderr, "No files specified"
    sys.exit(1)

freqlist = FrequencyList(None, casesensitive)
for filename in files:
    f = codecs.open(filename, 'r', encoding)
    for line in f:
        if n > 1:
            freqlist.append(Windower(crude_tokenizer(line), n))
        else:
            freqlist.append(crude_tokenizer(line))
    f.close()

dist = Distribution(freqlist)
for type, count in freqlist:
    if isinstance(type, tuple) or isinstance(type, list):
        type = " ".join(type)
    s = type + "\t" + str(count) + "\t" + str(dist[type]) + "\t" + str(dist.information(type))
def process(data):
    global foliadir, indexlength, inputdir, outputdir

    curate = True
    filepath, args, kwargs = data
    outputfile = filepath.replace(inputdir, outputdir)
    if os.path.exists(outputfile):
        print >>sys.stderr, "Skipping curation of " + filepath + " (output file already exists)"
        curate = False

    s = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + ' ' + filepath
    print >>sys.stderr, s

    if curate:
        #CURATION STEP
        replace = [
            ('<pos-annotation annotator="tadpole" annotatortype="auto" set="http://ilk.uvt.nl/folia/sets/cgn"/>', '<pos-annotation annotator="frog" annotatortype="auto" set="hdl:1839/00-SCHM-0000-0000-000B-9"/>'),
            ('<lemma-annotation annotator="tadpole" annotatortype="auto" set="http://ilk.uvt.nl/folia/sets/lemmas-nl"/>', '<lemma-annotation annotator="frog" annotatortype="auto" set="hdl:1839/00-SCHM-0000-0000-000E-3"/>'),
            ('<entity-annotation set="sonar-ner"/>', '<entity-annotation annotator="NERD" annotatortype="auto" set="hdl:1839/00-SCHM-0000-0000-000D-5"/>')
        ]

        try:
            f = codecs.open(filepath, 'r', 'utf-8')
            outputlines = []
            gapinsertpoint = 0
            hasgap = False
            metadata = 0
            for i, line in enumerate(f):
                if metadata == 0 and line.find('<?xml version') != -1:
                    outputlines.append(line)
                    outputlines.append('<?xml-stylesheet type="text/xsl" href="sonar-foliaviewer.xsl"?>\n')
                    continue
                if metadata == 0 and line.find('<metadata src=') != -1:
                    metadata = 1
                if metadata == 1:
                    if line.find('</metadata>') != -1:
                        gapinsertpoint = i - 1
                        metadata = 2
                    else:
                        for source, target in replace:
                            line = line.replace(source, target)
                if metadata == 2:
                    if not hasgap and line.find('<gap ') != -1:
                        hasgap = True
                outputlines.append(line)
            f.close()

            if hasgap and gapinsertpoint > 0:
                outputlines.insert(gapinsertpoint, '<gap-annotation />\n')

            dir = os.path.dirname(outputfile)
            if not os.path.isdir(dir):
                os.mkdir(os.path.dirname(outputfile))

            tmpfile = filepath.replace(inputdir, outputdir) + '.tmp'
            f = codecs.open(tmpfile, 'w', 'utf-8')
            for line in outputlines:
                f.write(line)
            f.close()
            try:
                os.rename(tmpfile, outputfile)
            except:
                print >>sys.stderr, "Unable to write file " + outputfile
        except Exception as e:
            print >>sys.stderr, "ERROR: Got exception curating " + filepath + ": ", repr(e)

    #COUNT STEP
    freqlist_word = FrequencyList()
    freqlist_lemma = FrequencyList()
    freqlist_lemmapos = FrequencyList()
    try:
        for word in folia.Reader(filepath, folia.Word):
            try:
                freqlist_word.count(word.text())
            except folia.NoSuchText:
                print >>sys.stderr, "ERROR: Got NoSuchText error on " + word.id + " !!!"
                continue
            try:
                if word.lemma():
                    freqlist_lemma.count(word.lemma())
                if word.pos():
                    freqlist_lemmapos.count((word.lemma(), word.pos()))
            except folia.NoSuchAnnotation:
                print >>sys.stderr, "ERROR: Got NoSuchAnnotation error on " + word.id + " !!!"
                continue
    except Exception as e:
        print >>sys.stderr, "ERROR: Got exception counting " + filepath + ": ", repr(e)

    return filepath, freqlist_word, freqlist_lemma, freqlist_lemmapos
#!/usr/bin/env python
#-*- coding:utf-8 -*-

from pynlpl.textprocessors import Classer
from pynlpl.statistics import FrequencyList
import sys

filename = sys.argv[1]

print >>sys.stderr, "Counting tokens"
f = open(filename)
freqlist = FrequencyList()
for i, line in enumerate(f):
    if (i % 10000 == 0):
        print >>sys.stderr, "\tLine " + str(i + 1)
    line = ['<s>'] + line.strip().split(' ') + ['</s>']
    freqlist.append(line)
f.close()

print >>sys.stderr, "Building classer"
classer = Classer(freqlist, filesupport=True)
classer.save(filename + '.cls')

print >>sys.stderr, "Encoding data"
classer.encodefile(filename, filename + '.clsenc')
def process(filepath):
    s = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + ' ' + filepath
    print >> sys.stderr, s

    #COUNT STEP
    freqlist_word = FrequencyList()
    freqlist_lemma = FrequencyList()
    freqlist_lemmapos = FrequencyList()
    try:
        for word in folia.Reader(filepath, folia.Word):
            try:
                freqlist_word.count(word.text())
            except folia.NoSuchText:
                print >> sys.stderr, "ERROR: Got NoSuchText error on " + word.id + " !!!"
                continue
            try:
                if word.lemma():
                    freqlist_lemma.count(word.lemma())
                if word.pos():
                    freqlist_lemmapos.count((word.lemma(), word.pos()))
            except folia.NoSuchAnnotation:
                print >> sys.stderr, "ERROR: Got NoSuchAnnotation error on " + word.id + " !!!"
                continue
    except Exception as e:
        print >> sys.stderr, "ERROR: Got exception counting " + filepath + ": ", repr(e)

    return filepath, freqlist_word, freqlist_lemma, freqlist_lemmapos