def main():
    try:
        opts, files = getopt.getopt(sys.argv[1:], "hn:ie:", ["help"])
    except getopt.GetoptError as err:
        # print help information and exit:
        print(str(err), file=sys.stderr)
        usage()
        sys.exit(2)

    casesensitive = True
    encoding = 'utf-8'
    n = 1
    for o, a in opts:
        if o == "-n":
            n = int(a)
        elif o == "-i":
            casesensitive = False
        elif o == "-e":
            encoding = a
        elif o in ("-h", "--help"):
            usage()
            sys.exit(0)
        else:
            print("ERROR: Unknown option:", o, file=sys.stderr)
            sys.exit(1)

    if not files:
        print("No files specified", file=sys.stderr)
        sys.exit(1)

    freqlist = FrequencyList(None, casesensitive)
    for filename in files:
        f = codecs.open(filename, 'r', encoding)
        for line in f:
            if n > 1:
                freqlist.append(Windower(crude_tokenizer(line), n))
            else:
                freqlist.append(crude_tokenizer(line))
        f.close()

    dist = Distribution(freqlist)
    for wordtype, count in freqlist:
        # look up probability and information content before joining
        # n-gram tuples into strings, since the distribution is keyed
        # on the original types
        p = dist[wordtype]
        info = dist.information(wordtype)
        if isinstance(wordtype, (tuple, list)):
            wordtype = " ".join(wordtype)
        print(wordtype + "\t" + str(count) + "\t" + str(p) + "\t" + str(info))

    print("Tokens: ", freqlist.tokens(), file=sys.stderr)
    print("Types: ", len(freqlist), file=sys.stderr)
    print("Type-token ratio: ", freqlist.typetokenratio(), file=sys.stderr)
    print("Entropy: ", dist.entropy(), file=sys.stderr)
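# For reference: Windower, used throughout these scripts, slides an n-token
# window over a token sequence and pads it with the <begin>/<end> markers
# that countngrams below filters out. A minimal illustration:

from pynlpl.textprocessors import Windower, crude_tokenizer

for bigram in Windower(crude_tokenizer("This is a test"), 2):
    print(bigram)  # ('<begin>', 'This'), ('This', 'is'), ..., ('test', '<end>')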
def countngrams(classer, n, freqlist, simpleskipgrams, skips, index, linecount=0):
    global DOTOKENIZE, DOCLASSER, DOSKIPGRAMS, DOINDEX, MINLENGTH
    log("Counting " + str(n) + "-grams ...", stream=sys.stderr)
    f.seek(0)  # f is the module-level corpus file handle
    gaps = list(consecutivegaps(n))
    for i, line in enumerate(f):
        if i % 10000 == 0:
            if linecount == 0:
                log("\tLine " + str(i+1) + " - (" + str(n) + "-grams)", stream=sys.stderr)
            else:
                log("\tLine " + str(i+1) + " of " + str(linecount) + " - " + str(round(((i+1) / float(linecount)) * 100)) + "% (" + str(n) + "-grams)", stream=sys.stderr)
        if DOTOKENIZE:
            line = crude_tokenizer(line.strip())
        else:
            line = [x for x in line.strip().split(' ') if x]
        for ngram in Windower(line, n):
            if DOCLASSER:
                ngram = tuple(classer.encodeseq(ngram))
            if n - 1 in freqlist:
                count = (ngram[1:] in freqlist[n-1] and ngram[:-1] in freqlist[n-1])
            else:
                count = True
            if count:
                freqlist[n].count(ngram)
                if DOINDEX:
                    try:
                        index[ngram].add(i)
                    except KeyError:
                        index[ngram] = set((i,))
                if DOSKIPGRAMS and n >= 2 and ngram[0] != '<begin>' and ngram[-1] != '<end>':
                    for beginindex, length in gaps:
                        preskip = ngram[:beginindex]
                        postskip = ngram[beginindex+length:]
                        if len(preskip) >= MINLENGTH and preskip not in freqlist[len(preskip)]:
                            continue  # this skip-gram isn't going to make it over the minimum threshold
                        if len(postskip) >= MINLENGTH and postskip not in freqlist[len(postskip)]:
                            continue  # this skip-gram isn't going to make it over the minimum threshold
                        skipgram = (preskip, postskip)
                        body = ngram[beginindex:beginindex+length]
                        if skipgram not in simpleskipgrams[n]:
                            # using None key for overall count to save computation time later
                            simpleskipgrams[n][skipgram] = {None: 1}
                        else:
                            simpleskipgrams[n][skipgram][None] += 1
                        if body in simpleskipgrams[n][skipgram]:
                            if DOINDEX:
                                simpleskipgrams[n][skipgram][body].add(i)
                            else:
                                simpleskipgrams[n][skipgram][body] += 1
                        else:
                            if DOINDEX:
                                simpleskipgrams[n][skipgram][body] = set((i,))
                            else:
                                simpleskipgrams[n][skipgram][body] = 1
    log("Found " + str(len(freqlist[n])) + " " + str(n) + "-grams and " + str(len(simpleskipgrams[n])) + " skip-grams", stream=sys.stderr)
    return i + 1
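# consecutivegaps() is not defined in this section. A plausible
# reconstruction, inferred from how the loop above slices preskip/postskip
# (both sides of the gap must stay non-empty), could look like this:

def consecutivegaps(n):
    """Hypothetical helper: yield (beginindex, length) pairs for every
    consecutive gap that fits inside an n-gram while leaving at least
    one token on each side."""
    for beginindex in range(1, n - 1):
        for length in range(1, n - beginindex):
            yield (beginindex, length)

# For n=4 this yields (1, 1), (1, 2), (2, 1): e.g. (1, 2) turns a 4-gram
# (a, b, c, d) into the skip-gram ((a,), (d,)) with body (b, c).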
def countngrams(classer, n, freqlist, simpleskipgrams, skips, index, linecount=0):
    global DOTOKENIZE, DOCLASSER, DOSKIPGRAMS, DOINDEX
    log("Counting " + str(n) + "-grams ...", stream=sys.stderr)
    f.seek(0)  # f is the module-level corpus file handle
    for i, line in enumerate(f):
        if i % 10000 == 0:
            if linecount == 0:
                log("\tLine " + str(i+1) + " - (" + str(n) + "-grams)", stream=sys.stderr)
            else:
                log("\tLine " + str(i+1) + " of " + str(linecount) + " - " + str(round(((i+1) / float(linecount)) * 100)) + "% (" + str(n) + "-grams)", stream=sys.stderr)
        if DOTOKENIZE:
            line = crude_tokenizer(line.strip())
        else:
            line = [x for x in line.strip().split(' ') if x]
        for ngram in Windower(line, n):
            if DOCLASSER:
                ngram = tuple(classer.encodeseq(ngram))
            if n - 1 in freqlist:
                count = (ngram[1:] in freqlist[n-1] and ngram[:-1] in freqlist[n-1])
            else:
                count = True
            if count:
                freqlist[n].count(ngram)
                if DOINDEX:
                    try:
                        index[ngram].add(i)
                    except KeyError:
                        index[ngram] = set((i,))
                if DOSKIPGRAMS and n >= 3 and ngram[0] != '<begin>' and ngram[-1] != '<end>':
                    # single-gap skip-gram: the first and last token are fixed,
                    # everything in between is the gap's body
                    skipgram = ((ngram[0],), (ngram[-1],))
                    body = tuple(ngram[1:-1])
                    if skipgram not in simpleskipgrams[n]:
                        # using None key for overall count to save computation time later
                        simpleskipgrams[n][skipgram] = {None: 1}
                    else:
                        simpleskipgrams[n][skipgram][None] += 1
                    if body in simpleskipgrams[n][skipgram]:
                        if DOINDEX:
                            simpleskipgrams[n][skipgram][body].add(i)
                        else:
                            simpleskipgrams[n][skipgram][body] += 1
                    else:
                        if DOINDEX:
                            simpleskipgrams[n][skipgram][body] = set((i,))
                        else:
                            simpleskipgrams[n][skipgram][body] = 1
    log("Found " + str(len(freqlist[n])) + " " + str(n) + "-grams", stream=sys.stderr)
    return i + 1
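# The simpleskipgrams[n] mapping built above associates each skip-gram
# (its fixed outer parts) with a dict of gap bodies and their counts,
# where the None key carries the overall count. A small illustration
# with hypothetical data, assuming DOINDEX is off:

simpleskipgrams = {3: {}}
for ngram in [('the', 'quick', 'fox'), ('the', 'brown', 'fox')]:
    skipgram = ((ngram[0],), (ngram[-1],))
    body = tuple(ngram[1:-1])
    entry = simpleskipgrams[3].setdefault(skipgram, {None: 0})
    entry[None] += 1                      # overall count for this skip-gram
    entry[body] = entry.get(body, 0) + 1  # count per gap body

print(simpleskipgrams[3])
# {(('the',), ('fox',)): {None: 2, ('quick',): 1, ('brown',): 1}}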
def main():
    parser = argparse.ArgumentParser(description="Generate an n-gram frequency list", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-n', '--ngramsize', help="N-gram size", type=int, action='store', default=1)
    parser.add_argument('-i', '--caseinsensitive', help="Case insensitive", action="store_true")
    parser.add_argument('-e', '--encoding', help="Character encoding", type=str, action='store', default='utf-8')
    parser.add_argument('files', type=str, nargs='+', help="The text files to compute the frequency list from")
    args = parser.parse_args()

    # FrequencyList's second argument is casesensitive, hence the negation
    freqlist = FrequencyList(None, not args.caseinsensitive)
    for filename in args.files:
        f = io.open(filename, 'r', encoding=args.encoding)
        for line in f:
            if args.ngramsize > 1:
                freqlist.append(Windower(crude_tokenizer(line), args.ngramsize))
            else:
                freqlist.append(crude_tokenizer(line))
        f.close()

    dist = Distribution(freqlist)
    for wordtype, count in freqlist:
        # look up probability and information content before joining
        # n-gram tuples into strings, since the distribution is keyed
        # on the original types
        p = dist[wordtype]
        info = dist.information(wordtype)
        if isinstance(wordtype, (tuple, list)):
            wordtype = " ".join(wordtype)
        print(wordtype + "\t" + str(count) + "\t" + str(p) + "\t" + str(info))

    print("Tokens: ", freqlist.tokens(), file=sys.stderr)
    print("Types: ", len(freqlist), file=sys.stderr)
    print("Type-token ratio: ", freqlist.typetokenratio(), file=sys.stderr)
    print("Entropy: ", dist.entropy(), file=sys.stderr)
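# The same pipeline can also be driven programmatically; a minimal sketch
# using only calls that appear in the scripts above:

from pynlpl.statistics import FrequencyList, Distribution
from pynlpl.textprocessors import crude_tokenizer

freqlist = FrequencyList()
freqlist.append(crude_tokenizer("to be or not to be"))
dist = Distribution(freqlist)
for wordtype, count in freqlist:
    # frequency, probability and information content per type
    print(wordtype, count, dist[wordtype], dist.information(wordtype))
print(freqlist.tokens(), len(freqlist), freqlist.typetokenratio(), dist.entropy())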
def buildclasser():
    global DOTOKENIZE, ENCODING, outputprefix
    log("Counting unigrams (for classer) ...", stream=sys.stderr)
    freqlist = FrequencyList()
    f = open(corpusfile)
    for i, line in enumerate(f):
        if i % 10000 == 0:
            log("\tLine " + str(i+1) + " - (classer construction)", stream=sys.stderr)
        if DOTOKENIZE:
            line = crude_tokenizer(line.strip())
        else:
            # without the else, the tokenised list would be re-split and crash
            line = line.strip().split(' ')
        freqlist.append(['<begin>'] + line + ['<end>'])
    f.close()

    log("Building classer ...", stream=sys.stderr)
    classer = Classer(freqlist)
    classer.save(outputprefix + '.cls')
    log("\t" + str(len(classer)) + " classes found", stream=sys.stderr)
    return classer
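# Once built, the classer is what countngrams above uses via encodeseq:
# token strings are replaced by integer class IDs to reduce memory usage.
# A minimal sketch:

classer = buildclasser()
encoded = tuple(classer.encodeseq(('<begin>', 'the', 'cat')))
# encoded is now a tuple of class IDs instead of strings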
elif o == "-e": encoding = a else: print >>sys.stderr, "ERROR: Unknown option:",o sys.exit(1) if not files: print >>sys.stderr, "No files specified" sys.exit(1) freqlist = FrequencyList(None, casesensitive) for filename in files: f = codecs.open(filename,'r',encoding) for line in f: if n > 1: freqlist.append(Windower(crude_tokenizer(line),n)) else: freqlist.append(crude_tokenizer(line)) f.close() dist = Distribution(freqlist) for type, count in freqlist: if isinstance(type,tuple) or isinstance(type,list): type = " ".join(type) s = type + "\t" + str(count) + "\t" + str(dist[type]) + "\t" + str(dist.information(type)) print s.encode('utf-8') print >>sys.stderr, "Tokens: ", freqlist.tokens() print >>sys.stderr, "Types: ", len(freqlist) print >>sys.stderr, "Type-token ratio: ", freqlist.typetokenratio()
elif o == "-e": encoding = a else: print("ERROR: Unknown option:", o, file=sys.stderr) sys.exit(1) if not files: print >> sys.stderr, "No files specified" sys.exit(1) freqlist = FrequencyList(None, casesensitive) for filename in files: f = codecs.open(filename, 'r', encoding) for line in f: if n > 1: freqlist.append(Windower(crude_tokenizer(line), n)) else: freqlist.append(crude_tokenizer(line)) f.close() dist = Distribution(freqlist) for type, count in freqlist: if isinstance(type, tuple) or isinstance(type, list): type = " ".join(type) s = type + "\t" + str(count) + "\t" + str(dist[type]) + "\t" + str( dist.information(type)) print(s) print("Tokens: ", freqlist.tokens(), file=sys.stderr) print("Types: ", len(freqlist), file=sys.stderr)
def test_tokenize(self):
    """Crude tokeniser"""
    global text
    self.assertEqual(crude_tokenizer("This is a test."), text)
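# The module-level `text` fixture referenced by the test above is not shown
# in this section. A hypothetical version (the exact list depends on how
# crude_tokenizer splits punctuation) might be:

text = ['This', 'is', 'a', 'test', '.']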
#!/usr/bin/env python
#-*- coding:utf-8 -*-

from pynlpl.statistics import FrequencyList
from pynlpl.textprocessors import crude_tokenizer, Classer
import sys
import codecs
import asizeof  # standalone asizeof module (also available as pympler.asizeof)

freqlist = FrequencyList()
f = codecs.open(sys.argv[1], 'r', 'utf-8')
for line in f:
    line = crude_tokenizer(line.strip())
    freqlist.append(line)
f.close()

print("FREQLIST: ", asizeof.asizeof(freqlist))

classer = Classer(freqlist)
print("CLASSER: ", asizeof.asizeof(classer))

classer2 = Classer(freqlist, False, True)
print("CLASSER (ONLY DECODER): ", asizeof.asizeof(classer2))

freqlist2 = FrequencyList()
f = codecs.open(sys.argv[1], 'r', 'utf-8')
for line in f:
    # the original snippet is truncated here; presumably the second pass
    # mirrors the first but builds a class-encoded frequency list so the
    # two sizes can be compared
    line = crude_tokenizer(line.strip())
    freqlist2.append(classer.encodeseq(line))
f.close()