def main():
    """Print an n-gram frequency list for the files given on the command line.

    Command-line options (parsed with getopt):
        -n N        n-gram size (default 1)
        -i          count case-insensitively
        -e ENC      character encoding of the input files (default utf-8)
        -h, --help  show usage information and exit

    For every type in the frequency list one tab-separated line is written
    to standard output: the type, its count, ``dist[type]`` and
    ``dist.information(type)``. Token/type totals, the type-token ratio and
    the entropy are written to standard error.

    Exits with status 2 on an option-parsing error and with status 1 on an
    unhandled option or when no input files are given.
    """
    try:
        opts, files = getopt.getopt(sys.argv[1:], "hn:ie:", ["help"])
    except getopt.GetoptError as err:
        # print help information and exit:
        print(str(err), file=sys.stderr)
        usage()
        sys.exit(2)

    casesensitive = True
    encoding = 'utf-8'
    n = 1

    for o, a in opts:
        if o == "-n":
            n = int(a)
        elif o == "-i":
            casesensitive = False
        elif o == "-e":
            encoding = a
        elif o in ("-h", "--help"):
            # These are declared in the getopt spec, so they must be handled
            # here instead of being reported as unknown options.
            usage()
            sys.exit(0)
        else:
            print("ERROR: Unknown option:", o, file=sys.stderr)
            sys.exit(1)

    if not files:
        print("No files specified", file=sys.stderr)
        sys.exit(1)

    freqlist = FrequencyList(None, casesensitive)
    for filename in files:
        # The context manager closes the file even if decoding fails midway.
        with codecs.open(filename, 'r', encoding) as f:
            for line in f:
                if n > 1:
                    freqlist.append(Windower(crude_tokenizer(line), n))
                else:
                    freqlist.append(crude_tokenizer(line))

    dist = Distribution(freqlist)
    for ngram, count in freqlist:
        # The joined string is for display only; dist was built from
        # freqlist, so it is queried with the key exactly as freqlist
        # yields it.
        if isinstance(ngram, (tuple, list)):
            label = " ".join(ngram)
        else:
            label = ngram
        print("\t".join(
            (label, str(count), str(dist[ngram]), str(dist.information(ngram)))))

    print("Tokens: ", freqlist.tokens(), file=sys.stderr)
    print("Types: ", len(freqlist), file=sys.stderr)
    print("Type-token ratio: ", freqlist.typetokenratio(), file=sys.stderr)
    print("Entropy: ", dist.entropy(), file=sys.stderr)
def main():
    """Build and print an n-gram frequency list from the given input files.

    Command-line options (parsed with getopt):
        -n N        n-gram size (default 1)
        -i          count case-insensitively
        -e ENC      character encoding of the input files (default utf-8)
        -h, --help  show usage information and exit

    Standard output receives one tab-separated line per type: the type, its
    count, ``dist[type]`` and ``dist.information(type)``. Standard error
    receives the token and type totals, the type-token ratio and the entropy.

    Exits with status 2 when getopt rejects the arguments and with status 1
    for an unhandled option or when no input files are given.
    """
    try:
        opts, files = getopt.getopt(sys.argv[1:], "hn:ie:", ["help"])
    except getopt.GetoptError as err:
        # print help information and exit:
        print(str(err), file=sys.stderr)
        usage()
        sys.exit(2)

    casesensitive = True
    encoding = 'utf-8'
    n = 1

    for o, a in opts:
        if o in ("-h", "--help"):
            # Declared in the getopt spec above; without this branch the
            # help flags would be rejected as unknown options.
            usage()
            sys.exit(0)
        elif o == "-n":
            n = int(a)
        elif o == "-i":
            casesensitive = False
        elif o == "-e":
            encoding = a
        else:
            print("ERROR: Unknown option:", o, file=sys.stderr)
            sys.exit(1)

    if not files:
        print("No files specified", file=sys.stderr)
        sys.exit(1)

    freqlist = FrequencyList(None, casesensitive)
    for filename in files:
        # "with" guarantees the handle is closed when reading raises.
        with codecs.open(filename, 'r', encoding) as f:
            for line in f:
                tokens = crude_tokenizer(line)
                if n > 1:
                    freqlist.append(Windower(tokens, n))
                else:
                    freqlist.append(tokens)

    dist = Distribution(freqlist)
    for ngram, count in freqlist:
        # Query dist with the key as freqlist yields it (dist was built from
        # freqlist); only the printed form is space-joined.
        label = " ".join(ngram) if isinstance(ngram, (tuple, list)) else ngram
        s = (label + "\t" + str(count) + "\t" + str(dist[ngram]) + "\t"
             + str(dist.information(ngram)))
        print(s)

    print("Tokens: ", freqlist.tokens(), file=sys.stderr)
    print("Types: ", len(freqlist), file=sys.stderr)
    print("Type-token ratio: ", freqlist.typetokenratio(), file=sys.stderr)
    print("Entropy: ", dist.entropy(), file=sys.stderr)
def parseDistribution(self, instance, start, end=None):
    """Parse a class distribution from a tokenised output line.

    The tokens after ``start`` and before ``end`` are read as alternating
    ``label score`` pairs. A trailing comma on the score token is stripped
    before it is converted to float.

    Args:
        instance: sequence of string tokens making up one output line.
        start: index of the token just before the first label
            (presumably the opening brace -- confirm against the caller).
        end: index at which parsing stops (exclusive). When omitted or
            otherwise falsy, ``len(instance) - 1`` is used, so the final
            token is never read as a label.

    Returns:
        A ``Distribution`` built from the parsed ``{label: score}`` mapping.
        The mapping is empty when nothing could be parsed; an error is
        printed to stderr in that case.
    """
    dist = {}
    if not end:
        end = len(instance) - 1

    i = start + 1
    while i < end:  # instance[i] != "}"
        label = instance[i]
        rawscore = instance[i + 1].rstrip(",")
        try:
            score = float(rawscore)
        except ValueError:
            # Only a failed float conversion is recoverable here; a bare
            # except would also swallow KeyboardInterrupt and SystemExit.
            print(
                "ERROR: pynlpl.input.timbl.TimblOutput -- Could not fetch score for class '"
                + label + "', expected float, but found '" + rawscore
                + "'. Instance= " + " ".join(instance)
                + ".. Attempting to compensate...",
                file=stderr)
            # Compensate by advancing a single token, so the token that
            # failed to parse is tried as the next label.
            i += 1
        else:
            dist[label] = score
            i += 2

    if not dist:
        print(
            "ERROR: pynlpl.input.timbl.TimblOutput -- Did not find class distribution for ",
            instance,
            file=stderr)

    return Distribution(dist)
def append(self, word_id, senses, distance=0):
    """Record the predicted senses and the distance for one word.

    Args:
        word_id: key under which the result is stored in ``self.data`` and
            ``self.distances``. An existing entry is overwritten.
        senses: either a ``Distribution`` or a non-empty list. A list whose
            first element has length 1 is treated as bare sense ids and
            every sense gets the same confidence ``1 / len(senses)``.
            Otherwise the list must hold ``(sense_id, confidence)`` pairs:
            it is wrapped in a ``Distribution`` when every confidence is
            set, and stored unchanged when any confidence is ``None``.
        distance: distance for this word; also raises ``self.maxDistance``
            when it exceeds the current maximum.

    Raises:
        AssertionError: if ``senses`` is neither a ``Distribution`` nor a
            non-empty list.
    """
    # Commented by Ruben, there are some ID's that are repeated in all sonar test files...
    #assert (not word_id in self.data)
    is_distribution = isinstance(senses, Distribution)
    if not is_distribution:
        assert isinstance(senses, list) and len(senses) >= 1

    # Distance bookkeeping is identical for both kinds of input.
    self.distances[word_id] = distance
    if distance > self.maxDistance:
        self.maxDistance = distance

    if is_distribution:
        # Materialise the pairs: a generator expression stored here could be
        # iterated only once and would appear empty on any later read.
        # (Original note: PATCH UNDONE / TODO: this is a patch, something's
        # not right in Distribution?)
        self.data[word_id] = [(x, y) for x, y in senses]
        return

    if len(senses[0]) == 1:
        # not a (sense_id, confidence) tuple! compute equal confidence for
        # all elements automatically:
        # NOTE(review): this length test also matches one-character strings
        # and treats two-character strings as pairs -- confirm what callers
        # actually pass.
        confidence = 1 / float(len(senses))
        self.data[word_id] = [(x, confidence) for x in senses]
    elif any(confidence is None for _, confidence in senses):
        # Incomplete distribution: keep the raw list.
        self.data[word_id] = senses
    else:
        self.data[word_id] = Distribution(senses)
def main():
    """Generate an n-gram frequency list from the files on the command line.

    Command-line arguments (parsed with argparse):
        -n, --ngramsize        n-gram size (default 1)
        -i, --caseinsensitive  count case-insensitively
        -e, --encoding         character encoding of the input (default utf-8)
        files                  one or more input files

    Standard output receives one tab-separated line per type: the type, its
    count, ``dist[type]`` and ``dist.information(type)``. Standard error
    receives the token and type totals, the type-token ratio and the entropy.
    """
    parser = argparse.ArgumentParser(
        description="Generate an n-gram frequency list",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-n', '--ngramsize', help="N-gram size", type=int,
                        action='store', default=1)
    parser.add_argument('-i', '--caseinsensitive', help="Case insensitive",
                        action="store_true")
    parser.add_argument('-e', '--encoding', help="Character encoding",
                        type=str, action='store', default='utf-8')
    # nargs='+' makes argparse itself reject an empty file list (exit
    # status 2), so no separate "No files specified" check is needed.
    parser.add_argument('files', type=str, nargs='+',
                        help="The text files to build the frequency list from")
    args = parser.parse_args()

    # The second argument is the case-SENSITIVE flag, so the command-line
    # switch has to be negated.
    freqlist = FrequencyList(None, not args.caseinsensitive)
    for filename in args.files:
        # "with" closes the file even when decoding fails midway.
        with io.open(filename, 'r', encoding=args.encoding) as f:
            for line in f:
                if args.ngramsize > 1:
                    freqlist.append(
                        Windower(crude_tokenizer(line), args.ngramsize))
                else:
                    freqlist.append(crude_tokenizer(line))

    dist = Distribution(freqlist)
    for ngram, count in freqlist:
        # Query dist with the key as freqlist yields it (dist was built from
        # freqlist); only the printed form is space-joined.
        if isinstance(ngram, (tuple, list)):
            label = " ".join(ngram)
        else:
            label = ngram
        print("\t".join(
            (label, str(count), str(dist[ngram]), str(dist.information(ngram)))))

    print("Tokens: ", freqlist.tokens(), file=sys.stderr)
    print("Types: ", len(freqlist), file=sys.stderr)
    print("Type-token ratio: ", freqlist.typetokenratio(), file=sys.stderr)
    print("Entropy: ", dist.entropy(), file=sys.stderr)
sys.exit(1) if not files: print >>sys.stderr, "No files specified" sys.exit(1) freqlist = FrequencyList(None, casesensitive) for filename in files: f = codecs.open(filename,'r',encoding) for line in f: if n > 1: freqlist.append(Windower(crude_tokenizer(line),n)) else: freqlist.append(crude_tokenizer(line)) f.close() dist = Distribution(freqlist) for type, count in freqlist: if isinstance(type,tuple) or isinstance(type,list): type = " ".join(type) s = type + "\t" + str(count) + "\t" + str(dist[type]) + "\t" + str(dist.information(type)) print s.encode('utf-8') print >>sys.stderr, "Tokens: ", freqlist.tokens() print >>sys.stderr, "Types: ", len(freqlist) print >>sys.stderr, "Type-token ratio: ", freqlist.typetokenratio() print >>sys.stderr, "Entropy: ", dist.entropy()
print("ERROR: Unknown option:", o, file=sys.stderr) sys.exit(1) if not files: print >> sys.stderr, "No files specified" sys.exit(1) freqlist = FrequencyList(None, casesensitive) for filename in files: f = codecs.open(filename, 'r', encoding) for line in f: if n > 1: freqlist.append(Windower(crude_tokenizer(line), n)) else: freqlist.append(crude_tokenizer(line)) f.close() dist = Distribution(freqlist) for type, count in freqlist: if isinstance(type, tuple) or isinstance(type, list): type = " ".join(type) s = type + "\t" + str(count) + "\t" + str(dist[type]) + "\t" + str( dist.information(type)) print(s) print("Tokens: ", freqlist.tokens(), file=sys.stderr) print("Types: ", len(freqlist), file=sys.stderr) print("Type-token ratio: ", freqlist.typetokenratio(), file=sys.stderr) print("Entropy: ", dist.entropy(), file=sys.stderr)
#!/usr/bin/env python
#-*- coding:utf-8 -*-

"""Print the type/token ratio and the bigram distribution of a text file.

Usage: pass the path of a UTF-8 encoded text file as the only argument.
Uses the Python 3 print function.
"""

from pynlpl.textprocessors import Windower, crude_tokenizer
from pynlpl.statistics import FrequencyList, Distribution
import sys
import codecs

# Fail with a readable message instead of an IndexError traceback.
if len(sys.argv) < 2:
    print("Usage: " + sys.argv[0] + " FILE", file=sys.stderr)
    sys.exit(2)

with codecs.open(sys.argv[1], 'r', 'utf-8') as infile:
    freqlist = FrequencyList()
    for line in infile:
        # Count bigrams: a window of size 2 over the tokens of each line.
        freqlist.append(Windower(crude_tokenizer(line), 2))

print("Type/Token Ratio: ", freqlist.typetokenratio())

### uncomment if you want to output the full frequency list:
#for line in freqlist.output():
#    print(line)

dist = Distribution(freqlist)
for line in dist.output():
    # Print the text itself; printing line.encode('utf-8') would show a
    # bytes repr under Python 3.
    print(line)
#!/usr/bin/env python
#-*- coding:utf-8 -*-

"""Report the type/token ratio and bigram distribution of one text file.

The single command-line argument is the path of a UTF-8 encoded text file.
Uses the Python 3 print function.
"""

from pynlpl.textprocessors import Windower, crude_tokenizer
from pynlpl.statistics import FrequencyList, Distribution
import sys
import codecs

# A missing argument gets a usage message rather than an IndexError.
if len(sys.argv) < 2:
    print("Usage: " + sys.argv[0] + " FILE", file=sys.stderr)
    sys.exit(2)

freqlist = FrequencyList()
with codecs.open(sys.argv[1], 'r', 'utf-8') as textfile:
    for line in textfile:
        # Window size 2 turns the token stream of each line into bigrams.
        freqlist.append(Windower(crude_tokenizer(line), 2))

print("Type/Token Ratio: ", freqlist.typetokenratio())

### uncomment if you want to output the full frequency list:
#for line in freqlist.output():
#    print(line)

dist = Distribution(freqlist)
for line in dist.output():
    # Print the text directly; an explicit .encode('utf-8') would print a
    # bytes repr under Python 3.
    print(line)