def main():
    """Print an n-gram frequency list for the files named on the command line.

    Options (parsed with getopt from ``sys.argv``):
        -n N        n-gram size (default 1)
        -i          count case-insensitively
        -e ENC      character encoding of the input files (default utf-8)
        -h, --help  show usage information and exit

    For every (type, count) pair yielded by the frequency list, one
    tab-separated row is written to stdout: the type, its count,
    ``dist[type]`` and ``dist.information(type)``. Summary figures (tokens,
    types, type-token ratio, entropy) are written to stderr.

    Exits with status 2 when getopt rejects the command line, and with
    status 1 when an unhandled option is seen or no input files are given.
    """
    try:
        opts, files = getopt.getopt(sys.argv[1:], "hn:ie:", ["help"])
    except getopt.GetoptError as err:
        # print help information and exit:
        print(str(err), file=sys.stderr)
        usage()
        sys.exit(2)

    casesensitive = True
    encoding = 'utf-8'
    n = 1
    for o, a in opts:
        if o == "-n":
            n = int(a)
        elif o == "-i":
            casesensitive = False
        elif o == "-e":
            encoding = a
        elif o in ("-h", "--help"):
            # getopt accepts these flags, so handle them here instead of
            # letting them fall through to the "unknown option" error.
            usage()
            sys.exit(0)
        else:
            print("ERROR: Unknown option:", o, file=sys.stderr)
            sys.exit(1)

    if not files:
        # Was a Python 2 style ``print >> sys.stderr`` statement, which
        # raises TypeError under Python 3 instead of reporting the problem.
        print("No files specified", file=sys.stderr)
        sys.exit(1)

    freqlist = FrequencyList(None, casesensitive)
    for filename in files:
        # ``with`` guarantees the file is closed even if tokenizing fails.
        with codecs.open(filename, 'r', encoding) as f:
            for line in f:
                if n > 1:
                    freqlist.append(Windower(crude_tokenizer(line), n))
                else:
                    freqlist.append(crude_tokenizer(line))

    dist = Distribution(freqlist)
    for ngram, count in freqlist:
        if isinstance(ngram, (tuple, list)):
            # NOTE(review): from here on ``dist`` is queried with the joined
            # string rather than the original sequence, exactly as before --
            # confirm Distribution accepts that key form when n > 1.
            ngram = " ".join(ngram)
        s = ngram + "\t" + str(count) + "\t" + str(dist[ngram]) + "\t" + str(
            dist.information(ngram))
        print(s)

    print("Tokens: ", freqlist.tokens(), file=sys.stderr)
    print("Types: ", len(freqlist), file=sys.stderr)
    print("Type-token ratio: ", freqlist.typetokenratio(), file=sys.stderr)
    print("Entropy: ", dist.entropy(), file=sys.stderr)
def main():
    """Print an n-gram frequency list for the files named on the command line.

    Options (parsed with getopt from ``sys.argv``):
        -n N        n-gram size (default 1)
        -i          count case-insensitively
        -e ENC      character encoding of the input files (default utf-8)
        -h, --help  show usage information and exit

    For every (type, count) pair yielded by the frequency list, one
    tab-separated row is written to stdout: the type, its count,
    ``dist[type]`` and ``dist.information(type)``. Summary figures (tokens,
    types, type-token ratio, entropy) are written to stderr.

    Exits with status 2 when getopt rejects the command line, and with
    status 1 when an unhandled option is seen or no input files are given.
    """
    try:
        opts, files = getopt.getopt(sys.argv[1:], "hn:ie:", ["help"])
    except getopt.GetoptError as err:
        # print help information and exit:
        print(str(err), file=sys.stderr)
        usage()
        sys.exit(2)

    casesensitive = True
    encoding = 'utf-8'
    n = 1
    for o, a in opts:
        if o in ("-h", "--help"):
            # Declared in the getopt spec, so it must not be reported as an
            # unknown option.
            usage()
            sys.exit(0)
        elif o == "-n":
            n = int(a)
        elif o == "-i":
            casesensitive = False
        elif o == "-e":
            encoding = a
        else:
            print("ERROR: Unknown option:", o, file=sys.stderr)
            sys.exit(1)

    if not files:
        # The former ``print >>sys.stderr, ...`` form is Python 2 syntax and
        # raises TypeError under Python 3 before the message is shown.
        print("No files specified", file=sys.stderr)
        sys.exit(1)

    freqlist = FrequencyList(None, casesensitive)
    for filename in files:
        # Context manager closes the file even when a line fails to process.
        with codecs.open(filename, 'r', encoding) as f:
            for line in f:
                if n > 1:
                    freqlist.append(Windower(crude_tokenizer(line), n))
                else:
                    freqlist.append(crude_tokenizer(line))

    dist = Distribution(freqlist)
    for ngram, count in freqlist:
        if isinstance(ngram, (tuple, list)):
            # NOTE(review): ``dist`` is looked up with the joined string, as
            # in the previous code -- verify Distribution supports that key
            # form for n > 1.
            ngram = " ".join(ngram)
        s = ngram + "\t" + str(count) + "\t" + str(dist[ngram]) + "\t" + str(dist.information(ngram))
        print(s)

    print("Tokens: ", freqlist.tokens(), file=sys.stderr)
    print("Types: ", len(freqlist), file=sys.stderr)
    print("Type-token ratio: ", freqlist.typetokenratio(), file=sys.stderr)
    print("Entropy: ", dist.entropy(), file=sys.stderr)
def main():
    """Print an n-gram frequency list for the files named on the command line.

    Command-line arguments (parsed with argparse):
        -n, --ngramsize        n-gram size (default 1)
        -i, --caseinsensitive  count case-insensitively
        -e, --encoding         character encoding of the input files
                               (default utf-8)
        files                  one or more input files

    For every (type, count) pair yielded by the frequency list, one
    tab-separated row is written to stdout: the type, its count,
    ``dist[type]`` and ``dist.information(type)``. Summary figures (tokens,
    types, type-token ratio, entropy) are written to stderr.
    """
    parser = argparse.ArgumentParser(
        description="Generate an n-gram frequency list",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-n', '--ngramsize', help="N-gram size", type=int,
                        action='store', default=1)
    parser.add_argument('-i', '--caseinsensitive', help="Case insensitive",
                        action="store_true")
    parser.add_argument('-e', '--encoding', help="Character encoding",
                        type=str, action='store', default='utf-8')
    # The previous help text described a sampling tool ("must be of equal
    # size"), which does not apply to this program.
    parser.add_argument('files', type=str, nargs='+',
                        help="The input files to build the frequency list from")
    # nargs='+' makes argparse itself reject an empty file list, so no
    # separate "No files specified" check is needed after parsing.
    args = parser.parse_args()

    # The second FrequencyList argument is the case-*sensitive* flag in the
    # getopt-based variants of this entry point, so the -i switch has to be
    # negated; passing it through unchanged inverted its meaning.
    freqlist = FrequencyList(None, not args.caseinsensitive)
    for filename in args.files:
        # ``with`` guarantees the file is closed even if tokenizing fails.
        with io.open(filename, 'r', encoding=args.encoding) as f:
            for line in f:
                if args.ngramsize > 1:
                    freqlist.append(Windower(crude_tokenizer(line), args.ngramsize))
                else:
                    freqlist.append(crude_tokenizer(line))

    dist = Distribution(freqlist)
    for ngram, count in freqlist:
        if isinstance(ngram, (tuple, list)):
            # NOTE(review): ``dist`` is queried with the joined string, as in
            # the previous code -- confirm Distribution accepts that key form
            # when the n-gram size is above 1.
            ngram = " ".join(ngram)
        s = ngram + "\t" + str(count) + "\t" + str(dist[ngram]) + "\t" + str(dist.information(ngram))
        print(s)

    print("Tokens: ", freqlist.tokens(), file=sys.stderr)
    print("Types: ", len(freqlist), file=sys.stderr)
    print("Type-token ratio: ", freqlist.typetokenratio(), file=sys.stderr)
    print("Entropy: ", dist.entropy(), file=sys.stderr)
sys.exit(1) if not files: print >>sys.stderr, "No files specified" sys.exit(1) freqlist = FrequencyList(None, casesensitive) for filename in files: f = codecs.open(filename,'r',encoding) for line in f: if n > 1: freqlist.append(Windower(crude_tokenizer(line),n)) else: freqlist.append(crude_tokenizer(line)) f.close() dist = Distribution(freqlist) for type, count in freqlist: if isinstance(type,tuple) or isinstance(type,list): type = " ".join(type) s = type + "\t" + str(count) + "\t" + str(dist[type]) + "\t" + str(dist.information(type)) print s.encode('utf-8') print >>sys.stderr, "Tokens: ", freqlist.tokens() print >>sys.stderr, "Types: ", len(freqlist) print >>sys.stderr, "Type-token ratio: ", freqlist.typetokenratio() print >>sys.stderr, "Entropy: ", dist.entropy()
print("ERROR: Unknown option:", o, file=sys.stderr) sys.exit(1) if not files: print >> sys.stderr, "No files specified" sys.exit(1) freqlist = FrequencyList(None, casesensitive) for filename in files: f = codecs.open(filename, 'r', encoding) for line in f: if n > 1: freqlist.append(Windower(crude_tokenizer(line), n)) else: freqlist.append(crude_tokenizer(line)) f.close() dist = Distribution(freqlist) for type, count in freqlist: if isinstance(type, tuple) or isinstance(type, list): type = " ".join(type) s = type + "\t" + str(count) + "\t" + str(dist[type]) + "\t" + str( dist.information(type)) print(s) print("Tokens: ", freqlist.tokens(), file=sys.stderr) print("Types: ", len(freqlist), file=sys.stderr) print("Type-token ratio: ", freqlist.typetokenratio(), file=sys.stderr) print("Entropy: ", dist.entropy(), file=sys.stderr)
sys.exit(1) if not files: print >>sys.stderr, "No files specified" sys.exit(1) freqlist = FrequencyList(None, casesensitive) for filename in files: f = codecs.open(filename,'r',encoding) for line in f: if n > 1: freqlist.append(Windower(crude_tokenizer(line),n)) else: freqlist.append(crude_tokenizer(line)) f.close() dist = Distribution(freqlist) for type, count in freqlist: if isinstance(type,tuple) or isinstance(type,list): type = " ".join(type) s = type + "\t" + str(count) + "\t" + str(dist[type]) + "\t" + str(dist.information(type)) print(s) print("Tokens: ", freqlist.tokens(),file=sys.stderr) print("Types: ", len(freqlist),file=sys.stderr) print("Type-token ratio: ", freqlist.typetokenratio(),file=sys.stderr) print("Entropy: ", dist.entropy(),file=sys.stderr)