Ejemplo n.º 1
0
def main():
    """Print a frequency list of the tokens (or n-grams) in the given files.

    Command-line options, parsed with ``getopt``:

    * ``-n SIZE``: window size; values above 1 count n-grams via ``Windower``.
    * ``-i``: turn off case sensitivity (second ``FrequencyList`` argument).
    * ``-e ENCODING``: encoding used to open the input files (default utf-8).
    * ``-h`` / ``--help``: show the usage text and exit with status 0.

    For every (type, count) pair yielded by the frequency list, one
    tab-separated line is written to stdout: the type, its count,
    ``dist[type]`` and ``dist.information(type)``.  Token and type totals,
    the type-token ratio and the entropy are written to stderr.

    Exits with status 2 when getopt rejects the command line, and with
    status 1 for an unhandled option or when no files are given.
    """
    try:
        opts, files = getopt.getopt(sys.argv[1:], "hn:ie:", ["help"])
    except getopt.GetoptError as err:
        # Report the parse error, show the usage text and exit.
        print(str(err), file=sys.stderr)
        usage()
        sys.exit(2)

    casesensitive = True
    encoding = 'utf-8'
    n = 1

    for o, a in opts:
        if o in ("-h", "--help"):
            # Both spellings are declared in the getopt spec above, so they
            # must be handled here instead of being reported as unknown.
            usage()
            sys.exit(0)
        elif o == "-n":
            n = int(a)
        elif o == "-i":
            casesensitive = False
        elif o == "-e":
            encoding = a
        else:
            print("ERROR: Unknown option:", o, file=sys.stderr)
            sys.exit(1)

    if not files:
        # print() is a function here; the Python 2 "print >> stream" form
        # would raise TypeError instead of emitting the message.
        print("No files specified", file=sys.stderr)
        sys.exit(1)

    freqlist = FrequencyList(None, casesensitive)
    for filename in files:
        # The with-block closes the file even if decoding or counting fails.
        with codecs.open(filename, 'r', encoding) as f:
            for line in f:
                tokens = crude_tokenizer(line)
                if n > 1:
                    tokens = Windower(tokens, n)
                freqlist.append(tokens)

    dist = Distribution(freqlist)
    for item, count in freqlist:
        if isinstance(item, (tuple, list)):
            item = " ".join(item)
        # NOTE(review): for n-grams the lookups below use the joined string,
        # not the original tuple -- confirm Distribution accepts that key.
        fields = (item, str(count), str(dist[item]),
                  str(dist.information(item)))
        print("\t".join(fields))

    print("Tokens:           ", freqlist.tokens(), file=sys.stderr)
    print("Types:            ", len(freqlist), file=sys.stderr)
    print("Type-token ratio: ", freqlist.typetokenratio(), file=sys.stderr)
    print("Entropy:          ", dist.entropy(), file=sys.stderr)
Ejemplo n.º 2
0
def main():
    """Build and print an n-gram frequency list for the files in ``sys.argv``.

    Options (``getopt`` spec ``"hn:ie:"`` plus ``--help``):

    * ``-n SIZE``: n-gram size; above 1 each line is wrapped in ``Windower``.
    * ``-i``: pass ``False`` as the case-sensitivity flag of
      ``FrequencyList``.
    * ``-e ENCODING``: encoding for reading the input files (default utf-8).
    * ``-h`` / ``--help``: print usage and exit with status 0.

    Output: one tab-separated line per type on stdout (type, count,
    ``dist[type]``, ``dist.information(type)``), followed by token count,
    type count, type-token ratio and entropy on stderr.

    Exit status is 2 for a getopt parse error and 1 for an unhandled
    option or an empty file list.
    """
    try:
        opts, files = getopt.getopt(sys.argv[1:], "hn:ie:", ["help"])
    except getopt.GetoptError as err:
        # Print the parse error and the help information, then exit.
        print(str(err), file=sys.stderr)
        usage()
        sys.exit(2)

    casesensitive = True
    encoding = 'utf-8'
    n = 1

    for o, a in opts:
        if o == "-n":
            n = int(a)
        elif o == "-i":
            casesensitive = False
        elif o == "-e":
            encoding = a
        elif o in ("-h", "--help"):
            # Declared in the getopt spec, so answer it with the usage text
            # rather than treating it as an unknown option.
            usage()
            sys.exit(0)
        else:
            print("ERROR: Unknown option:", o, file=sys.stderr)
            sys.exit(1)

    if not files:
        # Must use the print() function: "print >>sys.stderr, ..." raises
        # TypeError when print is a function.
        print("No files specified", file=sys.stderr)
        sys.exit(1)

    freqlist = FrequencyList(None, casesensitive)
    for filename in files:
        # Context manager guarantees the handle is closed on any exception.
        with codecs.open(filename, 'r', encoding) as f:
            for line in f:
                if n > 1:
                    freqlist.append(Windower(crude_tokenizer(line), n))
                else:
                    freqlist.append(crude_tokenizer(line))

    dist = Distribution(freqlist)
    for entry, count in freqlist:
        if isinstance(entry, (tuple, list)):
            entry = " ".join(entry)
        # NOTE(review): dist is queried with the joined string for n-grams,
        # not with the original tuple -- verify Distribution supports this.
        print("\t".join([
            entry,
            str(count),
            str(dist[entry]),
            str(dist.information(entry)),
        ]))

    print("Tokens:           ", freqlist.tokens(), file=sys.stderr)
    print("Types:            ", len(freqlist), file=sys.stderr)
    print("Type-token ratio: ", freqlist.typetokenratio(), file=sys.stderr)
    print("Entropy:          ", dist.entropy(), file=sys.stderr)
Ejemplo n.º 3
0
def main():
    """Generate an n-gram frequency list for the files given on the command line.

    Arguments are parsed with ``argparse``:

    * ``-n`` / ``--ngramsize``: n-gram size (default 1); above 1 each line
      is wrapped in ``Windower``.
    * ``-i`` / ``--caseinsensitive``: count case-insensitively.
    * ``-e`` / ``--encoding``: encoding of the input files (default utf-8).
    * ``files``: one or more input files.

    One tab-separated line per type (type, count, ``dist[type]``,
    ``dist.information(type)``) is printed to stdout; token count, type
    count, type-token ratio and entropy are printed to stderr.  argparse
    itself exits with status 2 on an invalid command line.
    """
    parser = argparse.ArgumentParser(description="Generate an n-gram frequency list", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-n','--ngramsize', help="N-gram size", type=int, action='store',default=1)
    parser.add_argument('-i','--caseinsensitive', help="Case insensitive", action="store_true")
    parser.add_argument('-e','--encoding', help="Character encoding", type=str, action='store',default='utf-8')
    # NOTE(review): this help text talks about sampling equally sized data
    # sets, which this function never does -- looks copied from another tool.
    parser.add_argument('files', type=str, nargs='+', help="The data sets to sample from, must be of equal size (i.e., same number of lines)")

    args = parser.parse_args()
    # No explicit "no files" check: nargs='+' already makes argparse reject
    # an empty file list before parse_args() returns.

    # The second FrequencyList argument is the case-*sensitivity* flag, so
    # the command-line switch has to be negated.
    freqlist = FrequencyList(None, not args.caseinsensitive)
    for filename in args.files:
        # The with-block closes the file even if decoding or counting fails.
        with io.open(filename, 'r', encoding=args.encoding) as f:
            for line in f:
                tokens = crude_tokenizer(line)
                if args.ngramsize > 1:
                    tokens = Windower(tokens, args.ngramsize)
                freqlist.append(tokens)

    dist = Distribution(freqlist)
    for item, count in freqlist:
        if isinstance(item, (tuple, list)):
            item = " ".join(item)
        # NOTE(review): for n-grams the lookups below use the joined string,
        # not the original tuple -- confirm Distribution accepts that key.
        fields = (item, str(count), str(dist[item]),
                  str(dist.information(item)))
        print("\t".join(fields))

    print("Tokens:           ", freqlist.tokens(), file=sys.stderr)
    print("Types:            ", len(freqlist), file=sys.stderr)
    print("Type-token ratio: ", freqlist.typetokenratio(), file=sys.stderr)
    print("Entropy:          ", dist.entropy(), file=sys.stderr)
Ejemplo n.º 4
0
        sys.exit(1)

if not files:
    print >>sys.stderr, "No files specified"
    sys.exit(1)

freqlist = FrequencyList(None, casesensitive)
for filename in files:
    f = codecs.open(filename,'r',encoding)
    for line in f:
        if n > 1:
            freqlist.append(Windower(crude_tokenizer(line),n))
        else:
            freqlist.append(crude_tokenizer(line))

    f.close()

dist = Distribution(freqlist)
for type, count in freqlist:
    if isinstance(type,tuple) or isinstance(type,list):
        type = " ".join(type)
    s =  type + "\t" + str(count) + "\t" + str(dist[type]) + "\t" + str(dist.information(type))
    print s.encode('utf-8')

print >>sys.stderr, "Tokens:           ", freqlist.tokens()
print >>sys.stderr, "Types:            ", len(freqlist)
print >>sys.stderr, "Type-token ratio: ", freqlist.typetokenratio()
print >>sys.stderr, "Entropy:          ", dist.entropy()


Ejemplo n.º 5
0
        print("ERROR: Unknown option:", o, file=sys.stderr)
        sys.exit(1)

# Script body: build a frequency list over `files` and print it.
# `files`, `casesensitive`, `encoding` and `n` are set by the option parsing
# that precedes this section.
if not files:
    # print() is used as a function throughout this script; the Python 2
    # "print >> stream" form would raise TypeError instead of reporting.
    print("No files specified", file=sys.stderr)
    sys.exit(1)

# Count plain tokens (n == 1) or n-gram windows (n > 1), line by line.
freqlist = FrequencyList(None, casesensitive)
for filename in files:
    # The with-block closes the file even if decoding or counting raises.
    with codecs.open(filename, 'r', encoding) as f:
        for line in f:
            tokens = crude_tokenizer(line)
            if n > 1:
                tokens = Windower(tokens, n)
            freqlist.append(tokens)

# One tab-separated line per type on stdout.
dist = Distribution(freqlist)
for item, count in freqlist:
    if isinstance(item, (tuple, list)):
        item = " ".join(item)
    # NOTE(review): for n-grams the lookups below use the joined string, not
    # the original tuple -- confirm Distribution accepts that key.
    fields = (item, str(count), str(dist[item]), str(dist.information(item)))
    print("\t".join(fields))

# Summary statistics go to stderr so they stay out of the list on stdout.
print("Tokens:           ", freqlist.tokens(), file=sys.stderr)
print("Types:            ", len(freqlist), file=sys.stderr)
print("Type-token ratio: ", freqlist.typetokenratio(), file=sys.stderr)
print("Entropy:          ", dist.entropy(), file=sys.stderr)
Ejemplo n.º 6
0
        sys.exit(1)

# Script body: count tokens or n-grams in `files` and print the result.
# `files`, `casesensitive`, `encoding` and `n` come from the option parsing
# that precedes this section.
if not files:
    # The rest of the script calls print() as a function, so the old
    # "print >>sys.stderr, ..." statement form would raise TypeError here.
    print("No files specified", file=sys.stderr)
    sys.exit(1)

freqlist = FrequencyList(None, casesensitive)
for filename in files:
    # Context manager guarantees the handle is closed on any exception.
    with codecs.open(filename, 'r', encoding) as f:
        for line in f:
            if n > 1:
                # n-gram mode: feed sliding windows of n tokens.
                freqlist.append(Windower(crude_tokenizer(line), n))
            else:
                freqlist.append(crude_tokenizer(line))

dist = Distribution(freqlist)
for entry, count in freqlist:
    if isinstance(entry, (tuple, list)):
        entry = " ".join(entry)
    # NOTE(review): dist is queried with the joined string for n-grams, not
    # with the original tuple -- verify Distribution supports this.
    print("\t".join([
        entry,
        str(count),
        str(dist[entry]),
        str(dist.information(entry)),
    ]))

# Totals are reported on stderr, separate from the list on stdout.
print("Tokens:           ", freqlist.tokens(), file=sys.stderr)
print("Types:            ", len(freqlist), file=sys.stderr)
print("Type-token ratio: ", freqlist.typetokenratio(), file=sys.stderr)
print("Entropy:          ", dist.entropy(), file=sys.stderr)