def process(dict_names, gram_fname, xml, encoding):
    """Parse the given dictionary inputs and print a structural analysis.

    The chunk grammar text is read from ``gram_fname`` and handed to
    ``parse_corpus`` together with ``dict_names``, ``encoding`` and
    ``errors="replace"``.  The resulting lexicon is passed to ``count_mkrs``
    and ``analyse_dict``, and a plain-text report is printed: the input
    names, the grammar text, every pattern found for each structure (most
    frequent first, ties ordered by pattern), and a per-marker count table.

    :param dict_names: sequence of strings given to ``parse_corpus``; also
        echoed one per line at the top of the report.
    :param gram_fname: path of the text file holding the chunk grammar.
    :param xml: path of a file to receive the indented lexicon serialised
        with ``ET.tostring(..., encoding="UTF-8")``; any false value skips
        the XML output.
    :param encoding: value forwarded unchanged to ``parse_corpus``.
    :return: None; results are printed and, optionally, written to ``xml``.
    """
    # try/finally (rather than ``with``) guarantees the handle is closed
    # even if read() raises, without requiring a newer Python 2 release.
    gram_file = open(gram_fname, "r")
    try:
        gram = gram_file.read()
    finally:
        gram_file.close()

    lexicon = parse_corpus(dict_names, grammar=gram, encoding=encoding, errors="replace")
    mkr_counts, nonblank_mkr_counts = count_mkrs(lexicon)
    analysis = analyse_dict(lexicon)

    if xml:
        # The lexicon is analysed above before indent() touches it.
        indent(lexicon)
        out_file = open(xml, "w")
        try:
            out_file.write(ET.tostring(lexicon, encoding="UTF-8"))
        finally:
            out_file.close()

    # Each print takes one parenthesised string, which produces exactly the
    # same output as the bare Python 2 print statement.
    print("analysing files\n%s\n" % "\n".join(dict_names))
    if xml:
        print('XML lexicon output in file "%s"\n' % xml)
    print("====chunk grammar====")
    print(gram)
    print("\n")

    # Patterns with more occurrences than this are summarised, not listed.
    max_positions = 30
    for structure, patt_dict in analysis.items():
        print("\n\n===%s===: total= %d" % (structure, pattern_count(patt_dict)))
        # Most frequent patterns first; equal counts fall back to the
        # pattern itself so the ordering is deterministic.
        for pattern, positions in sorted(patt_dict.items(), key=lambda t: (-len(t[1]), t[0])):
            if len(positions) <= max_positions:
                pos_str = "Entries: %s" % ", ".join(positions)
            else:
                pos_str = "Too many entries to list."
            print("\t%5d:  %s %s" % (len(positions), ":".join(pattern), pos_str))

    print("\n\n")
    print("mkr\tcount\tnonblank")
    for mkr in mkr_counts:
        print("%s\t%5d\t%5d" % (mkr, mkr_counts.get(mkr, 0), nonblank_mkr_counts.get(mkr, 0)))
Example #2
0
def process(dict_names, gram_fname, xml, encoding):
    """Parse the given dictionary inputs and print a structural analysis.

    The chunk grammar text is read from ``gram_fname`` and handed to
    ``parse_corpus`` together with ``dict_names``, ``encoding`` and
    ``errors='replace'``.  The resulting lexicon is passed to ``count_mkrs``
    and ``analyse_dict``, and a plain-text report is printed: the input
    names, the grammar text, every pattern found for each structure (most
    frequent first, ties ordered by pattern), and a per-marker count table.

    :param dict_names: sequence of strings given to ``parse_corpus``; also
        echoed one per line at the top of the report.
    :param gram_fname: path of the text file holding the chunk grammar.
    :param xml: path of a file to receive the indented lexicon serialised
        with ``ET.tostring(..., encoding='UTF-8')``; any false value skips
        the XML output.
    :param encoding: value forwarded unchanged to ``parse_corpus``.
    :return: None; results are printed and, optionally, written to ``xml``.
    """
    # try/finally (rather than ``with``) guarantees the handle is closed
    # even if read() raises, without requiring a newer Python 2 release.
    gram_file = open(gram_fname, 'r')
    try:
        gram = gram_file.read()
    finally:
        gram_file.close()

    lexicon = parse_corpus(dict_names,
                           grammar=gram,
                           encoding=encoding,
                           errors='replace')
    mkr_counts, nonblank_mkr_counts = count_mkrs(lexicon)
    analysis = analyse_dict(lexicon)

    if xml:
        # The lexicon is analysed above before indent() touches it.
        indent(lexicon)
        out_file = open(xml, "w")
        try:
            out_file.write(ET.tostring(lexicon, encoding='UTF-8'))
        finally:
            out_file.close()

    # Each print takes one parenthesised string, which produces exactly the
    # same output as the bare Python 2 print statement.
    print('analysing files\n%s\n' % '\n'.join(dict_names))
    if xml:
        print('XML lexicon output in file "%s"\n' % xml)
    print('====chunk grammar====')
    print(gram)
    print('\n')

    # Patterns with more occurrences than this are summarised, not listed.
    max_positions = 30
    for structure, patt_dict in analysis.items():
        print('\n\n===%s===: total= %d' % (structure,
                                           pattern_count(patt_dict)))
        # Most frequent patterns first; equal counts fall back to the
        # pattern itself so the ordering is deterministic.
        for pattern, positions in sorted(patt_dict.items(),
                                         key=lambda t: (-len(t[1]), t[0])):
            if len(positions) <= max_positions:
                pos_str = 'Entries: %s' % ', '.join(positions)
            else:
                pos_str = 'Too many entries to list.'
            print("\t%5d:  %s %s" % (len(positions), ':'.join(pattern),
                                     pos_str))

    print("\n\n")
    print('mkr\tcount\tnonblank')
    for mkr in mkr_counts:
        print('%s\t%5d\t%5d' % (mkr, mkr_counts.get(mkr, 0),
                                nonblank_mkr_counts.get(mkr, 0)))
Example #3
0
#!/usr/bin/env python

"""
Build the corpus package index.  Usage:

  build_pkg_index.py <path-to-packages> <base-url> <output-file>
"""

xml_header = """<?xml version="1.0"?>
<?xml-stylesheet href="index.xsl" type="text/xsl"?>
"""

import sys
from nltk.downloader import build_index
from nltk.etree import ElementTree

# Exactly three command-line arguments are required; otherwise print the
# usage text and exit with status -1.
if len(sys.argv) != 4:
    # Single parenthesised strings print identically under the Python 2
    # print statement.
    print("Usage: ")
    print("build_pkg_index.py <path-to-packages> <base-url> <output-file>")
    sys.exit(-1)

ROOT, BASE_URL, OUT = sys.argv[1:]

# Build and serialise the index before the output file is opened, so a
# failure in either step never truncates an existing output file.
index = build_index(ROOT, BASE_URL)
s = ElementTree.tostring(index)

# Write the XML/stylesheet header followed by the serialised index.  The
# try/finally closes the file even when one of the writes raises.
out = open(OUT, 'w')
try:
    out.write(xml_header)
    out.write(s)
finally:
    out.close()

Example #4
0
#!/usr/bin/env python
"""
Build the corpus package index.  Usage:

  build_pkg_index.py <path-to-packages> <base-url> <output-file>
"""

xml_header = """<?xml version="1.0"?>
<?xml-stylesheet href="index.xsl" type="text/xsl"?>
"""

import sys
from nltk.downloader import build_index
from nltk.etree import ElementTree

# Exactly three command-line arguments are required; otherwise print the
# usage text and exit with status -1.
if len(sys.argv) != 4:
    # Single parenthesised strings print identically under the Python 2
    # print statement.
    print("Usage: ")
    print("build_pkg_index.py <path-to-packages> <base-url> <output-file>")
    sys.exit(-1)

ROOT, BASE_URL, OUT = sys.argv[1:]

# Build and serialise the index before the output file is opened, so a
# failure in either step never truncates an existing output file.
index = build_index(ROOT, BASE_URL)
s = ElementTree.tostring(index)

# Write the XML/stylesheet header followed by the serialised index.  The
# try/finally closes the file even when one of the writes raises.
out = open(OUT, 'w')
try:
    out.write(xml_header)
    out.write(s)
finally:
    out.close()