Example #1
0
 def countWords(self, which):
     """Count the words of one text portion and write the results to disk.

     The text selected by ``which`` is fetched with ``self.readText`` and
     counted by ``counts.countWords``.  The statistics are printed to
     stdout, the coverage report is written to
     ``<target_>/<name>-<which>.cov.gz`` and the counts are exported to
     ``<target_>/<name>-<which>.counts.gz``.
     """
     wordCounts = counts.countWords(self.readText(which))
     wordCounts.printStat(sys.stdout)

     # Both output files share the same "<name>-<which>" stem.
     stem = os.path.join(self.target_, self.name + '-' + which)

     # NOTE(review): the streams are handed over without an explicit close;
     # presumably they are released once the temporaries are collected --
     # confirm against the uopen/zopen helpers.
     wordCounts.reportCoverage(uopen(stem + '.cov.gz', self.encoding, 'w'))
     wordCounts.exportText(zopen(stem + '.counts.gz', 'w'))
Example #2
0
    def parse(self, trsFileName='-'):
        # add unknown speaker
        self.spkIdDict['unk'] = self.corpusParser.speaker({
            'id': 'unk',
            'type': 'unknown',
            'gender': 'unknown'
        })

        self.trsFileName = trsFileName
        trsFd = zopen(self.trsFileName, 'r')
        self.parser.parse(trsFd)
        zclose(trsFd)
        print trsFileName, '-->'
Example #3
0
 def parseFile(self, fname):
     """Read ``fname`` completely and feed its content to ``self.parser``.

     Content that starts with ``<?xml`` is fed as is.  Anything else is
     treated as a fragment: an ISO-8859-1 XML declaration is fed first and
     the data is wrapped in a ``<sprint>`` root element.

     The file handle is released via ``zclose`` even when reading fails.
     """
     fd = zopen(fname, 'r')
     try:
         data = fd.read()
     finally:
         # close the handle even if read() raises
         zclose(fd)

     self.reset()
     if data.startswith('<?xml'):
         self.parser.feed(data)
     else:
         # no XML declaration: supply one and a surrounding root element
         self.parser.feed('<?xml version="1.0" encoding="ISO-8859-1"?>')
         self.parser.feed('<sprint>')
         self.parser.feed(data)
         self.parser.feed('</sprint>')
     self.parser.close()
Example #4
0
def main(options, args):
    """Load counts, apply the requested mappers and store the results.

    args[0] names the counts to load, every following argument is a key
    into ``mappersByName``.  With fewer than two arguments nothing is done.
    The resulting mapper is stored to ``options.map`` and the new counts
    are exported to ``options.counts`` when the respective option is set.
    """
    if len(args) < 2:
        return

    countsFile = args[0]
    mapperNames = args[1:]

    counts = loadCounts(countsFile)
    counts.printStat(sys.stdout)
    # Python 2 print statement: emits an empty line after the statistics
    print

    selectedMappers = [mappersByName[name] for name in mapperNames]
    mapper, newCounts = makeMapperIterated(counts, selectedMappers)

    if options.map:
        mapper.store(options.map)
    if options.counts:
        # NOTE(review): the output stream is not closed explicitly here --
        # confirm that exportText or garbage collection takes care of it.
        newCounts.exportText(zopen(options.counts, 'w'))
Example #5
0
 def parse(self, path):
     """Feed the XML file at ``path`` row by row to ``self.__sax_parser__``.

     The rows are scanned for the ``<?xml ...>`` header to determine the
     encoding; without an ``encoding`` attribute
     ``self.__default_encoding__`` is used and a warning is written to
     stderr.  Every row consumed during that scan is replayed to the
     parser before the remaining rows are fed.

     If the handler provides ``startFile`` / ``endFile`` they are called
     before and after parsing; a missing hook is silently ignored.

     When no header is found at all, an error is written to stderr and the
     process exits with status 1.  The file handle is always released via
     ``zclose``.
     """
     fd = zopen(path, 'r')
     try:
         # determine encoding
         # - assume everything to be ascii-encoded until <?xml ...> is found
         # - assume <?xml ...> is not split over several lines
         history = []
         reEncoding = re.compile(r'<\?xml[^>]* encoding="([^"]*)"')
         try:
             row = fd.next()
             history.append(row)
             while row.find('<?xml') == -1:
                 row = fd.next()
                 # remember every consumed row; otherwise the header row
                 # (and anything before it) would never reach the parser
                 history.append(row)
         except StopIteration:
             print >> sys.stderr, 'Error: no xml header <?xml ...> found; "' + path + '" is probably not a proper xml-file.'
             sys.exit(1)
         m = reEncoding.search(row)
         if m:
             encoding = m.group(1)
         else:
             encoding = self.__default_encoding__
             print >> sys.stderr, 'Warning: no encoding specified, use "' + encoding + '"'
         # parse file
         parser = self.__sax_parser__
         try:
             self.__handler__.startFile(path, encoding)
         except AttributeError:
             # the startFile hook is optional
             pass
         # replay the rows consumed while looking for the header
         for row in history:
             parser.feed(row)
         del history
         for row in fd:
             parser.feed(row)
         try:
             self.__handler__.endFile(path)
         except AttributeError:
             # the endFile hook is optional
             pass
     finally:
         # release the handle on every exit path, including sys.exit()
         zclose(fd)
Example #6
0
 def accept(self, filename, visitor):
     """Parse ``filename`` while ``visitor`` is installed as ``self.visitor``.

     ``self.corpusDir`` is set to the directory part of ``filename`` and the
     complete file content is handed to ``self.parse``.  ``self.visitor``
     is reset to ``None`` afterwards, also when opening or parsing raises;
     the exception is propagated unchanged.
     """
     self.visitor = visitor
     try:
         self.corpusDir = os.path.dirname(filename)
         self.parse(zopen(filename).read())
     finally:
         # never leave a stale visitor behind after a failed run
         self.visitor = None
Example #7
0
 def store(self, filename):
     """Write all entries of ``self.maps`` to ``filename``.

     Each entry of each map becomes one line holding the phrase, the
     string `` -> `` and the replacement, separated by single spaces as
     produced by the print statement.
     """
     out = zopen(filename, 'w')
     # NOTE(review): the stream is never closed explicitly in this method;
     # confirm that the data is flushed when the object is collected.
     for mapping in self.maps:
         for source, target in mapping.iteritems():
             print >> out, source, ' -> ', target
Example #8
0
 def __init__(self, filename=None):
     """Start with an empty list of maps and optionally load ``filename``.

     When ``filename`` is truthy, the file is opened with ``zopen`` and
     passed to ``self.load``.
     """
     self.maps = []
     if not filename:
         return
     self.load(zopen(filename))