Example #1
    def words():
        for fname in args:
            for line in uopen(fname, options.encoding):
                if line.startswith('#'):
                    continue
                rank, count, cov1, cov2, word = line.split()
                count = int(count)
                yield word, count
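The helpers uopen and uclose used throughout these examples are not shown here. For orientation, a minimal sketch of a compatible implementation, assuming they simply wrap codecs.open and treat '-' as the standard streams (the real library may handle the encoding of stdin/stdout differently):

import codecs
import sys

def uopen(filename, encoding='utf-8', mode='r'):
    # Sketch only: '-' selects a standard stream, mirroring uopen('-', ...) in Example #2.
    if filename == '-':
        return sys.stdout if ('w' in mode or 'a' in mode) else sys.stdin
    return codecs.open(filename, mode, encoding)

def uclose(fd):
    # Sketch only: never close the shared standard streams.
    if fd not in (sys.stdin, sys.stdout, sys.stderr):
        fd.close()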
Example #2
def main(options, args):
    def words():
        # Yield (word, count) pairs from the word-count files given on the command line.
        for fname in args:
            for line in uopen(fname, options.encoding):
                if line.startswith('#'):
                    continue
                rank, count, cov1, cov2, word = line.split()
                count = int(count)
                yield word, count

    tests = []

    if options.charset:
        charset = CharSet(options.charset)
        tests.append(CharacterTest(charset.charset))

    if options.auto_examples:
        autoStrangeness = PerplexityCalculator()
        autoStrangeness.estimateFromExamplesWithCounts(words())
        tests.append(autoStrangeness)

    if options.examples:
        modelStrangeness = findStrangeWords(options)
        tests.append(modelStrangeness)

    # Scoring each word needs both perplexity models, so only run this pass
    # when options.examples and options.auto_examples are both set.
    if options.examples and options.auto_examples:
        out = uopen('-', options.encoding, 'w')
        for word, count in words():
            modelPpl, modelMinProb = modelStrangeness(word)
            autoPpl, autoMinProb = autoStrangeness(word)
            print >> out, '%s\t%f\t%f\t%f\t%f\t%s' % (count, modelPpl, modelMinProb, autoPpl, autoMinProb, word)

    for word, count in words():
        for test in tests:
            test(word, count)
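The words() generator assumes whitespace-separated count files with five columns (rank, count, two coverage values, word) and '#' comment lines. A self-contained illustration with an invented input line, just to make the column layout explicit:

# Invented count-file line; real files are produced elsewhere and may differ.
line = '1 52341 0.031 0.052 the'
rank, count, cov1, cov2, word = line.split()
assert (word, int(count)) == ('the', 52341)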
Example #3
    def read(self, filename):
        """
        Character set files are UTF-8 encoded, with three columns
        separated by tabs: Unicode codepoint in hexadecimal,
        character, English description.
        """
        for line in uopen(filename, 'UTF-8'):
            # Strip the trailing newline so the description column stays clean.
            fields = line.rstrip('\n').split('\t')
            codepoint, char, description = fields
            assert codepoint.startswith('U+')
            codepoint = int(codepoint[2:], 16)
            assert ord(char) == codepoint
            self.charset.add(char)
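Based only on the format described in the docstring above, one character set line would look like the following; U+0041 really is the Latin capital letter A, but the line itself is invented rather than taken from an actual charset file:

# One invented charset line: codepoint, character and description separated by tabs.
line = u'U+0041\tA\tLATIN CAPITAL LETTER A\n'
codepoint, char, description = line.rstrip('\n').split('\t')
assert int(codepoint[2:], 16) == ord(char) == 0x41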
Example #4
    def convert(self):
        """Read the old cart tree and convert it to the new XML format."""
        self._writer = openXml(self._pathToNewTree, self._newEncoding)
        self._setNumberOfCartClasses()
        self._reader = uopen(self._pathToOldTree)
        self._writer.open("decision-tree")
        self._convertPhonemeList()
        self._convertQuestionList()
        self._convertBinaryTree()
        self._writer.close("decision-tree")
        closeXml(self._writer)
        uclose(self._reader)
Example #5
    def openOutputFile(self, filename):
        dirName = os.path.dirname(filename)
        if dirName and not os.path.exists(dirName):
            os.makedirs(dirName)
        if os.path.exists(filename):
            if self.options.force_overwrite:
                print >> sys.stderr, 'File "%s" already exists. Overwriting as requested.' % filename
            else:
                print >> sys.stderr, 'Failed to create "%s". File already exists.' % filename
                return None
        return uopen(filename, self.options.encoding, 'w')
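A hypothetical call site for openOutputFile; 'tool' stands for an instance of the surrounding class and the path is invented. It only illustrates the contract that None is returned when the file already exists and force_overwrite is not set:

# Hypothetical usage; 'tool' is an instance of the class defining openOutputFile.
out = tool.openOutputFile('results/scores.txt')
if out is not None:
    out.write(u'word\tscore\n')
    uclose(out)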
Example #6
    def __init__(self, options):
        self.options = options
        try:
            self.selectFields(options.fields)
        except:
            print >> sys.stderr, 'error selecting fields:', options.fields
            raise
        if self.isActive():
            resultFile = uopen(options.resultFile, options.encoding, 'w')
            if options.bootlog:
                self.format = TableFormatter(resultFile, self.fields)
            elif options.gui:
                self.format = QtFormatter(resultFile, self.fields)
            else:
                self.format = PrettyFormatter(resultFile, self.fields)
Example #7
    def ensureMissingWordList(self):
        if not self.missingWordFile:
            raise RuntimeError('cannot get missing words')
        if self.exists(self.missingWordFile):
            print >> sys.stderr, 'missing word file already exists:', self.missingWordFile
        else:
            if self.setCostaLog():
                print >> sys.stderr, 'get missing words from costa-log-file', self.costaLogFile
                self.missingWords = self.costaLog.getMissingWordList()
                fd = uopen(self.missingWordFile, self.encoding, 'w')
                print >> sys.stderr, 'store missing words:', fd.name
                for word in self.missingWords:
                    fd.write(word + '\n')
                uclose(fd)
            else:
                raise RuntimeError('cannot create missing word file: %s' % self.missingWordFile)
Example #8
    def _setNumberOfCartClasses(self):
        """
        Read the old cart file and determine the maximal index of a cart class.
        This is needed because the index of the silence class is set to this value.
        """
        self._reader = uopen(self._pathToOldTree)
        reachedTree = False
        while True:
            line = self._reader.readline()
            if not line:
                break  # end of file reached before or inside the tree section
            text = line.strip()
            if not reachedTree:
                # the tree section starts with the first line that opens a parenthesis
                if len(text) > 0 and text[0] == '(':
                    reachedTree = True
            elif text == '':
                break
            else:
                text = text.strip("()")
                tupel = list(map(int, text.split(',')))
                if len(tupel) == 2:
                    self._maxIndexOfCartClasses = max(
                        self._maxIndexOfCartClasses, tupel[0])
        uclose(self._reader)
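The layout of the old cart file is not documented in these examples; the loop above only assumes that the tree section starts at the first line beginning with '(' and then contains lines of the form '(classIndex, value)'. A small invented fragment consistent with that parsing logic:

# Invented old-cart-tree fragment, shaped only by the parsing code above.
fakeOldTree = [
    'some header line',
    '(0, 4)',   # first '(' line: switches reachedTree on, is not parsed itself
    '(1, 7)',
    '(5, 2)',
    '',         # a blank line ends the tree section
]
maxIndex = -1
reachedTree = False
for text in fakeOldTree:
    if not reachedTree:
        reachedTree = len(text) > 0 and text[0] == '('
    elif text == '':
        break
    else:
        pair = list(map(int, text.strip('()').split(',')))
        if len(pair) == 2:
            maxIndex = max(maxIndex, pair[0])
assert maxIndex == 5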
Example #9
def findStrangeWords(filename, encoding):
    """
    Generate a letter N-gram model, and determine the words with the
    highest perplexities according to this model.
    """
    # Cache the trained model in a pickle file and reuse it as long as it is
    # newer than the word list it was trained on.
    picName = os.path.splitext(os.path.basename(filename))[0] + '.pic'
    if os.path.isfile(picName) and os.stat(picName)[stat.ST_MTIME] >= os.stat(filename)[stat.ST_MTIME]:
        f = open(picName, 'rb')
        orthographicPerplexity = pickle.load(f)
        f.close()
    else:
        orths = []
        for line in uopen(filename, encoding):
            orths.append(line.strip())
        orthographicPerplexity = PerplexityCalculator()
        orthographicPerplexity.estimateFromExamples(orths)

        f = open(picName, 'wb')
        pickle.dump(orthographicPerplexity, f, pickle.HIGHEST_PROTOCOL)
        f.close()

    return orthographicPerplexity
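Combining this with how the returned object is used in Example #2 (it is callable and yields a (perplexity, minProb) pair per word), a hedged usage sketch with an invented word-list file name:

# Hypothetical usage; 'word-list.txt' is an invented input file.
modelStrangeness = findStrangeWords('word-list.txt', 'utf-8')
ppl, minProb = modelStrangeness(u'xqzzkj')   # unusual letter sequences should score a high perplexity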
Example #10
def openXml(filename, encoding='ascii'):
    xml = XmlWriter(uopen(filename, encoding, 'w'), encoding)
    return xml
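A hypothetical round trip with the XML helpers, using only calls that appear elsewhere in these examples (openXml here, the tag-level open/close and closeXml in Example #4); the file name is invented and the element name follows Example #4:

# Hypothetical usage of the XML helpers.
xml = openXml('tree.xml', 'utf-8')
xml.open('decision-tree')
# ... write child elements here ...
xml.close('decision-tree')
closeXml(xml)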
Example #11
def openXml(filename, encoding='ascii', mode='w'):
    xml = XmlWriter(uopen(filename, encoding, mode), encoding)
    xml.begin()
    return xml
Example #12
def main(argv):
    defaultEncoding = "utf8"

    usage = "usage: %prog [options] <corpus name> <corpus outfile>\n " + __doc__
    optionParser = OptionParser(usage=usage)
    optionParser.add_option("-E",
                            "--encoding",
                            default=defaultEncoding,
                            dest="encoding",
                            help="encoding [" + defaultEncoding + "]")
    optionParser.add_option(
        "-f",
        "--file",
        dest="sourcefilename",
        help="source filename with at least sentence IDs and orthographies specified in a file")
    optionParser.add_option("-F",
                            "--Field",
                            default="orth",
                            dest="orthfieldname",
                            help="orth field name, e.g. orth or zip [orth]")
    optionParser.add_option("-d",
                            "--delimiter",
                            default=";",
                            dest="delimiter",
                            help="field delimiter in the plain file [;]")
    optionParser.add_option(
        "-a",
        "--all",
        dest="splitall",
        action="store_true",
        help="split every record into a single corpus file")
    optionParser.add_option("-w",
                            "--wordcount",
                            dest="wordlist",
                            action="store_true",
                            help="print word list")
    optionParser.add_option("-v",
                            "--verbose",
                            dest="verbose",
                            action="store_true")
    (options, args) = optionParser.parse_args()

    if len(args) != 2:
        # OptionParser.error() prints the message and exits.
        optionParser.error("incorrect number of arguments %d" % len(args))

    #set filenames
    corpusname = args[0]
    corpusFilename = args[1]

    if not corpusFilename.endswith(".corpus"):
        corpusFilename += ".corpus"
    recordingsFilename = corpusFilename[:corpusFilename.rfind('.')] + ".recordings"
    speakerDescriptionFilename = corpusFilename[:corpusFilename.rfind('.')] + ".speaker"

    if options.verbose:
        print "corpusFilename =", corpusFilename
        print "speakerDescriptionFilename =", speakerDescriptionFilename
        print "recordingsFilename =", recordingsFilename

    #create corpus include structure
    newCorpus = Corpus(corpusname, encoding="utf8")
    newCorpus.includeFile(os.path.abspath(speakerDescriptionFilename))
    newCorpus.includeFile(os.path.abspath(recordingsFilename))
    newCorpus.save(corpusFilename)

    # create default speaker description file
    speakers = []
    speakerDescription = Speaker(corpusname, encoding="utf8")

    #read sentence IDs and structure information
    sentenceIDsFile = uopen(options.sourcefilename, options.encoding, 'r')
    # sentenceIDsFile = open(options.sourcefilename,'r')
    firstLine = sentenceIDsFile.readline()[:-1]
    #   fieldList = unicode(firstLine, options.encoding).split(options.delimiter)
    fieldList = firstLine.split(options.delimiter)
    if options.verbose:
        print "structure:", firstLine, fieldList
    fieldMap = {}
    for fieldId, field in enumerate(fieldList):
        fieldMap[field] = fieldId
        if options.verbose:
            print field, fieldId

    # check required fields
    if 'name' not in fieldMap or 'video' not in fieldMap or 'orth' not in fieldMap:
        print "ERROR: one or more required fields [name, video, and/or orth] are missing."
        for key in sorted(fieldMap):
            print key, fieldMap[key]
        sys.exit()

    #create recordings from data info file
    newRecordings = Corpus(corpusname, encoding="utf-8")
    sentenceCnt = 0
    for line in sentenceIDsFile:
        sentenceCnt += 1
        #       splitlist = unicode(line, options.encoding).strip().split(options.delimiter)
        splitlist = line.strip().split(options.delimiter)
        if options.verbose:
            for i in range(0, len(splitlist)):
                print i, splitlist[i]
        if len(splitlist) < len(fieldMap):
            if options.verbose:
                print "ERROR: data row '%s' is invalid and will be discarded." % (
                    splitlist)
        else:
            start = -1
            end = -1
            speakerName = "default"
            speakerGender = "male"
            recordOrth = ""
            recordTranslation = ""

            if 'start' in fieldMap:
                start = splitlist[fieldMap['start']]
            if 'end' in fieldMap:
                end = splitlist[fieldMap['end']]
            if 'speaker' in fieldMap:
                speakerName = splitlist[fieldMap['speaker']]
            if 'gender' in fieldMap:
                speakerGender = splitlist[fieldMap['gender']]
            if 'translation' in fieldMap:
                recordTranslation = splitlist[fieldMap['translation']]

            # update speaker names
            if speakerName not in speakers:
                speakerDescription.addSpeakerDescription(
                    speakerName, speakerGender)
                speakers.append(speakerName)

            # add recording
            newRecordings.addRecording(
                splitlist[fieldMap['name']], splitlist[fieldMap['video']],
                start, end, speakerName,
                splitlist[fieldMap[options.orthfieldname]], False,
                recordTranslation)

    # close corpus and write to xml file
    uclose(sentenceIDsFile)
    newRecordings.save(recordingsFilename)
    speakerDescription.save(speakerDescriptionFilename)

    print "\n----------------------------------------------------------"
    print "corpus file               :'" + corpusFilename + "'"
    print "speaker description file  :'" + speakerDescriptionFilename + "'"
    print "recordings file           :'" + recordingsFilename + "'"
    print
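The layout of the delimited source file is only implied by the field handling above: the first line names the columns, 'name', 'video' and 'orth' are required, the remaining fields are optional, and ';' is the default delimiter. A small invented illustration of how a row maps onto fieldMap:

# Invented header and row, consistent with the field handling in main().
header = 'name;video;orth;speaker;gender'
row = 'rec_0001;clips/clip_0001.mp4;hello world;spk1;female'
fieldMap = dict((field, i) for i, field in enumerate(header.split(';')))
splitlist = row.split(';')
assert splitlist[fieldMap['orth']] == 'hello world'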
Example #13
    def save(self, filename):
        out = uopen(filename, self.encoding, "w")
        # Note: the XML declaration is hard-coded to 'utf-8' here even though the
        # stream is opened with self.encoding; compare Example #14, which passes
        # self.encoding instead.
        self.doc.writexml(out, '', '  ', '\n', 'utf-8')
        uclose(out)
Example #14
    def save(self, filename):
        out = uopen(filename, self.encoding, 'w')
        self.doc.writexml(out, '', '  ', '\n', self.encoding)
        uclose(out)
def continueXml(filename, rootElement, encoding='utf-8'):
    xml = XmlAppendWriter(uopen(filename, encoding, 'a'), rootElement, encoding)
    return xml