def setVocabularyMeasures(self):
    """Compute the vocabulary measures for this text and store them on self.

    The items returned by ``listLemmaCats()`` of every paragraph in
    ``self.paragraphs`` are concatenated into one flat list, which is then
    passed to the helpers in ``calcPLex`` and ``calcHDD``.

    Attributes set:
        PLex: first fitted parameter returned by ``calcPLex.calcPLex``,
            or 0.0 when that call raises.
        S: first fitted parameter returned by ``calcPLex.calcS``.
        vocd: ``calcPLex.getVOCD`` result, or 0.0 for fewer than 50 items.
        mtld: ``calcPLex.getMTLD`` result.
        hdd: ``calcHDD.calcHDD`` result, or 0.0 for fewer than 42 items.
        vocab1k, vocab2k, vocab3k, vocab4k, vocab8k: per-bin values
            returned by ``calcPLex.calcLFP`` for the difficulty bins
            (1000, 2000, 3000, 4000, 8000).
        vocabOther, vocabUnk: the two remaining values returned by
            ``calcPLex.calcLFP``.

    Raises:
        Whatever ``calcPLex.calcS`` or the other unguarded helpers raise;
        only the P_Lex fit is protected.
    """
    lemmacats = list(itertools.chain.from_iterable(
        paragraph.listLemmaCats() for paragraph in self.paragraphs))

    # The P_Lex fit can crash; fall back to 0.0 in that case.  Only
    # ordinary errors are caught so that KeyboardInterrupt / SystemExit
    # still propagate.
    try:
        popt, _ = calcPLex.calcPLex(lemmacats, lemmacat2freqrank)
        self.PLex = popt[0]
    except Exception:
        self.PLex = 0.0

    # NOTE(review): calcS is a fit as well but was never guarded; a failure
    # here propagates to the caller.
    popt, _ = calcPLex.calcS(lemmacats, lemmacat2freqrank)
    self.S = popt[0]

    # Presumably VOCD needs a minimum sample size -- TODO confirm the 50.
    if len(lemmacats) >= 50:
        self.vocd = calcPLex.getVOCD(lemmacats)
    else:
        self.vocd = 0.0

    self.mtld = calcPLex.getMTLD(lemmacats)

    # Presumably HD-D needs a minimum sample size -- TODO confirm the 42.
    if len(lemmacats) >= 42:
        self.hdd = calcHDD.calcHDD(lemmacats)
    else:
        self.hdd = 0.0

    # Maas measure is disabled: calcHDD.calcMaas is not believed to work
    # correctly at the moment.
    # self.maas = calcHDD.calcMaas(lemmacats)

    vocabs, self.vocabOther, self.vocabUnk = calcPLex.calcLFP(
        lemmacats, lemmacat2freqrank,
        difficultybins=(1000, 2000, 3000, 4000, 8000))
    # One value per difficulty bin, in the order the bins were given.
    (self.vocab1k, self.vocab2k, self.vocab3k,
     self.vocab4k, self.vocab8k) = vocabs
for testxml in testxmls: print "loading xml", testxml tok2finalforms, tok2lemmacats, verb2info, trees, (weight, maxweight) = getFinalTokenFormsAndTreesAndWeight(testxml) # each tok2lemmacat goes to a list to 2-tuples # print tok2lemmacats.values() for lemmacats in tok2lemmacats.values(): wordforms.extend([x[0] + u"_" + x[1] for x in lemmacats]) print wordforms print "tokens", len(wordforms) print "types", len(set(wordforms)) popt, pcov = calcPLex.calcPLex( wordforms, lemmacat2freqrank) print "popt", popt print "pcov", pcov print "S" popt, pcov = calcPLex.calcS( wordforms, lemmacat2freqrank) print "popt", popt print "pcov", pcov vocd = calcPLex.getVOCD( wordforms ) mtld = calcPLex.getMTLD( wordforms ) print "vocd:", vocd print "mtld:", mtld