def setVocabularyMeasures(self):
    """Compute document-level vocabulary-richness measures and store them
    as attributes: PLex, S, vocd, mtld, hdd, and the LFP band proportions
    (vocab1k..vocab8k, vocabOther, vocabUnk).

    Reads self.paragraphs; each paragraph supplies (lemma, category)
    tokens via listLemmaCats().
    """
    # Flatten the per-paragraph lemma/category tokens into one list for
    # the whole document.
    lemmacats = list(itertools.chain(*[x.listLemmaCats() for x in self.paragraphs]))
    # P_Lex: the underlying curve fit can fail to converge on short or
    # degenerate input, so fall back to 0.0 rather than crash.
    # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit and
    # programming errors are no longer silently swallowed.
    try:
        popt, pcov = calcPLex.calcPLex(lemmacats, lemmacat2freqrank)
        self.PLex = popt[0]
    except Exception:
        self.PLex = 0.0
    # S measure (fit parameters: popt[0] is the statistic we keep).
    popt, pcov = calcPLex.calcS(lemmacats, lemmacat2freqrank)
    self.S = popt[0]
    # NOTE(review): 50/42 look like the customary minimum-token thresholds
    # for voc-D and HD-D respectively — confirm against calcPLex/calcHDD.
    if len(lemmacats) >= 50:
        self.vocd = calcPLex.getVOCD(lemmacats)
    else:
        self.vocd = 0.0
    self.mtld = calcPLex.getMTLD(lemmacats)
    if len(lemmacats) >= 42:
        self.hdd = calcHDD.calcHDD(lemmacats)
    else:
        self.hdd = 0.0
    #self.maas = calcHDD.calcMaas(lemmacats) # don't think this works right atm
    # Lexical Frequency Profile: per-frequency-band vocabulary proportions,
    # plus the "other" and "unknown" buckets.
    vocabs, self.vocabOther, self.vocabUnk = \
        calcPLex.calcLFP(lemmacats, lemmacat2freqrank,
                         difficultybins=(1000, 2000, 3000, 4000, 8000))
    self.vocab1k, self.vocab2k, self.vocab3k, self.vocab4k, self.vocab8k = vocabs
# Script-level driver: accumulate "<form>_<category>" word tokens from each
# test XML file, then print corpus-wide vocabulary statistics.
# NOTE(review): SOURCE arrived with flattened indentation; the summary/print
# section is reconstructed as sitting AFTER the file loop (it reports totals
# over the accumulated wordforms list) — confirm against the original file.
# NOTE(review): testxmls, wordforms, calcPLex, lemmacat2freqrank and
# getFinalTokenFormsAndTreesAndWeight are defined elsewhere (outside this view).
for testxml in testxmls:
    print "loading xml", testxml
    tok2finalforms, tok2lemmacats, verb2info, trees, (weight, maxweight) = getFinalTokenFormsAndTreesAndWeight(testxml)
    # each tok2lemmacat goes to a list to 2-tuples
    # print tok2lemmacats.values()
    for lemmacats in tok2lemmacats.values():
        # Join surface form and category into a single "form_cat" token.
        wordforms.extend([x[0] + u"_" + x[1] for x in lemmacats])

print wordforms
print "tokens", len(wordforms)
print "types", len(set(wordforms))
# P_Lex curve fit over the whole accumulated token stream; popt/pcov are
# the fitted parameters and their covariance.
popt, pcov = calcPLex.calcPLex( wordforms, lemmacat2freqrank)
print "popt", popt
print "pcov", pcov
print "S"
popt, pcov = calcPLex.calcS( wordforms, lemmacat2freqrank)
print "popt", popt
print "pcov", pcov
# Lexical-diversity measures over the full token stream.
vocd = calcPLex.getVOCD( wordforms )
mtld = calcPLex.getMTLD( wordforms )
print "vocd:", vocd
print "mtld:", mtld
def getVariable(self, variable, params, resources ): if variable == "paragraphs": return len(self.paragraphStarts) elif variable == "sentences": return len(self.sentences) elif variable == "words": return self.getNWords() elif variable == "sentsPerPara": #print "sents per para", params[0] return getDiffStatistic( self.paragraphStarts +[len(self.sentences)], params[0] ) elif variable == "wordsPerSent": return getCountStatistic( [len(x.uniquetokens) for x in self.sentences], params[0] ) elif variable == "lettersPerWord": #print [y for x in self.sentences for y in x.uniquetokens] # we need the if requirement to remove punctuation etc. return getCountStatistic( [len(re.findall(ur'[^\W\d_]', y, flags=re.UNICODE)) for x in self.sentences for y in x.uniquetokens if len(re.findall(ur'[^\W\d_]', y, flags=re.UNICODE)) > 0], params[0] ) elif variable == "syllablesPerWord": #print "syllables per word", [x.listSyllablesPerWord() for x in self.sentences] return getCountStatistic( flatten2LevelList([x.listSyllablesPerWord() for x in self.sentences]), params[0] ) elif variable == "PLex": # as the fit can crash #print "plexing" try: popt, pcov = calcPLex.calcPLex( self.lemmacats, resources["lemmacat2freqrank"], difficultRank=params[0]) #print "plex", popt, pcov return popt[0] except: print "P_Lex fit problem" return 0.0 elif variable == "S": popt, pcov = calcPLex.calcS( self.lemmacats, resources["lemmacat2freqrank"]) return popt[0] elif variable == "altS": #return self.getAltSValues( resources["lemmacat2freqrank"] ) popt, pcov = calcPLex.calcAB( self.lemmacats, resources["lemmacat2freqrank"]) return popt elif variable == "vocd": if len(self.lemmacats) >= 50: #print "vocd", self.lemmacats #print calcPLex.getVOCD(self.lemmacats), calcPLex.calcVOCD(self.lemmacats) return calcPLex.calcVOCD(self.lemmacats) else: return -1.0 elif variable == "mtld": return calcPLex.calcMTLD(self.lemmacats, params[0]) elif variable == "hdd": if len(self.lemmacats) >= 42: return 
calcHDD.calcHDD(self.lemmacats) return 0.0 elif variable == "LFP": #vocabs, vocabOther, vocabUnk =\ #print "lfp:", params #print calcPLex.calcLFP( self.lemmacats, resources["lemmacat2freqrank"], difficultybins = params) return calcPLex.calcLFP( self.lemmacats, resources["lemmacat2freqrank"], difficultybins = params) elif variable == "spellcorr": return 1.0* sum([x.spellingcorrections for x in self.sentences])/self.getNWords() elif variable == "meltdiff": return 1.0* sum([x.meltdiffs for x in self.sentences])/self.getNWords() elif variable == "meanmelt": # gets the geometic mean: return math.pow( product(flatten2LevelList([x.meltconfidences for x in self.sentences])), 1.0/self.getNWords()) elif variable == "parsed": # need a param either 'full', 'corrected' or 'robust' #print "parsed:", params[0], type(params[0]) #print [x.parsed for x in self.sentences] #print [type(x.parsed) for x in self.sentences] return 1.0* len( [x.parsed for x in self.sentences if x.parsed == params[0]])/len(self.sentences) elif variable == "weightPerWord": return 1.0* sum([x.weightperword*len(x.uniquetokens) for x in self.sentences])/self.getNWords() elif variable == "verb": #print params[0] #print self.vanalysis.keys() if params[0] in self.vanalysis: return 1.0*self.vanalysis[params[0]]/self.vgroups else: return 0.0 elif variable == "clause": #print "CLAUSE", params[0], self.vanalysis[params[0]] #print "CLAUSE", self.vanalysis.keys() if params[0] in self.vanalysis: return 1.0*self.vanalysis[params[0]]/len(self.sentences) else: return 0.0 elif variable == "w2vct": return getCohesionVariables.getCohesionVariables(resources["word2vecModel"], self.ddagSentences) elif variable == "treeTypesPerSent": return getCountStatistic( [len(x.trees.keys()) for x in self.sentences], *params ) elif variable == "TreeTypesHDD": #print "hdd", self.trees.values() return calcHDD.calcHDDfromFreq(self.trees.values()) elif variable == "TreeTypesYuleK": #print "yulek", self.trees.values() return 
calcHDD.calcYuleKfromFreq(self.trees.values()) elif variable == "noVerbSentences": return 1.0*len([x.hasnomainverb for x in self.sentences if x.hasnomainverb > 0])/len(self.sentences) elif variable == "toksBeforeMainVerb": return 1.0*sum([x.wordsbeforemainverb for x in self.sentences if x.wordsbeforemainverb >= 0])/ \ len([x.wordsbeforemainverb for x in self.sentences if x.wordsbeforemainverb >= 0]) # currently no sentence boundaries: elif variable == "bigramLogProbs": return nGramModel.analyseTokens(flatten2LevelList(self.ddagSentences), resources["nGramDict"], resources["nmoGramDict"], resources["nGramCounts"])