Code example #1
def setVocabularyMeasures(self):
    # flatten the per-paragraph lemma_cat lists into one token list for the whole text
    lemmacats = list(itertools.chain(*[x.listLemmaCats() for x in self.paragraphs]))
    try:
        # P_Lex is obtained by curve fitting; the fit can crash, so fall back to 0.0
        popt, pcov = calcPLex.calcPLex(lemmacats, lemmacat2freqrank)
        self.PLex = popt[0]
    except Exception:
        self.PLex = 0.0
    # S: another fitted lexical sophistication parameter
    popt, pcov = calcPLex.calcS(lemmacats, lemmacat2freqrank)
    self.S = popt[0]
    # vocd-D is only computed for texts with at least 50 tokens
    if len(lemmacats) >= 50:
        self.vocd = calcPLex.getVOCD(lemmacats)
    else:
        self.vocd = 0.0
    self.mtld = calcPLex.getMTLD(lemmacats)
    # HD-D needs at least 42 tokens here; shorter texts get 0.0
    if len(lemmacats) >= 42:
        self.hdd = calcHDD.calcHDD(lemmacats)
    else:
        self.hdd = 0.0
    #self.maas = calcHDD.calcMaas(lemmacats)  # don't think this works right at the moment
    # Lexical Frequency Profile: proportion of tokens falling into each frequency band
    vocabs, self.vocabOther, self.vocabUnk = \
        calcPLex.calcLFP(lemmacats, lemmacat2freqrank, difficultybins=(1000, 2000, 3000, 4000, 8000))
    self.vocab1k, self.vocab2k, self.vocab3k, self.vocab4k, self.vocab8k = vocabs
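
This method relies on two names the excerpt does not define: lemmacats, a flat list of "lemma_cat" tokens (code example #2 below builds them as lemma + u"_" + category), and lemmacat2freqrank, which, given the difficultybins of 1000-8000, is presumably a mapping from each lemma_cat string to its frequency rank in a reference word list. A minimal sketch of those assumed shapes, with invented entries:

# Sketch only: invented entries illustrating the assumed input shapes.
lemmacats = [u"the_DET", u"cat_NOUN", u"sit_VERB", u"mat_NOUN"]   # flat list of "lemma_cat" tokens

# Assumed: lemma_cat -> frequency rank in a reference corpus (1 = most frequent).
lemmacat2freqrank = {
    u"the_DET": 1,
    u"cat_NOUN": 780,
    u"sit_VERB": 1450,
    u"mat_NOUN": 5200,
}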
Code example #2
wordforms = []   # "lemma_cat" forms collected over all the test files
for testxml in testxmls:
    print "loading xml", testxml
    tok2finalforms, tok2lemmacats, verb2info, trees, (weight, maxweight) = getFinalTokenFormsAndTreesAndWeight(testxml)
    # each value in tok2lemmacats is a list of (lemma, category) 2-tuples
    # print tok2lemmacats.values()
    for lemmacats in tok2lemmacats.values():
        wordforms.extend([x[0] + u"_" + x[1] for x in lemmacats])

print wordforms
print "tokens", len(wordforms)
print "types", len(set(wordforms))
popt, pcov = calcPLex.calcPLex( wordforms, lemmacat2freqrank)

print "popt", popt
print "pcov", pcov

print "S"
popt, pcov = calcPLex.calcS( wordforms, lemmacat2freqrank)

print "popt", popt
print "pcov", pcov


vocd = calcPLex.getVOCD( wordforms )
mtld = calcPLex.getMTLD( wordforms )

print "vocd:", vocd
print "mtld:", mtld


Code example #3
def getVariable(self, variable, params, resources ):
    if variable == "paragraphs":
        return len(self.paragraphStarts)
    elif variable == "sentences":
        return len(self.sentences)
    elif variable == "words":
        return self.getNWords()
    elif variable == "sentsPerPara":
        #print "sents per para", params[0]
        return getDiffStatistic( self.paragraphStarts +[len(self.sentences)], params[0] )
    elif variable == "wordsPerSent":
        return getCountStatistic( [len(x.uniquetokens) for x in self.sentences], params[0] )
    elif variable == "lettersPerWord":
        #print [y for x in self.sentences for y in x.uniquetokens]
        # the filter drops tokens containing no letters (punctuation, digits etc.)
        return getCountStatistic( [len(re.findall(ur'[^\W\d_]', y, flags=re.UNICODE))
                                   for x in self.sentences for y in x.uniquetokens
                                   if len(re.findall(ur'[^\W\d_]', y, flags=re.UNICODE)) > 0], params[0] )
    elif variable == "syllablesPerWord":
        #print "syllables per word", [x.listSyllablesPerWord() for x in self.sentences]
        return getCountStatistic( flatten2LevelList([x.listSyllablesPerWord() for x in self.sentences]), params[0] )
     elif variable == "PLex":
         # as the fit can crash
         #print "plexing"
         try:
             popt, pcov = calcPLex.calcPLex( self.lemmacats, resources["lemmacat2freqrank"], difficultRank=params[0])
             #print "plex", popt, pcov
             return popt[0]
         except:
             print "P_Lex fit problem"
             return 0.0
     elif variable == "S":
         popt, pcov = calcPLex.calcS( self.lemmacats, resources["lemmacat2freqrank"])
         return popt[0]
     elif variable == "altS":
         #return self.getAltSValues( resources["lemmacat2freqrank"] )
         popt, pcov = calcPLex.calcAB( self.lemmacats, resources["lemmacat2freqrank"])
         return popt
     elif variable == "vocd":
         if len(self.lemmacats) >= 50:
             #print "vocd", self.lemmacats
             #print  calcPLex.getVOCD(self.lemmacats), calcPLex.calcVOCD(self.lemmacats)
             return calcPLex.calcVOCD(self.lemmacats)
         else:
             return -1.0
     elif variable == "mtld":
         return calcPLex.calcMTLD(self.lemmacats, params[0])
     elif variable == "hdd":
         if len(self.lemmacats) >= 42:
             return calcHDD.calcHDD(self.lemmacats)
         return 0.0
     elif variable == "LFP":
         #vocabs, vocabOther, vocabUnk =\
         #print "lfp:", params
         #print calcPLex.calcLFP( self.lemmacats, resources["lemmacat2freqrank"], difficultybins = params)
         return calcPLex.calcLFP( self.lemmacats, resources["lemmacat2freqrank"], difficultybins = params)
     elif variable == "spellcorr":
         return 1.0* sum([x.spellingcorrections for x in self.sentences])/self.getNWords()
     elif variable == "meltdiff":
         return 1.0* sum([x.meltdiffs for x in self.sentences])/self.getNWords()
     elif variable == "meanmelt": # gets the geometic mean:
         return math.pow( product(flatten2LevelList([x.meltconfidences for x in self.sentences])), 1.0/self.getNWords())
     elif variable == "parsed":
         # need a param either 'full', 'corrected' or 'robust'
         #print "parsed:", params[0], type(params[0])
         #print [x.parsed for x in self.sentences]
         #print [type(x.parsed) for x in self.sentences]
         return 1.0* len( [x.parsed for x in self.sentences if x.parsed == params[0]])/len(self.sentences)
     elif variable == "weightPerWord":
         return 1.0* sum([x.weightperword*len(x.uniquetokens) for x in self.sentences])/self.getNWords()
     elif variable == "verb":
         #print params[0]
         #print self.vanalysis.keys()
         if params[0] in self.vanalysis:
             return 1.0*self.vanalysis[params[0]]/self.vgroups
         else:
             return 0.0
     elif variable == "clause":
         #print "CLAUSE", params[0], self.vanalysis[params[0]]
         #print "CLAUSE", self.vanalysis.keys()
         if params[0] in self.vanalysis:
             return 1.0*self.vanalysis[params[0]]/len(self.sentences)
         else:
             return 0.0
     elif variable == "w2vct":
         return getCohesionVariables.getCohesionVariables(resources["word2vecModel"], self.ddagSentences)
     elif variable == "treeTypesPerSent":
         return getCountStatistic( [len(x.trees.keys()) for x in self.sentences], *params )
     elif variable == "TreeTypesHDD":
         #print "hdd", self.trees.values()
         return calcHDD.calcHDDfromFreq(self.trees.values())
     elif variable == "TreeTypesYuleK":
         #print "yulek", self.trees.values()
         return calcHDD.calcYuleKfromFreq(self.trees.values())
     elif variable == "noVerbSentences":
         return 1.0*len([x.hasnomainverb for x in self.sentences if x.hasnomainverb > 0])/len(self.sentences)
     elif variable == "toksBeforeMainVerb":
         return 1.0*sum([x.wordsbeforemainverb for x in self.sentences if x.wordsbeforemainverb >= 0])/ \
                len([x.wordsbeforemainverb for x in self.sentences if x.wordsbeforemainverb >= 0])
     # currently no sentence boundaries:
     elif variable == "bigramLogProbs":
         return nGramModel.analyseTokens(flatten2LevelList(self.ddagSentences),
                                         resources["nGramDict"], resources["nmoGramDict"], resources["nGramCounts"])