def getDW(self, listOfDicts):
    """Feature: difficult-word count of each row's answerBody.

    Computes the readability library's difficult-word count for every
    answer body and maps the resulting list through self.assignValue.
    """
    stats = readability.TextStatistics()
    difficult_counts = [stats.difficult_words(entry['answerBody'])
                        for entry in listOfDicts]
    return self.assignValue(difficult_counts)
def getRead( self, listOfDicts):
    """Feature: readability index (text standard) of each row's answerBody.

    Computes the readability library's text_standard score for every
    answer body and maps the resulting list through self.assignValue.
    """
    stats = readability.TextStatistics()
    readability_scores = [stats.text_standard(entry['answerBody'])
                          for entry in listOfDicts]
    return self.assignValue(readability_scores)
import numpy as np
from sklearn.svm import SVC

import dataset      # FIX: dataset.createDataset()/dataset.data are used below but the module was never imported
import information
import readability  # FIX: readability.TextStatistics is referenced but the module was never imported
import textProp

features_train = []
labels_train = []

# Build the dataset and sort rows by question id so that answers to the
# same question are processed together.
dataset.createDataset()
data = dataset.data
data = sorted(data, key=lambda k: k['qId'])

count = 0
for row in data:
    featureVector = []
    # NOTE(review): fresh helper objects per row preserved in case they are
    # stateful; the unused per-row readability.TextStatistics() was removed.
    tp = textProp.TextProperties()
    inf = information.Informativity()

    # Character-level order-3 Markov model over the answer text; the summed
    # per-prefix entropy serves as an "informativity" feature.
    model, stats = inf.markov_model(inf.chars(row['answerBody']), 3)
    totalEntropy = sum(inf.entropy(stats, stats[prefix]) for prefix in stats)
    featureVector.append(abs(totalEntropy))  # information

    featureVector.append(
        tp.relevancy(row['answerBody'], row['qBody'], row['qTags']))  # relevancy
    featureVector.append(tp.UniqueWords(row['answerBody']))   # unique words
    featureVector.append(tp.NonstopWords(row['answerBody']))  # non-stop words
    featureVector.append(tp.subjective(row['answerBody']))    # subjectivity
    featureVector.append(row['answerScore'] * 100)            # answer score (scaled)
    featureVector.append(row['answererDownvotes'])            # downvotes