def vectorSpaceInit(self, questions, colName):
    """Recreate the ``colName`` collection with one document per question.

    The ``questions`` cursor is rewound, any existing ``colName`` collection
    is dropped, and for every question a document is inserted that carries
    its ``qID``, its text and the codes looked up for its ``coarse`` and
    ``fine`` labels in the class-level ``coarseClassCode`` and
    ``fineClassCode`` tables.
    """
    questions.rewind()
    DBStore.dropColl(colName)
    print('Creating vector space')
    codeTables = self.__class__
    for entry in questions:
        document = {
            'qID': entry['qID'],
            'question': entry['question'],
            'CoarseCode': codeTables.coarseClassCode[entry['coarse']],
            'FineCode': codeTables.fineClassCode[entry['fine']],
        }
        DBStore.getDB()[colName].insert(document)
    print('Done')
def vectorSpaceBuilder(self, questions, colName, featureName, common, insert):
    """Fill in the bag-of-words vector of every question in ``featureName``.

    The vocabulary is read from ``<commonRoot>/words<common>.txt``, one entry
    per line.  For each question a copy of the vocabulary (every entry at -1)
    is made, the entries the question exhibits are switched to 1, and the
    result is ``$set`` on the ``featureName`` documents sharing the
    question's ``qID``.

    ``insert`` is indexed with the keys ``'head'``, ``'whWord'``,
    ``'hypernym'`` and ``'unigram'``; a true value enables that family:

    * head     -- non-None entries of the raw document's ``head`` list
    * whWord   -- the question's ``whWord`` value
    * hypernym -- entries of the raw document's ``hypernym`` list
    * unigram  -- the question's ``lemma`` entries, wh-words excluded

    Features that are not vocabulary entries are ignored.
    """
    vocabularyPath = DBStore.commonRoot + '/words' + common + '.txt'
    # Read the vocabulary inside ``with`` so the handle is released; it used
    # to stay open (and shadowed the builtin ``input``).
    with open(vocabularyPath, 'r') as vocabularyFile:
        words = dict((line.rstrip(), -1) for line in vocabularyFile)

    featureDB = DBStore.getDB()[featureName]
    raw = DBStore.getDB()['raw' + colName]
    # Lemmas that are wh-words are never counted as unigram features.
    whLemmas = frozenset(['who', 'what', 'how', 'where', 'when',
                          'why', 'which', 'whom', 'whose'])

    def switchOn(bag, feature):
        # Only vocabulary entries are features; anything else is ignored.
        if feature in bag:
            bag[feature] = 1

    questions.rewind()
    print(words)
    print("Start building vector")
    start = time.time()
    for count, question in enumerate(questions):
        print(count)
        r = raw.find_one({'qID': question['qID']})
        bagOfWords = words.copy()
        if insert['head']:
            for word in r['head']:
                if word is not None:
                    switchOn(bagOfWords, word)
        if insert['whWord']:
            # A question without a ``whWord`` value contributes nothing here
            # (the previous KeyError handler swallowed that case as well).
            switchOn(bagOfWords, question.get('whWord'))
        if insert['hypernym']:
            for word in r['hypernym']:
                switchOn(bagOfWords, word)
        if insert['unigram']:
            for word in question['lemma']:
                if word.lower() not in whLemmas:
                    switchOn(bagOfWords, word)
        featureDB.update({'qID': question['qID']}, {"$set": bagOfWords},
                         safe=False, multi=True)
    total = time.time() - start
    print("Finish building " + str(total))
def collinsHeadSenseExtractor(self, questions, colName, training):
    """Store the sense of each question's head word as ``headSense``.

    For every question of the (rewound) ``questions`` cursor the first entry
    of its ``head`` list is taken as the head word; a compound of the form
    ``<a>--<b>`` is reduced to ``<a>``.  ``headSense`` becomes the string
    ``"null"`` when there is no head word, when it is the placeholder
    ``'null'``, when ``wordnet.synsets`` returns nothing for it, or when the
    question's ``whWord`` is ``'whWord-how'``.  Otherwise it is the value
    returned by ``AdaptedLesk(6).wsd`` for the tokenized question, the head
    word and the question's tag map.  The value is ``$set`` on the
    ``raw<colName>`` documents with the same ``qID``.

    ``training`` is accepted for interface compatibility and is not used.
    """
    rawQuestions = DBStore.getDB()['raw' + colName]
    adaptedLesk = AdaptedLesk(6)
    compoundHead = re.compile('(?P<head1>.+)--(?P<head2>.+)')
    questions.rewind()
    for count, question in enumerate(questions, 1):
        print(count)
        head = question['head'][0]
        # A missing head word is handled explicitly; the former blanket
        # ``except StandardError: pass`` hid every other failure too.
        if head is not None:
            match = compoundHead.match(head)
            if match:
                head = match.group('head1')

        if head is None or head == 'null' or not wordnet.synsets(head):
            headSense = "null"
        elif question['whWord'] == 'whWord-how':
            headSense = 'null'
        else:
            # The unused part-of-speech lookup that used to sit here was
            # dropped: its result was discarded and it raised KeyError for
            # head words absent from the tag map.
            print('%s %s %s' % (question['tokenized'], head,
                                question['tagged']))
            headSense = adaptedLesk.wsd(question['tokenized'], head,
                                        question['tagged'])
        rawQuestions.update({'qID': question['qID']},
                            {"$set": {"headSense": headSense}},
                            safe=True, multi=True)
def hypernimExtractor(self, colName, max):
    """Attach a ``hypernym`` list to every document of ``raw<colName>``.

    Documents whose ``headSense`` is ``'null'`` get an empty list.  For the
    others the first hypernym path of the sense is taken -- or, when the
    sense has instance hypernyms, the first path of the first instance
    hypernym.  The path is reversed, its first element is skipped and the
    first lemma name of each of the following elements is collected, at most
    ``max`` of them.  The list is ``$set`` on every document with the same
    question text.

    Only a single path is used; an earlier variant that walked the paths in
    nested loops has been dropped from the source.
    """
    rawQuestions = DBStore.getDB()['raw' + colName]
    processed = 0
    for question in rawQuestions.find():
        headSense = question['headSense']
        print(headSense)
        selectedHypernyms = []
        if headSense != 'null':
            sense = wordnet.synset(headSense)
            instances = sense.instance_hypernyms()
            path = sense.hypernym_paths()[0]
            if instances:
                path = instances[0].hypernym_paths()[0]
            # Stop after ``max`` entries or at the end of the path.
            stop = max + 1 if len(path) > max + 1 else len(path)
            ancestors = path[::-1]
            for position in range(1, stop):
                selectedHypernyms.append(ancestors[position].lemma_names[0])
        print(processed)
        processed += 1
        print(question['question'])
        rawQuestions.update({'question': question['question']},
                            {"$set": {"hypernym": selectedHypernyms}},
                            safe=True, multi=True)
def insertFile2Database(self, colName='', training=True, parse=False):
    """Load a labelled question file into the ``raw<colName>`` collection.

    Reads ``<trainingRoot>/train_<colName>.label`` when ``training`` is true,
    ``<testingRoot>/TREC_<colName>.label`` otherwise.  Each line is expected
    to look like ``COARSE:fine question text``.  For every such line the
    question is tokenized, tokens found in ``string.punctuation`` are
    dropped, dots are stripped from the remaining tokens, and the tokens are
    POS-tagged and lemmatized.  One document is inserted per question with
    the keys ``qID`` (1-based), ``question``, ``coarse``, ``fine``
    (``COARSE:fine``), ``lemma``, ``tagged``, ``tokenized`` and ``pair``
    (token -> lemma).  The question text is also appended to the
    ``raw_<colName>.txt`` file under the same root.

    With ``parse`` true, ``parseQuestion`` is run on the collection
    afterwards.

    An empty ``colName`` only prints a message: nothing is dropped and no
    file is touched.  Lines that do not match the expected format are
    reported and skipped without consuming a ``qID``.
    """
    if colName == '':
        # Validate first.  The collection used to be dropped and the raw
        # file truncated before this check was reached.
        print('Filename is needed')
        return

    if training:
        root = DBStore.trainingRoot
        labelFileName = root + '/train_' + colName + '.label'
    else:
        root = DBStore.testingRoot
        labelFileName = root + '/TREC_' + colName + '.label'
    readFileName = root + "\\raw_" + colName + ".txt"

    linePattern = re.compile(
        r"(?P<coarse>\w+):(?P<fine>\w+) (?P<question>.+)")
    lemmatizer = WordNetLemmatizer()

    print('Beginning insertion of ' + 'raw' + colName)
    start = time.time()
    # Open the label file before anything destructive happens, so a missing
    # input does not wipe the existing collection or raw dump.
    with open(labelFileName, 'r') as labelFile:
        DBStore.dropColl('raw' + colName)
        collection = DBStore.getDB()['raw' + colName]
        with open(readFileName, 'w') as readFile:
            qID = 0
            for line in labelFile:
                match = linePattern.match(line)
                if match is None:
                    print('Skipping malformed line: ' + line.rstrip())
                    continue
                print(qID)
                qID += 1
                questionText = match.group('question')
                print(questionText)

                tokenizeWords = [word.replace('.', '')
                                 for word in word_tokenize(questionText)
                                 if word not in string.punctuation]
                taggedWords = dict(pos_tag(tokenizeWords))
                lemmatizedQuestion = [
                    lemmatizer.lemmatize(
                        word, pos=self.replace(taggedWords[word]))
                    for word in tokenizeWords]
                pairQuestion = dict(zip(tokenizeWords, lemmatizedQuestion))

                readFile.write(questionText + '\n')
                collection.insert({
                    "qID": qID,
                    "question": questionText,
                    "coarse": match.group("coarse"),
                    "fine": match.group("coarse") + ":" + match.group("fine"),
                    "lemma": lemmatizedQuestion,
                    "tagged": taggedWords,
                    "tokenized": tokenizeWords,
                    "pair": pairQuestion,
                })

    if parse:
        # parseQuestion takes (colName, training); it used to be handed two
        # file paths here, which it would have treated as a collection name.
        self.parseQuestion(colName, training)
    total = time.time() - start
    print('End of insertion with total time ' + str(total))
def whWordExtractor(self, questions, colName):
    """Tag every question with the wh-word found in its text.

    The ``questions`` cursor is rewound and each question text is matched
    case-insensitively against a fixed list of wh-words followed by a space.
    The lower-cased hit, prefixed with ``whWord-``, is ``$set`` as ``whWord``
    on the ``raw<colName>`` documents with the same ``qID``; questions
    without a hit get ``whWord-rest``.

    NOTE(review): because of the greedy ``.*`` prefix the right-most
    candidate in the text wins, and since there is no word boundary a word
    such as "show " matches as "how" -- confirm this is intended.
    """
    rawQuestions = DBStore.getDB()['raw' + colName]
    whPattern = re.compile(
        '.*(?P<whword>who|what|how|where|when|why|which|whom|whose) .*',
        re.IGNORECASE)
    questions.rewind()
    for question in questions:
        found = whPattern.match(question['question'])
        if found is None:
            whWord = 'whWord-rest'
        else:
            whWord = 'whWord-' + found.group('whword').lower()
        print(question['question'] + whWord)
        rawQuestions.update({'qID': question['qID']},
                            {"$set": {"whWord": whWord}},
                            safe=True, multi=True)
def parseQuestion(self, colName, training):
    """Dump the raw questions to a text file and launch the external parser.

    Every ``question`` of ``raw<colName>`` is written, one per line, to
    ``raw_<colName>.txt`` under ``DBStore.trainingRoot`` (``training`` true)
    or ``DBStore.testingRoot``.  ``execute.bat`` is then started through the
    shell with that file and ``parsed_<colName>.txt`` (same root) as its two
    arguments.  The process is started but not waited for.
    """
    raw = DBStore.getDB()['raw' + colName].find()
    root = DBStore.trainingRoot if training else DBStore.testingRoot
    final = root + '/parsed_' + colName + '.txt'
    outputFile = root + '/raw_' + colName + '.txt'

    # ``with`` guarantees the dump is flushed and closed before the parser
    # starts; the former bare ``f.close`` (no parentheses) never ran.
    with open(outputFile, 'w') as f:
        for question in raw:
            f.write(question['question'] + '\n')

    print('Parsing')
    # Pass the arguments as a list instead of shlex-splitting a command
    # string: POSIX-mode shlex drops backslashes and splits paths on spaces.
    Popen(['execute.bat', outputFile, final], bufsize=-1, shell=True)
def parseResult(missClass, colName, rawName, classType):
    """Write per-class and per-wh-word evaluation reports as CSV files.

    ``missClass`` is a sequence whose items are indexed as
    ``(position, actual code, predicted code)``, where ``position`` is the
    0-based position of the question in the ``colName`` cursor.  Every
    question not listed there counts as correctly classified with the code
    stored under ``FineCode`` (``classType == 'Fine'``) or ``CoarseCode``.
    If a listed actual code disagrees with the stored one, ``Error`` is
    printed and counting stops; the reports are still written from what was
    counted so far.

    Two files are written under ``DBStore.commonRoot``:

    * ``result1_<colName>.csv`` -- class, number of questions, precision,
      sensitivity and specificity for each class
    * ``result2_<colName>.csv`` -- accuracy in percent, number of questions
      and number of correct predictions per wh-word, for wh-words that
      occur at least once, in a fixed order

    The wh-word of a question is read from the ``rawName`` document with the
    same ``qID``.
    """
    questions = DBStore.getDB()[colName].find()
    raw = DBStore.getDB()[rawName]
    # cursor position -> [actual code, predicted code]
    mc = dict((c[0], [c[1], c[2]]) for c in missClass)

    whLabels = ('whWord-who', 'whWord-what', 'whWord-how', 'whWord-where',
                'whWord-rest', 'whWord-when', 'whWord-why', 'whWord-which',
                'whWord-whom', 'whWord-whose')
    whTotal = dict.fromkeys(whLabels, 0)
    whCorrect = dict.fromkeys(whLabels, 0)

    if classType == 'Fine':
        label = 'FineCode'
        index = ["ABBR:abb", "ABBR:exp",
                 "DESC:def", "DESC:desc", "DESC:manner", "DESC:reason",
                 "ENTY:animal", "ENTY:body", "ENTY:color", "ENTY:cremat",
                 "ENTY:currency", "ENTY:dismed", "ENTY:event", "ENTY:food",
                 "ENTY:instru", "ENTY:lang", "ENTY:letter", "ENTY:other",
                 "ENTY:plant", "ENTY:product", "ENTY:religion", "ENTY:sport",
                 "ENTY:substance", "ENTY:symbol", "ENTY:techmeth", "ENTY:veh",
                 "ENTY:word", "ENTY:termeq",
                 "HUM:ind", "HUM:title", "HUM:desc", "HUM:gr",
                 "LOC:country", "LOC:mount", "LOC:other", "LOC:state",
                 "LOC:city",
                 "NUM:code", "NUM:count", "NUM:date", "NUM:dist", "NUM:money",
                 "NUM:ord", "NUM:other", "NUM:period", "NUM:perc",
                 "NUM:speed", "NUM:temp", "NUM:volsize", "NUM:weight"]
    else:
        label = 'CoarseCode'
        index = ["ABBR", "DESC", "ENTY", "HUM", "LOC", "NUM"]

    size = len(index)
    # confusionMat[actual][predicted] counts questions; kept as floats so
    # the ratios below use true division.
    confusionMat = [[0.0] * size for _ in range(size)]

    for position, question in enumerate(questions):
        whWord = raw.find({'qID': question['qID']})[0]['whWord']
        whTotal[whWord] += 1
        if position not in mc:
            whCorrect[whWord] += 1
            actual = predicted = question[label]
        else:
            actual, predicted = mc[position]
            if question[label] != actual:
                print('Error')
                break
        confusionMat[actual][predicted] += 1
    print(confusionMat)

    def ratio(part, whole):
        # An empty denominator scores 0.0 instead of dividing by zero.
        if whole == 0.0:
            return 0.0
        return part / whole

    grandTotal = sum(sum(row) for row in confusionMat)
    resultMat = []
    for cls in range(size):
        truePos = confusionMat[cls][cls]
        falseNeg = sum(confusionMat[cls]) - truePos
        falsePos = sum(row[cls] for row in confusionMat) - truePos
        trueNeg = grandTotal - truePos - falseNeg - falsePos
        print('%s %s %s %s %s' % (cls, truePos, falseNeg, falsePos, trueNeg))
        resultMat.append([cls,
                          truePos + falseNeg,
                          ratio(truePos, truePos + falsePos),    # precision
                          ratio(truePos, truePos + falseNeg),    # sensitivity
                          ratio(trueNeg, trueNeg + falsePos)])   # specificity

    file1 = DBStore.commonRoot + "\\result1_" + colName + ".csv"
    file2 = DBStore.commonRoot + "\\result2_" + colName + ".csv"

    # Both reports are written inside ``with``; the former bare ``f.close``
    # statements (no parentheses) never closed the files.
    with open(file1, 'w') as f:
        f.write('Class,#,Precision,Sensitivity,Specificity\n')
        for result in resultMat:
            f.write(index[result[0]] + ',' + str(result[1]) + ','
                    + str(result[2]) + ',' + str(result[3]) + ','
                    + str(result[4]) + '\n')

    with open(file2, 'w') as f:
        f.write('Wh-Word,Accuracy,Total Question,Correctly Predicted\n')
        for whWord in whLabels:
            asked = whTotal[whWord]
            if asked == 0:
                continue
            # 100.0 forces float arithmetic; the integer division used
            # before truncated every accuracy below 100% down to 0.
            percent = 100.0 * whCorrect[whWord] / asked
            f.write(whWord + ',' + str(percent) + ',' + str(asked) + ','
                    + str(whCorrect[whWord]) + '\n')
def collinsHeadExtractor(self, colName, training):
    """Store the head word / question pattern pair of every question.

    Reads ``headword_question<colName>.txt`` from ``DBStore.trainingRoot``
    (``training`` true) or ``DBStore.testingRoot``.  Each line is expected to
    look like ``<head>::<question>``; dots are removed from the head.  The
    question is looked up by its text in ``raw<colName>`` and, depending on
    its ``whWord``, matched against the "what" or the "who" rule list below
    (first matching rule wins, other wh-words get no pattern).

    The stored ``head`` value is ``[head, None]`` when no rule matched and
    the head is not the placeholder ``'null'``, and ``[None, pattern]``
    otherwise.  An upper-case head turns a ``DESC:def_1`` / ``DESC:def_2``
    pattern into ``ABBR:exp``.

    Lines that do not have the expected format and questions that are not in
    the collection are reported and skipped.
    """
    # Rule lists are compiled once per call; order matters.
    whatRules = [
        (re.compile("^[w|W]hat (is|are|was|were)( [A-Za-z]*)*( composed of| made of| made out of)( [A-Za-z]*)* \?$"), "ENTY:subs"),
        (re.compile("^[w|W]hat (is|are|was|were)( [A-Za-z]*)* used for \?$"), "DESC:reason_2"),
        (re.compile("^[w|W]hat (is|are|was|were|\'s)( a| an| the)*( \`\`)*( [A-Za-z]+[a-z\-]*[A-Za-z]+){1,2}( \'\')* \?$"), "DESC:def_1"),
        (re.compile("^[w|W]hat (do|does|did)( [A-Za-z]*)*( \`\`)*( [A-Za-z]*)*( \'\')*( means?)( [A-Za-z]*)* \?$"), "DESC:def_2"),
        (re.compile("^[w|W]hat (do|does|did) .* (do|does|did) \?$"), "DESC:desc"),
        (re.compile("^[w|W]hat do you (call|called) .*"), "ENTY:termeq"),
        (re.compile("^[w|W]hat (is|\'s) the term .*"), "ENTY:termeq"),
        (re.compile("^[w|W]hat (was|is|\'s) another name .*"), "ENTY:termeq"),
        (re.compile("^[w|W]hat (causes|cause|caused) .*"), "DESC:reason_1"),
        (re.compile("^[w|W]hat (do|does|did).* stand for \?$"), "ABBR:exp"),
        (re.compile('[w|W]hat (do|did|does) .* eat \?'), "ENTY:food"),
    ]
    # NOTE(review): "Hum:ind" is capitalised differently from "HUM:desc";
    # left untouched because the label is stored as a feature value --
    # confirm against the vocabulary files before normalising it.
    whoRules = [
        (re.compile("^[w|W]ho (is|are|was|were)( the)*( \`\`)*( [A-Z][a-z]+)+( \'\')* \?$"), "HUM:desc"),
        (re.compile("^[w|W]ho (is|was) .*"), "Hum:ind"),
    ]

    def firstLabel(rules, text):
        # Label of the first rule matching ``text``, or None.
        for regex, ruleLabel in rules:
            if regex.match(text):
                return ruleLabel
        return None

    root = DBStore.trainingRoot if training else DBStore.testingRoot
    headFileName = root + "\\headword_question" + colName + ".txt"
    linePattern = re.compile(r"(?P<head>.+)::(?P<question>.+)")
    rawQuestions = DBStore.getDB()['raw' + colName]

    # ``with`` closes the head-word file; it used to be left open.
    with open(headFileName, 'r') as headFile:
        for count, line in enumerate(headFile):
            print(count)
            match = linePattern.match(line)
            if match is None:
                print('Skipping malformed line: ' + line.rstrip())
                continue
            question = match.group('question')
            print(question)
            head = match.group('head').replace('.', '')

            raw = rawQuestions.find_one({'question': question})
            if raw is None:
                print('No stored question matches: ' + question)
                continue

            whWord = raw['whWord']
            if whWord == 'whWord-what':
                pattern = firstLabel(whatRules, question)
            elif whWord == 'whWord-who':
                pattern = firstLabel(whoRules, question)
            else:
                pattern = None

            if head.isupper() and pattern in ("DESC:def_1", 'DESC:def_2'):
                pattern = "ABBR:exp"

            # The head word is kept only when no pattern applies.
            if head == 'null' or pattern is not None:
                headWord = [None, pattern]
            else:
                headWord = [head, pattern]
            rawQuestions.update({'question': question},
                                {"$set": {"head": headWord}},
                                safe=True, multi=True)
def classification():
    """Search ELM settings and list the misclassified questions.

    Uses the names ``dbName``, ``colName``, ``common`` and ``classType``
    from the enclosing scope.  Three stages are run:

    1. hidden-layer sizes 1000..1300 in steps of 50, seed fixed to 31; the
       size with the highest accuracy is kept as ``maxHidden``
    2. seeds 1..50 with ``maxHidden`` neurons; mean, standard deviation,
       maximum accuracy and the best seed are printed
    3. one run with 1150 hidden neurons, after which every misclassified
       question is printed with its label, head and the three values of its
       ``missClass`` entry

    The file ``<commonRoot>/<dbName>_<colName>.txt`` is created (or emptied)
    as before, although nothing is written to it any more.

    Earlier experiments that were kept as commented-out code (repeated runs
    per hidden-layer size, a fixed 1200-neuron series dumped with pickle and
    a cross-validated seed search) have been removed.
    """
    filePath = DBStore.commonRoot + '/' + dbName + '_' + colName + '.txt'
    # Keep the create/truncate side effect but release the handle at once;
    # it used to stay open for the whole run.
    open(filePath, 'w').close()

    def maxmeanstdv(x):
        """Return (maximum, mean, sample standard deviation) of ``x``."""
        from math import sqrt
        n = len(x)
        if n == 0:
            return 0, 0.0, 0.0
        mean = sum(x) / float(n)
        if n < 2:
            # The sample deviation is undefined for one value; report 0.0
            # instead of dividing by zero.
            return max(x), mean, 0.0
        spread = sum((a - mean) ** 2 for a in x)
        return max(x), mean, sqrt(spread / float(n - 1))

    trainName = 'feature' + common + '_' + classType
    testName = 'feature' + colName + '_' + classType

    print('Running ELM')
    result = []

    # Stage 1: best number of hidden neurons.
    maxHidden = 0
    bestAcc = 0
    for hidden in range(1000, 1301, 50):
        print('Hidden neuron: ' + str(hidden))
        random.seed(31)
        acc, missClass = ELMClassify(numberOfHiddenNeuron=hidden,
                                     train=trainName, test=testName)
        print(acc)
        if acc > bestAcc:
            maxHidden = hidden
            bestAcc = acc
            print('New Maximum: ' + str(acc) + ' with hidden: ' + str(hidden))
    print('%s %s' % (maxHidden, bestAcc))

    # Stage 2: best seed for that size.
    maxSeed = 0
    bestAcc = 0
    for seed in range(1, 51):
        print('Seed: ' + str(seed))
        random.seed(seed)
        acc, missClass = ELMClassify(numberOfHiddenNeuron=maxHidden,
                                     train=trainName, test=testName)
        print(acc)
        result.append(acc)
        if acc > bestAcc:
            maxSeed = seed
            bestAcc = acc
            print('New Maximum: ' + str(acc) + ' with seed: ' + str(seed))
    highest, mean, std = maxmeanstdv(result)
    print('%s %s %s %s' % (mean, std, highest, maxSeed))

    # Stage 3: final run.
    # NOTE(review): this run uses a hard-coded 1150 neurons and does not
    # re-seed with maxSeed -- confirm that ignoring the search results above
    # is intended.
    acc, missClass = ELMClassify(numberOfHiddenNeuron=1150,
                                 train=trainName, test=testName)
    print(acc)

    rawQuestions = DBStore.getDB()['raw' + colName]
    for c in missClass:
        question = rawQuestions.find()[c[0]]
        print(question['question'] + ' ' + question[classType.lower()] + ' '
              + str(question['head']) + ' ' + str(c[0]) + ' ' + str(c[1])
              + ' ' + str(c[2]))