Example #1
 def vectorSpaceInit(self,questions,colName):
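     # Rebuild the vector-space collection from scratch: one document per
     # question, carrying its qID, text, and numeric coarse/fine class codes.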
     questions.rewind()
     DBStore.dropColl(colName)
     print 'Creating vector space'
     for question in questions:
         DBStore.getDB()[colName].insert({'qID':question['qID'],
                                          'question':question['question'],
                                          'CoarseCode':self.__class__.coarseClassCode[question['coarse']],
                                          'FineCode':self.__class__.fineClassCode[question['fine']]})
     print 'Done'
Example #2
    def insertFile2Database(self,colName='',training=True,parse=False):
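        # Read a TREC-style label file ("COARSE:fine question ?" per line),
        # tokenize, POS-tag and lemmatize each question, and insert the
        # results into the 'raw<colName>' MongoDB collection.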
#        posTagger = POSTagger()
#        posTagger.loadTagger()
#        tagger = posTagger.getTagger()
        lemmatizer = WordNetLemmatizer()
        DBStore.dropColl('raw'+colName)
        collection = DBStore.getDB()['raw'+colName]
        if training:
            readFileName = DBStore.trainingRoot + "\\raw_" + colName + ".txt"
        else:
            readFileName = DBStore.testingRoot + "\\raw_" + colName + ".txt"
        outputFileName = DBStore.commonRoot + "\\parsed_" + colName + ".txt"
        if colName=='':
            print 'Filename is needed'
        else:
            readFile = open(readFileName,'w')
            print 'Beginning insertion of raw' + colName
            start = time.time()
            if training:
                file = DBStore.trainingRoot+'/train_'+colName+'.label'
            else:
                file = DBStore.testingRoot+'/TREC_'+colName+'.label'
            input = open(file,'r')
            pattern = re.compile(r"(?P<coarse>\w+):(?P<fine>\w+) (?P<question>.+)");
            i = 0
            for line in input:
                print i
                i = i + 1
                match = pattern.match(line)
                tokenizeWords = word_tokenize(match.group('question'))
                print match.group('question')
                p = re.compile('\.')  # pattern for eliminating '.'
                p2 = re.compile('(1|2)\d\d\d')  # pattern for grouping years (defined but unused)
                tokenizeWords = [p.sub('',word) for word in tokenizeWords if word not in string.punctuation]
                taggedWords = dict(pos_tag(tokenizeWords))
                lemmatizedQuestion = [lemmatizer.lemmatize(word,pos=self.replace(taggedWords[word])) for word in tokenizeWords]
#                print taggedWords
#                print match.group('question')
                pairQuestion = dict(zip(tokenizeWords,lemmatizedQuestion))
                readFile.write(match.group('question')+'\n')
                collection.insert({"qID":i,
                                   "question": match.group('question'),
                                   "coarse":match.group("coarse"),
                                   "fine":match.group("coarse")+":"+match.group("fine"),
                                   "lemma":lemmatizedQuestion,
                                   "tagged":taggedWords,
                                   "tokenized":tokenizeWords,
                                   "pair":pairQuestion
                                        })
            readFile.close()
            if parse:
                self.parseQuestion(readFileName, outputFileName)
            total = time.time()-start
            print 'End of insertion with total time '+ str(total)      
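
A minimal standalone check of the label-line regex above; the sample line is hypothetical input in the "COARSE:fine question" format the code expects:

    import re

    pattern = re.compile(r"(?P<coarse>\w+):(?P<fine>\w+) (?P<question>.+)")
    m = pattern.match("DESC:manner How did serfdom develop in and then leave Russia ?")
    print m.group('coarse')    # DESC
    print m.group('fine')      # manner
    print m.group('question')  # How did serfdom develop in and then leave Russia ?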
Example #3
 def vectorSpaceBuilder(self,questions,colName,featureName, common,insert):
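     # Turn each question into a binary bag-of-words vector over the common
     # word list, switching feature groups (head, wh-word, hypernym, unigram)
     # on or off according to the 'insert' flags.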
     inputFile = DBStore.commonRoot+ '/words' +common+'.txt'
     input = open(inputFile,'r') 
     featureDB = DBStore.getDB()[featureName]
     raw = DBStore.getDB()['raw'+colName]
     words = dict([(word.rstrip(),-1) for word in input])
     i =0
     questions.rewind()
     print words
     print "Start building vector"
     start = time.time()
     for question in questions:
         print i
         r = raw.find_one({'qID':question['qID']})
         i=i+1
         bagOfWords = words.copy()
         if insert['head']:
             for word in r['head']:
                 if word is not None and word in bagOfWords:
                     bagOfWords[word] = 1
         if insert['whWord']:
             if question['whWord'] in bagOfWords:
                 bagOfWords[question['whWord']] = 1
         if insert['hypernym']:
             for word in r['hypernym']:
                 if word in bagOfWords:
                     bagOfWords[word] = 1
         if insert['unigram']:
             for word in question['lemma']:
                 if word.lower() not in ['who','what','how','where',
                                         'when','why','which',
                                         'whom','whose'] and word in bagOfWords:
                     bagOfWords[word] = 1
         featureDB.update({'qID':question['qID']},{"$set":bagOfWords},safe=False,multi=True)
     total = time.time() - start
     print "Finish building " + str(total)
Example #4
 def extractData(self,name,training,common,svmFormat=False,classType='Coarse'):
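     # Dump the stored feature vectors to a text file, one question per line,
     # either as plain space-separated values or in libsvm 'index:value' format.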
     helperFile =  DBStore.commonRoot + '/words'+common+'.txt'
     if training:
         if not svmFormat:
             outputFile = DBStore.trainingRoot + '/vector_'+name+'.txt'
         else:
             outputFile = DBStore.trainingRoot + '/vector_SVM_'+name+'.txt'
     else:
         if not svmFormat:
             outputFile = DBStore.testingRoot + '/vector_'+name+'.txt'
         else:
             outputFile = DBStore.testingRoot + '/vector_SVM_'+name+'.txt'
             
     output = open(outputFile,'w')
     helper = open(helperFile,'r')
     helperWord = [word for word in helper]
     query = DBStore.queryDB(name)
     i = 0
     for question in query:
         print i
         i=i+1
         output.write(str(question[classType+'Code']) +" ")
         j = 1
         if svmFormat:
             for word in helperWord:
                 output.write(str(j)+":" +str(question[word.rstrip()]) + " ") 
                 j=j+1
         else:
             for word in helperWord:
                 output.write(str(question[word.rstrip()]) + " ") 
          helper.seek(0)  # redundant: helperWord was already read into a list above
         output.write("\n")
      output.close()
      helper.close()
Example #5
    def collinsHeadSenseExtractor(self,questions, colName,training):
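        # Disambiguate each question's head word with the adapted Lesk
        # algorithm and store the chosen WordNet sense in the raw collection.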
        rawQuestions = DBStore.getDB()['raw'+colName]
        adaptedLesk = AdaptedLesk(6)
        i = 1
        questions.rewind()
        p = re.compile('(?P<head1>.+)--(?P<head2>.+)')
        for question in questions:
#        line = "was:What was archy , and mehitabel ?"
            print i
            i = i + 1
            headWord = question['head']
            try:
                match = p.match(headWord[0])
                if match:
                    headWord[0] = match.group('head1')
            except StandardError:
                pass
            if headWord[0] is None \
                or len(wordnet.synsets(headWord[0]))==0 \
                or headWord[0] == 'null':
                headSense = "null"
            else:
                pos = DataRetrieval.replace(question['tagged'][headWord[0]])
                if question['whWord'] ==  'whWord-how':
                    headSense = 'null'
                else:
                    print question['tokenized'],headWord[0],question['tagged']
                    headSense = adaptedLesk.wsd(question['tokenized'],headWord[0],question['tagged'])
            rawQuestions.update({'qID':question['qID']},{"$set":{"headSense":headSense}},safe=True,multi=True)
Example #6
    def hypernimExtractor(self,colName,max):
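        # For each question's disambiguated head sense, walk its WordNet
        # hypernym path and keep at most 'max' hypernyms (note: the parameter
        # name 'max' shadows the builtin).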
        rawQuestions = DBStore.getDB()['raw'+colName]
        l = 0 
        for question in rawQuestions.find():
            print question['headSense']
            if question['headSense']=='null':
                selectedHypernyms = []
            else:
#                sense = wordnet.synset(question['headSense'])
#                instances = sense.instance_hypernyms()
#                hypernyms = sense.hypernym_paths()
#                if len(instances) != 0:
#                    j = 0
#                    for instance in instances:
#                        j = j + 1
#                        if j>1: break;
#                        insPath = instance.hypernym_paths()
#                        k = 0
#                        for ins in insPath:
#                            k = k + 1
#                            if k>1: break;
#                            if len(ins)>max+1:
#                                index = max+1
#                            else:
#                                index = len(ins)
#                            ins.reverse()
#                            selected = [ins[k].lemma_names[0] for k in range(1,index)]
#                        selectedHypernyms.extend(selected)
#                else:
#                    j = 0
#                    for hypernym in hypernyms:
#                        j = j + 1
#                        if j>1: break
#                        if len(hypernym)>max+1:
#                            index = max+1
#                        else:
#                            index = len(hypernym)
#                        hypernym.reverse()
#                        selected = [hypernym[k].lemma_names[0] for k in range(1,index)]
#                    selectedHypernyms.extend(selected)
                sense = wordnet.synset(question['headSense'])
                instance = sense.instance_hypernyms()
                hypernym = sense.hypernym_paths()[0]
                if len(instance) != 0:
                    hypernym = instance[0].hypernym_paths()[0]
                if len(hypernym)>max+1:
                    index = max+1
                else:
                    index = len(hypernym)
                hypernym.reverse()
                selectedHypernyms = [hypernym[k].lemma_names[0] for k in range(1,index)]
            print l
            l = l + 1
            print question['question']
            rawQuestions.update({'question':question['question']},{"$set":{"hypernym":selectedHypernyms}},safe=True,multi=True)
Example #7
 def whWordExtractor(self,questions,colName):
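     # Tag each question with its wh-word feature ('whWord-what', ...), or
     # 'whWord-rest' when no wh-word is found.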
     rawQuestions = DBStore.getDB()['raw'+colName]
     p = re.compile('.*(?P<whword>who|what|how|where|when|why|which|whom|whose) .*', re.IGNORECASE)
     questions.rewind()
     for question in questions:
         match = p.match(question['question'])
         try:
             whWord = 'whWord-'+match.group('whword').lower()
         except AttributeError:
             whWord = 'whWord-rest'
         print question['question'] + whWord
         rawQuestions.update({'qID':question['qID']},{"$set":{"whWord":whWord}},safe=True,multi=True)
Example #8
    def parseQuestion(self,colName,training):
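        # Write all raw questions to a text file and hand it to an external
        # parser via execute.bat, launched asynchronously with Popen.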
        raw = DBStore.getDB()['raw'+colName].find()
        if training:
            final = DBStore.trainingRoot + '/parsed_'+colName+'.txt'
            outputFile = DBStore.trainingRoot +'/raw_'+colName+'.txt'
        else:
            final = DBStore.testingRoot + '/parsed_'+colName+'.txt'
            outputFile = DBStore.testingRoot +'/raw_'+colName+'.txt'
        f = open(outputFile,'w')
        for question in raw:
            f.write(question['question']+'\n')
        f.close()
        startDir = os.getcwd()
        print 'Parsing'
#        command = "java  -Xms512m -Xmx512m -jar berkeleyParser.jar -gr eng_sm6.gr -inputFile "+outputFile+" -outputFile "+final
        command = "execute.bat " + outputFile + " " + final
#        os.system(command)
        args = shlex.split(command)
#        call(args)
        p = Popen(args,bufsize=-1,shell=True)
Example #9
def parseResult(missClass,colName,rawName,classType):
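    # Build a confusion matrix from the misclassification list, then write two
    # CSV reports: per-class precision/sensitivity/specificity and per-wh-word
    # accuracy.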
    questions = DBStore.getDB()[colName].find()
    raw = DBStore.getDB()[rawName]
    mc = {}
    for c in missClass:
        mc[c[0]] = [c[1],c[2]]
    i = 0
    whTotal = {'whWord-who':0,'whWord-what':0,'whWord-how':0,'whWord-where':0,'whWord-rest':0,
               'whWord-when':0,'whWord-why':0,'whWord-which':0,'whWord-whom':0,'whWord-whose':0}
    whCorrect = {'whWord-who':0,'whWord-what':0,'whWord-how':0,'whWord-where':0,'whWord-rest':0,
               'whWord-when':0,'whWord-why':0,'whWord-which':0,'whWord-whom':0,'whWord-whose':0}
    if classType =='Fine':
        label = 'FineCode'
        index = ["ABBR:abb", "ABBR:exp",
             "DESC:def", "DESC:desc",
             "DESC:manner","DESC:reason",
             "ENTY:animal","ENTY:body", 
             "ENTY:color", "ENTY:cremat",
             "ENTY:currency", "ENTY:dismed",
             "ENTY:event", "ENTY:food",
             "ENTY:instru","ENTY:lang",
             "ENTY:letter", "ENTY:other",
             "ENTY:plant","ENTY:product",
             "ENTY:religion", "ENTY:sport",
             "ENTY:substance", "ENTY:symbol", 
             "ENTY:techmeth", "ENTY:veh",
             "ENTY:word", "ENTY:termeq",
             "HUM:ind", "HUM:title",
             "HUM:desc","HUM:gr",
             "LOC:country", "LOC:mount",
             "LOC:other", "LOC:state",
             "LOC:city", 
             "NUM:code", "NUM:count", 
             "NUM:date", "NUM:dist",
             "NUM:money", "NUM:ord",
             "NUM:other", "NUM:period",
             "NUM:perc", "NUM:speed",
             "NUM:temp", "NUM:volsize",
             "NUM:weight"]
    else:
        label = 'CoarseCode'
        index = ["ABBR","DESC", "ENTY", "HUM", "LOC",
                           "NUM"]
    confusionMat = [[0.0 for col in range(len(index))]for row in range(len(index))]
    resultMat = [[0.0 for col in range(3)]for row in range(len(index))]
    i = 0
    for question in questions:
        whWord = raw.find({'qID':question['qID']})[0]['whWord']
        whTotal[whWord] = whTotal[whWord] + 1 
        if i not in mc:
            whCorrect[whWord] = whCorrect[whWord] + 1
            predicted = actual = question[label]
            confusionMat[actual][predicted] = confusionMat[actual][predicted] + 1
        else:
            if question[label]!=mc[i][0]:
                print 'Error'
                break
            else:
                predicted = mc[i][1]
                actual = mc[i][0]
                confusionMat[actual][predicted] = confusionMat[actual][predicted] + 1
        i = i + 1
    print confusionMat
    for i in range(len(index)):
        truePos = confusionMat[i][i]
        falsePos = 0.0
        for j in range(len(confusionMat)):
            if j == i: continue
            falsePos = falsePos + confusionMat[j][i] 
        trueNeg = 0.0
        for j in range(len(confusionMat)):
            for k in range(len(confusionMat)):
                if j == i or k == i: continue
                trueNeg = trueNeg + confusionMat[j][k]
        falseNeg = 0.0
        for j in range(len(confusionMat)):
            if j == i: continue
            falseNeg = falseNeg + confusionMat[i][j]
        print i,truePos,falseNeg,falsePos,trueNeg
        
        if truePos+falsePos == 0.0:
            precision = 0.0
        else:    
            precision = truePos/(truePos+falsePos)
        if trueNeg+falsePos == 0.0:
            specificity = 0.0
        else:
            specificity = trueNeg/(trueNeg+falsePos)
        if truePos+falseNeg == 0.0:
            sensitivity = 0.0
        else:
            sensitivity = truePos/(truePos+falseNeg)
        resultMat[i] = [i,(truePos+falseNeg),precision,sensitivity,specificity]
    file1 = DBStore.commonRoot+"\\result1_"+colName+".csv"
    file2 = DBStore.commonRoot+"\\result2_"+colName+".csv"
    f = open(file1,'w')
    f.write('Class,#,Precision,Sensitivity,Specificity\n')
    for result in resultMat:
        f.write(index[result[0]]+','+str(result[1])+','+str(result[2]) + ',' + str(result[3]) + ',' + str(result[4])+'\n')
    f.close()
    f = open(file2,'w')
    f.write('Wh-Word,Accuracy,Total Question,Correctly Predicted\n')
    for k,v in whTotal.iteritems():
        if v == 0: continue
        percent = whCorrect[k] / float(v) * 100
        f.write(k+','+str(percent)+','+str(v)+','+str(whCorrect[k])+'\n')
    f.close()
Example #10
    def collinsHeadExtractor(self,colName,training):
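        # Read Collins head words from a file and store them per question,
        # overriding the head with a regex-derived pattern label for common
        # 'what'/'who' constructions.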
        def whatPattern(question):
            p3 = re.compile("^[wW]hat (is|are|was|were)( [A-Za-z]*)*( composed of| made of| made out of)( [A-Za-z]*)* \?$")
            if p3.match(question):
                return "ENTY:subs"
            p7 = re.compile("^[wW]hat (is|are|was|were)( [A-Za-z]*)* used for \?$")
            if p7.match(question):
                return "DESC:reason_2"
            p1 = re.compile("^[wW]hat (is|are|was|were|\'s)( a| an| the)*( \`\`)*( [A-Za-z]+[a-z\-]*[A-Za-z]+){1,2}( \'\')* \?$")
            if p1.match(question):
                return "DESC:def_1"
            p2 = re.compile("^[wW]hat (do|does|did)( [A-Za-z]*)*( \`\`)*( [A-Za-z]*)*( \'\')*( means?)( [A-Za-z]*)* \?$")
            if p2.match(question):
                return "DESC:def_2"
            p4 = re.compile("^[wW]hat (do|does|did) .* (do|does|did) \?$")
            if p4.match(question):
                return "DESC:desc"
            p5 = re.compile("^[wW]hat do you (call|called) .*")
            p9 = re.compile("^[wW]hat (is|\'s) the term .*")
            p10 = re.compile("^[wW]hat (was|is|\'s) another name .*")
            if p5.match(question) or p9.match(question) or p10.match(question):
                return "ENTY:termeq"
            p6 = re.compile("^[wW]hat (causes|cause|caused) .*")
            if p6.match(question):
                return "DESC:reason_1"
            p8 = re.compile("^[wW]hat (do|does|did).* stand for \?$")
            if p8.match(question):
                return "ABBR:exp"
            p11 = re.compile('[wW]hat (do|did|does) .* eat \?')
            if p11.match(question):
                return "ENTY:food"

        def whoPattern(question):
            p1 = re.compile("^[wW]ho (is|are|was|were)( the)*( \`\`)*( [A-Z][a-z]+)+( \'\')* \?$")
            if p1.match(question):
                return "HUM:desc"
            p2 = re.compile("^[wW]ho (is|was) .*")
            if p2.match(question):
                return "HUM:ind"
        if training:
            file = DBStore.trainingRoot+"\\headword_question"+colName+".txt"
        else:
            file = DBStore.testingRoot+"\\headword_question"+colName+".txt"
        f = open(file,'r')
        p = re.compile(r"(?P<head>.+)::(?P<question>.+)")
        i = 0
        rawQuestions = DBStore.getDB()['raw'+colName]
        p2 = re.compile('\.')
        headWord = []
        for line in f:
#        line = "was:What was archy , and mehitabel ?"
            print i
            i = i + 1
            match = p.match(line)
            print match.group('question')
            head = p2.sub('',match.group('head'))
            question = match.group('question')
            raw = rawQuestions.find_one({'question':question})
            if raw['whWord']=='whWord-what':
                pattern = whatPattern(question)
            elif raw['whWord']=='whWord-who':
                pattern = whoPattern(question)
            else:
                pattern = None
            if head.isupper() and (pattern == "DESC:def_1" or pattern == 'DESC:def_2'):
                pattern = "ABBR:exp"
            if head == 'null':
                headWord = [None,pattern]
            elif pattern is None:
                headWord = [head,pattern]
            else:
                headWord = [None,pattern]
            rawQuestions.update({'question':question},{"$set":{"head":headWord}},safe=True,multi=True)
Example #11
def classification():
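    # Hyperparameter search for the ELM classifier: scan hidden-layer sizes,
    # then random seeds, and finally print each misclassified question.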
    filePath = DBStore.commonRoot + '/'+dbName + '_'+colName +'.txt'
    f = open(filePath,'w')
    def maxmeanstdv(x):
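        # Return the max, mean and sample standard deviation of x; max starts
        # at 0, so it assumes non-negative values (accuracies here).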
        from math import sqrt
        n, max, mean, std = len(x), 0, 0, 0
        for a in x:
            if a>max:
                max = a
            mean = mean + a
        mean = mean / float(n)
        for a in x:
            std = std + (a - mean)**2
        std = sqrt(std / float(n-1))
        return max, mean, std
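    # Sanity check (assumption: numpy is available; this module does not
    # import it): np.mean(x) and np.std(x, ddof=1) should match the mean and
    # the sqrt(std / (n-1)) sample deviation computed above.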
    #===============================================================================
    # Running ELM
    #===============================================================================

    print 'Running ELM'
    result = []
#    for i in range(100,500,10):
#        print "i: " + str(i)
#        for j in range(0,20):
#            print "   j: " + str(j)
#            acc, missClass = ELMClassify(numberOfHiddenNeuron=i,train='feature'+common+'_'+classType, test='feature'+colName+'_'+classType)
#            result.append(acc)
#        max,mean,std = maxmeanstdv(result)
#        result = []
#        print max,mean,std
    
#    for i in range(0,20):
#        print i
#        acc, missClass = ELMClassify(numberOfHiddenNeuron=1200,train='feature'+common+'_'+classType, test='feature'+colName+'_'+classType)
#        result.append(acc)
#    max,mean,std = maxmeanstdv(result)
#    pickle.dump(result,f)
#    f.close()
#    print str(max), str(mean), str(std)
#

###Cross Validation
#    meanSeed = 0
#    meanMax = 0
#    for i in range(0,1000):
#        print 'Seed: ' + str(i)
#        result = []
#        train_data = loadtxt(DBStore.trainingRoot+"/vector_"+ 'feature'+common+'_'+classType +".txt")
#        for j in range(0,5):
#            X = train_data[:,1:size(train_data,1)]
#            Y = train_data[:,0]
#            data = KFold(len(Y), k=2)
#            for train, test in data:
#        #        print X[train],Y[train]
#                random.seed(i)
#                acc, missClass = ELMClassify(numberOfHiddenNeuron=210,
#                                                 p=X[train], t=Y[train],
#                                                 tv_t=Y[test],tv_p=X[test],
#                                                 )
#                result.append(acc)
#            random.seed(1+j)
#            shuffle(train_data)
#        max,mean,std = maxmeanstdv(result)
#        print max,mean
#        if mean>meanMax:
#            meanSeed = i
#            meanMax = mean
#            print 'New Maximum: ' + str(mean) + ' with seed: ' + str(i) 
        
#### Checking best hidden neuron
#
    maxHidden = 0
    max = 0
    for i in range(1000,1301,50):
        print 'Hidden neuron: ' + str(i) 
        random.seed(31)
        acc, missClass = ELMClassify(numberOfHiddenNeuron=i,train='feature'+common+'_'+classType, test='feature'+colName+'_'+classType)
        print acc
        if acc>max:
            maxHidden = i
            max = acc
            print 'New Maximum: ' + str(acc) + ' with hidden: ' + str(i) 
    print maxHidden, max 
#    
# Checking best seed
    start = time.time()
    maxSeed = 0
    max = 0
    for i in range(1,51):
        print 'Seed: ' + str(i) 
        random.seed(i)
        acc, missClass = ELMClassify(numberOfHiddenNeuron=maxHidden,train='feature'+common+'_'+classType, test='feature'+colName+'_'+classType)
        print acc
        result.append(acc)
        if acc>max:
            maxSeed = i
            max = acc
            print 'New Maximum: ' + str(acc) + ' with seed: ' + str(i) 
    max,mean,std = maxmeanstdv(result)
    print mean,std,max,maxSeed
#    
    random.seed(maxSeed)
    acc, missClass = ELMClassify(numberOfHiddenNeuron=1150,train='feature'+common+'_'+classType, test='feature'+colName+'_'+classType)
    print acc
#    ResultProcessor.parseResult(missClass, 'feature'+colName+'_'+classType,'raw'+colName, classType)

    for c in missClass:
        rawQuestions = DBStore.getDB()['raw'+colName]
        question = rawQuestions.find()[c[0]]
        print question['question'] + ' ' + question[classType.lower()] + ' ' + str(question['head']) + ' ' + str(c[0]) + ' ' + str(c[1]) + ' ' + str(c[2])
Example #12
#    m = svm_train(y,x,"-c 4 -t 0")
#    print 'SVM training time: ' + str(time.time()-start)
#    start = time.time()
#    p_label, p_acc, p_val = svm_predict(y_test,x_test,m)
#    print 'SVM testing time: ' + str(time.time()-start)
#    print 'Done'


insert = {'unigram':False,'hypernym':False,'head':True,'whWord':True}
classType = 'Coarse'
dbName = 'QAnlp'
for key,value in insert.iteritems():
    if value:
        dbName = dbName + '_' + key
                
DBStore.init(dbName)
dataprep = DataRetrieval()
termExtractor = BagOfWords()
featureExtractor = FeatureExtractor()
colName = '5500'
common = '5500'
training = True
questions = DBStore.queryDB('raw'+colName)
#===============================================================================
# Insert Raw File to Database
#===============================================================================


#dataprep.insertFile2Database(colName,training)
#parsingBerkeley()
#featureInit()
Example #13
#    print 'Running SVM'
#    y,x = svm_read_problem(DBStore.trainingRoot+'/vector_SVM_'+common+'.txt')
#    y_test,x_test = svm_read_problem(DBStore.testingRoot+'/vector_SVM_'+colName+'.txt')
#    m = svm_train(y,x,"-c 4 -t 0")
#    p_label, p_acc, p_val = svm_predict(y_test,x_test,m)
#    print 'Done'


insert = {"unigram": True, "hypernym": True, "head": True, "whWord": True}
classType = "Coarse"
dbName = "QAnlp"
for key, value in insert.iteritems():
    if value:
        dbName = dbName + "_" + key

DBStore.init(dbName)
dataprep = DataRetrieval()
termExtractor = BagOfWords()
featureExtractor = FeatureExtractor()
colName = "5500"
common = "5500"
training = True
questions = DBStore.queryDB("raw" + colName)
# ===============================================================================
# Insert Raw File to Database
# ===============================================================================


# dataprep.insertFile2Database(colName,training)
# parsingBerkeley()
featureInit()