def collinsHeadSenseExtractor(self,questions, colName,training): rawQuestions = DBStore.getDB()['raw'+colName] adaptedLesk = AdaptedLesk(6) i = 1 questions.rewind() p = re.compile('(?P<head1>.+)--(?P<head2>.+)') for question in questions: # line = "was:What was archy , and mehitabel ?" print i i = i + 1 headWord = question['head'] try: match = p.match(headWord[0]) if match: headWord[0] = match.group('head1') except StandardError: pass if headWord[0] is None \ or len(wordnet.synsets(headWord[0]))==0 \ or headWord[0] == 'null': headSense = "null" else: pos = DataRetrieval.replace(question['tagged'][headWord[0]]) if question['whWord'] == 'whWord-how': headSense = 'null' else: print question['tokenized'],headWord[0],question['tagged'] headSense = adaptedLesk.wsd(question['tokenized'],headWord[0],question['tagged']) rawQuestions.update({'qID':question['qID']},{"$set":{"headSense":headSense}},safe=True,multi=True)
def featureInsertion(): # if training: #=============================================================================== # Generating bag of words #=============================================================================== # print "Generating bag of words" # termExtractor.bagOfWordBuilder(questions,'words'+colName,insert) # print "Done" #=============================================================================== # Inserting bag of words to vector space #=============================================================================== print 'Inserting bag of words to vector space' # termExtractor.vectorSpaceBuilder(questions, colName, 'feature'+colName+'_'+classType,common,insert) print 'Done' #=============================================================================== # Generating data for classifier #=============================================================================== print 'Generating data for classifier' dataRetrieval = DataRetrieval() dataRetrieval.extractData('feature'+colName+'_'+classType,training,common,False,classType) dataRetrieval.extractData('feature'+colName+'_'+classType,training,common,True,classType) print 'Done'
def featureInsertion(): if training: # =============================================================================== # Generating bag of words # =============================================================================== print "Generating bag of words" termExtractor.bagOfWordBuilder(questions, "words" + colName, insert) print "Done" # =============================================================================== # Inserting bag of words to vector space # =============================================================================== print "Inserting bag of words to vector space" termExtractor.vectorSpaceBuilder(questions, colName, "feature" + colName + "_" + classType, common, insert) print "Done" # =============================================================================== # Generating data for classifier # =============================================================================== print "Generating data for classifier" dataRetrieval = DataRetrieval() dataRetrieval.extractData("feature" + colName + "_" + classType, training, common, False, classType) # dataRetrieval.extractData('feature'+colName+'_'+classType,training,common,True,classType) print "Done"
def replaceTag(tag):
    """Map ``tag`` through ``DataRetrieval.replace``, turning ``'a'`` into ``'n'``.

    :param tag: value handed unchanged to ``DataRetrieval.replace``.
    :return: ``'n'`` when the mapped value equals ``'a'``; otherwise the
        mapped value as returned by ``DataRetrieval.replace``.
    """
    mapped = DataRetrieval.replace(tag)
    return 'n' if mapped == 'a' else mapped