Python Question.tokenize Examples

Programming Language: Python

Namespace/Package Name: question

Class/Type: Question

Method/Function: tokenize

Examples at hotexamples.com: 1

Python Question.tokenize - 1 example found. This is the top-rated real-world Python example of question.Question.tokenize extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

Question(30)

addAnswer(5)

add_alternative(5)

get_active_question_by_id(4)

count(3)

answer(2)

NER(1)

get_question_only_by_id(1)

fromMsg(1)

getFromId(1)

getList(1)

getQuestions(1)

get_question_by_revision_id(1)

multi_line_answer(1)

is_null(1)

multi_choice(1)

single_choice(1)

single_line_answer(1)

text(1)

tokenize(1)

update_question_by_id(1)

update_question_by_id_and_create_revision(1)

valid_answer(1)

flush(1)

change_case(1)

convert(1)

add_correct(1)

__init__(1)

_addQuestion(1)

_displayAllQuestions(1)

add(1)

addAssumption(1)

add_answer(1)

add_choice(1)

add_correct_answer(1)

ParseLine(1)

add_description(1)

add_q(1)

all(1)

answer_NP_type(1)

answer_or_default(1)

build(1)

by_id(1)

validate(1)

Example #1

Show file

File: question_features.py Project: honeybread/MLProject_QB_Spring2015

class QuestionFeatureExtractor(object):
    '''Accumulates hand-crafted features for one question at a time.

    Calling the instance with a question id loads that question and resets
    ``self.features``.  Each feature method then merges its values into
    ``self.features``, keyed by word position when ``granularity == 'word'``
    or by the question id when ``granularity == 'question'``.

    ``self.sparse_mega_dict`` is read as
    ``{qid: {'sentences': [{'words': [[token, info], ...]}, ...]}}`` where
    ``info`` carries 'CharacterOffsetBegin', 'PartOfSpeech' and
    'NamedEntityTag' (presumably a cached parser output -- confirm against
    the script that writes the pickle).
    '''

    # Tags counted by sparse_features(); any other POS tag is counted as u'O'.
    _POS_TAGS = (
        u'CC', u'CD', u'DT', u'EX', u'FW', u'IN',
        u'JJ', u'JJR', u'JJS', u'LS', u'MD', u'NN', u'NNS', u'NNP', u'NNPS',
        u'PDT', u'PRP', u'PRP$', u'RB', u'RBR', u'RBS', u'RP', u'SYM',
        u'TO', u'UH', u'VB', u'VBD', u'VBG', u'VBN', u'VBP', u'VBZ',
        u'WDT', u'WP$', u'WRB', u'POS', u'WP', u',', u'!', u'"',
        u'#', u'$', u'%', u'&', u'\\', u'\'', u'(', u')', u'*',
        u'+', u'-', u'.', u'/', u':', u';', u'<', u'>', u'=',
        u'?', u'@', u'[', u']', u'^', u'_', u'~', u'{', u'}', u'|',
        u'``', u"''", u'-LRB-', u'O',
    )

    # Tags counted by sparse_features() and NER(); unknown tags count as u'O'.
    _NER_TAGS = (
        u'PERSON', u'LOCATION', u'ORGANIZATION', u'MISC',
        u'DATE', u'TIME', u'DURATION', u'SET',
        u'MONEY', u'PERCENT', u'NUMBER', u'ORDINAL',
        u'O',
    )

    # Bucket used for any tag missing from the tables above.
    _OTHER_TAG = u'O'

    def __init__(self, granularity = 'word',
                 sparse_dict_path = "../../data/sparse_mega_dict2.txt"):
        '''Load the pre-parsed question dictionary.

        granularity      -- 'word' (features per word position) or
                            'question' (one feature dict per question id).
        sparse_dict_path -- pickle file holding the parse dictionary; the
                            default is the path that used to be hard-coded.
        '''
        self.granularity = granularity
        # Binary mode is what pickle expects; text mode only happened to work
        # for ASCII pickles on POSIX under Python 2.
        # NOTE(review): pickle.load executes arbitrary code from the file --
        # only point this at data files you produced yourself.
        with open(sparse_dict_path, "rb") as fp:
            self.sparse_mega_dict = pickle.load(fp)

    def __call__(self, qid):
        '''Select question ``qid`` and start with an empty feature table.'''
        self.question = Question(qid)
        self.qid = int(qid)
        self.qtext = self.question.text
        # Tokens and sentences are deliberately not computed here; the
        # methods that need them fetch them on demand (see _get_tokens and
        # _get_sentences).
        self.features = defaultdict(dict)

    def _get_tokens(self):
        '''Return ``self.tokens`` if a caller set it, else tokenize now.'''
        tokens = getattr(self, 'tokens', None)
        if tokens is None:
            # assumes Question.tokenize() without arguments tokenizes the
            # whole question text -- TODO confirm against question.py
            tokens = self.question.tokenize()
        return tokens

    def _get_sentences(self):
        '''Return ``self.sentences`` if a caller set it, else fetch now.'''
        sentences = getattr(self, 'sentences', None)
        if sentences is None:
            # assumes Question.get_sentences() exists and returns the
            # question's sentences -- TODO confirm against question.py
            sentences = self.question.get_sentences()
        return sentences

    def _parsed_tokens(self, fields):
        '''Collect the requested parse fields for the current question.

        Returns an OrderedDict mapping ``(token, char_offset)`` to a tuple
        with one entry per name in ``fields``, in parse order, or None when
        the current question id has no entry in ``self.sparse_mega_dict``.
        '''
        if self.qid not in self.sparse_mega_dict:
            return None

        parsed = OrderedDict()
        for sentence in self.sparse_mega_dict[self.qid]['sentences']:
            for entry in sentence['words']:
                info = entry[1]
                key = (entry[0], info['CharacterOffsetBegin'])
                parsed[key] = tuple(info[name] for name in fields)
        return parsed

    @staticmethod
    def _select_ner(counts, allow, restrict):
        '''Return a fresh copy of ``counts`` narrowed by restrict/allow.

        ``restrict`` tags are dropped first, then only ``allow`` tags are
        kept (unless ``allow == 'all'``).  Unknown tag names raise KeyError.
        A copy is returned so the running counter is never aliased into the
        feature table.
        '''
        selected = dict(counts)
        if restrict is not None:
            for item in restrict:
                selected.pop(item)
        if allow != 'all':
            selected = dict((item, selected[item]) for item in allow)
        return selected

    def feat_vectorizer(self, dict_of_dict):
        '''Replace every inner dict by the list of its values (in place).

        Returns the same ``dict_of_dict`` object that was passed in.
        '''
        for item in dict_of_dict:
            # list() keeps the result a real list on Python 3 as well.
            dict_of_dict[item] = list(dict_of_dict[item].values())
        return dict_of_dict

    def caps_cumulative(self):
        '''assigns number of words[cumulative] starting with capital letters

        Word granularity stores the running count under "c_cap" for every
        word position (0.0, 1.0, ...); question granularity stores the total
        under the question id.
        '''
        if self.granularity == 'word':
            num_cap = 0.
            for index, word in enumerate(self._get_tokens()):
                # Test only the first character: str.isupper() on the whole
                # word would count ALL-CAPS words only, not capitalized ones.
                if word[:1].isupper():
                    num_cap += 1.
                self.features[float(index)].update({"c_cap": num_cap})

        elif self.granularity == 'question':
            num_cap = 0.
            for word in self._get_tokens():
                if word[:1].isupper():
                    num_cap += 1.
            self.features[self.qid].update({"c_cap": num_cap})

    def sentence_position(self):
        '''assigns the sentence position containing the word for word granularity
           [or]  assigns number of sentences in question for question granularity

        Values are stored under "s_pos".
        '''
        if self.granularity == 'word':
            word_position = 0.
            for sent_index, sent in enumerate(self._get_sentences()):
                # Only the token count matters here: every token of the
                # sentence gets the index of that sentence.
                for _token in self.question.tokenize(othertext = sent):
                    self.features[word_position].update(
                        {"s_pos": float(sent_index)})
                    word_position += 1.

        elif self.granularity == 'question':
            self.features[self.qid].update(
                {"s_pos": len(self._get_sentences())})

    def sparse_features(self):
        '''builds cumulative parts of speech feature for word granularity
            or total parts of speech for question granularity

        NER tag counts are merged in as well.  When the current question has
        no parse entry, ``self.features`` is set to None and nothing else
        happens.

        NOTE(review): u'O' is a key of both the POS and the NER counters, so
        the NER value overwrites the POS fallback count in the merged
        feature dict.  Left as is to keep feature names stable.
        '''
        # Debug output kept (in a form valid on Python 2 and 3) so console
        # output does not change.
        print(self.qid)

        parsed = self._parsed_tokens(('PartOfSpeech', 'NamedEntityTag'))
        if parsed is None:
            self.features = None
            return

        pos_counts = dict.fromkeys(self._POS_TAGS, 0)
        ner_counts = dict.fromkeys(self._NER_TAGS, 0)
        per_word = self.granularity == 'word'
        per_question = self.granularity == 'question'

        if per_word or per_question:
            for (_word, char_pos), (pos, ner) in parsed.items():
                if pos not in pos_counts:
                    pos = self._OTHER_TAG
                if ner not in ner_counts:
                    ner = self._OTHER_TAG
                pos_counts[pos] += 1.
                ner_counts[ner] += 1.

                if per_word:
                    # Word index = number of spaces before the token start.
                    word_position = self.qtext.count(" ", 0, int(char_pos))
                    self.features[word_position].update(pos_counts)
                    self.features[word_position].update(ner_counts)

        if per_word:
            print(self.features)
            print("\n\n\n")
        elif per_question:
            self.features[self.qid].update(pos_counts)
            self.features[self.qid].update(ner_counts)

    def NER(self, allow = 'all', restrict = None):
        '''builds cumulative NER feature for word granularity
            or total NER for question granularity

        allow    -- 'all', or an iterable of NER tags to keep.
        restrict -- None, or an iterable of NER tags to drop (applied before
                    ``allow``).

        Tags missing from the known NER table are counted as u'O', matching
        sparse_features().  The selected counts are merged into the existing
        feature dict, so features written by other methods survive.  When
        the current question has no parse entry, ``self.features`` is set to
        None.

        NOTE(review): word positions here are 1-based (space count + 1.)
        while sparse_features() and caps_cumulative() are 0-based; kept for
        backward compatibility -- confirm which one consumers expect.
        '''
        parsed = self._parsed_tokens(('NamedEntityTag',))
        if parsed is None:
            self.features = None
            return

        ner_counts = dict.fromkeys(self._NER_TAGS, 0)

        if self.granularity == 'word':
            for (_word, char_pos), (ner,) in parsed.items():
                if ner not in ner_counts:
                    ner = self._OTHER_TAG
                ner_counts[ner] += 1.
                word_position = self.qtext.count(" ", 0, int(char_pos)) + 1.
                # A filtered *copy* per position: the running counter stays
                # complete and earlier positions are not mutated later.
                self.features[word_position].update(
                    self._select_ner(ner_counts, allow, restrict))

        elif self.granularity == 'question':
            for (ner,) in parsed.values():
                if ner not in ner_counts:
                    ner = self._OTHER_TAG
                ner_counts[ner] += 1.
            self.features[self.qid].update(
                self._select_ner(ner_counts, allow, restrict))