Example #1
 def _processing_raw_training_data2unigrams_and_tags(self):
     '''
     Convert line data (WSAtom-wrapped) into training data (for word segmentation).
     [ internal helper ]
     Logic:
         Process self.raw_training_data and set self.training_unigrams_data
         and self.training_tags_data.
         training_unigrams_data : list of lists; the innermost elements are unigrams.
                                  => [ [WSAtom(unigram), WSAtom(unigram), ...], ... ]
         training_tags_data : list of lists; the innermost elements are tags.
                              => [ [tag_b, tag_m, ...], ... ]
     '''
     logging.info("processing raw training data to unigrams and tags .")
     if self.raw_training_data is None:
         logging.error("failed!")
         return
     self.training_unigrams_data = []
     self.training_tags_data = []
     for sentence in self.raw_training_data:
         unigram_line, tags = Segmentor._processing_one_segmented_WSAtom_instance2unigrams_and_tags(
             sentence)
         self.training_tags_data.append(tags)
         self.training_unigrams_data.append(unigram_line)
     if DEBUG:
         logger.debug("the 1st line : %s" % (u" ".join(
             [unicode(atom)
              for atom in self.training_unigrams_data[0]]).encode('utf8')))
         logger.debug("the 1st tag list : " + " ".join(
             [TAG_NAME_TRANS[tag] for tag in self.training_tags_data[0]]))
         logger.debug("the 1st origin seg line : " + " ".join([
             WSAtomTranslator.trans_atom_gram_list2unicode_line(
                 atom_list).encode("utf8")
             for atom_list in self.raw_training_data[0]
         ]))
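The per-sentence helper is not shown here; below is a minimal standalone sketch of what such a conversion typically looks like, assuming a BMES-style tag scheme and plain strings in place of WSAtom (the tag constants and the function name are illustrative, not from the source):

# Hypothetical sketch: flatten one segmented sentence into unigrams plus
# parallel BMES tags. Tag names and values are assumptions for illustration.
TAG_B, TAG_M, TAG_E, TAG_S = 0, 1, 2, 3

def sentence2unigrams_and_tags(segmented_sentence):
    '''segmented_sentence: list of words, each word a list of unigrams.
       Returns (flat unigram list, parallel tag list).'''
    unigrams, tags = [], []
    for word in segmented_sentence:
        unigrams.extend(word)
        if len(word) == 1:
            tags.append(TAG_S)  # single-unigram word
        else:
            tags.extend([TAG_B] + [TAG_M] * (len(word) - 2) + [TAG_E])
    return unigrams, tags

# e.g. [["A"], ["B", "C", "D"]] -> (["A", "B", "C", "D"], [TAG_S, TAG_B, TAG_M, TAG_E])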
Example #2
    def build_lexicon_match_state(self, instance):
        '''
        For every unigram, record the max length of a matched lexicon word
        in which the unigram appears:
        as the word head,
        as a word middle,
        as the word end.
        '''
        instance_len = len(instance)
        match_state = [[1] * 3
                       for i in range(instance_len)]  #! minimum length is 1
        for i in range(instance_len):
            j = min(i + LEXICON_MATCH_MAX_LENGTH, instance_len - 1)
            while j > i:
                test_word = WSAtomTranslator.trans_atom_gram_list2unicode_line(
                    instance[i:j + 1])
                if test_word in self.lexicon:
                    word_len = j - i + 1
                    #! max length as the word head
                    match_state[i][0] = max(match_state[i][0], word_len)
                    #! max length as the word middle
                    for interval in range(i + 1, j):
                        match_state[interval][1] = max(
                            match_state[interval][1], word_len)
                    #! max length as the word end
                    match_state[j][2] = max(match_state[j][2], word_len)
                    break
                j -= 1

        return match_state
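To make the matching concrete, here is a self-contained sketch of the same idea over plain characters, assuming a set-like lexicon and a hypothetical max candidate length of 5 (names and the constant's value are illustrative):

# Standalone sketch of the longest-match state computation over plain
# characters; lexicon is any container supporting `in` (e.g. a set).
LEXICON_MATCH_MAX_LENGTH = 5  # assumed value for illustration

def lexicon_match_state(chars, lexicon):
    n = len(chars)
    state = [[1, 1, 1] for _ in range(n)]  # [as head, as middle, as end]
    for i in range(n):
        j = min(i + LEXICON_MATCH_MAX_LENGTH, n - 1)
        while j > i:  # try the longest candidate first
            word = "".join(chars[i:j + 1])
            if word in lexicon:
                word_len = j - i + 1
                state[i][0] = max(state[i][0], word_len)      # head
                for k in range(i + 1, j):
                    state[k][1] = max(state[k][1], word_len)  # middle
                state[j][2] = max(state[j][2], word_len)      # end
                break
            j -= 1
    return state

# lexicon_match_state(list("abcd"), {"abc"})
# -> [[3, 1, 1], [1, 3, 1], [1, 1, 3], [1, 1, 1]]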
Example #3
    def _build_inner_lexicon(self, threshold=1.):
        logging.info("build inner lexicon from training data .")
        if self.raw_training_data is None:
            logging.error('failed')
            return
        words_counter = Counter()
        for raw_instance in self.raw_training_data:
            #! len > 1 keeps only multi-atom words as lexicon entries
            unicode_instance = [
                WSAtomTranslator.trans_atom_gram_list2unicode_line(
                    atom_instance_gram_list)
                for atom_instance_gram_list in raw_instance
                if len(atom_instance_gram_list) > 1
            ]
            words_counter.update(unicode_instance)
        total_freq = sum(words_counter.viewvalues())
        threshold_num = total_freq  #! default so the DEBUG report below is defined when threshold >= 1.
        lexicon_list = []
        if threshold < 1.:
            ##! A fast and clean implementation would use Counter.most_common(N) to take the top words,
            ##! but that can admit some words to the lexicon while other words with the same frequency
            ##! are cut off at the tail, which is unfair. The logic below keeps whole frequency groups.
            ##! Strategy changed: words at the threshold frequency are now also accepted
            ##! (originally, words at the edge frequency were rejected).
            threshold_num = int(total_freq * threshold)
            pre_freq = INF
            words_has_same_freq = []
            freq_counter = 0
            for word, freq in words_counter.most_common():
                if freq != pre_freq:
                    lexicon_list.extend(words_has_same_freq)
                    words_has_same_freq = []
                    pre_freq = freq
                    if freq_counter > threshold_num:
                        break
                words_has_same_freq.append(word)
                freq_counter += freq
            else:
                #! loop finished with every word iterated, so append the final tie group as well
                lexicon_list.extend(words_has_same_freq)
        else:
            lexicon_list = words_counter.keys()
        logging.info("inner lexicon info : %d/%d" %
                     (len(lexicon_list), len(words_counter)))

        if DEBUG:
            freq_in_lexicon = 0
            min_freq = INF
            for word in lexicon_list:
                word_freq = words_counter[word]
                freq_in_lexicon += word_freq
                if word_freq < min_freq:
                    min_freq = word_freq
            logger.debug("origin words count : " + str(len(words_counter)))
            logger.debug("lexicon count : " + str(len(lexicon_list)))
            logger.debug((
                "threshold num is %d , actual total frequency in lexicon is %d (total frequency of all words : %d) , "
                "minimum frequency in lexicon is %s , frequency ratio is %.2f%% , word count ratio is %.2f%%"
                % (threshold_num, freq_in_lexicon, total_freq,
                   min_freq, 100. * freq_in_lexicon / total_freq,
                   100. * len(lexicon_list) / len(words_counter))))
        self.inner_lexicon = dict.fromkeys(
            lexicon_list)  #! dict gives O(1) membership tests, more efficient than a list
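The tie-fair cutoff is the subtle part; here is a compact standalone sketch of just that step, assuming an already populated Counter (the function name and threshold value are illustrative):

from collections import Counter

def cut_lexicon(words_counter, threshold=0.7):
    # Keep the most frequent words until about `threshold` of the total
    # frequency mass is covered, but never split a group of words that
    # share the same frequency.
    total_freq = sum(words_counter.values())
    threshold_num = int(total_freq * threshold)
    lexicon, same_freq_group = [], []
    pre_freq, freq_seen = None, 0
    for word, freq in words_counter.most_common():
        if freq != pre_freq:
            lexicon.extend(same_freq_group)  # commit the whole tie group
            same_freq_group, pre_freq = [], freq
            if freq_seen > threshold_num:    # cut only at a group boundary
                break
        same_freq_group.append(word)
        freq_seen += freq
    else:
        lexicon.extend(same_freq_group)      # all words iterated: keep the tail group
    return lexicon

# Counter(a=5, b=3, c=3, d=1) with threshold=0.7 gives threshold_num=8;
# 'b' and 'c' share frequency 3, so both survive: ['a', 'b', 'c']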