def _processing_raw_training_data2unigrams_and_tags(self):
    '''Turn raw segmented training data (WSAtom wrapped) into WS training data.

    Reads self.raw_training_data ; on success sets
        self.training_unigrams_data : [ [WSAtom(unigram) , ...] , ... ]
        self.training_tags_data     : [ [tag_b , tag_m , ...] , ... ]
    Logs an error and returns early when no raw data has been loaded.
    '''
    logging.info("processing raw training data to unigrams and tags .")
    if self.raw_training_data is None:
        logging.error("failed!")
        return
    self.training_unigrams_data = []
    self.training_tags_data = []
    for sentence in self.raw_training_data:
        unigram_line, tags = \
            Segmentor._processing_one_segmented_WSAtom_instance2unigrams_and_tags(sentence)
        self.training_unigrams_data.append(unigram_line)
        self.training_tags_data.append(tags)
    if DEBUG:
        # dump the first converted sample for eyeball checking
        first_line = u" ".join(
            [unicode(atom) for atom in self.training_unigrams_data[0]]).encode('utf8')
        logger.debug("the 1st line : %s" % (first_line))
        first_tags = " ".join(
            [TAG_NAME_TRANS[tag] for tag in self.training_tags_data[0]])
        logger.debug("the 1st tag list : " + first_tags)
        first_seg_line = " ".join(
            [WSAtomTranslator.trans_atom_gram_list2unicode_line(atom_list).encode("utf8")
             for atom_list in self.raw_training_data[0]])
        logger.debug("the 1st origin seg line : " + first_seg_line)
def build_lexicon_match_state(self, instance):
    '''Compute per-unigram lexicon-match lengths for `instance`.

    For every unigram position record three values :
        [0] longest lexicon word starting here (word head)
        [1] longest lexicon word covering it strictly inside (word middle)
        [2] longest lexicon word ending here (word end)

    Returns : list (one entry per unigram) of 3-element lists ; every
    entry is at least 1.
    '''
    n = len(instance)
    state = [[1, 1, 1] for _ in range(n)]  #! minimum length is 1
    for start in range(n):
        # try candidates from the longest allowed span downwards ;
        # stop at the first (i.e. longest) lexicon hit for this start
        for end in range(min(start + LEXICON_MATCH_MAX_LENGTH, n - 1), start, -1):
            candidate = WSAtomTranslator.trans_atom_gram_list2unicode_line(
                instance[start:end + 1])
            if candidate not in self.lexicon:
                continue
            length = end - start + 1
            #! max length as the word head
            if length > state[start][0]:
                state[start][0] = length
            #! max length as the word middle
            for mid in range(start + 1, end):
                if length > state[mid][1]:
                    state[mid][1] = length
            #! max length as the word end
            if length > state[end][2]:
                state[end][2] = length
            break
    return state
def _processing_raw_training_data2unigrams_and_tags(self) :
    '''Convert raw training data (WSAtom wrapped lines) to WS training data.
    [ inner class function ]

    logic : process self.raw_training_data and set
        self.training_unigrams_data : list of lists ; innermost element is
            a unigram => [ [WSAtom(unigram) , WSAtom(unigram) , ...] , ... ]
        self.training_tags_data : list of lists ; innermost element is a
            tag => [ [tag_b , tag_m , ...] , ... ]
    '''
    logging.info("processing raw training data to unigrams and tags .")
    if self.raw_training_data is None :
        logging.error("failed!")
        return
    self.training_unigrams_data = []
    self.training_tags_data = []
    for sentence in self.raw_training_data :
        # one segmented sentence -> (unigram list , parallel tag list)
        unigram_line , tags = Segmentor._processing_one_segmented_WSAtom_instance2unigrams_and_tags(sentence)
        self.training_tags_data.append(tags)
        self.training_unigrams_data.append(unigram_line)
    if DEBUG :
        # dump the first converted sample for eyeball checking
        logger.debug("the 1st line : %s" %( u" ".join( [ unicode(atom) for atom in self.training_unigrams_data[0]] ).encode('utf8') ))
        logger.debug("the 1st tag list : " + " ".join([ TAG_NAME_TRANS[tag] for tag in self.training_tags_data[0] ]))
        logger.debug("the 1st origin seg line : " + " ".join( [WSAtomTranslator.trans_atom_gram_list2unicode_line(atom_list).encode("utf8") for atom_list in self.raw_training_data[0]]))
def build_lexicon_match_state(self , instance) :
    '''Compute per-unigram lexicon-match lengths for `instance`.

    For every unigram , record :
        [0] as a word head   , the word's max length
        [1] as a word middle , the word's max length
        [2] as a word end    , the word's max length

    Returns : list (one entry per unigram) of 3-element lists ; minimum 1.
    '''
    instance_len = len(instance)
    match_state = [ [ 1 ] * 3 for i in range(instance_len) ] #! minimum length is 1
    for i in range(instance_len) :
        # longest candidate end index , clipped to the instance tail .
        # NOTE(review): with j = i + LEXICON_MATCH_MAX_LENGTH the candidate
        # instance[i:j+1] spans LEXICON_MATCH_MAX_LENGTH + 1 atoms — confirm
        # this off-by-one is intended .
        j = min( i + LEXICON_MATCH_MAX_LENGTH , instance_len - 1 )
        while j > i :
            test_word = WSAtomTranslator.trans_atom_gram_list2unicode_line(instance[i:j+1])
            if test_word in self.lexicon :
                word_len = j - i + 1
                #! max length as the word head
                match_state[i][0] = max( match_state[i][0] , word_len )
                #! max length as the word middle
                for interval in range(i+1 , j) :
                    match_state[interval][1] = max( match_state[interval][1] , word_len )
                #! max length as the word end
                match_state[j][2] = max( match_state[j][2] , word_len )
                break  # longest match found for this start position
            j -= 1
    return match_state
def _build_inner_lexicon(self, threshold=1.):
    '''Build self.inner_lexicon (a dict keyed by word) from raw training data.

    Only multi-atom tokens (len > 1) are counted as lexicon words.  When
    `threshold` < 1. , keep the most frequent words until their cumulative
    frequency exceeds threshold * total_freq ; all words sharing the cut-off
    frequency are accepted together so none is unfairly dropped at the edge.

    Args :
        threshold : float in (0 , 1.] , fraction of total word frequency
                    to keep ; 1. keeps every word .
    '''
    logging.info("build inner lexicon from training data .")
    if self.raw_training_data is None:
        logging.error('failed')
        return
    words_counter = Counter()
    for raw_instance in self.raw_training_data:
        #! len > 1 to ensure it is a lexicon word (single atoms excluded)
        unicode_instance = [WSAtomTranslator.trans_atom_gram_list2unicode_line(atom_instance_gram_list)
                            for atom_instance_gram_list in raw_instance
                            if len(atom_instance_gram_list) > 1]
        words_counter.update(unicode_instance)
    total_freq = sum(words_counter.viewvalues())
    #! hoisted out of the `threshold < 1.` branch : the DEBUG report below
    #! reads it , and previously it was undefined when threshold == 1.
    threshold_num = int(total_freq * threshold)
    lexicon_list = []
    if threshold < 1.:
        ##! Counter.most_common(N) would cut off tail words that share the
        ##! edge frequency with accepted words , which is unfair . So flush
        ##! words group-by-group (one group per distinct frequency) and
        ##! accept the group at the threshold frequency as a whole .
        pre_freq = INF
        words_has_same_freq = []
        freq_counter = 0
        for word, freq in words_counter.most_common():
            if freq != pre_freq:
                # frequency boundary : flush the previous same-freq group
                lexicon_list.extend(words_has_same_freq)
                words_has_same_freq = []
                pre_freq = freq
                if freq_counter > threshold_num:
                    break
            words_has_same_freq.append(word)
            freq_counter += freq
        else:
            #! loop ran to completion (all words iterated) :
            #! append the final same-freq group as well
            lexicon_list.extend(words_has_same_freq)
    else:
        lexicon_list = words_counter.keys()
    logging.info("inner lexicon info : %d/%d" % (len(lexicon_list), len(words_counter)))
    if DEBUG:
        freq_in_lexicon = 0
        min_freq = INF
        for word in lexicon_list:
            word_freq = words_counter[word]
            freq_in_lexicon += word_freq
            if word_freq < min_freq:
                min_freq = word_freq
        logger.debug("origin words count : " + str(len(words_counter)))
        logger.debug("lexicon count : " + str(len(lexicon_list)))
        #! ratios scaled by 100 to match the `%%` in the format string
        #! (previously a raw fraction was printed as a percentage) , and
        #! guarded against empty training data (ZeroDivisionError)
        freq_ratio = 100. * freq_in_lexicon / total_freq if total_freq > 0 else 0.
        count_ratio = 100. * len(lexicon_list) / len(words_counter) if words_counter else 0.
        logger.debug(("thredhold num is %d , actually total freqency in lexicon is %d(total frequency of all words : %d ),"
                      "minimun frequency in lexicon is %s , frequency ratio is %.2f%% , word count ratio is %.2f%%"
                      % (threshold_num, freq_in_lexicon, total_freq, min_freq,
                         freq_ratio, count_ratio)))
    self.inner_lexicon = dict.fromkeys(lexicon_list)  #! dict lookup is O(1) -> efficient matching
def _build_inner_lexicon(self, threshold=1.):
    '''Build self.inner_lexicon (a dict keyed by word) from raw training data.

    Only multi-atom tokens (len > 1) are counted as lexicon words .  When
    `threshold` < 1. , keep the most frequent words until their cumulative
    frequency exceeds threshold * total_freq ; words sharing the cut-off
    frequency are accepted together .
    '''
    logging.info("build inner lexicon from training data .")
    if self.raw_training_data is None:
        logging.error('failed')
        return
    words_counter = Counter()
    for raw_instance in self.raw_training_data:
        #! len > 1 to ensure it is a lexicon
        unicode_instance = [
            WSAtomTranslator.trans_atom_gram_list2unicode_line(
                atom_instance_gram_list)
            for atom_instance_gram_list in raw_instance
            if len(atom_instance_gram_list) > 1
        ]
        words_counter.update(unicode_instance)
    total_freq = sum(words_counter.viewvalues())
    lexicon_list = []
    if threshold < 1.:
        ##! a fast and clear implementation is using Counter.most_common(N) to return the threshold number of words .
        ##! but it clearly would cause some words to be added to the lexicon dict while other words with the same freq are cut off at the tail . it is bad .
        ##! So do the following logic to keep it fair .
        ##! strategy changed ! the threshold freq is also accepted (originally , words at the edge frequency were rejected) !
        threshold_num = int(total_freq * threshold)
        pre_freq = INF
        words_has_same_freq = []
        freq_counter = 0
        for word, freq in words_counter.most_common():
            if freq != pre_freq:
                # frequency boundary : flush the previous same-freq group
                lexicon_list.extend(words_has_same_freq)
                words_has_same_freq = []
                pre_freq = freq
                if freq_counter > threshold_num:
                    break
            words_has_same_freq.append(word)
            freq_counter += freq
        else:
            lexicon_list.extend(
                words_has_same_freq )  #! if it is because all words were iterated , we should append it !
    else:
        lexicon_list = words_counter.keys()
    logging.info("inner lexicon info : %d/%d" %
                 (len(lexicon_list), len(words_counter)))
    if DEBUG:
        freq_in_lexicon = 0
        min_freq = INF
        for word in lexicon_list:
            word_freq = words_counter[word]
            freq_in_lexicon += word_freq
            if word_freq < min_freq:
                min_freq = word_freq
        logger.debug("origin words count : " + str(len(words_counter)))
        logger.debug("lexicon count : " + str(len(lexicon_list)))
        # NOTE(review): `threshold_num` is undefined here when
        # threshold >= 1. (it is only assigned in the branch above) -> NameError .
        # NOTE(review): the two ratios are raw fractions but printed against
        # `%.2f%%` , so e.g. 0.85 is shown as "0.85%" — should be scaled by 100 .
        # NOTE(review): float(total_freq) / float(len(words_counter)) divide
        # by zero when the training data yields no lexicon words .
        logger.debug((
            "thredhold num is %d , actually total freqency in lexicon is %d(total frequency of all words : %d ),"
            "minimun frequency in lexicon is %s , frequency ratio is %.2f%% , word count ratio is %.2f%%"
            % (threshold_num, freq_in_lexicon, total_freq, min_freq,
               freq_in_lexicon / float(total_freq),
               len(lexicon_list) / float(len(words_counter)))))
    self.inner_lexicon = dict.fromkeys(
        lexicon_list)  #! to make it more efficient