def build_lexicon_match_state(self, instance):
    '''
    for every unigram , record :
        as a word head , the word's max length
        as a word middle , the word's max length
        as a word end , the word's max length
    '''
    instance_len = len(instance)
    match_state = [[1] * 3 for _ in range(instance_len)]  #! minimum length is 1
    for i in range(instance_len):
        j = min(i + LEXICON_MATCH_MAX_LENGTH, instance_len - 1)
        while j > i:
            test_word = WSAtomTranslator.trans_atom_gram_list2unicode_line(
                instance[i:j + 1])
            if test_word in self.lexicon:
                word_len = j - i + 1
                #! max length as the word head
                match_state[i][0] = max(match_state[i][0], word_len)
                #! max length as the word middle
                for interval in range(i + 1, j):
                    match_state[interval][1] = max(match_state[interval][1],
                                                   word_len)
                #! max length as the word end
                match_state[j][2] = max(match_state[j][2], word_len)
                break
            j -= 1
    return match_state
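#! Illustration (hypothetical data , not from the real lexicon) : suppose
#! self.lexicon contains u"ABC" and u"AB" , instance holds the 4 atoms A B C D ,
#! and LEXICON_MATCH_MAX_LENGTH >= 3 . The search is longest-match-first and
#! `break`s at the first hit , so only u"ABC" is recorded from i = 0 :
#!     match_state == [ [3,1,1] , [1,3,1] , [1,1,3] , [1,1,1] ]
#! position 0 heads a 3-atom word , position 1 is its middle , position 2 its
#! end ; the shorter u"AB" is shadowed by the longer match starting at i = 0 .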
def _processing_raw_training_data2unigrams_and_tags(self):
    '''
    from line data (WSAtom wrapped) to training data (as WS needs)
    [ inner class function ]
    logic :
        we process self.raw_training_data ,
        and set self.training_unigrams_data , self.training_tags_data
        unigram_line_list : list of lists ; the innermost element is the unigram .
                            => [ [WSAtom(unigram) , WSAtom(unigram) , ...] , ... ]
        tags_list : list of lists ; the innermost element is the tag .
                    => [ [tag_b , tag_m , ...] , ... ]
    '''
    logging.info("processing raw training data to unigrams and tags .")
    if self.raw_training_data is None:
        logging.error("failed!")
        return
    self.training_unigrams_data = []
    self.training_tags_data = []
    for sentence in self.raw_training_data:
        unigram_line, tags = Segmentor._processing_one_segmented_WSAtom_instance2unigrams_and_tags(
            sentence)
        self.training_tags_data.append(tags)
        self.training_unigrams_data.append(unigram_line)
    if DEBUG:
        logger.debug("the 1st line : %s" % (u" ".join(
            [unicode(atom) for atom in self.training_unigrams_data[0]]
        ).encode('utf8')))
        logger.debug("the 1st tag list : " + " ".join(
            [TAG_NAME_TRANS[tag] for tag in self.training_tags_data[0]]))
        logger.debug("the 1st origin seg line : " + " ".join([
            WSAtomTranslator.trans_atom_gram_list2unicode_line(
                atom_list).encode("utf8")
            for atom_list in self.raw_training_data[0]
        ]))
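#! Illustration (hypothetical sentence ; the concrete tag constants live in
#! TAG_NAME_TRANS and may be named differently) : assuming the usual B/M/E/S
#! scheme that the docstring's tag_b / tag_m suggests , a segmented line
#! [ [这] , [是] , [例 , 子] ] would produce
#!     unigram_line = [ WSAtom(这) , WSAtom(是) , WSAtom(例) , WSAtom(子) ]
#!     tags         = [ S , S , B , E ]
#! i.e. single-atom words map to S , multi-atom words to B (M ...) E .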
def _build_inner_lexicon(self, threshold=1.):
    logging.info("build inner lexicon from training data .")
    if self.raw_training_data is None:
        logging.error('failed')
        return
    words_counter = Counter()
    for raw_instance in self.raw_training_data:
        #! len > 1 to ensure it is a lexicon word (single atoms are skipped)
        unicode_instance = [
            WSAtomTranslator.trans_atom_gram_list2unicode_line(
                atom_instance_gram_list)
            for atom_instance_gram_list in raw_instance
            if len(atom_instance_gram_list) > 1
        ]
        words_counter.update(unicode_instance)
    total_freq = sum(words_counter.viewvalues())
    threshold_num = int(total_freq * threshold)  #! also used by the DEBUG report below
    lexicon_list = []
    if threshold < 1.:
        ##! a fast and clear implementation would use Counter.most_common(N) to
        ##! return the threshold number of words , but that may add some words
        ##! to the lexicon while cutting off other words with the same frequency
        ##! at the tail , which is unfair . So we use the following logic instead .
        ##! strategy changed ! words at the threshold frequency are now accepted
        ##! (originally , words at the edge frequency were rejected) !
        pre_freq = INF
        words_has_same_freq = []
        freq_counter = 0
        for word, freq in words_counter.most_common():
            if freq != pre_freq:
                lexicon_list.extend(words_has_same_freq)
                words_has_same_freq = []
                pre_freq = freq
                if freq_counter > threshold_num:
                    break
            words_has_same_freq.append(word)
            freq_counter += freq
        else:
            #! the loop consumed all words , so append the last group as well
            lexicon_list.extend(words_has_same_freq)
    else:
        lexicon_list = words_counter.keys()
    logging.info("inner lexicon info : %d/%d" % (len(lexicon_list),
                                                 len(words_counter)))
    if DEBUG:
        freq_in_lexicon = 0
        min_freq = INF
        for word in lexicon_list:
            word_freq = words_counter[word]
            freq_in_lexicon += word_freq
            if word_freq < min_freq:
                min_freq = word_freq
        logger.debug("origin words count : " + str(len(words_counter)))
        logger.debug("lexicon count : " + str(len(lexicon_list)))
        logger.debug(
            ("threshold num is %d , actual total frequency in lexicon is %d "
             "(total frequency of all words : %d) , minimum frequency in "
             "lexicon is %s , frequency ratio is %.2f%% , word count ratio is %.2f%%"
             % (threshold_num, freq_in_lexicon, total_freq, min_freq,
                freq_in_lexicon * 100. / total_freq,
                len(lexicon_list) * 100. / len(words_counter))))
    self.inner_lexicon = dict.fromkeys(lexicon_list)  #! dict lookup is O(1)
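#! Worked example of the fair-threshold logic above (hypothetical counts) :
#! words_counter = {A:5 , B:3 , C:3 , D:1} , total_freq = 12 , threshold = 0.7
#! => threshold_num = int(12 * 0.7) = 8 . Iterating most_common() :
#!     A(freq 5) : flush [] , 0 <= 8 , buffer = [A] , freq_counter = 5
#!     B(freq 3) : flush [A] , 5 <= 8 , buffer = [B] , freq_counter = 8
#!     C(freq 3) : same freq , buffer = [B , C] , freq_counter = 11
#!     D(freq 1) : flush [B , C] , 11 > 8 -> break
#! lexicon_list == [A , B , C] : B and C are both kept although the limit is
#! crossed inside their frequency group , and D is rejected -- no word is cut
#! off while an equally frequent word stays in .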
class DatasetHandler(object):
    @staticmethod
    def is_readable(path):
        return os.access(path, os.F_OK) and os.access(path, os.R_OK)

    @staticmethod
    def is_writeable(path):
        if os.access(path, os.F_OK):
            return os.access(path, os.W_OK)
        #! path does not exist , so check whether its directory is writeable
        dir_path = os.path.dirname(os.path.abspath(path))  #!! os.path.abspath is needed !
        #~ otherwise dirname returns an empty str for a relative path
        return os.access(dir_path, os.F_OK) and os.access(dir_path, os.W_OK)

    @staticmethod
    def get_file_encoding(f):
        '''
        get the file's encoding ; a simple and naive implementation
        Args :
            f : file
        Returns :
            encoding : str
        Attention :
            on failure , the process exits !
        '''
        cur_g = f.tell()
        line = f.readline()
        f.seek(cur_g)  #! rewind so the caller re-reads the sampled line
        encoding_list = []
        if f.encoding is not None:
            encoding_list.append(f.encoding)
        encoding_list.extend(["utf8", "gb18030"])
        for encoding in encoding_list:
            try:
                line.decode(encoding)
            except (UnicodeDecodeError, LookupError):
                continue
            return encoding
        logging.error("failed to decode the training data . file path : '%s'" % (f.name))
        print >> sys.stderr, "Exit"
        exit(1)

    @staticmethod
    def read_training_data(tf):
        '''
        read lines from the training dataset
        Args :
            tf : file object , or a path str , of the training data
        Returns :
            data_lines : lines of the dataset ; each line is a list , and every
                         element of it is also a list ! the innermost element is WSAtom .
                         => [ [ [ WSAtom("like") , WSAtom("我") , ... ] , [ WSAtom("一") , WSAtom("样") ] , ... ] , ... ]
            what is this ? -> the innermost list is just like the n-grams of chars !
                              every word is represented by a list of WSAtom ;
                              the upper list is the sentence , and the outermost
                              is the list of sentences .
            why WSAtom ?   -> because we want an English word to be a `single
                              representation` instead of a `list of letters` !
        '''
        if not isinstance(tf, file):
            try:
                tf = open(tf)
            except IOError:
                traceback.print_exc()
                exit(1)
        logging.info("reading training data from '%s'" % (tf.name))
        data_lines = []
        encoding = DatasetHandler.get_file_encoding(tf)
        WSAtom.set_encoding(encoding)
        for line in tf:
            line = line.strip()
            if len(line) == 0:
                continue
            try:
                uline = line.decode(encoding)
            except UnicodeDecodeError:
                logging.warning("decoding dataset error : %s " % (line))
                continue
            uline_parts = uline.split()
            atom_list = []
            for uline_part in uline_parts:
                atom_list.append(
                    WSAtomTranslator.trans_unicode_list2atom_gram_list(uline_part))
            data_lines.append(atom_list)
        logging.info("%d lines read done ." % (len(data_lines)))
        tf.close()
        return data_lines
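    #! Usage sketch (hypothetical path ; "data/train.seg" is a placeholder) :
    #!     lines = DatasetHandler.read_training_data("data/train.seg")
    #!     first_word = lines[0][0]    # a word , i.e. a list of WSAtom grams
    #! get_file_encoding() samples one line , restores the read position , and
    #! returns the first codec among [f.encoding , "utf8" , "gb18030"] that
    #! decodes the sample without error .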
    @staticmethod
    def read_predict_data(df):
        '''
        An iteration generator for predict data
        Args :
            df : file , or a path str
        Returns :
            atom_list : [ WSAtom , WSAtom , ... ]
            separator_position : list , the positions where separators exist
        '''
        if not isinstance(df, file):
            try:
                df = open(df)  #! same open logic as read_training_data
            except IOError:
                traceback.print_exc()
                exit(1)
        encoding = DatasetHandler.get_file_encoding(df)
        WSAtom.set_encoding(encoding)
        for line in df:
            line = line.strip()
            # if len(line) == 0 :  #! still handle empty lines !
            #     continue
            try:
                uline = line.decode(encoding)
            except UnicodeDecodeError:
                logging.warning("decoding dataset error : %s " % (line))
                # continue
                uline = ""  #! still handle it !
            uline_parts = uline.split()
            atom_list = []
            for uline_part in uline_parts:
                atom_list.append(
                    WSAtomTranslator.trans_unicode_list2atom_gram_list(uline_part))
            yield atom_list
        df.close()
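    #! Usage sketch (hypothetical path and callback) : unlike read_training_data ,
    #! this is a lazy generator -- one line is decoded per iteration step , and
    #! an empty input line still yields an empty atom_list (presumably so the
    #! segmented output can stay line-aligned with the input) :
    #!     for atom_list in DatasetHandler.read_predict_data("data/input.txt") :
    #!         segment_one_line(atom_list)   #! segment_one_line is a placeholder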