def load_Lexicon(self, Lexicon_file):
    """Restore feature lexicons and preprocessing settings from a JSON file.

    Rebuilds the tokenizer / stemmer / n-gram builder exactly as configured
    at save time, then recomputes the feature-group offsets via
    ``_prepare_resources``.

    :param Lexicon_file: path to a JSON file written by ``save_Lexicon``.
    """
    # BUG FIX: use a context manager so the handle is closed even if
    # json.load raises, and stop shadowing the builtin ``input``.
    with codecs.open(Lexicon_file, 'r', 'utf-8') as lex_file:
        in_json = json.load(lex_file)
    self.feature_list = in_json['feature_list']
    self.unigram = in_json['feature_unigram']
    self.bigram = in_json['feature_bigram']
    self.trigram = in_json['feature_trigram']
    # JSON cannot store tuple keys, so lexicons were saved with stringified
    # keys; convert them back.
    self.UNI_LEX = StrKeyDict2TupleKeyDict(in_json['UNI_LEX'])
    self.BI_LEX = StrKeyDict2TupleKeyDict(in_json['BI_LEX'])
    self.TRI_LEX = StrKeyDict2TupleKeyDict(in_json['TRI_LEX'])
    self.UNI_LEX_weight = StrKeyDict2TupleKeyDict(in_json['UNI_LEX_weight'])
    self.BI_LEX_weight = StrKeyDict2TupleKeyDict(in_json['BI_LEX_weight'])
    self.TRI_LEX_weight = StrKeyDict2TupleKeyDict(in_json['TRI_LEX_weight'])
    self.TOPIC_LEX = in_json['TOPIC_LEX']
    self.BASELINE_LEX = in_json['BASELINE_LEX']
    # Value-match features are optional in the saved model.
    if 'ValueMatchFeature' in in_json and in_json['ValueMatchFeature']:
        self.ValueMatchFeature = ValueMatchFeature(self.tagsets)
        self.ValueMatchFeature.Load(in_json['ValueMatchFeature'])
    else:
        self.ValueMatchFeature = None
    # Recreate the preprocessing pipeline with the saved settings so that
    # extraction matches training exactly.
    self.tokenizer_mode = in_json['tokenizer_mode']
    self.tokenizer = tokenizer(self.tokenizer_mode)
    self.use_stemmer = in_json['use_stemmer']
    self.stemmer = stemmer(self.use_stemmer)
    self.remove_stopwords = in_json['remove_stopwords']
    self.remove_punctuation = in_json['remove_punctuation']
    self.replace_num = in_json['replace_num']
    self.ngram_builder = NGRAM_builder(self.remove_stopwords, self.remove_punctuation, self.replace_num)
    self._prepare_resources()
class feature(object):
    """Lexicon-based feature extractor for an SVC classifier.

    A feature vector is the concatenation of several optional groups —
    TOPIC, unigram/bigram/trigram n-grams, BASELINE, and value-match —
    stacked in that fixed order.  Each ``*_offset`` attribute records the
    start index of its group in the combined sparse vector; lexicon values
    are 1-based indices within their group.
    """

    MY_ID = 'svc_feature'

    def __init__(self, tagsets, tokenizer_mode=None, use_stemmer=None, remove_stopwords=None):
        """Build an extractor; unspecified options fall back to the config file.

        :param tagsets: ontology tagsets, forwarded to ValueMatchFeature.
        :param tokenizer_mode: tokenizer mode name, or None to read config.
        :param use_stemmer: whether to stem tokens, or None to read config.
        :param remove_stopwords: whether to drop stopwords, or None to read config.
        """
        self.config = GetConfig()
        self.appLogger = logging.getLogger(self.MY_ID)
        # tokenizer
        if tokenizer_mode:
            self.tokenizer_mode = tokenizer_mode
        else:
            self.tokenizer_mode = self.config.get(self.MY_ID, 'tokenizer_mode')
        self.appLogger.debug('tokenizer mode: %s' % (self.tokenizer_mode))
        self.tokenizer = tokenizer(self.tokenizer_mode)
        # stemmer
        if use_stemmer == None:
            # BUG FIX: the original chained assignment
            # (use_stemmer = self.tokenizer_mode = config.getboolean(...))
            # also clobbered self.tokenizer_mode with a boolean.
            use_stemmer = self.config.getboolean(self.MY_ID, 'use_stemmer')
        self.appLogger.debug('use stemmer ? %s' % (use_stemmer))
        self.use_stemmer = use_stemmer
        self.stemmer = stemmer(use_stemmer)
        # ngram builder
        if remove_stopwords == None:
            self.remove_stopwords = self.config.getboolean(self.MY_ID, 'remove_stopwords')
        else:
            self.remove_stopwords = remove_stopwords
        self.remove_punctuation = self.config.getboolean(self.MY_ID, 'remove_punctuation')
        self.replace_num = self.config.getboolean(self.MY_ID, 'replace_num')
        self.ngram_builder = NGRAM_builder(self.remove_stopwords, self.remove_punctuation, self.replace_num)

        self.tagsets = tagsets
        # Enabled feature groups (set by Stat_Lexicon / load_Lexicon).
        self.feature_list = None
        self.unigram = False
        self.bigram = False
        self.trigram = False
        # Lexicons: feature -> 1-based index within its group.
        self.UNI_LEX = None
        self.BI_LEX = None
        self.TRI_LEX = None
        self.UNI_LEX_weight = None
        self.BI_LEX_weight = None
        self.TRI_LEX_weight = None
        self.TOPIC_LEX = None
        self.BASELINE_LEX = None
        self.ValueMatchFeature = None
        # Start index of each group in the combined feature vector.
        self.TOPIC_LEX_offset = 0
        self.UNI_LEX_offset = 0
        self.BI_LEX_offset = 0
        self.TRI_LEX_offset = 0
        self.BASELINE_LEX_offset = 0
        self.VMF_offset = 0
        self.is_set = False

    def _set_offset(self):
        """Assign the starting index of each feature group.

        Groups are stacked cumulatively in the fixed order TOPIC, unigram,
        bigram, trigram, BASELINE, value-match; a disabled group contributes
        no indices.  (BUG FIX: the original left a later offset at 0 when an
        earlier n-gram order was disabled, and computed VMF_offset from the
        value-match feature size instead of the BASELINE lexicon size, which
        could make index ranges collide.)
        """
        offset = 0
        self.TOPIC_LEX_offset = 0
        if 'TOPIC' in self.feature_list:
            offset += len(self.TOPIC_LEX)
        self.UNI_LEX_offset = offset
        if self.unigram:
            offset += len(self.UNI_LEX)
        self.BI_LEX_offset = offset
        if self.bigram:
            offset += len(self.BI_LEX)
        self.TRI_LEX_offset = offset
        if self.trigram:
            offset += len(self.TRI_LEX)
        self.BASELINE_LEX_offset = offset
        if 'BASELINE' in self.feature_list:
            offset += len(self.BASELINE_LEX)
        self.VMF_offset = offset

    def _preprocessing(self, sent):
        '''
        convert to lower type
        tokenization and stemming
        '''
        sent = sent.lower()
        tokens = self.tokenizer.tokenize(sent)
        new_tokens = self.ngram_builder.PreReplace(tokens)
        new_tokens = [self.stemmer.stem(tk) for tk in new_tokens]
        return new_tokens

    def _prepare_resources(self):
        """Finalize the extractor after lexicons exist: compute offsets."""
        self._set_offset()
        self.is_set = True

    def load_Lexicon(self, Lexicon_file):
        """Restore lexicons and preprocessing settings from a JSON file.

        :param Lexicon_file: path to a JSON file written by ``save_Lexicon``.
        """
        # BUG FIX: context manager closes the handle even if json.load
        # raises, and avoids shadowing the builtin ``input``.
        with codecs.open(Lexicon_file, 'r', 'utf-8') as lex_file:
            in_json = json.load(lex_file)
        self.feature_list = in_json['feature_list']
        self.unigram = in_json['feature_unigram']
        self.bigram = in_json['feature_bigram']
        self.trigram = in_json['feature_trigram']
        # Tuple keys were stringified for JSON; convert them back.
        self.UNI_LEX = StrKeyDict2TupleKeyDict(in_json['UNI_LEX'])
        self.BI_LEX = StrKeyDict2TupleKeyDict(in_json['BI_LEX'])
        self.TRI_LEX = StrKeyDict2TupleKeyDict(in_json['TRI_LEX'])
        self.UNI_LEX_weight = StrKeyDict2TupleKeyDict(in_json['UNI_LEX_weight'])
        self.BI_LEX_weight = StrKeyDict2TupleKeyDict(in_json['BI_LEX_weight'])
        self.TRI_LEX_weight = StrKeyDict2TupleKeyDict(in_json['TRI_LEX_weight'])
        self.TOPIC_LEX = in_json['TOPIC_LEX']
        self.BASELINE_LEX = in_json['BASELINE_LEX']
        # Value-match features are optional in the saved model.
        if 'ValueMatchFeature' in in_json and in_json['ValueMatchFeature']:
            self.ValueMatchFeature = ValueMatchFeature(self.tagsets)
            self.ValueMatchFeature.Load(in_json['ValueMatchFeature'])
        else:
            self.ValueMatchFeature = None
        # Recreate the preprocessing pipeline with the saved settings so
        # extraction matches training exactly.
        self.tokenizer_mode = in_json['tokenizer_mode']
        self.tokenizer = tokenizer(self.tokenizer_mode)
        self.use_stemmer = in_json['use_stemmer']
        self.stemmer = stemmer(self.use_stemmer)
        self.remove_stopwords = in_json['remove_stopwords']
        self.remove_punctuation = in_json['remove_punctuation']
        self.replace_num = in_json['replace_num']
        self.ngram_builder = NGRAM_builder(self.remove_stopwords, self.remove_punctuation, self.replace_num)
        self._prepare_resources()

    def save_Lexicon(self, Lexicon_file):
        """Serialize lexicons, weights and preprocessing settings to JSON.

        :param Lexicon_file: destination path; overwritten if it exists.
        """
        out_json = {}
        out_json['tokenizer_mode'] = self.tokenizer_mode
        out_json['use_stemmer'] = self.use_stemmer
        out_json['remove_stopwords'] = self.remove_stopwords
        out_json['remove_punctuation'] = self.remove_punctuation
        out_json['replace_num'] = self.replace_num
        out_json['feature_list'] = self.feature_list
        out_json['feature_unigram'] = self.unigram
        out_json['feature_bigram'] = self.bigram
        out_json['feature_trigram'] = self.trigram
        # JSON cannot hold tuple keys; stringify them for storage.
        out_json['UNI_LEX'] = TupleKeyDict2StrKeyDict(self.UNI_LEX)
        out_json['BI_LEX'] = TupleKeyDict2StrKeyDict(self.BI_LEX)
        out_json['TRI_LEX'] = TupleKeyDict2StrKeyDict(self.TRI_LEX)
        out_json['UNI_LEX_weight'] = TupleKeyDict2StrKeyDict(self.UNI_LEX_weight)
        out_json['BI_LEX_weight'] = TupleKeyDict2StrKeyDict(self.BI_LEX_weight)
        out_json['TRI_LEX_weight'] = TupleKeyDict2StrKeyDict(self.TRI_LEX_weight)
        out_json['TOPIC_LEX'] = self.TOPIC_LEX
        out_json['BASELINE_LEX'] = self.BASELINE_LEX
        if self.ValueMatchFeature:
            out_json['ValueMatchFeature'] = self.ValueMatchFeature.Save()
        else:
            out_json['ValueMatchFeature'] = None
        # BUG FIX: context manager guarantees the file is closed on error.
        with codecs.open(Lexicon_file, 'w', 'utf-8') as out_file:
            json.dump(out_json, out_file, indent=4)

    def Stat_Lexicon(self, train_samples, label_samples, feature_list=None):
        """Collect lexicons and feature weights from training samples.

        :param train_samples: list of samples; each sample is a list whose
            fields correspond one-to-one with ``feature_list``.
        :param label_samples: labels aligned with ``train_samples``.
        :param feature_list: feature group names; defaults to
            ['TOPIC', 'NGRAM_u:b', 'BASELINE', 'VALUE_MATCH'].
        :raises Exception: on size mismatches or unknown feature names.
        """
        if feature_list is None:
            # BUG FIX: was a mutable default argument.
            feature_list = ['TOPIC', 'NGRAM_u:b', 'BASELINE', 'VALUE_MATCH']
        if len(train_samples) != len(label_samples):
            msg = 'Error: size of train samples and label samples mismatch! %d : %d' % (len(train_samples), len(label_samples))
            self.appLogger.error(msg)
            raise Exception(msg)
        if len(train_samples) == 0:
            self.appLogger.error('Error: No samples!')
            raise Exception('Error: No samples!')
        self.feature_list = feature_list
        sample_field_num = len(train_samples[0])
        if sample_field_num != len(self.feature_list):
            msg = 'Error: size of sample field num and feature list mismatch! %d : %d' % (sample_field_num, len(self.feature_list))
            self.appLogger.error(msg)
            raise Exception(msg)
        # First pass: decide which n-gram orders are enabled; the spec after
        # 'NGRAM_' is colon-separated flags, e.g. 'NGRAM_u:b' = uni + bi.
        for feature in feature_list:
            if feature.startswith('NGRAM'):
                ngram_feature = feature[6:]
                for t in ngram_feature.split(':'):
                    if t == 'u':
                        self.unigram = True
                    elif t == 'b':
                        self.bigram = True
                    elif t == 't':
                        self.trigram = True
                    else:
                        self.appLogger.error('Unknown ngram feature! %s' % (ngram_feature))
                        raise Exception('Unknown ngram feature! %s' % (ngram_feature))
        # Second pass: build one lexicon (and weights) per feature group.
        for i, feature in enumerate(self.feature_list):
            if feature == 'TOPIC':
                topic_samples = [train_sample[i] for train_sample in train_samples]
                self.TOPIC_LEX = self._stat_lexicon(topic_samples, threshold=0)
            elif feature == 'BASELINE':
                baseline_samples = [train_sample[i] for train_sample in train_samples]
                self.BASELINE_LEX = self._stat_lexicon(baseline_samples, threshold=0)
            elif feature.startswith('NGRAM'):
                sent_samples = [train_sample[i] for train_sample in train_samples]
                unigram_lists = []
                bigram_lists = []
                trigram_lists = []
                for sents in sent_samples:
                    for sent in sents:
                        tokens = self._preprocessing(sent)
                        if self.unigram:
                            unigram_lists.append(self.ngram_builder.GenerateNGRAM(tokens, 1))
                        if self.bigram:
                            bigram_lists.append(self.ngram_builder.GenerateNGRAM(tokens, 2))
                        if self.trigram:
                            trigram_lists.append(self.ngram_builder.GenerateNGRAM(tokens, 3))
                if self.unigram:
                    self.UNI_LEX = self._stat_lexicon(unigram_lists, threshold=2)
                    self.UNI_LEX_weight = self._calc_feature_weight(unigram_lists, label_samples, self.UNI_LEX, 'simple')
                if self.bigram:
                    self.BI_LEX = self._stat_lexicon(bigram_lists, threshold=2)
                    self.BI_LEX_weight = self._calc_feature_weight(bigram_lists, label_samples, self.BI_LEX, 'simple')
                if self.trigram:
                    self.TRI_LEX = self._stat_lexicon(trigram_lists, threshold=2)
                    self.TRI_LEX_weight = self._calc_feature_weight(trigram_lists, label_samples, self.TRI_LEX, 'simple')
            elif feature == 'VALUE_MATCH':
                self.ValueMatchFeature = ValueMatchFeature(self.tagsets)
            else:
                self.appLogger.error('Unknown feature! %s' % (feature))
                raise Exception('Unknown feature! %s' % (feature))
        return

    def _calc_feature_weight(self, feature_lists, label_samples, lexcion, method='simple'):
        """Compute a weight per lexicon entry.

        'simple' assigns constant weight 1; 'IDF' assigns log(N / df) where
        df is the number of feature lists containing the entry.
        ``label_samples`` is currently unused by both methods.

        :param lexcion: [sic] the lexicon dict (feature -> index).
        :raises Exception: on an unknown method name.
        """
        lexicon_weight = {}
        if method == 'simple':
            for key in lexcion:
                lexicon_weight[key] = 1
        elif method == 'IDF':
            for key in lexcion:
                lexicon_weight[key] = 0.0
            N = len(feature_lists)
            for feature_list in feature_lists:
                f_list = list(set(feature_list))
                for f in f_list:
                    if f in lexcion:
                        lexicon_weight[f] += 1
            # NOTE(review): a lexicon entry that never occurs in
            # feature_lists would divide by zero here — confirm lexicons are
            # always built from the same lists.
            for f in lexicon_weight:
                lexicon_weight[f] = math.log(N / lexicon_weight[f])
        else:
            self.appLogger.error('Unknown weight calculate method! %s' % (method))
            raise Exception('Unknown weight calculate method! %s' % (method))
        return lexicon_weight

    def _stat_lexicon(self, feature_lists, threshold):
        """Count feature occurrences and keep those above ``threshold``.

        Returns {feature: index} with 1-based indices assigned in insertion
        order.  NOTE(review): the first occurrence initializes the count to
        0, so an entry needs threshold + 2 total occurrences to be kept —
        presumably intentional pruning of singletons; confirm.
        """
        lexicon_count = {}
        for feature in feature_lists:
            for f in feature:
                if f in lexicon_count:
                    lexicon_count[f] += 1
                else:
                    lexicon_count[f] = 0
        lexicon_out = {}
        for f, count in lexicon_count.items():
            if count > threshold:
                lexicon_out[f] = len(lexicon_out) + 1
        return lexicon_out

    def ExtractFeatureFromTuple(self, feature_tuple):
        """Convert one sample tuple into a sparse {index: value} vector.

        :param feature_tuple: fields aligned one-to-one with feature_list.
        :raises Exception: when the field count does not match feature_list.
        """
        if len(feature_tuple) != len(self.feature_list):
            # BUG FIX: the original message was missing the '%' operator and
            # referenced the undefined name 'feature_list' (TypeError).
            msg = 'size of feature_tuple and the feature_list mismatch! %d : %d' % (len(feature_tuple), len(self.feature_list))
            self.appLogger.error(msg)
            raise Exception(msg)
        feature_vector = {}
        for i, feature in enumerate(self.feature_list):
            if feature == 'TOPIC':
                for f in feature_tuple[i]:
                    if f in self.TOPIC_LEX:
                        idx = self.TOPIC_LEX_offset + self.TOPIC_LEX[f]
                        feature_vector[idx] = feature_vector.get(idx, 0) + 1
            elif feature == 'BASELINE':
                for f in feature_tuple[i]:
                    if f in self.BASELINE_LEX:
                        idx = self.BASELINE_LEX_offset + self.BASELINE_LEX[f]
                        feature_vector[idx] = feature_vector.get(idx, 0) + 1
            elif feature.startswith('NGRAM'):
                sents = feature_tuple[i]
                for sent in sents:
                    tokens = self._preprocessing(sent)
                    if self.unigram:
                        self._add_ngram_features(feature_vector, tokens, 1, self.UNI_LEX, self.UNI_LEX_weight, self.UNI_LEX_offset)
                    if self.bigram:
                        self._add_ngram_features(feature_vector, tokens, 2, self.BI_LEX, self.BI_LEX_weight, self.BI_LEX_offset)
                    if self.trigram:
                        # BUG FIX: the original trigram branch referenced the
                        # undefined names 'j' and 'key' (NameError); it now
                        # mirrors the unigram/bigram handling.
                        self._add_ngram_features(feature_vector, tokens, 3, self.TRI_LEX, self.TRI_LEX_weight, self.TRI_LEX_offset)
            elif feature == 'VALUE_MATCH':
                temp_feature_vec = {}
                topic = feature_tuple[i][0]
                sents = feature_tuple[i][1]
                for sent in sents:
                    f = self.ValueMatchFeature.extract_trans_feature(sent, topic)
                    temp_feature_vec = self.ValueMatchFeature.Merge2Features(temp_feature_vec, f)
                for idx, value in temp_feature_vec.items():
                    feature_vector[idx + self.VMF_offset] = value
        return feature_vector

    def _add_ngram_features(self, feature_vector, tokens, n, lex, lex_weight, offset):
        """Accumulate weighted counts for the n-grams of ``tokens`` in ``lex``."""
        for tk in self.ngram_builder.GenerateNGRAM(tokens, n):
            if tk in lex:
                idx = offset + lex[tk]
                feature_vector[idx] = feature_vector.get(idx, 0) + lex_weight[tk]
def Stat_Lexicon(self, train_samples, label_samples, feature_list = ['TOPIC', 'NGRAM_u:b', 'BASELINE', 'VALUE_MATCH']):
    """Build every lexicon (and feature weights) from training data.

    ``train_samples`` is a list of samples; each sample is a list whose
    fields line up one-to-one with ``feature_list``.  Raises on any size
    mismatch or unrecognized feature/n-gram flag.
    """
    sample_count = len(train_samples)
    if sample_count != len(label_samples):
        mismatch_msg = 'Error: size of train samples and label samples mismatch! %d : %d' % (sample_count, len(label_samples))
        self.appLogger.error(mismatch_msg)
        raise Exception(mismatch_msg)
    if sample_count == 0:
        self.appLogger.error('Error: No samples!')
        raise Exception('Error: No samples!')
    self.feature_list = feature_list
    field_count = len(train_samples[0])
    if field_count != len(self.feature_list):
        field_msg = 'Error: size of sample field num and feature list mismatch! %d : %d' % (field_count, len(self.feature_list))
        self.appLogger.error(field_msg)
        raise Exception(field_msg)
    # Pass 1: parse the n-gram spec ('NGRAM_' + colon-separated flags)
    # and switch on the requested orders.
    flag_to_attr = {'u': 'unigram', 'b': 'bigram', 't': 'trigram'}
    for feat in feature_list:
        if not feat.startswith('NGRAM'):
            continue
        ngram_feature = feat[6:]
        for flag in ngram_feature.split(':'):
            if flag not in flag_to_attr:
                self.appLogger.error('Unknown ngram feature! %s' % (ngram_feature))
                raise Exception('Unknown ngram feature! %s' % (ngram_feature))
            setattr(self, flag_to_attr[flag], True)
    # Pass 2: one lexicon per feature group.
    for col, feat in enumerate(self.feature_list):
        if feat == 'TOPIC':
            topic_column = [row[col] for row in train_samples]
            self.TOPIC_LEX = self._stat_lexicon(topic_column, threshold = 0)
        elif feat == 'BASELINE':
            baseline_column = [row[col] for row in train_samples]
            self.BASELINE_LEX = self._stat_lexicon(baseline_column, threshold = 0)
        elif feat.startswith('NGRAM'):
            sentence_column = [row[col] for row in train_samples]
            gram_lists = {1: [], 2: [], 3: []}
            active_orders = [order for order, enabled in ((1, self.unigram), (2, self.bigram), (3, self.trigram)) if enabled]
            for sents in sentence_column:
                for sent in sents:
                    tokens = self._preprocessing(sent)
                    for order in active_orders:
                        gram_lists[order].append(self.ngram_builder.GenerateNGRAM(tokens, order))
            if self.unigram:
                self.UNI_LEX = self._stat_lexicon(gram_lists[1], threshold=2)
                self.UNI_LEX_weight = self._calc_feature_weight(gram_lists[1], label_samples, self.UNI_LEX, 'simple')
            if self.bigram:
                self.BI_LEX = self._stat_lexicon(gram_lists[2], threshold=2)
                self.BI_LEX_weight = self._calc_feature_weight(gram_lists[2], label_samples, self.BI_LEX, 'simple')
            if self.trigram:
                self.TRI_LEX = self._stat_lexicon(gram_lists[3], threshold=2)
                self.TRI_LEX_weight = self._calc_feature_weight(gram_lists[3], label_samples, self.TRI_LEX, 'simple')
        elif feat == 'VALUE_MATCH':
            self.ValueMatchFeature = ValueMatchFeature(self.tagsets)
        else:
            self.appLogger.error('Unknown feature! %s' % (feat))
            raise Exception('Unknown feature! %s' % (feat))
    return