import os
import sys
import json
import codecs
import shutil
import logging
from ctypes import c_double

from liblinear import *
from liblinearutil import load_model, save_model

# Project-local modules. The module paths below are assumptions based on the
# DSTC baseline layout this tracker builds on (ontology_reader, GlobalConfig,
# and baseline are part of that codebase; the others ship alongside this file).
import ontology_reader
from GlobalConfig import GetConfig
from baseline import BaselineTracker, SubSegBaselineTracker
from feature import feature
from tuple_extractor import Tuple_Extractor
from eval import EvalMultiLabel

class slot_value_classifier(object):
    MY_ID = 'SLOT_VALUE_CLASSIFIER'

    def __init__(self):
        self.config = GetConfig()
        self.appLogger = logging.getLogger(self.MY_ID)
        self.models = {}
        self.model_keys = []
        self.ontology_file = ''
        self.tagsets = None
        self.feature = None
        self.is_set = False

    def reset(self):
        self.models = {}
        self.model_keys = []
        self.feature = None
        self.is_set = False

    def _prepare_resources(self):
        self.tuple_extractor = Tuple_Extractor()
        if self.tagsets:
            self.SubSeg_baseline = SubSegBaselineTracker(self.tagsets)
            self.baseline = BaselineTracker(self.tagsets)
        else:
            self.appLogger.error('Error: _prepare_resources(): Ontology tagsets not ready!')
            raise Exception('Error: _prepare_resources(): Ontology tagsets not ready!')

    def TrainFromDataSet(self, ontology_file, feature_list, dataset, model_dir,
                         tokenizer_mode, use_stemmer, remove_stopwords):
        if not feature_list:
            self.appLogger.error('Error: feature list cannot be empty!')
            raise Exception('Error: feature list cannot be empty!')
        self._prepare_train(model_dir, ontology_file)
        # stat train samples
        label_samples, train_samples = self._stat_samples_from_dataset(dataset, feature_list)
        self._train_by_samples(model_dir, label_samples, train_samples, feature_list,
                               tokenizer_mode, use_stemmer, remove_stopwords)

    def TrainFromSubSegments(self, ontology_file, feature_list, sub_segments, model_dir,
                             tokenizer_mode, use_stemmer, remove_stopwords):
        if not feature_list:
            self.appLogger.error('Error: feature list cannot be empty!')
            raise Exception('Error: feature list cannot be empty!')
        self._prepare_train(model_dir, ontology_file)
        # stat train samples
        label_samples, train_samples = self._stat_samples_from_sub_segments(sub_segments, feature_list)
        self._train_by_samples(model_dir, label_samples, train_samples, feature_list,
                               tokenizer_mode, use_stemmer, remove_stopwords)

    def TestFromDataSet(self, dataset, model_dir):
        self.LoadModel(model_dir)
        if not self.is_set:
            raise Exception('Cannot load model from: %s' % model_dir)
        label_samples, test_samples = self._stat_samples_from_dataset(dataset, self.feature.feature_list)
        out_label_samples = []
        for sample in test_samples:
            out_label = []
            result, result_prob = self.PredictTuple(sample)
            for k, v in result.items():
                if v == 1:
                    out_label.append(k)
            out_label_samples.append(out_label)
        EvalMultiLabel(label_samples, out_label_samples)

    def TestFromSubSegments(self, sub_segments, model_dir):
        self.LoadModel(model_dir)
        if not self.is_set:
            raise Exception('Cannot load model from: %s' % model_dir)
        label_samples, test_samples = self._stat_samples_from_sub_segments(sub_segments, self.feature.feature_list)
        out_label_samples = []
        for sample in test_samples:
            out_label = []
            result, result_prob = self.PredictTuple(sample)
            for k, v in result.items():
                if v == 1:
                    out_label.append(k)
            out_label_samples.append(out_label)
        EvalMultiLabel(label_samples, out_label_samples)

    def LoadModel(self, model_dir):
        # load config
        input_file = codecs.open(os.path.join(model_dir, 'config.json'), 'r', 'utf-8')
        config_json = json.load(input_file)
        input_file.close()
        self.model_keys = config_json['tuples']
        # load ontology
        self.ontology_file = os.path.join(model_dir, config_json['ontology_file'])
        self.tagsets = ontology_reader.OntologyReader(self.ontology_file).get_tagsets()
        # load feature
        self.feature = feature(self.tagsets)
        self.feature.load_Lexicon(os.path.join(model_dir, config_json['feature_lexicon_file']))
        if not self.feature.is_set:
            raise Exception('Failed to load feature module!')
        # load svm models, one binary classifier per tuple key
        for key in self.model_keys:
            self.models[key] = load_model(os.path.join(model_dir, '%s.svm.m' % key))
        self._prepare_resources()
        self.is_set = True

    def PredictUtter(self, Utter, feature_list):
        sample_tuple = self._extract_utter_tuple(Utter, feature_list)
        # self.appLogger.debug('%s' % sample_tuple.__str__())
        return self.PredictTuple(sample_tuple)

    def PredictTuple(self, s_tuple):
        feature_vector = self.feature.ExtractFeatureFromTuple(s_tuple)
        result = {}
        result_prob = {}
        for key in self.model_keys:
            (label, label_prob) = self.svm_predict(self.models[key], feature_vector)
            result[key] = label
            result_prob[key] = label_prob
            # self.appLogger.debug('%s: label: %d, prob_dict:%s' % (key, label, label_prob))
        return result, result_prob

    def svm_predict(self, model, feature_vector):
        if not feature_vector:
            # an empty feature vector is scored as the negative class
            prob_dict = {}
            for l in model.get_labels():
                if l == 0:
                    prob_dict[l] = 1.0
                else:
                    prob_dict[l] = 0.0
            return (0, prob_dict)
        nr_feature = model.get_nr_feature()
        bias = model.bias
        if bias >= 0:
            biasterm = feature_node(nr_feature + 1, bias)
        else:
            biasterm = feature_node(-1, bias)
        is_prob_model = model.is_probability_model()
        x, idx = gen_feature_nodearray(feature_vector)
        # the last element of x is the (-1, ?) sentinel; the bias term goes
        # just before it, mirroring liblinearutil.predict
        x[-2] = biasterm
        nr_class = model.get_nr_class()
        if not is_prob_model:
            if nr_class <= 2:
                nr_classifier = 1
            else:
                nr_classifier = nr_class
            dec_values = (c_double * nr_classifier)()
            label = liblinear.predict_values(model, x, dec_values)
            values = dec_values[:nr_classifier]
            labels = model.get_labels()
            value_dict = {}
            for l, v in zip(labels, values):
                value_dict[l] = v
            return (label, value_dict)
        else:
            prob_estimates = (c_double * nr_class)()
            label = liblinear.predict_probability(model, x, prob_estimates)
            probs = prob_estimates[:nr_class]
            labels = model.get_labels()
            prob_dict = {}
            for l, p in zip(labels, probs):
                prob_dict[l] = p
            return (label, prob_dict)

    def _train_by_samples(self, model_dir, label_samples, train_samples, feature_list,
                          tokenizer_mode, use_stemmer, remove_stopwords):
        self.reset()
        # stat lexicon
        self.feature = feature(self.tagsets, tokenizer_mode, use_stemmer, remove_stopwords)
        self.feature.Stat_Lexicon(train_samples, label_samples, feature_list)
        # extract features, build training data
        train_labels, train_feature_samples = self._build_svm_train_samples(label_samples, train_samples)
        # begin training
        print >>sys.stderr, 'train svm models...'
        self._train_svm_models(train_labels, train_feature_samples)
        # save models
        print >>sys.stderr, 'save models'
        self._save_models(model_dir, label_samples, train_samples, train_labels, train_feature_samples)
        print >>sys.stderr, 'Done!'
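
    # Shape sketch for one extracted sample, in feature_list order, as built
    # by the two _extract_*_tuple methods below. The feature names are the
    # ones those methods handle; the topic and transcript values are
    # illustrative assumptions, not fixed by this module:
    #
    #   feature_list = ['TOPIC', 'NGRAM_1', 'VALUE_MATCH']
    #   train_sample = [['FOOD'],                                  # TOPIC
    #                   ['could you recommend some local food'],   # NGRAM_*
    #                   ('FOOD', ['could you recommend some local food'])]  # VALUE_MATCH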
    def _extract_utter_tuple(self, utter, feature_list):
        '''Extract the feature tuple from a single utterance.'''
        train_sample = []
        topic = utter['segment_info']['topic']
        for feature_name in feature_list:
            if feature_name == 'TOPIC':
                train_sample.append([topic])
            elif feature_name == 'BASELINE':
                self.SubSeg_baseline.reset()
                self.SubSeg_baseline.addTrans(utter['transcript'], topic)
                baseline_out_label = self.SubSeg_baseline.frame
                train_sample.append(self.tuple_extractor.extract_tuple(baseline_out_label))
            elif feature_name.startswith('NGRAM'):
                train_sample.append([utter['transcript']])
            elif feature_name == 'VALUE_MATCH':
                train_sample.append((topic, [utter['transcript']]))
            else:
                self.appLogger.error('Unknown feature: %s' % feature_name)
                raise Exception('Unknown feature: %s' % feature_name)
        return train_sample

    def _extract_sub_seg_tuple(self, sub_seg, feature_list):
        '''Extract the feature tuple from a sub-segment.'''
        train_sample = []
        topic = sub_seg['topic']
        for feature_name in feature_list:
            if feature_name == 'TOPIC':
                train_sample.append([topic])
            elif feature_name == 'BASELINE':
                baseline_out_label = self.SubSeg_baseline.addSubSeg(sub_seg)
                train_sample.append(self.tuple_extractor.extract_tuple(baseline_out_label))
            elif feature_name.startswith('NGRAM'):
                transcripts = []
                for sent in sub_seg['utter_sents']:
                    # strip the leading 'speaker: ' prefix from each sentence
                    transcript = sent[sent.find(':') + 2:]
                    transcripts.append(transcript)
                train_sample.append(transcripts)
            elif feature_name == 'VALUE_MATCH':
                transcripts = []
                for sent in sub_seg['utter_sents']:
                    transcript = sent[sent.find(':') + 2:]
                    transcripts.append(transcript)
                train_sample.append((topic, transcripts))
            else:
                self.appLogger.error('Unknown feature: %s' % feature_name)
                raise Exception('Unknown feature: %s' % feature_name)
        return train_sample

    def _prepare_train(self, model_dir, ontology_file):
        '''Prepare the model directory and read the ontology file.'''
        if os.path.exists(model_dir):
            shutil.rmtree(model_dir, True)
        os.mkdir(model_dir)
        self.ontology_file = ontology_file
        self.tagsets = ontology_reader.OntologyReader(ontology_file).get_tagsets()
        self._prepare_resources()

    def _stat_samples_from_dataset(self, dataset, feature_list):
        # stat train samples
        label_samples = []
        train_samples = []
        for call in dataset:
            for (log_utter, label_utter) in call:
                if 'frame_label' in label_utter:
                    frame_label = label_utter['frame_label']
                    label_samples.append(self.tuple_extractor.extract_tuple(frame_label))
                    train_samples.append(self._extract_utter_tuple(log_utter, feature_list))
        return (label_samples, train_samples)

    def _stat_samples_from_sub_segments(self, sub_segments, feature_list):
        # stat train samples
        label_samples = []
        train_samples = []
        for session in sub_segments['sessions']:
            for sub_seg in session['sub_segments']:
                frame_label = sub_seg['frame_label']
                label_samples.append(self.tuple_extractor.extract_tuple(frame_label))
                train_samples.append(self._extract_sub_seg_tuple(sub_seg, feature_list))
        return (label_samples, train_samples)

    def _build_svm_train_samples(self, label_samples, train_samples):
        # collect the set of tuple keys; one binary model is trained per key
        for label_sample in label_samples:
            if isinstance(label_sample, dict):
                list_label_sample = label_sample['positive']
            else:
                list_label_sample = label_sample
            for label in list_label_sample:
                if label not in self.models:
                    self.models[label] = None
        self.model_keys = self.models.keys()
        train_feature_samples = []
        for train_sample in train_samples:
            train_feature_samples.append(self.feature.ExtractFeatureFromTuple(train_sample))
        train_labels = {}
        for key in self.model_keys:
            train_labels[key] = [0] * len(train_feature_samples)
        for i, label_sample in enumerate(label_samples):
            if isinstance(label_sample, dict):
                for key in list(set(label_sample['positive'])):
                    train_labels[key][i] = 1
                for key in list(set(label_sample['NpNn'])):
                    # mark as None so this sample is skipped for this tuple's
                    # model; guard against keys that never occur as positives
                    if key in train_labels:
                        train_labels[key][i] = None
            else:
                for key in list(set(label_sample)):
                    train_labels[key][i] = 1
        return (train_labels, train_feature_samples)
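
    # Shape sketch for label_samples entries, as consumed above. The tuple-key
    # strings are whatever Tuple_Extractor.extract_tuple() emits; the keys
    # below are illustrative assumptions:
    #
    #   plain list: ['FOOD:INFO:Pricerange', 'FOOD:INFO:Cuisine']
    #   dict:       {'positive': ['FOOD:INFO:Pricerange'],
    #                'NpNn':     ['FOOD:INFO:Cuisine']}
    #
    # 'NpNn' tuples (presumably "neither positive nor negative") get label
    # None and are filtered out below, so they neither help nor hurt that
    # tuple's binary model.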
list(set(label_sample["positive"])): train_labels[key][i] = 1 for key in list(set(label_sample["NpNn"])): train_labels[key][i] = None else: for key in list(set(label_sample)): train_labels[key][i] = 1 return (train_labels, train_feature_samples) def _train_svm_models(self, train_labels, train_feature_samples, param_str = '-s 0 -c 1'): for model_key in self.model_keys: print 'Train tuple: %s' %(model_key) labels_list = [] samples_list = [] for label, sample in zip(train_labels[model_key], train_feature_samples): if label != None: labels_list.append(label) samples_list.append(sample) prob = problem(labels_list, samples_list) param = parameter(param_str) self.models[model_key] = liblinear.train(prob, param) def _save_models(self, model_dir, label_samples, train_samples, train_labels, train_feature_samples): out_json = {} out_json['tuples'] = self.model_keys out_json['train_samples_file'] = 'train_samples.json' out_json['feature_lexicon_file'] = 'feature_lexicon.json' out_json['ontology_file'] = 'ontology.json' output = codecs.open(os.path.join(model_dir, 'config.json'), 'w', 'utf-8') json.dump(out_json, output, indent=4) output.close() # save ontology file shutil.copyfile(self.ontology_file, os.path.join(model_dir,out_json['ontology_file'])) # save train samples output = codecs.open(os.path.join(model_dir, out_json['train_samples_file']), 'w', 'utf-8') train_json = {} train_json['train_samples'] = train_samples train_json['label_samples'] = label_samples train_json['train_feature_samples'] = train_feature_samples train_json['train_labels'] = train_labels json.dump(train_json, output, indent=4) output.close() # save train sample nums output = codecs.open(os.path.join(model_dir, 'train_samples_number.json'), 'w', 'utf-8') train_number_json={} for key, labels in train_labels.items(): pos_num = 0 neg_num = 0 for label in labels: if label == 0: neg_num += 1 elif label == 1: pos_num += 1 train_number_json[key] = {0:neg_num, 1:pos_num} json.dump(train_number_json, output, indent=4) output.close() # save feature self.feature.save_Lexicon(os.path.join(model_dir, out_json['feature_lexicon_file'])) # save svm models for model_key in self.model_keys: save_model(os.path.join(model_dir, '%s.svm.m' %(model_key)), self.models[model_key])