# Imports reconstructed for self-containment; the liblinear lines assume the
# classic flat Python interface (liblinear.py / liblinearutil.py), and the
# project-local module paths below are assumptions inferred from usage.
import codecs
import json
import logging
import os
import shutil
import sys
from ctypes import c_double

from liblinear import *                     # liblinear CDLL, feature_node, problem, parameter, gen_feature_nodearray
from liblinearutil import load_model, save_model

import ontology_reader                                        # provides OntologyReader (assumed path)
from GlobalConfig import GetConfig                            # shared config accessor (assumed path)
from Utils import Tuple_Extractor, EvalMultiLabel             # (assumed path)
from feature import feature                                   # lexicon-based feature extractor (assumed path)
from baseline import BaselineTracker, SubSegBaselineTracker   # (assumed path)


class slot_value_classifier(object):
	MY_ID = 'SLOT_VALUE_CLASSIFIER'
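	# One-vs-rest setup: a separate binary LIBLINEAR model is trained per
	# frame-label tuple, and prediction runs every model over the same
	# feature vector (see _build_svm_train_samples and PredictTuple).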
	def __init__(self):
		self.config = GetConfig()
		self.appLogger = logging.getLogger(self.MY_ID)
		self.models = {}
		self.model_keys = []
		self.ontology_file = ''
		self.tagsets = None
		self.feature = None
		self.is_set = False

	def reset(self):
		self.models = {}
		self.model_keys = []
		self.feature = None
		self.is_set = False

	def _prepare_resources(self):
		self.tuple_extractor = Tuple_Extractor()
		if self.tagsets:
			self.SubSeg_baseline = SubSegBaselineTracker(self.tagsets)
			self.baseline = BaselineTracker(self.tagsets)
		else:
			self.appLogger.error('Error: _prepare_resources(): Ontology tagsets not ready!')
			raise Exception('Error: _prepare_resources(): Ontology tagsets not ready!')

	def TrainFromDataSet(self, ontology_file, feature_list, dataset, model_dir, tokenizer_mode, use_stemmer, remove_stopwords):
		if not feature_list:
			self.appLogger.error('Error: feature list cannot be empty!')
			raise Exception('Error: feature list cannot be empty!')
		self._prepare_train(model_dir, ontology_file)
		# stat train samples
		label_samples, train_samples = self._stat_samples_from_dataset(dataset, feature_list)
		self._train_by_samples(model_dir, label_samples, train_samples, feature_list, tokenizer_mode, use_stemmer, remove_stopwords)

	def TrainFromSubSegments(self, ontology_file, feature_list, sub_segments, model_dir, tokenizer_mode, use_stemmer, remove_stopwords):
		if not feature_list:
			self.appLogger.error('Error: feature list cannot be empty!')
			raise Exception('Error: feature list cannot be empty!')
		self._prepare_train(model_dir, ontology_file)
		# stat train samples
		label_samples, train_samples = self._stat_samples_from_sub_segments(sub_segments, feature_list)
		self._train_by_samples(model_dir, label_samples, train_samples, feature_list, tokenizer_mode, use_stemmer, remove_stopwords)


	def TestFromDataSet(self, dataset, model_dir):
		self.LoadModel(model_dir)
		if not self.is_set:
			raise Exception('Cannot load model from: %s' %(model_dir))
		
		label_samples, test_samples = self._stat_samples_from_dataset(dataset, self.feature.feature_list)

		out_label_samples = []
		for sample in test_samples:
			out_label = []
			result, result_prob = self.PredictTuple(sample)
			for k,v in result.items():
				if v == 1:
					out_label.append(k)
			out_label_samples.append(out_label)

		EvalMultiLabel(label_samples, out_label_samples)

	
	def TestFromSubSegments(self, sub_segments, model_dir):
		self.LoadModel(model_dir)
		if not self.is_set:
			raise Exception('Cannot load model from: %s' %(model_dir))
		label_samples, test_samples = self._stat_samples_from_sub_segments(sub_segments, self.feature.feature_list)

		out_label_samples = []
		for sample in test_samples:
			out_label = []
			result, result_prob = self.PredictTuple(sample)
			for k,v in result.items():
				if v == 1:
					out_label.append(k)
			out_label_samples.append(out_label)

		EvalMultiLabel(label_samples, out_label_samples)

	def LoadModel(self, model_dir):
		# load config
		in_file = codecs.open(os.path.join(model_dir,'config.json'), 'r', 'utf-8')
		config_json = json.load(in_file)
		in_file.close()
		self.model_keys = config_json['tuples']
		# load ontology
		self.ontology_file = os.path.join(model_dir,config_json['ontology_file'])
		self.tagsets = ontology_reader.OntologyReader(self.ontology_file).get_tagsets()
		# load feature
		self.feature = feature(self.tagsets)
		self.feature.load_Lexicon(os.path.join(model_dir,config_json['feature_lexicon_file']))
		if not self.feature.is_set:
			raise Exception('Failed to load feature module!')
		# load svm model
		for key in self.model_keys:
			self.models[key] = load_model(os.path.join(model_dir, '%s.svm.m' %(key)))
		self._prepare_resources()
		self.is_set = True
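
	# A saved model_dir (see _save_models) contains config.json, ontology.json,
	# feature_lexicon.json, train_samples.json, train_samples_number.json and
	# one '<tuple>.svm.m' file per model key.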

	def PredictUtter(self, Utter, feature_list):
		sample_tuple = self._extract_utter_tuple(Utter, feature_list)
		#self.appLogger.debug('%s' %(sample_tuple.__str__()))
		return self.PredictTuple(sample_tuple)

	def PredictTuple(self, s_tuple):
		feature_vector = self.feature.ExtractFeatureFromTuple(s_tuple)
		result = {}
		result_prob = {}
		for key in self.model_keys:
			(label, label_prob) = self.svm_predict(self.models[key], feature_vector)
			result[key] = label
			result_prob[key] = label_prob
			# self.appLogger.debug('%s: label: %d, prob_dict:%s' %(key, label, label_prob))
		return result, result_prob
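
	# Illustrative PredictTuple output (keys come from Tuple_Extractor; the
	# exact strings below are hypothetical):
	#   result      -> {'INFO:Preference': 1, 'PLACE:Name': 0, ...}
	#   result_prob -> {'INFO:Preference': {1: 0.83, 0: 0.17}, ...}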

	def svm_predict(self, model, feature_vector):
		if not feature_vector:
			# No active features: fall back to the negative class (label 0)
			# with full confidence.
			prob_dict = {}
			for l in model.get_labels():
				prob_dict[l] = 1.0 if l == 0 else 0.0
			return (0, prob_dict)

		# Mirror liblinearutil.predict() for a single instance: reserve the
		# last real feature node for the bias term when the model uses one.
		nr_feature = model.get_nr_feature()
		bias = model.bias
		if bias >= 0:
			biasterm = feature_node(nr_feature + 1, bias)
		else:
			biasterm = feature_node(-1, bias)

		is_prob_model = model.is_probability_model()
		x, idx = gen_feature_nodearray(feature_vector, feature_max=nr_feature)
		x[-2] = biasterm
		nr_class = model.get_nr_class()
		if not is_prob_model:
			if nr_class <= 2:
				nr_classifier = 1
			else:
				nr_classifier = nr_class
			dec_values = (c_double * nr_classifier)()

			label = liblinear.predict_values(model, x, dec_values)
			values = dec_values[:nr_classifier]
			
			labels = model.get_labels()
			value_dict = {}
			for l,v in zip(labels,values):
				value_dict[l] = v
			return (label, value_dict)
		else:
			prob_estimates = (c_double * nr_class)()
			label = liblinear.predict_probability(model, x, prob_estimates)
			probs = prob_estimates[:nr_class]

			labels = model.get_labels()
			prob_dict = {}
			for l,p in zip(labels,probs):
				prob_dict[l] = p
			return (label, prob_dict)

	def _train_by_samples(self, model_dir, label_samples, train_samples, feature_list, tokenizer_mode, use_stemmer, remove_stopwords):
		self.reset()
		# stat lexicon
		self.feature = feature(self.tagsets, tokenizer_mode, use_stemmer, remove_stopwords)
		self.feature.Stat_Lexicon(train_samples, label_samples, feature_list)
		# extract feature, build training data
		train_labels, train_feature_samples = self._build_svm_train_samples(label_samples, train_samples)
		# begin train
		print >>sys.stderr, 'train svm models...'
		self._train_svm_models(train_labels, train_feature_samples)
		# save model
		print >>sys.stderr, 'save models'
		self._save_models(model_dir, label_samples, train_samples, train_labels, train_feature_samples)
		print >>sys.stderr, 'Done!'
	
	def _extract_utter_tuple(self, utter, feature_list):
		'''
		Extract the feature tuple for a single utterance.
		'''
		train_sample = []
		topic = utter['segment_info']['topic']
		for i, feature in enumerate(feature_list):
			if feature == 'TOPIC':
				train_sample.append([topic])
			elif feature == 'BASELINE':
				self.SubSeg_baseline.reset()
				self.SubSeg_baseline.addTrans(utter['transcript'], topic)
				baseline_out_label = self.SubSeg_baseline.frame
				train_sample.append(self.tuple_extractor.extract_tuple(baseline_out_label))
			elif feature.startswith('NGRAM'):
				train_sample.append([utter['transcript']])
			elif feature == 'VALUE_MATCH':
				train_sample.append((topic,[utter['transcript']]))
			else:
				self.appLogger.error('Unknown feature: %s' %(feature))
				raise Exception('Unknown feature: %s' %(feature))
		return train_sample

	def _extract_sub_seg_tuple(self, sub_seg, feature_list):
		'''
		Extract the feature tuple for a sub-segment.
		'''
		train_sample = []
		topic = sub_seg['topic']
		for i, feature in enumerate(feature_list):
			if feature == 'TOPIC':
				train_sample.append([topic])
			elif feature == 'BASELINE':
				baseline_out_label = self.SubSeg_baseline.addSubSeg(sub_seg)
				train_sample.append(self.tuple_extractor.extract_tuple(baseline_out_label))
			elif feature.startswith('NGRAM'):
				transcripts = []
				for sent in sub_seg['utter_sents']:
					# utter_sents entries look like 'Speaker: transcript';
					# keep only the text after ': '.
					transcript = sent[sent.find(':')+2:]
					transcripts.append(transcript)
				train_sample.append(transcripts)
			elif feature == 'VALUE_MATCH':
				transcripts = []
				for sent in sub_seg['utter_sents']:
					transcript = sent[sent.find(':')+2:]
					transcripts.append(transcript)
				train_sample.append((topic,transcripts))
			else:
				self.appLogger.error('Unknown feature: %s' %(feature))
				raise Exception('Unknown feature: %s' %(feature))
		return train_sample

	def _prepare_train(self, model_dir, ontology_file):
		'''
		Create a fresh model directory and load the ontology tagsets.
		'''
		if os.path.exists(model_dir):
			shutil.rmtree(model_dir, ignore_errors=True)
		os.mkdir(model_dir)

		self.ontology_file = ontology_file
		self.tagsets = ontology_reader.OntologyReader(ontology_file).get_tagsets()
		self._prepare_resources()

	def _stat_samples_from_dataset(self, dataset, feature_list):
		# stat train samples
		label_samples = []
		train_samples = []
		for call in dataset:
			for (log_utter, label_utter) in call:
				if 'frame_label' in label_utter:
					frame_label = label_utter['frame_label']
					label_samples.append(self.tuple_extractor.extract_tuple(frame_label))
					train_samples.append(self._extract_utter_tuple(log_utter, feature_list))
		return (label_samples, train_samples)

	def _stat_samples_from_sub_segments(self, sub_segments, feature_list):
		# stat train samples
		label_samples = []
		train_samples = []
		for session in sub_segments['sessions']:
			for sub_seg in session['sub_segments']:
				frame_label = sub_seg['frame_label']
				label_samples.append(self.tuple_extractor.extract_tuple(frame_label))
				train_samples.append(self._extract_sub_seg_tuple(sub_seg, feature_list))
		return (label_samples, train_samples)

	def _build_svm_train_samples(self, label_samples, train_samples):
		# Two annotation formats are handled: a plain list of positive tuples,
		# or a dict holding "positive" and ambiguous "NpNn" tuple lists.
		for label_sample in label_samples:
			if isinstance(label_sample, dict):
				list_label_sample = label_sample["positive"]
			else:
				list_label_sample = label_sample
			for label in list_label_sample:
				if label not in self.models:
					self.models[label] = None
		self.model_keys = self.models.keys()

		train_feature_samples = []
		for train_sample in train_samples:
			train_feature_samples.append(self.feature.ExtractFeatureFromTuple(train_sample))

		train_labels = {}
		for key in self.model_keys:
			train_labels[key] = [0] * len(train_feature_samples)
		for i, label_sample in enumerate(label_samples):
			if isinstance(label_sample, dict):
				for key in set(label_sample["positive"]):
					train_labels[key][i] = 1
				# NpNn tuples that never occur as positives have no model,
				# so guard the lookup.
				for key in set(label_sample["NpNn"]):
					if key in train_labels:
						train_labels[key][i] = None
			else:
				for key in set(label_sample):
					train_labels[key][i] = 1
		return (train_labels, train_feature_samples)
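
	# Label encoding produced above: 1 = positive tuple, 0 = negative,
	# None = ambiguous "NpNn" annotation (filtered out in _train_svm_models).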

	def _train_svm_models(self, train_labels, train_feature_samples, param_str='-s 0 -c 1'):
		# -s 0: L2-regularised logistic regression (gives probability outputs);
		# -c 1: default cost parameter.
		for model_key in self.model_keys:
			print 'Train tuple: %s' %(model_key)
			labels_list = []
			samples_list = []
			for label, sample in zip(train_labels[model_key], train_feature_samples):
				# Skip ambiguous (NpNn) samples marked None.
				if label is not None:
					labels_list.append(label)
					samples_list.append(sample)

			prob = problem(labels_list, samples_list)
			param = parameter(param_str)
			# liblinear.train returns the raw C model pointer; save_model accepts
			# it, and prediction paths reload the Python wrapper via load_model.
			self.models[model_key] = liblinear.train(prob, param)

	def _save_models(self, model_dir, label_samples, train_samples, train_labels, train_feature_samples):
		out_json = {}
		out_json['tuples'] = self.model_keys
		out_json['train_samples_file'] = 'train_samples.json'
		out_json['feature_lexicon_file'] = 'feature_lexicon.json'
		out_json['ontology_file'] = 'ontology.json'
		output = codecs.open(os.path.join(model_dir, 'config.json'), 'w', 'utf-8')
		json.dump(out_json, output, indent=4)
		output.close()

		# save ontology file
		shutil.copyfile(self.ontology_file, os.path.join(model_dir,out_json['ontology_file']))

		# save train samples
		output = codecs.open(os.path.join(model_dir, out_json['train_samples_file']), 'w', 'utf-8')
		train_json = {}
		train_json['train_samples'] = train_samples
		train_json['label_samples'] = label_samples
		train_json['train_feature_samples'] = train_feature_samples
		train_json['train_labels'] = train_labels
		json.dump(train_json, output, indent=4)
		output.close()

		# save train sample nums
		output = codecs.open(os.path.join(model_dir, 'train_samples_number.json'), 'w', 'utf-8')
		train_number_json={}
		for key, labels in train_labels.items():
			pos_num = 0
			neg_num = 0
			for label in labels:
				if label == 0:
					neg_num += 1
				elif label == 1:
					pos_num += 1
			train_number_json[key] = {0:neg_num, 1:pos_num}
		json.dump(train_number_json, output, indent=4)
		output.close()

		# save feature
		self.feature.save_Lexicon(os.path.join(model_dir, out_json['feature_lexicon_file']))

		# save svm models
		for model_key in self.model_keys:
			save_model(os.path.join(model_dir, '%s.svm.m' %(model_key)), self.models[model_key])
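

# --- Usage sketch (illustrative; not part of the original listing). The file
# names, feature list and tokenizer settings below are assumptions. ---
if __name__ == '__main__':
	with codecs.open('sub_segments.json', 'r', 'utf-8') as f:
		sub_segments = json.load(f)
	svc = slot_value_classifier()
	# feature_list may combine TOPIC / BASELINE / NGRAM* / VALUE_MATCH,
	# mirroring the branches in _extract_sub_seg_tuple.
	svc.TrainFromSubSegments('ontology.json', ['TOPIC', 'NGRAM', 'VALUE_MATCH'],
		sub_segments, 'demo_model', 'word', False, True)
	# Reload the saved models and report multi-label metrics via EvalMultiLabel.
	svc.TestFromSubSegments(sub_segments, 'demo_model')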