Exemple #1
0
	def convert(self, id, model):
		"""Map one ID string to a fixed-length list of vocabulary indices.

		The ID is segmented with ``segmenter.morph_segments``; only segments
		matching ``self.match`` are kept, and at most ``self.max_num_segs`` of
		them (a "truncate" notice is printed when segments are dropped).
		Segments missing from ``self.voca`` map to the index stored under
		``'*'``; the list is then padded at the end with the index stored
		under ``'#'`` up to ``self.max_num_segs`` entries.

		Returns:
			A ``(mapped_id, kept_segments)`` tuple: the padded index list and
			the segments that were kept.
		"""
		kept = []
		for piece in segmenter.morph_segments(model, id):
			if re.match(self.match, piece) is not None:
				kept.append(piece)

		limit = self.max_num_segs
		if len(kept) > limit:
			kept = kept[:limit]
			print("truncate %s" % id)

		unknown_idx = self.voca.get('*')
		pad_idx = self.voca.get('#')
		mapped_id = [self.voca.get(piece, unknown_idx) for piece in kept]
		mapped_id += [pad_idx] * (limit - len(kept))
		return mapped_id, kept
Exemple #2
0
def analyze(users, model, target='gender'):
	"""Segment every user's id, then print morph statistics ordered by ``mi``.

	Each user receives a ``segmented`` attribute holding the result of
	``segmenter.morph_segments(model, user.id)`` and is printed. An
	``Analyzer`` is then built from ``users`` and ``target``, and the values
	of its ``morphs`` mapping are printed in descending order of their ``mi``
	attribute.
	"""
	for person in users:
		person.segmented = segmenter.morph_segments(model, person.id)
		print(person)

	stats = Analyzer(users, target)

	ranked = sorted(stats.morphs.values(), key=lambda entry: entry.mi, reverse=True)
	for entry in ranked:
		print(entry)
Exemple #3
0
def get_word_probs(users, model, voca, target, labels):
    """Build a smoothed (label x vocabulary) probability matrix.

    Every user's id is segmented with ``segmenter.morph_segments`` (called
    with ``match='[a-z]+'``) and the result is stored on ``user.segmented``.
    An ``Analyzer`` built from ``users`` and ``target`` then supplies the
    per-morph statistics read here (``p`` and ``p_given_att``).

    Args:
        users: objects exposing ``id``; each gains a ``segmented`` attribute.
        model: segmentation model forwarded to ``segmenter.morph_segments``.
        voca: mapping from morph to its column index in the result.
        target: value handed to ``Analyzer`` alongside ``users``.
        labels: label values; row ``i`` of the result belongs to ``labels[i]``.

    Returns:
        ``(smoothed_probs, smoothed_to)``: an array of shape
        ``(len(labels), len(voca))`` in which every entry less than or equal
        to ``smoothed_to`` has been replaced by ``smoothed_to``, and
        ``smoothed_to`` itself, the smallest ``p`` over all analyzed morphs.

    Raises:
        ValueError: if the analyzer holds no morphs (``min`` of nothing).
    """
    for user in users:
        # Use the caller-supplied model; the parameter was previously ignored
        # in favour of a global name.
        user.segmented = segmenter.morph_segments(model, user.id, match='[a-z]+')
    analyzer = Analyzer(users, target)

    probs = np.zeros((len(labels), len(voca)))
    for word, morph_stat in analyzer.morphs.items():
        idx = voca.get(word)
        if idx is None:
            # An index of None is NumPy's "new axis": the assignment would
            # broadcast over the whole matrix (or fail) instead of filling
            # one column, so out-of-vocabulary morphs are skipped.
            continue
        # Labels without an entry count as 0.0 and are lifted to the
        # smoothing floor below, like any other unseen combination.
        probs[:, idx] = [morph_stat.p_given_att.get(label, 0.0) for label in labels]

    smoothed_to = min(morph_stat.p for morph_stat in analyzer.morphs.values())
    smoothed_probs = np.where(probs <= smoothed_to, smoothed_to, probs)
    return smoothed_probs, smoothed_to
Exemple #4
0
	def preprocess(self, ids, model, is_training):
		"""Segment IDs and encode them as a padded int32 matrix of vocabulary indices.

		Each ID is segmented with ``segmenter.morph_segments`` and only the
		segments matching ``self.match`` are kept. IDs with no matching
		segment are kept as well and become all-padding rows.

		When ``is_training`` is true the call also updates instance state:
		``self.word_freq`` is incremented per kept segment,
		``self.max_num_segs`` grows to the longest ID seen, ``self.voca`` is
		rebuilt from ``self.word_freq`` with ``'*'`` and ``'#'`` assigned the
		two highest indices, one synthetic row ``['*', ..., '*', '#']`` is
		appended to the segment lists, and ``self.n_training`` is set to the
		row count including that synthetic row.

		When ``is_training`` is false, IDs with more than
		``self.max_num_segs`` segments are truncated to that length and a
		"truncate" notice is printed.

		Args:
			ids: iterable of ID strings.
			model: segmentation model forwarded to ``segmenter.morph_segments``.
			is_training: whether to update vocabulary/length state (see above).

		Returns:
			``(data, segmented_ids)``: an int32 array with one row of
			``self.max_num_segs`` indices per entry of ``segmented_ids``
			(unknown segments use the ``'*'`` index, padding uses the ``'#'``
			index), and the list of kept segment lists.
		"""
		segmented_ids = list()

		word_freq = self.word_freq
		for raw_id in ids:
			segmented = segmenter.morph_segments(model, raw_id)
			matched_segments = [seg for seg in segmented if re.match(self.match, seg)]

			if len(matched_segments) > self.max_num_segs:
				if is_training:
					self.max_num_segs = len(matched_segments)
				else:
					# Truncate BEFORE storing the list: slicing rebinds the
					# local name, so truncating after the append left the
					# stored row too long and the final array ragged.
					matched_segments = matched_segments[:self.max_num_segs]
					print("truncate %s" % raw_id)

			segmented_ids.append(matched_segments)
			if is_training:
				for seg in matched_segments:
					word_freq[seg] += 1

		if is_training:
			voca = dict(zip(word_freq.keys(), range(len(word_freq))))
			voca['*'] = len(word_freq)
			voca['#'] = len(word_freq) + 1
			self.voca = voca
			# Synthetic row made only of the two special tokens; presumably
			# so both special indices occur in the training matrix -- TODO
			# confirm with the consumer of `data`.
			segmented_ids.append(['*'] * (self.max_num_segs - 1) + ['#'])
			self.n_training = len(segmented_ids)

		unknown_idx = self.voca.get('*')
		pad_idx = self.voca.get('#')
		mapped_ids = list()
		for segmented_id in segmented_ids:
			mapped_id = [self.voca.get(seg, unknown_idx) for seg in segmented_id]
			mapped_id.extend([pad_idx] * (self.max_num_segs - len(segmented_id)))
			mapped_ids.append(mapped_id)

		data = np.array(mapped_ids, dtype='int32')
		# Same text as the two-argument print statement, in a form that is
		# valid with or without print_function.
		print("%s %s" % (data, data.shape))
		return data, segmented_ids