def convert(self, id, model):
    """Segment one id with *model*, keep segments matching ``self.match``,
    truncate to ``self.max_num_segs``, and map segments to vocab indices.

    Returns:
        (mapped_id, matched_segments): ``mapped_id`` is a list of vocab
        indices padded with the '#' index up to ``self.max_num_segs``;
        unknown segments map to the '*' index.
    """
    segmented = segmenter.morph_segments(model, id)
    # Keep only segments matching the configured pattern.
    # (re.match result is truthy/falsy on its own; bool() was redundant.)
    matched_segments = [seg for seg in segmented if re.match(self.match, seg)]
    if len(matched_segments) > self.max_num_segs:
        matched_segments = matched_segments[:self.max_num_segs]
        print("truncate %s" % id)
    # Map to vocab indices: unknown -> '*', then pad with '#'.
    mapped_id = [self.voca.get(seg, self.voca.get('*')) for seg in matched_segments]
    mapped_id.extend([self.voca.get('#')] * (self.max_num_segs - len(matched_segments)))
    return mapped_id, matched_segments
def analyze(users, model, target='gender'):
    """Segment every user's id, then print per-morph statistics ordered by
    mutual information, highest first.

    Args:
        users: iterable of user objects with ``.id``; ``.segmented`` is set.
        model: segmentation model passed to ``segmenter.morph_segments``.
        target: attribute analyzed by ``Analyzer`` (default 'gender').
    """
    for user in users:
        user.segmented = segmenter.morph_segments(model, user.id)
        print(user)
    analyzer = Analyzer(users, target)
    # sorted() works on both a list (py2 .values()) and a dict view (py3),
    # unlike calling .sort() on the return of .values().
    for morph_prob in sorted(analyzer.morphs.values(),
                             key=lambda m: m.mi, reverse=True):
        print(morph_prob)
def get_word_probs(users, model, voca, target, labels):
    """Build a (len(labels), len(voca)) matrix of per-label word probabilities.

    Entries at or below the smallest observed morph probability are floored
    to that value.

    Returns:
        (smoothed_probs, smoothed_to): the smoothed matrix and the floor.
    """
    for user in users:
        # BUG FIX: the original passed the module-level `morfessor` here,
        # silently ignoring the `model` parameter (which was never used).
        user.segmented = segmenter.morph_segments(model, user.id, match='[a-z]+')
    analyzer = Analyzer(users, target)
    probs = np.zeros((len(labels), len(voca)))
    # .items() iterates identically on py2 and py3 (iteritems() is py2-only).
    for word, morph_stat in analyzer.morphs.items():
        idx = voca.get(word)
        # NOTE(review): p_given_att.get(label) yields None for labels absent
        # from the stats, which numpy would reject — confirm Analyzer
        # guarantees every label is present.
        probs[:, idx] = [morph_stat.p_given_att.get(label) for label in labels]
    # Floor everything at the smallest observed morph probability.
    smoothed_to = min(morph_stat.p for morph_stat in analyzer.morphs.values())
    smoothed_probs = np.where(probs <= smoothed_to, smoothed_to, probs)
    return smoothed_probs, smoothed_to
def preprocess(self, ids, model, is_training):
    """Segment each id, build the vocabulary (training only), and map the
    segments to fixed-length index rows.

    In training mode: word frequencies are accumulated, ``self.max_num_segs``
    grows to fit the longest example, the vocabulary gains '*' (unknown) and
    '#' (padding) entries, and one synthetic padding row is appended.
    In inference mode: over-long segmentations are truncated.

    Returns:
        (data, segmented_ids): ``data`` is an int32 array of shape
        (rows, self.max_num_segs); ``segmented_ids`` the segment lists.
    """
    segmented_ids = []
    word_freq = self.word_freq
    for id in ids:
        segmented = segmenter.morph_segments(model, id)
        matched_segments = [seg for seg in segmented if re.match(self.match, seg)]
        # NOTE(review): the original guarded appends on
        # `len(matched_segments) >= 0`, which is always true — every id is
        # kept, including empty segmentations. Preserved here.
        if is_training:
            for seg in matched_segments:
                word_freq[seg] += 1
        if len(matched_segments) > self.max_num_segs:
            if is_training:
                # Grow the row width to fit the longest training example.
                self.max_num_segs = len(matched_segments)
            else:
                # BUG FIX: the original appended the untruncated list first
                # and then rebound the local, so truncation was discarded
                # and the later padding extended by a negative count.
                matched_segments = matched_segments[:self.max_num_segs]
                print("truncate %s" % id)
        segmented_ids.append(matched_segments)
    if is_training:
        # Reserve two extra indices: '*' = unknown, '#' = padding.
        voca = dict(zip(word_freq.keys(), range(len(word_freq))))
        voca['*'] = len(word_freq)
        voca['#'] = len(word_freq) + 1
        self.voca = voca
        # Synthetic row so both '*' and '#' occur in the training data.
        segmented_ids.append(['*'] * (self.max_num_segs - 1) + ['#'])
        self.n_training = len(segmented_ids)
    mapped_ids = []
    for segmented_id in segmented_ids:
        mapped_id = [self.voca.get(seg, self.voca.get('*')) for seg in segmented_id]
        mapped_id.extend([self.voca.get('#')] * (self.max_num_segs - len(segmented_id)))
        mapped_ids.append(mapped_id)
    data = np.array(mapped_ids, dtype='int32')
    print(data, data.shape)
    return data, segmented_ids