def transform(self, text):
    """Tokenize *text* and yield one feature dict per morpheme.

    The text is first normalized with ask.fm-specific normalization
    (h2z=False keeps half-width characters as-is), then parsed with
    ``morph.parse`` using ``self.dicdir`` as the dictionary option
    (presumably a MeCab dictionary directory — TODO confirm).

    Yields:
        dict: ``{'base': ...}`` — the base-form feature for each
        surface/features pair produced by the parser.
    """
    # Removed long-dead commented-out code that dispatched to a dict
    # of feature extractors (pos/pos1/pos2/cform/base) via the
    # Python-2-only dict.iteritems(); only 'base' is produced now.
    text = normalize.normalize_askfm(text, h2z=False)
    for surface, features in morph.parse(text, opt=self.dicdir):
        yield {'base': self.to_base(surface, features)}
def generate_matrix():
    """Build the training matrix from the corpus.

    For every (question, answer) pair from ``load_corpus()``:
    the question is tokenized into per-morpheme feature dicts, and the
    (normalized) answer is labeled with ``isnot_shitsumon``.
    Each question's row in X is the element-wise sum of its
    per-morpheme feature vectors.

    Returns:
        tuple: ``(X, y, dv)`` where X is a ``scipy.sparse`` matrix with
        one row per question, y is a ``numpy`` array of labels, and dv
        is the fitted ``DictVectorizer``.
    """
    docs = []
    y = []
    fex = features.IpadicFeature()
    print('create feature dictionary')
    for progress, (q, a) in enumerate(load_corpus(), 1):
        docs.append(list(fex.transform(q)))
        # h2z=False: keep half-width characters during normalization.
        a = normalize.normalize_askfm(a, h2z=False)
        y.append(isnot_shitsumon(a))
        if progress % 100 == 0:
            print(progress)
    dv = DictVectorizer()
    dv.fit(itertools.chain(*docs))
    print('create feature vector')
    X = []
    for progress, ds in enumerate(docs, 1):
        # Sum the per-morpheme vectors into a single row. Start from an
        # all-zero row so a document with no morphemes contributes a
        # zero row instead of None (which previously would have crashed
        # scipy.sparse.vstack below).
        count = dv.transform({})
        for d in ds:
            count = count + dv.transform(d)
        X.append(count)
        if progress % 100 == 0:
            print(progress)
    X = scipy.sparse.vstack(X)
    y = numpy.array(y)
    return X, y, dv
def main():
    """Interactive active-learning loop for labeling replies.

    Repeatedly: train a classifier on the already-labeled texts, pick
    candidate texts with it, ask the operator to label each candidate
    via stdin ('y'/'n'), record the answers, and persist the labels
    with ``save_supervised_texts`` before the next round.
    """
    texts = load_all_texts()
    supervised = load_supervised_texts()
    while True:
        labeled_texts = supervised.keys()
        labels = supervised.values()
        # Drop already-labeled texts from the candidate pool.
        texts -= set(labeled_texts)
        X, lb = vectorize_with_learn(labeled_texts)
        clf = learn(X, labels)
        for t in candidate(texts, clf, lb):
            print(normalize.normalize_askfm(t, h2z=False))
            # Re-prompt until the operator answers 'y' or 'n'.
            while True:
                answer = raw_input('クソリプ?(y/n) ')
                if answer in ('y', 'n'):
                    break
            supervised[t] = (answer == 'y')
        save_supervised_texts(supervised)