Ejemplo n.º 1
0
 def transform(self, text):
     """Yield one feature dict per morpheme parsed from ``text``.

     The text is first passed through ``normalize.normalize_askfm`` with
     ``h2z=False`` and then parsed by ``morph.parse`` using ``self.dicdir``
     as the ``opt`` argument.  Every ``(surface, features)`` pair returned
     by the parser produces a dict with the single key ``'base'`` whose
     value is ``self.to_base(surface, features)``.

     This is a generator: nothing is normalized or parsed until the first
     item is requested.
     """
     # NOTE: only the 'base' feature is emitted; the other extractors
     # (pos, pos1, pos2, cform) were previously wired in here but disabled.
     cleaned = normalize.normalize_askfm(text, h2z=False)
     morphemes = morph.parse(cleaned, opt=self.dicdir)
     for surface, features in morphemes:
         base_form = self.to_base(surface, features)
         yield {'base': base_form}
Ejemplo n.º 2
0
def generate_matrix():
    """Build the feature matrix, label vector and vectorizer from the corpus.

    Every ``(q, a)`` pair returned by ``load_corpus()`` contributes one row:
    ``q`` is turned into a list of per-token feature dicts by
    ``features.IpadicFeature().transform`` and the row is the sum of the
    vectorized token dicts.  The label for the row is ``isnot_shitsumon``
    applied to ``a`` after ``normalize.normalize_askfm(a, h2z=False)``.

    Returns:
        tuple: ``(X, y, dv)`` where ``X`` is the sparse matrix obtained by
        vertically stacking the rows, ``y`` is a numpy array holding the
        labels in the same order, and ``dv`` is the fitted
        ``DictVectorizer``.
    """
    D = []
    y = []
    fex = features.IpadicFeature()
    print('create feature dictionary')
    for progress, (q, a) in enumerate(load_corpus(), 1):
        D.append(list(fex.transform(q)))
        a = normalize.normalize_askfm(a, h2z=False)
        y.append(isnot_shitsumon(a))
        if progress % 100 == 0:
            print(progress)

    dv = DictVectorizer()
    # chain.from_iterable avoids unpacking the whole corpus into call arguments.
    dv.fit(itertools.chain.from_iterable(D))
    n_features = len(dv.vocabulary_)

    print('create feature vector')
    X = []
    for progress, ds in enumerate(D, 1):
        # Start from an all-zero row so that a question which produced no
        # tokens still yields a valid (1, n_features) row instead of None,
        # which would make scipy.sparse.vstack fail below.
        count = scipy.sparse.csr_matrix((1, n_features), dtype=dv.dtype)
        for d in ds:
            count = count + dv.transform(d)
        X.append(count)
        if progress % 100 == 0:
            print(progress)
    X = scipy.sparse.vstack(X)
    y = numpy.array(y)
    return X, y, dv
Ejemplo n.º 3
0
def main():
    """Run the interactive labelling loop.

    Loads all texts and the already-labelled texts, then repeats forever:
    remove the labelled texts from the pool, vectorize the labelled texts
    and train a classifier on them, ask ``candidate`` for texts to label,
    prompt for a y/n answer on each one, and save the updated labels after
    the whole batch has been answered.

    The loop has no exit condition of its own; it runs until the process
    is interrupted.
    """
    texts = load_all_texts()
    supervised = load_supervised_texts()
    while True:
        supervised_texts = supervised.keys()
        y = supervised.values()
        # Never offer a text that already carries a label.
        texts -= set(supervised_texts)
        X, lb = vectorize_with_learn(supervised_texts)
        clf = learn(X, y)
        for t in candidate(texts, clf, lb):
            print(normalize.normalize_askfm(t, h2z=False))
            # Keep prompting until the answer is exactly 'y' or 'n'.
            answer = None
            while answer not in ('y', 'n'):
                answer = raw_input('クソリプ?(y/n) ')
            supervised[t] = (answer == 'y')
        save_supervised_texts(supervised)