import sqlite3

from yatk import ir
from yatk.ml.svm import SVM as Classifier
# Alternative classifier backend:
# from yatk.ml.nb import NaiveBayes as Classifier

# Training script: read labelled documents from test.db, build a sentiment
# index over them, train a classifier on the weighted features and persist
# both the classifier and the index.

# Fetch every (class, text) pair from the docs table.  sqlite3.Row gives
# access to the columns by name.
con = sqlite3.connect('test.db')
try:
    con.row_factory = sqlite3.Row
    cur = con.cursor()
    cur.execute('select class, text from docs')
    docs = [(row['class'], row['text']) for row in cur.fetchall()]
finally:
    # Release the database handle even when the query fails.
    con.close()

# NOTE(review): 'bogram' may be a typo for 'bigram' -- confirm against the
# feature names ir.SentimentIndex accepts before changing it.
index = ir.SentimentIndex('delta', 'bogram')
# Each document is a (class, text) tuple, so tell the index how to unpack it.
index.get_class = lambda x: x[0]
index.get_text = lambda x: x[1]
index.build(docs)

# One weighted feature set and one label per document, in matching order.
x = [index.weight(index.features(doc)) for doc in docs]
y = [doc[0] for doc in docs]

cl = Classifier()
cl.train(x, y)

cl.save('test.svm')
index.save('test.index')
class Guesser:
    """Word-level classifier driven by affix and neighbouring-word features.

    A sentence is an indexable sequence of tokens; the first element of each
    token is the word string.  Subclasses customise behaviour through two
    hooks: ``is_candidate`` selects which tokens are used and ``make_class``
    produces the label of a token.
    """

    def __init__(self):
        self._cl = Classifier()

    def is_candidate(self, word):
        """Return True if the token should be used; the base accepts all."""
        return True

    def make_class(self, word):
        """Return the label of a token; the base implementation gives None."""
        return None

    def traverse(self, sentences):
        """Collect features and labels for every candidate token.

        Returns a ``(x, y)`` tuple of parallel lists: ``x`` holds the feature
        dicts from ``gen_features`` and ``y`` the labels from ``make_class``.
        """
        x = []
        y = []
        for sentence in sentences:
            for w, word in enumerate(sentence):
                if not self.is_candidate(word):
                    continue
                x.append(self.gen_features(sentence, w))
                y.append(self.make_class(word))
        return (x, y)

    def train(self, sentences):
        """Train the underlying classifier on the candidates in sentences."""
        (train_x, train_y) = self.traverse(sentences)
        self._cl.train(train_x, train_y)

    def predict(self, sentences):
        """Return ``(predicted, expected)`` for the candidates in sentences."""
        (test_x, test_y) = self.traverse(sentences)
        return (self._cl.predict(test_x), test_y)

    def test(self, sentences):
        """Return the classifier's evaluation of expected vs. predicted."""
        (estim_y, test_y) = self.predict(sentences)
        return self._cl.evaluate(test_y, estim_y)

    def guess(self, word):
        """Predict the class of a single word given without any context."""
        # Wrap the word as a one-token sentence so gen_features can index it.
        return self._cl.predict([self.gen_features([(word, )], 0)])[0]

    def gen_features(self, sentence, w):
        """Build the binary feature dict for the token at index ``w``.

        Features are the 3-6 character prefixes, the 2-5 character suffixes
        and the full form of the word itself, plus the 2-4 character suffixes
        and full form of up to three preceding words and of the next word.
        Every feature that is present maps to 1.
        """
        word = sentence[w][0]
        x = {}
        for n in range(3, 7):
            x['p' + str(n) + ':' + word[0:n]] = 1
        for n in range(2, 6):
            x['s' + str(n) + ':' + word[-n:]] = 1
        x['w:' + word] = 1

        # Up to three preceding words, keyed by their distance from w.
        for i in range(1, 4):
            if w > i - 1:
                prev = sentence[w - i][0]
                for n in range(2, 5):
                    x[str(i) + 's' + str(n) + ':' + prev[-n:]] = 1
                x[str(i) + 'w:' + prev] = 1

        # The following word.
        # NOTE(review): this bound never looks at the final token of the
        # sentence (w + i == len(sentence) - 1 is rejected).  It may be an
        # off-by-one; it is kept because changing it alters the features
        # that existing saved models were trained on.
        for i in range(1, 2):
            if w + i < len(sentence) - 1:
                following = sentence[w + i][0]
                for n in range(2, 5):
                    x[str(i) + '+s' + str(n) + ':' + following[-n:]] = 1
                x[str(i) + '+w:' + following] = 1
        return x

    def save(self, path):
        """Persist the underlying classifier to ``path``."""
        self._cl.save(path)

    @classmethod
    def load(cls, path):
        """Create an instance whose classifier is restored from ``path``.

        Implemented as a classmethod so that calling ``load`` on a subclass
        returns an instance of that subclass, keeping its ``is_candidate``
        and ``make_class`` overrides.
        """
        obj = cls()
        obj._cl = Classifier.load(path)
        return obj
class Guesser:
    """Word-level classifier driven by affix and neighbouring-word features.

    A sentence is an indexable sequence of tokens; the first element of each
    token is the word string.  Subclasses customise behaviour through two
    hooks: ``is_candidate`` selects which tokens are used and ``make_class``
    produces the label of a token.
    """

    def __init__(self):
        self._cl = Classifier()

    def is_candidate(self, word):
        """Return True if the token should be used; the base accepts all."""
        return True

    def make_class(self, word):
        """Return the label of a token; the base implementation gives None."""
        return None

    def traverse(self, sentences):
        """Collect features and labels for every candidate token.

        Returns a ``(x, y)`` tuple of parallel lists: ``x`` holds the feature
        dicts from ``gen_features`` and ``y`` the labels from ``make_class``.
        """
        x = []
        y = []
        for sentence in sentences:
            for w, word in enumerate(sentence):
                if not self.is_candidate(word):
                    continue
                x.append(self.gen_features(sentence, w))
                y.append(self.make_class(word))
        return (x, y)

    def train(self, sentences):
        """Train the underlying classifier on the candidates in sentences."""
        (train_x, train_y) = self.traverse(sentences)
        self._cl.train(train_x, train_y)

    def predict(self, sentences):
        """Return ``(predicted, expected)`` for the candidates in sentences."""
        (test_x, test_y) = self.traverse(sentences)
        return (self._cl.predict(test_x), test_y)

    def test(self, sentences):
        """Return the classifier's evaluation of expected vs. predicted."""
        (estim_y, test_y) = self.predict(sentences)
        return self._cl.evaluate(test_y, estim_y)

    def guess(self, word):
        """Predict the class of a single word given without any context."""
        # Wrap the word as a one-token sentence so gen_features can index it.
        return self._cl.predict([self.gen_features([(word,)], 0)])[0]

    def gen_features(self, sentence, w):
        """Build the binary feature dict for the token at index ``w``.

        Features are the 3-6 character prefixes, the 2-5 character suffixes
        and the full form of the word itself, plus the 2-4 character suffixes
        and full form of up to three preceding words and of the next word.
        Every feature that is present maps to 1.
        """
        word = sentence[w][0]
        x = {}
        for n in range(3, 7):
            x['p' + str(n) + ':' + word[0:n]] = 1
        for n in range(2, 6):
            x['s' + str(n) + ':' + word[-n:]] = 1
        x['w:' + word] = 1

        # Up to three preceding words, keyed by their distance from w.
        for i in range(1, 4):
            if w > i - 1:
                prev = sentence[w - i][0]
                for n in range(2, 5):
                    x[str(i) + 's' + str(n) + ':' + prev[-n:]] = 1
                x[str(i) + 'w:' + prev] = 1

        # The following word.
        # NOTE(review): this bound never looks at the final token of the
        # sentence (w + i == len(sentence) - 1 is rejected).  It may be an
        # off-by-one; it is kept because changing it alters the features
        # that existing saved models were trained on.
        for i in range(1, 2):
            if w + i < len(sentence) - 1:
                following = sentence[w + i][0]
                for n in range(2, 5):
                    x[str(i) + '+s' + str(n) + ':' + following[-n:]] = 1
                x[str(i) + '+w:' + following] = 1
        return x

    def save(self, path):
        """Persist the underlying classifier to ``path``."""
        self._cl.save(path)

    @classmethod
    def load(cls, path):
        """Create an instance whose classifier is restored from ``path``.

        Implemented as a classmethod so that calling ``load`` on a subclass
        returns an instance of that subclass, keeping its ``is_candidate``
        and ``make_class`` overrides.
        """
        obj = cls()
        obj._cl = Classifier.load(path)
        return obj