def test_simple_duplicates(self):
    """A phrase seen verbatim in training must get its training label back,
    even when the rest of the corpus overwhelmingly favors another label."""
    dupe = Datapoint(phraseid="a", sentenceid="b", phrase="b a", sentiment="1")
    # Flood the training set with "2" sentiments so a plain classifier
    # would be biased away from the duplicate's label.
    train = [
        Datapoint(phraseid=str(n), sentenceid=str(n), phrase="a b", sentiment="2")
        for n in range(10)
    ]
    train.append(dupe)
    test = [Datapoint(*dupe)]
    predictor = PhraseSentimentPredictor(duplicates=True)
    predictor.fit(train)
    self.assertEqual(predictor.predict(test)[0], "1")
def test_simple_predict(self):
    """predict() yields exactly one label per test datapoint, and never a
    label that was absent from the training data."""
    train, test = corpus.make_train_test_split("inhaler")
    predictor = PhraseSentimentPredictor()
    predictor.fit(train)
    predictions = predictor.predict(test)
    # One prediction per input datapoint.
    self.assertEqual(len(predictions), len(test))
    # Every predicted label must have been seen during training.
    seen_labels = {point.sentiment for point in train}
    self.assertEqual(set(predictions) - seen_labels, set())
def test_simple_error_matrix(self):
    """error_matrix() keys are (real, predicted) pairs of actual mistakes,
    and the total mistake count must be consistent with score().

    Fixes: the original used a bare ``assert`` (silently stripped when
    Python runs with ``-O``) and an exact float equality check.
    """
    train, test = corpus.make_train_test_split("reflektor", proportion=0.4)
    predictor = PhraseSentimentPredictor()
    predictor.fit(train)
    error = predictor.error_matrix(test)
    # Iterating the dict yields its keys directly; each key must describe
    # a genuine misclassification.
    for real, predicted in error:
        self.assertNotEqual(real, predicted)
    score = predictor.score(test)
    # Proper unittest assertion instead of a bare `assert`.
    self.assertGreater(score, 0, "Test is valid only if score is more than 0")
    N = float(len(test))
    wrong = sum(len(xs) for xs in error.values())
    # accuracy == correct / total; compare with float tolerance.
    self.assertAlmostEqual((N - wrong) / N, score)
def test_simple_duplicates(self):
    """Duplicate handling: an exact training phrase is predicted with its
    original sentiment despite a majority of differently-labelled phrases."""
    dupe = Datapoint(phraseid="a", sentenceid="b", phrase="b a", sentiment="1")
    # Build a training corpus dominated by sentiment "2".
    train = []
    for idx in range(10):
        train.append(
            Datapoint(phraseid=str(idx), sentenceid=str(idx),
                      phrase="a b", sentiment="2")
        )
    train.append(dupe)
    test = [Datapoint(*dupe)]
    predictor = PhraseSentimentPredictor(duplicates=True)
    predictor.fit(train)
    predicted = predictor.predict(test)[0]
    self.assertEqual(predicted, "1")
if __name__ == "__main__":
    # Fit a predictor from a JSON config, predict on the test corpus, print
    # fit/predict wall times (ms) and emit a Kaggle-style CSV on stdout.
    import argparse
    import csv
    import json
    import sys
    import time  # was used below but never imported: NameError at runtime

    from samr.corpus import iter_corpus, iter_test_corpus
    from samr.predictor import PhraseSentimentPredictor

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("filename")
    args = parser.parse_args()
    # Close the config file deterministically instead of leaking the handle.
    with open(args.filename) as config_file:
        config = json.load(config_file)

    t1 = int(round(time.time() * 1000))
    predictor = PhraseSentimentPredictor(**config)
    predictor.fit(list(iter_corpus()))
    t2 = int(round(time.time() * 1000))
    test = list(iter_test_corpus())
    prediction = predictor.predict2(test)
    t3 = int(round(time.time() * 1000))
    # The original Python 2 `print t2-t1, t3-t2` statement is a SyntaxError
    # on Python 3; the print() call below produces the same output.
    print(t2 - t1, t3 - t2)

    writer = csv.writer(sys.stdout)
    writer.writerow(("PhraseId", "Sentiment"))
    for datapoint, sentiment in zip(test, prediction):
        writer.writerow((datapoint.phraseid, sentiment))
value = float(value) except ValueError: pass new[key] = value return new if __name__ == "__main__": import argparse import json import csv import sys from samr.corpus import iter_corpus, iter_test_corpus from samr.predictor import PhraseSentimentPredictor parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("filename") config = parser.parse_args() config = json.load(open(config.filename)) predictor = PhraseSentimentPredictor(**config) predictor.fit(list(iter_corpus())) test = list(iter_test_corpus()) prediction = predictor.predict(test) writer = csv.writer(sys.stdout) writer.writerow(("PhraseId", "Sentiment")) for datapoint, sentiment in zip(test, prediction): writer.writerow((datapoint.phraseid, sentiment))
def test_fit_returns_self(self):
    """fit() must return the predictor instance itself (fluent interface)."""
    train, _ = corpus.make_train_test_split("defiant order")
    predictor = PhraseSentimentPredictor()
    result = predictor.fit(train)
    # assertIs checks object identity, which is the actual contract here;
    # assertEqual would also pass for a merely *equal* predictor object.
    self.assertIs(predictor, result)
def report(self, score):
    """Cross-validation callback: print the current fold's score and the
    elapsed wall time since the previous report, then reset the timer."""
    now = time.time()
    self.i += 1
    print("individual {}-th fold score={}% took {} seconds".format(
        self.i, score * 100, now - self.last))
    self.last = now


if __name__ == "__main__":
    # Validate a JSON config, then 10-fold cross-validate a predictor
    # built from it, reporting per-fold progress as folds finish.
    import argparse
    import json

    from samr.evaluation import cross_validation
    from samr.predictor import PhraseSentimentPredictor

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("filename")
    args = parser.parse_args()
    # Close the config file deterministically instead of leaking the handle.
    with open(args.filename) as config_file:
        config = json.load(config_file)

    # PEP 8 (E731): bind a def, not a lambda, to a name.
    def factory():
        return PhraseSentimentPredictor(**config)

    factory()  # Run once to check config is ok
    report = PrintPartialCV()
    result = cross_validation(factory, seed="robot rock", callback=report.report)
    print("10-fold cross validation score {}%".format(result * 100))
def runThroughSetup(self, **kwargs):
    """Build a predictor with *kwargs*, fit it on a slice of the training
    data, and return its score on the test set formatted as a string."""
    model = PhraseSentimentPredictor(**kwargs)
    training_sample = self.train[:self.samples]
    model.fit(training_sample)
    score = model.score(self.test)
    return str(score)