def test_make_train_test_split_seed_works(self):
    """Check that the split produced depends on the seed.

    Two calls with seed "a" must yield equal train parts and equal test
    parts, while a call with seed "b" must yield parts that differ from
    those obtained with "a".
    """
    first_train, first_test = corpus.make_train_test_split("a")
    other_train, other_test = corpus.make_train_test_split("b")
    repeat_train, repeat_test = corpus.make_train_test_split("a")

    # Same seed: both parts must compare equal.
    self.assertEqual(first_train, repeat_train)
    self.assertEqual(first_test, repeat_test)

    # Different seed: both parts must compare unequal.
    self.assertNotEqual(first_train, other_train)
    self.assertNotEqual(first_test, other_test)
def setUp(self):
    """Prepare the fixture attributes used by the tests.

    Sets ``self.train`` / ``self.test`` (split for seed "mySeed"),
    ``self.realTest`` (result of ``corpus.iter_test_corpus()``),
    ``self.corpus`` (vocabulary built from all three together) and
    ``self.size``.
    """
    split = corpus.make_train_test_split("mySeed")
    self.train, self.test = split
    self.realTest = corpus.iter_test_corpus()

    # The vocabulary is built over the concatenation of all three parts,
    # not per split.
    # NOTE(review): this relies on all three values supporting ``+`` with
    # each other -- confirm what iter_test_corpus() returns.
    everything = self.train + self.test + self.realTest
    phrases = getPhrases(everything)
    self.corpus = buildVocabulary(phrases)

    self.size = 50
def test_make_train_test_split_no_shared_sentences(self):
    """Train and test must not have any sentence id in common."""
    train, test = corpus.make_train_test_split("semis")

    ids_in_train = {item.sentenceid for item in train}
    ids_in_test = {item.sentenceid for item in test}

    # An empty intersection means no sentence id appears on both sides.
    overlap = ids_in_train.intersection(ids_in_test)
    self.assertEqual(overlap, set())
def setUp(self):
    """Prepare the split, truncated vocabularies and labels.

    Sets ``self.train`` / ``self.test`` (split for seed "mySeed"),
    ``self.samples``, ``self.xTrain`` / ``self.xTest`` (vocabularies built
    from at most ``self.samples`` leading items of each part),
    ``self.size`` and ``self.labelsTrain`` / ``self.labelsTest``.
    """
    split = corpus.make_train_test_split("mySeed")
    self.train, self.test = split

    self.samples = 50000
    limit = self.samples

    # Only the first ``limit`` items of each part feed the vocabulary.
    train_head = self.train[:limit]
    self.xTrain = buildVocabulary(getPhrases(train_head))
    test_head = self.test[:limit]
    self.xTest = buildVocabulary(getPhrases(test_head))

    self.size = 150

    # NOTE(review): labels are taken from the full parts, not from the
    # truncated slices above -- if a part holds more than ``self.samples``
    # items the two will cover different numbers of items; confirm this
    # is intended.
    self.labelsTrain = getLabels(self.train)
    self.labelsTest = getLabels(self.test)
def setUp(self):
    """Create the split, the extra test corpus and a joint vocabulary.

    After this runs the instance carries ``train``, ``test``, ``realTest``,
    ``corpus`` (vocabulary over the three concatenated) and ``size``.
    """
    train_part, test_part = corpus.make_train_test_split("mySeed")
    self.train = train_part
    self.test = test_part
    self.realTest = corpus.iter_test_corpus()

    # One vocabulary over train + test + realTest rather than one per part.
    # NOTE(review): assumes the three values can be concatenated with
    # ``+`` -- verify against corpus.iter_test_corpus().
    combined = self.train + self.test + self.realTest
    self.corpus = buildVocabulary(getPhrases(combined))

    self.size = 50
def cross_validation(factory, seed, K=10, callback=None):
    """Return the mean score over ``K`` independent fit/score rounds.

    Round ``k`` (0-based) takes its split from
    ``make_train_test_split(str(seed) + str(k))``, fits a fresh predictor
    obtained from ``factory()`` on the train part and scores it on the
    test part.

    Args:
        factory: Zero-argument callable returning an object that provides
            ``fit(train)`` and ``score(test)``.
        seed: Base seed; converted with ``str`` before the round index is
            appended to it.
        K: Number of rounds to run; must be at least 1.
        callback: Optional callable invoked with each round's score right
            after it is computed. Skipped when falsy.

    Returns:
        The sum of the per-round scores divided by the number of rounds.

    Raises:
        ValueError: If ``K`` is less than 1. Without this check no round
            would run and the final division would fail with an opaque
            ZeroDivisionError.
    """
    if K < 1:
        raise ValueError(
            "cross_validation needs at least one round, got K=%r" % (K,))

    seed = str(seed)
    scores = []
    for k in range(K):
        # A distinct seed per round gives each round its own split.
        train, test = make_train_test_split(seed + str(k))
        predictor = factory()
        predictor.fit(train)
        score = predictor.score(test)
        if callback:
            callback(score)
        scores.append(score)
    return sum(scores) / len(scores)
def test_simple_predict(self):
    """Fit on the train part and sanity-check predictions for the test part."""
    train, test = corpus.make_train_test_split("inhaler")
    model = PhraseSentimentPredictor()
    model.fit(train)
    predicted = model.predict(test)

    # Exactly one prediction per input item.
    self.assertEqual(len(predicted), len(test))

    # Every predicted label must be one that occurs in the training data.
    seen_labels = {item.sentiment for item in train}
    unseen = set(predicted).difference(seen_labels)
    self.assertEqual(unseen, set())
def test_simple_error_matrix(self):
    """Check the error matrix against the score on the same test part.

    The matrix keys are ``(real, predicted)`` pairs and must never pair a
    label with itself. The number of entries summed over all values,
    taken as a fraction of the test part, must complement the score.
    """
    train, test = corpus.make_train_test_split("reflektor", proportion=0.4)
    predictor = PhraseSentimentPredictor()
    predictor.fit(train)
    error = predictor.error_matrix(test)

    # Only misclassifications belong in the matrix.
    for real, predicted in error.keys():
        self.assertNotEqual(real, predicted)

    score = predictor.score(test)
    # Use a unittest assertion rather than a bare ``assert`` so the
    # precondition is still enforced when Python runs with -O.
    self.assertGreater(
        score, 0, "Test is valid only if score is more than 0")

    total = float(len(test))
    wrong = sum(len(xs) for xs in error.values())
    # Compare floats with a tolerance instead of exact equality.
    self.assertAlmostEqual((total - wrong) / total, score)
def setUp(self):
    """Prepare the split, the model configuration and the sample count.

    Sets ``self.train`` / ``self.test`` (split for seed "mySeed"),
    ``self.config`` (parsed contents of ``../data/model2.json``) and
    ``self.samples`` (length of the train part).
    """
    self.train, self.test = corpus.make_train_test_split("mySeed")

    # The path is relative, so it resolves against the current working
    # directory. The ``with`` block closes the file as soon as it has been
    # parsed instead of leaving the handle open.
    with open("../data/model2.json") as config_file:
        self.config = json.load(config_file)

    self.samples = len(self.train)
def setUp(self):
    """Store the train and test parts of the "mySeed" split on the instance."""
    split = corpus.make_train_test_split("mySeed")
    self.train, self.test = split
def test_make_train_test_split_simple(self):
    """Basic expectations for the split obtained with seed "blitz".

    The phrase "word play" must occur somewhere in train + test, the test
    part must contain a single distinct sentence id, and both parts
    together must contain four.
    """
    train, test = corpus.make_train_test_split("blitz")

    all_phrases = [item.phrase for item in train + test]
    self.assertIn("word play", all_phrases)

    test_sentence_ids = {item.sentenceid for item in test}
    self.assertEqual(len(test_sentence_ids), 1)

    every_sentence_id = {item.sentenceid for item in test + train}
    self.assertEqual(len(every_sentence_id), 4)
def test_fit_returns_self(self):
    """``fit`` must hand back the very predictor it was called on."""
    train, _ = corpus.make_train_test_split("defiant order")
    predictor = PhraseSentimentPredictor()
    returned = predictor.fit(train)
    # Identity, not equality: an equal-but-distinct object would not be
    # "self" and must fail this test.
    self.assertIs(returned, predictor)
def getTrainingAndTestSplitOnSize(size):
    """Partition both parts of the "mySeed" split with ``size``.

    ``getWordListsGreaterThan(size, part)`` is applied to the train part
    and then to the test part; each call yields a pair.

    Returns:
        A 4-tuple: the two results for the train part followed by the two
        results for the test part, in the order the helper returned them.
        NOTE(review): the helper's name suggests the first of each pair
        holds items above ``size`` and the second the shorter remainder --
        confirm against getWordListsGreaterThan.
    """
    train, test = corpus.make_train_test_split("mySeed")
    train_first, train_second = getWordListsGreaterThan(size, train)
    test_first, test_second = getWordListsGreaterThan(size, test)
    return train_first, train_second, test_first, test_second
def getVocabularyOfSizeGreaterThan(size):
    """Return the first ``getWordListsGreaterThan`` result for each part.

    The "mySeed" split is partitioned with ``size``; only the first
    element of each returned pair is kept.

    Returns:
        A 2-tuple: (first result for the train part, first result for the
        test part).
    """
    train, test = corpus.make_train_test_split("mySeed")
    # The second element of each pair is not needed here.
    train_kept, _ = getWordListsGreaterThan(size, train)
    test_kept, _ = getWordListsGreaterThan(size, test)
    return train_kept, test_kept