# Example #1
# 0
def test_kaggle():
	"""Label the unlabeled test reviews and write the result to kaggle.txt.

	The model inputs (popular words, word lists, sentiment counts, states,
	n-gram counts, interpolation result and sentiment vectors) are built from
	whatever parse_data.parse_training_file() returns when called without
	arguments. Every review parsed from 'data/test_data_no_true_labels.txt'
	is then run through v.calculate_viterbi_matrix / v.backtrace_viterbi.

	Output: 'kaggle.txt' in the current working directory, starting with the
	header "Id,answer" followed by one "<row id>,<label>" line per predicted
	state, where "pos" -> 1, "neu" -> 0 and "neg" -> -1. Row ids start at 0
	and keep increasing across reviews.

	Raises:
		Error: if the backtrace yields a state other than "pos", "neu" or
			"neg" (see the NOTE next to the raise below).
	"""
	N = config.N
	reviews = parse_data.parse_training_file()
	popular_words = features.get_popular_words(reviews)
	word_lists = features.create_word_lists()
	sentiment_counts = likelihood.get_sentiment_counts(reviews)
	states = v.get_states(sentiment_counts)
	ngram_counts = prior_prob.gen_ngram_counts(reviews, N)
	l = prior_prob.train_interpolation(ngram_counts, N)
	vectors = likelihood.gen_sentiment_vectors(reviews, word_lists, popular_words)

	test_reviews = parse_data.parse_training_file('data/test_data_no_true_labels.txt')

	# Predicted state -> value written to the submission file.
	label_codes = {"pos": "1", "neu": "0", "neg": "-1"}
	rid = 0

	# The context manager closes the file even if tagging fails part-way.
	with open('kaggle.txt', 'w') as f:
		f.write("Id,answer\n")

		for i in test_reviews:
			review = test_reviews[i]

			# Element 1 of each entry is what the Viterbi step receives as the
			# line; the review title is repeated once per line as its tag.
			lines = [line[1] for line in review['reviews']]
			tags = [review['title'] for _ in lines]

			M = v.calculate_viterbi_matrix(N, states, lines, ngram_counts, l, vectors, sentiment_counts, word_lists, popular_words, tags)
			result = v.backtrace_viterbi(M, states, ngram_counts, l)

			for res in result:
				if res not in label_codes:
					# NOTE(review): `Error` is not defined in the visible part
					# of this file; confirm it is defined or imported at
					# module level, otherwise this line raises NameError.
					raise Error("Unexpected error: unidentified state.")

				f.write(str(rid) + "," + label_codes[res] + "\n")
				rid += 1

			# Single-argument print(...) gives the same output on Python 2 and 3.
			print("On row " + str(rid))

	print("Done. Output saved to kaggle.txt.")
# Example #2
# 0
def test_training():
	"""Measure tagging accuracy on part of 'data/training_data.txt'.

	The model inputs are built from
	parse_data.parse_training_file('data/training_data.txt', 0, 150) and the
	evaluation set comes from
	parse_data.parse_training_file('data/training_data.txt', 150).
	NOTE(review): presumably the numeric arguments select reviews [0, 150)
	for training and everything from 150 on for evaluation -- confirm against
	parse_training_file.

	For every evaluation review, element 0 of each entry is treated as the
	expected label and compared with the state predicted at the same position
	by v.backtrace_viterbi.

	Returns:
		float: correct / (correct + incorrect) over all compared lines, i.e.
		a fraction in [0, 1] rather than a percentage. Returns 0.0 when no
		lines were compared (previously this case raised UnboundLocalError or
		ZeroDivisionError).
	"""
	N = config.N
	reviews = parse_data.parse_training_file('data/training_data.txt', 0, 150)
	popular_words = features.get_popular_words(reviews)
	word_lists = features.create_word_lists()
	sentiment_counts = likelihood.get_sentiment_counts(reviews)
	states = v.get_states(sentiment_counts)
	ngram_counts = prior_prob.gen_ngram_counts(reviews, N)
	l = prior_prob.train_interpolation(ngram_counts, N)
	vectors = likelihood.gen_sentiment_vectors(reviews, word_lists, popular_words)

	test_reviews = parse_data.parse_training_file('data/training_data.txt', 150)

	correct = 0
	incorrect = 0

	for i in test_reviews:
		review = test_reviews[i]
		lines = []
		answer = []
		tags = []

		for line in review['reviews']:
			lines.append(line[1])
			tags.append(review['title'])
			answer.append(line[0])

		M = v.calculate_viterbi_matrix(N, states, lines, ngram_counts, l, vectors, sentiment_counts, word_lists, popular_words, tags)
		result = v.backtrace_viterbi(M, states, ngram_counts, l)

		# Index into result (rather than zip) so a prediction list shorter
		# than the answers still fails loudly instead of being truncated.
		for j, expected in enumerate(answer):
			if expected == result[j]:
				correct += 1
			else:
				incorrect += 1

	# Compute the ratio once, after all reviews, and guard the empty case.
	total = correct + incorrect
	if total == 0:
		return 0.0

	return float(correct) / float(total)