コード例 #1
0
ファイル: naivebayes.py プロジェクト: taehoonl/HashtagWeather
class NaiveBayes:
	"""Two-class (+1 / -1) Naive Bayes classifier over token counts.

	The model is described by per-class example counts (used for the class
	prior) and per-class token-count mappings (used for the token
	likelihoods). Tokens that are absent from a class's count mapping are
	skipped when scoring; no smoothing is applied.
	"""

	def __init__(self, feature, vocab_size, positive_y, negative_y, positive_x, negative_x):
		"""Store the counts that define the model.

		feature     -- string tag for this classifier, e.g. 'k1' or 'w1'
		vocab_size  -- stored on the instance; not read by the methods below
		positive_y  -- sized collection of positive examples (only len() is used)
		negative_y  -- sized collection of negative examples (only len() is used)
		positive_x  -- mapping token -> count for the positive class
		negative_x  -- mapping token -> count for the negative class
		"""
		self.parser = Parser()
		self.feature = feature
		self.vocab_size = vocab_size

		self.total = float(len(positive_y) + len(negative_y))
		self.positive_y = float(len(positive_y))  # number of examples with Y = +1
		self.negative_y = float(len(negative_y))  # number of examples with Y = -1

		self.positive_x = positive_x  # token counts given Y = +1
		self.negative_x = negative_x  # token counts given Y = -1
		self.positive_sum = float(sum(positive_x.values()))
		self.negative_sum = float(sum(negative_x.values()))

	def classify(self, example):
		"""Return 1.0 if the positive class scores strictly higher, else -1.0.

		example -- a string (stemmed via the parser first) or a list of tokens.
		Any other type triggers a printed warning and is then scored as given.

		The two classes are compared in log space: multiplying many
		probabilities below 1 underflows to 0.0 for long examples, which
		would make both classes tie and always yield -1.0.
		"""
		tokens = example
		# type(u"") is `unicode` on Python 2 and `str` on Python 3.
		if isinstance(example, (str, type(u""))):
			tokens = self.parser.stem_sentence_porter(example)
		elif not isinstance(example, list):
			print("example should be of type str or list of str")
		if self._log_score(tokens, True) > self._log_score(tokens, False):
			return 1.0
		return -1.0

	def score(self, example, positive):
		"""Return prior * product of token likelihoods for one class.

		example  -- iterable of tokens
		positive -- True to score the positive class, False for the negative

		Tokens missing from the class's count mapping are ignored. The
		result is a plain probability product and can underflow to 0.0 for
		long examples; classify() therefore does not rely on it.
		"""
		class_count, counts, total_words = self._class_stats(positive)
		s = class_count / self.total
		for token in example:
			if token in counts:
				s *= float(counts[token]) / float(total_words)
		return s

	def _class_stats(self, positive):
		"""Return (example count, token-count mapping, token total) for a class."""
		if positive:
			return self.positive_y, self.positive_x, self.positive_sum
		return self.negative_y, self.negative_x, self.negative_sum

	def _log_score(self, example, positive):
		"""Return the natural log of score(example, positive), without underflow.

		A zero prior or a zero token count maps to -inf, mirroring the 0.0
		that the plain product would produce.
		"""
		import math  # local import: the file's import block is not visible here

		class_count, counts, total_words = self._class_stats(positive)
		prior = class_count / self.total
		if prior <= 0:
			return float("-inf")
		log_s = math.log(prior)
		for token in example:
			if token in counts:
				count = float(counts[token])
				if count <= 0:
					return float("-inf")
				log_s += math.log(count / float(total_words))
		return log_s
コード例 #2
0
ファイル: node.py プロジェクト: taehoonl/HashtagWeather
class Node:
	"""One node of a binary decision tree that labels tweets.

	An internal node holds a `criterion` and two children; a node whose
	criterion is falsy acts as a leaf and answers with its `label`.
	"""

	def __init__(self, left=None, right=None, criterion=None, label=None, depth=None):
		"""Store the node's children, split criterion, label and depth.

		left      -- child followed when the criterion is NOT in the tweet
		right     -- child followed when the criterion IS in the tweet
		criterion -- value tested for membership in the tweet (a word string
		             in this project, per the original author's note)
		label     -- value returned when this node is a leaf
		depth     -- stored on the instance; not read by get_label
		"""
		self.criterion = criterion
		self.left = left
		self.right = right
		self.label = label
		self.depth = depth
		self.parser = None  # created lazily, only if a raw tweet must be stemmed

	def get_label(self, tweet, tweet_cleaned=False):
		"""Walk the tree from this node and return the label of the leaf reached.

		tweet         -- the tweet; raw text unless tweet_cleaned is true
		tweet_cleaned -- when true, `tweet` is used as-is for the membership
		                 tests and no parser is needed

		The parser is only constructed when the tweet actually has to be
		stemmed, so descending through already-cleaned input does not build
		one parser per visited node.
		"""
		if not tweet_cleaned:
			if self.parser is None:
				self.parser = Parser()
			tweet = self.parser.stem_sentence_porter(tweet)
		if not self.criterion:
			return self.label
		# Criterion present -> right subtree, otherwise -> left subtree.
		branch = self.right if self.criterion in tweet else self.left
		return branch.get_label(tweet, True)
コード例 #3
0
ファイル: svm.py プロジェクト: taehoonl/HashtagWeather
class SVM:
	"""Helper around svmlight models for tweet classification.

	Holds one "is weather" model, a list of time models and a list of
	weather models, plus the word -> feature-id map used to turn a stemmed
	tweet into svmlight feature tuples.
	"""

	def __init__(self):
		"""Create an empty instance; models and the index are loaded later."""
		self.parser = Parser()
		self.weather_models = []
		self.time_models = []
		self.is_weather_model = None
		self.default_data_features = []
		self.data = None
		self.index = None
		self.index_map = None
		self.threshold = 0.7  # score above which a tweet is attached to a label
		# Position i in this list names the model at weather_models[i].
		self.weather_labels = ["clouds", "cold", "dry", "hot", "humid", "hurricane",
							   "I can't tell", "ice", "other", "rain", "snow", "storms",
							   "sun", "tornado", "wind"]

	def initialize_svm(self):
		"""Load all models and, if not yet loaded, the pickled index and map.

		File paths are relative and depend on whether the current working
		directory is named 'src'.
		"""
		# basename() handles the platform's path separator, unlike split('/').
		if os.path.basename(os.getcwd()) == 'src':
			data_root = '../data/svm/'
		else:
			data_root = 'data/svm/'
		self.load_all_models(data_root + 'models/')
		if self.index is None:
			self.index = self.parser.load_pickled_data(data_root + 'data.index')
			self.index_map = self.parser.load_pickled_data(data_root + 'data.map')

	def load_all_models(self, path):
		"""Read the is-weather model, 4 time models and 15 weather models.

		path -- directory prefix (with trailing separator) of the model files

		The model lists are rebuilt on every call, so calling this twice
		does not duplicate models or break the positional match between
		weather_models and weather_labels.
		"""
		self.is_weather_model = self.read_model(path + 's5.model0.01')
		self.time_models = [
			self.read_model(path + 'new_c_w{}.model1'.format(i + 1))
			for i in range(4)
		]
		self.weather_models = [
			self.read_model(path + 'k{}.model0.1'.format(i + 1))
			for i in range(15)
		]

	def load_data(self, rel_path):
		'''
		Loads data from a SVMLight file using the svmlight_loader
		library: https://github.com/mblondel/svmlight-loader
		Returns a list of the dataset and the labels
		'''
		abs_path = os.path.abspath(rel_path)
		(x_train, labels) = svml.load_svmlight_file(abs_path)
		return [x_train, labels]

	def combine_data(self, data):
		'''
		Returns a list of [label, dense_row] pairs built from
		data = [matrix, labels]; the matrix must provide todense().
		Progress is printed every 100 rows.
		'''
		print('Combining data...')
		labels = data[1]
		rows = np.array(data[0].todense()).tolist()
		combined_data = []
		for i in range(len(labels)):
			combined_data.append([labels[i], rows[i]])
			if i % 100 == 0:
				print('Combined {} data'.format(i))
		return combined_data

	def format_data(self, data):
		"""Convert [label, dense_row] pairs into (label, [(id, value), ...]).

		Every row gets one tuple per column, with 1-based feature ids: the
		value in column e is emitted as (e + 1, value) at list position e,
		and zero columns stay as (e + 1, 0). The row width is taken from the
		first row. Empty input yields an empty list. Progress is printed
		every 100 rows.
		"""
		formatted_data = []
		print('Formatting data...')
		if len(data) == 0:
			return formatted_data

		default_data_features = [(i + 1, 0) for i in range(len(data[0][1]))]

		for data_num, datum in enumerate(data):
			label, values = datum[0], datum[1]
			data_features = default_data_features[:]
			for e in np.nonzero(values)[0]:
				e = int(e)
				# Column e belongs in slot e; writing slot e-1 would clobber
				# the last slot for e == 0 and duplicate feature id e+1.
				data_features[e] = (e + 1, values[e])
			if data_num % 100 == 0:
				print('Formatted {} data'.format(data_num))
			formatted_data.append((label, data_features))
		return formatted_data

	def format_for_svmlight(self, data):
		"""Run combine_data() followed by format_data() on [matrix, labels]."""
		return self.format_data(self.combine_data(data))

	def format_tweet_for_svmlight(self, tweet):
		"""Turn a tokenised tweet into a one-element svmlight data list.

		tweet -- iterable of tokens (e.g. the stemmed words of a tweet)

		Returns [(1, features)] where features holds one (feature_id, count)
		tuple per known feature id, sorted by id. Tokens missing from
		index_map are dropped; if index_map is not loaded the feature list
		is empty. Repeated tokens contribute a single tuple carrying their
		total count rather than one tuple per occurrence.
		NOTE(review): sorted, unique ids are what the SVM-light input format
		specifies; confirm that the Python binding has the same requirement.
		"""
		word_counts = {}
		for word in tweet:
			word_counts[word] = word_counts.get(word, 0) + 1

		features = {}
		for word, occurrences in word_counts.items():
			try:
				idx = self.index_map[word]
			except (KeyError, TypeError):
				# Unknown word, or index_map not loaded (None): skip it.
				continue
			features[idx] = features.get(idx, 0) + occurrences
		return [(1, sorted(features.items()))]

	def read_model(self, rel_path):
		"""Read an svmlight model file and return the model object."""
		abs_path = os.path.abspath(rel_path)
		return svmlight.read_model(abs_path)

	def train(self, data, t=0, C=1.0):
		"""Train and return an svmlight classifier; t and C are passed through."""
		return svmlight.learn(data, type="classifier", t=t, C=C)

	def get_weather_tweets(self, tweets):
		"""Return the tweets the is-weather model scores below zero.

		tweets -- one tweet or a list of tweets (a non-list is wrapped)

		A running count is printed every 100 tweets.
		NOTE(review): a negative score is treated as "weather" here; this
		depends on how the model's labels were assigned - confirm.
		"""
		if not isinstance(tweets, list):
			tweets = [tweets]
		weather_tweets = []
		for count, tweet in enumerate(tweets, 1):
			stemmed = self.parser.stem_sentence_porter(tweet)
			formatted_tweet = self.format_tweet_for_svmlight(stemmed)
			c = svmlight.classify(self.is_weather_model, formatted_tweet)
			if count % 100 == 0:
				print(count)
			if c[0] < 0:
				weather_tweets.append(tweet)
		return weather_tweets

	def classify(self, model, data):
		"""Return svmlight's classification values for data under model."""
		return svmlight.classify(model, data)

	def classify_tweet(self, tweet):
		"""Score one raw tweet against every weather and time model.

		Returns (weather_scores, time_scores), each a list with one value
		per loaded model, or None (after printing a hint) if anything in the
		pipeline raises.
		"""
		try:
			stemmed = self.parser.stem_sentence_porter(tweet)
			formatted_tweet = self.format_tweet_for_svmlight(stemmed)
			time_class = [self.classify(model, formatted_tweet)[0]
						  for model in self.time_models]
			weather_class = [self.classify(model, formatted_tweet)[0]
							 for model in self.weather_models]
			return weather_class, time_class
		except Exception:
			# Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
			print('You have yet to load the models.')
			print('Please load all models with load_all_models()')
			return None

	def classify_tweets(self, tweets, formatted_tweets):
		"""Score many tweets against every weather model.

		tweets           -- the tweets, index-aligned with formatted_tweets
		formatted_tweets -- svmlight-formatted data for the same tweets

		Returns (results, tweet_dict): results is a list of
		[sum_of_scores, label] per weather model, and tweet_dict maps a
		label to the tweets whose score for that model exceeds
		self.threshold.
		"""
		weather_class = []
		tweet_dict = {}
		for count, model in enumerate(self.weather_models):
			scores = self.classify(model, formatted_tweets)
			weather_class.append(scores)
			for i in range(len(scores)):
				if scores[i] > self.threshold:
					tweet_dict.setdefault(self.weather_labels[count], []).append(tweets[i])
		results = []
		for i in range(len(weather_class)):
			results.append([sum(weather_class[i]), self.weather_labels[i]])
		return results, tweet_dict