class NaiveBayes:
    """Two-class (+1 / -1) Naive Bayes classifier over token counts.

    Note: score() multiplies raw conditional probabilities (no smoothing and
    no log-space accumulation), and tokens unseen in training are skipped
    entirely.  This mirrors the original behavior; the commented-out
    Laplace/log variants were removed as dead code.
    """

    def __init__(self, feature, vocab_size, positive_y, negative_y, positive_x, negative_x):
        self.parser = Parser()
        self.feature = feature  # feature-set tag as a string, e.g. 'k1' or 'w1'
        self.vocab_size = vocab_size
        # Class prior counts, kept as floats for true division under Python 2.
        self.total = float(len(positive_y) + len(negative_y))
        self.positive_y = float(len(positive_y))  # number of Y = +1 examples
        self.negative_y = float(len(negative_y))  # number of Y = -1 examples
        # token -> count dictionaries, conditioned on the class label.
        self.positive_x = positive_x  # counts of X = xi given Y = +1
        self.negative_x = negative_x  # counts of X = xi given Y = -1
        # Total token mass per class, the denominator of P(token | Y).
        self.positive_sum = float(sum(positive_x.values()))
        self.negative_sum = float(sum(negative_x.values()))

    def classify(self, example):
        """Classify an example as 1.0 (positive) or -1.0 (negative).

        example: either a raw string (stemmed via the Parser) or an
                 already-tokenized list of str.
        Raises TypeError for any other type (the original printed a warning
        and then scored the invalid object anyway).
        Ties go to the negative class (strict > comparison).
        """
        if isinstance(example, str):
            tokens = self.parser.stem_sentence_porter(example)
        elif isinstance(example, list):
            tokens = example
        else:
            raise TypeError("example should be of type str or list of str")
        positive_score = self.score(tokens, True)
        negative_score = self.score(tokens, False)
        if positive_score > negative_score:
            return 1.0
        return -1.0

    def score(self, example, positive):
        """Return P(Y) * prod over known tokens of P(token | Y).

        example:  iterable of token strings.
        positive: True to score against the +1 class, False for -1.
        Tokens absent from the class's count table contribute nothing.
        """
        if positive:
            prior = self.positive_y / self.total
            counts = self.positive_x
            total_mass = self.positive_sum
        else:
            prior = self.negative_y / self.total
            counts = self.negative_x
            total_mass = self.negative_sum
        s = prior
        for token in example:
            if token in counts:
                s *= float(counts[token]) / float(total_mass)
        return s
class Node:
    """A node of a binary decision tree over word presence in a tweet.

    Internal nodes hold a `criterion` word: tweets containing it are routed
    to the right child, all others to the left.  Leaves (criterion falsy)
    carry the classification in `label`.
    """

    def __init__(self, left=None, right=None, criterion=None, label=None, depth=None):
        self.criterion = criterion  # word tested at this node (None/falsy for leaves)
        self.left = left            # subtree for "word absent"
        self.right = right          # subtree for "word present"
        self.label = label          # classification stored at leaves
        self.depth = depth
        self.parser = None          # Parser built lazily on first get_label call

    def get_label(self, tweet, tweet_cleaned=False):
        """Walk the tree for `tweet` and return the reached leaf's label.

        tweet:         raw string, or token list once cleaned.
        tweet_cleaned: True when `tweet` has already been Porter-stemmed;
                       set after the first stemming so descendants skip it.
        """
        if self.parser is None:
            self.parser = Parser()
        if not tweet_cleaned:
            tweet = self.parser.stem_sentence_porter(tweet)
            tweet_cleaned = True
        if not self.criterion:
            # Leaf: nothing left to test.
            return self.label
        # Presence of the criterion word routes right, absence routes left.
        branch = self.right if self.criterion in tweet else self.left
        return branch.get_label(tweet, tweet_cleaned)
class SVM:
    """Wrapper around svmlight models for weather-related tweet classification.

    Holds one binary "is this tweet about weather" model, 4 time-of-tweet
    models and 15 weather-kind models (one per entry in `weather_labels`),
    plus the word->feature-index map used to vectorize tweets.
    """

    def __init__(self):
        self.parser = Parser()
        self.weather_models = []        # 15 one-vs-rest weather-kind models
        self.time_models = []           # 4 time-of-tweet models
        self.is_weather_model = None    # binary weather/non-weather model
        self.default_data_features = []
        self.data = None
        self.index = None               # pickled feature index, loaded lazily
        self.index_map = None           # word -> svmlight feature id
        self.threshold = 0.7            # score cutoff used in classify_tweets
        # Order must match the training order of the 15 weather models.
        self.weather_labels = ["clouds", "cold", "dry", "hot", "humid", "hurricane", "I can't tell", "ice", "other", "rain", "snow", "storms", "sun", "tornado", "wind"]

    def initialize_svm(self):
        """Load all models and the feature index, resolving paths relative
        to whether we are invoked from inside src/ or from the project root.

        NOTE(review): splitting os.getcwd() on '/' is POSIX-only — confirm
        Windows is out of scope, or use os.path.basename instead.
        """
        # get file path, depending on the location from which the class is called
        cwd = os..getcwd() if False else os.getcwd()
        cwd = cwd.split('/')
        if cwd[len(cwd)-1] == 'src':
            index_file_path = '../data/svm/data.index'
            map_file_path = '../data/svm/data.map'
            models_file_path = '../data/svm/models/'
        else:
            index_file_path = 'data/svm/data.index'
            map_file_path = 'data/svm/data.map'
            models_file_path = 'data/svm/models/'
        self.load_all_models(models_file_path)
        if self.index is None:
            index = self.parser.load_pickled_data(index_file_path)
            index_map = self.parser.load_pickled_data(map_file_path)
            self.index = index
            self.index_map = index_map

    def load_all_models(self, path):
        """Read the weather/non-weather model, the 4 time models and the
        15 weather-kind models from `path` (filenames are fixed)."""
        filepath = path + 's5.model0.01'
        model = self.read_model(filepath)
        self.is_weather_model = model
        for i in range(4):
            filepath = path + 'new_c_w{}.model1'.format(i+1)
            model = self.read_model(filepath)
            self.time_models.append(model)
        for i in range(15):
            # filepath = path + 'new_c_k{}.model0.1'.format(i+1)
            filepath = path + 'k{}.model0.1'.format(i+1)
            model = self.read_model(filepath)
            self.weather_models.append(model)

    def load_data(self, rel_path):
        '''
        Loads data from a SVMLight file using the svmlight_loader library:
        https://github.com/mblondel/svmlight-loader
        Returns a list of the dataset and the labels
        '''
        abs_path = os.path.abspath(rel_path)
        (x_train, labels) = svml.load_svmlight_file(abs_path)
        return [x_train, labels]

    def combine_data(self, data):
        '''
        Returns a list that combines the point coordinates and their labels
        '''
        print 'Combining data...'
        combined_data = []
        labels = data[1]
        # Densify the sparse matrix; fine for small corpora, memory-heavy otherwise.
        data_list = np.array(data[0].todense()).tolist()
        for i in range(len(labels)):
            combined_data.append([labels[i], data_list[i]])
            if i%100 == 0:
                print 'Combined {} data'.format(i)
        return combined_data

    def format_data(self, data):
        """Convert [label, dense_vector] pairs into svmlight's
        (label, [(feature_id, value), ...]) tuples with 1-based feature ids.

        NOTE(review): for a nonzero element at position e this writes
        data_features[e-1] = (e+1, value) — slot e-1 holds feature id e, so
        the stored id/slot pair looks off by one.  Confirm against the
        feature numbering the models were trained with before changing.
        """
        formatted_data = []
        print 'Formatting data...'
        # Template row: every feature present with value 0 (ids are 1-based).
        default_data_features = []
        for i in range(len(data[0][1])):
            default_data_features.append((i+1, 0))
        data_num = 0
        for datum in data:
            nonzero_elements = np.nonzero(datum[1])[0]
            data_features = default_data_features[:]
            # pdb.set_trace()
            for e in nonzero_elements:
                data_features[e-1] = (e+1, datum[1][e])
            if data_num%100 == 0:
                print 'Formatted {} data'.format(data_num)
            data_num += 1
            formatted_data.append((datum[0], data_features))
        return formatted_data

    def format_for_svmlight(self, data):
        """Full pipeline: combine sparse data with labels, then format
        into svmlight tuples."""
        combined_data = self.combine_data(data)
        formatted_data = self.format_data(combined_data)
        return formatted_data

    def format_tweet_for_svmlight(self, tweet):
        """Vectorize one tokenized tweet as [(1, [(feature_id, count), ...])].

        Words missing from index_map are silently dropped (bare except).
        The leading 1 is a dummy label; repeated words emit duplicate
        (id, count) pairs — presumably harmless to svmlight, verify.
        """
        data_features = []
        word_dict = {}
        for word in tweet:
            try:
                word_dict[word] += 1
            except:
                word_dict[word] = 1
        for word in tweet:
            try:
                idx = self.index_map[word]
                data_features.append((idx, word_dict[word]))
            except:
                pass
        return [(1, data_features)]

    def read_model(self, rel_path):
        """Load a trained svmlight model from a path relative to cwd."""
        abs_path = os.path.abspath(rel_path)
        model = svmlight.read_model(abs_path)
        return model

    def train(self, data, t=0, C=1.0):
        """Train an svmlight classifier (t = kernel type, C = trade-off)."""
        model = svmlight.learn(data, type="classifier", t=t, C=C)
        return model

    def get_weather_tweets(self, tweets):
        """Filter `tweets` down to those the binary model deems weather-related.

        NOTE(review): a tweet is kept when its margin c[0] < 0, i.e. the
        negative side of this model encodes "weather" — confirm the label
        convention used at training time.
        """
        weather_tweets = []
        if not isinstance(tweets, list):
            tweets = [tweets]
        count = 0
        for tweet in tweets:
            count += 1
            formatted_tweet = self.parser.stem_sentence_porter(tweet)
            formatted_tweet = self.format_tweet_for_svmlight(formatted_tweet)
            c = svmlight.classify(self.is_weather_model, formatted_tweet)
            if count%100 == 0:
                print count
            if c[0] < 0:
                weather_tweets.append(tweet)
        return weather_tweets

    def classify(self, model, data):
        """Thin wrapper over svmlight.classify; returns the margin list."""
        classifications = svmlight.classify(model, data)
        return classifications

    def classify_tweet(self, tweet):
        """Score one raw tweet against all time and weather models.

        Returns (weather_scores, time_scores) lists of margins, or None on
        any failure.  NOTE(review): the bare except attributes every error
        (including bad input) to unloaded models — consider narrowing.
        """
        try:
            tweet = self.parser.stem_sentence_porter(tweet)
            formatted_tweet = self.format_tweet_for_svmlight(tweet)
            time_class = []
            weather_class = []
            for model in self.time_models:
                time_class.append(self.classify(model, formatted_tweet)[0])
            for model in self.weather_models:
                weather_class.append(self.classify(model, formatted_tweet)[0])
            return weather_class, time_class
        except:
            print 'You have yet to load the models.'
            print 'Please load all models with load_all_models()'
            return None

    def classify_tweets(self, tweets, formatted_tweets):
        """Batch-score pre-formatted tweets against the 15 weather models.

        tweets:           original tweet texts, parallel to formatted_tweets.
        formatted_tweets: svmlight-formatted versions of the same tweets.
        Returns ([ [total_score, label], ... ] per model, and a dict mapping
        each label to the tweets whose score exceeded self.threshold).
        """
        weather_class = []
        tweet_dict = {}
        count = 0
        for model in self.weather_models:
            scores = self.classify(model, formatted_tweets)
            weather_class.append(scores)
            for i in range(len(scores)):
                if scores[i] > self.threshold:
                    # First hit for a label creates its list (EAFP via KeyError).
                    try:
                        tweet_dict[self.weather_labels[count]].append(tweets[i])
                    except:
                        tweet_dict[self.weather_labels[count]] = [tweets[i]]
            count += 1
        results = []
        for i in range(len(weather_class)):
            results.append([sum(weather_class[i]), self.weather_labels[i]])
        return results, tweet_dict