class SVM: def __init__(self): self.parser = Parser() self.weather_models = [] self.time_models = [] self.is_weather_model = None self.default_data_features = [] self.data = None self.index = None self.index_map = None self.threshold = 0.7 self.weather_labels = ["clouds", "cold", "dry", "hot", "humid", "hurricane", "I can't tell", "ice", "other", "rain", "snow", "storms", "sun", "tornado", "wind"] def initialize_svm(self): # get file path, depending on the location from which the class is called cwd = os.getcwd() cwd = cwd.split('/') if cwd[len(cwd)-1] == 'src': index_file_path = '../data/svm/data.index' map_file_path = '../data/svm/data.map' models_file_path = '../data/svm/models/' else: index_file_path = 'data/svm/data.index' map_file_path = 'data/svm/data.map' models_file_path = 'data/svm/models/' self.load_all_models(models_file_path) if self.index is None: index = self.parser.load_pickled_data(index_file_path) index_map = self.parser.load_pickled_data(map_file_path) self.index = index self.index_map = index_map def load_all_models(self, path): filepath = path + 's5.model0.01' model = self.read_model(filepath) self.is_weather_model = model for i in range(4): filepath = path + 'new_c_w{}.model1'.format(i+1) model = self.read_model(filepath) self.time_models.append(model) for i in range(15): # filepath = path + 'new_c_k{}.model0.1'.format(i+1) filepath = path + 'k{}.model0.1'.format(i+1) model = self.read_model(filepath) self.weather_models.append(model) def load_data(self, rel_path): ''' Loads data from a SVMLight file using the svmlight_loader library: https://github.com/mblondel/svmlight-loader Returns a list of the dataset and the labels ''' abs_path = os.path.abspath(rel_path) (x_train, labels) = svml.load_svmlight_file(abs_path) return [x_train, labels] def combine_data(self, data): ''' Returns a list that combines the point coordinates and their labels ''' print 'Combining data...' combined_data = [] labels = data[1] data_list = np.array(data[0].todense()).tolist() for i in range(len(labels)): combined_data.append([labels[i], data_list[i]]) if i%100 == 0: print 'Combined {} data'.format(i) return combined_data def format_data(self, data): formatted_data = [] print 'Formatting data...' default_data_features = [] for i in range(len(data[0][1])): default_data_features.append((i+1, 0)) data_num = 0 for datum in data: nonzero_elements = np.nonzero(datum[1])[0] data_features = default_data_features[:] # pdb.set_trace() for e in nonzero_elements: data_features[e-1] = (e+1, datum[1][e]) if data_num%100 == 0: print 'Formatted {} data'.format(data_num) data_num += 1 formatted_data.append((datum[0], data_features)) return formatted_data def format_for_svmlight(self, data): combined_data = self.combine_data(data) formatted_data = self.format_data(combined_data) return formatted_data def format_tweet_for_svmlight(self, tweet): data_features = [] word_dict = {} for word in tweet: try: word_dict[word] += 1 except: word_dict[word] = 1 for word in tweet: try: idx = self.index_map[word] data_features.append((idx, word_dict[word])) except: pass return [(1, data_features)] def read_model(self, rel_path): abs_path = os.path.abspath(rel_path) model = svmlight.read_model(abs_path) return model def train(self, data, t=0, C=1.0): model = svmlight.learn(data, type="classifier", t=t, C=C) return model def get_weather_tweets(self, tweets): weather_tweets = [] if not isinstance(tweets, list): tweets = [tweets] count = 0 for tweet in tweets: count += 1 formatted_tweet = self.parser.stem_sentence_porter(tweet) formatted_tweet = self.format_tweet_for_svmlight(formatted_tweet) c = svmlight.classify(self.is_weather_model, formatted_tweet) if count%100 == 0: print count if c[0] < 0: weather_tweets.append(tweet) return weather_tweets def classify(self, model, data): classifications = svmlight.classify(model, data) return classifications def classify_tweet(self, tweet): try: tweet = self.parser.stem_sentence_porter(tweet) formatted_tweet = self.format_tweet_for_svmlight(tweet) time_class = [] weather_class = [] for model in self.time_models: time_class.append(self.classify(model, formatted_tweet)[0]) for model in self.weather_models: weather_class.append(self.classify(model, formatted_tweet)[0]) return weather_class, time_class except: print 'You have yet to load the models.' print 'Please load all models with load_all_models()' return None def classify_tweets(self, tweets, formatted_tweets): weather_class = [] tweet_dict = {} count = 0 for model in self.weather_models: scores = self.classify(model, formatted_tweets) weather_class.append(scores) for i in range(len(scores)): if scores[i] > self.threshold: try: tweet_dict[self.weather_labels[count]].append(tweets[i]) except: tweet_dict[self.weather_labels[count]] = [tweets[i]] count += 1 results = [] for i in range(len(weather_class)): results.append([sum(weather_class[i]), self.weather_labels[i]]) return results, tweet_dict