import numpy as np
from matplotlib.dates import date2num


def get_timeline_count(tweets_file, keywords=None, timezones=None):
    # Load the tweets into a dataframe
    l = Loader(tweets_file)
    data = l.get_dataframe()

    def valid_keyword(text):
        return any(keyword in text for keyword in keywords)

    def valid_timezone(zone):
        return any(timezone in zone for timezone in timezones)

    # Keep only tweets matching the optional keyword / timezone filters
    if keywords is not None:
        data = data[data["text"].apply(valid_keyword)]
    if timezones is not None:
        data = data[data["user_time_zone"].apply(valid_timezone)]
    print(len(data.index))

    # Count tweets per calendar day
    data["created_at"] = data["created_at"].astype("datetime64[ns]")
    a = data["created_at"].groupby(data["created_at"].dt.date).count()
    print(a)
    a = a.to_frame()
    return np.array(date2num(a.index)), np.array(a.values)
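A minimal usage sketch (the file path and keyword are hypothetical; assumes matplotlib is installed and Loader can read the file):

import matplotlib.pyplot as plt

# dates come back as matplotlib date numbers (date2num), counts as an array
dates, counts = get_timeline_count("./data/brexit_data.json", keywords=["brexit"])
plt.plot_date(dates, counts, "-")
plt.xlabel("Date")
plt.ylabel("Tweets per day")
plt.show()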
import numpy as np


def train(conf):
    loader = Loader(conf['embedding'], conf['text'])
    data, label_str, word2vec = loader.load()
    # Train a binary classifier on the first 700 samples
    data = data[:700]
    labels = np.array(label_str[:700], dtype=np.int32)
    classifier = BiLstm(2, conf['embedding']['sequence_length'],
                        word2vec.vocab_size, word2vec.embed_size)
    trainer = Trainer(classifier, word2vec.embeddings)
    trainer.train(data, labels)
import pickle


def predict(conf):
    loader = Loader(conf['embedding'], conf['text'])
    data, label_str, word2vec = loader.load()
    classifier = BiLstm(7, conf['embedding']['sequence_length'],
                        word2vec.vocab_size, word2vec.embed_size)
    predictor = Predictor(classifier)
    res = predictor.predict(data)
    print(res[0])
    # Persist the predictions for later analysis
    with open('./data/breakdown_predict.pik', 'wb') as f:
        pickle.dump(res, f)
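The dump above can be loaded back for analysis; a minimal sketch (same path as in predict):

import pickle

with open('./data/breakdown_predict.pik', 'rb') as f:
    res = pickle.load(f)
print(len(res))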
import numpy as np


def train(conf):
    loader = Loader(conf['embedding'], conf['text'])
    data, label_str, word2vec = loader.load()
    # Map string labels to integer classes; anything outside gender_mapping
    # falls back to class 0 (dtype forced to int32 so the labels stay numeric)
    labels = np.zeros_like(label_str, dtype=np.int32)
    for idx, val in enumerate(label_str):
        labels[idx] = gender_mapping.get(val, 0)
    classifier = BiLstm(4, conf['embedding']['sequence_length'],
                        word2vec.vocab_size, word2vec.embed_size)
    trainer = Trainer(classifier, word2vec.embeddings)
    trainer.train(data, labels)
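gender_mapping is not defined in this excerpt; a purely hypothetical shape consistent with the 4-class BiLstm above (class 0 being the fallback) would be:

# hypothetical example only -- the real mapping is defined elsewhere
gender_mapping = {"male": 1, "female": 2, "brand": 3}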
import logging
import unittest
from unittest.mock import MagicMock, call


class TestLoader(unittest.TestCase):
    def setUp(self) -> None:
        logging.basicConfig(level=logging.CRITICAL)
        self._parser = MagicMock()
        self._downloader = MagicMock()
        self._loader = Loader(self._parser, self._downloader)

    def test_downloads_all_urls_returned_from_parser(self):
        urls = ["a", "b", "c"]
        self._parser.get_urls.return_value = urls
        self._loader.start()
        self.check_if_there_is_a_call_for_each_url(urls)

    def test_IF_url_parser_throws_error_THEN_return_false(self):
        self._parser.get_urls.side_effect = Exception("foo")
        success = self._loader.start()
        self.assertFalse(success)

    def check_if_there_is_a_call_for_each_url(self, urls):
        expected_calls = [call(url) for url in urls]
        self._downloader.download.assert_has_calls(expected_calls, any_order=True)
        self.assertEqual(len(urls), self._downloader.download.call_count)

    def test_respects_continue_on_error_equals_false(self):
        urls = ["a", "b", "c"]
        self._parser.get_urls.return_value = urls
        self._downloader.download.side_effect = Exception("some error")
        self._loader.start()
        self.assertEqual(1, self._downloader.download.call_count)

    def test_respects_continue_on_error_equals_true(self):
        urls = ["a", "b", "c"]
        self._parser.get_urls.return_value = urls
        self._downloader.download.side_effect = Exception("some error")
        self._loader.start(continue_on_error=True)
        self.check_if_there_is_a_call_for_each_url(urls)
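For reference, a minimal Loader sketch that would satisfy these tests; this is reconstructed from the mocked calls above, not the project's actual implementation:

class Loader:
    def __init__(self, parser, downloader):
        self._parser = parser
        self._downloader = downloader

    def start(self, continue_on_error=False):
        # Return False if the URL parser itself fails.
        try:
            urls = self._parser.get_urls()
        except Exception:
            return False
        # Download each URL; stop at the first error unless told to continue.
        for url in urls:
            try:
                self._downloader.download(url)
            except Exception:
                if not continue_on_error:
                    return False
        return True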
def test3():
    l = Loader("./data/weight.json")
    print(l.get_dataframe())
def test2():
    l = Loader(data_dir + "brexit_data.json")
    print("Total number of tweets:", len(l.get_tweets()))
    l.remove_retweets()
    print("Without retweets:", len(l.get_tweets()))
from loader.loader import Loader

yamlsettings = Loader.loadSettings()
def main():
    loader = Loader()
import pandas
import numpy as np
from matplotlib.dates import date2num


def count_for_set(df, dates_set, keys):
    # Header reconstructed from the calls below; the original excerpt
    # starts mid-function.
    # NOTE: dates_set is indexed by position, so rows dropped by an earlier
    # call shift the alignment between i and the remaining rows of df.
    counts = {}
    for i, idx in enumerate(df.index):
        tweet = df.loc[idx, "text"]
        day = dates_set[i]
        for key in keys:
            if key in tweet:
                counts[day] = counts.get(day, 0) + 1
                df = df.drop(idx)
                break
    return df, counts


l = Loader("./data/May_16.csv")
df = l.get_dataframe()
df["created_at"] = pandas.to_datetime(df["created_at"])
dates = df["created_at"].dt.date.to_frame()
dates = np.array(date2num(dates)).flatten()
print("==== Size : {} ====".format(len(df.index)))

# Remove tweets containing keywords mapped to a fixed sentiment
df, counts_leave = count_for_set(df, dates, leave_keys)
df, counts_other = count_for_set(df, dates, other_keys)
df, counts_stay = count_for_set(df, dates, stay_keys)
print("==== Size : {} ====".format(len(df.index)))
print("Days : ", dates)