def test_retweet_original_tweet(self):
    """Write the 'retweeted_status' entity CSV and compare it with the reference file."""
    expected_csv = os.path.join(self.subdir, 'tweets.20150430-223406.retweet.csv.ref')
    retweet_fields = [
        'created_at',
        'favorite_count',
        'id',
        'in_reply_to_status_id',
        'in_reply_to_user_id',
        'retweet_count',
        'text',
        'truncated',
        'user.id',
    ]
    with TemporaryDirectory() as workdir:
        produced_csv = os.path.join(workdir, 'tweets.20150430-223406.retweet.csv')
        json2csv_entities(
            self.infile,
            produced_csv,
            ['id'],
            'retweeted_status',
            retweet_fields,
            gzip_compress=False,
        )
        # Compare before the temporary directory (and the output in it) is removed.
        same = are_files_identical(produced_csv, expected_csv)
        self.assertTrue(same, msg=self.msg)
def test_tweet_usermention(self):
    """Write the 'user_mentions' entity CSV and compare it with the reference file."""
    expected_csv = os.path.join(self.subdir, 'tweets.20150430-223406.usermention.csv.ref')
    tweet_fields = ['id', 'text']
    mention_fields = ['id', 'screen_name']
    with TemporaryDirectory() as workdir:
        produced_csv = os.path.join(workdir, 'tweets.20150430-223406.usermention.csv')
        json2csv_entities(
            self.infile,
            produced_csv,
            tweet_fields,
            'user_mentions',
            mention_fields,
            gzip_compress=False,
        )
        # Compare before the temporary directory (and the output in it) is removed.
        same = are_files_identical(produced_csv, expected_csv)
        self.assertTrue(same, msg=self.msg)
def test_tweet_url(self):
    """Write the 'urls' entity CSV and compare it with the reference file."""
    expected_csv = os.path.join(self.subdir, 'tweets.20150430-223406.url.csv.ref')
    tweet_fields = ['id']
    url_fields = ['url', 'expanded_url']
    with TemporaryDirectory() as workdir:
        produced_csv = os.path.join(workdir, 'tweets.20150430-223406.url.csv')
        json2csv_entities(
            self.infile,
            produced_csv,
            tweet_fields,
            'urls',
            url_fields,
            gzip_compress=False,
        )
        # Compare before the temporary directory (and the output in it) is removed.
        same = are_files_identical(produced_csv, expected_csv)
        self.assertTrue(same, msg=self.msg)
def test_tweet_place_boundingbox(self):
    """Write the 'place.bounding_box' entity CSV and compare it with the reference file."""
    expected_csv = os.path.join(
        self.subdir, 'tweets.20150430-223406.placeboundingbox.csv.ref'
    )
    tweet_fields = ['id', 'name']
    box_fields = ['coordinates']
    with TemporaryDirectory() as workdir:
        produced_csv = os.path.join(workdir, 'tweets.20150430-223406.placeboundingbox.csv')
        json2csv_entities(
            self.infile,
            produced_csv,
            tweet_fields,
            'place.bounding_box',
            box_fields,
            gzip_compress=False,
        )
        # Compare before the temporary directory (and the output in it) is removed.
        same = are_files_identical(produced_csv, expected_csv)
        self.assertTrue(same, msg=self.msg)
def test_retweet_original_tweet(self):
    """Write the 'retweeted_status' entity CSV and compare it with the reference file."""
    reference_path = os.path.join(self.subdir, 'tweets.20150430-223406.retweet.csv.ref')
    original_tweet_fields = [
        'created_at',
        'favorite_count',
        'id',
        'in_reply_to_status_id',
        'in_reply_to_user_id',
        'retweet_count',
        'text',
        'truncated',
        'user.id',
    ]
    with TemporaryDirectory() as scratch:
        result_path = os.path.join(scratch, 'tweets.20150430-223406.retweet.csv')
        json2csv_entities(
            self.infile,
            result_path,
            ['id'],
            'retweeted_status',
            original_tweet_fields,
            gzip_compress=False,
        )
        # The check has to run while the scratch directory still exists.
        identical = are_files_identical(result_path, reference_path)
        self.assertTrue(identical, msg=self.msg)
def test_tweet_usermention(tmp_path, infile):
    """Write the 'user_mentions' entity CSV under tmp_path and compare it with the reference."""
    expected = subdir / 'tweets.20150430-223406.usermention.csv.ref'
    produced = tmp_path / 'tweets.20150430-223406.usermention.csv'
    tweet_fields = ['id', 'text']
    mention_fields = ['id', 'screen_name']

    json2csv_entities(
        infile, produced, tweet_fields, 'user_mentions', mention_fields, gzip_compress=False
    )

    assert files_are_identical(produced, expected)
def test_tweet_hashtag(tmp_path, infile):
    """Write the 'hashtags' entity CSV under tmp_path and compare it with the reference."""
    expected = subdir / 'tweets.20150430-223406.hashtag.csv.ref'
    produced = tmp_path / 'tweets.20150430-223406.hashtag.csv'
    tweet_fields = ['id', 'text']
    hashtag_fields = ['text']

    json2csv_entities(
        infile, produced, tweet_fields, 'hashtags', hashtag_fields, gzip_compress=False
    )

    assert files_are_identical(produced, expected)
def test_tweet_hashtag(tmp_path, infile):
    """Write the "hashtags" entity CSV under tmp_path and compare it with the reference."""
    reference_csv = subdir / "tweets.20150430-223406.hashtag.csv.ref"
    output_csv = tmp_path / "tweets.20150430-223406.hashtag.csv"
    tweet_fields = ["id", "text"]
    hashtag_fields = ["text"]

    json2csv_entities(
        infile, output_csv, tweet_fields, "hashtags", hashtag_fields, gzip_compress=False
    )

    assert files_are_identical(output_csv, reference_csv)
def test_tweet_usermention(tmp_path, infile):
    """Write the "user_mentions" entity CSV under tmp_path and compare it with the reference."""
    reference_csv = subdir / "tweets.20150430-223406.usermention.csv.ref"
    output_csv = tmp_path / "tweets.20150430-223406.usermention.csv"
    tweet_fields = ["id", "text"]
    mention_fields = ["id", "screen_name"]

    json2csv_entities(
        infile, output_csv, tweet_fields, "user_mentions", mention_fields, gzip_compress=False
    )

    assert files_are_identical(output_csv, reference_csv)
def test_tweet_media(tmp_path, infile):
    """Write the "media" entity CSV under tmp_path and compare it with the reference."""
    reference_csv = subdir / "tweets.20150430-223406.media.csv.ref"
    output_csv = tmp_path / "tweets.20150430-223406.media.csv"
    tweet_fields = ["id"]
    media_fields = ["media_url", "url"]

    json2csv_entities(
        infile, output_csv, tweet_fields, "media", media_fields, gzip_compress=False
    )

    assert files_are_identical(output_csv, reference_csv)
def test_tweet_hashtag(self):
    """Write the 'hashtags' entity CSV and compare it with the reference file."""
    expected_csv = os.path.join(self.subdir, 'tweets.20150430-223406.hashtag.csv.ref')
    tweet_fields = ['id', 'text']
    hashtag_fields = ['text']
    with TemporaryDirectory() as workdir:
        produced_csv = os.path.join(workdir, 'tweets.20150430-223406.hashtag.csv')
        json2csv_entities(
            self.infile,
            produced_csv,
            tweet_fields,
            'hashtags',
            hashtag_fields,
            gzip_compress=False,
        )
        # Compare before the temporary directory (and the output in it) is removed.
        same = are_files_identical(produced_csv, expected_csv)
        self.assertTrue(same, msg=self.msg)
def test_tweet_place_boundingbox(tmp_path, infile):
    """Write the 'place.bounding_box' entity CSV under tmp_path and compare it with the reference."""
    expected = subdir / 'tweets.20150430-223406.placeboundingbox.csv.ref'
    produced = tmp_path / 'tweets.20150430-223406.placeboundingbox.csv'
    tweet_fields = ['id', 'name']
    box_fields = ['coordinates']

    json2csv_entities(
        infile, produced, tweet_fields, 'place.bounding_box', box_fields, gzip_compress=False
    )

    assert files_are_identical(produced, expected)
def test_tweet_place_boundingbox(tmp_path, infile):
    """Write the "place.bounding_box" entity CSV under tmp_path and compare it with the reference."""
    reference_csv = subdir / "tweets.20150430-223406.placeboundingbox.csv.ref"
    output_csv = tmp_path / "tweets.20150430-223406.placeboundingbox.csv"
    tweet_fields = ["id", "name"]
    box_fields = ["coordinates"]

    json2csv_entities(
        infile, output_csv, tweet_fields, "place.bounding_box", box_fields, gzip_compress=False
    )

    assert files_are_identical(output_csv, reference_csv)
def test_userurl(tmp_path, infile):
    """Write the 'user.urls' entity CSV under tmp_path and compare it with the reference."""
    expected = subdir / 'tweets.20150430-223406.userurl.csv.ref'
    produced = tmp_path / 'tweets.20150430-223406.userurl.csv'
    main_fields = ['id', 'screen_name']
    url_fields = ['url', 'expanded_url']

    json2csv_entities(
        infile, produced, main_fields, 'user.urls', url_fields, gzip_compress=False
    )

    assert files_are_identical(produced, expected)
def test_tweet_media(tmp_path, infile):
    """Write the 'media' entity CSV under tmp_path and compare it with the reference."""
    expected = subdir / 'tweets.20150430-223406.media.csv.ref'
    produced = tmp_path / 'tweets.20150430-223406.media.csv'
    tweet_fields = ['id']
    media_fields = ['media_url', 'url']

    json2csv_entities(
        infile, produced, tweet_fields, 'media', media_fields, gzip_compress=False
    )

    assert files_are_identical(produced, expected)
def test_userurl(tmp_path, infile):
    """Write the "user.urls" entity CSV under tmp_path and compare it with the reference."""
    reference_csv = subdir / "tweets.20150430-223406.userurl.csv.ref"
    output_csv = tmp_path / "tweets.20150430-223406.userurl.csv"
    main_fields = ["id", "screen_name"]
    url_fields = ["url", "expanded_url"]

    json2csv_entities(
        infile, output_csv, main_fields, "user.urls", url_fields, gzip_compress=False
    )

    assert files_are_identical(output_csv, reference_csv)
def test_tweet_place(tmp_path, infile):
    """Write the "place" entity CSV under tmp_path and compare it with the reference."""
    reference_csv = subdir / "tweets.20150430-223406.place.csv.ref"
    output_csv = tmp_path / "tweets.20150430-223406.place.csv"
    tweet_fields = ["id", "text"]
    place_fields = ["name", "country"]

    json2csv_entities(
        infile, output_csv, tweet_fields, "place", place_fields, gzip_compress=False
    )

    assert files_are_identical(output_csv, reference_csv)
def test_tweet_place(tmp_path, infile):
    """Write the 'place' entity CSV under tmp_path and compare it with the reference."""
    expected = subdir / 'tweets.20150430-223406.place.csv.ref'
    produced = tmp_path / 'tweets.20150430-223406.place.csv'
    tweet_fields = ['id', 'text']
    place_fields = ['name', 'country']

    json2csv_entities(
        infile, produced, tweet_fields, 'place', place_fields, gzip_compress=False
    )

    assert files_are_identical(produced, expected)
def test_retweet_original_tweet(tmp_path, infile):
    """Write the 'retweeted_status' entity CSV under tmp_path and compare it with the reference."""
    expected = subdir / 'tweets.20150430-223406.retweet.csv.ref'
    produced = tmp_path / 'tweets.20150430-223406.retweet.csv'
    tweet_fields = ['id']
    retweet_fields = [
        'created_at',
        'favorite_count',
        'id',
        'in_reply_to_status_id',
        'in_reply_to_user_id',
        'retweet_count',
        'text',
        'truncated',
        'user.id',
    ]

    json2csv_entities(
        infile, produced, tweet_fields, 'retweeted_status', retweet_fields, gzip_compress=False
    )

    assert files_are_identical(produced, expected)
def test_retweet_original_tweet(tmp_path, infile):
    """Write the "retweeted_status" entity CSV under tmp_path and compare it with the reference."""
    reference_csv = subdir / "tweets.20150430-223406.retweet.csv.ref"
    output_csv = tmp_path / "tweets.20150430-223406.retweet.csv"
    tweet_fields = ["id"]
    retweet_fields = [
        "created_at",
        "favorite_count",
        "id",
        "in_reply_to_status_id",
        "in_reply_to_user_id",
        "retweet_count",
        "text",
        "truncated",
        "user.id",
    ]

    json2csv_entities(
        infile, output_csv, tweet_fields, "retweeted_status", retweet_fields, gzip_compress=False
    )

    assert files_are_identical(output_csv, reference_csv)
json2csv(fp, 'tweets_text.csv', ['text']) # 讀取 data = pd.read_csv('tweets_text.csv') for line in data.text: print('Trump tweets content: ') print(line) # 斷詞 tokenized = twitter_samples.tokenized(input_file) for tok in tokenized[:5]: print('tokenized: ') print(tok) # tweets 資料處理 with open(input_file) as fp: json2csv_entities(fp, 'tweets.20180726-155316.hashtags.csv', ['id', 'text'], 'hashtags', ['text']) with open(input_file) as fp: json2csv_entities(fp, 'tweets.20180726-155316.user_mentions.csv', ['id', 'text'], 'user_mentions', ['id', 'screen_name']) with open(input_file) as fp: json2csv_entities(fp, 'tweets.20180726-155316.media.csv', ['id'], 'media', ['media_url', 'url']) with open(input_file) as fp: json2csv_entities(fp, 'tweets.20180726-155316.urls.csv', ['id'], 'urls', ['url', 'expanded_url'])