Ejemplo n.º 1
0
    def test_retweet_original_tweet(self):
        # Export the 'retweeted_status' entity of the input in self.infile to
        # an uncompressed CSV inside a temporary directory, then compare that
        # CSV with the reference file kept under self.subdir.
        csv_name = 'tweets.20150430-223406.retweet.csv'
        expected_path = os.path.join(self.subdir, csv_name + '.ref')
        retweet_fields = [
            'created_at', 'favorite_count', 'id',
            'in_reply_to_status_id', 'in_reply_to_user_id',
            'retweet_count', 'text', 'truncated', 'user.id',
        ]
        with TemporaryDirectory() as scratch_dir:
            actual_path = os.path.join(scratch_dir, csv_name)
            json2csv_entities(self.infile, actual_path, ['id'],
                              'retweeted_status', retweet_fields,
                              gzip_compress=False)
            # Compare while the temporary directory still exists.
            matches = are_files_identical(actual_path, expected_path)
            self.assertTrue(matches, msg=self.msg)
 def test_tweet_usermention(self):
     # Export the 'user_mentions' entity of the input in self.infile to an
     # uncompressed CSV inside a temporary directory, then compare that CSV
     # with the reference file kept under self.subdir.
     csv_name = 'tweets.20150430-223406.usermention.csv'
     expected_path = os.path.join(self.subdir, csv_name + '.ref')
     with TemporaryDirectory() as scratch_dir:
         actual_path = os.path.join(scratch_dir, csv_name)
         json2csv_entities(
             self.infile,
             actual_path,
             ['id', 'text'],
             'user_mentions',
             ['id', 'screen_name'],
             gzip_compress=False,
         )
         # Compare while the temporary directory still exists.
         matches = are_files_identical(actual_path, expected_path)
         self.assertTrue(matches, msg=self.msg)
    def test_tweet_url(self):
        # Export the 'urls' entity of the input in self.infile to an
        # uncompressed CSV inside a temporary directory, then compare that
        # CSV with the reference file kept under self.subdir.
        csv_name = 'tweets.20150430-223406.url.csv'
        expected_path = os.path.join(self.subdir, csv_name + '.ref')
        with TemporaryDirectory() as scratch_dir:
            actual_path = os.path.join(scratch_dir, csv_name)
            json2csv_entities(
                self.infile,
                actual_path,
                ['id'],
                'urls',
                ['url', 'expanded_url'],
                gzip_compress=False,
            )
            # Compare while the temporary directory still exists.
            matches = are_files_identical(actual_path, expected_path)
            self.assertTrue(matches, msg=self.msg)
    def test_tweet_place_boundingbox(self):
        # Export the 'place.bounding_box' entity of the input in self.infile
        # to an uncompressed CSV inside a temporary directory, then compare
        # that CSV with the reference file kept under self.subdir.
        csv_name = 'tweets.20150430-223406.placeboundingbox.csv'
        expected_path = os.path.join(self.subdir, csv_name + '.ref')
        with TemporaryDirectory() as scratch_dir:
            actual_path = os.path.join(scratch_dir, csv_name)
            json2csv_entities(
                self.infile,
                actual_path,
                ['id', 'name'],
                'place.bounding_box',
                ['coordinates'],
                gzip_compress=False,
            )
            # Compare while the temporary directory still exists.
            matches = are_files_identical(actual_path, expected_path)
            self.assertTrue(matches, msg=self.msg)
    def test_retweet_original_tweet(self):
        # Export the 'retweeted_status' entity of the input in self.infile to
        # an uncompressed CSV inside a temporary directory, then compare that
        # CSV with the reference file kept under self.subdir.
        csv_name = 'tweets.20150430-223406.retweet.csv'
        expected_path = os.path.join(self.subdir, csv_name + '.ref')
        with TemporaryDirectory() as scratch_dir:
            actual_path = os.path.join(scratch_dir, csv_name)
            json2csv_entities(
                self.infile,
                actual_path,
                ['id'],
                'retweeted_status',
                [
                    'created_at',
                    'favorite_count',
                    'id',
                    'in_reply_to_status_id',
                    'in_reply_to_user_id',
                    'retweet_count',
                    'text',
                    'truncated',
                    'user.id',
                ],
                gzip_compress=False,
            )
            # Compare while the temporary directory still exists.
            matches = are_files_identical(actual_path, expected_path)
            self.assertTrue(matches, msg=self.msg)
Ejemplo n.º 6
0
def test_tweet_usermention(tmp_path, infile):
    """Check the 'user_mentions' entity export against its reference CSV.

    Runs ``json2csv_entities`` on ``infile`` without gzip compression, writing
    into ``tmp_path``, and asserts the output is identical to the ``.ref``
    file located under ``subdir``.
    """
    csv_name = 'tweets.20150430-223406.usermention.csv'
    produced = tmp_path / csv_name
    expected = subdir / (csv_name + '.ref')
    main_fields = ['id', 'text']
    entity_fields = ['id', 'screen_name']
    json2csv_entities(
        infile, produced, main_fields, 'user_mentions', entity_fields,
        gzip_compress=False,
    )
    assert files_are_identical(produced, expected)
Ejemplo n.º 7
0
def test_tweet_hashtag(tmp_path, infile):
    """Check the 'hashtags' entity export against its reference CSV.

    Runs ``json2csv_entities`` on ``infile`` without gzip compression, writing
    into ``tmp_path``, and asserts the output is identical to the ``.ref``
    file located under ``subdir``.
    """
    csv_name = 'tweets.20150430-223406.hashtag.csv'
    produced = tmp_path / csv_name
    expected = subdir / (csv_name + '.ref')
    main_fields = ['id', 'text']
    entity_fields = ['text']
    json2csv_entities(
        infile, produced, main_fields, 'hashtags', entity_fields,
        gzip_compress=False,
    )
    assert files_are_identical(produced, expected)
Ejemplo n.º 8
0
def test_tweet_hashtag(tmp_path, infile):
    """Check the "hashtags" entity export against its reference CSV.

    Runs ``json2csv_entities`` on ``infile`` without gzip compression, writing
    into ``tmp_path``, and asserts the output is identical to the ``.ref``
    file located under ``subdir``.
    """
    csv_name = "tweets.20150430-223406.hashtag.csv"
    produced = tmp_path / csv_name
    expected = subdir / (csv_name + ".ref")
    main_fields = ["id", "text"]
    entity_fields = ["text"]
    json2csv_entities(
        infile, produced, main_fields, "hashtags", entity_fields,
        gzip_compress=False,
    )
    assert files_are_identical(produced, expected)
Ejemplo n.º 9
0
def test_tweet_usermention(tmp_path, infile):
    """Check the "user_mentions" entity export against its reference CSV.

    Runs ``json2csv_entities`` on ``infile`` without gzip compression, writing
    into ``tmp_path``, and asserts the output is identical to the ``.ref``
    file located under ``subdir``.
    """
    csv_name = "tweets.20150430-223406.usermention.csv"
    produced = tmp_path / csv_name
    expected = subdir / (csv_name + ".ref")
    main_fields = ["id", "text"]
    entity_fields = ["id", "screen_name"]
    json2csv_entities(
        infile, produced, main_fields, "user_mentions", entity_fields,
        gzip_compress=False,
    )
    assert files_are_identical(produced, expected)
Ejemplo n.º 10
0
def test_tweet_media(tmp_path, infile):
    """Check the "media" entity export against its reference CSV.

    Runs ``json2csv_entities`` on ``infile`` without gzip compression, writing
    into ``tmp_path``, and asserts the output is identical to the ``.ref``
    file located under ``subdir``.
    """
    csv_name = "tweets.20150430-223406.media.csv"
    produced = tmp_path / csv_name
    expected = subdir / (csv_name + ".ref")
    main_fields = ["id"]
    entity_fields = ["media_url", "url"]
    json2csv_entities(
        infile, produced, main_fields, "media", entity_fields,
        gzip_compress=False,
    )
    assert files_are_identical(produced, expected)
 def test_tweet_hashtag(self):
     # Export the 'hashtags' entity of the input in self.infile to an
     # uncompressed CSV inside a temporary directory, then compare that CSV
     # with the reference file kept under self.subdir.
     csv_name = 'tweets.20150430-223406.hashtag.csv'
     expected_path = os.path.join(self.subdir, csv_name + '.ref')
     main_fields = ['id', 'text']
     entity_fields = ['text']
     with TemporaryDirectory() as scratch_dir:
         actual_path = os.path.join(scratch_dir, csv_name)
         json2csv_entities(self.infile, actual_path, main_fields,
                           'hashtags', entity_fields, gzip_compress=False)
         # Compare while the temporary directory still exists.
         matches = are_files_identical(actual_path, expected_path)
         self.assertTrue(matches, msg=self.msg)
Ejemplo n.º 12
0
def test_tweet_place_boundingbox(tmp_path, infile):
    """Check the 'place.bounding_box' entity export against its reference CSV.

    Runs ``json2csv_entities`` on ``infile`` without gzip compression, writing
    into ``tmp_path``, and asserts the output is identical to the ``.ref``
    file located under ``subdir``.
    """
    csv_name = 'tweets.20150430-223406.placeboundingbox.csv'
    produced = tmp_path / csv_name
    expected = subdir / (csv_name + '.ref')
    main_fields = ['id', 'name']
    entity_fields = ['coordinates']
    json2csv_entities(
        infile, produced, main_fields, 'place.bounding_box', entity_fields,
        gzip_compress=False,
    )
    assert files_are_identical(produced, expected)
Ejemplo n.º 13
0
def test_tweet_place_boundingbox(tmp_path, infile):
    """Check the "place.bounding_box" entity export against its reference CSV.

    Runs ``json2csv_entities`` on ``infile`` without gzip compression, writing
    into ``tmp_path``, and asserts the output is identical to the ``.ref``
    file located under ``subdir``.
    """
    csv_name = "tweets.20150430-223406.placeboundingbox.csv"
    produced = tmp_path / csv_name
    expected = subdir / (csv_name + ".ref")
    main_fields = ["id", "name"]
    entity_fields = ["coordinates"]
    json2csv_entities(
        infile, produced, main_fields, "place.bounding_box", entity_fields,
        gzip_compress=False,
    )
    assert files_are_identical(produced, expected)
Ejemplo n.º 14
0
def test_userurl(tmp_path, infile):
    """Check the 'user.urls' entity export against its reference CSV.

    Runs ``json2csv_entities`` on ``infile`` without gzip compression, writing
    into ``tmp_path``, and asserts the output is identical to the ``.ref``
    file located under ``subdir``.
    """
    csv_name = 'tweets.20150430-223406.userurl.csv'
    produced = tmp_path / csv_name
    expected = subdir / (csv_name + '.ref')
    main_fields = ['id', 'screen_name']
    entity_fields = ['url', 'expanded_url']
    json2csv_entities(
        infile, produced, main_fields, 'user.urls', entity_fields,
        gzip_compress=False,
    )
    assert files_are_identical(produced, expected)
Ejemplo n.º 15
0
def test_tweet_media(tmp_path, infile):
    """Check the 'media' entity export against its reference CSV.

    Runs ``json2csv_entities`` on ``infile`` without gzip compression, writing
    into ``tmp_path``, and asserts the output is identical to the ``.ref``
    file located under ``subdir``.
    """
    csv_name = 'tweets.20150430-223406.media.csv'
    produced = tmp_path / csv_name
    expected = subdir / (csv_name + '.ref')
    main_fields = ['id']
    entity_fields = ['media_url', 'url']
    json2csv_entities(
        infile, produced, main_fields, 'media', entity_fields,
        gzip_compress=False,
    )
    assert files_are_identical(produced, expected)
Ejemplo n.º 16
0
def test_userurl(tmp_path, infile):
    """Check the "user.urls" entity export against its reference CSV.

    Runs ``json2csv_entities`` on ``infile`` without gzip compression, writing
    into ``tmp_path``, and asserts the output is identical to the ``.ref``
    file located under ``subdir``.
    """
    csv_name = "tweets.20150430-223406.userurl.csv"
    produced = tmp_path / csv_name
    expected = subdir / (csv_name + ".ref")
    main_fields = ["id", "screen_name"]
    entity_fields = ["url", "expanded_url"]
    json2csv_entities(
        infile, produced, main_fields, "user.urls", entity_fields,
        gzip_compress=False,
    )
    assert files_are_identical(produced, expected)
Ejemplo n.º 17
0
def test_tweet_place(tmp_path, infile):
    """Check the "place" entity export against its reference CSV.

    Runs ``json2csv_entities`` on ``infile`` without gzip compression, writing
    into ``tmp_path``, and asserts the output is identical to the ``.ref``
    file located under ``subdir``.
    """
    csv_name = "tweets.20150430-223406.place.csv"
    produced = tmp_path / csv_name
    expected = subdir / (csv_name + ".ref")
    main_fields = ["id", "text"]
    entity_fields = ["name", "country"]
    json2csv_entities(
        infile, produced, main_fields, "place", entity_fields,
        gzip_compress=False,
    )
    assert files_are_identical(produced, expected)
Ejemplo n.º 18
0
def test_tweet_place(tmp_path, infile):
    """Check the 'place' entity export against its reference CSV.

    Runs ``json2csv_entities`` on ``infile`` without gzip compression, writing
    into ``tmp_path``, and asserts the output is identical to the ``.ref``
    file located under ``subdir``.
    """
    csv_name = 'tweets.20150430-223406.place.csv'
    produced = tmp_path / csv_name
    expected = subdir / (csv_name + '.ref')
    main_fields = ['id', 'text']
    entity_fields = ['name', 'country']
    json2csv_entities(
        infile, produced, main_fields, 'place', entity_fields,
        gzip_compress=False,
    )
    assert files_are_identical(produced, expected)
Ejemplo n.º 19
0
 def test_tweet_hashtag(self):
     # Export the 'hashtags' entity of the input in self.infile to an
     # uncompressed CSV inside a temporary directory, then compare that CSV
     # with the reference file kept under self.subdir.
     csv_name = 'tweets.20150430-223406.hashtag.csv'
     expected_path = os.path.join(self.subdir, csv_name + '.ref')
     main_fields = ['id', 'text']
     entity_fields = ['text']
     with TemporaryDirectory() as scratch_dir:
         actual_path = os.path.join(scratch_dir, csv_name)
         json2csv_entities(self.infile, actual_path, main_fields,
                           'hashtags', entity_fields, gzip_compress=False)
         # Compare while the temporary directory still exists.
         matches = are_files_identical(actual_path, expected_path)
         self.assertTrue(matches, msg=self.msg)
Ejemplo n.º 20
0
def test_retweet_original_tweet(tmp_path, infile):
    """Check the 'retweeted_status' entity export against its reference CSV.

    Runs ``json2csv_entities`` on ``infile`` without gzip compression, writing
    into ``tmp_path``, and asserts the output is identical to the ``.ref``
    file located under ``subdir``.
    """
    csv_name = 'tweets.20150430-223406.retweet.csv'
    produced = tmp_path / csv_name
    expected = subdir / (csv_name + '.ref')
    main_fields = ['id']
    entity_fields = [
        'created_at', 'favorite_count', 'id',
        'in_reply_to_status_id', 'in_reply_to_user_id',
        'retweet_count', 'text', 'truncated', 'user.id',
    ]
    json2csv_entities(
        infile, produced, main_fields, 'retweeted_status', entity_fields,
        gzip_compress=False,
    )
    assert files_are_identical(produced, expected)
Ejemplo n.º 21
0
def test_retweet_original_tweet(tmp_path, infile):
    """Check the "retweeted_status" entity export against its reference CSV.

    Runs ``json2csv_entities`` on ``infile`` without gzip compression, writing
    into ``tmp_path``, and asserts the output is identical to the ``.ref``
    file located under ``subdir``.
    """
    csv_name = "tweets.20150430-223406.retweet.csv"
    produced = tmp_path / csv_name
    expected = subdir / (csv_name + ".ref")
    main_fields = ["id"]
    entity_fields = [
        "created_at", "favorite_count", "id",
        "in_reply_to_status_id", "in_reply_to_user_id",
        "retweet_count", "text", "truncated", "user.id",
    ]
    json2csv_entities(
        infile, produced, main_fields, "retweeted_status", entity_fields,
        gzip_compress=False,
    )
    assert files_are_identical(produced, expected)
Ejemplo n.º 22
0
    json2csv(fp, 'tweets_text.csv', ['text'])

# Read the exported tweet text back and print every entry.
data = pd.read_csv('tweets_text.csv')
for line in data.text:
    print('Trump tweets content: ', line, sep='\n')

# Tokenize the input and show the first five results.
tokenized = twitter_samples.tokenized(input_file)
for tok in tokenized[:5]:
    print('tokenized: ', tok, sep='\n')

# Tweet data processing: write one CSV per entity type. Each entry holds the
# positional arguments passed to json2csv_entities after the file handle:
# (output file, main fields, entity type, entity fields).
entity_exports = [
    ('tweets.20180726-155316.hashtags.csv',
     ['id', 'text'], 'hashtags', ['text']),
    ('tweets.20180726-155316.user_mentions.csv',
     ['id', 'text'], 'user_mentions', ['id', 'screen_name']),
    ('tweets.20180726-155316.media.csv',
     ['id'], 'media', ['media_url', 'url']),
    ('tweets.20180726-155316.urls.csv',
     ['id'], 'urls', ['url', 'expanded_url']),
]
for export_args in entity_exports:
    # The input file is reopened for every export.
    with open(input_file) as fp:
        json2csv_entities(fp, *export_args)