def twitterToCsv(inputfolder, outputFolder):
    """Convert every tweet-JSON file in *inputfolder* to a one-column
    ('text') CSV placed inside *outputFolder*.

    Fixes vs. the original:
    - the source path is now joined with *inputfolder* (the bare basename
      used to be passed along, so the call only worked when the current
      working directory happened to be the input folder);
    - the output CSV is now written into *outputFolder*, which was
      previously created but never used.
    """
    # exist_ok avoids the exists()/makedirs race of the original check.
    os.makedirs(outputFolder, exist_ok=True)
    for name in os.listdir(inputfolder):
        print(name)
        src = os.path.join(inputfolder, name)
        dst = os.path.join(outputFolder, 'tweets_' + name + '.csv')
        # NOTE(review): the original passed a bare filename string here;
        # confirm whether common.json2csv expects a path or an open file.
        common.json2csv(src, dst, ['text'])
def convert(self, input_file, output_file):
    """Export the standard scalar tweet-metadata fields from *input_file*
    (line-delimited tweet JSON) to *output_file* as CSV.

    Returns 1 on completion (legacy success flag kept for callers).
    """
    columns = [
        'created_at',
        'favorite_count',
        'id',
        'in_reply_to_status_id',
        'in_reply_to_user_id',
        'retweet_count',
        'retweeted',
        'text',
        'truncated',
        'user.id',
    ]
    with open(input_file) as source:
        json2csv(source, output_file, columns)
    return 1
def test_user_metadata(tmp_path, infile):
    """Exporting tweet + user.* columns must reproduce the reference CSV."""
    wanted = [
        "id",
        "text",
        "user.id",
        "user.followers_count",
        "user.friends_count",
    ]
    reference = subdir / "tweets.20150430-223406.user.csv.ref"
    produced = tmp_path / "tweets.20150430-223406.user.csv"
    json2csv(infile, produced, wanted, gzip_compress=False)
    assert files_are_identical(produced, reference)
def test_textoutput(self):
    """Exporting only the 'text' field must reproduce the reference CSV."""
    reference = os.path.join(self.subdir, 'tweets.20150430-223406.text.csv.ref')
    with TemporaryDirectory() as workdir:
        produced = os.path.join(workdir, 'tweets.20150430-223406.text.csv')
        json2csv(self.infile, produced, ['text'], gzip_compress=False)
        self.assertTrue(are_files_identical(produced, reference), msg=self.msg)
def find_matching_tweets(num_tweets=100, fname="matching_tweets.csv", shownum=50):
    """Stream *num_tweets* tweets matching the keyword "trump"
    (case-insensitive), save their id and text to the CSV *fname*,
    and return the first *shownum* rows as a pandas DataFrame.
    Retweets are not removed.

    Fix vs. the original: the stray ``client.sample()`` call has been
    removed — it started a second, *unfiltered* stream after the filtered
    one already collected its quota, contradicting this function's
    documented behavior.
    """
    oauth = credsfromfile()
    # Create and register a streamer that stops after num_tweets tweets.
    client = Streamer(**oauth)
    writer = TweetWriter(limit=num_tweets)
    client.register(writer)
    # Name of the JSON file the writer will create.
    input_file = writer.timestamped_file()
    client.filter(track="trump")  # keyword tracking is case-insensitive
    # Keep only these two fields for now.
    with open(input_file) as fp:
        json2csv(fp, fname, ['id', 'text'])
    # Pretty-print via pandas.
    tweets = pd.read_csv(fname, encoding="utf8")
    return tweets.head(shownum)
def test_user_metadata(self):
    """Export including user.* columns must match the reference CSV."""
    reference = os.path.join(self.subdir, 'tweets.20150430-223406.user.csv.ref')
    columns = ['id', 'text', 'user.id', 'user.followers_count',
               'user.friends_count']
    with TemporaryDirectory() as workdir:
        produced = os.path.join(workdir, 'tweets.20150430-223406.user.csv')
        json2csv(self.infile, produced, columns, gzip_compress=False)
        self.assertTrue(are_files_identical(produced, reference), msg=self.msg)
def test_file_is_wrong(tmp_path, infile):
    """
    Sanity check that file comparison is not giving false positives.
    A text-only export must NOT match the retweet reference file.
    """
    wrong_reference = subdir / "tweets.20150430-223406.retweet.csv.ref"
    produced = tmp_path / "tweets.20150430-223406.text.csv"
    json2csv(infile, produced, ["text"], gzip_compress=False)
    assert not files_are_identical(produced, wrong_reference)
def test_file_is_wrong(self):
    """
    Sanity check that file comparison is not giving false positives.
    A text-only export must NOT match the retweet reference file.
    """
    wrong_reference = os.path.join(
        self.subdir, 'tweets.20150430-223406.retweet.csv.ref'
    )
    with TemporaryDirectory() as workdir:
        produced = os.path.join(workdir, 'tweets.20150430-223406.text.csv')
        json2csv(self.infile, produced, ['text'], gzip_compress=False)
        self.assertFalse(are_files_identical(produced, wrong_reference),
                         msg=self.msg)
def test_user_metadata(tmp_path, infile):
    """Tweet + user.* metadata export must byte-match the reference CSV."""
    stem = 'tweets.20150430-223406.user.csv'
    reference = subdir / (stem + '.ref')
    produced = tmp_path / stem
    columns = ['id', 'text', 'user.id', 'user.followers_count',
               'user.friends_count']
    json2csv(infile, produced, columns, gzip_compress=False)
    assert files_are_identical(produced, reference)
def test_tweet_metadata(self):
    """All scalar tweet-level fields must round-trip to the reference CSV."""
    reference = os.path.join(self.subdir, 'tweets.20150430-223406.tweet.csv.ref')
    columns = [
        'created_at', 'favorite_count', 'id', 'in_reply_to_status_id',
        'in_reply_to_user_id', 'retweet_count', 'retweeted', 'text',
        'truncated', 'user.id',
    ]
    with TemporaryDirectory() as workdir:
        produced = os.path.join(workdir, 'tweets.20150430-223406.tweet.csv')
        json2csv(self.infile, produced, columns, gzip_compress=False)
        self.assertTrue(are_files_identical(produced, reference), msg=self.msg)
def test_tweet_metadata(tmp_path, infile):
    """Scalar tweet-level fields must round-trip to the reference CSV."""
    stem = 'tweets.20150430-223406.tweet.csv'
    reference = subdir / (stem + '.ref')
    produced = tmp_path / stem
    columns = [
        'created_at', 'favorite_count', 'id', 'in_reply_to_status_id',
        'in_reply_to_user_id', 'retweet_count', 'retweeted', 'text',
        'truncated', 'user.id',
    ]
    json2csv(infile, produced, columns, gzip_compress=False)
    assert files_are_identical(produced, reference)
def test_tweet_metadata(tmp_path, infile):
    """Exporting every scalar tweet field must reproduce the reference CSV."""
    wanted = [
        "created_at", "favorite_count", "id", "in_reply_to_status_id",
        "in_reply_to_user_id", "retweet_count", "retweeted", "text",
        "truncated", "user.id",
    ]
    reference = subdir / "tweets.20150430-223406.tweet.csv.ref"
    produced = tmp_path / "tweets.20150430-223406.tweet.csv"
    json2csv(infile, produced, wanted, gzip_compress=False)
    assert files_are_identical(produced, reference)
def read_single_tweets(path):
    """Extract and tokenize the English-language tweet texts from one
    tweet-JSON file.

    The tweet texts are first dumped to ``<dirname(path)>_text/<basename>.csv``
    via json2csv, then read back; rows whose language is detected as English
    are cleaned and tokenized.

    Returns a list of token lists (one per English tweet). Returns an empty
    list when *path* is not a regular file or processing fails.

    Fix vs. the original: the bare ``except:`` (which also swallowed
    KeyboardInterrupt/SystemExit and hid the actual error) is narrowed to
    ``except Exception`` and now reports the error alongside the path.
    """
    word_list = []
    # Guard clause instead of wrapping the whole body in an `if`.
    if not os.path.isfile(path):
        return word_list
    input_tweets = twitter_samples.abspath(os.path.abspath(path))
    output_tweets = os.path.join(
        os.path.dirname(path) + '_text',
        os.path.basename(path) + '.csv',
    )
    os.makedirs(os.path.dirname(output_tweets), exist_ok=True)
    try:
        with open(input_tweets) as fp:
            json2csv(fp, output_tweets, ['text'])
        with open(output_tweets, 'r') as fp:
            for row in csv.DictReader(fp):
                try:
                    tweet = row['text']
                    if detect(tweet) == 'en':
                        word_list.append(clean_and_tokenize(tweet))
                except lang_detect_exception.LangDetectException:
                    # Language undetectable (e.g. emoji/URL-only tweet): skip.
                    continue
    except Exception as err:
        # Best-effort pipeline: report the failing file and carry on,
        # returning whatever was collected so far.
        print(path, err)
    return word_list
def test_textoutput(tmp_path, infile):
    """A text-only export must byte-match the reference CSV."""
    stem = "tweets.20150430-223406.text.csv"
    reference = subdir / (stem + ".ref")
    produced = tmp_path / stem
    json2csv(infile, produced, ["text"], gzip_compress=False)
    assert files_are_identical(produced, reference)
n = 10 # 設定拿取 tweets 資料則數 username = '******' # Query client = Query(**oauth) # 歷史資料 client.register(TweetWriter()) # 寫入 client.user_tweets(username, n) # 拿取 tweets 資料(n則) ''' 使用 json2csv 存取 tweets 資料 (text欄位) input_file 的 abspath 需參考上述 Query 寫入資料的路徑做修改 ''' input_file = twitter_samples.abspath('/Users/youngmihuang/twitter-files/tweets.20180726-155316.json') with open(input_file) as fp: json2csv(fp, 'tweets_text.csv', ['text']) # 讀取 data = pd.read_csv('tweets_text.csv') for line in data.text: print('Trump tweets content: ') print(line) # 斷詞 tokenized = twitter_samples.tokenized(input_file) for tok in tokenized[:5]: print('tokenized: ') print(tok) # tweets 資料處理 with open(input_file) as fp:
from nltk.corpus import twitter_samples
from nltk.twitter.common import json2csv

# The last fileid of the twitter_samples corpus is the ~20k-tweet sample.
jsonfile = twitter_samples.fileids()[-1]
# Resolve via the corpus reader: twitter_samples.abspath points at the
# system corpus location (os.path.abspath would resolve inside the
# virtualenv instead).
input_file = twitter_samples.abspath(jsonfile)

# Export the selected tweet attributes to CSV; the file can later be
# loaded into a pandas DataFrame for stemming/analysis of the texts.
with open(input_file) as fp:
    json2csv(
        fp,
        'tweets_dataframe.csv',
        [
            'id',
            'text',
            'user.favourites_count',
            'user.id',
            'lang',
            'user.followers_count',
            'user.verified',
            'truncated',
        ],
    )
from nltk.twitter.common import json2csv

# Hard-coded path to a locally collected tweet dump (line-delimited JSON).
input_file = '/home/mradul/twitter-files/Tweets.json'

# Export the chosen attributes to CSV for downstream processing
# (e.g. loading into pandas and stemming the tweet texts).
with open(input_file) as fp:
    json2csv(
        fp,
        'boston.csv',
        ['text', 'id', 'from_user', 'iso_language_code'],
    )