def get_activity_set(self):
    """Generator that iterates through the entire activity set, from memory or disk."""
    if self.paged and self.output_file_path is not None:
        for file_name in self.paged_file_list:
            with codecs.open(file_name, "rb") as f:
                for res in f:
                    yield Tweet(json.loads(res.decode('utf-8')))
    else:
        for res in self.rec_dict_list:
            yield Tweet(res)
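# A minimal usage sketch (an assumption, not part of the source): `results` is
# taken to be an instance of the class that defines get_activity_set above,
# after a search has completed.
for tw in results.get_activity_set():
    print(tw.id, tw.created_at_string)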
def create_tweet_graph(graph, start_at=None):
    print("Creating tweet graph...")
    all_tweet_files = []
    for data_folder in config.DATA_FOLDERS:
        data_path = path.join(config.DATA_DIR, data_folder, "*.jsonl.gz")
        all_tweet_files.extend(glob.glob(data_path))
    if start_at is not None:
        all_tweet_files = all_tweet_files[all_tweet_files.index(start_at):]
    print("Starting to add tweets...")
    for tweet_file in all_tweet_files:
        print("Adding", tweet_file)
        try:
            with gzip.open(tweet_file, 'rb') as f:
                for tweet_dict in tqdm(f):
                    try:
                        tweet = Tweet(json.loads(tweet_dict))
                        add_tweet_to_graph(graph, tweet)
                    except (json.JSONDecodeError, NotATweetError) as e:
                        # report the raw record; `tweet` may not be bound here
                        print(e)
                        print(tweet_dict)
                    except Exception:
                        print(tweet_dict)
                        raise
        except EOFError as e:
            # truncated gzip file; skip it and move on
            print(e)
            continue
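# Hypothetical driver for create_tweet_graph. The networkx DiGraph is an
# assumption; add_tweet_to_graph is external and may target any graph type.
import networkx as nx

graph = nx.DiGraph()
create_tweet_graph(graph)
print(graph.number_of_nodes(), "nodes after loading")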
def setUp(self):
    tweet_payloads = {}
    tweet_payloads["activity_streams"] = {}
    tweet_payloads["original_format"] = {}
    tweet_ids = []
    for line in fileinput.FileInput(
            "tweet_payload_examples/activity_streams_examples.json"):
        tweet = Tweet(json.loads(line))
        tweet_ids.append(tweet.id)
        tweet_payloads["activity_streams"][tweet.id] = tweet
    for line in fileinput.FileInput(
            "tweet_payload_examples/original_format_examples.json"):
        tweet = Tweet(json.loads(line))
        tweet_ids.append(tweet.id)
        tweet_payloads["original_format"][tweet.id] = tweet
    self.tweet_payloads = tweet_payloads
    self.tweet_ids = list(set(tweet_ids))
def parse(self):
    for line in fileinput.FileInput(self.file):
        try:
            tweet_dict = json.loads(line)
            tweet = Tweet(tweet_dict)
        except (json.JSONDecodeError, NotATweetError):
            # skip invalid lines; otherwise `tweet` below would be stale or unbound
            continue
        print(tweet.all_text)
def parse_tweet(json_string):
    """Parse JSON string to Tweet object. Returns None if parsing fails."""
    tweet = None
    try:
        tweet_dict = json.loads(json_string)
        tweet = Tweet(tweet_dict)
    except (json.JSONDecodeError, NotATweetError):
        logging.debug('record is not a Tweet: {}'.format(json_string[:75]))
    return tweet
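# Minimal sketch exercising parse_tweet; both payloads below are deliberately
# invalid (illustrative, not from the source), so each call should return None
# rather than raise.
assert parse_tweet("not even json") is None            # JSONDecodeError path
assert parse_tweet('{"not": "a tweet"}') is None       # NotATweetError path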
def on_data(self, data):
    # forward the raw payload to Kafka, then parse it for display
    producer.send("tweets", data.encode('utf-8'))
    tweet_dict = json.loads(data)
    tweet = Tweet(tweet_dict)
    print(tweet.created_at_string, tweet.all_text)
    #print(tweet)
    f.write(data)
    return True
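# Hedged wiring sketch for the globals the listener above relies on, using
# kafka-python; the broker address and output file name are assumptions.
from kafka import KafkaProducer

producer = KafkaProducer(bootstrap_servers="localhost:9092")
f = open("tweets_backup.jsonl", "a")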
def open_tweets(filename):
    tweets = []
    for line in fileinput.FileInput(filename):
        try:
            tweet_dict = json.loads(line)
            tweet = Tweet(tweet_dict)
        except (json.JSONDecodeError, NotATweetError):
            # skip invalid lines instead of re-appending the previous tweet
            continue
        tweets.append(tweet)
    return tweets
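# Illustrative call; the file name is a placeholder for any file of
# newline-delimited Tweet JSON payloads.
tweets = open_tweets("tweets.jsonl")
print("parsed", len(tweets), "tweets")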
def test_bad_payloads(self):
    # missing the user field, raises a "NotATweetError"
    with self.assertRaises(NotATweetError):
        with open("tweet_payload_examples/broken_and_unsupported_payloads/original_format_missing_user.json", "r") as f:
            tweet = json.load(f)
        Tweet(tweet)
    # missing a different required field, raises "UnexpectedFormatError"
    with self.assertRaises(UnexpectedFormatError):
        with open("tweet_payload_examples/broken_and_unsupported_payloads/original_format_missing_field.json", "r") as f:
            tweet = json.load(f)
        Tweet(tweet, do_format_validation=True)
    # missing a different required field, raises "UnexpectedFormatError"
    with self.assertRaises(UnexpectedFormatError):
        with open("tweet_payload_examples/broken_and_unsupported_payloads/activity_streams_missing_field.json", "r") as f:
            tweet = json.load(f)
        Tweet(tweet, do_format_validation=True)
    # added a new field, raises "UnexpectedFormatError"
    with self.assertRaises(UnexpectedFormatError):
        with open("tweet_payload_examples/broken_and_unsupported_payloads/activity_streams_additional_field.json", "r") as f:
            tweet = json.load(f)
        Tweet(tweet, do_format_validation=True)
    # added a new field, raises "UnexpectedFormatError"
    with self.assertRaises(UnexpectedFormatError):
        with open("tweet_payload_examples/broken_and_unsupported_payloads/original_format_additional_field.json", "r") as f:
            tweet = json.load(f)
        Tweet(tweet, do_format_validation=True)
def create_tweet_df():
    """
    Creates a pandas dataframe representing all of the coronavirus tweets
    stored in DATA_DIR/f/*.jsonl.gz where f is in DATA_FOLDERS.

    :return: the tweet dataframe
    """
    columns = ['tweet_id', 'text', 'screen_name', 'user_id', 'name', 'bio',
               'creation_date', 'profile_location', 'tweet_type',
               'embedded_tweet', 'lang', 'hashtags', 'user_mentions',
               'tweet_links', 'generator']
    all_tweet_files = []
    for data_folder in config.DATA_FOLDERS:
        data_path = path.join(config.DATA_DIR, data_folder, "*.jsonl.gz")
        all_tweet_files.extend(glob.glob(data_path))
    # collect rows in a list and build the dataframe once at the end;
    # row-by-row DataFrame.append is deprecated and slow
    rows = []
    for tweet_file in all_tweet_files:
        with gzip.open(tweet_file, 'rb') as f:
            for tweet_dict in tqdm(f):
                try:
                    tweet = Tweet(json.loads(tweet_dict))
                    rows.append({
                        'tweet_id': tweet.id,
                        'text': tweet.all_text,
                        'screen_name': tweet.screen_name,
                        'user_id': tweet.user_id,
                        'name': tweet.name,
                        'bio': tweet.bio,
                        'creation_date': tweet.created_at_datetime,
                        'profile_location': tweet.profile_location,
                        'tweet_type': tweet.tweet_type,
                        'embedded_tweet': tweet.embedded_tweet,
                        'lang': tweet.lang,
                        'hashtags': tweet.hashtags,
                        'user_mentions': tweet.user_mentions,
                        'tweet_links': tweet.tweet_links,
                        'generator': tweet.generator,
                    })
                except (json.JSONDecodeError, NotATweetError) as e:
                    # report the unparseable record and keep going
                    print(e)
                    print(tweet_dict)
                except Exception:
                    print(tweet_dict)
                    raise
    tweet_df = pd.DataFrame(rows, columns=columns)\
        .astype({'tweet_id': 'str', 'user_id': 'int32'})
    return tweet_df
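# Sketch of downstream use, assuming config.DATA_DIR and config.DATA_FOLDERS
# point at real data: tally tweet languages from the assembled dataframe.
tweet_df = create_tweet_df()
print(tweet_df['lang'].value_counts().head(10))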
def execute(self,
            pt_filter,
            max_results=100,
            start=None,
            end=None,
            count_bucket=None,  # None returns json activities
            show_query=False):
    """Execute a query with a filter, maximum results, and start and end dates.
    count_bucket determines the bucket size for the counts endpoint. If
    count_bucket is set to a valid bucket size (minute, hour, or day), the
    activity counts endpoint is used; otherwise, the data endpoint is used."""
    # set class start and stop datetime variables
    self.set_dates(start, end)
    # make a friendlier file name from the rules
    self.name_munger(pt_filter)
    if self.paged or max_results > 500:
        # avoid making many small requests
        max_results = 500
    self.rule_payload = {'query': pt_filter}
    self.rule_payload["maxResults"] = int(max_results)
    if start:
        self.rule_payload["fromDate"] = self.fromDate
    if end:
        self.rule_payload["toDate"] = self.toDate
    # use the proper endpoint url
    self.stream_url = self.end_point
    if count_bucket:
        if not self.end_point.endswith("counts.json"):
            self.stream_url = self.end_point[:-5] + "/counts.json"
        if count_bucket not in ['day', 'minute', 'hour']:
            raise ValueError("Error. Invalid count bucket: %s \n"
                             % str(count_bucket))
        self.rule_payload["bucket"] = count_bucket
        self.rule_payload.pop("maxResults", None)
    # for testing, show the query JSON and stop
    if show_query:
        sys.stderr.write("API query:\n")
        sys.stderr.write(json.dumps(self.rule_payload) + '\n')
        sys.exit()
    # set up variables to catch the data in 3 formats
    self.time_series = []
    self.rec_dict_list = []
    self.res_cnt = 0
    # timing
    self.delta_t = 1  # keeps us from crashing
    # actual oldest tweet before now
    self.oldest_t = datetime.datetime.utcnow()
    # search v2: newest date is more recent than 2006-03-01T00:00:00
    self.newest_t = datetime.datetime(2006, 3, 1)
    for rec in self.parse_responses(count_bucket):
        # parse_responses returns only the last set of activities retrieved,
        # not all paged results. To access the entire set, use the helper
        # functions get_activity_set and get_list_set!
        self.res_cnt += 1
        self.rec_dict_list.append(rec)
        if count_bucket:
            # timeline data
            t = datetime.datetime.strptime(rec["timePeriod"],
                                           TIME_FORMAT_SHORT)
            tmp_tl_list = [rec["timePeriod"], rec["count"], t]
            self.tweet_times_flag = False
        else:
            # json activities: keep track of tweet times for time calculations
            tweet = Tweet(rec)
            t = tweet.created_at_datetime
            tmp_tl_list = [tweet.created_at_seconds, 1, t]
            self.tweet_times_flag = True
        # this list is ***either*** a list of buckets or a list of tweet times!
        self.time_series.append(tmp_tl_list)
        # timeline requests don't return activities!
        if t < self.oldest_t:
            self.oldest_t = t
        if t > self.newest_t:
            self.newest_t = t
        self.delta_t = (self.newest_t - self.oldest_t).total_seconds() / 60.
    return
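# Hedged usage sketch, assuming `query` is a configured instance of the class
# defining execute; the filter string and bucket choice are illustrative only.
query.execute("snow", max_results=100, count_bucket="day")
print(query.res_cnt, "buckets between", query.oldest_t, "and", query.newest_t)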
openhook = None

# parse some tweets
for line in fileinput.FileInput(options.data_files, openhook=openhook):
    csv = []
    # load the JSON
    try:
        tweet_dict = json.loads(line)
    except JSONDecodeError as json_error:
        if not options.pass_bad_json:
            sys.stderr.write(
                "{}. Use the flag '-j' to pass silently next time.\nBad JSON payload: {}"
                .format(json_error, line))
        continue
    # load a Tweet
    try:
        tweet_obj = Tweet(tweet_dict,
                          do_format_validation=options.do_format_validation)
    except NotATweetError as nate:
        if not options.pass_non_tweet:
            sys.stderr.write(
                "{}. Use the flag '-t' to pass silently next time.\nNon Tweet payload: {}"
                .format(nate, line))
        continue
    # get the relevant fields
    for func in functions:
        try:
            attribute = getattr(tweet_obj, func)
            if sys.version_info[0] == 3:
                csv.append(str(attribute))
            else:
                if isinstance(attribute, str) or isinstance(attribute, unicode):
from tweet_parser.tweet import Tweet
from tweet_parser.tweet_parser_errors import NotATweetError
import fileinput
import json
import pprint

for line in fileinput.FileInput("dataTest.json"):
    print(line)
    try:
        tweet_dict = json.loads(line)
        print(tweet_dict)
        tweet = Tweet(tweet_dict)
        print(tweet)
    except (json.JSONDecodeError, NotATweetError):
        # skip invalid lines so the print below never sees a stale tweet
        continue
    print(tweet.created_at_string, tweet.all_text)
    openhook = fileinput.hook_compressed
else:
    openhook = None

# parse some tweets
for line in fileinput.FileInput(options.data_files, openhook=openhook):
    csv = []
    # load the JSON
    try:
        tweet_dict = json.loads(line)
    except JSONDecodeError as json_error:
        if not options.pass_bad_json:
            sys.stderr.write(
                "{}. Use the flag '-j' to pass silently next time.\nBad JSON payload: {}"
                .format(json_error, line))
        continue
    # load a Tweet
    try:
        tweet_obj = Tweet(tweet_dict,
                          do_format_checking=options.do_format_checking)
    except NotATweetError as nate:
        if not options.pass_non_tweet:
            sys.stderr.write(
                "{}. Use the flag '-t' to pass silently next time.\nNon Tweet payload: {}"
                .format(nate, line))
        continue
    # get the relevant fields
    for func in functions:
        try:
            attribute = getattr(tweet_obj, func)
            if isinstance(attribute, str):
                attribute = attribute.replace(options.delim, " ")\
                    .replace("\n", " ").replace("\r", " ")
            csv.append(str(attribute))
        except NotAvailableError as nae:
            if not options.pass_not_available:
                sys.stderr.write(
                    "{}. Use the flag -a to pass silently next time.\nAttribute Unavailable: {}"
                    .format(nae, line))
            csv.append("NOT_AVAILABLE")
def get_tweet(path):
    t = {}
    unique_u = {}
    Bot = bot.load_bot()
    for line in fileinput.FileInput(path):
        tweet_dict = json.loads(line)
        tweet = Tweet(tweet_dict)
        t_id1 = tweet['id_str']
        u_id1 = tweet['user']['id_str']
        tweet1 = tweet['text']
        screen_name = tweet['user']['screen_name']
        time1 = tweet['created_at']
        unique_u[u_id1] = 1
        if bot.check_bot(Bot, t_id1) == 1:
            continue
        # is this a retweet (or quote tweet)?
        try:
            retweet = tweet.get('retweeted_status', None)
            if retweet is None:
                retweet = tweet.get('quoted_status', None)
            if retweet is None:
                # original tweet: it is its own parent and origin
                t[t_id1] = {'user': u_id1, 'parent': u_id1, 'origin': u_id1,
                            'confirm': True, 'text': tweet1,
                            'origin_tweet': t_id1, 'parent_tweet': t_id1,
                            'tweet': t_id1, 'screen_name': screen_name,
                            'origin_name': screen_name, 'time': time1,
                            'depth': 1}
            else:
                tweet2 = retweet['text']
                t_id2 = retweet['id_str']
                u_id2 = retweet['user']['id_str']
                origin_name = retweet['user']['screen_name']
                time2 = retweet['created_at']
                # record both the retweet and its origin
                t[t_id1] = {'user': u_id1, 'parent': u_id2, 'origin': u_id2,
                            'confirm': False, 'text': tweet1,
                            'origin_tweet': t_id2, 'parent_tweet': t_id2,
                            'tweet': t_id1, 'screen_name': screen_name,
                            'origin_name': origin_name, 'time': time1,
                            'depth': 2}
                t[t_id2] = {'user': u_id2, 'parent': u_id2, 'origin': u_id2,
                            'confirm': True, 'text': tweet2,
                            'origin_tweet': t_id2, 'parent_tweet': t_id2,
                            'tweet': t_id2, 'screen_name': origin_name,
                            'origin_name': origin_name, 'time': time2,
                            'depth': 1}
                unique_u[u_id2] = 1
        except KeyError:
            # no retweeted_status: treat as an original tweet
            print("Key Error Exception!!!!")
            t[t_id1] = {'user': u_id1, 'parent': u_id1, 'origin': u_id1,
                        'confirm': True, 'text': tweet1,
                        'origin_tweet': t_id1, 'parent_tweet': t_id1,
                        'tweet': t_id1, 'screen_name': screen_name,
                        'origin_name': screen_name, 'time': time1,
                        'depth': 1}
    # if the collected follower and friend counts match the unique users, the
    # retweet network can be constructed; also require more than 100 tweets,
    # else return None
    f_count = 0
    fr_count = 0
    for uid in unique_u.keys():
        if os.path.exists('../Data/followers/followers/' + uid):
            f_count += 1
    for uid in unique_u.keys():
        if os.path.exists('../Data/friends/friends/' + uid):
            fr_count += 1
    if len(t) <= 100:
        return 0, None
    print(path)
    print('unique_users : %s , collected followers : %s, collected friends : %s'
          % (len(unique_u), f_count, fr_count))
    if f_count == len(unique_u) and fr_count == len(unique_u):
        print('%s : %s tweets' % (path, len(t)))
        return 1, t
    elif f_count == len(unique_u):
        print('%s : %s tweets' % (path, len(t)))
        return 2, t
    elif fr_count == len(unique_u):
        return 3, t
    else:
        return 0, t
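# Illustrative call; the path is a placeholder for a file of newline-delimited
# tweet JSON payloads.
status, cascade = get_tweet('../Data/tweets/example.jsonl')
if cascade is not None:
    print("status", status, "-", len(cascade), "cascade nodes")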