def get_activity_set(self):
    """Generator that yields the entire activity set, from memory or from disk."""
    if self.paged and self.output_file_path is not None:
        for file_name in self.paged_file_list:
            with codecs.open(file_name, "rb") as f:
                for res in f:
                    yield Tweet(json.loads(res.decode('utf-8')))
    else:
        for res in self.rec_dict_list:
            yield Tweet(res)
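
A minimal usage sketch for this generator, assuming a `results` object whose search has already completed (the object name and setup are hypothetical):

# Hypothetical usage: stream every Tweet from a finished search,
# whether the results were paged to disk or kept in memory.
for tweet in results.get_activity_set():
    print(tweet.id, tweet.all_text)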
Example #2
def create_tweet_graph(graph, start_at=None):
    print("Creating tweet graph...")
    all_tweet_files = []
    for data_folder in config.DATA_FOLDERS:
        data_path = path.join(config.DATA_DIR, data_folder, "*.jsonl.gz")
        all_tweet_files.extend(glob.glob(data_path))

    if start_at is not None:
        all_tweet_files = all_tweet_files[all_tweet_files.index(start_at):]

    print("Starting to add tweets...")
    for tweet_file in all_tweet_files:
        print("Adding", tweet_file)
        try:
            with gzip.open(tweet_file, 'rb') as f:
                for tweet_dict in tqdm(f):
                    try:
                        tweet = Tweet(json.loads(tweet_dict))
                        add_tweet_to_graph(graph, tweet)
                    except (json.JSONDecodeError, NotATweetError) as e:
                        # skip records that are not valid Tweet JSON
                        print(e)
                        print(tweet_dict)
                    except Exception:
                        # show the offending record before re-raising
                        print(tweet_dict)
                        raise
        except EOFError as e:
            print(e)
            continue
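
The graph-building helper itself is not shown above; here is a minimal sketch of what `add_tweet_to_graph` might look like with networkx, assuming one node per user and per tweet joined by a "posted" edge (this schema is an assumption, not the author's):

import networkx as nx  # assumed graph backend

def add_tweet_to_graph(graph, tweet):
    # Hypothetical schema: a user node and a tweet node, linked by a "posted" edge.
    graph.add_node(tweet.user_id, kind="user", screen_name=tweet.screen_name)
    graph.add_node(tweet.id, kind="tweet", text=tweet.all_text)
    graph.add_edge(tweet.user_id, tweet.id, kind="posted")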
Example #3
def setUp(self):
    tweet_payloads = {}
    tweet_payloads["activity_streams"] = {}
    tweet_payloads["original_format"] = {}
    tweet_ids = []
    for line in fileinput.FileInput(
            "tweet_payload_examples/activity_streams_examples.json"):
        tweet = Tweet(json.loads(line))
        tweet_ids.append(tweet.id)
        tweet_payloads["activity_streams"][tweet.id] = tweet
    for line in fileinput.FileInput(
            "tweet_payload_examples/original_format_examples.json"):
        tweet = Tweet(json.loads(line))
        tweet_ids.append(tweet.id)
        tweet_payloads["original_format"][tweet.id] = tweet
    self.tweet_payloads = tweet_payloads
    self.tweet_ids = list(set(tweet_ids))
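
A hedged sketch of the kind of test this fixture enables, comparing one parsed attribute across the two payload formats (the test name and attribute choice are illustrative, not from the source):

def test_all_text_matches_across_formats(self):
    # Illustrative only: for ids present in both formats, the parsed
    # text should agree regardless of payload flavor.
    for tweet_id in self.tweet_ids:
        as_tweet = self.tweet_payloads["activity_streams"].get(tweet_id)
        of_tweet = self.tweet_payloads["original_format"].get(tweet_id)
        if as_tweet is not None and of_tweet is not None:
            self.assertEqual(as_tweet.all_text, of_tweet.all_text)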
Example #4
	def parse(self):
		for line in fileinput.FileInput(self.file):
			try:
				tweet_dict = json.loads(line)
				tweet = Tweet(tweet_dict)
			except Exception:
				# skip lines that do not parse as Tweets
				continue
			print(tweet.all_text)
Example #5
def parse_tweet(json_string):
    """Parse JSON string to Tweet object. Returns None if parsing fails."""
    tweet = None
    try:
        tweet_dict = json.loads(json_string)
        tweet = Tweet(tweet_dict)
    except (json.JSONDecodeError, NotATweetError):
        logging.debug('record is not a Tweet: {}'.format(json_string[:75]))
    return tweet
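
A minimal usage sketch for parse_tweet over a newline-delimited JSON file (the filename is hypothetical):

# Hypothetical usage: keep only the records that parse as Tweets.
with open("tweets.jsonl") as infile:
    tweets = [t for t in map(parse_tweet, infile) if t is not None]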
Example #6
    def on_data(self, data):
        # `producer` and `f` are assumed to be a message producer (e.g. Kafka)
        # and an open output file defined elsewhere in this example.
        producer.send("tweets", data.encode('utf-8'))
        tweet_dict = json.loads(data)
        tweet = Tweet(tweet_dict)
        print(tweet.created_at_string, tweet.all_text)
        #print(tweet)

        f.write(data)

        return True
Example #7
def open_tweets(filename):
    tweets = []
    for line in fileinput.FileInput(filename):
        try:
            tweet_dict = json.loads(line)
            tweets.append(Tweet(tweet_dict))
        except (json.JSONDecodeError, NotATweetError):
            # skip records that are not valid Tweet JSON
            pass
    return tweets
Example #8
def test_bad_payloads(self):
    # missing the user field raises a NotATweetError
    with self.assertRaises(NotATweetError):
        with open(
                "tweet_payload_examples/broken_and_unsupported_payloads/original_format_missing_user.json",
                "r") as f:
            tweet = json.load(f)
        Tweet(tweet)
    # an original-format payload missing a required field raises UnexpectedFormatError under validation
    with self.assertRaises(UnexpectedFormatError):
        with open(
                "tweet_payload_examples/broken_and_unsupported_payloads/original_format_missing_field.json",
                "r") as f:
            tweet = json.load(f)
        Tweet(tweet, do_format_validation=True)
    # an activity-streams payload missing a required field raises UnexpectedFormatError under validation
    with self.assertRaises(UnexpectedFormatError):
        with open(
                "tweet_payload_examples/broken_and_unsupported_payloads/activity_streams_missing_field.json",
                "r") as f:
            tweet = json.load(f)
        Tweet(tweet, do_format_validation=True)
    # an activity-streams payload with an unexpected extra field raises UnexpectedFormatError
    with self.assertRaises(UnexpectedFormatError):
        with open(
                "tweet_payload_examples/broken_and_unsupported_payloads/activity_streams_additional_field.json",
                "r") as f:
            tweet = json.load(f)
        Tweet(tweet, do_format_validation=True)
    # an original-format payload with an unexpected extra field raises UnexpectedFormatError
    with self.assertRaises(UnexpectedFormatError):
        with open(
                "tweet_payload_examples/broken_and_unsupported_payloads/original_format_additional_field.json",
                "r") as f:
            tweet = json.load(f)
        Tweet(tweet, do_format_validation=True)
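
As a hedged illustration of the behavior these tests pin down: without do_format_validation, tweet_parser only checks that the payload is recognizably a Tweet; with it, schema deviations raise UnexpectedFormatError (`payload` below is a hypothetical dict that parses as a Tweet but carries an extra top-level field):

tweet = Tweet(payload)  # lenient: the extra field is tolerated
try:
    Tweet(payload, do_format_validation=True)  # strict: the schema is checked
except UnexpectedFormatError as err:
    print("rejected under validation:", err)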
Example #9
def create_tweet_df():
    """
    Create a pandas DataFrame of all the coronavirus tweets stored in
    DATA_DIR/f/*.jsonl.gz for each folder f in DATA_FOLDERS.
    :return: the tweet dataframe
    """
    all_tweet_files = []
    for data_folder in config.DATA_FOLDERS:
        data_path = path.join(config.DATA_DIR, data_folder, "*.jsonl.gz")
        all_tweet_files.extend(glob.glob(data_path))

    # Collect rows in a plain list and build the DataFrame once at the end:
    # DataFrame.append was removed in pandas 2.0 and was quadratic anyway.
    rows = []
    for tweet_file in all_tweet_files:
        with gzip.open(tweet_file, 'rb') as f:
            for tweet_dict in tqdm(f):
                try:
                    tweet = Tweet(json.loads(tweet_dict))
                    rows.append({
                        'tweet_id': tweet.id,
                        'text': tweet.all_text,
                        'screen_name': tweet.screen_name,
                        'user_id': tweet.user_id,
                        'name': tweet.name,
                        'bio': tweet.bio,
                        'creation_date': tweet.created_at_datetime,
                        'profile_location': tweet.profile_location,
                        'tweet_type': tweet.tweet_type,
                        'embedded_tweet': tweet.embedded_tweet,
                        'lang': tweet.lang,
                        'hashtags': tweet.hashtags,
                        'user_mentions': tweet.user_mentions,
                        'tweet_links': tweet.tweet_links,
                        'generator': tweet.generator,
                    })
                except (json.JSONDecodeError, NotATweetError) as e:
                    # skip records that are not valid Tweet JSON
                    print(e)
                    print(tweet_dict)
                except Exception:
                    # show the offending record before re-raising
                    print(tweet_dict)
                    raise
    tweet_df = pd.DataFrame(
        rows,
        columns=['tweet_id', 'text', 'screen_name', 'user_id', 'name', 'bio',
                 'creation_date', 'profile_location', 'tweet_type',
                 'embedded_tweet', 'lang', 'hashtags', 'user_mentions',
                 'tweet_links', 'generator'])\
        .astype({'tweet_id': 'str', 'user_id': 'int32'})
    return tweet_df
Example #10
    def execute(self,
                pt_filter,
                max_results=100,
                start=None,
                end=None,
                count_bucket=None,  # None means the data (JSON) endpoint
                show_query=False):
        """Execute a query with filter, maximum results, start and end dates.

           Count_bucket determines the bucket size for the counts endpoint.
           If the count_bucket variable is set to a valid bucket size such 
           as mintute, day or week, then the acitivity counts endpoint will 
           Otherwise, the data endpoint is used."""
        # set class start and stop datetime variables
        self.set_dates(start, end)
        # make a friendlier file name from the rules
        self.name_munger(pt_filter)
        if self.paged or max_results > 500:
            # avoid making many small requests
            max_results = 500
        self.rule_payload = {'query': pt_filter}
        self.rule_payload["maxResults"] = int(max_results)
        if start:
            self.rule_payload["fromDate"] = self.fromDate
        if end:
            self.rule_payload["toDate"] = self.toDate
        # use the proper endpoint url
        self.stream_url = self.end_point
        if count_bucket:
            if not self.end_point.endswith("counts.json"):
                self.stream_url = self.end_point[:-5] + "/counts.json"
            if count_bucket not in ['day', 'minute', 'hour']:
                raise ValueError("Error. Invalid count bucket: %s \n" %
                                 str(count_bucket))
            self.rule_payload["bucket"] = count_bucket
            self.rule_payload.pop("maxResults", None)
        # for testing, show the query JSON and stop
        if show_query:
            sys.stderr.write("API query:\n")
            sys.stderr.write(json.dumps(self.rule_payload) + '\n')
            sys.exit()
        # set up variable to catch the data in 3 formats
        self.time_series = []
        self.rec_dict_list = []
        self.res_cnt = 0
        # timing
        self.delta_t = 1  # keeps us from crashing
        # actual oldest tweet before now
        self.oldest_t = datetime.datetime.utcnow()
        # seed newest_t with a floor; any tweet is newer than 2006-03-01
        self.newest_t = datetime.datetime(2006, 3, 1)
        #
        for rec in self.parse_responses(count_bucket):
            # parse_responses returns only the last set of activities retrieved, not all paged results.
            # to access the entire set, use the helper functions get_activity_set and get_list_set!
            self.res_cnt += 1
            self.rec_dict_list.append(rec)
            if count_bucket:
                # timeline data
                t = datetime.datetime.strptime(rec["timePeriod"],
                                               TIME_FORMAT_SHORT)
                tmp_tl_list = [rec["timePeriod"], rec["count"], t]
                self.tweet_times_flag = False
            else:
                # json activities
                # keep track of tweet times for time calculation
                tweet = Tweet(rec)
                t = tweet.created_at_datetime
                tmp_tl_list = [tweet.created_at_seconds, 1, t]
                self.tweet_times_flag = True
            # this list is ***either*** list of buckets or list of tweet times!
            self.time_series.append(tmp_tl_list)
            # timeline requests don't return activities!
            if t < self.oldest_t:
                self.oldest_t = t
            if t > self.newest_t:
                self.newest_t = t
            self.delta_t = (self.newest_t -
                            self.oldest_t).total_seconds() / 60.
        return
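
A hedged usage sketch for execute, requesting hourly counts (the query object, rule text, and date strings are illustrative assumptions):

# Hypothetical usage: hourly counts for one rule over a fixed window.
query.execute("snow profile_region:colorado",
              start="2017-01-01", end="2017-02-01",
              count_bucket="hour")
for time_period, count, _ in query.time_series:
    print(time_period, count)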
Example #11
    openhook = None
# parse some tweets
for line in fileinput.FileInput(options.data_files, openhook=openhook):
    csv = []
    # load the JSON
    try:
        tweet_dict = json.loads(line)
    except JSONDecodeError as json_error:
        if not options.pass_bad_json:
            sys.stderr.write(
                "{}. Use the flag '-j' to pass silently next time.\nBad JSON payload: {}"
                .format(json_error, line))
        continue
    # load a Tweet
    try:
        tweet_obj = Tweet(tweet_dict,
                          do_format_validation=options.do_format_validation)
    except NotATweetError as nate:
        if not options.pass_non_tweet:
            sys.stderr.write(
                "{}. Use the flag '-t' to pass silently next time.\nNon Tweet payload: {}"
                .format(nate, line))
        continue
    # get the relevant fields
    for func in functions:
        try:
            attribute = getattr(tweet_obj, func)
            if sys.version_info[0] == 3:
                csv.append(str(attribute))
            else:
                if isinstance(attribute, str) or isinstance(
                        attribute, unicode):
Example #12
from tweet_parser.tweet import Tweet
from tweet_parser.tweet_parser_errors import NotATweetError
import fileinput
import json
import pprint

for line in fileinput.FileInput("dataTest.json"):
    print(line)
    try:
        tweet_dict = json.loads(line)
        print(tweet_dict)
        tweet = Tweet(tweet_dict)
        print(tweet)
    except (json.JSONDecodeError,NotATweetError):
        pass
    print(tweet.created_at_string, tweet.all_text)
Example #13
    openhook = fileinput.hook_compressed
else:
    openhook = None
# parse some tweets
for line in fileinput.FileInput(options.data_files, openhook=openhook):
    csv = []
    # load the JSON
    try:
        tweet_dict = json.loads(line)
    except JSONDecodeError as json_error:
        if not options.pass_bad_json:
            sys.stderr.write("{}. Use the flag '-j' to pass silently next time.\nBad JSON payload: {}".format(json_error, line))
        continue
    # load a Tweet
    try:
        tweet_obj = Tweet(tweet_dict, do_format_checking=options.do_format_checking)
    except NotATweetError as nate:
        if not options.pass_non_tweet:
            sys.stderr.write("{}. Use the flag '-t' to pass silently next time.\nNon Tweet payload: {}".format(nate, line))
        continue
    # get the relevant fields
    for func in functions:
        try:
            attribute = getattr(tweet_obj, func)
            if isinstance(attribute, str):
                attribute = attribute.replace(options.delim, " ").replace("\n", " ").replace("\r", " ")
            csv.append(str(attribute))
        except NotAvailableError as nae:
            if not options.pass_not_available:
                sys.stderr.write("{}. Use the flag -a to pass silently next time.\nAttribute Unavailable: {}".format(nae, line))
            csv.append("NOT_AVAILABLE")
Example #14
def get_tweet(path):
    lines = fileinput.FileInput(path)

    t = {}          # tweet id -> cascade record
    unique_u = {}   # user ids seen in this cascade
    Bot = bot.load_bot()

    for line in lines:
        #print(line)
        tweet_dict = json.loads(line)
        tweet = Tweet(tweet_dict)
        t_id1 = tweet['id_str']
        u_id1 = tweet['user']['id_str']
        tweet1 = tweet['text']
        screen_name = tweet['user']['screen_name']
        time1 = tweet['created_at']
        unique_u[u_id1] = 1

        if bot.check_bot(Bot, t_id1) == 1:
            continue
        # is this a retweet or a quote tweet?
        try:
            retweet = tweet.get('retweeted_status', None)
            if retweet is None:
                retweet = tweet.get('quoted_status', None)

            if retweet is None:
                t[t_id1] = {
                    'user': u_id1,
                    'parent': u_id1,
                    'origin': u_id1,
                    'confirm': True,
                    'text': tweet1,
                    'origin_tweet': t_id1,
                    'parent_tweet': t_id1,
                    'tweet': t_id1,
                    'screen_name': screen_name,
                    'origin_name': screen_name,
                    'time': time1,
                    'depth': 1
                }
            else:
                tweet2 = retweet['text']
                t_id2 = retweet['id_str']
                u_id2 = retweet['user']['id_str']
                origin_name = retweet['user']['screen_name']
                time2 = retweet['created_at']
                t[t_id1] = {
                    'user': u_id1,
                    'parent': u_id2,
                    'origin': u_id2,
                    'confirm': False,
                    'text': tweet1,
                    'origin_tweet': t_id2,
                    'parent_tweet': t_id2,
                    'tweet': t_id1,
                    'screen_name': screen_name,
                    'origin_name': origin_name,
                    'time': time1,
                    'depth': 2
                }
                t[t_id2] = {
                    'user': u_id2,
                    'parent': u_id2,
                    'origin': u_id2,
                    'confirm': True,
                    'text': tweet2,
                    'origin_tweet': t_id2,
                    'parent_tweet': t_id2,
                    'tweet': t_id2,
                    'screen_name': origin_name,
                    'origin_name': origin_name,
                    'time': time2,
                    'depth': 1
                }
                unique_u[u_id2] = 1
        except KeyError:
            # a field was missing while unpacking the retweet;
            # fall back to recording this as an original tweet
            print("KeyError while unpacking retweet; treating as original")
            t[t_id1] = {
                'user': u_id1,
                'parent': u_id1,
                'origin': u_id1,
                'confirm': True,
                'text': tweet1,
                'origin_tweet': t_id1,
                'parent_tweet': t_id1,
                'tweet': t_id1,
                'screen_name': screen_name,
                'origin_name': screen_name,
                'time': time1,
                'depth': 1
            }
        #print(tweet.created_at_string, tweet.all_text)

    # If followers and friends have been collected for every unique user, the
    # full retweet network can be built; also require more than 100 tweets,
    # otherwise return (0, None). The status code returned alongside t means:
    # 1 = followers and friends complete, 2 = followers only, 3 = friends only.

    f_count = 0
    fr_count = 0
    for uid in unique_u.keys():
        user_path = '../Data/followers/followers/' + uid
        if os.path.exists(user_path):
            f_count += 1

    for uid in unique_u.keys():
        user_path = '../Data/friends/friends/' + uid
        if os.path.exists(user_path):
            fr_count += 1

    if len(t) <= 100:
        return 0, None
    print(path)
    print(
        'unique_users : %s , collected followers : %s, collected friends : %s'
        % (len(unique_u), f_count, fr_count))
    if f_count == len(unique_u) and fr_count == len(unique_u):
        print('%s : %s tweets' % (path, len(t)))
        return 1, t
    elif f_count == len(unique_u):
        print('%s : %s tweets' % (path, len(t)))
        return 2, t
    elif fr_count == len(unique_u):
        return 3, t
    else:
        return 0, t
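
A hedged usage sketch for get_tweet, scanning a directory of cascade files and keeping only those whose follower and friend data are complete (the glob path is an assumed layout):

import glob

cascades = {}
for cascade_path in glob.glob('../Data/tweets/*.json'):  # assumed layout
    status, cascade = get_tweet(cascade_path)
    if status == 1 and cascade is not None:  # 1 = followers and friends complete
        cascades[cascade_path] = cascade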