Code example #1
def get_tweet(item):
    """
	takes a tweet id and uses the twarc lib to harvest it
	searches for media in the tweet - if it can find any it also tries to download that media item
	"""
    item.agent_name = agent_name + "_1_get_tweet"
    if not os.path.exists(item.storage_folder):
        os.makedirs(item.storage_folder)
    my_content_types = []
    url = item.url
    if url.endswith("/"):
        url = url[:-1]
    __, __id = url.rsplit("/", 1)

    t = Twarc(twitter_consumer_key, twitter_consumer_secret,
              twitter_access_token, twitter_access_token_secret)
    for tweet in t.hydrate([__id]):
        get_assets(tweet, item.storage_folder)
        file_path = os.path.join(
            item.storage_folder,
            "{}_{}.json".format(time.strftime("%d-%m-%Y_%H-%M-%S"),
                                tweet['id']))
        with open(file_path, "w") as outfile:
            json.dump(tweet, outfile)
    item.completed = True
    return item
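For reference, the core pattern shared by every example on this page is small. A minimal sketch (the credential variables are placeholders, and tweet_ids stands in for any iterable of id strings):

from twarc import Twarc

t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret)
tweet_ids = ["501064188211765249"]
# hydrate() yields full tweet dicts and silently skips ids that have been
# deleted or made private
for tweet in t.hydrate(tweet_ids):
    print(tweet["id_str"], tweet["full_text"][:80])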
Code example #2
    def dehydrate(self, tweet_ids: List[str]):
        t = Twarc(self.configuration["twitter"]["consumer_key"],
                  self.configuration["twitter"]["consumer_secret"],
                  self.configuration["twitter"]["access_token"],
                  self.configuration["twitter"]["access_token_secret"],
                  tweet_mode="extended")
        count: int = 0
        print("Reading tweets from Twitter")
        with tqdm(total=self.configuration["sampling"]["size"],
                  unit="tweet") as written_progress_bar:
            with tqdm(total=len(tweet_ids),
                      unit="tweet") as hydrate_progress_bar:
                for tweet in t.hydrate(tweet_ids):
                    hydrate_progress_bar.update(1)
                    if any(keyword in tweet["full_text"].lower() for keyword in
                           self.configuration["sampling"]["keywords"]):
                        append: bool = True

                        if "only_media" in self.configuration["sampling"].keys(
                        ):
                            if self.configuration["sampling"]["only_media"]:
                                if not self.contains_media(tweet):
                                    append = False

                        if len(self.configuration["sampling"]
                               ["languages"]) > 0:
                            if tweet["lang"] not in self.configuration[
                                    "sampling"]["languages"]:
                                append = False
                        if append:
                            written_progress_bar.update(1)
                            count += 1
                            yield tweet
                        if count == self.configuration["sampling"]["size"]:
                            return
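Note that dehydrate (despite its name) hydrates, and it is a generator, so nothing is fetched from the API until the caller iterates. A hypothetical usage sketch, where sampler is an instance of the surrounding class:

sample = []
for tweet in sampler.dehydrate(tweet_ids):
    # the generator stops on its own once sampling.size matching tweets
    # have been yielded
    sample.append(tweet)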
Code example #3
    def get_training_data(self):
        '''
        :return: combined data (tweet info and TREC-IS data) as a dictionary {tweet_id: Tweet}
        '''
        # load tweets retrieved by the TREC-Tweets downloader
        # retrieved_tweets, f_name = self.load_Tweets()
        # retrieved_tweets, f_name = self.load_event_tweets()
        with open('data/all_tweets.pkl', 'rb') as file:
            retrieved_tweets = pickle.load(file)

        missed_tweets = []
        training_data = {}  # dict {'tweet id': Tweet}

        # load TREC data: tweet IDs, tweet_priority, tweet_categories, indicator_terms
        events = json.load(open(self.trec_path))
        events = pd.DataFrame.from_dict(events['events'], orient='columns')

        for _, event in events.iterrows():

            for trec_tweet in event['tweets']:
                # check whether the full tweet was retrieved
                if trec_tweet['postID'] in retrieved_tweets:
                    retrieved_tweet = retrieved_tweets[trec_tweet['postID']]
                    training_data[trec_tweet['postID']] = Tweet(
                        id=retrieved_tweet.id,
                        text=retrieved_tweet.text,
                        metadata=retrieved_tweet.metadata,
                        priority=trec_tweet['priority'],
                        indicatorTerms=trec_tweet['indicatorTerms'],
                        categories=trec_tweet['categories'],
                        event_type=trec_tweet['event_type'])
                else:
                    # adding missed tweets
                    training_data[trec_tweet['postID']] = Tweet(
                        id=trec_tweet['postID'],
                        priority=trec_tweet['priority'],
                        indicatorTerms=trec_tweet['indicatorTerms'],
                        categories=trec_tweet['categories'],
                        event_type=trec_tweet['event_type'])
                    missed_tweets.append(trec_tweet['postID'])

        # Retrieve the missed tweets with the Twarc tool and combine them with the training data
        t = Twarc(self.consumer_key, self.consumer_secret, self.access_token,
                  self.access_token_secret)

        tweets_twarc = t.hydrate(
            iter(missed_tweets))  # retrieve all tweets by IDs

        for twtt in tweets_twarc:
            training_data[str(twtt['id'])].add_tweets_data(
                twtt['full_text'], {'created_at': twtt['created_at']})

        return training_data
Code example #4
def test_hydrate():
    ids = [
        "501064188211765249", "501064196642340864", "501064197632167936",
        "501064196931330049", "501064198005481472", "501064198009655296",
        "501064198059597824", "501064198513000450", "501064180468682752",
        "501064199142117378", "501064171707170816", "501064200186118145",
        "501064200035516416", "501064201041743872", "501064201251880961",
        "501064198973960192", "501064201256071168", "501064202027798529",
        "501064202245521409", "501064201503113216", "501064202363359232",
        "501064202295848960", "501064202380115971", "501064202904403970",
        "501064203135102977", "501064203508412416", "501064203516407810",
        "501064203546148864", "501064203697156096", "501064204191690752",
        "501064204288540672", "501064197396914176", "501064194309906436",
        "501064204989001728", "501064204980592642", "501064204661850113",
        "501064205400039424", "501064205089665024", "501064206666702848",
        "501064207274868736", "501064197686296576", "501064207623000064",
        "501064207824351232", "501064208083980290", "501064208277319680",
        "501064208398573568", "501064202794971136", "501064208789045248",
        "501064209535614976", "501064209551994881", "501064141332029440",
        "501064207387742210", "501064210177331200", "501064210395037696",
        "501064210693230592", "501064210840035329", "501064211855069185",
        "501064192024006657", "501064200316125184", "501064205642903552",
        "501064212547137536", "501064205382848512", "501064213843169280",
        "501064208562135042", "501064214211870720", "501064214467731457",
        "501064215160172545", "501064209648848896", "501064215990648832",
        "501064216241897472", "501064215759568897", "501064211858870273",
        "501064216522932227", "501064216930160640", "501064217667960832",
        "501064211997274114", "501064212303446016", "501064213675012096",
        "501064218343661568", "501064213951823873", "501064219467341824",
        "501064219677044738", "501064210080473088", "501064220415229953",
        "501064220847656960", "501064222340423681", "501064222772445187",
        "501064222923440130", "501064220121632768", "501064222948593664",
        "501064224936714240", "501064225096499201", "501064225142624256",
        "501064225314185216", "501064225926561794", "501064226451259392",
        "501064226816143361", "501064227302674433", "501064227344646144",
        "501064227688558592", "501064228288364546", "501064228627705857",
        "501064229764751360", "501064229915729921", "501064231304065026",
        "501064231366983681", "501064231387947008", "501064231488200704",
        "501064231941570561", "501064232188665856", "501064232449114112",
        "501064232570724352", "501064232700350464", "501064233186893824",
        "501064233438568450", "501064233774510081", "501064235107897344",
        "501064235175399425", "501064235456401410",
    ]
    t = Twarc()
    count = 0
    for tweet in t.hydrate(iter(ids)):
        assert tweet['id_str']
        count += 1
    assert count > 100 # may need to adjust as these might get deleted
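hydrate() accepts any iterable of ids, including a generator like the iter() call above, since it batches ids into API-sized chunks internally. A minimal sketch of feeding it lazily from a file (t is assumed to be a configured Twarc instance, and ids.txt a hypothetical one-id-per-line file):

def ids_from_file(path):
    # yield one stripped id per line, without loading the whole file
    with open(path) as f:
        for line in f:
            yield line.strip()

for tweet in t.hydrate(ids_from_file("ids.txt")):
    print(tweet["id_str"])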
Code example #5
    def clean_tweets(self):
        self.save_ids()
        t = Twarc()
        with open(f'{self.output_path}/{self.hydrate_name}.csv', 'w',
                  encoding="ISO-8859-1") as file:
            file.write("i,id,year,month,day,tweet\n")
            i = 1
            for tweet in tqdm(t.hydrate(open(f'{self.output_path}/{self.ids_txt}.txt')),
                              desc="parsing tweet ..."):
                try:
                    output = self.filter_tweet(tweet)
                except UnicodeDecodeError:
                    output = "error,error,error,error,error"
                file.write(f'{i},{output}\n')
                i += 1
        os.remove(f'{self.output_path}/{self.ids_txt}.txt')
Code example #6
def hydrate(files, api_key):
    consumer_key = api_key['consumer_key']
    consumer_secret = api_key['consumer_secret']
    access_token = api_key['access_token']
    access_token_secret = api_key['access_token_secret']
    t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret)
    file1 = open(mainpath + 'data/flutids.txt', 'w')
    keywords = ('flu', 'common flu', 'covid19 flu', 'coronavirus common flu')
    records = []
    for tweetIDs in files:

        for tweet in t.hydrate(open(mainpath + dataPath + "/" + tweetIDs)):
            txt = tweet['full_text']
            # keep only English tweets that are not retweets
            if tweet["lang"] == "en" and not tweet['retweeted'] \
                    and 'RT @' not in tweet['full_text']:
                if any(keyword in tweet["full_text"].lower()
                       for keyword in keywords):
                    tid = str(tweet['id_str'])
                    file1.write(tid + '\n')
                    screen_name = tweet['user']['screen_name']
                    # collect hashtags for this tweet; resetting each
                    # iteration keeps tags from leaking between tweets
                    hashtags = [h["text"]
                                for h in tweet["entities"]["hashtags"]] or [""]
                    # keep the last expanded URL, if any
                    url = ""
                    for urls in tweet["entities"]["urls"]:
                        url = str(urls["expanded_url"])
                    retweets = str(tweet['retweet_count'])
                    favorites = str(tweet['favorite_count'])
                    records.append(
                        [screen_name, txt, hashtags, url, retweets, favorites])
    df = pd.DataFrame(records,
                      columns=[
                          'screen_name', 'tweet', 'hashtag', 'url',
                          '#retweets', '#favorites'
                      ])
    df.to_csv(mainpath + 'data/tweets.csv')
    file1.close()
Code example #7
def hydrate_tweets(data, consumer_key, consumer_secret, access_token,
                   access_token_secret):
    t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret)
    tweet_text = []
    favorite_count = []
    retweet_count = []

    # hydrate() skips deleted or protected tweets, so these lists can come
    # back shorter than data; the assignments below assume every id resolves
    for tweet in t.hydrate(data['tweet_id']):
        tweet_text.append(tweet['full_text'])
        favorite_count.append(tweet['favorite_count'])
        retweet_count.append(tweet['retweet_count'])

    data['tweet_text'] = tweet_text
    data['favorite_count'] = favorite_count
    data['retweet_count'] = retweet_count

    data.to_csv("HydratedTweets.csv")
    return data
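Since hydrate() can return fewer tweets than it was given ids, positional assignment can misalign. A safer variant of the function above, purely as a sketch, keys the hydrated tweets by id before merging:

def hydrate_tweets_by_id(data, consumer_key, consumer_secret, access_token,
                         access_token_secret):
    t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret)
    # key each hydrated tweet by its numeric id; missing ids simply map to None
    by_id = {tweet['id']: tweet for tweet in t.hydrate(data['tweet_id'])}
    for column, field in [('tweet_text', 'full_text'),
                          ('favorite_count', 'favorite_count'),
                          ('retweet_count', 'retweet_count')]:
        data[column] = data['tweet_id'].map(
            lambda i, f=field: by_id.get(int(i), {}).get(f))
    data.to_csv("HydratedTweets.csv")
    return data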
Code example #8
def twarc_provider(
    tweet_ids: typing.Iterable[int]
) -> typing.Tuple[typing.Set[int], typing.List[typing.Mapping]]:
    """Get a list of Tweets from their IDs sourced from the Twitter API.

    Uses Twarc Twitter API connector - https://github.com/DocNow/twarc.
    """
    # Twitter API consumer - handles rate limits for us
    t = Twarc(  # pylint: disable=invalid-name
        consumer_key=current_app.config['TWITTER_CONSUMER_KEY'],
        consumer_secret=current_app.config['TWITTER_CONSUMER_SECRET'],
        access_token=current_app.config['TWITTER_ACCESS_TOKEN'],
        access_token_secret=current_app.config['TWITTER_ACCESS_TOKEN_SECRET'],
    )

    found_tweets = list(t.hydrate(tweet_ids))
    found_tweet_ids = {tweet['id'] for tweet in found_tweets}

    return found_tweet_ids, found_tweets
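A usage sketch (assuming a Flask application context, which current_app implies); the returned id set makes it easy to see which ids could not be hydrated:

requested_ids = [501064188211765249, 501064196642340864]
found_ids, tweets = twarc_provider(requested_ids)
missing_ids = set(requested_ids) - found_ids  # deleted, protected, or invalid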
Code example #10
def engage_discourse(TID_PATH=Path('data/tweets_ids/'),
                     LM_PATH=Path('data/tweets/'),
                     USERS_PATH=Path('data/users/')):
    t = Twarc(credentials.CONSUMER_KEY, credentials.CONSUMER_SECRET,
              credentials.ACCESS_TOKEN, credentials.ACCESS_TOKEN_SECRET)
    chunksize = 24000

    users = []
    for doc in USERS_PATH.glob('*.*'):
        with open(doc) as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            next(csv_reader)
            for row in csv_reader:
                users.append(row[0])
    print(f'{len(users)} users in the corpus.')
    for doc in TID_PATH.glob('*.*'):
        start = time.time()
        with open(doc) as tweet_ids_file:
            for count, tweet in enumerate(t.hydrate(tweet_ids_file)):
                process_tweet(tweet)
                if count % 1000 == 0:
                    print(f'{time.time()-start} seconds for {count} tweets.')
        print('Document done!')
    print('Files written!')
Code example #11
def readIdFile(input_file_name, tweets_num=0):
    t = Twarc(CONSUMER_KEY, CONSUMER_KEY_SECRET, ACCESS_TOKEN,
              ACCESS_TOKEN_SECRET)
    inputF = open(input_file_name, "r")
    i = 1
    subIdFileName = "tweets_id_" + str(i // 50000 + 1) + ".txt"
    subIdFile = open(subIdFileName, "a+")
    print("start read")
    line = inputF.readline()
    while line != "" and (tweets_num > 0
                          and i < tweets_num + 1) or tweets_num <= 0:
        subIdFile.write(line)
        line = inputF.readline()
        if i % 50000 == 0 or tweets_num == i or line == "":
            print("Read: " + subIdFileName)
            subIdFile.close()
            # send request
            tweets = t.hydrate(open(subIdFileName))
            tweetsClean(tweets)
            print("Finish read:" + subIdFileName)
            subIdFileName = "./tweets_id_" + str(i // 50000 + 1) + ".txt"
            subIdFile = open("./tweets_id_" + str(i // 50000 + 1) + ".txt",
                             "w")
        i += 1
Code example #12
ids_dir = main_dir + 'data/'
# Make sure you create this folder in the main directory before running this script
target_dir = main_dir + 'data_full/'

# Twitter API Credentials
ACCESS_TOKEN = config.ACCESS_TOKEN
ACCESS_SECRET = config.ACCESS_SECRET
CONSUMER_KEY = config.CONSUMER_KEY
CONSUMER_SECRET = config.CONSUMER_SECRET

t = Twarc(CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_SECRET)

tweet_ids = pd.read_csv(ids_dir + filename + ".csv", lineterminator='\n')
tweet_objects = []

for tweet in t.hydrate(tweet_ids.id.drop_duplicates()):
    tweet_objects.append(tweet)

df_full = pd.DataFrame(
    tweet_objects,
    columns=[
        'created_at', 'id', 'id_str', 'full_text', 'truncated',
        'display_text_range', 'entities', 'source', 'in_reply_to_status_id',
        'in_reply_to_status_id_str', 'in_reply_to_user_id',
        'in_reply_to_user_id_str', 'in_reply_to_screen_name', 'user', 'geo',
        'coordinates', 'place', 'contributors', 'is_quote_status',
        'retweet_count', 'favorite_count', 'favorited', 'retweeted',
        'possibly_sensitive', 'lang'
    ])
df_full.to_csv(target_dir + filename + '_full.csv', index=False)
Code example #13
def main(warc_file):
    twitter = Twarc()
    out = csv.writer(sys.stdout)
    out.writerow(json2csv.get_headings())
    for tweet in twitter.hydrate(tweet_ids(warc_file)):
        out.writerow(json2csv.get_row(tweet))
Code example #14
class TwitterApi(object):
    def __init__(self,
                 consumer_key,
                 consumer_secret,
                 access_token_key="",
                 access_token_secret=""):
        """
        This method authenticates and creates a twitterapi object.
        In case the system is unable to authenticate the object, a SystemError is returned.
        """
        auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
        try:
            auth.get_authorization_url()
        except TweepError as e:
            print("Unable to authenticate", str(e))
            raise ApplicationError(*error_list["AUTH_ERROR"])

        auth.set_access_token(access_token_key, access_token_secret)
        self._api = tweepy.API(auth_handler=auth)
        try:
            self._api_twarc = Twarc(cnst.CONSUMER_KEY, cnst.CONSUMER_SECRET,
                                    cnst.ACCESS_TOKEN_KEY,
                                    cnst.ACCESS_TOKEN_SECRET)
        except Exception as e:
            print("Unable to authenticate", str(e))
            raise ApplicationError(*error_list["AUTH_ERROR"])

    def get_tweet_from_id(self, tweet_id):
        """
        Given a valid tweet id, this method returns the tweet as a tweepy
        Status object.
        :error: if the rate limit is reached, "LMT_RCHD_ERROR" is raised
        :returns: tweet object
        """
        try:
            tweet = self._api.get_status(tweet_id, tweet_mode="extended")
            return tweet
        except RateLimitError as r:
            print("Rate limit exceeded", str(r))
            raise ApplicationError(*error_list["LMT_RCHD_ERROR"])
        except TweepError as e:
            print("Error occured", str(e))
            raise ApplicationError(*error_list["FTCH_ERR"])

    def _is_valid_url(self, tweet_url):
        url_without_share = tweet_url.split("?")[0]
        m = re.match("https://twitter.com/(.*)/status/(.*)", url_without_share)
        n = re.match("twitter.com/(.*)/status/(.*)", url_without_share)
        o = m or n
        return o

    def get_tweet_from_url(self, tweet_url):
        """
        Given a tweet url this method identifies the tweet id from the url
        and then queries get_tweet_from_id to return a tweet object.
        The URL must be of the form https://twitter.com/[user]/status/[tweet_id]
        or twitter.com/[user]/status/[tweet_id] 
        :error: for a malformed url, the application error "MAL_TWT_URL" is raised
        :returns: tweet object
        """
        # validate the type before parsing so a non-string input fails cleanly
        if isinstance(tweet_url, str):
            o = self._is_valid_url(tweet_url)
            if o and o.group(2).isnumeric():
                return self.get_tweet_from_id(int(o.group(2)))
        raise ApplicationError(*error_list["MAL_TWT_URL"])

    def get_original_tweet_from_url(self, tweet_url):
        """
        Given a url, this method returns the source tweet object for the tweet.
        That is, if tweet A quotes another tweet, the method follows the chain
        of quoted tweets until it reaches a tweet whose is_quote_status is
        False and returns it; if tweet A is itself a source tweet, tweet A is
        returned.
        :error: Application error if there is no embedded url
        :error: Application error if the original tweet is older than 7 or 30
                days, depending on the endpoint
        """
        original_tweet, tweet = None, None
        while True:
            # if tweet is None then this is first call and we use the tweet_url
            if tweet is None:
                tweet = self.get_tweet_from_url(tweet_url)
            if not tweet.is_quote_status:
                # is_quote_status is false for original tweet
                original_tweet = tweet
                break
            # if tweet.in_reply_to_status_id_str is None:
            #     original_tweet = tweet
            #     # pytest.set_trace()
            #     break
            else:
                tweet = self.get_tweet_from_id(tweet.quoted_status_id)
                # tweet = self.get_tweet_from_id(tweet.in_reply_to_status_id)
        if len(original_tweet.entities["urls"]) == 0:
            raise ApplicationError(*error_list["NO_EMBD_URL"])
        # age must be measured as now - created_at; the reverse is negative
        if (datetime.now() -
                original_tweet.created_at).days >= cnst.MAX_TWEET_CREATION_RANGE:
            raise ApplicationError(*error_list["EXPRD_TWT"])
        return original_tweet

    def get_replies(self,
                    tweet,
                    reply_limit=cnst.MAX_REPLY,
                    search_per_request=cnst.SEARCH_PER_REQUEST):
        """
        This method takes in the tweet object and returns replies
        for the tweet, the count of replies are defined by reply_limit.
        :error: AssertionError if the tweet object is not of type Tweepy.status
        :error: Application Error if the limit for the twitter API is reached.
        """
        assert type(tweet) == tweepy.Status
        reply_tweet_ids_list = list()
        # get replies on the original tweet
        self.get_reply_ids_(tweet, reply_limit, search_per_request,
                            reply_tweet_ids_list)
        # self.get_reply_ids(tweet, reply_limit, search_per_request, reply_tweet_ids_list)

        # in case the list is not big enough get replies to the reply tweets
        # until limit is reached. We do not go deeper that level 1 tweet.
        # NOTE: Commenting below code as we are not going deeper than level one.
        # if len(reply_tweet_ids_list) < reply_limit:
        #     temp_list = reply_tweet_ids_list.copy()
        #     for tweet_id in temp_list:
        #         tweet = api.get_tweet_from_id(tweet_id)
        #         self.get_reply_ids(tweet, reply_limit, search_per_request, reply_tweet_ids_list)
        #         if len(reply_tweet_ids_list) < reply_limit:
        #             break

        # get comments from the list
        replies = list()
        for tweet in self._api_twarc.hydrate(reply_tweet_ids_list):
            try:
                replies.append(tweet['full_text'])
            except Exception as e:
                print(str(e))

        # for reply_id in reply_tweet_ids_list:
        #     tweet = self.get_tweet_from_id(reply_id)
        #     try:
        #         replies.append(tweet.retweeted_status.full_text)
        #     except AttributeError:  # Not a Retweet should never occur
        #         replies.append(tweet.full_text)
        return replies

    def get_reply_ids_(self, tweet, reply_limit, search_per_request,
                       reply_tweet_ids_list):
        """
        Given a tweet this method returns a list of ids for the retweets with comments using the premium api.
        :error: Applicaiton Error when limit is reached.
        :error: Assertion Error if the reply_tweet_ids_list is None or not a list
        """
        assert reply_tweet_ids_list is not None
        assert type(reply_tweet_ids_list) == list
        tweet_id = tweet.id
        user_name = tweet.user.screen_name
        search_string = "url:https%3A%2F%2Ftwitter.com%2F{}%2Fstatus%2F{} lang:en".format(
            user_name, tweet_id)
        replies = tweepy.Cursor(self._api.search_30_day,
                                cnst.SEARCH_ENV,
                                search_string,
                                maxResults=search_per_request).items()
        try:
            startTime = datetime.now()
            for reply in replies:
                current_time = datetime.now()
                if reply.is_quote_status and reply.quoted_status.id == tweet.id:
                    reply_tweet_ids_list.append(reply.id)
                if len(reply_tweet_ids_list) == reply_limit or \
                        (current_time - startTime).total_seconds() >= cnst.MAX_TIME_REPLY_SEARCH:
                    break
            print("Returning", len(reply_tweet_ids_list), " replies")
            return reply_tweet_ids_list
        except tweepy.TweepError as e:
            raise ApplicationError(*error_list["LMT_RCHD_ERROR"])

    def get_reply_ids(self, tweet, reply_limit, search_per_request,
                      reply_tweet_ids_list):
        """
        given a tweet this method returns list of reply tweet ids for the given tweet.
        The upper limit for the tweets returned is defined by reply_limit
        :error: Applicaiton Error when limit is reached.
        :error: Assertion Error if the reply_tweet_ids_list is None or not a list
        """
        assert reply_tweet_ids_list is not None
        assert type(reply_tweet_ids_list) == list
        tweet_id = tweet.id
        user_name = tweet.user.screen_name
        max_id = None
        replies = tweepy.Cursor(self._api.search,
                                count=search_per_request,
                                q='to:{}'.format(user_name),
                                since_id=tweet_id,
                                max_id=max_id,
                                tweet_mode='extended').items()

        try:
            startTime = datetime.now()
            for reply in replies:
                current_time = datetime.now()
                if reply.in_reply_to_status_id == tweet_id:
                    reply_tweet_ids_list.append(reply.id)
                if len(reply_tweet_ids_list) == reply_limit or \
                        (current_time - startTime).total_seconds() >= cnst.MAX_TIME_REPLY_SEARCH:
                    break
                max_id = reply.id
            return reply_tweet_ids_list
        except tweepy.TweepError as e:
            raise ApplicationError(*error_list["LMT_RCHD_ERROR"])
Code example #15
                after.strftime("%B%-d").lower() + ".csv", 'r') as csvfile:
            data = csv.reader(csvfile, delimiter=' ', quotechar='|')
            """
            for row in data:
                print (row)
                break    
            """

            totaldata = pd.read_csv(filename, header=None)
            dataframe = totaldata[0]
            numberfile = "number_corona_tweets" + date.strftime(
                "%B%-d").lower() + ".txt"
            readyfile = "ready_corona_tweets" + date.strftime(
                "%B%-d").lower() + ".csv"
            dataframe.to_csv(numberfile, index=False, header=None)
            for tweet in t.hydrate(open(numberfile)):
                if tweet["place"] is None:
                    continue
                if tweet["place"]["country"] is None:
                    continue
                if (tweet["place"]["country"] == "United States"):
                    #print("sdkjf")
Code example #16
File: hydrate.py  Project: RenNedz/third-year-project
from twarc import Twarc
from File_manager import File_manager
# usage: takes in a file called ids containing the ids of tweets to hydrate
testing = {""}
# credentials redacted; substitute your own keys and tokens
t = Twarc('CONSUMER_KEY',
          'CONSUMER_SECRET',
          'ACCESS_TOKEN',
          'ACCESS_TOKEN_SECRET')
out_file = File_manager.open_for_download("download")  #Get file to write to
ids = open("ids", "r", 1)
for tweet in t.hydrate(ids):
    tweet = str(tweet)
    out_file.write(tweet + "\n")
    testing.add("for loop executed")
out_file.close()
ids.close()
testing.add("hydrate  executed")
print testing
Code example #17
from twarc import Twarc
import pprint
import json

consumer_key = "2NBPNFml9TtV3ValyhgZqP4ch"
consumer_secret = "qzCNGbr5I5vD2GAps7gdsQRNW4GbmlhODp0BokqFgCzLw2TjjV"
access_token = "931008641255084032-rMD6zn8esls7S1z4UiebC52Tb0gp8BM"
access_token_secret = "kpxBObeQfcpqbU8EikrionXFa1NbYpstYwPGA542av7K3"

output = open("sample1.json", 'w')
t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret)
hydrated = []

count = 0
for tweet in t.hydrate(open('representatives.txt')):
    if count > 10000:
        break

    count += 1
    hydrated.append(tweet)

    if count % 1000 == 0:
        # periodically snapshot everything hydrated so far
        output.write(json.dumps(hydrated))

print("done!")
print(count, " tweets pulled.")
output.write(json.dumps(hydrated))
Code example #18
from twarc import Twarc
import json

#input twitter credentials
consumer_key = '*********'
consumer_secret = '*********'
access_token = '*********'
access_token_secret = '*********'

t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret)
data = []

for tweet in t.hydrate(open('../input_files/ids.txt')):
    data.append(json.dumps(tweet))

with open('output.json', 'w') as outfile:
    outfile.write("\n".join(data) + '\n')
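The example above holds every tweet in memory and writes once at the end. For large id files, a line-by-line variant (same inputs, minimal sketch) streams each tweet straight to disk as JSONL:

with open('output.json', 'w') as outfile:
    for tweet in t.hydrate(open('../input_files/ids.txt')):
        # one JSON object per line, written as soon as it is hydrated
        outfile.write(json.dumps(tweet) + '\n')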
Code example #19
twarc = Twarc()
tmp_df = pd.read_csv(LOG_FILE, names=["file"])
traversed = list(tmp_df.file.values)
with open(LOG_FILE, 'a+') as logf:
    for file in os.listdir(PATH):
        if file not in traversed:
            file_postfix = str(file).split(".")[0][-2:]
            sample_size = weights[file_postfix]
            print("Extract from file: ", file, "for ", sample_size,
                  " samples:")
            ids = sample_file(PATH + file, sample_size)
            output_file_name = str(file).split(".")[0] + "_contents.txt"
            # log
            w_ = csv.writer(logf)
            w_.writerow([file])
            # extract content
            with open(OUTPUT_PATH + output_file_name, 'w') as wf:
                for tweet in twarc.hydrate(ids):
                    # keep only English retweets; the source tweet's
                    # full_text is what gets written out
                    if "retweeted_status" in tweet and tweet['lang'] == "en":
                        row = [
                            tweet['created_at'], tweet['id_str'],
                            tweet['retweeted_status']['full_text']
                        ]
                        w = csv.writer(wf, delimiter=',')
                        w.writerow(row)
        else:
            print(file, " is processed already!")
Code example #20
from twarc import Twarc
import csv

# Replace with your keys and tokens
t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret)

# Files
tweetList = r"NCHB2-idsab.txt"
csv_write = csv.writer(open("tweets.csv", "wb"))

# Hydrate tweets (remove break to go through entire list)
i = 0
for tweet in t.hydrate(open(tweetList)):
    text = tweet["full_text"].encode('utf-8')
    csv_write.writerow([tweet["id"], text])
    i += 1
    if i == 5:
        break

Code example #21
with open("config.yml", 'r') as ymlfile:
    cfg = yaml.safe_load(ymlfile)

for section in cfg:
    print(section)

consumer_key = cfg['twitter']['consumer_key']
consumer_secret = cfg['twitter']['consumer_secret']
access_token = cfg['twitter']['access_token']
access_token_secret = cfg['twitter']['access_token_secret']


def ids():
    for id in open("brexit_tweet_ids.csv"):
        yield id


t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret)

# note: twarc defaults to tweet_mode="extended", so hydrated tweets carry
# "full_text" rather than "text"
keys = [
    "full_text", "id", "created_at", "favorite_count", "lang", "place",
    "coordinates", "user", "entities", "geo", "retweeted", "retweet_count"
]
with open('tweets123.txt', 'w') as outfile:
    for tweet in t.hydrate(ids()):
        tweet1 = {filter_key: tweet[filter_key] for filter_key in keys}
        values_json = json.dumps(tweet1, sort_keys=True)
        outfile.write(values_json + "\n")
        print(tweet1['full_text'])
Code example #22
url = "https://drive.google.com/file/d/1COJ1zrJE-acz0yZssIljRSAPyIRtS2EC/view?usp=sharing"
r = requests.get(url)


def reader_generator(reader):
    b = reader(1024 * 1024)
    while b:
        yield b
        b = reader(1024 * 1024)


def raw_newline_count(fname):
    f = open(fname, 'rb')
    f_gen = reader_generator(f.raw.read)
    return sum(buf.count(b'\n') for buf in f_gen)


if __name__ == "__main__":
    gzip_path = r.with_suffix('.jsonl.gz')
    if gzip_path.is_file():
        return

    num_ids = raw_newline_count(r)

    with gzip.open(gzip_path, 'w') as output:
        with tqdm(total=num_ids) as pbar:
            for tweet in twarc.hydrate(r.open()):
                output.write(json.dumps(tweet).encode('utf8') + b"\n")
                pbar.update(1)
Code example #23
    config = json.load(data_file)

logging.info('Finished parsing config.')

handle = MongoHandle(config)
logging.info('Initialized the Mongo connection.')

t = Twarc(config['twitter']['consumer_key'], config['twitter']['consumer_secret'],
          config['twitter']['access_token'], config['twitter']['access_token_secret'])
logging.info('Initialized Twitter connection.')

for source_file in os.listdir('./' + config['source_folder']):
    logging.info('Preparing to hydrate: ' + source_file)
    tweet_ids = open('./' + config['source_folder'] + '/' + source_file)
    new_tweet_ids = []
    logging.info('Parsing tweet ids.')
    start = time.time()
    for line in tweet_ids:
        line = line.strip()
        if not handle.is_written(line):
            new_tweet_ids.append(line)

    end = time.time()
    logging.info('Finished looking for new tweets in %.2f seconds.' % (end - start))
    handle.write(t.hydrate(new_tweet_ids), source_file)
    tweet_ids.close()
    logging.info('Finished hydrating: ' + source_file)

logging.info('Finished hydration task.')
handle.clean()
Code example #24
# Twitter auth for downloading tweets
CONSUMER_KEY = os.environ.get("TWITTER_CONSUMER_KEY")
CONSUMER_SECRET = os.environ.get("TWITTER_CONSUMER_SECRET")
ACCESS_TOKEN = os.environ.get("TWITTER_ACCESS_TOKEN")
ACCESS_TOKEN_SECRET = os.environ.get("TWITTER_ACCESS_TOKEN_SECRET")

# Concat and read all the CSVs
dir1 = "data/twitter-framing-master/congressional_tweets_dataset_2017/unlabeled/"
dir2 = "data/twitter-framing-master/congressional_tweets_dataset_2017/labeled/"
csv_files = glob.glob(os.path.join(dir1, "*.csv")) + glob.glob(
    os.path.join(dir2, "*.csv"))
HEADERS = [
    "tweet_id", "issue1", "issue2", "frame1", "frame2", "frame3", "party", "ts"
]
all_df = pd.concat(
    (pd.read_csv(f, names=HEADERS, header=None) for f in csv_files),
    ignore_index=True)

t = Twarc(CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
tweet_texts = {}
for tweet in t.hydrate(all_df["tweet_id"]):
    tweet_texts[tweet["id"]] = tweet["full_text"]

text_df = pd.DataFrame(tweet_texts, index=[0]).transpose().rename(columns={
    "index": "tweet_id",
    0: "text"
})
all_df = all_df.set_index("tweet_id")
joined = all_df.join(text_df)
joined.to_pickle("data/tweets.pkl")
Code example #25
tweet_array = []
i = 0
foundid = ""

with open(
        r'''C:\Users\shann\OneDrive\Documents\FLU DATA\flu_annotations\2011-12\TESTPYTHON.csv''',
        'r') as f:
    tweet_array = list(csv.reader(f))

# hydrate the tweets (credentials redacted; substitute your own)
t = Twarc("CONSUMER_KEY",
          "CONSUMER_SECRET",
          "ACCESS_TOKEN",
          "ACCESS_TOKEN_SECRET")
for tweet in t.hydrate(
        open(
            r'''C:\Users\shann\OneDrive\Documents\FLU DATA\flu_annotations\2011-12\TESTPYTHON.csv'''
        )):
    time.sleep(0.5)
    tweet_array[i][0] = tweet['id']
    print(tweet_array[i][0], " \n")

    tweet_array[i][1] = BMP(tweet['full_text'])  # text
    print(tweet_array[i][1], " \n")

    tweet_array[i][2] = tweet['user']['id']
    print(tweet_array[i][2], " \n")

    tweet_array[i][3] = BMP(tweet['user']['location'])  # encode it
    print(tweet_array[i][3], " \n")

    tweet_array[i][4] = BMP(tweet['created_at'])
Code example #26
ids = []
for file in filenames:
    with open(file, 'r') as tweetids:
        ids.append(tweetids.read())

# Write these merged ids
with open('ids.txt', 'w') as outfile:
    for i in ids[0:2]:
        outfile.write(str(i))

ids[0:2]

testids = [ids[0][0:19], ids[0][20:39], ids[0][40:59]]

jsontweets = []
# ids[0] holds the raw file contents; split into individual id lines, since
# iterating a bare string would hand hydrate() single characters
for tweet in t.hydrate(ids[0].splitlines()):
    jsontweets.append(tweet)

#for tweet in t.hydrate(open('ids.txt')):
#    print(tweet["text"])

jsontweets

testids = ['1245140084313206786', '1245140084350910464', '1245140084417941505']

jsontweets = []
for i in t.hydrate(testids):
    jsontweets.append(i)

jsontweets = json_normalize(jsontweets)
Code example #27
  access_token_secret] = getKeysTokens(user)
 t = Twarc(consumer_key, consumer_secret, access_token,
           access_token_secret)
 curr_df = pd.read_csv(id_file)
 all_ids = curr_df.iloc[:, 0]
 last_idx = (len(all_ids) - 1)
 print(id_file, "has", last_idx, "ids")
 curr_idx = range(num_iter * tweet_limit, (num_iter + 1) * tweet_limit)
 # setup output .csv
 hydrated_file = filename[:-4] + '_hydrated.csv'
 open(hydrated_file, 'w').close()  # create/truncate the output file
 print('beginning to hydrate', target_length, 'number of tweets from',
       id_file)
 # hydrate up to target_length number of tweets
 while num_iter * tweet_limit < target_length + tweet_limit:
     tweets = t.hydrate(all_ids[curr_idx])
     for tweet in tweets:
         try:
             curr_tweet = dict()
             # ignore any non-English tweets
             if tweet['lang'] != "en":
                 continue
             curr_tweet['text'] = tweet['full_text']
             curr_tweet['id'] = tweet['id']
             curr_tweet['place'] = tweet['place']['country'] if tweet[
                 'place'] else ""
             curr_tweet['created_at'] = tweet['created_at']
             curr_tweet['user_location'] = tweet['user']['location']
             curr_tweet['user_name'] = tweet['user']['name']
             curr_tweet['user_followers_count'] = tweet['user'][
                 'followers_count']
Code example #28
import csv

ACCESS_TOKEN = "2668727876-Yrz4VAyuedncEMFsFRQhy5G8b6ZKbcB9x2G58BU"
ACCESS_TOKEN_SECRET = "LEXRPAoFSKE7oBaqrrZRUBnIbgdoWbZhS5vG2zM2s7Y6j"
CONSUMER_KEY = "l79fswnkaCLeUjXeZzPir9iQU"
CONSUMER_SECRET = "6s1h36BhY9Ypdu7pxDWWSyT2u6mYpex8EUXwKJaewDAtxhsGVq"
t = Twarc(CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.MENTION, p.OPT.SMILEY)

with open('April1.tsv', 'r') as fin, open('April1_out.tsv', 'w') as fout:

    reader = csv.reader(fin, dialect='excel-tab')
    writer = csv.writer(fout, dialect='excel-tab')
    for row in reader:
        # keep only the first column (the tweet id)
        del row[1:]
        writer.writerow(row)
# t hydrate March1_out.tsv > March1.jsonl

with open('April1.csv', mode='w', encoding="utf-8") as corona_file:
    fieldnames = ['date', 'text', 'truncated']
    writer = csv.DictWriter(corona_file, fieldnames=fieldnames)
    writer.writeheader()
    for tweet in t.hydrate(open('April1_out.tsv')):
        # capture p.clean's return value (it returns the cleaned string)
        cleaned = p.clean(tweet["full_text"])
        writer.writerow({
            'date': tweet["created_at"],
            'text': cleaned,
            'truncated': tweet["truncated"]
        })