Example 1
# Assumed imports for this snippet
import pandas as pd
from searchtweets import gen_request_parameters


def get_query(query_string, jjmin, jjmax, tweet_fields, nb_tweets=10):
    """Generate formatted query for Twitter v2 API

    Args:
        query_string (string): string used to build query
        jjmin (int): min day offset from current day 
        jjmax (int): max day offset from current day
        tweet_fields (string): fields required to query Twitter API
        nb_tweets (int, optional): Max number of tweets to return. Defaults to 10.

    Returns:
        dict: formatted query

    """

    current_date = pd.to_datetime('today')
    start_time = (current_date + pd.Timedelta(jjmin, "D")).strftime("%Y-%m-%d")
    end_time = (current_date + pd.Timedelta(jjmax, "D")).strftime("%Y-%m-%d")

    if jjmax == 1:  # Clamp end time to now minus one minute (end_time must lie in the past)
        end_time = (current_date -
                    pd.Timedelta(1, "m")).strftime("%Y-%m-%dT%H:%M")

    query = gen_request_parameters(query_string,
                                   tweet_fields=tweet_fields,
                                   start_time=start_time,
                                   end_time=end_time,
                                   results_per_call=nb_tweets)
    return query
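
A minimal usage sketch for get_query; the query string and field list below are hypothetical:

# Hypothetical call: tweets from the last seven days up to yesterday
params = get_query("python lang:en",
                   jjmin=-7,
                   jjmax=-1,
                   tweet_fields="id,text,created_at")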
Example 2


# Assumed imports for this snippet; twitter_config is a local helper
# that reads the project configuration (not shown here).
import datetime
from functools import reduce

from searchtweets import (collect_results, gen_request_parameters,
                          load_credentials)


def load():
    config = twitter_config()
    base_date = datetime.datetime.today()
    date_list = [base_date - datetime.timedelta(days=x) for x in range(5)]
    date_list.reverse()
    all_tweets = []
    for idx, date in enumerate(date_list):
        if idx != 4:
            final_date = date + datetime.timedelta(days=1)
            search_args = load_credentials(
                filename="./configs/twitter_api.yaml",
                yaml_key="search_tweets_v2",
                env_overwrite=False)

            query = gen_request_parameters(
                config['query'],
                results_per_call=100,
                place_fields='country',
                start_time=date.strftime('%Y-%m-%d'),
                end_time=final_date.strftime('%Y-%m-%d'))

            tweets = collect_results(query,
                                     max_tweets=1000,
                                     result_stream_args=search_args)

            def add_date(x):
                x['fecha'] = date.strftime('%Y-%m-%d')

                return x

            tweets = list(map(add_date, tweets))
            all_tweets.append(tweets)

    all_tweets = reduce(lambda x, y: x + y, all_tweets)
    return all_tweets
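
Because load() tags every returned dict with a 'fecha' date field, the result can be flattened into a table; a short sketch assuming pandas is available:

import pandas as pd

# Sketch: build a DataFrame from the collected tweet dicts
df = pd.DataFrame(load())
print(df.groupby('fecha').size())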
Example 3
# Assumed imports for this snippet; `headers`, `createRow` and
# `dump_users_info` are helpers defined elsewhere in the original source.
import csv
import sys

from searchtweets import ResultStream, gen_request_parameters, load_credentials


def search(queryString, outputpath, api_key_yaml,
           startTime="2016-01-01", endTime="2021-03-15", lang="en"):

    search_args = load_credentials(api_key_yaml,
                                   yaml_key="search_tweets_v2",
                                   env_overwrite=False)

    print("Should be 1024, but it:")
    print(len(queryString + " -is:nullcast -is:retweet -is:verified -is:quote " + "lang:"+lang))

    #,created_at,geo,in_reply_to_user_id,lang,author_id,conversation_id,public_metrics,entities,context_annotations
    query = gen_request_parameters(
        query=queryString.strip() + " -is:nullcast -is:retweet -is:verified -is:quote " + "lang:" + lang,
        media_fields="media_key,type",
        user_fields="id,description,location,name,entities,url,username,public_metrics,verified,withheld,protected",
        tweet_fields="id,text,created_at,geo,in_reply_to_user_id,lang,author_id,conversation_id,public_metrics,entities,context_annotations,attachments",
        start_time=startTime,
        end_time=endTime,
        stringify=False,
        expansions="author_id,attachments.media_keys",
        results_per_call=500)

    rs = ResultStream(request_parameters=query, max_tweets=sys.maxsize, max_requests=sys.maxsize, **search_args)
    i = 0
    with open(outputpath, 'w') as outputcsv:
        writer = csv.writer(outputcsv)
        writer.writerow(headers)
        for tweet in rs.stream():
            # print(tweet)
            if "id" in tweet:
                writer.writerow(createRow(headers, tweet))
            if "users" in tweet:
                print("parsing users")
                dump_users_info(tweet, outputpath.replace(".csv", str(i) + "-users.csv"))
                i += 1
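
A hypothetical invocation of search; the query, output path and credentials file below are placeholders:

# Illustrative call with placeholder arguments
search("climate change", "tweets.csv", "~/.twitter_keys.yaml",
       startTime="2021-01-01", endTime="2021-02-01", lang="en")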
Example 4
# Assumed imports for this snippet; get_hashtags, load_twitter_credentials
# and filter_tweets are local helpers not shown here.
import logging
from datetime import datetime, timedelta, timezone
from json import dumps

import azure.functions as func
from searchtweets import (collect_results, gen_request_parameters,
                          load_credentials)


def create_query_obj(query_str: str, start_date: str, end_date: str):
    return gen_request_parameters(
        query=query_str,
        results_per_call=500,
        start_time=start_date,
        end_time=end_date,
        tweet_fields="created_at",
        expansions="author_id,referenced_tweets.id,referenced_tweets.id.author_id",
    )
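
A quick sketch of how create_query_obj might be wired to collect_results; the credentials path, query and dates below are illustrative:

# Illustrative usage, assuming a credentials file as in the other examples
search_args = load_credentials(filename="~/.twitter_keys.yaml",
                               yaml_key="search_tweets_v2",
                               env_overwrite=False)
query_obj = create_query_obj("#python", "2021-01-01", "2021-01-31")
tweets = collect_results(query_obj,
                         max_tweets=500,
                         result_stream_args=search_args)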
def main(mytimer: func.TimerRequest,
         fetchedTweetsQue: func.Out[func.QueueMessage]) -> None:
    time = datetime.utcnow().replace(tzinfo=timezone.utc)

    hashtags = get_hashtags()
    credentials = load_twitter_credentials()
    start_time = time - timedelta(minutes=5)
    tweet_fields = ['id', 'text', 'created_at', 'lang']

    for hashtag in hashtags:
        query = hashtag

        logging.info(f'Fetching tweets with query: {query}')
        request_params = gen_request_parameters(
            query,
            start_time=start_time.strftime("%Y-%m-%d %H:%M"),
            tweet_fields=','.join(tweet_fields),
            # since_id= # TODO: Use last fetch tweet id in request
        )

        response = collect_results(request_params,
                                   max_tweets=100,
                                   result_stream_args=credentials)

        if response:
            tweets = response[:-1]
            response_metadata = response[-1]
            # TODO: Store 'newest_id'
            # TODO: Support pagination

            logging.info(f'Unfiltered tweets count: {len(tweets)}')

            messages = []
            for t in filter_tweets(tweets):
                t['hashtag'] = hashtag
                messages.append(dumps(t))

            logging.info(f'Filtered tweets count: {len(messages)}')
            logging.info(messages)
            fetchedTweetsQue.set(messages)

    logging.info('Python timer trigger function ran at %s', time.isoformat())
Example 5
    # loop through dates; start_date, end_date, config, the field lists
    # and search_creds are defined earlier in the original source
    for single_date in daterange(start_date, end_date):

        # set start timestamp
        start_ts = single_date

        # set end timestamp
        end_ts = single_date + timedelta(days=1)

        # payload rules for v2 api
        rule = gen_request_parameters(
            query=config['query'],
            results_per_call=config['results_per_call'],
            start_time=start_ts.isoformat(),
            end_time=end_ts.isoformat(),
            tweet_fields=tweetfields,
            user_fields=userfields,
            media_fields=mediafields,
            place_fields=placefields,
            expansions=expansions,
            stringify=False)

        # result stream from twitter v2 api
        rs = ResultStream(request_parameters=rule,
                          max_results=100000,
                          max_pages=1,
                          max_tweets=config['max_tweets'],
                          **search_creds)

        # number of reconnection tries
        tries = 10
Example 7
    def lookup(self, config: TwitterSourceConfig,
               **kwargs) -> List[AnalyzerRequest]:
        if (not config.query and not config.keywords
                and not config.hashtags and not config.usernames):
            raise AttributeError(
                "At least one non-empty parameter is required (query, keywords, hashtags, or usernames)"
            )

        place_fields = ",".join(
            config.place_fields) if config.place_fields is not None else None
        user_fields = ",".join(
            config.user_fields) if config.user_fields is not None else None
        expansions = ",".join(
            config.expansions) if config.expansions is not None else None
        tweet_fields = ",".join(
            config.tweet_fields) if config.tweet_fields is not None else None

        # Get data from state
        id: str = kwargs.get("id", None)
        state: Dict[
            str, Any] = None if id is None else self.store.get_source_state(id)
        since_id: Optional[
            int] = config.since_id or None if state is None else state.get(
                "since_id", None)
        until_id: Optional[
            int] = config.until_id or None if state is None else state.get(
                "until_id", None)
        update_state: bool = True if id else False
        state = state or dict()
        max_tweet_id = since_id
        min_tweet_id = until_id
        lookup_period = config.lookup_period
        start_time = None if lookup_period is None else datetime.strptime(
            convert_utc_time(lookup_period), "%Y-%m-%dT%H:%M:%S%z")

        if since_id or until_id:
            lookup_period = None

        query = self._generate_query_string(query=config.query,
                                            keywords=config.keywords,
                                            hashtags=config.hashtags,
                                            usernames=config.usernames,
                                            operators=config.operators)

        source_responses: List[AnalyzerRequest] = []
        need_more_lookup = True
        while need_more_lookup:
            search_query = gen_request_parameters(
                query=query,
                results_per_call=config.max_tweets,
                place_fields=place_fields,
                expansions=expansions,
                user_fields=user_fields,
                tweet_fields=tweet_fields,
                since_id=since_id,
                until_id=until_id,
                start_time=lookup_period)
            logger.info(search_query)

            tweets_output = collect_results(
                query=search_query,
                max_tweets=config.max_tweets,
                result_stream_args=config.credential.get_twitter_credentials())

            if not tweets_output:
                logger.info("No Tweets found")
                need_more_lookup = False
                break

            tweets = []
            users = []
            meta_info = None
            for raw_output in tweets_output:
                if "text" in raw_output:
                    tweets.append(raw_output)
                elif "users" in raw_output:
                    users = raw_output["users"]
                elif "meta" in raw_output:
                    meta_info = raw_output["meta"]

            # Extract user info and create user map
            user_map: Dict[str, Dict[str, Any]] = {}
            if len(users) > 0 and "id" in users[0]:
                for user in users:
                    user_map[user["id"]] = user

            # TODO use it later
            logger.info(f"Twitter API meta_info='{meta_info}'")

            for tweet in tweets:
                if "author_id" in tweet and tweet["author_id"] in user_map:
                    tweet["author_info"] = user_map.get(tweet["author_id"])

                source_responses.append(self._get_source_output(tweet))

                # Get latest tweet id
                current_tweet_id = int(tweet["id"])

                logger.info(
                    f'{tweet["created_at"]}:{current_tweet_id}:{since_id}:{until_id}'
                )

                if start_time:
                    created_date = datetime.strptime(tweet["created_at"],
                                                     "%Y-%m-%dT%H:%M:%S.%f%z")
                    if start_time > created_date:
                        need_more_lookup = False
                        break

                if max_tweet_id is None:
                    max_tweet_id = current_tweet_id
                if min_tweet_id is None:
                    min_tweet_id = current_tweet_id
                if max_tweet_id < current_tweet_id:
                    max_tweet_id = current_tweet_id
                if min_tweet_id > current_tweet_id:
                    min_tweet_id = current_tweet_id

            logger.info(f'{max_tweet_id}:{min_tweet_id}')
            until_id = min_tweet_id
            lookup_period = None

        if update_state:
            state["since_id"] = max_tweet_id
            self.store.update_source_state(workflow_id=id, state=state)

        return source_responses
Example 8
    # Fragment: `key` and `value` come from a loop over credential files
    # earlier in the original source. Assumed imports: os, plus
    # load_credentials, gen_request_parameters and collect_results
    # from searchtweets.
    with open(value, "r") as credfile:
        os.environ[key] = credfile.read()

stream_args = load_credentials(filename="config.yaml",
                               yaml_key="search_tweets_pgdinamica",
                               env_overwrite=True)

# OUT_DIR is defined earlier in the original source.
LIMIT = 100
search_term = "python"

if not os.path.exists(OUT_DIR):
    os.makedirs(OUT_DIR)

for day in range(14, 21):
    query = gen_request_parameters(f"{search_term} lang:pt",
                                   start_time=f"2021-03-{day} 09:00",
                                   results_per_call=LIMIT)

    tweets = collect_results(query,
                             max_tweets=LIMIT,
                             result_stream_args=stream_args)

    print(f"{len(tweets)} resultados no dia {day}")

    with open(os.path.join(OUT_DIR, f"tweets_{search_term}.txt"),
              "a") as tweetsfile:
        lines = [tweet['text'] for tweet in tweets if 'text' in tweet]
        tweetsfile.writelines(lines)

print("FIM")
Example 9
    def lookup(self, config: TwitterSourceConfig,
               **kwargs) -> List[TextPayload]:  # type: ignore[override]
        if (not config.query and not config.keywords and not config.hashtags
                and not config.usernames):
            raise AttributeError(
                "At least one non-empty parameter is required (query, keywords, hashtags, or usernames)"
            )

        place_fields = (",".join(config.place_fields)
                        if config.place_fields is not None else None)
        user_fields = (",".join(config.user_fields)
                       if config.user_fields is not None else None)
        expansions = (",".join(config.expansions)
                      if config.expansions is not None else None)
        tweet_fields = (",".join(config.tweet_fields)
                        if config.tweet_fields is not None else None)

        # Get data from state
        identifier: str = kwargs.get("id", None)
        state: Optional[Dict[str, Any]] = (
            None if identifier is None or self.store is None
            else self.store.get_source_state(identifier))
        since_id: Optional[int] = (config.since_id or None if state is None
                                   else state.get("since_id", None))
        until_id: Optional[int] = (config.until_id or None if state is None
                                   else state.get("until_id", None))
        update_state: bool = bool(identifier)
        state = state or dict()
        max_tweet_id = since_id
        lookup_period = config.lookup_period
        if lookup_period is None:
            start_time = None
        elif len(lookup_period) <= 5:
            start_time = convert_utc_time(lookup_period).replace(
                tzinfo=pytz.UTC)
        else:
            start_time = datetime.strptime(lookup_period,
                                           "%Y-%m-%dT%H:%M:%S%z")

        if since_id or until_id:
            lookup_period = None

        query = self._generate_query_string(
            query=config.query,
            keywords=config.keywords,
            hashtags=config.hashtags,
            usernames=config.usernames,
            operators=config.operators,
        )

        source_responses: List[TextPayload] = []

        search_query = gen_request_parameters(
            granularity=None,
            query=query,
            results_per_call=config.max_tweets,
            place_fields=place_fields,
            expansions=expansions,
            user_fields=user_fields,
            tweet_fields=tweet_fields,
            since_id=since_id,
            until_id=until_id,
            start_time=lookup_period,
            stringify=False,
        )
        logger.info(search_query)

        tweets_output = collect_results(
            query=search_query,
            max_tweets=config.max_tweets,
            result_stream_args=config.get_twitter_credentials(),
        )

        tweets: List[Dict[str, Any]] = []
        users: List[Dict[str, Any]] = []
        meta_info: Dict[str, Any] = {}

        if not tweets_output:
            logger.info("No Tweets found")
        else:
            tweets = tweets_output[0].get("data", tweets)
            if "users" in tweets_output[0].get("includes", {}):
                users = tweets_output[0]["includes"]["users"]
            meta_info = tweets_output[0].get("meta", meta_info)

        # Extract user info and create user map
        user_map: Dict[str, Dict[str, Any]] = {}
        if len(users) > 0 and "id" in users[0]:
            for user in users:
                if "username" in user:
                    user["user_url"] = f'https://twitter.com/{user["username"]}'
                user_map[user["id"]] = user

        logger.info(f"Twitter API meta_info='{meta_info}'")

        for tweet in tweets:
            if "author_id" in tweet and tweet["author_id"] in user_map:
                tweet["author_info"] = user_map.get(tweet["author_id"])

            source_responses.append(self._get_source_output(tweet))

            if start_time:
                created_date = datetime.strptime(tweet["created_at"],
                                                 "%Y-%m-%dT%H:%M:%S.%f%z")
                if start_time > created_date:
                    break

        max_tweet_id = meta_info.get("newest_id", max_tweet_id)
        # min_tweet_id = meta_info["oldest_id"] if "oldest_id" in meta_info else min_tweet_id

        if update_state and self.store is not None:
            state["since_id"] = max_tweet_id
            self.store.update_source_state(workflow_id=identifier, state=state)

        return source_responses
Example 10
# Assumed imports for this snippet
import searchtweets
from searchtweets import load_credentials

# Where to save our results
saving_path = r'/Volumes/My Passport for Mac/tweets/tweets_metadata_08052021.csv'

# Credentials file for developer accounts; mandatory for API access.
credentials = load_credentials(filename="credentials.yaml",
                               yaml_key="credentials",
                               env_overwrite=False)  # change if needed

# The query is defined here. A search keyword is always required; 'a' is
# used for wider reach (whether it can be dropped needs further
# investigation). results_per_call can be redefined via a .yaml file.

# request params for this query
query = searchtweets.gen_request_parameters("a lang:de",
                                            start_time="2021-05-08T00:00",
                                            end_time="2021-05-08T23:59",
                                            results_per_call=100)
""" List of tweet dicts, including the ids and the tweet text. Can be directly printed or stored in a file """

# tweets = collect_results(query,
#                          max_tweets=100,
#                          result_stream_args=credentials)
#
# with open('./tweets.txt', 'w') as tweet_file:
#     for x in tweets:
#         for y in x:
#             if y == 'text':
#                 tweet_file.write(x[y] + '\n')


def check_files():
Example 11
        print("Empty txt can't generate a word cloud")
    else:
        wordcloud = WordCloud().generate(text)

        image = wordcloud.to_image()
        image.save(os.path.join("output", f"{OUT_FILE}.png"))
        print("Image Loaded")


stream_args = load_credentials(filename="config.yalm",
                               yaml_key="search_tweets_v2",
                               env_overwrite=False)

tweeterUser = input("Inform the Tweet user: "******"from:{tweeterUser} -has:links",
                               results_per_call=TWITTER_QUANTITY)

try:
    tweets = collect_results(query,
                             max_tweets=TWITTER_QUANTITY,
                             result_stream_args=stream_args)
except requests.exceptions.HTTPError as exception:
    print(colored(f"There's an error in your API request. Error: {exception}", 'red'))
    sys.exit()

if not os.path.exists(OUT_DIR):
    os.makedirs(OUT_DIR)

with io.open(os.path.join(OUT_DIR, f"{OUT_FILE}.txt"), "w",
             encoding="utf-8") as tweetsfile:
    for tweet in tweets:
Example 12
    def lookup(self, config: TwitterSourceConfig) -> List[AnalyzerRequest]:
        if (not config.query and not config.keywords
                and not config.hashtags and not config.usernames):
            raise AttributeError(
                "At least one non-empty parameter is required (query, keywords, hashtags, or usernames)"
            )

        place_fields = ",".join(
            config.place_fields) if config.place_fields is not None else None
        user_fields = ",".join(
            config.user_fields) if config.user_fields is not None else None
        expansions = ",".join(
            config.expansions) if config.expansions is not None else None
        tweet_fields = ",".join(
            config.tweet_fields) if config.tweet_fields is not None else None

        query = self._generate_query_string(query=config.query,
                                            keywords=config.keywords,
                                            hashtags=config.hashtags,
                                            usernames=config.usernames,
                                            operators=config.operators)

        search_query = gen_request_parameters(
            query=query,
            results_per_call=config.max_tweets,
            place_fields=place_fields,
            expansions=expansions,
            user_fields=user_fields,
            tweet_fields=tweet_fields,
            since_id=config.since_id,
            until_id=config.until_id,
            start_time=config.lookup_period)

        tweets_output = collect_results(
            query=search_query,
            max_tweets=config.max_tweets,
            result_stream_args=config.credential.get_twitter_credentials())

        if not tweets_output:
            logger.info("No Tweets found")
            return []

        tweets = []
        users = []
        meta_info = None
        for raw_output in tweets_output:
            if "text" in raw_output:
                tweets.append(raw_output)
            elif "users" in raw_output:
                users = raw_output["users"]
            elif "meta" in raw_output:
                meta_info = raw_output["meta"]

        # Extract user info and create user map
        user_map: Dict[str, Dict[str, Any]] = {}
        if len(users) > 0 and "id" in users[0]:
            for user in users:
                user_map[user["id"]] = user

        # TODO use it later
        logger.info(f"Twitter API meta_info='{meta_info}'")

        source_responses: List[AnalyzerRequest] = []
        for tweet in tweets:
            if "author_id" in tweet and tweet["author_id"] in user_map:
                tweet["author_info"] = user_map.get(tweet["author_id"])

            source_responses.append(self._get_source_output(tweet))

        return source_responses
Example 13
from searchtweets import ResultStream, gen_request_parameters, load_credentials

search_args = load_credentials("~/.twitter_keys.yaml",
                               yaml_key="search_tweets_v2",
                               env_overwrite=False)

query = gen_request_parameters("Electric Vehicle", results_per_call=100)

rs = ResultStream(request_parameters=query,
                  max_results=500,
                  max_pages=1,
                  **search_args)

tweets = list(rs.stream())
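
A short follow-up showing one way to consume the streamed results; as in the earlier examples, entries without a "text" key (such as response metadata) are skipped:

# Illustrative: print the text of each collected tweet
for tweet in tweets:
    if "text" in tweet:
        print(tweet["text"])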