def search(queryString, outputpath, api_key_yaml, startTime="2016-01-01", endTime="2021-03-15", lang="en"):
    search_args = load_credentials(api_key_yaml,
                                   yaml_key="search_tweets_v2",
                                   env_overwrite=False)
    print("Query length (should be at most 1024 characters):")
    print(len(queryString + " -is:nullcast -is:retweet -is:verified -is:quote " + "lang:" + lang))
    # ,created_at,geo,in_reply_to_user_id,lang,author_id,conversation_id,public_metrics,entities,context_annotations
    query = gen_request_parameters(
        query=queryString.strip() + " -is:nullcast -is:retweet -is:verified -is:quote " + "lang:" + lang,
        media_fields="media_key,type",
        user_fields="id,description,location,name,entities,url,username,public_metrics,verified,withheld,protected",
        tweet_fields="id,text,created_at,geo,in_reply_to_user_id,lang,author_id,conversation_id,public_metrics,entities,context_annotations,attachments",
        start_time=startTime,
        end_time=endTime,
        stringify=False,
        expansions="author_id,attachments.media_keys",
        results_per_call=500)
    rs = ResultStream(request_parameters=query,
                      max_tweets=sys.maxsize,
                      max_requests=sys.maxsize,
                      **search_args)
    i = 0
    with open(outputpath, 'w') as outputcsv:
        writer = csv.writer(outputcsv)
        writer.writerow(headers)
        for tweet in rs.stream():
            # print(tweet)
            if "id" in tweet:
                writer.writerow(createRow(headers, tweet))
            if "users" in tweet:
                print("parsing users")
                dump_users_info(tweet, outputpath.replace(".csv", str(i) + "-users.csv"))
                i += 1
def arquive_search(self, query, start, end, dev_env, max_size=2500, max_call=100):
    self.settings['search_tweets_api']['endpoint'] = \
        f"https://api.twitter.com/1.1/tweets/search/fullarchive/{dev_env}.json"
    # persist the updated endpoint before loading credentials from the file
    with open('archive_keys.yaml', 'w') as config_file:
        yaml.dump(self.settings, config_file, default_flow_style=False)
    credentials = load_credentials("archive_keys.yaml",
                                   yaml_key="search_tweets_api",
                                   env_overwrite=False)
    q_rule = gen_rule_payload(query,
                              results_per_call=max_call,
                              from_date=start,
                              to_date=end)
    rs = ResultStream(rule_payload=q_rule, max_results=max_size, **credentials)
    with open('tweet_data_archive.csv', 'a', encoding='utf-8') as file:
        n = 0
        for tweet in rs.stream():
            n += 1
            if n % (max_size / 10) == 0:
                print('{0}: {1}'.format(str(n), tweet['created_at']))
            json.dump(tweet, file)
            file.write('\n')
def collect_tweets_in_files():
    """
    Use a ResultStream to collect tweets.
    We can configure the number of pages/tweets we want to obtain.
    """
    if not check_files():  # the output file should not already exist
        max_results = 10000
        max_pages = 300
        max_tweets = 15000
        rs = ResultStream(request_parameters=query,
                          max_results=max_results,
                          max_pages=max_pages,
                          **credentials)
        # Set how many tweets we want to collect
        rs.max_tweets = max_tweets
        tweets_2 = list(rs.stream())
        dataframe = pandas.DataFrame(tweets_2)
        dataframe.to_csv(saving_path)
    else:
        print(FileExistsError,
              'File already exists! Please check if you really want to overwrite the file.')
def main():
    args_dict = vars(parse_cmd_args().parse_args())
    if args_dict.get("debug") is True:
        logger.setLevel(logging.DEBUG)
        logger.debug("command line args dict:")
        logger.debug(json.dumps(args_dict, indent=4))

    if args_dict.get("config_filename") is not None:
        configfile_dict = read_config(args_dict["config_filename"])
    else:
        configfile_dict = {}

    extra_headers_str = args_dict.get("extra_headers")
    if extra_headers_str is not None:
        args_dict['extra_headers_dict'] = json.loads(extra_headers_str)
        del args_dict['extra_headers']

    logger.debug("config file ({}) arguments sans sensitive args:".format(
        args_dict["config_filename"]))
    logger.debug(json.dumps(_filter_sensitive_args(configfile_dict), indent=4))

    creds_dict = load_credentials(filename=args_dict["credential_file"],
                                  account_type=args_dict["account_type"],
                                  yaml_key=args_dict["credential_yaml_key"],
                                  env_overwrite=args_dict["env_overwrite"])

    dict_filter = lambda x: {k: v for k, v in x.items() if v is not None}

    config_dict = merge_dicts(dict_filter(configfile_dict),
                              dict_filter(creds_dict),
                              dict_filter(args_dict))

    logger.debug("combined dict (cli, config, creds) sans password:")
    logger.debug(json.dumps(_filter_sensitive_args(config_dict), indent=4))

    if len(dict_filter(config_dict).keys() & REQUIRED_KEYS) < len(REQUIRED_KEYS):
        print(REQUIRED_KEYS - dict_filter(config_dict).keys())
        logger.error("ERROR: not enough arguments for the program to work")
        sys.exit(1)

    stream_params = gen_params_from_config(config_dict)
    logger.debug(
        "full arguments passed to the ResultStream object sans password")
    logger.debug(json.dumps(_filter_sensitive_args(stream_params), indent=4))

    rs = ResultStream(tweetify=False, **stream_params)
    logger.debug(str(rs))

    if config_dict.get("filename_prefix") is not None:
        stream = write_result_stream(
            rs,
            filename_prefix=config_dict.get("filename_prefix"),
            results_per_file=config_dict.get("results_per_file"))
    else:
        stream = rs.stream()

    for tweet in stream:
        if config_dict["print_stream"] is True:
            print(json.dumps(tweet))
def gather_data(self, screen_name: str, user_id: int, rt_date: str, file_path: str):
    query_str = create_query_str(screen_name)
    # print(f'reconstructing timeline for @{screen_name}')
    time_range = get_start_and_end_date(rt_date)
    query_obj = create_query_obj(query_str, *time_range)
    rs = ResultStream(
        request_parameters=query_obj,
        # parameter changed from 2 -> 1 to avoid being rate limited within the project timeline
        max_requests=1,
        **self.academic_search_args
    )
    inbound_timeline = []
    replies = []
    retweets = []
    quotes = []
    for tweet in rs.stream():
        if "author_id" not in tweet:
            if "tweets" in tweet:  # tweets are found
                for t in tweet["tweets"]:
                    if int(t["author_id"]) == user_id:
                        if "referenced_tweets" in t:
                            ref_tweets = t["referenced_tweets"]
                            for ref in ref_tweets:
                                ref_type = ref["type"]
                                if ref_type == "replied_to":
                                    replies.append(ref["id"])
                                elif ref_type == "quoted":
                                    quotes.append(ref["id"])
                        else:
                            # normal tweet, which holds no info on the information strength
                            pass
                    else:
                        if "referenced_tweets" not in t:
                            # the only way this situation can occur is when the tweet is retweeted
                            # by the author and someone is replying to that retweet
                            retweets.append(t["author_id"])
                        else:
                            # this indicates a reply with a quote, or a reply to a reply
                            pass
    # print(f"done collecting the retweeted user objects, there are {len(retweets)} in total")
    # print(f"converting the {len(replies)} replied tweet objects to user ids")
    replies = self.gather_users(replies)
    # print(f"done collecting the replies user objects, there are {len(replies)} in total")
    # print(f"converting the {len(quotes)} quoted tweet objects to user ids")
    quotes = self.gather_users(quotes)
    # print(f"done collecting the quotes user objects, there are {len(quotes)} in total")
    # print(f"retweets: {len(retweets)}\treplies: {len(replies)}\tquotes: {len(quotes)}")
    dump_dict = {"replies": replies, "quotes": quotes, "retweets": retweets}
    with open(file_path, "w") as f:
        json.dump(dump_dict, f)
def _download_tweets(trend, enterprise_search_args):
    powertrack_rule = '(has:geo OR has:profile_geo) lang:en -is:retweet %s' % trend
    rule = gen_rule_payload(powertrack_rule, results_per_call=500)
    rs = ResultStream(rule_payload=rule, max_requests=2, **enterprise_search_args)
    for tweet in rs.stream():
        print(tweet)
        _store_tweet(tweet)
def get_tweets(trend, date):
    enddate = date + datetime.timedelta(days=1)
    username = "******"
    password = "******"
    endpoint = "https://gnip-api.twitter.com/search/fullarchive/accounts/greg-students/prod.json"
    bearer_token = ""
    rule = gen_rule_payload(trend + " lang:en",
                            from_date=date.isoformat(),
                            to_date=enddate.isoformat(),
                            results_per_call=500)
    # testing with a sandbox account
    rs = ResultStream(rule_payload=rule,
                      max_results=10000,
                      max_pages=10,
                      username=username,
                      endpoint=endpoint,
                      password=password)
    # tweets = collect_results(rule, result_stream_args=args, max_results=20000)
    return rs
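# A minimal sketch of consuming the ResultStream returned by get_tweets(),
# assuming the masked credentials above are filled in. The trend string and
# date below are illustrative placeholders, not values from the original code.
import datetime

rs = get_tweets("#WorldCup", datetime.date(2018, 7, 15))
tweets = list(rs.stream())
print(len(tweets), "tweets collected for the trend")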
def usersTweetsByIds():
    search_args1 = load_credentials(".twitter_keys.yaml",
                                    yaml_key="search_tweets_v2_id",
                                    env_overwrite=False)
    search_args2 = load_credentials(".twitter_keys.yaml",
                                    yaml_key="search_tweets_v2_user",
                                    env_overwrite=False)
    f = open('C:\\Users\\Josh\\Documents\\GitHub\\search-tweets-python\\enUsers_Tweets.json',
             'r', encoding='utf-8')
    obj = json.load(f)
    for u in obj['includes']:
        idList = u.get('tweetids')
        ids = ''
        idList = list(set(idList))
        if len(idList) == 0:
            u['tweets'] = []
            continue
        if len(idList) > 99:
            ids = ','.join(idList[0:99])
        else:
            ids = ','.join(idList)
        endTweet = 'https://api.twitter.com/2/tweets'
        query = {"ids": ids, "tweet.fields": "author_id,public_metrics,text"}
        rs = ResultStream(request_parameters=query,
                          endpoint=endTweet,
                          bearer_token=bt)
        tweets = []
        result = list(rs.stream())
        for r in result:
            tweets = r.get('data')
        u['tweets'] = tweets
    fo = open('Random_WithTweets.json', 'w', encoding='utf-8')
    json.dump(obj, fo)
def _download_tweets(trend):
    powertrack_rule = '%s (has:geo OR has:profile_geo) lang:en -is:retweet' % trend
    rule = gen_rule_payload(powertrack_rule,
                            results_per_call=500,
                            to_date=None,
                            from_date='201207220000')
    logging.info("PowerTrack rule: %s" % rule)
    rs = ResultStream(rule_payload=rule,
                      max_results=500,
                      max_requests=1,
                      **enterprise_search_args)
    for tweet in rs.stream():
        _push_tweet(tweet, trend)
def get_file(aname, cak, cask, etype, hashtag, keywords,
             fdate='00-00-0000', tdate='00-00-0000',
             ftime='00:00', ttime='00:00'):
    if etype == 'efa':
        # Full archive scraping (refer to limits on README)
        endp = 'https://api.twitter.com/1.1/tweets/search/fullarchive/' + aname + '.json'
    elif etype == 'tdays':
        # 30 days scraping (refer to limits on README)
        endp = 'https://api.twitter.com/1.1/tweets/search/30day/' + aname + '.json'
    else:
        endp = 'ERROR'

    # Creating a yaml credentials file
    config = dict(search_tweets_api=dict(account_type='premium',
                                         endpoint=endp,
                                         consumer_key=cak,
                                         consumer_secret=cask))
    with open('C:\\Users\\Samuktha\\Documents\\USC\\twitter\\proj\\cred.yaml', 'w') as config_file:
        yaml.dump(config, config_file, default_flow_style=False)

    # Loading credentials
    premium_search_args = load_credentials(
        'C:\\Users\\Samuktha\\Documents\\USC\\twitter\\proj\\cred.yaml',
        yaml_key='search_tweets_api',
        env_overwrite=True)
    print(premium_search_args)

    if etype == 'efa':
        rule = gen_rule_payload(
            results_per_call=100,
            from_date=fdate + ' ' + ftime,  # e.g. "2019-07-06 01:00"
            to_date=tdate + ' ' + ttime,    # e.g. "2019-07-06 02:15"
            pt_rule=keywords,
        )
    else:
        rule = gen_rule_payload(results_per_call=100, pt_rule=keywords)

    # Result stream
    rs = ResultStream(rule_payload=rule, max_results=50, **premium_search_args)
    return rs
def main():
    parser = parse_cmd_args()
    args_dict = vars(parse_cmd_args().parse_args())
    if args_dict.get("debug") is True:
        logger.setLevel(logging.DEBUG)
        logger.debug(json.dumps(args_dict, indent=4))

    if args_dict.get("config_filename") is not None:
        configfile_dict = read_config(args_dict["config_filename"])
    else:
        configfile_dict = {}

    creds_dict = load_credentials(filename=args_dict["credential_file"],
                                  account_type=args_dict["account_type"],
                                  yaml_key=args_dict["credential_yaml_key"],
                                  env_overwrite=args_dict["env_overwrite"])

    dict_filter = lambda x: {k: v for k, v in x.items() if v is not None}

    config_dict = merge_dicts(dict_filter(configfile_dict),
                              dict_filter(args_dict),
                              dict_filter(creds_dict))

    logger.debug(json.dumps(config_dict, indent=4))

    if len(dict_filter(config_dict).keys() & REQUIRED_KEYS) < len(REQUIRED_KEYS):
        print(REQUIRED_KEYS - dict_filter(config_dict).keys())
        logger.error("ERROR: not enough arguments for the program to work")
        sys.exit(1)

    stream_params = gen_params_from_config(config_dict)
    logger.debug(json.dumps(config_dict, indent=4))

    rs = ResultStream(tweetify=False, **stream_params)
    logger.debug(str(rs))

    if config_dict.get("filename_prefix") is not None:
        stream = write_result_stream(
            rs,
            filename_prefix=config_dict["filename_prefix"],
            results_per_file=config_dict["results_per_file"])
    else:
        stream = rs.stream()

    for tweet in stream:
        if config_dict["print_stream"] is True:
            print(json.dumps(tweet))
def tw_get_premium_search(self, keyword: str):
    with open(f'datasets/tw_{keyword.lower()}_searches_premium.json', 'w') as f:
        try:
            f.write('{"statuses": [')
            rule = gen_rule_payload(
                pt_rule="near:\"New York, NY\" within:50mi".format(),
                results_per_call=100,
                from_date="2018-07-01",
                to_date="2018-10-01")
            rule = gen_rule_payload(
                pt_rule="place:\"New York, NY\"".format(),
                results_per_call=100,
                from_date=(datetime.date.today() - datetime.timedelta(31)).isoformat(),
                to_date=datetime.date.today().isoformat())
            next_token = None
            while True:
                results = ResultStream(rule_payload=rule, **self.twitter_premium_api)
                results.next_token = next_token
                tweets = []
                try:
                    tweets = list(results.stream())
                except Exception as ex:
                    print(str(ex))
                for tweet in tweets:
                    f.write("%s," % json.dumps(tweet))
                if results.next_token is None:
                    break
                else:
                    next_token = results.next_token
            if next_token is not None:
                # drop the trailing comma before closing the JSON array
                f.seek(f.tell() - 1, os.SEEK_SET)
            f.write("]}")
        except Exception as ex:
            print("Error:\n" + str(ex))
def get_data(search_query, api_key, secret_key, to_date, from_date, filename):
    """
    Get Twitter data through the Twitter API full-archive search sandbox and
    write all tweets to a JSONL file, based on the search term, the geographic
    location of interest, the time period of interest, and personal Twitter
    account information.

    Reference: https://github.com/geduldig/TwitterAPI/tree/master/TwitterAPI
    Reference: https://developer.twitter.com/en/docs/tweets/search/overview
    """
    print_after_x = 1000
    config = dict(
        search_tweets_api=dict(
            account_type='premium',
            endpoint=f"https://api.twitter.com/1.1/tweets/search/{'fullarchive'}/{'mangroveConservation'}.json",
            consumer_key=api_key,
            consumer_secret=secret_key
        )
    )
    with open('twitter_keys.yaml', 'w') as config_file:
        yaml.dump(config, config_file, default_flow_style=False)

    from searchtweets import load_credentials, gen_rule_payload, ResultStream
    premium_search_args = load_credentials("twitter_keys.yaml",
                                           yaml_key="search_tweets_api",
                                           env_overwrite=False)
    rule = gen_rule_payload(search_query,
                            results_per_call=100,
                            from_date=from_date,
                            to_date=to_date)
    temp = ResultStream(rule_payload=rule,
                        max_results=100000,
                        **premium_search_args)
    with open(filename, 'a', encoding='utf-8') as temp_file:
        num = 0
        for tweet in temp.stream():
            num += 1
            if num % print_after_x == 0:
                print('{0}: {1}'.format(str(num), tweet['created_at']))
            json.dump(tweet, temp_file)
            temp_file.write('\n')
    print('done')
def save_old_tweets():
    from searchtweets import load_credentials, gen_rule_payload, ResultStream
    import json

    premium_search_args = load_credentials("twitter_keys_fullarchive.yaml",
                                           yaml_key="search_tweets_api",
                                           env_overwrite=False)
    query = "from:NTOO_Org"
    rule = gen_rule_payload(query, results_per_call=100)
    rs = ResultStream(rule_payload=rule,
                      max_results=1000,
                      **premium_search_args)
    with open('fullTweetsData.json', 'a', encoding='utf-8') as f:
        for tweet in rs.stream():
            json.dump(tweet, f)
            f.write('\n')
def read_stream(apiscope, label):
    API_KEY = api_key
    API_SECRET_KEY = api_secret_key
    DEV_ENVIRONMENT_LABEL = label
    API_SCOPE = apiscope  # 'fullarchive' for full archive, '30day' for the last 31 days

    SEARCH_QUERY = 'delays, @WestMidRailway OR @NetworkRailBHM OR @networkrail'
    RESULTS_PER_CALL = 100  # 100 for sandbox, 500 for paid tiers
    TO_DATE = '2021-01-30'    # format YYYY-MM-DD HH:MM (hour and minutes optional)
    FROM_DATE = '2021-01-01'  # format YYYY-MM-DD HH:MM (hour and minutes optional)
    MAX_RESULTS = 10000  # number of Tweets you want to collect

    # --------------------------- STOP -------------------------------- #
    # Don't edit anything below unless you know what you are doing.
    # --------------------------- STOP -------------------------------- #

    config = dict(search_tweets_api=dict(
        account_type='premium',
        endpoint=f"https://api.twitter.com/1.1/tweets/search/{API_SCOPE}/{DEV_ENVIRONMENT_LABEL}.json",
        consumer_key=API_KEY,
        consumer_secret=API_SECRET_KEY))
    with open('twitter_keys.yaml', 'w') as config_file:
        yaml.dump(config, config_file, default_flow_style=False)

    premium_search_args = load_credentials("twitter_keys.yaml",
                                           yaml_key="search_tweets_api",
                                           env_overwrite=False)
    rule = gen_rule_payload(SEARCH_QUERY,
                            results_per_call=RESULTS_PER_CALL,
                            from_date=FROM_DATE,
                            to_date=TO_DATE)
    rs = ResultStream(rule_payload=rule,
                      max_results=MAX_RESULTS,
                      **premium_search_args)
    return rs
def tweet_search(search_key, search_args):
    """
    Search for tweets containing `search_key` (e.g. "spectrumtv") and create a
    dict mapping the tweet timestamp (dictionary key, in epoch seconds) to the
    tweet author's screen name (tuple element 1), the tweet text (tuple
    element 2), and a placeholder for the sentiment value (tuple element 3).
    """
    print("searching for tweets containing \"{}\"".format(search_key))
    key_rule = gen_rule_payload(search_key, results_per_call=100)
    key_rs = ResultStream(rule_payload=key_rule,
                          max_results=500,
                          max_pages=1,
                          **search_args)
    key_results = list(key_rs.stream())
    key_tweets = {}
    for tweet in key_results:
        key_tweets[tweet.created_at_seconds] = (
            tweet.screen_name,
            tweet.all_text.replace('\n', ' '),
            ' '  # this space is a placeholder for the sentiment value
        )
    print("{} tweets found containing \"{}\"\n".format(len(key_results), search_key))
    return key_tweets
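# A hypothetical call to tweet_search(); the credential file and YAML key are
# placeholders (any premium/enterprise credentials accepted by load_credentials
# would do), and the "spectrumtv" term comes from the docstring above.
from searchtweets import load_credentials

search_args = load_credentials("~/.twitter_keys.yaml",
                               yaml_key="search_tweets_api",
                               env_overwrite=False)
spectrum_tweets = tweet_search("spectrumtv", search_args)
for epoch_seconds, (author, text, _sentiment) in sorted(spectrum_tweets.items()):
    print(epoch_seconds, author, text)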
def get_twitter_results(news_id, query, from_date, premium_search_args, filename,
                        to_date="202005260000"):
    query1 = "url:" + query + " lang:en"
    rule = gen_rule_payload(query1,
                            from_date=from_date,
                            to_date=to_date,
                            results_per_call=100)
    rs = ResultStream(rule_payload=rule, max_results=100, **premium_search_args)
    last_date = 0
    with open(filename, 'a', encoding='utf-8') as f:
        n = 0
        for tweet in rs.stream():
            news_tweet_json = {
                "news_id": news_id,
                "query": query,
                "tweet": tweet
            }
            n += 1
            if n % 10 == 0:
                print('{0}: {1}'.format(str(n), tweet['created_at']))
            json.dump(news_tweet_json, f)
            f.write('\n')
            last_date = datetime.strptime(tweet['created_at'],
                                          "%a %b %d %H:%M:%S +%f %Y").date()
    print(rs, type(last_date), last_date)
    print('done')
    return last_date
def getRecentTweets():
    endRecent = 'https://api.twitter.com/2/tweets/search/recent'
    search_args_rec = load_credentials(".twitter_keys.yaml",
                                       yaml_key="search_tweets_v2_recent",
                                       env_overwrite=False)
    query = {
        "max_results": 100,
        "tweet.fields": "public_metrics,author_id,lang",
        "query": "happy -RT OR upset -RT OR lol -RT OR ugh -RT OR dog -RT OR cat -RT OR food -RT OR sucks -RT",
        "expansions": "author_id",
        "user.fields": "public_metrics"
    }
    rs = ResultStream(
        request_parameters=query,
        endpoint=endRecent,
        bearer_token=bt,
        max_tweets=100,
        max_requests=1,
    )
    result = list(rs.stream())
    obj = {}
    obj['data'] = []
    obj['includes'] = []
    for r in result:
        obj['data'] = obj['data'] + r.get('data')
        obj['includes'] = obj['includes'] + r.get('includes').get('users')
    out = open('testJson.json', 'w')
    json.dump(obj, out)
def pull_data_for_handle(self, handle, date, days_before,
                         results_per_call=100, max_results=2500):
    # check that the handle can be found
    user_id = self.get_handle_id(handle)
    if user_id == 0:
        return 0
    from_date = self.subtract_from_datestring(date, days_before)
    rule = self.make_rule(handle, date, from_date, results_per_call)
    rs = ResultStream(rule_payload=rule,
                      max_results=max_results,
                      **self.endpoint_args)
    results_list = list(rs.stream())
    # results_list = temp_dict[list(temp_dict.keys())[0]]
    print('Found', len(results_list), 'tweets for', handle)
    if len(results_list) == max_results:
        print('Max results limit hit (' + str(max_results) + '). Consider changing the parameter')
    return self.strip_maxresults_from_query(rule), results_list
def main():
    args_dict = vars(parse_cmd_args().parse_args())
    if args_dict.get("debug") is True:
        logger.setLevel(logging.DEBUG)
        logger.debug("command line args dict:")
        logger.debug(json.dumps(args_dict, indent=4))

    if args_dict.get("config_filename") is not None:
        configfile_dict = read_config(args_dict["config_filename"])
    else:
        configfile_dict = {}

    extra_headers_str = args_dict.get("extra_headers")
    if extra_headers_str is not None:
        args_dict['extra_headers_dict'] = json.loads(extra_headers_str)
        del args_dict['extra_headers']

    logger.debug("config file ({}) arguments sans sensitive args:".format(
        args_dict["config_filename"]))
    logger.debug(json.dumps(_filter_sensitive_args(configfile_dict), indent=4))

    creds_dict = load_credentials(filename=args_dict["credential_file"],
                                  yaml_key=args_dict["credential_yaml_key"],
                                  env_overwrite=args_dict["env_overwrite"])

    dict_filter = lambda x: {k: v for k, v in x.items() if v is not None}

    config_dict = merge_dicts(dict_filter(configfile_dict),
                              dict_filter(creds_dict),
                              dict_filter(args_dict))

    logger.debug("combined dict (cli, config, creds):")
    logger.debug(json.dumps(_filter_sensitive_args(config_dict), indent=4))

    if len(dict_filter(config_dict).keys() & REQUIRED_KEYS) < len(REQUIRED_KEYS):
        print(REQUIRED_KEYS - dict_filter(config_dict).keys())
        logger.error("ERROR: not enough arguments for the script to work")
        sys.exit(1)

    stream_params = gen_params_from_config(config_dict)
    logger.debug(
        "full arguments passed to the ResultStream object sans credentials")
    logger.debug(json.dumps(_filter_sensitive_args(stream_params), indent=4))

    while True:
        start = time.time()

        rs = ResultStream(tweetify=False, **stream_params)
        logger.debug(str(rs))

        if config_dict.get("filename_prefix") is not None:
            stream = write_result_stream(
                rs,
                filename_prefix=config_dict.get("filename_prefix"),
                results_per_file=config_dict.get("results_per_file"))
        else:
            stream = rs.stream()

        first_tweet = True
        tweets_num = 0

        # Iterate through the Tweet array and handle output.
        for tweet in stream:
            tweets_num = tweets_num + 1
            # Get the Tweet ID from the first (newest) Tweet.
            if first_tweet:
                newest_id = tweet['id']
                first_tweet = False
            if config_dict["print_stream"] is True:
                print(json.dumps(tweet))

        print(f"{tweets_num} new Tweets. Newest_id: {newest_id}")

        # This polling script switches to since_id requests and removes the
        # start_time parameter if it was used for backfill.
        # Prepare the next query by setting the since_id request parameter.
        request_json = json.loads(stream_params['request_parameters'])
        if 'start_time' in request_json.keys():
            del request_json['start_time']
        request_json.update(since_id=newest_id)
        stream_params['request_parameters'] = json.dumps(request_json)

        duration = time.time() - start
        sleep_interval = (float(config_dict["interval"]) * 60) - duration
        if sleep_interval < 0:
            sleep_interval = (float(config_dict["interval"]) * 60)
        time.sleep(sleep_interval)
def extract_tweets():
    today = date.today()
    d1 = today.strftime("%d-%m-%Y")

    with open('config.json', 'r') as f:
        keys = json.load(f)

    config = dict(
        search_tweets_api=dict(
            account_type='premium',
            endpoint='https://api.twitter.com/1.1/tweets/search/30day/development1.json',
            consumer_key=keys['consumer_key'],
            consumer_secret=keys['consumer_secret'])
    )
    with open('twitter_keys_fullhistory.yaml', 'w') as config_file:
        yaml.dump(config, config_file, default_flow_style=False)

    premium_search_args = load_credentials("twitter_keys_fullhistory.yaml",
                                           yaml_key="search_tweets_api",
                                           env_overwrite=False)

    SEARCH_QUERY = 'to:Lloydsbank'
    RESULTS_PER_CALL = 100
    FROM_DATE = "2020-06-01"
    TO_DATE = "2020-06-10"
    MAX_RESULTS = 100000
    FILENAME = 'twitter_input_data_{}_{}.jsonl'.format(FROM_DATE, TO_DATE)  # where the Tweets should be saved
    PRINT_AFTER_X = 100

    rule = gen_rule_payload(SEARCH_QUERY,
                            results_per_call=RESULTS_PER_CALL,
                            from_date=FROM_DATE,
                            to_date=TO_DATE)
    rs = ResultStream(rule_payload=rule,
                      max_results=MAX_RESULTS,
                      **premium_search_args)

    with open(FILENAME, 'a', encoding='utf-8') as f:
        n = 0
        for tweet in rs.stream():
            n += 1
            if n % PRINT_AFTER_X == 0:
                print('{0}: {1}'.format(str(n), tweet['created_at']))
            json.dump(tweet, f)
            f.write('\n')

    new_tweets = []
    dates_created = []
    location = []
    user = []
    with open(FILENAME, 'rb') as f:
        for item in json_lines.reader(f):
            try:
                new_tweets.append(item['extended_tweet']['full_text'])
            except KeyError:
                new_tweets.append(item['text'])
            dates_created.append(item['created_at'])
            location.append(item['user']['location'])
            user.append(item['user']['id'])

    dataframe = pd.DataFrame(list(zip(user, location, dates_created, new_tweets)),
                             columns=['User', 'Location', 'date_created', 'text'])
    print(dataframe.head())
    dataframe.to_csv("tweets.csv", sep=",")
consumer_secret=API_SECRET_KEY))

with open('twitter_keys.yaml', 'w') as config_file:
    yaml.dump(config, config_file, default_flow_style=False)

import json
from searchtweets import load_credentials, gen_rule_payload, ResultStream

premium_search_args = load_credentials("twitter_keys.yaml",
                                       yaml_key="search_tweets_api",
                                       env_overwrite=False)

rule = gen_rule_payload(SEARCH_QUERY,
                        results_per_call=RESULTS_PER_CALL,
                        from_date=FROM_DATE,
                        to_date=TO_DATE)

rs = ResultStream(rule_payload=rule,
                  max_results=MAX_RESULTS,
                  **premium_search_args)

with open(FILENAME, 'a', encoding='utf-8') as f:
    n = 0
    for tweet in rs.stream():
        n += 1
        if n % PRINT_AFTER_X == 0:
            print('{0}: {1}'.format(str(n), tweet['created_at']))
        json.dump(tweet, f)
        f.write('\n')
print('done')
def pull_tweets(query, from_date, to_date, save_path, credentials_path,
                yaml_key, file_name=None, results_per_call=500,
                max_results=3000, verbose=False, **kwargs):
    """
    Pulls data (i.e., tweets and user info) from Twitter using its API. The
    data received from the API is stored in its original form (JSON) without
    performing any type of preprocessing.

    Parameters
    ----------
    query : str
        Query passed to the Twitter API to fetch Tweets.
    from_date : str or None
        Date format as specified by `convert_utc_time` for the starting time
        of your search.
    to_date : str or None
        Date format as specified by `convert_utc_time` for the end time of
        your search.
    save_path : str
        Path where the raw data will be stored.
    credentials_path : str
        Path for the yaml file with the Twitter API credentials.
    yaml_key : str
        Key within the yaml file containing the Twitter API credentials to be
        used.
    file_name : str or None, default=None
        Name of the json file saved containing the data dump. If None, the
        name will be assigned as a function of `query`, `from_date` and
        `to_date`.
    results_per_call : int, default=500
        Number of Tweets returned per call.
    max_results : int, default=3000
        Maximum number of Tweets to be pulled.
    verbose : int or bool, default=False
        Controls the verbosity when pulling data.

    Returns
    -------
    None : NoneType
    """
    logger = logging.getLogger(__name__)
    logger.propagate = verbose
    logger.info('Pulling raw Twitter data')

    search_args = load_credentials(filename=credentials_path, yaml_key=yaml_key)
    rule = gen_rule_payload(query,
                            results_per_call=results_per_call,
                            from_date=from_date,
                            to_date=to_date)
    rs = ResultStream(rule_payload=rule, max_results=max_results, **search_args)

    if file_name is None:
        file_name = f'SAMPLE_DATA_QUERY_{query}_' \
                    + f'FROMDATE_{from_date}_TODATE_{to_date}.json'

    with open(os.path.join(save_path, file_name), 'a', encoding='utf-8') as f:
        for tweet in rs.stream():
            json.dump(tweet, f)
            f.write('\n')

    logger.info('Data successfully saved at '
                + f'"{os.path.join(save_path, file_name)}"')
    return None
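# A hypothetical invocation of pull_tweets(); the query, date range, paths and
# YAML key below are placeholders rather than values from the original project.
pull_tweets(query="mangrove conservation lang:en",
            from_date="2020-01-01",
            to_date="2020-01-31",
            save_path="data/raw",
            credentials_path="~/.twitter_keys.yaml",
            yaml_key="search_tweets_api",
            results_per_call=500,
            max_results=3000,
            verbose=True)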
rule = gen_request_parameters(
    query=config['query'],
    results_per_call=config['results_per_call'],
    start_time=start_ts.isoformat(),
    end_time=end_ts.isoformat(),
    tweet_fields=tweetfields,
    user_fields=userfields,
    media_fields=mediafields,
    place_fields=placefields,
    expansions=expansions,
    stringify=False)

# result stream from the Twitter v2 API
rs = ResultStream(request_parameters=rule,
                  max_results=100000,
                  max_pages=1,
                  max_tweets=config['max_tweets'],
                  **search_creds)

# number of reconnection tries
tries = 10

# while loop to protect against 104 errors
while True:
    tries -= 1
    # attempt retrieving tweets
    try:
        # indicate which day is getting retrieved
        print('[INFO] - Retrieving tweets from ' + str(start_ts))
        # get json response to list
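# The fragment above is truncated mid-try. Below is a standalone sketch (not
# the original script's continuation) of the retry pattern it describes:
# guard rs.stream() against dropped connections and give up after a fixed
# number of attempts. The helper name, retry count and back-off are assumptions.
import time
import requests
from searchtweets import ResultStream


def stream_with_retries(request_parameters, search_creds, max_tries=10, backoff_seconds=30):
    """Collect tweets from a ResultStream, retrying on connection resets."""
    for attempt in range(1, max_tries + 1):
        rs = ResultStream(request_parameters=request_parameters,
                          max_tweets=100,
                          **search_creds)
        try:
            return list(rs.stream())
        except requests.exceptions.ConnectionError as err:
            print(f'[WARNING] - attempt {attempt} failed ({err}), retrying')
            time.sleep(backoff_seconds)
    raise RuntimeError('exhausted retries without a successful response')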
def write_stream(self):
    """
    Write the ResultStream object to disk using the write_ndjson utility.
    """
    stream = ResultStream(**self.premium_search_args,
                          rule_payload=self.rule,
                          max_results=62000)
    columns = []
    for _ in write_ndjson('US_apr02_apr09_some.json', stream.stream()):
        # exhaust the generator
        pass
import pprint
import csv

from searchtweets import load_credentials, gen_rule_payload, ResultStream

premium_search_args = load_credentials(
    filename='D:/Code/python/workspace/LTETwitter/cred.yaml',
    yaml_key='search_tweets_api',
    env_overwrite=False)

rule = gen_rule_payload("broadbalk",
                        from_date="2010-04-01",
                        to_date="2018-02-14",
                        results_per_call=100)

# testing with a sandbox account
rs = ResultStream(rule_payload=rule,
                  max_results=500,
                  max_pages=5,
                  **premium_search_args)

pp = pprint.PrettyPrinter(indent=4)
pp.pprint(rs)

tweets = list(rs.stream())

with open("tweets18-19.csv", "a", newline="", encoding="utf-8") as csvFile:
    writer = csv.writer(csvFile, quoting=csv.QUOTE_MINIMAL)
    for tweet in tweets:
        writer.writerow([
            tweet.created_at_datetime,
            tweet.favorite_count,
            tweet.quote_count,
            tweet.retweet_count,
            tweet.name,
            tweet.follower_count,
            tweet.geo_coordinates,
            tweet.profile_location,
            tweet.bio,
            tweet.user_id,
            tweet.screen_name,
            tweet.hashtags,
            tweet.in_reply_to_screen_name,
            tweet.all_text
        ])
from searchtweets import ResultStream

rs = ResultStream(rule_payload=rule,
                  max_results=1000,
                  **premium_search_args)
print(rs)
from searchtweets import ResultStream, gen_request_parameters, load_credentials

search_args = load_credentials("~/.twitter_keys.yaml",
                               yaml_key="search_tweets_v2",
                               env_overwrite=False)
query = gen_request_parameters("Electric Vehicle", results_per_call=100)
rs = ResultStream(request_parameters=query,
                  max_results=500,
                  max_pages=1,
                  **search_args)
tweets = list(rs.stream())
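# For completeness, a sketch of how the "search_tweets_v2" credentials loaded
# above could be written out programmatically, mirroring the premium config
# dumps in the earlier examples. The endpoint and bearer token values are
# placeholders; adjust them to your own project before running.
import os
import yaml

v2_config = dict(search_tweets_v2=dict(
    endpoint="https://api.twitter.com/2/tweets/search/recent",
    bearer_token="<YOUR_BEARER_TOKEN>"))
with open(os.path.expanduser("~/.twitter_keys.yaml"), "w") as config_file:
    yaml.dump(v2_config, config_file, default_flow_style=False)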