Code example #1
def search(queryString, outputpath, api_key_yaml, startTime="2016-01-01", endTime="2021-03-15", lang="en"):

    search_args = load_credentials(api_key_yaml,
                                   yaml_key="search_tweets_v2",
                                   env_overwrite=False)

    print("Should be 1024, but it:")
    print(len(queryString + " -is:nullcast -is:retweet -is:verified -is:quote " + "lang:"+lang))

    #,created_at,geo,in_reply_to_user_id,lang,author_id,conversation_id,public_metrics,entities,context_annotations
    query = gen_request_parameters(
        query=queryString.strip() + " -is:nullcast -is:retweet -is:verified -is:quote " + "lang:" + lang,
        media_fields="media_key,type",
        user_fields="id,description,location,name,entities,url,username,public_metrics,verified,withheld,protected",
        tweet_fields="id,text,created_at,geo,in_reply_to_user_id,lang,author_id,conversation_id,public_metrics,entities,context_annotations,attachments",
        start_time=startTime,
        end_time=endTime,
        stringify=False,
        expansions="author_id,attachments.media_keys",
        results_per_call=500)

    rs = ResultStream(request_parameters=query, max_tweets=sys.maxsize, max_requests=sys.maxsize, **search_args)
    i = 0
    with open(outputpath, 'w') as outputcsv:
        writer = csv.writer(outputcsv)
        writer.writerow(headers)
        for tweet in rs.stream():
            # print(tweet)
            if "id" in tweet:
                writer.writerow(createRow(headers, tweet))
            if "users" in tweet:
                print("parsing users")
                dump_users_info(tweet,outputpath.replace(".csv",str(i) +"-users.csv"))
                i+=1
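The snippet above assumes several module-level helpers (`headers`, `createRow`, `dump_users_info`) and a credentials file keyed by `search_tweets_v2`. A minimal sketch of the assumed imports and a plausible credentials layout, written in the same config-dict style used by other examples in this collection (the endpoint and key values are placeholders, and the exact YAML keys are an assumption, not taken from the original project):

import csv
import sys

import yaml
from searchtweets import load_credentials, gen_request_parameters, ResultStream

# hypothetical credentials layout for yaml_key="search_tweets_v2"
config = dict(
    search_tweets_v2=dict(
        endpoint="https://api.twitter.com/2/tweets/search/all",
        consumer_key="<CONSUMER_KEY>",
        consumer_secret="<CONSUMER_SECRET>",
        bearer_token="<BEARER_TOKEN>"))

with open("api_keys.yaml", "w") as config_file:
    yaml.dump(config, config_file, default_flow_style=False)

# headers, createRow and dump_users_info are project helpers not shown here.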
Code example #2
    def arquive_search(self,
                       query,
                       start,
                       end,
                       dev_env,
                       max_size=2500,
                       max_call=100):
        self.settings['search_tweets_api']['endpoint'] =\
           f"https://api.twitter.com/1.1/tweets/search/fullarchive/{dev_env}.json"

        credentials = load_credentials("archive_keys.yaml",
                                       yaml_key="search_tweets_api",
                                       env_overwrite=False)

        with open('archive_keys.yaml', 'w') as config_file:
            yaml.dump(self.settings, config_file, default_flow_style=False)

        q_rule = gen_rule_payload(query,
                                  results_per_call=max_call,
                                  from_date=start,
                                  to_date=end)

        rs = ResultStream(rule_payload=q_rule,
                          max_results=max_size,
                          **credentials)

        with open('tweet_data_archive.csv', 'a', encoding='utf-8') as file:
            n = 0
            for tweet in rs.stream():
                n += 1
                if n % (max_size / 10) == 0:
                    print('{0}: {1}'.format(str(n), tweet['created_at']))
                json.dump(tweet, file)
                file.write('\n')
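A hypothetical call of this method; the instance name, query and dates are placeholders, and `dev_env` must match a premium environment label configured on the developer dashboard:

collector.arquive_search(query="flood lang:en",
                         start="2019-01-01",
                         end="2019-01-31",
                         dev_env="myDevEnv",
                         max_size=2500,
                         max_call=100)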
Code example #3
def collect_tweets_in_files():
    """ Using a ResultStream for getting tweets
      We can configure the amount of pages/tweets we want to obtain """

    if not check_files():  # file should not already be existing

        max_results = 10000
        max_pages = 300
        max_tweets = 15000

        rs = ResultStream(request_parameters=query,
                          max_results=max_results,
                          max_pages=max_pages,
                          **credentials)

        # Set how many tweets we want to catch
        rs.max_tweets = max_tweets

        tweets_2 = list(rs.stream())
        dataframe = pandas.DataFrame(tweets_2)

        dataframe.to_csv(saving_path)  # to_csv returns None when a path is given
    else:
        print(
            FileExistsError,
            'File already exists! Please check if you really want to overwrite the file.'
        )
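This function relies on module-level `query`, `credentials`, `saving_path` and `check_files` names defined elsewhere in the project. A sketch of plausible definitions, assuming the same v2 library version as example #1 (the query string, file path and yaml_key are placeholders):

import os

import pandas
from searchtweets import load_credentials, gen_request_parameters, ResultStream

saving_path = "collected_tweets.csv"
credentials = load_credentials(".twitter_keys.yaml",
                               yaml_key="search_tweets_v2",
                               env_overwrite=False)
query = gen_request_parameters("(snow OR rain) lang:en", results_per_call=500)


def check_files():
    # assumed to report whether the output file already exists
    return os.path.exists(saving_path)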
Code example #4
def main():
    args_dict = vars(parse_cmd_args().parse_args())
    if args_dict.get("debug") is True:
        logger.setLevel(logging.DEBUG)
        logger.debug("command line args dict:")
        logger.debug(json.dumps(args_dict, indent=4))

    if args_dict.get("config_filename") is not None:
        configfile_dict = read_config(args_dict["config_filename"])
    else:
        configfile_dict = {}

    extra_headers_str = args_dict.get("extra_headers")
    if extra_headers_str is not None:
        args_dict['extra_headers_dict'] = json.loads(extra_headers_str)
        del args_dict['extra_headers']

    logger.debug("config file ({}) arguments sans sensitive args:".format(
        args_dict["config_filename"]))
    logger.debug(json.dumps(_filter_sensitive_args(configfile_dict), indent=4))

    creds_dict = load_credentials(filename=args_dict["credential_file"],
                                  account_type=args_dict["account_type"],
                                  yaml_key=args_dict["credential_yaml_key"],
                                  env_overwrite=args_dict["env_overwrite"])

    dict_filter = lambda x: {k: v for k, v in x.items() if v is not None}

    config_dict = merge_dicts(dict_filter(configfile_dict),
                              dict_filter(creds_dict), dict_filter(args_dict))

    logger.debug("combined dict (cli, config, creds) sans password:"******"ERROR: not enough arguments for the program to work")
        sys.exit(1)

    stream_params = gen_params_from_config(config_dict)
    logger.debug(
        "full arguments passed to the ResultStream object sans password")
    logger.debug(json.dumps(_filter_sensitive_args(stream_params), indent=4))

    rs = ResultStream(tweetify=False, **stream_params)

    logger.debug(str(rs))

    if config_dict.get("filename_prefix") is not None:
        stream = write_result_stream(
            rs,
            filename_prefix=config_dict.get("filename_prefix"),
            results_per_file=config_dict.get("results_per_file"))
    else:
        stream = rs.stream()

    for tweet in stream:
        if config_dict["print_stream"] is True:
            print(json.dumps(tweet))
Code example #5
    def gather_data(self, screen_name: str,  user_id: int, rt_date: str, file_path: str):
        query_str = create_query_str(screen_name)
        # print(f'reconstructing timeline for @{screen_name}')

        time_range = get_start_and_end_date(rt_date)
        query_obj = create_query_obj(query_str, *time_range)
        rs = ResultStream(
            request_parameters=query_obj,
            # parameter changed from 2 -> 1 to avoid being ratelimited within the project timeline
            max_requests=1,
            **self.academic_search_args
        )
        inbound_timeline = []

        replies = []
        retweets = []
        quotes = []

        for tweet in rs.stream():
            if "author_id" not in tweet:
                if "tweets" in tweet:
                    # Tweets are found
                    for t in tweet["tweets"]:
                        if int(t["author_id"]) == user_id:
                            if "referenced_tweets" in t:
                                ref_tweets = t["referenced_tweets"]
                                for ref in ref_tweets:
                                    type = ref["type"]
                                    if type == "replied_to":
                                        replies.append(ref["id"])
                                    elif type == "quoted":
                                        quotes.append(ref["id"])
                            else:
                                # normal tweet, which holds no info on the information strength
                                pass
                        else:
                            if "referenced_tweets" not in t:
                                # the only way this situation can occur is when the tweet is retweeted by the author
                                # and someone is replying to that retweet
                                retweets.append(t["author_id"])
                            else:
                                # this indicates a reply with a quote, or a reply of a reply
                                pass

        # print(f"done collecting the retweeted user objects, there are {len(retweets)} in total")

        # print(f"converting the {len(replies)} replied tweet objects to user ids")
        replies = self.gather_users(replies)
        # print(f"done collecting the replies user objects, there are {len(replies)} in total")

        # print(f"converting the {len(quotes)} quoted tweet objects to user ids")
        quotes = self.gather_users(quotes)
        # print(f"done collecting the quotes user objects, there are {len(quotes)} in total")

        # print(f"retweets: {len(retweets)}\treplies: {len(replies)}\tquotes: {len(quotes)}")

        dump_dict = {"replies": replies, "quotes": quotes, "retweets": retweets}
        json.dump(dump_dict, open(file_path, "w"))
Code example #6
def _download_tweets(trend, enterprise_search_args):
    powertrack_rule = '(has:geo OR has:profile_geo) lang:en -is:retweet %s' % trend
    rule = gen_rule_payload(powertrack_rule, results_per_call=500)
    rs = ResultStream(rule_payload=rule,
                      max_requests=2,
                      **enterprise_search_args)
    for tweet in rs.stream():
        print(tweet)
        _store_tweet(tweet)
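A hypothetical driver for this function, assuming enterprise credentials stored under `search_tweets_api` in `.twitter_keys.yaml` and a `_store_tweet` helper defined elsewhere in the project:

from searchtweets import load_credentials

enterprise_search_args = load_credentials(".twitter_keys.yaml",
                                          yaml_key="search_tweets_api",
                                          env_overwrite=False)
_download_tweets("#worldcup", enterprise_search_args)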
Code example #7
File: search.py Project: bugryn-josh/ENGL581_PROJ
def usersTweetsByIds():

    search_args1 = load_credentials(".twitter_keys.yaml",
                                    yaml_key="search_tweets_v2_id",
                                    env_overwrite=False)

    search_args2 = load_credentials(".twitter_keys.yaml",
                                    yaml_key="search_tweets_v2_user",
                                    env_overwrite=False)

    f = open(
        'C:\\Users\\Josh\\Documents\\GitHub\\search-tweets-python\\enUsers_Tweets.json',
        'r',
        encoding='utf-8')

    obj = json.load(f)

    for u in obj['includes']:

        idList = u.get('tweetids')

        ids = ''

        idList = list(set(idList))

        if len(idList) == 0:
            u['tweets'] = []
            continue

        if len(idList) > 99:
            ids = ','.join(idList[0:99])
        else:
            ids = ','.join(idList)

        endTweet = 'https://api.twitter.com/2/tweets'

        query = {"ids": ids, "tweet.fields": "author_id,public_metrics,text"}
        rs = ResultStream(request_parameters=query,
                          endpoint=endTweet,
                          bearer_token=bt)

        tweets = []
        result = list(rs.stream())

        for r in result:

            tweets = r.get('data')

        u['tweets'] = tweets

    fo = open('Random_WithTweets.json', 'w', encoding='utf-8')
    json.dump(obj, fo)
Code example #8
File: main.py Project: x0rzkov/graphy-backend
def _download_tweets(trend):
    powertrack_rule = '%s (has:geo OR has:profile_geo) lang:en -is:retweet' % trend
    rule = gen_rule_payload(powertrack_rule,
                            results_per_call=500,
                            to_date=None,
                            from_date='201207220000')
    logging.info("PowerTrack rule: %s" % rule)
    rs = ResultStream(rule_payload=rule,
                      max_results=500,
                      max_requests=1,
                      **enterprise_search_args)
    for tweet in rs.stream():
        _push_tweet(tweet, trend)
Code example #9
File: search_tweets.py Project: spinrut/Bitcamp-2018
def main():
    parser = parse_cmd_args()
    args_dict = vars(parse_cmd_args().parse_args())
    if args_dict.get("debug") is True:
        logger.setLevel(logging.DEBUG)
        logger.debug(json.dumps(args_dict, indent=4))

    if args_dict.get("config_filename") is not None:
        configfile_dict = read_config(args_dict["config_filename"])
    else:
        configfile_dict = {}

    creds_dict = load_credentials(filename=args_dict["credential_file"],
                                  account_type=args_dict["account_type"],
                                  yaml_key=args_dict["credential_yaml_key"],
                                  env_overwrite=args_dict["env_overwrite"])

    dict_filter = lambda x: {k: v for k, v in x.items() if v is not None}

    config_dict = merge_dicts(dict_filter(configfile_dict),
                              dict_filter(args_dict), dict_filter(creds_dict))

    logger.debug(json.dumps(config_dict, indent=4))

    if len(dict_filter(config_dict).keys()
           & REQUIRED_KEYS) < len(REQUIRED_KEYS):
        print(REQUIRED_KEYS - dict_filter(config_dict).keys())
        logger.error("ERROR: not enough arguments for the program to work")
        sys.exit(1)

    stream_params = gen_params_from_config(config_dict)

    logger.debug(json.dumps(config_dict, indent=4))

    rs = ResultStream(tweetify=False, **stream_params)

    logger.debug(str(rs))

    if config_dict.get("filename_prefix") is not None:
        stream = write_result_stream(
            rs,
            filename_prefix=config_dict["filename_prefix"],
            results_per_file=config_dict["results_per_file"])
    else:
        stream = rs.stream()

    for tweet in stream:
        if config_dict["print_stream"] is True:
            print(json.dumps(tweet))
Code example #10
    def tw_get_premium_search(self, keyword: str):
        with open(f'datasets/tw_{keyword.lower()}_searches_premium.json',
                  'w') as f:
            try:
                f.write('{"statuses": [')

                rule = gen_rule_payload(
                    pt_rule="near:\"New York, NY\" within:50mi".format(),
                    results_per_call=100,
                    from_date="2018-07-01",
                    to_date="2018-10-01")

                rule = gen_rule_payload(
                    pt_rule="place:\"New York, NY\"".format(),
                    results_per_call=100,
                    from_date=(datetime.date.today() -
                               datetime.timedelta(31)).isoformat(),
                    to_date=datetime.date.today().isoformat())

                next_token = None
                while True:
                    results = ResultStream(rule_payload=rule,
                                           **self.twitter_premium_api)
                    results.next_token = next_token

                    tweets = []

                    try:
                        tweets = list(results.stream())
                    except Exception as ex:
                        print(str(ex))

                    for tweet in tweets:
                        f.write("%s," % json.dumps(tweet))

                    if results.next_token is None:
                        break
                    else:
                        next_token = results.next_token

                next_token is not None and f.seek(f.tell() - 1, os.SEEK_SET)
                f.write("]}")

            except Exception as ex:
                print("Error:\n" + str(ex))
Code example #11
def get_data(search_query, api_key, secret_key, to_date, from_date, filename):
    """ get twitter data through twitter API from full archive search sand box and return all twitters in JSONL file
    based on 
     search term, 
     the geographic location of interest
     the time period of interest.
     and personal twitter account information.

     Reference: https://github.com/geduldig/TwitterAPI/tree/master/TwitterAPI
     Reference: https://developer.twitter.com/en/docs/tweets/search/overview
    """
    print_after_x = 1000
    config = dict(
        search_tweets_api=dict(
            account_type='premium',
            endpoint=f"https://api.twitter.com/1.1/tweets/search/{'fullarchive'}/{'mangroveConservation'}.json",
            consumer_key=api_key,
            consumer_secret=secret_key
        )
    )
    with open('twitter_keys.yaml', 'w') as config_file:
        yaml.dump(config, config_file, default_flow_style=False)
    from searchtweets import load_credentials, gen_rule_payload, ResultStream

    premium_search_args = load_credentials("twitter_keys.yaml",
                                           yaml_key="search_tweets_api",
                                           env_overwrite=False)
    rule = gen_rule_payload(search_query,
                            results_per_call=100,
                            from_date=from_date,
                            to_date=to_date
                            )
    temp = ResultStream(rule_payload=rule,
                      max_results=100000,
                      **premium_search_args)
    with open(filename, 'a', encoding='utf-8') as temp_file:
        num = 0
        for tweet in temp.stream():
            num += 1
            if num % print_after_x == 0:
                print('{0}: {1}'.format(str(num), tweet['created_at']))
            json.dump(tweet, temp_file)
            temp_file.write('\n')
    print('done')
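A hypothetical invocation; note that the endpoint label `mangroveConservation` is hard-coded inside `get_data`, so only the keys, dates, query and output file vary:

get_data(search_query="mangrove (conservation OR restoration)",
         api_key="<API_KEY>",
         secret_key="<API_SECRET_KEY>",
         to_date="2019-12-31",
         from_date="2019-01-01",
         filename="mangrove_tweets.jsonl")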
Code example #12
def get_tweets(trend,date):
    enddate = date+datetime.timedelta(days=1)
    username="******"
    password="******"
    endpoint="https://gnip-api.twitter.com/search/fullarchive/accounts/greg-students/prod.json"
    bearer_token=""
    rule = gen_rule_payload(trend+" lang:en",from_date=date.isoformat() ,to_date=enddate.isoformat(), results_per_call=500) # testing with a sandbox account
    rs=ResultStream(rule_payload=rule,max_results=10000,max_pages=10, username=username,endpoint=endpoint, password=password)
    #tweets=collect_results(rule, result_stream_args=args,max_results=20000)
    return rs
Code example #13
def save_old_tweets():
    from searchtweets import load_credentials, gen_rule_payload, ResultStream
    import json

    premium_search_args = load_credentials("twitter_keys_fullarchive.yaml",
                                           yaml_key="search_tweets_api",
                                           env_overwrite=False)

    query = "from:NTOO_Org"
    rule = gen_rule_payload(query, results_per_call=100)

    rs = ResultStream(rule_payload=rule,
                      max_results=1000,
                      **premium_search_args)

    with open('fullTweetsData.json', 'a', encoding='utf-8') as f:
        for tweet in rs.stream():
            json.dump(tweet, f)
            f.write('\n')
Code example #14
def tweet_search(search_key, search_args):
    """
    search for "spectrumtv" and create a dict of tweet timestamp (dictionary key, in epoch seconds),
                                                 tweet authors screen name (dict value, tuple element 1),
                                                 tweet text (dict value, tuple element 2)
    """
    print("searching for tweets containing \"{}\"".format(search_key))
    key_rule = gen_rule_payload(search_key, results_per_call=100)
    key_rs = ResultStream(rule_payload=key_rule,
                          max_results=500,
                          max_pages=1,
                          **search_args)
    key_results = list(key_rs.stream())
    key_tweets = {}
    for tweet in key_results:
        key_tweets[tweet.created_at_seconds] = (
            tweet.screen_name, tweet.all_text.replace('\n', ' '), ' '
        )  # this space is a placeholder for the sentiment value
    print("{} tweets found containing \"{}\"\n".format(len(key_results),
                                                       search_key))
    return key_tweets
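A sketch of how the returned dict might be consumed, assuming `search_args` came from `load_credentials`; the third tuple element is the sentiment placeholder mentioned in the docstring:

key_tweets = tweet_search("spectrumtv", search_args)
for timestamp in sorted(key_tweets):
    screen_name, text, sentiment = key_tweets[timestamp]
    print(timestamp, screen_name, text)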
Code example #15
def get_file(aname,
             cak,
             cask,
             etype,
             hashtag,
             keywords,
             fdate='00-00-0000',
             tdate='00-00-0000',
             ftime='00:00',
             ttime='00:00'):

    if etype == 'efa':  # Full archive scraping (refer to limits on README)
        endp = 'https://api.twitter.com/1.1/tweets/search/fullarchive/' + aname + '.json'
    elif etype == 'tdays':  # 30 days scraping (refer to limits on README)
        endp = 'https://api.twitter.com/1.1/tweets/search/30day/' + aname + '.json'
    else:
        endp = 'ERROR'

    # Creating a yaml credentials file
    config = dict(search_tweets_api=dict(account_type='premium',
                                         endpoint=endp,
                                         consumer_key=cak,
                                         consumer_secret=cask))

    with open('C:\\Users\\Samuktha\\Documents\\USC\\twitter\\proj\\cred.yaml',
              'w') as config_file:
        yaml.dump(config, config_file, default_flow_style=False)

    # loading credentials
    premium_search_args = load_credentials(
        'C:\\Users\\Samuktha\\Documents\\USC\\twitter\\proj\\cred.yaml',
        yaml_key='search_tweets_api',
        env_overwrite=True)
    print(premium_search_args)

    if etype == 'efa':
        rule = gen_rule_payload(
            results_per_call=100,
            from_date=fdate + ' ' + ftime,  #"2019-07-06 01:00",
            to_date=tdate + ' ' + ttime,  #"2019-07-06 02:15",
            pt_rule=keywords,
        )
    else:
        rule = gen_rule_payload(results_per_call=100, pt_rule=keywords)

    # result stream

    rs = ResultStream(rule_payload=rule, max_results=50, **premium_search_args)

    return rs
Code example #16
def get_twitter_results(news_id,
                        query,
                        from_date,
                        premium_search_args,
                        filename,
                        to_date="202005260000"):
    query1 = "url:" + query + " lang:en"

    rule = gen_rule_payload(query1,
                            from_date=from_date,
                            to_date=to_date,
                            results_per_call=100)

    rs = ResultStream(rule_payload=rule,
                      max_results=100,
                      **premium_search_args)
    l = 0
    with open(filename, 'a', encoding='utf-8') as f:
        n = 0
        for tweet in rs.stream():
            news_tweet_json = {
                "news_id": news_id,
                "query": query,
                "tweet": tweet
            }

            n += 1
            if n % 10 == 0:
                print('{0}: {1}'.format(str(n), tweet['created_at']))
            json.dump(news_tweet_json, f)
            f.write('\n')
            l = datetime.strptime(tweet['created_at'],
                                  "%a %b %d %H:%M:%S +%f %Y").date()
    print(rs, type(l), l)
    print('done')
    return l
Code example #17
File: search.py Project: bugryn-josh/ENGL581_PROJ
def getRecentTweets():
    endRecent = 'https://api.twitter.com/2/tweets/search/recent'

    search_args_rec = load_credentials(".twitter_keys.yaml",
                                       yaml_key="search_tweets_v2_recent",
                                       env_overwrite=False)

    query = {
        "max_results": 100,
        "tweet.fields": "public_metrics,author_id,lang",
        "query":
        "happy -RT OR upset -RT OR lol -RT OR ugh -RT OR dog -RT OR cat -RT OR food -RT OR sucks -RT",
        "expansions": "author_id",
        "user.fields": "public_metrics"
    }

    rs = ResultStream(
        request_parameters=query,
        endpoint=endRecent,
        bearer_token=bt,
        max_tweets=100,
        max_requests=1,
    )
    result = list(rs.stream())

    obj = {}

    obj['data'] = []
    obj['includes'] = []

    for r in result:
        obj['data'] = obj['data'] + r.get('data')
        obj['includes'] = obj['includes'] + r.get('includes').get('users')

    out = open('testJson.json', 'w')
    json.dump(obj, out)
Code example #18
    def pull_data_for_handle(self,
                             handle,
                             date,
                             days_before,
                             results_per_call=100,
                             max_results=2500):
        # check handle can be found!
        user_id = self.get_handle_id(handle)
        if user_id == 0:
            return 0
        from_date = self.subtract_from_datestring(date, days_before)
        rule = self.make_rule(handle, date, from_date, results_per_call)

        rs = ResultStream(rule_payload=rule,
                          max_results=max_results,
                          **self.endpoint_args)
        results_list = list(rs.stream())
        #         results_list=temp_dict[list(temp_dict.keys())[0]]
        print('Found', len(results_list), 'tweets for', handle)
        if len(results_list) == max_results:
            print('Max results limit hit (' + str(max_results) +
                  '). Consider changing the parameter')

        return self.strip_maxresults_from_query(rule), results_list
Code example #19
def read_stream(apiscope, label):
    API_KEY = api_key
    API_SECRET_KEY = api_secret_key
    DEV_ENVIRONMENT_LABEL = label
    API_SCOPE = apiscope  # 'fullarchive'  # 'fullarchive' for full archive, '30day' for last 31 days

    SEARCH_QUERY = 'delays, @WestMidRailway OR @NetworkRailBHM OR @networkrail'
    RESULTS_PER_CALL = 100  # 100 for sandbox, 500 for paid tiers
    TO_DATE = '2021-01-30'  # format YYYY-MM-DD HH:MM (hour and minutes optional)
    FROM_DATE = '2021-01-01'  # format YYYY-MM-DD HH:MM (hour and minutes optional)

    MAX_RESULTS = 10000  # Number of Tweets you want to collect

    # --------------------------- STOP -------------------------------#
    # Don't edit anything below if you don't know what you are doing.
    # --------------------------- STOP -------------------------------#

    config = dict(search_tweets_api=dict(
        account_type='premium',
        endpoint=
        f"https://api.twitter.com/1.1/tweets/search/{API_SCOPE}/{DEV_ENVIRONMENT_LABEL}.json",
        consumer_key=API_KEY,
        consumer_secret=API_SECRET_KEY))

    with open('twitter_keys.yaml', 'w') as config_file:
        yaml.dump(config, config_file, default_flow_style=False)

    premium_search_args = load_credentials("twitter_keys.yaml",
                                           yaml_key="search_tweets_api",
                                           env_overwrite=False)

    rule = gen_rule_payload(SEARCH_QUERY,
                            results_per_call=RESULTS_PER_CALL,
                            from_date=FROM_DATE,
                            to_date=TO_DATE)

    rs = ResultStream(rule_payload=rule,
                      max_results=MAX_RESULTS,
                      **premium_search_args)

    return rs
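A hypothetical driver that exhausts the returned ResultStream and writes newline-delimited JSON, mirroring the pattern used by the other examples in this collection:

import json

rs = read_stream('fullarchive', 'myDevEnvLabel')
with open('delay_tweets.jsonl', 'a', encoding='utf-8') as f:
    for tweet in rs.stream():
        json.dump(tweet, f)
        f.write('\n')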
Code example #20
    def write_stream(self):
        """ write ResultStream object to disk using the write_ndjson utility """
        stream = ResultStream(**self.premium_search_args, rule_payload=self.rule, max_results=62000)
        columns = []
        for _ in write_ndjson('US_apr02_apr09_some.json', stream.stream()):  # exhaust generator
            pass
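The `write_ndjson` generator used above is assumed to come from the searchtweets utility module; a plausible import, which is an assumption rather than part of the original snippet:

# assumed location; verify against your installed searchtweets version
from searchtweets.utils import write_ndjson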
Code example #21
File: _pull_data.py Project: kanav-mehra/solve-iwmi
def pull_tweets(query,
                from_date,
                to_date,
                save_path,
                credentials_path,
                yaml_key,
                file_name=None,
                results_per_call=500,
                max_results=3000,
                verbose=False,
                **kwargs):
    """
    Pulls data (i.e., tweets and user info) from Twitter using its API.
    The data received from the API is stored in its original form (JSON)
    without performing any type of preprocessing.

    Parameters
    ----------
    query : str
        Query passed to the Twitter API to fetch Tweets.
    from_date : str or None
        Date format as specified by `convert_utc_time` for the starting time
        of your search.
    to_date : str or None
        Date format as specified by `convert_utc_time` for the end time of
        your search.
    save_path : str
        Path where the raw data will be stored.
    credentials_path : str
        Path for the yaml file with the Twitter API credentials.
    yaml_key : str
        Key within the yaml file containing the Twitter API credentials to be
        used.
    file_name : str or None, default=None
        Name of the JSON file saved containing the data dump. If None, the
        name will be derived from `query`, `from_date` and
        `to_date`.
    results_per_call : int, default=500
        Number of Tweets returned per call.
    max_results : int, default=3000
        Maximum number of Tweets to be pulled.
    verbose : int or bool, default=False
        Controls the verbosity when pulling data.


    Returns
    -------
    None : NoneType
    """

    logger = logging.getLogger(__name__)
    logger.propagate = verbose
    logger.info('Pulling raw Twitter data')

    search_args = load_credentials(filename=credentials_path,
                                   yaml_key=yaml_key)

    rule = gen_rule_payload(query,
                            results_per_call=results_per_call,
                            from_date=from_date,
                            to_date=to_date)

    rs = ResultStream(rule_payload=rule,
                      max_results=max_results,
                      **search_args)

    if file_name is None:
        file_name = f'SAMPLE_DATA_QUERY_{query}_'\
                  + f'FROMDATE_{from_date}_TODATE_{to_date}.json'

    with open(os.path.join(save_path, file_name), 'a', encoding='utf-8') as f:
        for tweet in rs.stream():
            json.dump(tweet, f)
            f.write('\n')

    logger.info('Data successfully saved at ' +
                f'\"{os.path.join(save_path, file_name)}\"')
    return None
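A hypothetical call; the query, dates, paths and yaml_key below are placeholders:

pull_tweets(query="cyclone amphan lang:en",
            from_date="2020-05-15",
            to_date="2020-05-25",
            save_path="data/raw",
            credentials_path="twitter_keys.yaml",
            yaml_key="search_tweets_api",
            results_per_call=500,
            max_results=3000,
            verbose=True)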
Code example #22
File: poll_tweets.py Project: g9singh/world_cup_2018
def main():
    args_dict = vars(parse_cmd_args().parse_args())
    if args_dict.get("debug") is True:
        logger.setLevel(logging.DEBUG)
        logger.debug("command line args dict:")
        logger.debug(json.dumps(args_dict, indent=4))

    if args_dict.get("config_filename") is not None:
        configfile_dict = read_config(args_dict["config_filename"])
    else:
        configfile_dict = {}

    extra_headers_str = args_dict.get("extra_headers")
    if extra_headers_str is not None:
        args_dict['extra_headers_dict'] = json.loads(extra_headers_str)
        del args_dict['extra_headers']

    logger.debug("config file ({}) arguments sans sensitive args:".format(
        args_dict["config_filename"]))
    logger.debug(json.dumps(_filter_sensitive_args(configfile_dict), indent=4))

    creds_dict = load_credentials(filename=args_dict["credential_file"],
                                  yaml_key=args_dict["credential_yaml_key"],
                                  env_overwrite=args_dict["env_overwrite"])

    dict_filter = lambda x: {k: v for k, v in x.items() if v is not None}

    config_dict = merge_dicts(dict_filter(configfile_dict),
                              dict_filter(creds_dict), dict_filter(args_dict))

    logger.debug("combined dict (cli, config, creds):")
    logger.debug(json.dumps(_filter_sensitive_args(config_dict), indent=4))

    if len(dict_filter(config_dict).keys()
           & REQUIRED_KEYS) < len(REQUIRED_KEYS):
        print(REQUIRED_KEYS - dict_filter(config_dict).keys())
        logger.error("ERROR: not enough arguments for the script to work")
        sys.exit(1)

    stream_params = gen_params_from_config(config_dict)
    logger.debug(
        "full arguments passed to the ResultStream object sans credentials")
    logger.debug(json.dumps(_filter_sensitive_args(stream_params), indent=4))

    while True:

        start = time.time()
        rs = ResultStream(tweetify=False, **stream_params)

        logger.debug(str(rs))

        if config_dict.get("filename_prefix") is not None:
            stream = write_result_stream(
                rs,
                filename_prefix=config_dict.get("filename_prefix"),
                results_per_file=config_dict.get("results_per_file"))
        else:
            stream = rs.stream()

        first_tweet = True
        tweets_num = 0

        #Iterate through Tweet array and handle output.
        for tweet in stream:
            tweets_num = tweets_num + 1
            #Get Tweet ID from first Tweet
            if first_tweet:
                newest_id = tweet['id']
                first_tweet = False
            if config_dict["print_stream"] is True:
                print(json.dumps(tweet))

        #This polling script switches to since_id requests and removes the start_time parameter if it was used for backfill.
        #Prepare the next query by setting the since_id request parameter.
        print(f"{tweets_num} new Tweets. Newest_id: {newest_id}")

        request_json = json.loads(stream_params['request_parameters'])

        if 'start_time' in request_json.keys():
            del request_json['start_time']

        request_json.update(since_id=newest_id)
        stream_params['request_parameters'] = json.dumps(request_json)

        duration = time.time() - start

        sleep_interval = (float(config_dict["interval"]) * 60) - duration

        if sleep_interval < 0:
            sleep_interval = (float(config_dict["interval"]) * 60)

        time.sleep(sleep_interval)
Code example #23
def extract_tweets():
    today = date.today()
    d1 = today.strftime("%d-%m-%Y")

    with open('config.json','r') as f:
        keys = json.load(f)

    config = dict(
        search_tweets_api = dict(
            account_type = 'premium',
            endpoint = 'https://api.twitter.com/1.1/tweets/search/30day/development1.json',
            consumer_key = keys['consumer_key'],
            consumer_secret = keys['consumer_secret'])
            )
    with open('twitter_keys_fullhistory.yaml', 'w') as config_file:
        yaml.dump(config, config_file, default_flow_style=False)

    premium_search_args = load_credentials("twitter_keys_fullhistory.yaml",
                                        yaml_key="search_tweets_api",
                                        env_overwrite=False)

    SEARCH_QUERY = 'to:Lloydsbank'
    RESULTS_PER_CALL = 100
    FROM_DATE = "2020-06-01"
    TO_DATE = "2020-06-10"
    MAX_RESULTS = 100000
    FILENAME = 'twitter_input_data_{}_{}.jsonl'.format(FROM_DATE, TO_DATE)  # Where the Tweets should be saved
    PRINT_AFTER_X = 100


    rule = gen_rule_payload(SEARCH_QUERY,
                            results_per_call=RESULTS_PER_CALL,
                            from_date=FROM_DATE,
                            to_date=TO_DATE
                            )

    rs = ResultStream(rule_payload=rule,
                    max_results=MAX_RESULTS,
                    **premium_search_args)

    with open(FILENAME, 'a', encoding='utf-8') as f:
        n = 0
        for tweet in rs.stream():
            n += 1
            if n % PRINT_AFTER_X == 0:
                print('{0}: {1}'.format(str(n), tweet['created_at']))
            json.dump(tweet, f)
            f.write('\n')


    new_tweets = []
    dates_created = []
    location = []
    user= []

    with open(FILENAME, 'rb') as f:
        for item in json_lines.reader(f):
            try:
                new_tweets.append(item['extended_tweet']['full_text'])
            except KeyError:
                new_tweets.append(item['text'])
            # record metadata for every tweet so the four lists stay aligned
            dates_created.append(item['created_at'])
            location.append(item['user']['location'])
            user.append(item['user']['id'])

    dataframe = pd.DataFrame(list(zip(user, location, dates_created, new_tweets)), 
                columns =['User', 'Location', 'date_created', 'text'])
    print(dataframe.head())
    dataframe.to_csv("tweets.csv", sep =",")
Code example #24
from searchtweets import gen_rule_payload
from searchtweets import load_credentials

premium_search_args = load_credentials("twitter_keys_fullarchive.yaml",
                                       yaml_key="search_tweets_api",
                                       env_overwrite=False)
print(premium_search_args)
query = "AAPL"
rule = gen_rule_payload(query,
                        results_per_call=100,
                        from_date="2020-10-21",
                        to_date="2020-10-28")
from searchtweets import ResultStream

rs = ResultStream(rule_payload=rule, max_results=25000, **premium_search_args)
print(rs)
import json
with open('tweets1.json', 'a', encoding='utf-8') as f:
    for tweet in rs.stream():
        json.dump(tweet, f)
        f.write('\n')
print('done')
Code example #25
        rule = gen_request_parameters(
            query=config['query'],
            results_per_call=config['results_per_call'],
            start_time=start_ts.isoformat(),
            end_time=end_ts.isoformat(),
            tweet_fields=tweetfields,
            user_fields=userfields,
            media_fields=mediafields,
            place_fields=placefields,
            expansions=expansions,
            stringify=False)

        # result stream from twitter v2 api
        rs = ResultStream(request_parameters=rule,
                          max_results=100000,
                          max_pages=1,
                          max_tweets=config['max_tweets'],
                          **search_creds)

        # number of reconnection tries
        tries = 10

        # while loop to protect against 104 error
        while True:
            tries -= 1
            # attempt retrieving tweets
            try:
                # indicate which day is getting retrieved
                print('[INFO] - Retrieving tweets from ' + str(start_ts))

                # get json response to list
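The snippet above is cut off inside its retry loop. One way such a loop is commonly completed (a sketch under assumptions, reusing `tries`, `rs` and `start_ts` from the snippet; not the project's actual code): collect the stream into a list, break on success, and back off briefly before retrying when the connection drops.

import time

tweets = []
while True:
    tries -= 1
    try:
        # indicate which day is getting retrieved
        print('[INFO] - Retrieving tweets from ' + str(start_ts))
        # get json response to list
        tweets = list(rs.stream())
        break
    except Exception as exc:
        if tries <= 0:
            raise
        print('[WARN] - Request failed ({}), retrying shortly...'.format(exc))
        time.sleep(30)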
Code example #26
    consumer_secret=API_SECRET_KEY))

with open('twitter_keys.yaml', 'w') as config_file:
    yaml.dump(config, config_file, default_flow_style=False)

import json
from searchtweets import load_credentials, gen_rule_payload, ResultStream

premium_search_args = load_credentials("twitter_keys.yaml",
                                       yaml_key="search_tweets_api",
                                       env_overwrite=False)

rule = gen_rule_payload(SEARCH_QUERY,
                        results_per_call=RESULTS_PER_CALL,
                        from_date=FROM_DATE,
                        to_date=TO_DATE)

rs = ResultStream(rule_payload=rule,
                  max_results=MAX_RESULTS,
                  **premium_search_args)

with open(FILENAME, 'a', encoding='utf-8') as f:
    n = 0
    for tweet in rs.stream():
        n += 1
        if n % PRINT_AFTER_X == 0:
            print('{0}: {1}'.format(str(n), tweet['created_at']))
        json.dump(tweet, f)
        f.write('\n')
print('done')
Code example #27
import pprint
import csv
from searchtweets import load_credentials, gen_rule_payload, ResultStream
premium_search_args = load_credentials(
    filename='D:/Code/python/workspace/LTETwitter/cred.yaml',
    yaml_key='search_tweets_api',
    env_overwrite=False)

rule = gen_rule_payload("broadbalk",
                        from_date="2010-04-01",
                        to_date="2018-02-14",
                        results_per_call=100)  # testing with a sandbox account
rs = ResultStream(rule_payload=rule,
                  max_results=500,
                  max_pages=5,
                  **premium_search_args)
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(rs)

tweets = list(rs.stream())

with open("tweets18-19.csv", "a", newline="", encoding="utf-8") as csvFile:
    writer = csv.writer(csvFile, quoting=csv.QUOTE_MINIMAL)
    for tweet in tweets:
        writer.writerow([
            tweet.created_at_datetime, tweet.favorite_count, tweet.quote_count,
            tweet.retweet_count, tweet.name, tweet.follower_count,
            tweet.geo_coordinates, tweet.profile_location, tweet.bio,
            tweet.user_id, tweet.screen_name, tweet.hashtags,
            tweet.in_reply_to_screen_name, tweet.all_text
        ])
Code example #28
                                       env_overwrite=False)

today = datetime.date.today()
print(today)

start_date = today + datetime.timedelta(-30)
print(start_date)

rule = gen_rule_payload("from:NYCASP",
                        from_date=str(start_date),
                        to_date=str(today),
                        results_per_call=500)

print(rule)

rs = ResultStream(rule_payload=rule, max_results=500, **premium_search_args)

print(rs)

tweets = rs.stream()
list_tweets = list(tweets)
[print(tweet.all_text, end="\n\n") for tweet in list_tweets[0:100]]

tweet_text = []
tweet_date = []

for tweet in list_tweets:
    tweet_text.append(tweet["text"])
    tweet_date.append(tweet["created_at"])

df = pd.DataFrame({"tweet": tweet_text, "date": tweet_date})
from searchtweets import ResultStream

rs = ResultStream(rule_payload=rule, max_results=1000, **premium_search_args)
print(rs)
Code example #30
# iterate through different dates
start_date = 201811180000
end_date = 201812160000

while start_date <= end_date:
	# generate valid json queries
	query_rule = gen_rule_payload(
		"#sarcasm lang:en",
		results_per_call=100,
		from_date=str(start_date),
		to_date=str(next_date(start_date))
		)
	#print("Query Rule:")
	#print(query_rule)

	rs = ResultStream(rule_payload=query_rule, max_results=100, **premium_search_args)
	#print(rs)

	tweets = list(rs.stream())

	# get tweets from query and save to dynamodb
	for tweet in tweets:
		text = tweet.all_text
		id = tweet.id
		created_time = tweet.created_at_datetime

		#  do not save retweets
		if isRetweet(text) or  startsWithMention(text) or containsURL(text):
			continue

		text = toLowerCase(text)