Example #1
def test_since_id():
    t = Twarc()
    for tweet in t.search('obama'):
        id = tweet['id_str']
        break
    assert id
    time.sleep(5)
    for tweet in t.search('obama', since_id=id):
        assert tweet['id_str'] > id
Example #2
def test_paging():
    # pages are 100 tweets big so if we can get 500 paging is working
    t = Twarc()
    count = 0
    for tweet in t.search('obama'):
        count += 1
        if count == 500:
            break
    assert count == 500
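# A rough, hypothetical sketch of the max_id paging the test above relies on
# (illustrative only, not twarc's actual internals): each page of up to 100
# results is requested by asking for tweets older than the lowest id seen so far.
#
#   max_id = None
#   while True:
#       params = {"q": "obama", "count": 100}
#       if max_id is not None:
#           params["max_id"] = max_id
#       statuses = api_get("search/tweets", params)  # hypothetical helper
#       if not statuses:
#           break
#       for status in statuses:
#           handle(status)                           # hypothetical callback
#       max_id = min(s["id"] for s in statuses) - 1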
Example #3
def test_search():
    count = 0
    t = Twarc()
    for tweet in t.search('obama'):
        assert tweet['id_str']
        count += 1
        if count == 10:
            break
    assert count == 10
Example #4
def test_stream():
    t = Twarc()
    count = 0
    for tweet in t.stream("obama"):
        assert tweet['id_str']
        assert tweet['text']
        count += 1
        if count == 50:
            break
    assert count == 50
Example #5
def test_max_id():
    t = Twarc()
    for tweet in t.search('obama'):
        id = tweet['id_str']
        break
    assert id
    time.sleep(5)
    count = 0
    for tweet in t.search('obama', max_id=id):
        count += 1
        assert tweet['id_str'] <= id
        if count > 100:
            break
Example #6
def test_hydrate():
    ids = [
        "501064188211765249", "501064196642340864", "501064197632167936",
        "501064196931330049", "501064198005481472", "501064198009655296",
        "501064198059597824", "501064198513000450", "501064180468682752",
        "501064199142117378", "501064171707170816", "501064200186118145",
        "501064200035516416", "501064201041743872", "501064201251880961",
        "501064198973960192", "501064201256071168", "501064202027798529",
        "501064202245521409", "501064201503113216", "501064202363359232",
        "501064202295848960", "501064202380115971", "501064202904403970",
        "501064203135102977", "501064203508412416", "501064203516407810",
        "501064203546148864", "501064203697156096", "501064204191690752",
        "501064204288540672", "501064197396914176", "501064194309906436",
        "501064204989001728", "501064204980592642", "501064204661850113",
        "501064205400039424", "501064205089665024", "501064206666702848",
        "501064207274868736", "501064197686296576", "501064207623000064",
        "501064207824351232", "501064208083980290", "501064208277319680",
        "501064208398573568", "501064202794971136", "501064208789045248",
        "501064209535614976", "501064209551994881", "501064141332029440",
        "501064207387742210", "501064210177331200", "501064210395037696",
        "501064210693230592", "501064210840035329", "501064211855069185",
        "501064192024006657", "501064200316125184", "501064205642903552",
        "501064212547137536", "501064205382848512", "501064213843169280",
        "501064208562135042", "501064214211870720", "501064214467731457",
        "501064215160172545", "501064209648848896", "501064215990648832",
        "501064216241897472", "501064215759568897", "501064211858870273",
        "501064216522932227", "501064216930160640", "501064217667960832",
        "501064211997274114", "501064212303446016", "501064213675012096",
        "501064218343661568", "501064213951823873", "501064219467341824",
        "501064219677044738", "501064210080473088", "501064220415229953",
        "501064220847656960", "501064222340423681", "501064222772445187",
        "501064222923440130", "501064220121632768", "501064222948593664",
        "501064224936714240", "501064225096499201", "501064225142624256",
        "501064225314185216", "501064225926561794", "501064226451259392",
        "501064226816143361", "501064227302674433", "501064227344646144",
        "501064227688558592", "501064228288364546", "501064228627705857",
        "501064229764751360", "501064229915729921", "501064231304065026",
        "501064231366983681", "501064231387947008", "501064231488200704",
        "501064231941570561", "501064232188665856", "501064232449114112",
        "501064232570724352", "501064232700350464", "501064233186893824",
        "501064233438568450", "501064233774510081", "501064235107897344",
        "501064235175399425", "501064235456401410",
    ]
    t = Twarc()
    count = 0
    for tweet in t.hydrate(iter(ids)):
        assert tweet['id_str']
        count += 1
    assert count > 100 # may need to adjust as these might get deleted
Example #7
	def __init__(self, search_terms):

		logging.info("initializing TwitterStream Kafka")

		# globals to all instances
		self.t = Twarc(localConfig.client_key, localConfig.client_secret, localConfig.access_token, localConfig.access_token_secret)
		self.search_terms = search_terms
Example #8
def test_max_and_since_ids():
    t = Twarc()
    max_id = since_id = None
    count = 0
    for tweet in t.search('obama'):
        count += 1
        if not max_id:
            max_id = tweet['id_str']
        since_id = tweet['id_str']
        if count > 500:
            break
    count = 0
    for tweet in t.search('obama', max_id=max_id, since_id=since_id):
        count += 1
        assert tweet['id_str'] <= max_id
        assert tweet['id_str'] > since_id
Example #9
 def _create_twarc(self):
     self.twarc = Twarc(self.message["credentials"]["consumer_key"],
                        self.message["credentials"]["consumer_secret"],
                        self.message["credentials"]["access_token"],
                        self.message["credentials"]["access_token_secret"],
                        http_errors=self.http_errors,
                        connection_errors=self.connection_errors)
Example #10
class TwitterStreamKafka(object):

	# WORKING TWITTER HOSE
	def __init__(self, search_terms):

		logging.info("initializing TwitterStream Kafka")

		# globals to all instances
		self.t = Twarc(localConfig.client_key, localConfig.client_secret, localConfig.access_token, localConfig.access_token_secret)
		self.search_terms = search_terms

	# method to capture twitter stream
	def captureStream(self):
		for tweet in self.t.stream(",".join(self.search_terms)):
			result = producer.send_messages("betweezered", json.dumps(tweet))
Example #11
    'may': '05',
    'june': '06',
    'july': '07',
    'august': '08',
    'september': '09',
    'october': '10',
    'november': '11',
    "december": '12'
}

# Initializing Twitter API keys
consumer_key = "IGMYSPiWpx0qLEjhYDrJqRuYp"
consumer_secret = "e5ypjtz2Xn49VsjPulIhrVEUduC0id1roNvzoqGfpy6CCRhBgs"
access_token = "1140054025791926272-sxzwfB5oCl8EBEPdhgewfuNP1oCemG"
access_token_secret = "0tIMUBEeurg9Qmd6e076SLoSbMK7opLaWaorkhpqa4Tn1"
t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret)

# tweet_folder = "tweet_folder"
# last_week_folder = "last_week_folder"
# this_week_folder = "this_week_folder"
# news_path = r"../COVID19下集体理性量化分析与思考/数据/recovery-news-data.csv"
# result_path = "collective_rationalty"


# Check whether the folders for storing tweet data exist, and create them if not
def makedir():
    if not os.path.exists(tweet_folder):
        os.makedirs(tweet_folder)
    if not os.path.exists(last_week_folder):
        os.makedirs(last_week_folder)
    if not os.path.exists(this_week_folder):
Example #12
def daterange(start_date, end_date):
    for n in range(int((end_date - start_date).days)):
        yield start_date + timedelta(n)


start_date = date(2020, 4, 12)
end_date = date(
    2020, 7, 13
)  # end date; alternatively datetime.date(datetime.now()) (dynamic, changes each day, but depends on the timezone)

OAUTH_TOKEN = "1029186921438883845-AQjxqWPxZlURJ47eWFqRFRkSCkDPFh"
OAUTH_TOKEN_SECRET = "YgxeTz31ItxBrJubvwZpZaqa57LLhWRKLMM4t82pdEtsv"
CONSUMER_KEY = "Y70ckEEL2TdQzyq9NqI5RriiB"
CONSUMER_SECRET = "YWQJJlJyzXxkaPXCEdFrANgHFf4Dyd0PtkT4f5TvXFUJLUtpvU"
t = Twarc(CONSUMER_KEY, CONSUMER_SECRET, OAUTH_TOKEN, OAUTH_TOKEN_SECRET)

for singledate in daterange(start_date, end_date):
    after = singledate + timedelta(1)
    filename = singledate.strftime("%B%-d").lower() + "_" + after.strftime(
        "%B%-d").lower() + ".csv"
    if (singledate.strftime("%B%-d").lower() != "march29"):
        with open(filename, 'r') as csvfile:
            data = csv.reader(csvfile, delimiter=' ', quotechar='|')
            totaldata = pd.read_csv(filename, header=None)
            dataframe = totaldata[0]
            sentimentstuff = totaldata[1]
            numberfile = "number_corona_tweets_state" + singledate.strftime(
                "%B%-d").lower() + ".txt"
            readyfile = "sentiment" + singledate.strftime(
                "%B%-d").lower() + ".csv"
Example #13
from twarc import Twarc
import json
import time
from simplesentiment.stence import sentanceanalyser

t = Twarc('EZ4MUdjIR22V8y6TDia6vRrEf',
          'ARY5AgvJKvRWfb6nPeTugnvyKDY8VdQh0HdHpYLhcrUX2AvBdz',
          '1103185799841902592-g6OFAdGgV4vYkeg5KCK2gZwCmI3XzH',
          '6IW8bDnxeBZwLEDNa4GAEBWzvgDkAkh7bRRVrV4xcSfpc')


class TwarcCustom:
    """ getting list of top n tweets reply in a list
    by providing the screen_name and count prams
    e.g: if n=2 for some screen_name then for 2 tweets you will get
    all the replies of two tweets as list wise in plain text.
    """
    def getTweetRepliesList(self, screen_name=None, count=1, limit=100):
        tweet_reply = []
        timeline = t.timeline(screen_name=screen_name, count=count)
        for tw in timeline:
            tweet = t.tweet(tw['id_str'])
            tweet_text = ""
            for index, tweet in zip(range(limit), t.replies(tweet)):
                tweet_text += tweet['full_text'] + " "
            tweet_reply.append(tweet_text)
        return tweet_reply
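    # A hedged usage sketch for the method above (the screen_name is purely
    # illustrative, and the module-level Twarc credentials must be valid):
    #
    #   tc = TwarcCustom()
    #   replies = tc.getTweetRepliesList(screen_name="some_account", count=2, limit=50)
    #   for text in replies:
    #       print(text[:140])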

    """ this will return tweets reply as single text of all tweets """

    def getTweetRepliesText(self, screen_name=None, count=1, pages=1):
Example #14
# Dependencies
from twarc import Twarc
import tweepy
import utils
import keys
import sys

# Set up dependencies for Twitter APIs
twarc = Twarc(keys.consumer_key, keys.consumer_secret, keys.access_token,
              keys.access_token_secret)

auth = tweepy.OAuthHandler(keys.consumer_key, keys.consumer_secret)
auth.set_access_token(keys.access_token, keys.access_token_secret)

api = tweepy.API(auth)

arguments = sys.argv  # Get parameters from command line

if len(arguments) > 1:
    # If there are any arguments, join them with OR in between (skipping the script name)
    hashtags = ' OR '.join(arguments[1:])
else:
    # If no arguments don't run
    print("No arguments passed")
    sys.exit(0)

# Search Twitter for tweets containing the hashtags
tweets = twarc.search(hashtags)

for tweet in tweets:
    user = tweet['user']
Example #15
import json
import datetime
from twarc import Twarc

# Collection end
period_end = datetime.datetime(2017, 12, 31, 23, 59, 59, 999999)

# Twitter API keys - geobgu2
t = Twarc(
  'JA5KZiEuU8HDIFDtLXwkHCpdx', 
  'NdGoBYXuYHbHOAInNHHumjz0xeCp8zEYfbm0RW0dzpvcRY8Ovc', 
  '2782755278-ARD36i5dPBU6fxRdgvomZoxuCOI3ewVVGPizZCf', 
  'ceN8O8yIVV2C7o6CJyLYYo3CNIm48Tnojpxj69pqqv36u'
  )

# Twitter stream request
t = t.filter(locations="-72.21437,41.19034,-69.64939,43.30924")

# Collect tweets
while datetime.datetime.now() < period_end: # Loop until collection period ends
    day_start = datetime.datetime.now()
    day_end = datetime.datetime(day_start.year, day_start.month, day_start.day, day_start.hour, 59, 59, 999999)
    fh = open("boston_geobgu2_" + day_start.strftime("%Y-%m-%d_%H:%M:%S") + ".json", "w")
    #fh.write("[")
    for tweet in t: # Loop until hour ends
        x = tweet
        try:
            if x["geo"] != None:
                print(x["text"])
                fh.write(json.dumps(x))
                fh.write("\n")
Example #16
                    if "media_url" in item:
                        murl = item["media_url"]
                        if murl not in urls:
                            urls.append(murl)
    return urls


# Main starts here
if __name__ == '__main__':
    # Add your own API key values here
    fsecret = open('/Users/sara/twittersecrets.txt', 'r')
    secrets = fsecret.readline()
    access_token, access_token_secret, consumer_key, consumer_secret = \
        [x.strip() for x in secrets.split(',')]

    twarc = Twarc(consumer_key, consumer_secret, access_token,
                  access_token_secret)

    # Check that search terms were provided at the command line
    target_list = []
    if (len(sys.argv) > 1):
        target_list = sys.argv[1:]
    else:
        print("No search terms provided. Exiting.")
        sys.exit(0)

    num_targets = len(target_list)
    for count, target in enumerate(target_list):
        print(
            str(count + 1) + "/" + str(num_targets) +
            " searching on target: " + target)
        # Create a separate save directory for each search query
Example #17
    if month == 0:
        month += 1
    else:
        month += 2
    last_day = calendar.monthrange(2020, month)[1]
    start = 1
    end = 10
    if i % 3 == 1:
        start = 11
        end = 20
    elif i % 3 == 2:
        start = 21
        end = last_day
    t = Twarc(api_keys[i][0],
              api_keys[i][1],
              api_keys[i][2],
              api_keys[i][3],
              app_auth=True)
    args.append([t, month, start, end])


def get_and_save_data(id_col, t):
    """
    Use configured Twarc t to get full tweets given tweet ids id_col 
    and save tweets in database.
    """
    for tweet in t.hydrate(id_col):
        x = None
        try:
            x = mycol.insert_one(tweet)
        except:
Example #18
     searches = targets
 else:
     print "Please add search targets in config/searcher_targets.txt"
     sys.exit(0)
 print "Search targets: " + str(len(searches))
 script_start_time_str = time.strftime("%Y-%m-%d %H:%M:%S")
 output_dir_base = output_dir
 current_label = ""
 data = {}
 associations = {}
 frequencies = {}
 max_s = len(searches)
 for count, search in enumerate(searches):
     acct_name, consumer_key, consumer_secret, access_token, access_token_secret = get_account_sequential(
     )
     t = Twarc(consumer_key, consumer_secret, access_token,
               access_token_secret)
     print "Signing in as: " + acct_name
     search = "\"" + search + "\""
     print(str(count) + "/" + str(max_s) + " searching: " + search)
     current_label = "search_" + str(count)
     output_dir = output_dir_base + str(count) + "/"
     if not os.path.exists(output_dir):
         print("Created directory: " + output_dir)
         os.makedirs(output_dir)
     fn = os.path.join(output_dir, "target.txt")
     with open(fn, "w") as f:
         f.write(search + "\n")
     dump_filename = output_dir + "raw.json"
     dump_file_handle = open(dump_filename, "a")
     data = {}
     set_counters()
Example #19
weights['19'] = 3150
weights['20'] = 3150
weights['21'] = 3150
weights['22'] = 3150
weights['23'] = 3150


def sample_file(fileName, numSamples):
    df = pd.read_csv(fileName, names=["ids"])
    numSamples = len(df) if len(df) < numSamples else numSamples
    ids = df['ids'].sample(n=numSamples, random_state=1)
    return ids.values


# Use Twarc to extract COVID-19 related tweets
twarc = Twarc()
tmp_df = pd.read_csv(LOG_FILE, names=["file"])
traversed = list(tmp_df.file.values)
with open(LOG_FILE, 'a+') as logf:
    for file in os.listdir(PATH):
        if file not in traversed:
            file_postfix = str(file).split(".")[0][-2:]
            sample_size = weights[file_postfix]
            print("Extract from file: ", file, "for ", sample_size,
                  " samples:")
            ids = sample_file(PATH + file, sample_size)
            output_file_name = str(file).split(".")[0] + "_contents.txt"
            # log
            w_ = csv.writer(logf)
            w_.writerow([file])
            # extract content
Example #20
from os.path import dirname, realpath, join, exists
from twarc import Twarc


twarc = Twarc("22GyvUC4Jg89Eh1PuKRh3mwRo", 
          "m75gOSwIccfzYLWxwMCpHEldxgzYP83pTOSqAFbumQ5B6OF1vC",
          "852540250467467266-NoSAf6ZXmWZnr01CdUIfYBP5Z4cLZGJ",
          "kWP6L9F4YCUAsvwuaruuCUPMc4JqAE2jhJA8bhuuQSCSu")

TWEETS_TO_CRAWL = 10000
DATA_DIR = join(dirname(dirname(realpath(__file__))), "data")
TWEET_DATAFILE = join(DATA_DIR, "tweets.json")
USER_DATAFILE = join(DATA_DIR, "following.json")
MODEL_GRAPH_FILE = join(DATA_DIR, "graph.npy")
USER_DATA = join(DATA_DIR, "users.json")
RATINGS_FILE = join(DATA_DIR, "ratings.json")
USERNAMES_FILE = join(DATA_DIR, "usernames.json")
VERIFIED_USERS = join(DATA_DIR, "verified.json")
TOPUSERS_FILE = join(DATA_DIR, "topUsers.json")
TOPUSERS_OLD_FILE = join(DATA_DIR, "topUsersOld.json")
FOLLOW_FILE = join(DATA_DIR, "follow.json")
RETWEET_FILE = join(DATA_DIR, "retweets.json")


USER_CSV = join(DATA_DIR, "users.csv")
TWEETs_CSV = join(DATA_DIR, "tweets.csv")
USER_TWEET_CSV = join(DATA_DIR, "userTweetRelationship.csv")
TWEET_TWEET_CSV = join(DATA_DIR, "tweetTweetRelationship.csv")
RATINGS_CSV = join(DATA_DIR, "ratings.csv")

PLOT_FILE = join(DATA_DIR, "plot.png")
Example #21
with open("config.yml", 'r') as ymlfile:
    cfg = yaml.safe_load(ymlfile)

for section in cfg:
    print(section)

consumer_key = cfg['twitter']['consumer_key']
consumer_secret = cfg['twitter']['consumer_secret']
access_token = cfg['twitter']['access_token']
access_token_secret = cfg['twitter']['access_token_secret']


def ids():
    for id in open("brexit_tweet_ids.csv"):
        yield id


t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret)

keys = [
    "text", "id", "created_at", "favorite_count", "lang", "place",
    "coordinates", "user", "entities", "geo", "retweeted", "retweet_count"
]
with open('tweets123.txt', 'w') as outfile:
    for tweet in t.hydrate(ids()):
        tweet1 = {filter_key: tweet[filter_key] for filter_key in keys}
        values_json = json.dumps(tweet1, sort_keys=True)
        outfile.write(values_json + "\n")
        print(tweet1['text'])
Example #22
This can be modified to do a lot more: it can be integrated with wordcloud.py to generate wordclouds on the fly,
or used to perform sentiment analysis with any text-analysis API such as Aylien.
"""

from twarc import Twarc
import json
import fileinput
import sys

print (" # Loading keys")

consumer_key = 'INSERT YOUR CONSUMER KEY HERE'
consumer_secret = 'INSERT YOUR CONSUMER SECRET HERE'
access_token = 	'INSERT YOUR TOKEN HERE' 
access_token_secret = 'INSERT YOUR TOKEN SECRET HERE'
twarc_auth = Twarc(consumer_key, consumer_secret, access_token, access_token_secret)

print (" # Reading search terms")

with open('tweet_terms.txt','r') as tweet_terms_file_content:
	my_tweet_terms = [line.strip() for line in tweet_terms_file_content]
	print (" # Search terms loaded")
	
	if len(my_tweet_terms) > 0:
		twitter_query = ",".join(my_tweet_terms)
		print " # Search terms: " + twitter_query
	
		for tweet in twarc_auth.filter(track = twitter_query):
			with open('data_dump.json', 'a') as json_output_file:
				json.dump(tweet, json_output_file, indent=4, sort_keys=True)
	else:
Example #23
def main(warc_file):
    twitter = Twarc()
    out = csv.writer(sys.stdout)
    out.writerow(json2csv.get_headings())
    for tweet in twitter.hydrate(tweet_ids(warc_file)):
        out.writerow(json2csv.get_row(tweet))
Example #24
class TwitterRelationships():
    # Cut-down code to get twitter relationships for a set of hashtags.
    # Adapted from https://labsblog.f-secure.com/2018/02/16/searching-twitter-with-twarc/

    def __init__(self, secretsfile='/Users/sara/twittersecrets.txt'):

        fsecret = open(secretsfile, 'r')
        secrets = fsecret.readline()
        access_token, access_token_secret, consumer_key, consumer_secret = \
            [x.strip() for x in secrets.split(',')]

        self.twarc = Twarc(consumer_key, consumer_secret, access_token,
                           access_token_secret)

    # Helper functions for saving csv and formatted txt files
    def write_data(self, data, filename, filetype='txt'):
        with io.open(filename, "w", encoding="utf-8") as handle:
            if filetype == 'txt':
                for item, count in data.most_common():
                    handle.write(str(count) + "\t" + item + "\n")

            else:  #write to csv
                handle.write(u"Source,Target,Weight\n")
                for source, targets in sorted(data.items()):
                    for target, count in sorted(targets.items()):
                        if source != target and source is not None and target is not None:
                            handle.write(source + u"," + target + u"," +
                                         str(count) + u"\n")
        return

    # Returns the screen_name of the user retweeted, or None
    def retweeted_user(self, status):
        if "retweeted_status" in status:
            orig_tweet = status["retweeted_status"]
            if "user" in orig_tweet and orig_tweet["user"] is not None:
                user = orig_tweet["user"]
                if "screen_name" in user and user["screen_name"] is not None:
                    return user["screen_name"]
        return

    # Returns a list of screen_names that the user interacted with in this Tweet
    def get_interactions(self, status):
        interactions = []
        if "in_reply_to_screen_name" in status:
            replied_to = status["in_reply_to_screen_name"]
            if replied_to is not None and replied_to not in interactions:
                interactions.append(replied_to)

        if "retweeted_status" in status:
            orig_tweet = status["retweeted_status"]
            if "user" in orig_tweet and orig_tweet["user"] is not None:
                user = orig_tweet["user"]
                if "screen_name" in user and user["screen_name"] is not None:
                    if user["screen_name"] not in interactions:
                        interactions.append(user["screen_name"])

        if "quoted_status" in status:
            orig_tweet = status["quoted_status"]
            if "user" in orig_tweet and orig_tweet["user"] is not None:
                user = orig_tweet["user"]
                if "screen_name" in user and user["screen_name"] is not None:
                    if user["screen_name"] not in interactions:
                        interactions.append(user["screen_name"])

        if "entities" in status:
            entities = status["entities"]
            if "user_mentions" in entities:
                for item in entities["user_mentions"]:
                    if item is not None and "screen_name" in item:
                        mention = item['screen_name']
                        if mention is not None and mention not in interactions:
                            interactions.append(mention)
        return interactions

    # Returns a list of hashtags found in the tweet
    def get_hashtags(self, status):
        hashtags = []
        if "entities" in status:
            entities = status["entities"]
            if "hashtags" in entities:
                for item in entities["hashtags"]:
                    if item is not None and "text" in item:
                        hashtag = item['text']
                        if hashtag is not None and hashtag not in hashtags:
                            hashtags.append(hashtag)
        return hashtags

    # Returns a list of URLs found in the Tweet
    def get_urls(self, status):
        urls = []
        if "entities" in status:
            entities = status["entities"]
            if "urls" in entities:
                for item in entities["urls"]:
                    if item is not None and "expanded_url" in item:
                        url = item['expanded_url']
                        if url is not None and url not in urls:
                            urls.append(url)
        return urls

    def get_image_urls(self, status):
        # Returns the URLs to any images found in the Tweet
        urls = []
        if "entities" in status:
            entities = status["entities"]
            if "media" in entities:
                for item in entities["media"]:
                    if item is not None:
                        if "media_url" in item:
                            murl = item["media_url"]
                            if murl not in urls:
                                urls.append(murl)
        return urls

    def fetch_images(self):
        # Iterate through image URLs, fetching each image if we haven't already
        pictures_dir = os.path.join(self.save_dir,
                                    self.dataname + '_' + "images")
        if not os.path.exists(pictures_dir):
            print("Creating directory: " + pictures_dir)
            os.makedirs(pictures_dir)
        for url in self.all_image_urls:
            m = re.search(r"^http://pbs\.twimg\.com/media/(.+)$", url)
            if m is not None:
                filename = m.group(1)
                print("Getting picture from: " + url)
                save_path = os.path.join(pictures_dir, filename)
                if not os.path.exists(save_path):
                    response = requests.get(url, stream=True)
                    with open(save_path, 'wb') as out_file:
                        shutil.copyfileobj(response.raw, out_file)
                    del response

        return

    def writedf(self, dataset, name, columns):
        filename = os.path.join(self.save_dir, self.dataname + '_' + name)
        with io.open(filename, "w", encoding="utf-8") as handle:
            handle.write('\t'.join(columns) + u"\n")
            for row in dataset:
                handle.write('\t'.join(row) + u"\n")
        return

    def save_datasets(self, fetch_images=True):

        csv_outputs = {
            "user_user_graph.csv": self.user_user_graph,
            "user_hashtag_graph.csv": self.user_hashtag_graph,
            "hashtag_hashtag_graph.csv": self.hashtag_hashtag_graph
        }
        for name, dataset in csv_outputs.items():
            filename = os.path.join(self.save_dir, self.dataname + '_' + name)
            self.write_data(dataset, filename, 'csv')

        text_outputs = {
            "hashtags.txt": self.hashtag_frequency_dist,
            "influencers.txt": self.influencer_frequency_dist,
            "mentioned.txt": self.mentioned_frequency_dist,
            "urls.txt": self.url_frequency_dist
        }
        for name, dataset in text_outputs.items():
            filename = os.path.join(self.save_dir, self.dataname + '_' + name)
            self.write_data(dataset, filename, 'txt')

        self.writedf(self.url_refs, "url_refs.csv", ['url', 'tweeturl'])
        self.writedf(self.image_refs, "image_refs.csv", ['url', 'tweeturl'])
        self.writedf(self.tweets, "tweets.csv",
                     ['url', 'screen_name', 'id', 'created_at', 'text'])

        if fetch_images:
            self.fetch_images()

        return

    def make_directories(self, target, rootdir='../data/twitter'):
        # Create a separate save directory for each search query
        # Since search queries can be a whole sentence, we'll check the length
        # and truncate the directory name if the query is overly long

        self.dataname = datetime.now().strftime(
            "%Y%m%d%H%M%S") + '_' + target.replace(" ", "_")

        self.save_dir = rootdir
        if not os.path.exists(rootdir):
            os.makedirs(rootdir)
        if len(target) < 30:
            self.save_dir += "/" + self.dataname
        else:
            # Overly long queries: keep the timestamp plus a truncated slug
            self.save_dir += "/" + self.dataname[:30]
        if not os.path.exists(self.save_dir):
            print("Creating directory: " + self.save_dir)
            os.makedirs(self.save_dir)

        return

    def get_target_data(self, target):

        # Variables for capturing stuff
        self.tweets_captured = 0
        self.influencer_frequency_dist = Counter()
        self.mentioned_frequency_dist = Counter()
        self.hashtag_frequency_dist = Counter()
        self.url_frequency_dist = Counter()
        self.user_user_graph = {}
        self.user_hashtag_graph = {}
        self.hashtag_hashtag_graph = {}
        self.all_image_urls = []
        self.tweets = []
        self.tweet_count = 0
        self.url_refs = []
        self.image_refs = []

        # Start the search
        for status in self.twarc.search(target):

            # Output some status as we go, so we know something is happening
            sys.stdout.write("\r")
            sys.stdout.flush()
            sys.stdout.write("Collected " + str(self.tweet_count) + " tweets.")
            sys.stdout.flush()
            self.tweet_count += 1

            screen_name = None
            if "user" in status:
                if "screen_name" in status["user"]:
                    screen_name = status["user"]["screen_name"]

            retweeted = self.retweeted_user(status)
            if retweeted is not None:
                self.influencer_frequency_dist[retweeted] += 1
            else:
                self.influencer_frequency_dist[screen_name] += 1

            # Tweet text can be in either "text" or "full_text" field...
            text = None
            if "full_text" in status:
                text = status["full_text"]
            elif "text" in status:
                text = status["text"]

            id_str = None
            if "id_str" in status:
                id_str = status["id_str"]

            # Assemble the URL to the tweet we received...
            tweet_url = None
            if id_str is not None and screen_name is not None:
                tweet_url = "https://twitter.com/" + screen_name + "/status/" + id_str
            # if tweet_url is not None and text is not None:
            #     self.tweets[tweet_url] = text
            created_at = None
            if "created_at" in status:
                created_at = status["created_at"]
            self.tweets += [[tweet_url, screen_name, id_str, created_at,
                             text]]  #capture everything

            # Record mapping graph between users
            interactions = self.get_interactions(status)
            if interactions is not None:
                for user in interactions:
                    self.mentioned_frequency_dist[user] += 1
                    if screen_name not in self.user_user_graph:
                        self.user_user_graph[screen_name] = {}
                    if user not in self.user_user_graph[screen_name]:
                        self.user_user_graph[screen_name][user] = 1
                    else:
                        self.user_user_graph[screen_name][user] += 1

            # Record mapping graph between users and hashtags
            hashtags = self.get_hashtags(status)
            if hashtags is not None:
                if len(hashtags) > 1:
                    hashtag_interactions = []

                    # This code creates pairs of hashtags in situations where multiple
                    # hashtags were found in a tweet
                    # This is used to create a graph of hashtag-hashtag interactions
                    for comb in combinations(sorted(hashtags), 2):
                        hashtag_interactions.append(comb)
                    if len(hashtag_interactions) > 0:
                        for inter in hashtag_interactions:
                            item1, item2 = inter
                            if item1 not in self.hashtag_hashtag_graph:
                                self.hashtag_hashtag_graph[item1] = {}
                            if item2 not in self.hashtag_hashtag_graph[item1]:
                                self.hashtag_hashtag_graph[item1][item2] = 1
                            else:
                                self.hashtag_hashtag_graph[item1][item2] += 1
                    for hashtag in hashtags:
                        self.hashtag_frequency_dist[hashtag] += 1
                        if screen_name not in self.user_hashtag_graph:
                            self.user_hashtag_graph[screen_name] = {}
                        if hashtag not in self.user_hashtag_graph[screen_name]:
                            self.user_hashtag_graph[screen_name][hashtag] = 1
                        else:
                            self.user_hashtag_graph[screen_name][hashtag] += 1

            urls = self.get_urls(status)
            if urls is not None:
                for url in urls:
                    self.url_refs += [[url, tweet_url]]
                    self.url_frequency_dist[url] += 1

            image_urls = self.get_image_urls(status)
            if image_urls is not None:
                for url in image_urls:
                    self.image_refs += [[url, tweet_url]]
                    if url not in self.all_image_urls:
                        self.all_image_urls.append(url)

        self.save_datasets(fetch_images=True)

        return
Example #25
from twarc import Twarc
import json

#input twitter credentials
consumer_key = '*********'
consumer_secret = '*********'
access_token = '*********'
access_token_secret = '*********'

t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret)
data = []

for tweet in t.hydrate(open('../input_files/ids.txt')):
    data.append(json.dumps(tweet))

with open('output.json', 'w') as outfile:
    outfile.write("\n".join(data) + '\n')
Example #26
# Twitter auth for downloading tweets
CONSUMER_KEY = os.environ.get("TWITTER_CONSUMER_KEY")
CONSUMER_SECRET = os.environ.get("TWITTER_CONSUMER_SECRET")
ACCESS_TOKEN = os.environ.get("TWITTER_ACCESS_TOKEN")
ACCESS_TOKEN_SECRET = os.environ.get("TWITTER_ACCESS_TOKEN_SECRET")

# Concat and read all the CSVs
dir1 = "data/twitter-framing-master/congressional_tweets_dataset_2017/unlabeled/"
dir2 = "data/twitter-framing-master/congressional_tweets_dataset_2017/labeled/"
csv_files = glob.glob(os.path.join(dir1, "*.csv")) + glob.glob(
    os.path.join(dir2, "*.csv"))
HEADERS = [
    "tweet_id", "issue1", "issue2", "frame1", "frame2", "frame3", "party", "ts"
]
all_df = pd.concat(
    (pd.read_csv(f, names=HEADERS, header=None) for f in csv_files),
    ignore_index=True)

t = Twarc(CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
tweet_texts = {}
for tweet in t.hydrate(all_df["tweet_id"]):
    tweet_texts[tweet["id"]] = tweet["full_text"]

text_df = pd.DataFrame(tweet_texts, index=[0]).transpose().rename(columns={
    "index": "tweet_id",
    0: "text"
})
all_df = all_df.set_index("tweet_id")
joined = all_df.join(text_df)
joined.to_pickle("data/tweets.pkl")
Example #27
from twarc import Twarc

tw = Twarc()
#get training data
for tweet in tw.search("covid-19", lang='en'):
    try:
        screen_name = None
        if "screen_name" in tweet["user"]:
            screen_name = tweet["user"]["screen_name"]
        id_str = tweet["id_str"]
        tweet_url = None
        if "id_str" != None and "screen_name" != None:
            tweet_url = "https://twitter.com/" + screen_name + "/status/" + id_str
        #put training data into a txt file
        with open("trainingcovid-19.txt", "a+") as f:
            # Move read cursor to the start of file.
            f.seek(0)
            # If file is not empty then append '\n'
            data = f.read(100)
            if len(data) > 0:
                f.write("\n")
            # Append text at the end of file
            f.write(tweet['full_text'])
            f.write("\n")
            f.write(tweet_url)

    except UnicodeEncodeError:
        print("UnicodeEncodeError in finding training data")

#now we have to manually sort training data
Example #28
from bottle import run, route, get, post, request, template, static_file
from twarc import Twarc
import pandas as pd
t = Twarc("JNaw7CRIGnQWxHH3C6tcpF0fP",
          "1opF4IfXrtzcUPOJUvnSr4wXbYpVGEJ8J4oBHAzEqRxV1p9FVO",
          "1055391684354203648-bmiuojBuJ8S0a4cQEGErobfaPVMIQV",
          "5R457jy32zTCVtwlQkZCKUtM9mMjgod9fw02g6zNWCOzW")
twdata = None


@get('/get_details')
def get_detail():

    return '''<!doctype html>
        <html>
        <head>
        <title>twitter</title>
        <style>
        body{
        background-image:url("https://thetrendingprof.com/wp-content/uploads/2013/11/twitter.jpg");
        background-size: 1300px 800px;
        background-repeat:no-repeat;
        } 
        #rcorners3{
        border-radius: 80px 0px;
        background-image:url("http://www.hdwallpapers10.com/wp-content/uploads/2017/05/Black%20and%20White%20abstract%20Background%20Full%20HD-623x623.png");
        padding: 20px; 
        width: 500px;
        height: 200px;
        opacity:0.8; 
        }
Example #29
from twarc import Twarc
import pprint
import json

consumer_key = "2NBPNFml9TtV3ValyhgZqP4ch"
consumer_secret = "qzCNGbr5I5vD2GAps7gdsQRNW4GbmlhODp0BokqFgCzLw2TjjV"
access_token = "931008641255084032-rMD6zn8esls7S1z4UiebC52Tb0gp8BM"
access_token_secret = "kpxBObeQfcpqbU8EikrionXFa1NbYpstYwPGA542av7K3"

output = open("sample1.json", 'w')
t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret)
hydrated = []

count = 0
for tweet in t.hydrate(open('representatives.txt')):
    if count > 10000:
        break

    count += 1
    hydrated.append(tweet)

    if count % 1000 == 0:
        output.write(json.dumps(hydrated))

print("done!")
print(count, " tweets pulled.")
output.write(json.dumps(hydrated))
Example #30
# Read in necessary libraries and Packages
import os
import pandas as pd
import json
import numpy as np
import matplotlib.pyplot as plt
from shapely.geometry import Point, Polygon
import geopandas as gpd
from twarc import Twarc

# Pass in credentials so I can connect to API
t = Twarc(key)

# The JSON file has thousands of JSON objects in it, so we need to first open the file,
# loop through each line, and use the json.loads() function to extract each object
tweets = []
with open('tweets.json') as f:
    for line in f:
        tweets.append(json.loads(line))

len(tweets)

# Printing out the first five tweets in the file
[print(tweets[i]['full_text'], '\n\n') for i in range(5)]

# If a Tweet was retweeted, the text may be shortened. For example, in this tweet below the 'full text'
# is actually cut short, but in the retweeted status we can see the full text.
print(tweets[27]['user']['location'],
      tweets[27]['full_text']), tweets[27]['retweeted_status']['full_text']
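# A hedged helper capturing that preference (standard v1.1 tweet fields; this
# function is an addition for illustration, not part of the original script):
def full_tweet_text(tweet):
    # Prefer the untruncated text of the original tweet when this is a retweet.
    if "retweeted_status" in tweet:
        return tweet["retweeted_status"].get("full_text", tweet["full_text"])
    return tweet["full_text"]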
"""### Locations from Tweets"""
Example #31
#!/usr/bin/env python3

#
# Parts of code taken from stackoverflow
#

import gzip
import json
import requests

from tqdm import tqdm
from twarc import Twarc
from pathlib import Path

twarc = Twarc()

url = "https://drive.google.com/file/d/1COJ1zrJE-acz0yZssIljRSAPyIRtS2EC/view?usp=sharing"
r = requests.get(url)


def reader_generator(reader):
    b = reader(1024 * 1024)
    while b:
        yield b
        b = reader(1024 * 1024)


def raw_newline_count(fname):
    f = open(fname, 'rb')
    f_gen = reader_generator(f.raw.read)
    return sum(buf.count(b'\n') for buf in f_gen)
Example #32
class TwitterHarvester(BaseHarvester):
    def __init__(self, working_path, stream_restart_interval_secs=30 * 60, mq_config=None, debug=False,
                 connection_errors=5, http_errors=5, debug_warcprox=False, tries=3):
        BaseHarvester.__init__(self, working_path, mq_config=mq_config,
                               stream_restart_interval_secs=stream_restart_interval_secs,
                               debug=debug, debug_warcprox=debug_warcprox, tries=tries)
        self.twarc = None
        self.connection_errors = connection_errors
        self.http_errors = http_errors
        self.extract_media = False
        self.extract_web_resources = False
        self.extract_user_profile_images = False

    def harvest_seeds(self):
        # Create a twarc
        self._create_twarc()

        # Get harvest extract options.
        self.extract_media = self.message.get("options", {}).get("media", False)
        self.extract_web_resources = self.message.get("options", {}).get("web_resources", False)
        self.extract_user_profile_images = self.message.get("options", {}).get("user_images", False)

        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.search()
        elif harvest_type == "twitter_filter":
            self.filter()
        elif harvest_type == "twitter_sample":
            self.sample()
        elif harvest_type == "twitter_user_timeline":
            self.user_timeline()
        else:
            raise KeyError

    def _create_twarc(self):
        self.twarc = Twarc(self.message["credentials"]["consumer_key"],
                           self.message["credentials"]["consumer_secret"],
                           self.message["credentials"]["access_token"],
                           self.message["credentials"]["access_token_secret"],
                           http_errors=self.http_errors,
                           connection_errors=self.connection_errors)

    def search(self):
        assert len(self.message.get("seeds", [])) == 1

        incremental = self.message.get("options", {}).get("incremental", False)
        query = self.message["seeds"][0]["token"]

        since_id = self.state_store.get_state(__name__, u"{}.since_id".format(query)) if incremental else None

        self._harvest_tweets(self.twarc.search(query, since_id=since_id))

    def filter(self):
        assert len(self.message.get("seeds", [])) == 1

        track = self.message["seeds"][0]["token"].get("track")
        follow = self.message["seeds"][0]["token"].get("follow")
        locations = self.message["seeds"][0]["token"].get("locations")

        self._harvest_tweets(self.twarc.filter(track=track, follow=follow, locations=locations))

    def sample(self):
        self._harvest_tweets(self.twarc.sample())

    def user_timeline(self):
        incremental = self.message.get("options", {}).get("incremental", False)

        for seed in self.message.get("seeds", []):
            seed_id = seed["id"]
            screen_name = seed.get("token")
            user_id = seed.get("uid")
            log.debug("Processing seed (%s) with screen name %s and user id %s", seed_id, screen_name, user_id)
            assert screen_name or user_id

            # If there is not a user_id, look it up.
            if screen_name and not user_id:
                user_id = self._lookup_user_id(screen_name)
                if user_id:
                    # Report back if nsid found
                    self.result.uids[seed_id] = user_id
                else:
                    msg = "User id not found for user {}".format(screen_name)
                    log.exception(msg)
                    self.result.warnings.append(Msg(CODE_TOKEN_NOT_FOUND, msg))
            # Otherwise, get the current screen_name
            else:
                new_screen_name = self._lookup_screen_name(user_id)
                if new_screen_name != screen_name:
                    self.result.token_updates[seed_id] = new_screen_name
                    screen_name = new_screen_name

            if user_id:
                try:
                    # Get since_id from state_store
                    since_id = self.state_store.get_state(__name__,
                                                          "timeline.{}.since_id".format(
                                                              user_id)) if incremental else None

                    self._harvest_tweets(self.twarc.timeline(user_id=user_id, since_id=since_id))

                except HTTPError as e:
                    if e.response.status_code == 401:
                        msg = "Unauthorized for user {} (User ID: {}) because account is suspended or private".format(screen_name, user_id)
                        log.exception(msg)
                        self.result.warnings.append(Msg(CODE_TOKEN_UNAUTHORIZED, msg))
                    else:
                        raise e

    def _lookup_screen_name(self, user_id):
        """
        Lookup a screen name given a user id.
        """
        users = list(self.twarc.user_lookup(user_ids=(user_id,)))
        assert len(users) in (0, 1)
        if users:
            return users[0]["screen_name"]
        return None

    def _lookup_user_id(self, screen_name):
        """
        Lookup a user id given a screen name.
        """
        users = list(self.twarc.user_lookup(screen_names=(screen_name,)))
        assert len(users) in (0, 1)
        if users:
            return users[0]["id_str"]
        return None

    def _harvest_tweets(self, tweets):
        # max_tweet_id = None
        for count, tweet in enumerate(tweets):
            if not count % 100:
                log.debug("Harvested %s tweets", count)
            self.result.harvest_counter["tweets"] += 1
            if self.stop_harvest_seeds_event.is_set():
                log.debug("Stopping since stop event set.")
                break

    def _process_entities(self, entities):
        if self.extract_web_resources:
            for url in entities.get("urls", []):
                # Exclude links for tweets
                if url["expanded_url"] and not status_re.match(url["expanded_url"]):
                    self.result.urls.append(url["expanded_url"])
        if self.extract_media:
            for media in entities.get("media", []):
                if media["media_url"]:
                    self.result.urls.append(media["media_url"])

    def process_warc(self, warc_filepath):
        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.process_search_warc(warc_filepath)
        elif harvest_type == "twitter_filter":
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_sample":
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_user_timeline":
            self.process_user_timeline_warc(warc_filepath)
        else:
            raise KeyError

    def process_search_warc(self, warc_filepath):
        incremental = self.message.get("options", {}).get("incremental", False)
        query = self.message["seeds"][0]["token"]

        since_id = self.state_store.get_state(__name__, u"{}.since_id".format(query)) if incremental else None

        max_tweet_id = self._process_tweets(TwitterRestWarcIter(warc_filepath))

        # Update state store
        if incremental and max_tweet_id > since_id:
            self.state_store.set_state(__name__, u"{}.since_id".format(query), max_tweet_id)

    def process_user_timeline_warc(self, warc_filepath):
        incremental = self.message.get("options", {}).get("incremental", False)

        for count, status in enumerate(TwitterRestWarcIter(warc_filepath)):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet:
                user_id = tweet["user"]["id_str"]
                if incremental:
                    # Update state
                    key = "timeline.{}.since_id".format(user_id)
                    self.state_store.set_state(__name__, key,
                                               max(self.state_store.get_state(__name__, key), tweet.get("id")))
                self._process_tweet(tweet)

    def _process_tweets(self, warc_iter):
        max_tweet_id = None
        for count, status in enumerate(warc_iter):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet:
                max_tweet_id = max(max_tweet_id, tweet.get("id"))
                self._process_tweet(tweet)
        return max_tweet_id

    def _process_tweet(self, tweet):
        self.result.increment_stats("tweets")
        # For more info, see https://dev.twitter.com/overview/api/entities-in-twitter-objects
        statuses = [tweet]
        if "retweeted_status" in tweet:
            statuses.append(tweet["retweeted_status"])
        elif "quoted_status" in tweet:
            statuses.append(tweet["quoted_status"])
        for status in statuses:
            self._process_entities(status.get("entities", {}))
            self._process_entities(status.get("extended_entities", {}))
        if self.extract_user_profile_images:
            self.result.urls.append(tweet["user"]["profile_image_url"])
            self.result.urls.append(tweet["user"]["profile_background_image_url"])
            if "profile_banner_url" in tweet["user"]:
                self.result.urls.append(tweet["user"]["profile_banner_url"])
class TwitterHarvester(BaseHarvester):
    def __init__(self, process_interval_secs=1200, mq_config=None, debug=False):
        BaseHarvester.__init__(self, mq_config=mq_config, process_interval_secs=process_interval_secs, debug=debug)
        self.twarc = None

    def harvest_seeds(self):
        # Create a twarc
        self._create_twarc()

        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.search()
        elif harvest_type == "twitter_filter":
            self.filter()
        else:
            raise KeyError

    def _create_twarc(self):
        self.twarc = Twarc(self.message["credentials"]["consumer_key"],
                           self.message["credentials"]["consumer_secret"],
                           self.message["credentials"]["access_token"],
                           self.message["credentials"]["access_token_secret"])

    def search(self):
        incremental = self.message.get("options", {}).get("incremental", False)

        for seed in self.message.get("seeds", []):
            query = seed.get("token")
            # Get since_id from state_store
            since_id = self.state_store.get_state(__name__, "{}.since_id".format(query)) if incremental else None

            max_tweet_id = self._process_tweets(self.twarc.search(query, since_id=since_id))
            log.debug("Searching on %s since %s returned %s tweets.", query,
                      since_id, self.harvest_result.summary.get("tweet"))

            # Update state store
            if incremental and max_tweet_id:
                self.state_store.set_state(__name__, "{}.since_id".format(query), max_tweet_id)

    def filter(self):
        assert len(self.message.get("seeds", [])) == 1

        track = self.message["seeds"][0]["token"]

        self._process_tweets(self.twarc.stream(track))

    def _process_tweets(self, tweets):
        max_tweet_id = None
        for count, tweet in enumerate(tweets):
            if not count % 100:
                log.debug("Processed %s tweets", count)
            if self.stop_event.is_set():
                log.debug("Stopping since stop event set.")
                break
            if "text" in tweet:
                with self.harvest_result_lock:
                    max_tweet_id = max(max_tweet_id, tweet.get("id"))
                    self.harvest_result.increment_summary("tweet")
                    if "urls" in tweet["entities"]:
                        for url in tweet["entities"]["urls"]:
                            self.harvest_result.urls.append(url["expanded_url"])
                    if "media" in tweet["entities"]:
                        for media in tweet["entities"]["media"]:
                            self.harvest_result.urls.append(media["media_url"])
        return max_tweet_id
Example #34
from os import path

import pandas as pd
from twarc import Twarc
from util.util import DataCollector
from util.util import create_dir, Config

keys = pd.read_csv('resources/tweet_keys_file.txt').iloc[0]
t = Twarc(keys['app_key'], keys['app_secret'], keys['oauth_token'],
          keys['oauth_token_secret'])

features = [
    'tweet_id', 'retweeted_id', 'created_at', 'favorite_count',
    'retweet_count', 'user_id', 'location', 'verified', 'followers_count',
    'source', 'text', 'fake'
]


def collect_tweets(news_list, news_source, label, config: Config):
    create_dir(config.dump_location)
    create_dir("{}/{}".format(config.dump_location, news_source))
    create_dir("{}/{}/tweets".format(config.dump_location, news_source))

    for news in news_list:
        print('Downloading ' + news_source + ' ' + label + ' ' + news.news_id +
              ' tweets')
        create_dir("{}/{}/{}/{}".format(config.dump_location, news_source,
                                        label, news.news_id))
        data = pd.DataFrame(columns=features)
        news_dir = "{}/{}/tweets/{}.csv".format(config.dump_location,
                                                news_source, news.news_id)
Example #35
 def _create_twarc(self):
     self.twarc = Twarc(self.message["credentials"]["consumer_key"],
                        self.message["credentials"]["consumer_secret"],
                        self.message["credentials"]["access_token"],
                        self.message["credentials"]["access_token_secret"])
Example #36
#!/usr/bin/env python3

#
# This script will walk through all the tweet id files and
# hydrate them with twarc. The line oriented JSON files will
# be placed right next to each tweet id file.
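#
# A hedged sketch of that hydrate step using twarc alone (the Spark-based
# main() below is truncated); the "data-ids" directory and the .txt extension
# are assumptions, not taken from the original:
#
#   import json
#   for ids_file in Path("data-ids").rglob("*.txt"):
#       with open(ids_file.with_suffix(".jsonl"), "w") as out:
#           for tweet in twarc.hydrate(open(ids_file)):
#               out.write(json.dumps(tweet) + "\n")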

from pathlib import Path
from twarc import Twarc
from pyspark import SparkConf, SparkContext
import sys
from os import listdir

twarc = Twarc(consumer_key="ledLMTpVRfM",
              consumer_secret="Mza5q9YYc2KIK8rI0B0kss3",
              access_token="121996430w61PFl46Q7jRrgbrqkGLxy",
              access_token_secret="8ymranCmZ2UedmN")


def extractInfo(tweet):
    hydrated_info = {}
    hydrated_info['id'] = tweet['id_str']
    hydrated_info['favorite_count'] = tweet['favorite_count']
    hydrated_info['retweet_count'] = tweet['retweet_count']
    hydrated_info['geo'] = tweet['geo']
    hydrated_info['create_at'] = tweet['created_at']
    return hydrated_info


def main(input_dir, outpath):
    conf = SparkConf().setMaster("local").setAppName("Test")
Example #37
if errs:
    raise RuntimeError(
        f"Required environment variables are undefined: {errs}. See README for details."
    )

# Check that tweet_id was provided
if len(sys.argv) != 2:
    raise RuntimeError(
        "Program should be called like: `python main.py <tweet_id>`")

tweet_id = sys.argv[1]

# Main
t = Twarc(
    consumer_key=consumer_key,
    consumer_secret=consumer_secret,
    access_token=access_token,
    access_token_secret=access_token_secret,
)

tweet = t.tweet(tweet_id)
if not tweet:
    raise RuntimeError(f"tweet with id {tweet_id} does not exist")
# replies is a generator object
replies = t.replies(tweet, True)

# List to hold dict of relevant photo data from each of the replies
photo_data = []
for reply in replies:
    # Photos will be in a list stored at reply['extended_entities']['media']
    print("Processing next reply")
Example #38
# from the command line to tell it your Twitter API keys.
#

import gzip
import json

from tqdm import tqdm
from twarc import Twarc
from pathlib import Path
import datetime
print(datetime.datetime.now())

with open('config/cred.json') as json_file:
    cred = json.load(json_file)

twarc = Twarc(cred['CONSUMER_KEY'], cred['CONSUMER_SECRET'],
              cred['ACCESS_TOKEN'], cred['ACCESS_TOKEN_SECRET'])
#data_dirs = ['2020-01', '2020-02', '2020-03', '2020-04', '2020-05']
base_path = "data-ids/"
data_dirs = ['2020-05']

import threading
import queue

#Number of threads
n_thread = 5
all_ids = []
#Create queue
queue = queue.Queue()


class ThreadClass(threading.Thread):
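    # The original class body is truncated here; a hedged sketch of what such a
    # hydration worker might do (names refer to the queue/twarc objects above):
    #
    #   def run(self):
    #       while True:
    #           try:
    #               ids_file = queue.get(block=False)
    #           except Exception:        # queue is empty
    #               break
    #           for tweet in twarc.hydrate(open(ids_file)):
    #               all_ids.append(tweet["id_str"])
    #           queue.task_done()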
Example #39
					testneg = 0.0
					withoutclass = line.split()
					for eachword in withoutclass:
						if eachword in dictpos:
							testpos += numpy.log10(dictpos[eachword]['probability'])
						else:
							testpos += numpy.log10(a / (counterpos + (a * (counterpos + counterneg))))
						if eachword in dictneg:
							testneg += numpy.log10(dictneg[eachword]['probability'])
						else:
							testneg += numpy.log10(a / (counterneg + (a * (counterpos + counterneg))))
					#if it's good, write it into the result txt file
					if (testpos > testneg):
						with open("testingcovid-19result.txt", "a+") as f:
							f.seek(0)
							data = f.read(100)
							if len(data) > 0 :
								f.write("\n")
							f.write(line)
							positive = True

			except UnicodeEncodeError:
				print("UnicodeEncodeError in testing data")


if __name__ == '__main__':
	tw = Twarc()
	dictpos, dictneg, counterpos, counterneg, a = training()
	gettestingdata()
	testing(dictpos, dictneg, counterpos, counterneg, a)
Example #40
# twitore server flask app

# localConfig
import localConfig

# python
import flask
from flask import Flask, render_template, g

# twarc
from twarc import Twarc
# global twarc instance
twarc_instance = Twarc(localConfig.client_key, localConfig.client_secret, localConfig.access_token, localConfig.access_token_secret)

# crontab
from crontab import CronTab
mycron = CronTab(user=True)

# create app
app = flask.Flask(__name__)

# set session key
app.secret_key = 'twitore_is_the_bomb'

# Flask/MongoEngine (the flask.ext.* namespace was removed in Flask 1.0)
from flask_mongoengine import MongoEngine
app.config['MONGODB_SETTINGS'] = {
    'db': 'twitore_dev'
}
db = MongoEngine(app)
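
# Not part of the original app (which is cut off here). A hedged sketch of how the global
# twarc_instance might be used from a Flask view; the /search/<query> route and the
# 10-tweet cap are illustrative assumptions, not part of twitore.
@app.route('/search/<query>')
def search_preview(query):
    # Pull a small sample of matching tweets and return their text as JSON.
    texts = []
    for i, tweet in enumerate(twarc_instance.search(query)):
        if i >= 10:
            break
        texts.append(tweet.get('full_text') or tweet.get('text', ''))
    return flask.jsonify(texts)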
Ejemplo n.º 41
0
def crawl_feed(feed_dict, credentials):
    twarc = Twarc(credentials['consumer_key'], credentials['consumer_secret'],
                  credentials['access_token'],
                  credentials['access_token_secret'])
    crawl_time = datetime.datetime.now()
    crawl_time_filename = crawl_time.strftime('%Y%m%d%I%M%S')
    crawl_time_html = crawl_time.strftime('%B %d, %Y')
    crawl_name = feed_dict['crawl_name']
    crawl_type = feed_dict['crawl_type']
    short_name = feed_dict['short_name']
    search_string = feed_dict['search_string']

    feed_dir = feed_dict['feed_dir']
    json_dir = join(feed_dir, 'json')
    html_dir = join(feed_dir, 'html')
    media_dir = join(feed_dir, 'media')
    logs_dir = join(feed_dir, 'logs')

    for directory in [feed_dir, json_dir, html_dir, media_dir, logs_dir]:
        if not os.path.exists(directory):
            os.makedirs(directory)

    log_file = join(logs_dir, 'twarc.log')

    formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    logger = logging.getLogger(crawl_name)
    handler = logging.FileHandler(log_file)
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)

    base_filename = short_name + '-' + crawl_time_filename
    json_file = join(json_dir, base_filename + '.json')

    print("Searching Twitter API for {0}".format(search_string))
    print("Writing JSON and HTML files...")

    logger.info("starting search for %s", search_string)
    tweet_count = 0

    for tweet in twarc.search(search_string):
        with open(json_file, 'a') as json_out:
            json_out.write("{}\n".format(json.dumps(tweet)))

        if "id_str" in tweet:
            logger.info("archived https://twitter.com/%s/status/%s",
                        tweet['user']['screen_name'], tweet["id_str"])
        elif 'limit' in tweet:
            logger.warn("%s tweets undelivered", tweet["limit"]["track"])
        elif 'warning' in tweet:
            logger.warn(tweet['warning']['message'])
        else:
            logger.warn(json.dumps(tweet))

        tweet_count += 1

    if tweet_count == 0:
        logger.info("no new tweets matching %s", search_string)

        # Write an empty json file. Maybe don't do this?
        with open(json_file, 'w') as json_out:
            json_out.close()

    return base_filename, tweet_count, crawl_time_html
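
# Not part of the original module. A hedged usage sketch showing the shape of the two
# dicts crawl_feed() expects; every value below is a placeholder.
if __name__ == '__main__':
    feed = {
        'crawl_name': 'example-crawl',
        'crawl_type': 'search',
        'short_name': 'example',
        'search_string': '#example',
        'feed_dir': '/tmp/feeds/example',
    }
    creds = {
        'consumer_key': 'KEY',
        'consumer_secret': 'SECRET',
        'access_token': 'TOKEN',
        'access_token_secret': 'TOKEN_SECRET',
    }
    base, count, when = crawl_feed(feed, creds)
    print("{} tweets written to {}.json ({})".format(count, base, when))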
Ejemplo n.º 42
0
import os
import csv
import json

from twarc import Twarc

__location__ = os.path.dirname(os.path.realpath(__file__))

users = os.path.join(__location__, "apostrophe", "tweets.csv")

userList = []
with open(users, 'r', encoding='utf-8') as f:
	reader = csv.reader(f)
	rowCount = 0
	for row in reader:
		rowCount += 1
		if rowCount > 1:
			if not row[3] in userList:
				userList.append(row[3])

tweets = []
tweetContent = ""
for user in userList:				
	t = Twarc()
	for tweet in t.search("from:" + user):
		print (tweet["full_text"])
		tweetContent += "%s\n" % str(tweet["full_text"])
		tweets.append(tweet)
		
outputFile = os.path.join(__location__, "possibleBotTweets.jsonl")
with open(outputFile, "w", encoding='utf-8') as output:
	for line in tweets:
		output.write("%s\n" % str(json.dumps(line)))
		
contentOutput = os.path.join(__location__, "possibleBotTweetContent.txt")
with open(contentOutput, "w", encoding='utf-8') as output2:
	output2.write(tweetContent)
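
# Not part of the original script. A small hedged follow-up: read the .jsonl written
# above back in, one tweet per line, and count tweets per screen name.
from collections import Counter

perUser = Counter()
with open(outputFile, encoding='utf-8') as fh:
    for line in fh:
        if line.strip():
            perUser[json.loads(line)["user"]["screen_name"]] += 1
print(perUser.most_common(10))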
Ejemplo n.º 43
0
from twarc import Twarc
import os
import csv

t = Twarc()

def load_seed_list(filepath):
    """
    For reading user ids from a seed list downloaded from SFM into a set.
    """
    user_ids = set()
    # Encoding handles the BOM
    with open(filepath, encoding='utf-8-sig') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            # collect the user id from each row
            user_ids.add(row['Uid'])
    return user_ids


def get_followings(user_ids):
    existing_followed_user_ids = set()
    new_followed_user_ids = set()
    if os.path.exists('followed.csv'):
        with open('followed.csv') as followed_file:
            for line in followed_file:
                existing_followed_user_ids.add(user_id_from_line(line))
        print('Loaded {} existing followed users'.format(len(existing_followed_user_ids)))
    with open('follower_to_followed.csv', 'w') as follower_to_followed_file:
        for count, user_id in enumerate(user_ids):
            print('Getting following for {} ({})'.format(user_id, count + 1))
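            # Sketch continuation (not in the original, which is truncated here): fetch
            # who each user follows, assuming twarc's friend_ids() helper yields the ids
            # of followed accounts, and record the follower,followed edge; the CSV layout
            # and the final return value are assumptions.
            for followed_id in t.friend_ids(user_id):
                follower_to_followed_file.write('{},{}\n'.format(user_id, followed_id))
                if str(followed_id) not in existing_followed_user_ids:
                    new_followed_user_ids.add(followed_id)
    return new_followed_user_ids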
Ejemplo n.º 44
0
from twarc import Twarc

client_key = 'client_key'
client_secret = 'client_secret'
access_token = '197456523-m2qIYWxkQTFKj0ModTQPcdByTnjryHwLRm9L8o5y'
access_token_secret = 'access_token_secret'

t = Twarc(client_key, client_secret, access_token, access_token_secret)
for tweet in t.search("resigncameron"):
    print(tweet["text"])
Ejemplo n.º 45
0
from datetime import date, timedelta

import pandas as pd
from twarc import Twarc

import config

# This script was scheduled to run daily, so the filename to be processed is yesterday's date
filename = (date.today() - timedelta(days=1)).strftime("%m-%d-%Y")

# Main directory which contains the ids folder and the full data folder
main_dir = '/home/vca_rishik/rishik/COVID-19-tweets/'
ids_dir = main_dir + 'data/'
# Make sure you create this folder in the main directory before running this script
target_dir = main_dir + 'data_full/'

# Twitter API Credentials
ACCESS_TOKEN = config.ACCESS_TOKEN
ACCESS_SECRET = config.ACCESS_SECRET
CONSUMER_KEY = config.CONSUMER_KEY
CONSUMER_SECRET = config.CONSUMER_SECRET

t = Twarc(CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_SECRET)

tweet_ids = pd.read_csv(ids_dir + filename + ".csv", lineterminator='\n')
tweet_objects = []

for tweet in t.hydrate(tweet_ids.id.drop_duplicates()):
    tweet_objects.append(tweet)

df_full = pd.DataFrame(
    tweet_objects,
    columns=[
        'created_at', 'id', 'id_str', 'full_text', 'truncated',
        'display_text_range', 'entities', 'source', 'in_reply_to_status_id',
        'in_reply_to_status_id_str', 'in_reply_to_user_id',
        'in_reply_to_user_id_str', 'in_reply_to_screen_name', 'user', 'geo',
        'coordinates', 'place', 'contributors', 'is_quote_status',
Ejemplo n.º 46
0
# This script will walk through all the tweet id files and
# hydrate them with twarc. The line oriented JSON files will
# be placed right next to each tweet id file.
#
# Note: you will need to install twarc, tqdm, and run twarc configure
# from the command line to tell it your Twitter API keys.
#

import gzip
import json

from tqdm import tqdm
from twarc import Twarc
from pathlib import Path

twarc = Twarc()
data_dirs = ['2020-01', '2020-02', '2020-03', '2020-04', '2020-05', '2020-06', '2020-07', '2020-08', '2020-09']


def main():
    for data_dir in data_dirs:
        for path in Path(data_dir).iterdir():
            if path.name.endswith('.txt'):
                hydrate(path)


def _reader_generator(reader):
    b = reader(1024 * 1024)
    while b:
        yield b
        b = reader(1024 * 1024)
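

# The hydrate() helper that main() calls above is not shown in this example. A hedged
# sketch of what it might look like, reusing _reader_generator to count ids for the tqdm
# progress bar and writing a gzipped line-oriented JSON file next to each id file; the
# exact layout of the original helper is an assumption.
def raw_newline_count(path):
    # Fast newline count over the raw byte stream.
    with open(path, 'rb') as f:
        return sum(buf.count(b'\n') for buf in _reader_generator(f.raw.read))


def hydrate(id_file):
    gzip_path = id_file.with_suffix('.jsonl.gz')
    if gzip_path.is_file():
        print('skipping {}, already hydrated'.format(id_file))
        return
    num_ids = raw_newline_count(id_file)
    print('hydrating {}'.format(id_file))
    with gzip.open(gzip_path, 'wt') as output:
        with tqdm(total=num_ids) as pbar:
            ids = (line.strip() for line in id_file.open() if line.strip())
            for tweet in twarc.hydrate(ids):
                output.write(json.dumps(tweet) + '\n')
                pbar.update(1)
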
def crawl_feed(feed_dict, credentials):
    twarc = Twarc(credentials['consumer_key'], credentials['consumer_secret'], credentials['access_token'], credentials['access_token_secret'])
    crawl_time = datetime.datetime.now()
    crawl_time_filename = crawl_time.strftime('%Y%m%d%I%M%S')
    crawl_time_html = crawl_time.strftime('%B %d, %Y')
    crawl_name = feed_dict['crawl_name']
    crawl_type = feed_dict['crawl_type']
    short_name = feed_dict['short_name']
    search_string = feed_dict['search_string']

    feed_dir = feed_dict['feed_dir']
    json_dir = join(feed_dir, 'json')
    html_dir = join(feed_dir, 'html')
    media_dir = join(feed_dir, 'media')
    logs_dir = join(feed_dir, 'logs')

    for directory in [feed_dir, json_dir, html_dir, media_dir, logs_dir]:
        if not os.path.exists(directory):
            os.makedirs(directory)

    log_file = join(logs_dir, 'twarc.log')

    formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    logger = logging.getLogger(crawl_name)
    handler = logging.FileHandler(log_file)
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)

    base_filename = short_name + '-' + crawl_time_filename
    json_file = join(json_dir, base_filename + '.json')

    print("Searching Twitter API for {0}".format(search_string))
    print("Writing JSON and HTML files...")

    logger.info("starting search for %s", search_string)
    tweet_count = 0

    for tweet in twarc.search(search_string):
        with open(json_file, 'a') as json_out:
            json_out.write("{}\n".format(json.dumps(tweet)))

        if "id_str" in tweet:
            logger.info("archived https://twitter.com/%s/status/%s", tweet['user']['screen_name'], tweet["id_str"])
        elif 'limit' in tweet:
            logger.warn("%s tweets undelivered", tweet["limit"]["track"])
        elif 'warning' in tweet:
            logger.warn(tweet['warning']['message'])
        else:
            logger.warn(json.dumps(tweet))

        tweet_count += 1

    if tweet_count == 0:
        logger.info("no new tweets matching %s", search_string)

        # Write an empty json file. Maybe don't do this?
        with open(json_file, 'w') as json_out:
            json_out.close()

    return base_filename, tweet_count, crawl_time_html