def get_account(item):
    """
    Uses the Twarc library to surface all the tweets twarc can see for a
    Twitter username. Searches all tweets for media; if any is found, it
    also tries to download that media item.
    """
    item.agent_name = agent_name + "_1_get_account"
    if not os.path.exists(item.storage_folder):
        os.makedirs(item.storage_folder)
    t = Twarc(twitter_consumer_key, twitter_consumer_secret,
              twitter_access_token, twitter_access_token_secret)
    # Derive the screen name from the profile URL.
    name = item.url.strip().replace("https://twitter.com/", "").replace("?", "")
    file_path = os.path.join(
        item.storage_folder,
        "{}_{}.json".format(time.strftime("%d-%m-%Y_%H-%M-%S"), name))
    tweets = []
    for tweet in t.timeline(screen_name=name):
        tweets.append(tweet)
    tweets = filter_tweets_by_start_date(tweets, item.date_range)
    for tweet in tweets:
        get_assets(tweet, item.storage_folder)
    with open(file_path, "w") as outfile:
        json.dump(tweets, outfile)
    item.completed = True
    return item
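# A minimal usage sketch for get_account. _ExampleItem is a hypothetical
# stand-in for the pipeline's item record; the module-level credentials,
# agent_name, filter_tweets_by_start_date and get_assets are assumed to
# exist as referenced in the function above.
class _ExampleItem:
    def __init__(self, url, storage_folder, date_range=None):
        self.url = url
        self.storage_folder = storage_folder
        self.date_range = date_range
        self.agent_name = ""
        self.completed = False

# item = get_account(_ExampleItem("https://twitter.com/example", "/tmp/example"))
# print(item.completed)  # True once the timeline JSON has been written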
def collect_timelines(input_file, output_file, credentials_file):
    # Credentials file: one value per line, in Twarc's positional order.
    with open(credentials_file) as fp:
        credentials = tuple(map(str.strip, fp.readlines()))
    twarc_obj = Twarc(*credentials)
    df = pd.read_csv(input_file, sep="\t")
    with open(output_file, "w+") as fp:
        total = 0
        found_users = 0
        pbar = tqdm.tqdm(df.values)
        for uid, tid, u_statuses in pbar:
            found = 0
            pbar.set_description("User {}".format(uid))
            try:
                for tweet_json in twarc_obj.timeline(user_id="{}".format(uid)):
                    found += 1
                    # Cap each timeline at 190 tweets.
                    if found > 190:
                        break
                    total += 1
                    print(json.dumps(tweet_json), file=fp)
                    pbar.set_postfix(found=found_users + 1, total=total)
            except requests.exceptions.HTTPError as e:
                pbar.write("Error for uid={}. {}".format(uid, e))
            else:
                found_users += 1
        pbar.close()
    print("Collected {} tweets.".format(total))
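# Usage sketch for collect_timelines. The credentials file is splatted into
# Twarc(), so it is assumed to hold four lines: consumer key, consumer
# secret, access token, access token secret. The TSV is assumed to have one
# row per user (user id, tweet id, statuses count); only the id is used.
#
# collect_timelines("users.tsv", "timelines.jsonl", "credentials.txt")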
def get_interactions(consumer_key, consumer_secret, access_token, access_token_secret):
    """
    Arguments are Twitter API credentials. To get them you can go to
    http://apps.twitter.com/. Saves pickled lists of tweet authors and the
    users they mention, and a list of the users considered.
    """
    from twarc import Twarc
    from tqdm import tqdm
    import pickle

    t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret)
    list_ids = ["1335885096063295488", "1288082572195639296", "1287444819015618561",
                "1283739792702713856", "1081734288368898048", "910757441855459328",
                "193445218", "90205656", "85315110"]
    users = set(m['screen_name'] for lid in list_ids for m in t.list_members(lid))
    users_to_exclude = ['premierleague', 'SpursOfficial', 'Arsenal', 'ManCity',
                        'sterling7', 'kylewalker2', 'HKane', 'benmendy23',
                        'dele_official', 'RobHolding95', 'm8arteta']
    # discard() will not raise if an excluded user is absent from the lists.
    for u in users_to_exclude:
        users.discard(u)

    authors = []
    mentions = []
    for user in tqdm(users):
        tweets = list(t.timeline(screen_name=user))
        # One (author, mention) pair per user mentioned in each tweet.
        m = [u['screen_name'] for tw in tweets for u in tw['entities']['user_mentions']]
        mentions.append(m)
        authors.append([user] * len(m))

    flat_a = [item for sublist in authors for item in sublist]
    flat_m = [item for sublist in mentions for item in sublist]
    pickle.dump(flat_a, open('authors.p', 'wb'))
    pickle.dump(flat_m, open('mentions.p', 'wb'))
    pickle.dump(users, open('users.p', 'wb'))
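# Follow-up sketch: the pickles produced above pair up element-wise, so the
# author -> mention edge list can be rebuilt like this (same file names as
# written by get_interactions).
import pickle

def load_interaction_edges():
    with open('authors.p', 'rb') as fa, open('mentions.p', 'rb') as fm:
        flat_a = pickle.load(fa)
        flat_m = pickle.load(fm)
    # Each (author, mentioned_user) pair is one directed interaction edge.
    return list(zip(flat_a, flat_m))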
def read_timelines(after_date: datetime, handles: List[str]):
    consumer_key = os.environ.get('CONSUMER_KEY')
    consumer_secret = os.environ.get('CONSUMER_SECRET')
    access_token_key = os.environ.get('ACCESS_TOKEN')
    access_token_secret = os.environ.get('ACCESS_TOKEN_SECRET')
    # bearer_token = os.environ.get('BEARER_TOKEN')
    twarc = Twarc(consumer_key, consumer_secret, access_token_key, access_token_secret)
    for handle in handles:
        print(f'Scanning twitter handle @{handle}')
        for tweet in twarc.timeline(screen_name=handle):
            created_at = parse_twitter_datetime(tweet['created_at'])
            print(f'Found tweet created at {created_at}')
            yield tweet
            # Timelines arrive newest-first, so stop once we pass after_date.
            if created_at <= after_date:
                break
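# Usage sketch: read_timelines is a generator, so tweets can be consumed
# lazily. CONSUMER_KEY etc. are assumed to be set in the environment, and
# parse_twitter_datetime is assumed to return a datetime comparable to
# after_date.
#
# from datetime import datetime, timedelta
# cutoff = datetime.utcnow() - timedelta(days=7)
# for tweet in read_timelines(cutoff, ['example_handle']):
#     print(tweet['id_str'])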
def get_tweets(tcconfig, up_to_pages=1, source_id="dgSHiftCodes"):
    logger.debug("get_tweets args: up_to_pages={}, source_id={}".format(up_to_pages, source_id))
    if tcconfig is not None:
        logger.info("Setting Twitter client credential config")
        ct = tcconfig["consumer_key"]
        cs = tcconfig["consumer_secret"]
        at = tcconfig["access_token"]
        ats = tcconfig["access_token_secret"]
        # Do not log the credential values themselves.
    else:
        logger.error("No Twitter client config argument provided")
        raise ValueError("tcconfig cannot be None")
    twsclient = Twarc(ct, cs, at, ats)
    return twsclient.timeline(screen_name=source_id, max_pages=up_to_pages)
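# Usage sketch with a hypothetical credential dict; the keys mirror exactly
# what get_tweets reads from tcconfig.
sample_tcconfig = {
    "consumer_key": "...",
    "consumer_secret": "...",
    "access_token": "...",
    "access_token_secret": "...",
}
# get_tweets returns the twarc timeline generator, so iterate to fetch:
# for tweet in get_tweets(sample_tcconfig, up_to_pages=2):
#     print(tweet["id_str"])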
def UserTimeLine_Extract(variables_dict, target):
    """Collect a user's timeline with Twarc and return the tweets as JSON strings."""
    # This creates an instance of Twarc.
    credentials = variables_dict['credentials']
    t = Twarc(consumer_key=credentials['consumer_key'],
              consumer_secret=credentials['consumer_secret'],
              access_token=credentials['access_token'],
              access_token_secret=credentials['access_token_secret'])

    tweet_list = []
    # Go through the user timeline.
    # for tweet in t.timeline(user_id='1339835893'):
    for tweet in t.timeline(screen_name=target):
        tweet_json = json.dumps(tweet)
        tweet_list.append(tweet_json)
        # Tweet info.
        print("{} is created at {} with the following text:".format(tweet['id_str'], tweet['created_at']))
        print("{}".format(tweet['text']))
        print("by {}.\n".format(tweet['user']['screen_name']))
    return tweet_list
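# Usage sketch: variables_dict is assumed to carry the credentials under a
# 'credentials' key, matching what UserTimeLine_Extract reads above.
#
# config = {'credentials': {'consumer_key': '...',
#                           'consumer_secret': '...',
#                           'access_token': '...',
#                           'access_token_secret': '...'}}
# tweets = UserTimeLine_Extract(config, 'example_handle')  # list of JSON strings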
consumer_secret = "2x8Q0WyWNV86XEWRAuYhJB0kUu4M9BosgemxMjnPbiu00t5HE7" access_key = "602847795-0GJCA5vujrexWTCfK6ZtxZD2MZ8pCuA1zBKO5fNa" access_secret = "6cN8sgBp7DiDITJbg0uCSlWoeY84YoLJs5HOxzxmqtjEj" prev_date = timedelta(days=10) today = datetime.now().date() time_range = today - prev_date t = Twarc(consumer_key, consumer_secret, access_key, access_secret) for name in usa_list: print(name) file_name = r"C:\\Users\\ravik\\OneDrive\\Desktop\\UsertimelineReplies\\" + str(name) + ".json" max_poi_tweet = 0 with open( file_name, "a", encoding='utf-8') as file: for tweet in t.timeline(screen_name=name): if 'retweeted_status' in tweet.keys(): print("Its a retweet") continue if max_poi_tweet > 3000: break json.dump(tweet, file, ensure_ascii=False) file.write("\n") max_poi_tweet +=1 max_replies = 0 if datetime.strptime(tweet['created_at'], "%a %b %d %H:%M:%S %z %Y").date() >= time_range: for reply in t.replies(tweet): #print("In") #preprocessing(tweet, file) if 'retweeted_status' in tweet.keys(): print("Its a retweet")
# Compiled once at module level; matches emoji, pictographs, and flag
# sequences so they can be stripped before translation.
EMOJI_PATTERN = re.compile(
    u"(\ud83d[\ude00-\ude4f])|"       # emoticons
    u"(\ud83c[\udf00-\uffff])|"       # symbols & pictographs (1 of 2)
    u"(\ud83d[\u0000-\uddff])|"       # symbols & pictographs (2 of 2)
    u"(\ud83d[\ude80-\udeff])|"       # transport & map symbols
    u"([\U0001F1E0-\U0001F1FF])|"
    u"([\U0001F600-\U0001F64F])|"     # emoticons 2
    u"([\U0001F300-\U0001F5FF])|"     # symbols & pictographs
    u"([\U0001F680-\U0001F6FF])|"
    u"([\u2600-\u26FF])|"
    u"(\U0001F1F2\U0001F1F4)|"        # Macau flag
    u"([\U0001F1E6-\U0001F1FF]{2})|"  # flags
    u"([\U0001F600-\U0001F64F])|"     # emoticons 3
    u"(\ud83c[\udde0-\uddff])"        # flags (iOS)
    "+", flags=re.UNICODE)


class AnalyzerProcess():

    def __init__(self, config, loggerObject, alerLoggerObject, rules, executionMode):
        self.logger = loggerObject
        self.alertLogger = alerLoggerObject
        self.rules = rules
        self.config = config
        self.executionMode = executionMode
        self.access_token = "insert Twitter API access token"
        self.access_token_secret = "insert Twitter API token secret"
        self.consumer_key = "insert Twitter API consumer key"
        self.consumer_secret = "insert Twitter API consumer secret"
        self.twarc = Twarc(self.consumer_key, self.consumer_secret,
                           self.access_token, self.access_token_secret)
        self.currdir = "/home/centos/modosint-python3" + path.dirname(__file__)
        self.wcloud = ""
        self.stop_words = get_stop_words('spanish')
        self.stop_words.extend(["http", "https", "co", "n'", "'", '"'])

    def _is_recent(self, tweet):
        """True if the tweet was created within the last 5 minutes (daemon mode).
        Compares seconds since midnight (UTC)."""
        tweetTime = parser.parse(''.join(tweet['created_at']))
        timeFormed = time.strptime(str(tweetTime.time()).split(',')[0], '%H:%M:%S')
        createdAtSeconds = datetime.timedelta(
            hours=timeFormed.tm_hour, minutes=timeFormed.tm_min,
            seconds=timeFormed.tm_sec).total_seconds()
        nowTimeUtc = datetime.datetime.utcnow().time()
        nowTimeFormed = time.strptime(str(nowTimeUtc).split('.')[0], '%H:%M:%S')
        nowTimeSeconds = datetime.timedelta(
            hours=nowTimeFormed.tm_hour, minutes=nowTimeFormed.tm_min,
            seconds=nowTimeFormed.tm_sec).total_seconds()
        return nowTimeSeconds - createdAtSeconds < 300

    @staticmethod
    def _is_today_or_yesterday(tweet):
        """True if the tweet was created today or yesterday (one-shot mode)."""
        datetweet = parser.parse(''.join(tweet['created_at']))
        today = datetime.datetime.now().date()
        return datetweet.date() in (today, today - timedelta(1))

    @staticmethod
    def _seen_before(f, tweet):
        """Check the cache file for this tweet id; record it if unseen."""
        f.seek(0)  # read temporary file (cache)
        content = [x.strip('\n').strip('u') for x in f.readlines()]
        if tweet['id_str'] in content:
            return True
        f.seek(0, 2)  # append to temporary file (cache)
        f.write(tweet['id_str'] + '\n')
        return False

    def _write_match(self, fichero, tweet, ruleId, translated=None):
        """Write one matched tweet to the graylog file and the wordcloud corpus."""
        tweetdata = {
            "CreatedTime": tweet['created_at'],
            "short_message": tweet['full_text'],
            "Author": tweet['user']['screen_name'],
            "Retweets": tweet['retweet_count'],
            "Likes": tweet['favorite_count'],
            "Location": tweet['user']['location'],
            "Rule": ruleId,
            "full_message": "Tweet matched with RULE: " + ruleId,
        }
        if translated is not None:
            tweetdata["TranslatedTweet"] = translated
        fichero.write(json.dumps(tweetdata) + '\n')
        self.wcloud.write((translated if translated is not None else tweet['full_text']) + '\n')
        os.chmod("/var/log/modosint/analyzer-twitter/wcloudRule" + ruleId + ".txt", 0o777)
        os.chmod("/var/log/modosint/analyzer-twitter/graylog.txt", 0o777)

    # Search tweets that contain the term translated into another language.
    def searchDifLanguage(self, text, language, ruleId):
        fichero = open("/var/log/modosint/analyzer-twitter/graylog.txt", "+a", encoding='utf8')
        with io.open("/var/log/modosint/analyzer-twitter/cache.txt", 'a+') as f:
            os.chmod("/var/log/modosint/analyzer-twitter/cache.txt", 0o777)
            traductor = Translator()
            translatedText = traductor.translate(text, dest=language)
            daemon = self.executionMode == "daemon"
            for tweet in self.twarc.search(translatedText.text):
                # Daemon mode keeps tweets from the last 5 minutes; one-shot
                # mode keeps tweets from today and yesterday.
                in_window = self._is_recent(tweet) if daemon else self._is_today_or_yesterday(tweet)
                if not in_window:
                    break
                if 'retweeted_status' in tweet:  # avoid RT
                    continue
                if self._seen_before(f, tweet):
                    continue
                texto = tweet['full_text']
                for c in texto:
                    if c in emoji.UNICODE_EMOJI:
                        texto = texto.replace(c, "")
                try:
                    resultesp = traductor.translate(EMOJI_PATTERN.sub(r'', texto), dest='es')
                except ValueError:
                    # Skip tweets the translator cannot handle.
                    self.logger.debug('[Emoji Error] Tweet cannot be translated: unrecognized emoji in tweet.')
                    continue
                self._write_match(fichero, tweet, ruleId, translated=resultesp.text)

    # Search tweets that contain a term or hashtag.
    def searchTweetOrHashtag(self, text, ruleId):
        fichero = open("/var/log/modosint/analyzer-twitter/graylog.txt", "+a", encoding='utf8')
        with io.open("/var/log/modosint/analyzer-twitter/cache.txt", 'a+') as f:
            os.chmod("/var/log/modosint/analyzer-twitter/cache.txt", 0o777)
            daemon = self.executionMode == "daemon"
            for tweet in self.twarc.search(text):
                in_window = self._is_recent(tweet) if daemon else self._is_today_or_yesterday(tweet)
                if not in_window:
                    break
                if 'retweeted_status' in tweet:  # avoid RT
                    continue
                if self._seen_before(f, tweet):
                    continue
                self._write_match(fichero, tweet, ruleId)

    # Search all tweets or the timeline from @user.
    def searchUserTweets(self, user, ruleId, fullstring):
        fichero = open("/var/log/modosint/analyzer-twitter/graylog.txt", "+a", encoding='utf8')
        with io.open("/var/log/modosint/analyzer-twitter/cache.txt", 'a+') as f:
            os.chmod("/var/log/modosint/analyzer-twitter/cache.txt", 0o777)
            tweets = self.twarc.timeline(screen_name=user)
            t_end = time.time() + 30  # stop scanning the timeline after 30 seconds
            for tweet in tweets:
                if time.time() >= t_end:
                    break
                for text in fullstring:
                    if text in tweet['full_text']:
                        if not self._seen_before(f, tweet):
                            self._write_match(fichero, tweet, ruleId)

    def create_wordcloud(self, text, ruleId):
        mask = np.array(Image.open(path.join(self.currdir, "twitter_mask.png")))
        # Create the wordcloud object.
        wc = WordCloud(background_color="white", max_words=200, mask=mask,
                       stopwords=self.stop_words)
        try:
            # Generate and save the wordcloud.
            wc.generate(text)
            wc.to_file(path.join(self.currdir + "/WordCloud/Twitter/",
                                 "wcTwitterRule" + ruleId + ".png"))
            os.chmod(path.join(self.currdir + "/WordCloud/Twitter/",
                               "wcTwitterRule" + ruleId + ".png"), 0o777)
        except ValueError:
            # Empty corpus for this rule; nothing to draw.
            pass

    # Custom functionality.
    def run(self):
        self.logger.info("working...")
        OSINTRules = self.rules
        for element in OSINTRules:
            ruleId = element.get('metadata', False).get('id', False)
            self.wcloud = open("/var/log/modosint/analyzer-twitter/wcloudRule" + ruleId + ".txt", "a+")
            checkUsername = element.get('_username', False)
            checkString = element.get('_string', False)
            fullstring = []  # default so username-only rules don't hit a NameError
            if checkUsername:
                user = ''.join(element['_username'])
            if checkString:
                string = ','.join(element['_string'])
                fullstring = element['_string']
                checkLanguage = element.get('_language', False)
                if checkLanguage:
                    language = ''.join(element['_language'])
                    self.searchDifLanguage(string, language, ruleId)
                else:
                    self.searchTweetOrHashtag(string, ruleId)
            if checkUsername:
                self.searchUserTweets(user, ruleId, fullstring)
        if not os.path.exists(self.currdir + "/WordCloud"):
            os.makedirs(self.currdir + "/WordCloud/")
            os.chmod(self.currdir + "/WordCloud/", 0o777)
        if not os.path.exists(self.currdir + "/WordCloud/Twitter"):
            os.makedirs(self.currdir + "/WordCloud/Twitter/")
            os.chmod(self.currdir + "/WordCloud/Twitter/", 0o777)
        for element in OSINTRules:
            ruleId = element.get('metadata', False).get('id', False)
            with open("/var/log/modosint/analyzer-twitter/wcloudRule" + ruleId + ".txt", "r") as wf:
                file_content = wf.readlines()
            self.create_wordcloud(str(file_content), ruleId)
        self.createPlotMentions()
        self.createPlotHashtag()
        self.alertLogger.info("Twitter Analyzer Job Finished successfully.")

    def exportReferenceHashtag(self, mensaje):
        lista = re.findall(r'#\w+', mensaje)
        return lista if lista != [] else np.NaN

    def exportReferenceMentions(self, mensaje):
        lista = re.findall(r'@\w+', mensaje)
        return lista if lista != [] else np.NaN

    def createPlotMentions(self):
        with io.open('/var/log/modosint/analyzer-twitter/graylog.txt', 'r') as f:
            dataMentions = f.readlines()
        # Each graylog line was written with json.dumps, so parse it with
        # json.loads rather than eval.
        data_json = json.dumps([json.loads(entry) for entry in dataMentions])
        data_twitter = pd.read_json(data_json)
        referenceMentions = data_twitter.short_message.map(self.exportReferenceMentions)
        referenceMentions.dropna(inplace=True)
        referenceMentions = list(referenceMentions)
        referenceMentions_list = list(itertools.chain(*referenceMentions))
        count_referenceMentions = pd.Series(referenceMentions_list).value_counts()
        fig = plt.figure(figsize=(12, 8))
        sns.barplot(y=count_referenceMentions.iloc[:20].index,
                    x=count_referenceMentions.iloc[:20].values)
        fig.savefig(self.currdir + 'mentionsPlot.png')
        os.chmod(self.currdir + 'mentionsPlot.png', 0o777)

    def createPlotHashtag(self):
        with io.open('/var/log/modosint/analyzer-twitter/graylog.txt', 'r') as f:
            dataHashtag = f.readlines()
        data_json = json.dumps([json.loads(entry) for entry in dataHashtag])
        data_twitter = pd.read_json(data_json)
        referenceHash = data_twitter.short_message.map(self.exportReferenceHashtag)
        referenceHash.dropna(inplace=True)
        referenceHash = list(referenceHash)
        referenceHash_list = list(itertools.chain(*referenceHash))
        count_referenceHash = pd.Series(referenceHash_list).value_counts()
        fig = plt.figure(figsize=(12, 8))
        sns.barplot(y=count_referenceHash.iloc[:20].index,
                    x=count_referenceHash.iloc[:20].values)
        fig.savefig(self.currdir + 'mentionsHashtag.png')
        os.chmod(self.currdir + 'mentionsHashtag.png', 0o777)
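# A minimal driver sketch for AnalyzerProcess. The rule schema shown here is
# inferred from what run() reads ('metadata', '_string', '_username',
# '_language') and is otherwise hypothetical; any executionMode other than
# "daemon" selects the one-shot (today/yesterday) search window.
#
# import logging
# logger = logging.getLogger("modosint")
# rules = [{"metadata": {"id": "1"}, "_string": ["example term"]}]
# AnalyzerProcess(config={}, loggerObject=logger, alerLoggerObject=logger,
#                 rules=rules, executionMode="oneshot").run()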
class TwitterHarvester(BaseHarvester):
    def __init__(self, working_path, stream_restart_interval_secs=30 * 60,
                 mq_config=None, debug=False, connection_errors=5,
                 http_errors=5, debug_warcprox=False, tries=3):
        BaseHarvester.__init__(self, working_path, mq_config=mq_config,
                               stream_restart_interval_secs=stream_restart_interval_secs,
                               debug=debug, debug_warcprox=debug_warcprox,
                               tries=tries)
        self.twarc = None
        self.connection_errors = connection_errors
        self.http_errors = http_errors
        self.extract_media = False
        self.extract_web_resources = False
        self.extract_user_profile_images = False

    def harvest_seeds(self):
        # Create a twarc
        self._create_twarc()

        # Get harvest extract options.
        self.extract_media = self.message.get("options", {}).get("media", False)
        self.extract_web_resources = self.message.get("options", {}).get("web_resources", False)
        self.extract_user_profile_images = self.message.get("options", {}).get("user_images", False)

        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.search()
        elif harvest_type == "twitter_filter":
            self.filter()
        elif harvest_type == "twitter_sample":
            self.sample()
        elif harvest_type == "twitter_user_timeline":
            self.user_timeline()
        else:
            raise KeyError

    def _create_twarc(self):
        self.twarc = Twarc(self.message["credentials"]["consumer_key"],
                           self.message["credentials"]["consumer_secret"],
                           self.message["credentials"]["access_token"],
                           self.message["credentials"]["access_token_secret"],
                           http_errors=self.http_errors,
                           connection_errors=self.connection_errors)

    def search(self):
        assert len(self.message.get("seeds", [])) == 1
        incremental = self.message.get("options", {}).get("incremental", False)
        query = self.message["seeds"][0]["token"]
        since_id = self.state_store.get_state(__name__, u"{}.since_id".format(query)) if incremental else None
        self._harvest_tweets(self.twarc.search(query, since_id=since_id))

    def filter(self):
        assert len(self.message.get("seeds", [])) == 1
        track = self.message["seeds"][0]["token"].get("track")
        follow = self.message["seeds"][0]["token"].get("follow")
        locations = self.message["seeds"][0]["token"].get("locations")
        self._harvest_tweets(self.twarc.filter(track=track, follow=follow,
                                               locations=locations,
                                               event=self.stop_harvest_seeds_event))

    def sample(self):
        self._harvest_tweets(self.twarc.sample(self.stop_harvest_seeds_event))

    def user_timeline(self):
        incremental = self.message.get("options", {}).get("incremental", False)
        for seed in self.message.get("seeds", []):
            seed_id = seed["id"]
            screen_name = seed.get("token")
            user_id = seed.get("uid")
            log.debug("Processing seed (%s) with screen name %s and user id %s",
                      seed_id, screen_name, user_id)
            assert screen_name or user_id
            # If there is not a user_id, look it up.
            if screen_name and not user_id:
                user_id = self._lookup_user_id(screen_name)
                if user_id:
                    # Report back if nsid found
                    self.result.uids[seed_id] = user_id
                else:
                    msg = "User id not found for user {} because account is not found or suspended".format(screen_name)
                    log.exception(msg)
                    self.result.warnings.append(Msg(CODE_TOKEN_NOT_FOUND, msg, seed_id=seed_id))
            # Otherwise, get the current screen_name
            else:
                new_screen_name = self._lookup_screen_name(user_id)
                # If the screen name can't be found, skip fetching the timeline.
                if not new_screen_name:
                    msg = "Screen name not found for user id {} because account is not found or suspended".format(user_id)
                    log.exception(msg)
                    self.result.warnings.append(Msg(CODE_TOKEN_NOT_FOUND, msg, seed_id=seed_id))
                    # Reset the user_id so the timeline fetch is skipped.
                    user_id = None
                if new_screen_name and new_screen_name != screen_name:
                    self.result.token_updates[seed_id] = new_screen_name
                    screen_name = new_screen_name
            if user_id:
                try:
                    # Get since_id from state_store
                    since_id = self.state_store.get_state(
                        __name__, "timeline.{}.since_id".format(user_id)) if incremental else None
                    self._harvest_tweets(self.twarc.timeline(user_id=user_id, since_id=since_id))
                except HTTPError as e:
                    if e.response.status_code == 401:
                        account = "user {} (User ID: {})".format(screen_name, user_id) \
                            if screen_name else "user ID: {}".format(user_id)
                        msg = "Unauthorized for {} because account is suspended or protected".format(account)
                        log.exception(msg)
                        self.result.warnings.append(Msg(CODE_TOKEN_UNAUTHORIZED, msg, seed_id=seed_id))
                    else:
                        raise e

    def _lookup_screen_name(self, user_id):
        """Lookup a screen name given a user id."""
        try:
            users = list(self.twarc.user_lookup(user_ids=(user_id,)))
            assert len(users) in (0, 1)
            if users:
                return users[0]["screen_name"]
        except HTTPError as e:
            if e.response.status_code != 404:
                raise e
        return None

    def _lookup_user_id(self, screen_name):
        """Lookup a user id given a screen name."""
        try:
            users = list(self.twarc.user_lookup(screen_names=(screen_name,)))
            assert len(users) in (0, 1)
            if users:
                return users[0]["id_str"]
        except HTTPError as e:
            if e.response.status_code != 404:
                raise e
        return None

    def _harvest_tweets(self, tweets):
        # max_tweet_id = None
        for count, tweet in enumerate(tweets):
            if not count % 100:
                log.debug("Harvested %s tweets", count)
            self.result.harvest_counter["tweets"] += 1
            if self.stop_harvest_seeds_event.is_set():
                log.debug("Stopping since stop event set.")
                break

    def _process_entities(self, entities):
        if self.extract_web_resources:
            for url in entities.get("urls", []):
                # Exclude links for tweets
                if url["expanded_url"] and not status_re.match(url["expanded_url"]):
                    self.result.urls.append(url["expanded_url"])
        if self.extract_media:
            for media in entities.get("media", []):
                if media["media_url"]:
                    self.result.urls.append(media["media_url"])

    def process_warc(self, warc_filepath):
        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.process_search_warc(warc_filepath)
        elif harvest_type == "twitter_filter":
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_sample":
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_user_timeline":
            self.process_user_timeline_warc(warc_filepath)
        else:
            raise KeyError

    def process_search_warc(self, warc_filepath):
        incremental = self.message.get("options", {}).get("incremental", False)
        query = self.message["seeds"][0]["token"]
        since_id = self.state_store.get_state(__name__, u"{}.since_id".format(query)) if incremental else None
        max_tweet_id = self._process_tweets(TwitterRestWarcIter(warc_filepath))
        # Update state store; treat a missing id as 0 to avoid comparing None.
        if incremental and (max_tweet_id or 0) > (since_id or 0):
            self.state_store.set_state(__name__, u"{}.since_id".format(query), max_tweet_id)

    def process_user_timeline_warc(self, warc_filepath):
        incremental = self.message.get("options", {}).get("incremental", False)
        for count, status in enumerate(TwitterRestWarcIter(warc_filepath)):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet:
                user_id = tweet["user"]["id_str"]
                if incremental:
                    # Update state
                    key = "timeline.{}.since_id".format(user_id)
                    self.state_store.set_state(
                        __name__, key,
                        max(self.state_store.get_state(__name__, key) or 0, tweet.get("id")))
                self._process_tweet(tweet)

    def _process_tweets(self, warc_iter):
        max_tweet_id = None
        for count, status in enumerate(warc_iter):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet:
                max_tweet_id = max(max_tweet_id or 0, tweet.get("id"))
                self._process_tweet(tweet)
        return max_tweet_id

    def _process_tweet(self, tweet):
        self.result.increment_stats("tweets")
        # For more info, see https://dev.twitter.com/overview/api/entities-in-twitter-objects
        statuses = [tweet]
        if "retweeted_status" in tweet:
            statuses.append(tweet["retweeted_status"])
        elif "quoted_status" in tweet:
            statuses.append(tweet["quoted_status"])
        for status in statuses:
            self._process_entities(status.get("entities", {}))
            self._process_entities(status.get("extended_entities", {}))
        if self.extract_user_profile_images:
            self.result.urls.append(tweet["user"]["profile_image_url"])
            self.result.urls.append(tweet["user"]["profile_background_image_url"])
            if "profile_banner_url" in tweet["user"]:
                self.result.urls.append(tweet["user"]["profile_banner_url"])
class TwitterHarvester(BaseHarvester):
    def __init__(self, working_path, stream_restart_interval_secs=30 * 60,
                 mq_config=None, debug=False, connection_errors=5,
                 http_errors=5, debug_warcprox=False, tries=3):
        BaseHarvester.__init__(self, working_path, mq_config=mq_config,
                               stream_restart_interval_secs=stream_restart_interval_secs,
                               debug=debug, debug_warcprox=debug_warcprox,
                               tries=tries)
        self.twarc = None
        self.connection_errors = connection_errors
        self.http_errors = http_errors

    def harvest_seeds(self):
        # Create a twarc
        self._create_twarc()

        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.search()
        elif harvest_type == "twitter_filter":
            self.filter()
        elif harvest_type == "twitter_sample":
            self.sample()
        elif harvest_type == "twitter_user_timeline":
            self.user_timeline()
        else:
            raise KeyError

    def _create_twarc(self):
        self.twarc = Twarc(self.message["credentials"]["consumer_key"],
                           self.message["credentials"]["consumer_secret"],
                           self.message["credentials"]["access_token"],
                           self.message["credentials"]["access_token_secret"],
                           http_errors=self.http_errors,
                           connection_errors=self.connection_errors,
                           tweet_mode="extended")

    def search(self):
        assert len(self.message.get("seeds", [])) == 1
        incremental = self.message.get("options", {}).get("incremental", False)
        since_id = self.state_store.get_state(
            __name__, u"{}.since_id".format(self._search_id())) if incremental else None
        query, geocode = self._search_parameters()
        self._harvest_tweets(self.twarc.search(query, geocode=geocode, since_id=since_id))

    def _search_parameters(self):
        if type(self.message["seeds"][0]["token"]) is dict:
            query = self.message["seeds"][0]["token"].get("query")
            geocode = self.message["seeds"][0]["token"].get("geocode")
        else:
            query = self.message["seeds"][0]["token"]
            geocode = None
        return query, geocode

    def _search_id(self):
        query, geocode = self._search_parameters()
        if query and not geocode:
            return query
        if geocode and not query:
            return geocode
        return ":".join([query, geocode])

    def filter(self):
        assert len(self.message.get("seeds", [])) == 1
        track = self.message["seeds"][0]["token"].get("track")
        follow = self.message["seeds"][0]["token"].get("follow")
        locations = self.message["seeds"][0]["token"].get("locations")
        language = self.message["seeds"][0]["token"].get("language")
        self._harvest_tweets(self.twarc.filter(track=track, follow=follow,
                                               locations=locations, lang=language,
                                               event=self.stop_harvest_seeds_event))

    def sample(self):
        self._harvest_tweets(self.twarc.sample(self.stop_harvest_seeds_event))

    def user_timeline(self):
        incremental = self.message.get("options", {}).get("incremental", False)
        for seed in self.message.get("seeds", []):
            seed_id = seed["id"]
            screen_name = seed.get("token")
            user_id = seed.get("uid")
            log.debug("Processing seed (%s) with screen name %s and user id %s",
                      seed_id, screen_name, user_id)
            assert screen_name or user_id
            # If there is not a user_id, look it up.
            if screen_name and not user_id:
                result, user = self._lookup_user(screen_name, "screen_name")
                if result == "OK":
                    user_id = user["id_str"]
                    self.result.uids[seed_id] = user_id
                else:
                    msg = u"User id not found for {} because account is {}".format(
                        screen_name, self._result_to_reason(result))
                    log.exception(msg)
                    self.result.warnings.append(Msg("token_{}".format(result), msg, seed_id=seed_id))
            # Otherwise, get the current screen_name
            else:
                result, user = self._lookup_user(user_id, "user_id")
                if result == "OK":
                    new_screen_name = user["screen_name"]
                    if new_screen_name and new_screen_name != screen_name:
                        self.result.token_updates[seed_id] = new_screen_name
                else:
                    msg = u"User {} (User ID: {}) not found because account is {}".format(
                        screen_name, user_id, self._result_to_reason(result))
                    log.exception(msg)
                    self.result.warnings.append(Msg("uid_{}".format(result), msg, seed_id=seed_id))
                    user_id = None
            if user_id:
                # Get since_id from state_store
                since_id = self.state_store.get_state(
                    __name__, "timeline.{}.since_id".format(user_id)) if incremental else None
                self._harvest_tweets(self.twarc.timeline(user_id=user_id, since_id=since_id))

    def _lookup_user(self, id, id_type):
        url = "https://api.twitter.com/1.1/users/show.json"
        params = {id_type: id}
        # USER_DELETED: 404 and {"errors": [{"code": 50, "message": "User not found."}]}
        # USER_PROTECTED: 200 and user object with "protected": true
        # USER_SUSPENDED: 403 and {"errors":[{"code":63,"message":"User has been suspended."}]}
        result = "OK"
        user = None
        try:
            resp = self.twarc.get(url, params=params, allow_404=True)
            user = resp.json()
            if user['protected']:
                result = "unauthorized"
        except requests.exceptions.HTTPError as e:
            try:
                resp_json = e.response.json()
            except json.decoder.JSONDecodeError:
                raise e
            if e.response.status_code == 404 and self._has_error_code(resp_json, 50):
                result = "not_found"
            elif e.response.status_code == 403 and self._has_error_code(resp_json, 63):
                result = "suspended"
            else:
                raise e
        return result, user

    @staticmethod
    def _has_error_code(resp, code):
        if isinstance(code, int):
            code = (code,)
        for error in resp['errors']:
            if error['code'] in code:
                return True
        return False

    @staticmethod
    def _result_to_reason(result):
        if result == "unauthorized":
            return "protected"
        elif result == "suspended":
            return "suspended"
        return "not found or deleted"

    def _harvest_tweets(self, tweets):
        # max_tweet_id = None
        for count, tweet in enumerate(tweets):
            if not count % 100:
                log.debug("Harvested %s tweets", count)
            self.result.harvest_counter["tweets"] += 1
            if self.stop_harvest_seeds_event.is_set():
                log.debug("Stopping since stop event set.")
                break

    def process_warc(self, warc_filepath):
        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.process_search_warc(warc_filepath)
        elif harvest_type == "twitter_filter":
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_sample":
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_user_timeline":
            self.process_user_timeline_warc(warc_filepath)
        else:
            raise KeyError

    def process_search_warc(self, warc_filepath):
        incremental = self.message.get("options", {}).get("incremental", False)
        since_id = self.state_store.get_state(
            __name__, u"{}.since_id".format(self._search_id())) if incremental else None
        max_tweet_id = self._process_tweets(TwitterRestWarcIter(warc_filepath))
        # Update state store
        if incremental and (max_tweet_id or 0) > (since_id or 0):
            self.state_store.set_state(__name__, u"{}.since_id".format(self._search_id()), max_tweet_id)

    def process_user_timeline_warc(self, warc_filepath):
        incremental = self.message.get("options", {}).get("incremental", False)
        for count, status in enumerate(TwitterRestWarcIter(warc_filepath)):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet or "full_text" in tweet:
                user_id = tweet["user"]["id_str"]
                if incremental:
                    # Update state
                    key = "timeline.{}.since_id".format(user_id)
                    self.state_store.set_state(
                        __name__, key,
                        max(self.state_store.get_state(__name__, key) or 0, tweet.get("id")))
                self._process_tweet(tweet)

    def _process_tweets(self, warc_iter):
        max_tweet_id = None
        for count, status in enumerate(warc_iter):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet or "full_text" in tweet:
                max_tweet_id = max(max_tweet_id or 0, tweet.get("id"))
                self._process_tweet(tweet)
        return max_tweet_id

    def _process_tweet(self, _):
        self.result.increment_stats("tweets")
def user_info_crawler(screen_name, user_dir, user_profile_f, user_profileimg_f,
                      user_tweets_f, user_clean_tweets_f):
    try:
        # Create the Twarc client up front; it is needed for both the
        # profile lookup and the timeline crawl.
        t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret)

        # Crawl the user profile.
        if not os.path.exists(os.path.join(user_dir, user_profile_f)):
            user_profile_data = t.user_lookup(ids=[screen_name], id_type="screen_name")
            for user_profile in user_profile_data:
                with open(os.path.join(user_dir, user_profile_f), 'w') as outfile:
                    json.dump(user_profile, outfile)

        # Crawl the user profile image.
        with open(os.path.join(user_dir, user_profile_f), 'r') as rf:
            user_profile_json = json.load(rf)
        if not os.path.exists(os.path.join(user_dir, user_profileimg_f)):
            # Extract the profile image URL.
            user_profileimg_url = user_profile_json['profile_image_url']

            def image_converter(user_profileimg_url):
                tmp_file = '../data/user/tmp' + user_profileimg_url[-4:]
                if sys.version_info[0] == 2:
                    urllib.urlretrieve(user_profileimg_url, tmp_file)
                elif sys.version_info[0] == 3:
                    urlretrieve(user_profileimg_url, tmp_file)
                from PIL import Image
                im = Image.open(tmp_file)
                rgb_im = im.convert('RGB')
                rgb_im.save(os.path.join(user_dir, user_profileimg_f))
                os.remove(tmp_file)

            if user_profileimg_url:
                user_profileimg_url = user_profileimg_url.replace('_normal', '_bigger')
                image_converter(user_profileimg_url)

        # Crawl the user's tweets.
        if not os.path.exists(os.path.join(user_dir, user_tweets_f)):
            user_timeline_data = t.timeline(screen_name=screen_name)
            with open(os.path.join(user_dir, user_tweets_f), 'a') as outfile:
                for user_timeline in user_timeline_data:
                    json.dump(user_timeline, outfile)
                    outfile.write('\n')

        # Clean the user's tweets.
        if not os.path.exists(os.path.join(user_dir, user_clean_tweets_f)):
            tweet_raw_lines = []
            with open(os.path.join(user_dir, user_tweets_f), 'r') as rf:
                for line in rf:
                    tweet_raw_lines.append(json.loads(line)['full_text'])
            clean_tweets = process_raw_tweets(tweet_raw_lines)
            with open(os.path.join(user_dir, user_clean_tweets_f), 'w') as wf:
                for tweet in clean_tweets:
                    if len(tweet) > 0:
                        wf.write(tweet + '\n')
        return user_profile_json
    except Exception as e:
        # print(e)
        print("Could not predict user's role. Check account info, few tweets, incorrect image format...")
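# Usage sketch (hypothetical paths and file names; each *_f argument is a
# file created inside user_dir, and the module-level credentials plus
# process_raw_tweets are assumed to exist):
#
# profile = user_info_crawler('example_handle', '../data/user/example_handle',
#                             'profile.json', 'profile.jpg',
#                             'tweets.jsonl', 'tweets_clean.txt')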
def crawl_feed(feed_dict, credentials):
    twarc = Twarc(credentials['consumer_key'], credentials['consumer_secret'],
                  credentials['access_token'], credentials['access_token_secret'])

    crawl_time = datetime.datetime.now()
    crawl_time_filename = crawl_time.strftime('%Y%m%d%I%M%S')
    crawl_time_html = crawl_time.strftime('%B %d, %Y')

    crawl_name = feed_dict['crawl_name']
    crawl_type = feed_dict['crawl_type']
    short_name = feed_dict['short_name']
    search_string = feed_dict['search_string']
    feed_dir = feed_dict['feed_dir']
    json_dir = join(feed_dir, 'json')
    html_dir = join(feed_dir, 'html')
    media_dir = join(feed_dir, 'media')
    logs_dir = join(feed_dir, 'logs')

    for directory in [feed_dir, json_dir, html_dir, media_dir, logs_dir]:
        if not os.path.exists(directory):
            os.makedirs(directory)

    log_file = join(logs_dir, 'twarc.log')
    formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    logger = logging.getLogger(crawl_name)
    handler = logging.FileHandler(log_file)
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)

    base_filename = short_name + '-' + crawl_time_filename
    json_file = join(json_dir, base_filename + '.json')

    print("Searching Twitter API for {0}".format(search_string))
    print("Writing JSON and HTML files...")
    logger.info("starting search for %s", search_string)
    tweet_count = 0

    # Timeline crawls treat search_string as a screen name; anything else
    # runs a standard search. Both branches log and archive identically.
    if crawl_type == "timeline":
        tweets = twarc.timeline(screen_name=search_string)
    else:
        tweets = twarc.search(search_string)
    for tweet in tweets:
        with open(json_file, 'a') as json_out:
            json_out.write("{}\n".format(json.dumps(tweet)))
        if "id_str" in tweet:
            logger.info("archived https://twitter.com/%s/status/%s",
                        tweet['user']['screen_name'], tweet["id_str"])
        elif 'limit' in tweet:
            logger.warning("%s tweets undelivered", tweet["limit"]["track"])
        elif 'warning' in tweet:
            logger.warning(tweet['warning']['message'])
        else:
            logger.warning(json.dumps(tweet))
        tweet_count += 1

    if tweet_count == 0:
        logger.info("no new tweets matching %s", search_string)
        # Write an empty json file. Maybe don't do this?
        open(json_file, 'w').close()

    return base_filename, tweet_count, crawl_time_html
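# Usage sketch with a hypothetical feed definition; the keys mirror exactly
# what crawl_feed reads from feed_dict.
example_feed = {
    'crawl_name': 'example-crawl',
    'crawl_type': 'timeline',          # anything other than "timeline" runs a search
    'short_name': 'example',
    'search_string': 'example_handle', # screen name for timeline crawls, query otherwise
    'feed_dir': '/tmp/feeds/example',
}
# credentials = {'consumer_key': '...', 'consumer_secret': '...',
#                'access_token': '...', 'access_token_secret': '...'}
# base_filename, tweet_count, crawl_time_html = crawl_feed(example_feed, credentials)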
class TwitterHarvester(BaseHarvester):
    def __init__(self, working_path, stream_restart_interval_secs=30 * 60,
                 mq_config=None, debug=False, connection_errors=5,
                 http_errors=5, debug_warcprox=False, tries=3):
        BaseHarvester.__init__(self, working_path, mq_config=mq_config,
                               stream_restart_interval_secs=stream_restart_interval_secs,
                               debug=debug, debug_warcprox=debug_warcprox,
                               tries=tries)
        self.twarc = None
        self.connection_errors = connection_errors
        self.http_errors = http_errors
        self.extract_media = False
        self.extract_web_resources = False
        self.extract_user_profile_images = False

    def harvest_seeds(self):
        # Create a twarc
        self._create_twarc()

        # Get harvest extract options.
        self.extract_media = self.message.get("options", {}).get("media", False)
        self.extract_web_resources = self.message.get("options", {}).get("web_resources", False)
        self.extract_user_profile_images = self.message.get("options", {}).get("user_images", False)

        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.search()
        elif harvest_type == "twitter_filter":
            self.filter()
        elif harvest_type == "twitter_sample":
            self.sample()
        elif harvest_type == "twitter_user_timeline":
            self.user_timeline()
        else:
            raise KeyError

    def _create_twarc(self):
        self.twarc = Twarc(self.message["credentials"]["consumer_key"],
                           self.message["credentials"]["consumer_secret"],
                           self.message["credentials"]["access_token"],
                           self.message["credentials"]["access_token_secret"],
                           http_errors=self.http_errors,
                           connection_errors=self.connection_errors)

    def search(self):
        assert len(self.message.get("seeds", [])) == 1
        incremental = self.message.get("options", {}).get("incremental", False)
        query = self.message["seeds"][0]["token"]
        since_id = self.state_store.get_state(__name__, u"{}.since_id".format(query)) if incremental else None
        self._harvest_tweets(self.twarc.search(query, since_id=since_id))

    def filter(self):
        assert len(self.message.get("seeds", [])) == 1
        track = self.message["seeds"][0]["token"].get("track")
        follow = self.message["seeds"][0]["token"].get("follow")
        locations = self.message["seeds"][0]["token"].get("locations")
        self._harvest_tweets(self.twarc.filter(track=track, follow=follow, locations=locations))

    def sample(self):
        self._harvest_tweets(self.twarc.sample())

    def user_timeline(self):
        incremental = self.message.get("options", {}).get("incremental", False)
        for seed in self.message.get("seeds", []):
            seed_id = seed["id"]
            screen_name = seed.get("token")
            user_id = seed.get("uid")
            log.debug("Processing seed (%s) with screen name %s and user id %s",
                      seed_id, screen_name, user_id)
            assert screen_name or user_id
            # If there is not a user_id, look it up.
            if screen_name and not user_id:
                user_id = self._lookup_user_id(screen_name)
                if user_id:
                    # Report back if nsid found
                    self.result.uids[seed_id] = user_id
                else:
                    msg = "User id not found for user {}".format(screen_name)
                    log.exception(msg)
                    self.result.warnings.append(Msg(CODE_TOKEN_NOT_FOUND, msg))
            # Otherwise, get the current screen_name
            else:
                new_screen_name = self._lookup_screen_name(user_id)
                # Guard against a failed lookup before recording the update.
                if new_screen_name and new_screen_name != screen_name:
                    self.result.token_updates[seed_id] = new_screen_name
                    screen_name = new_screen_name
            if user_id:
                try:
                    # Get since_id from state_store
                    since_id = self.state_store.get_state(
                        __name__, "timeline.{}.since_id".format(user_id)) if incremental else None
                    self._harvest_tweets(self.twarc.timeline(user_id=user_id, since_id=since_id))
                except HTTPError as e:
                    if e.response.status_code == 401:
                        msg = "Unauthorized for user {} (User ID: {}) because account is suspended or private".format(
                            screen_name, user_id)
                        log.exception(msg)
                        self.result.warnings.append(Msg(CODE_TOKEN_UNAUTHORIZED, msg))
                    else:
                        raise e

    def _lookup_screen_name(self, user_id):
        """Lookup a screen name given a user id."""
        users = list(self.twarc.user_lookup(user_ids=(user_id,)))
        assert len(users) in (0, 1)
        if users:
            return users[0]["screen_name"]
        return None

    def _lookup_user_id(self, screen_name):
        """Lookup a user id given a screen name."""
        users = list(self.twarc.user_lookup(screen_names=(screen_name,)))
        assert len(users) in (0, 1)
        if users:
            return users[0]["id_str"]
        return None

    def _harvest_tweets(self, tweets):
        # max_tweet_id = None
        for count, tweet in enumerate(tweets):
            if not count % 100:
                log.debug("Harvested %s tweets", count)
            self.result.harvest_counter["tweets"] += 1
            if self.stop_harvest_seeds_event.is_set():
                log.debug("Stopping since stop event set.")
                break

    def _process_entities(self, entities):
        if self.extract_web_resources:
            for url in entities.get("urls", []):
                # Exclude links for tweets
                if url["expanded_url"] and not status_re.match(url["expanded_url"]):
                    self.result.urls.append(url["expanded_url"])
        if self.extract_media:
            for media in entities.get("media", []):
                if media["media_url"]:
                    self.result.urls.append(media["media_url"])

    def process_warc(self, warc_filepath):
        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.process_search_warc(warc_filepath)
        elif harvest_type == "twitter_filter":
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_sample":
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_user_timeline":
            self.process_user_timeline_warc(warc_filepath)
        else:
            raise KeyError

    def process_search_warc(self, warc_filepath):
        incremental = self.message.get("options", {}).get("incremental", False)
        query = self.message["seeds"][0]["token"]
        since_id = self.state_store.get_state(__name__, u"{}.since_id".format(query)) if incremental else None
        max_tweet_id = self._process_tweets(TwitterRestWarcIter(warc_filepath))
        # Update state store; treat a missing id as 0 to avoid comparing None.
        if incremental and (max_tweet_id or 0) > (since_id or 0):
            self.state_store.set_state(__name__, u"{}.since_id".format(query), max_tweet_id)

    def process_user_timeline_warc(self, warc_filepath):
        incremental = self.message.get("options", {}).get("incremental", False)
        for count, status in enumerate(TwitterRestWarcIter(warc_filepath)):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet:
                user_id = tweet["user"]["id_str"]
                if incremental:
                    # Update state
                    key = "timeline.{}.since_id".format(user_id)
                    self.state_store.set_state(
                        __name__, key,
                        max(self.state_store.get_state(__name__, key) or 0, tweet.get("id")))
                self._process_tweet(tweet)

    def _process_tweets(self, warc_iter):
        max_tweet_id = None
        for count, status in enumerate(warc_iter):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet:
                max_tweet_id = max(max_tweet_id or 0, tweet.get("id"))
                self._process_tweet(tweet)
        return max_tweet_id

    def _process_tweet(self, tweet):
        self.result.increment_stats("tweets")
        # For more info, see https://dev.twitter.com/overview/api/entities-in-twitter-objects
        statuses = [tweet]
        if "retweeted_status" in tweet:
            statuses.append(tweet["retweeted_status"])
        elif "quoted_status" in tweet:
            statuses.append(tweet["quoted_status"])
        for status in statuses:
            self._process_entities(status.get("entities", {}))
            self._process_entities(status.get("extended_entities", {}))
        if self.extract_user_profile_images:
            self.result.urls.append(tweet["user"]["profile_image_url"])
            self.result.urls.append(tweet["user"]["profile_background_image_url"])
            if "profile_banner_url" in tweet["user"]:
                self.result.urls.append(tweet["user"]["profile_banner_url"])
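# A sketch of the harvest message shape these TwitterHarvester variants
# consume, inferred from the fields read above (values hypothetical; the
# surrounding framework normally supplies self.message, the state store,
# and the result object):
#
# message = {
#     "type": "twitter_user_timeline",
#     "options": {"incremental": True, "media": False},
#     "credentials": {"consumer_key": "...", "consumer_secret": "...",
#                     "access_token": "...", "access_token_secret": "..."},
#     "seeds": [{"id": "seed1", "token": "example_handle", "uid": None}],
# }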