コード例 #1
0
def get_account(item):
    """Archive the timeline of the Twitter account named in ``item.url``.

    Uses the Twarc library to fetch every tweet Twarc can see for the
    account, keeps the tweets returned by ``filter_tweets_by_start_date`` for
    ``item.date_range``, hands each kept tweet to ``get_assets`` (which, per
    the original author's note, looks for media and tries to download it)
    and finally writes the kept tweets as one JSON document inside
    ``item.storage_folder``.

    Args:
        item: Work item providing ``url``, ``storage_folder`` and
            ``date_range``. ``agent_name`` and ``completed`` are set on it.

    Returns:
        The same ``item`` with ``completed`` set to ``True``.
    """
    item.agent_name = agent_name + "_1_get_account"
    # A single exist_ok call replaces two separate exists()/makedirs() checks
    # and cannot fail if the folder appears between the check and the create.
    os.makedirs(item.storage_folder, exist_ok=True)
    t = Twarc(twitter_consumer_key, twitter_consumer_secret,
              twitter_access_token, twitter_access_token_secret)
    # The screen name is what is left of the URL after removing the site
    # prefix and any "?" characters.
    name = item.url.strip().replace("https://twitter.com/",
                                    "").replace("?", "")
    file_name = "{}_{}.json".format(time.strftime("%d-%m-%Y_%H-%M-%S"), name)
    file_path = os.path.join(item.storage_folder, file_name)
    tweets = list(t.timeline(screen_name=name))
    tweets = filter_tweets_by_start_date(tweets, item.date_range)
    for tweet in tweets:
        get_assets(tweet, item.storage_folder)
    with open(file_path, "w") as outfile:
        json.dump(tweets, outfile)
    item.completed = True
    return item
コード例 #2
0
def collect_timelines(input_file, output_file, credentials_file):
    """Download up to 190 timeline tweets per user and write them as JSON lines.

    Args:
        input_file: Tab-separated file read with pandas; each row is unpacked
            into three values, of which only the first (the user id) is used.
        output_file: Path that receives one JSON-encoded tweet per line.
        credentials_file: Text file whose non-blank lines are passed, in
            order, as positional arguments to ``Twarc``.

    HTTP errors for a user are reported on the progress bar and the user is
    skipped; the total number of tweets written is printed at the end.
    """
    max_per_user = 190
    with open(credentials_file) as fp:
        # Skip blank lines so a stray empty line is not forwarded to Twarc as
        # an extra (empty-string) positional argument.
        credentials = tuple(line.strip() for line in fp if line.strip())
    twarc_obj = Twarc(*credentials)
    df = pd.read_csv(input_file, sep="\t")
    total = 0
    found_users = 0
    # The context manager closes the progress bar even if an unexpected
    # exception escapes the loop.
    with open(output_file, "w+") as fp, tqdm.tqdm(df.values) as pbar:
        for uid, _tid, _u_statuses in pbar:
            pbar.set_description("User {}".format(uid))
            try:
                timeline = twarc_obj.timeline(user_id="{}".format(uid))
                for found, tweet_json in enumerate(timeline, start=1):
                    if found > max_per_user:
                        break
                    total += 1
                    print(json.dumps(tweet_json), file=fp)
                    pbar.set_postfix(found=found_users + 1, total=total)
            except requests.exceptions.HTTPError as e:
                pbar.write("Error for uid={}. {}".format(uid, e))
            else:
                found_users += 1
    print("Collected {} tweets.".format(total))
コード例 #3
0
def get_interactions(consumer_key, consumer_secret, access_token, access_token_secret):
    """
    Arguments are Twitter API credentials. To get them you can go here http://apps.twitter.com/.

    Collects the members of a fixed set of Twitter lists, drops a fixed set of
    excluded accounts, walks each remaining user's timeline and records one
    (author, mentioned user) pair per mention.

    Saves three pickle files in the current directory:
    ``authors.p`` (tweet authors, one entry per mention), ``mentions.p`` (the
    mentioned screen names, aligned with ``authors.p``) and ``users.p`` (the
    set of users considered).
    """
    from twarc import Twarc
    from tqdm import tqdm
    import pickle

    t = Twarc(consumer_key,
              consumer_secret,
              access_token,
              access_token_secret)

    list_ids = ["1335885096063295488",
                "1288082572195639296",
                "1287444819015618561",
                "1283739792702713856",
                "1081734288368898048",
                "910757441855459328",
                "193445218",
                "90205656",
                "85315110"]

    users = {m['screen_name'] for lid in list_ids for m in t.list_members(lid)}

    users_to_exclude = ['premierleague',
                        'SpursOfficial',
                        'Arsenal',
                        'ManCity',
                        'sterling7',
                        'kylewalker2',
                        'HKane',
                        'benmendy23',
                        'dele_official',
                        'RobHolding95',
                        'm8arteta']

    # difference_update ignores names that are not in the set; set.remove
    # would raise KeyError as soon as one excluded account left a list.
    users.difference_update(users_to_exclude)

    authors = []
    mentions = []

    for user in tqdm(users):
        mentioned = [u['screen_name']
                     for tw in t.timeline(screen_name=user)
                     for u in tw['entities']['user_mentions']]
        mentions.extend(mentioned)
        # Repeat the author once per mention so both lists stay aligned.
        authors.extend([user] * len(mentioned))

    outputs = (('authors.p', authors),
               ('mentions.p', mentions),
               ('users.p', users))
    for filename, payload in outputs:
        # "with" closes each file so the pickles are fully flushed to disk.
        with open(filename, 'wb') as fh:
            pickle.dump(payload, fh)
コード例 #4
0
def read_timelines(after_date: datetime, handles: List[str]):
    """Yield tweets from each handle's timeline until ``after_date`` is reached.

    Twitter credentials are read from the environment variables
    ``CONSUMER_KEY``, ``CONSUMER_SECRET``, ``ACCESS_TOKEN`` and
    ``ACCESS_TOKEN_SECRET``.

    For every handle the timeline is walked in the order Twarc returns it.
    Each tweet is yielded first and its creation time checked afterwards, so
    the first tweet created at or before ``after_date`` is still yielded
    before the scan of that handle stops.

    Args:
        after_date: Cut-off compared against each tweet's parsed
            ``created_at`` value.
        handles: Screen names whose timelines are scanned in turn.

    Yields:
        Tweet objects as returned by ``Twarc.timeline``.
    """
    env_names = ('CONSUMER_KEY', 'CONSUMER_SECRET',
                 'ACCESS_TOKEN', 'ACCESS_TOKEN_SECRET')
    client = Twarc(*[os.environ.get(env_name) for env_name in env_names])

    for handle in handles:
        print(f'Scanning twitter handle @{handle}')
        for tweet in client.timeline(screen_name=handle):
            created_at = parse_twitter_datetime(tweet['created_at'])
            print(f'Found tweet created at @{created_at}')
            yield tweet
            reached_cutoff = created_at <= after_date
            if reached_cutoff:
                break
コード例 #5
0
def get_tweets(tcconfig, up_to_pages=1, source_id="dgSHiftCodes"):
    """Return the timeline iterator for ``source_id`` from a new Twarc client.

    Args:
        tcconfig: Mapping with the keys ``consumer_key``, ``consumer_secret``,
            ``access_token`` and ``access_token_secret``.
        up_to_pages: Passed to ``Twarc.timeline`` as ``max_pages``.
        source_id: Screen name whose timeline is requested.

    Returns:
        Whatever ``Twarc.timeline`` returns for the given screen name.

    Raises:
        ValueError: If ``tcconfig`` is ``None``.
        KeyError: If ``tcconfig`` lacks one of the credential keys.
    """
    # tcconfig carries API secrets, so only record whether it was supplied;
    # the credential values themselves must never reach the log.
    logger.debug("get_tweets args: {}".format(
        [None if tcconfig is None else "<redacted>", up_to_pages, source_id]))
    if tcconfig is None:
        logger.error("No Twitter client config argument provided")
        # ValueError is a subclass of Exception, so existing
        # ``except Exception`` handlers keep working.
        raise ValueError("tcconfig cannot be None")

    logger.info("Setting Twitter client credential config")
    ct = tcconfig["consumer_key"]
    cs = tcconfig["consumer_secret"]
    at = tcconfig["access_token"]
    ats = tcconfig["access_token_secret"]

    twsclient = Twarc(ct, cs, at, ats)

    return twsclient.timeline(screen_name=source_id, max_pages=up_to_pages)
コード例 #6
0
def UserTimeLine_Extract(variables_dict, target):
    '''
    Collect the timeline of ``target`` as a list of JSON strings.

    ``variables_dict['credentials']`` must hold the keys ``consumer_key``,
    ``consumer_secret``, ``access_token`` and ``access_token_secret``; they
    are used to build the Twarc client. Every tweet of the timeline is
    serialized with ``json.dumps``, appended to the result, and its id,
    creation time, text and author screen name are printed.

    Returns the list of serialized tweets in timeline order.
    '''
    credentials = variables_dict['credentials']
    credential_names = ('consumer_key', 'consumer_secret',
                        'access_token', 'access_token_secret')
    client = Twarc(**{key: credentials[key] for key in credential_names})

    serialized = []
    for tweet in client.timeline(screen_name=target):
        serialized.append(json.dumps(tweet))
        # Single-argument print(...) produces the same output as the print
        # statement on Python 2.
        header = "{} is created at {} with the following text: ".format(
            tweet['id_str'], tweet['created_at'])
        print(header)
        print("{}".format(tweet['text'].encode('utf-8')))
        print("by {}. \n".format(tweet['user']['screen_name']))

    return serialized
コード例 #7
0
def get_account(item):
    """Archive the timeline of the Twitter account named in ``item.url``.

    Fetches the account's timeline through Twarc, keeps the tweets returned
    by ``filter_tweets_by_start_date`` for ``item.date_range``, passes each
    kept tweet to ``get_assets`` together with ``item.storage_folder`` and
    writes the kept tweets as one JSON document inside that folder.

    Args:
        item: Work item providing ``url``, ``storage_folder`` and
            ``date_range``. ``agent_name`` and ``completed`` are set on it.

    Returns:
        The same ``item`` with ``completed`` set to ``True``.
    """
    item.agent_name = agent_name + "_1_get_account"
    # One exist_ok call replaces the two duplicated exists()/makedirs()
    # checks and is safe if the folder is created concurrently.
    os.makedirs(item.storage_folder, exist_ok=True)
    t = Twarc(twitter_consumer_key, twitter_consumer_secret,
              twitter_access_token, twitter_access_token_secret)
    # The screen name is what is left of the URL after removing the site
    # prefix and any "?" characters.
    name = item.url.strip().replace("https://twitter.com/",
                                    "").replace("?", "")
    file_name = "{}_{}.json".format(time.strftime("%d-%m-%Y_%H-%M-%S"), name)
    file_path = os.path.join(item.storage_folder, file_name)
    tweets = list(t.timeline(screen_name=name))
    tweets = filter_tweets_by_start_date(tweets, item.date_range)
    for tweet in tweets:
        get_assets(tweet, item.storage_folder)
    with open(file_path, "w") as outfile:
        json.dump(tweets, outfile)
    item.completed = True
    return item
コード例 #8
0
# Script fragment: dumps each account's non-retweet timeline tweets to a
# per-account JSON-lines file and walks the replies of recent tweets.
# NOTE(review): API secrets are hard-coded in source; they should be loaded
# from the environment or a config file and these values rotated.
# `consumer_key` and `usa_list` are used below but defined outside this view.
consumer_secret =  "2x8Q0WyWNV86XEWRAuYhJB0kUu4M9BosgemxMjnPbiu00t5HE7"
access_key = "602847795-0GJCA5vujrexWTCfK6ZtxZD2MZ8pCuA1zBKO5fNa"
access_secret = "6cN8sgBp7DiDITJbg0uCSlWoeY84YoLJs5HOxzxmqtjEj"


# time_range is the calendar date ten days before today; replies are only
# fetched for tweets created on or after that date.
prev_date = timedelta(days=10)
today = datetime.now().date()
time_range = today - prev_date

t = Twarc(consumer_key, consumer_secret, access_key, access_secret)
for name in usa_list:
    print(name)
    # Output goes to one file per account; the file is opened in append mode,
    # so repeated runs add to existing content.
    file_name = r"C:\\Users\\ravik\\OneDrive\\Desktop\\UsertimelineReplies\\" + str(name) + ".json"
    # Number of tweets written for this account so far.
    max_poi_tweet = 0
    with open( file_name, "a", encoding='utf-8') as file:
        for tweet in t.timeline(screen_name=name):
            # Retweets are skipped and do not count toward the limit.
            if 'retweeted_status' in tweet.keys():
                print("Its a retweet")
                continue
            # Stop once more than 3000 tweets have been written for this account.
            if max_poi_tweet > 3000:
                break
            # One JSON document per line, keeping non-ASCII text unescaped.
            json.dump(tweet, file,  ensure_ascii=False)
            file.write("\n")
            max_poi_tweet +=1
            max_replies = 0
            if datetime.strptime(tweet['created_at'], "%a %b %d %H:%M:%S %z %Y").date() >= time_range:
                for reply in t.replies(tweet):
                    #print("In")
                    #preprocessing(tweet, file)
                    # NOTE(review): this tests `tweet`, not `reply`; `tweet`
                    # was already checked above, so this branch looks
                    # unreachable as written - confirm whether `reply` was meant.
                    if 'retweeted_status' in tweet.keys():
                        print("Its a retweet")
コード例 #9
0
ファイル: analyzer-twitter.py プロジェクト: x0rzkov/OsintTool
class AnalyzerProcess():
    def __init__(self, config, loggerObject, alerLoggerObject, rules,
                 executionMode):
        """Store the injected collaborators and build the Twarc client.

        Args:
            config: Stored as ``self.config``.
            loggerObject: Stored as ``self.logger``.
            alerLoggerObject: Stored as ``self.alertLogger``.
            rules: Stored as ``self.rules``; iterated by ``run``.
            executionMode: Stored as ``self.executionMode``; the search
                methods compare it against ``"daemon"``.
        """
        # Injected collaborators and settings.
        self.config = config
        self.rules = rules
        self.executionMode = executionMode
        self.logger = loggerObject
        self.alertLogger = alerLoggerObject

        # Placeholder credentials: replace with real Twitter API values.
        self.consumer_key = "insert Twitter API consumer key"
        self.consumer_secret = "insert Twitter API consumer secret"
        self.access_token = "insert Twitter API access token"
        self.access_token_secret = "insert Twitter API token secret"
        self.twarc = Twarc(self.consumer_key, self.consumer_secret,
                           self.access_token, self.access_token_secret)

        # Base directory used for the mask image and the generated clouds.
        self.currdir = "/home/centos/modosint-python3" + path.dirname(__file__)
        # Replaced by an open file object for each rule inside run().
        self.wcloud = ""

        # Spanish stop words plus URL fragments and quote characters.
        extra_stop_words = ["http", "https", "co", "n'", "'", '"']
        self.stop_words = get_stop_words('spanish')
        self.stop_words.extend(extra_stop_words)

# Search tweets that contain the term translated into a different language

    def searchDifLanguage(self, text, language, ruleId):
        """Search tweets matching ``text`` translated into ``language``.

        ``text`` is translated to ``language`` and used as the Twarc search
        query. Results are consumed in the order Twarc returns them and the
        loop stops at the first tweet outside the accepted window:

        * daemon mode (``self.executionMode == "daemon"``): tweets created
          less than 300 seconds before the current UTC time;
        * otherwise: tweets whose date is today's or yesterday's local date.

        Retweets and tweets whose id is already in ``cache.txt`` are skipped.
        Every other tweet has its id appended to the cache, its text stripped
        of emoji and translated to Spanish, a JSON record appended to
        ``graylog.txt`` and the translated text written to ``self.wcloud``.
        A tweet whose translation raises ``ValueError`` is logged and skipped;
        its id stays in the cache, so it is not retried.

        Args:
            text: Search term(s) in the source language.
            language: Destination language passed to the translator.
            ruleId: Rule identifier (string); stored in each record and used
                to build the word-cloud file path.
        """
        log_dir = "/var/log/modosint/analyzer-twitter/"
        graylog_path = log_dir + "graylog.txt"
        cache_path = log_dir + "cache.txt"
        wcloud_path = log_dir + "wcloudRule" + ruleId + ".txt"
        daemon_mode = self.executionMode == "daemon"

        # Compiled once per call instead of once per tweet.
        # NOTE(review): several alternatives below lack an opening "[", so
        # they match a literal "X-Y]" sequence rather than a character range;
        # the pattern is kept unchanged here - confirm the intended ranges.
        emoji_pattern = re.compile(
            u"(\ud83d[\ude00-\ude4f])|"  # emoticons
            u"(\ud83c[\udf00-\uffff])|"  # symbols & pictographs (1 of 2)
            u"(\ud83d[\u0000-\uddff])|"  # symbols & pictographs (2 of 2)
            u"(\ud83d[\ude80-\udeff])|"  # transport & map symbols
            u"(\U0001F1E0-\U0001F1FF])|"
            u"(\U0001F600-\U0001F64F])|"  # emoticons 2
            u"(\U0001F300-\U0001F5FF])|"  # symbols & pictographs
            u"(\U0001F680-\U0001F6FF])|"
            u"(\u2600-\u26FF])|"
            u"(\U0001F1F2\U0001F1F4)|"  # Macau flag
            u"([\U0001F1E6-\U0001F1FF]{2})|"  # flags
            u"([\U0001F600-\U0001F64F])"  # emoticons 3
            u"(\ud83c[\udde0-\uddff])"  # flags (iOS)
            "+",
            flags=re.UNICODE)

        # Both files are context-managed so the graylog handle is closed
        # (and flushed) when the method returns or raises.
        with open(graylog_path, "+a", encoding='utf8') as fichero, \
                io.open(cache_path, 'a+') as f:
            os.chmod(cache_path, 0o777)
            traductor = Translator()
            translatedText = traductor.translate(text, dest=language)

            for tweet in self.twarc.search(translatedText.text):
                created = parser.parse(tweet['created_at'])
                if daemon_mode:
                    # Compare full timestamps: comparing only the time of day
                    # accepts tweets from earlier days whose clock time is
                    # later than the current one.
                    if created.tzinfo is None:
                        created = created.replace(
                            tzinfo=datetime.timezone.utc)
                    now_utc = datetime.datetime.now(datetime.timezone.utc)
                    in_window = (now_utc - created).total_seconds() < 300
                else:
                    today = datetime.datetime.now().date()
                    in_window = created.date() in (today,
                                                   today - timedelta(1))
                if not in_window:
                    break
                if 'retweeted_status' in tweet:  # avoid RT
                    continue

                f.seek(0)  # read temporary file (cache)
                cached_ids = [x.strip('\n').strip('u') for x in f.readlines()]
                if tweet['id_str'] in cached_ids:
                    continue
                f.seek(0, 2)  # write temporary file (cache)
                f.write(tweet['id_str'] + '\n')

                texto = tweet['full_text']
                for c in texto:
                    if c in emoji.UNICODE_EMOJI:
                        texto = texto.replace(c, "")
                try:
                    resultesp = traductor.translate(
                        emoji_pattern.sub(r'', texto), dest='es')
                except ValueError:
                    # Without a translation there is nothing to record, so
                    # skip the tweet instead of using an unbound/stale result.
                    self.logger.debug(
                        '[Emoji Error] Tweet can not be translated. Unrecognized emoji in tweet.'
                    )
                    continue

                tweetdata = {
                    "CreatedTime": tweet['created_at'],
                    "short_message": tweet['full_text'],
                    "TranslatedTweet": resultesp.text,
                    "Author": tweet['user']['screen_name'],
                    "Retweets": tweet['retweet_count'],
                    "Likes": tweet['favorite_count'],
                    "Location": tweet['user']['location'],
                    "Rule": ruleId,
                    "full_message": "Tweet matched with RULE: " + ruleId
                }
                fichero.write(json.dumps(tweetdata) + '\n')
                self.wcloud.write(resultesp.text + '\n')
                os.chmod(wcloud_path, 0o777)
                os.chmod(graylog_path, 0o777)

#Search Tweets that contains term or Hashtag

    def searchTweetOrHashtag(self, text, ruleId):
        """Search tweets matching ``text`` and record the new ones.

        Results of the Twarc search are consumed in the order returned and
        the loop stops at the first tweet outside the accepted window:

        * daemon mode (``self.executionMode == "daemon"``): tweets created
          less than 300 seconds before the current UTC time;
        * otherwise: tweets whose date is today's or yesterday's local date.

        Retweets and tweets whose id is already in ``cache.txt`` are skipped.
        Every other tweet has its id appended to the cache, a JSON record
        appended to ``graylog.txt`` and its full text written to
        ``self.wcloud``.

        Args:
            text: Search term or hashtag passed to ``Twarc.search``.
            ruleId: Rule identifier (string); stored in each record and used
                to build the word-cloud file path.
        """
        log_dir = "/var/log/modosint/analyzer-twitter/"
        graylog_path = log_dir + "graylog.txt"
        cache_path = log_dir + "cache.txt"
        wcloud_path = log_dir + "wcloudRule" + ruleId + ".txt"
        daemon_mode = self.executionMode == "daemon"

        # Both files are context-managed so the graylog handle is closed
        # (and flushed) when the method returns or raises.
        with open(graylog_path, "+a", encoding='utf8') as fichero, \
                io.open(cache_path, 'a+') as f:
            os.chmod(cache_path, 0o777)

            for tweet in self.twarc.search(text):
                created = parser.parse(tweet['created_at'])
                if daemon_mode:
                    # Compare full timestamps: comparing only the time of day
                    # accepts tweets from earlier days whose clock time is
                    # later than the current one.
                    if created.tzinfo is None:
                        created = created.replace(
                            tzinfo=datetime.timezone.utc)
                    now_utc = datetime.datetime.now(datetime.timezone.utc)
                    in_window = (now_utc - created).total_seconds() < 300
                else:
                    # no daemon (tweets from today and yesterday)
                    today = datetime.datetime.now().date()
                    in_window = created.date() in (today,
                                                   today - timedelta(1))
                if not in_window:
                    break
                if 'retweeted_status' in tweet:  # avoid RT
                    continue

                f.seek(0)  # read temporary file (cache)
                cached_ids = [x.strip('\n').strip('u') for x in f.readlines()]
                if tweet['id_str'] in cached_ids:
                    continue
                f.seek(0, 2)  # write temporary file (cache)
                f.write(tweet['id_str'] + '\n')

                tweetdata = {
                    "CreatedTime": tweet['created_at'],
                    "short_message": tweet['full_text'],
                    "Author": tweet['user']['screen_name'],
                    "Retweets": tweet['retweet_count'],
                    "Likes": tweet['favorite_count'],
                    "Location": tweet['user']['location'],
                    "Rule": ruleId,
                    "full_message": "Tweet matched with RULE: " + ruleId
                }
                fichero.write(json.dumps(tweetdata) + '\n')
                self.wcloud.write(tweet['full_text'] + '\n')
                os.chmod(wcloud_path, 0o777)
                os.chmod(graylog_path, 0o777)

    #Search All Tweets or timeline from @user
    def searchUserTweets(self, user, ruleId, fullstring):
        """Record timeline tweets of ``user`` that contain any watched term.

        The timeline is walked for at most 30 seconds. A tweet is recorded
        when its ``full_text`` contains at least one entry of ``fullstring``
        and its id is not yet in ``cache.txt``: the id is appended to the
        cache, a JSON record is appended to ``graylog.txt`` and the tweet
        text is written to ``self.wcloud``.

        Args:
            user: Account passed as the second positional argument of
                ``Twarc.timeline`` (presumably the screen name - confirm
                against the installed twarc version).
            ruleId: Rule identifier (string); stored in each record and used
                to build the word-cloud file path.
            fullstring: Iterable of terms looked up in each tweet's text.
        """
        log_dir = "/var/log/modosint/analyzer-twitter/"
        graylog_path = log_dir + "graylog.txt"
        cache_path = log_dir + "cache.txt"
        wcloud_path = log_dir + "wcloudRule" + ruleId + ".txt"

        # Both files are context-managed so the graylog handle is closed
        # (and flushed) when the method returns or raises.
        with open(graylog_path, "+a", encoding='utf8') as fichero, \
                io.open(cache_path, 'a+') as f:
            os.chmod(cache_path, 0o777)
            tweets = self.twarc.timeline(None, user, None, None)
            t_end = time.time() + 30
            for tweet in tweets:
                if time.time() >= t_end:
                    break
                # A tweet is written at most once however many terms match,
                # so one membership test replaces a cache re-read per term.
                if not any(text in tweet['full_text'] for text in fullstring):
                    continue

                f.seek(0)  # read temporary file (cache)
                cached_ids = [x.strip('\n').strip('u') for x in f.readlines()]
                if tweet['id_str'] in cached_ids:
                    continue
                f.seek(0, 2)  # write temporary file (cache)
                f.write(tweet['id_str'] + '\n')

                tweetdata = {
                    "CreatedTime": tweet['created_at'],
                    "short_message": tweet['full_text'],
                    "Author": tweet['user']['screen_name'],
                    "Retweets": tweet['retweet_count'],
                    "Likes": tweet['favorite_count'],
                    "Location": tweet['user']['location'],
                    "Rule": ruleId,
                    "full_message": "Tweet matched with RULE: " + ruleId
                }
                fichero.write(json.dumps(tweetdata) + '\n')
                self.wcloud.write(tweet['full_text'] + '\n')
                os.chmod(wcloud_path, 0o777)
                os.chmod(graylog_path, 0o777)

    def create_wordcloud(self, text, ruleId):
        """Render ``text`` as a word-cloud PNG for the given rule.

        The cloud uses ``twitter_mask.png`` from ``self.currdir`` as mask and
        ``self.stop_words`` as stop words. The image is written to
        ``<currdir>/WordCloud/Twitter/wcTwitterRule<ruleId>.png`` and made
        world-accessible (mode 0o777).

        A ``ValueError`` raised while generating or saving the cloud is
        logged and otherwise ignored, so one failing rule does not abort the
        caller.

        Args:
            text: Text from which the cloud is generated.
            ruleId: Rule identifier (string) used in the output file name.
        """
        mask = np.array(Image.open(path.join(self.currdir,
                                             "twitter_mask.png")))
        # create wordcloud object
        wc = WordCloud(background_color="white",
                       max_words=200,
                       mask=mask,
                       stopwords=self.stop_words)
        target = path.join(self.currdir + "/WordCloud/Twitter/",
                           "wcTwitterRule" + ruleId + ".png")
        try:
            # generate wordcloud
            wc.generate(text)
            # save wordcloud
            wc.to_file(target)
            os.chmod(target, 0o777)
        except ValueError as e:
            # Still best-effort, but leave a trace instead of silently
            # discarding the error.
            self.logger.info(
                "Word cloud for rule {} was not generated: {}".format(
                    ruleId, e))

    # custom functionality
    def run(self):
        """Run every rule's searches, then build the word clouds and plots.

        For each rule in ``self.rules`` the rule's word-cloud text file is
        opened as ``self.wcloud`` and, when the rule has ``_string``:

        * ``searchDifLanguage`` is called if the rule has ``_language``,
          otherwise ``searchTweetOrHashtag``;
        * ``searchUserTweets`` is also called if the rule has ``_username``.

        Afterwards the ``WordCloud/Twitter`` output directories are created
        if missing, one word cloud per rule is generated from its text file,
        and ``createPlotMentions`` / ``createPlotHashtag`` are invoked.
        """
        self.logger.info("working...")
        OSINTRules = self.rules
        wcloud_prefix = "/var/log/modosint/analyzer-twitter/wcloudRule"
        for element in OSINTRules:
            ruleId = element.get('metadata', False).get('id', False)
            self.wcloud = open(wcloud_prefix + ruleId + ".txt", "a+")
            try:
                checkUsername = element.get('_username', False)
                checkString = element.get('_string', False)
                if checkUsername:
                    user = ''.join(element['_username'])
                if checkString:
                    string = ','.join(element['_string'])
                    fullstring = element['_string']
                    checkLanguage = element.get('_language', False)
                    if checkLanguage:
                        language = ''.join(element['_language'])
                        self.searchDifLanguage(string, language, ruleId)
                    else:
                        self.searchTweetOrHashtag(string, ruleId)
                    if checkUsername:
                        self.searchUserTweets(user, ruleId, fullstring)
            finally:
                # Close (and thereby flush) the rule's text file so the
                # read-back below sees everything the searches wrote.
                self.wcloud.close()
        for subdir in ("/WordCloud", "/WordCloud/Twitter"):
            if not os.path.exists(self.currdir + subdir):
                os.makedirs(self.currdir + subdir + "/")
                os.chmod(self.currdir + subdir + "/", 0o777)
        for element in OSINTRules:
            ruleId = element.get('metadata', False).get('id', False)
            with open(wcloud_prefix + ruleId + ".txt", "r") as wcloud_file:
                file_content = wcloud_file.readlines()
            # The stringified list of lines is what gets rendered.
            self.create_wordcloud(str(file_content), ruleId)
        self.createPlotMentions()
        self.createPlotHashtag()
        self.alertLogger.info("Twitter Analyzer Job Finished succesfully.")

    def exportReferenceHashtag(self, mensaje):
        """Extract the hashtags contained in a message.

        Args:
            mensaje: Message text to scan.

        Returns:
            The ``#word`` matches in order of appearance, or ``np.nan`` when
            the message has none.
        """
        lista = re.findall(r'#\w+', mensaje)
        # np.nan instead of np.NaN: the capitalised alias was removed in
        # NumPy 2.0.
        return lista if lista else np.nan

    def exportReferenceMentions(self, mensaje):
        """Extract the user mentions contained in a message.

        Args:
            mensaje: Message text to scan.

        Returns:
            The ``@word`` matches in order of appearance, or ``np.nan`` when
            the message has none.
        """
        lista = re.findall(r'@\w+', mensaje)
        # np.nan instead of np.NaN: the capitalised alias was removed in
        # NumPy 2.0.
        return lista if lista else np.nan

    def createPlotMentions(self):
        """Plot the 20 most frequent @mentions found in the graylog file.

        Reads ``/var/log/modosint/analyzer-twitter/graylog.txt`` line by
        line, extracts the mentions from each record's ``short_message``
        and saves a bar plot as ``mentionsPlot.png`` (mode 0o777).  Nothing
        is plotted when the file holds no records.
        """
        with io.open('/var/log/modosint/analyzer-twitter/graylog.txt',
                     'r') as f:
            # Parse with json.loads rather than eval(): eval executes
            # arbitrary expressions and fails on true/false/null.
            # Assumes one JSON object per line — TODO confirm against the
            # writer of graylog.txt.
            records = [json.loads(line) for line in f if line.strip()]
        if not records:
            return
        data_twitter = pd.DataFrame(records)
        referenceMentions = data_twitter.short_message.map(
            self.exportReferenceMentions)
        # Messages without mentions map to NaN; drop them before flattening.
        referenceMentions = referenceMentions.dropna()
        referenceMentions_list = list(
            itertools.chain.from_iterable(referenceMentions))
        count_referenceMentions = pd.Series(
            referenceMentions_list).value_counts()
        top_mentions = count_referenceMentions.iloc[:20]
        # NOTE(review): no path separator is inserted between currdir and
        # the file name; kept as in the original — confirm this is intended.
        plot_file = self.currdir + 'mentionsPlot.png'
        fig = plt.figure(figsize=(12, 8))
        try:
            sns.barplot(y=top_mentions.index, x=top_mentions.values)
            fig.savefig(plot_file)
        finally:
            # Release the figure so repeated runs do not accumulate them.
            plt.close(fig)
        os.chmod(plot_file, 0o777)

    def createPlotHashtag(self):
        """Plot the 20 most frequent hashtags found in the graylog file.

        Reads ``/var/log/modosint/analyzer-twitter/graylog.txt`` line by
        line, extracts the hashtags from each record's ``short_message``
        and saves a bar plot as ``mentionsHashtag.png`` (mode 0o777).
        Nothing is plotted when the file holds no records.
        """
        with io.open('/var/log/modosint/analyzer-twitter/graylog.txt',
                     'r') as f:
            # Parse with json.loads rather than eval(): eval executes
            # arbitrary expressions and fails on true/false/null.
            # Assumes one JSON object per line — TODO confirm against the
            # writer of graylog.txt.
            records = [json.loads(line) for line in f if line.strip()]
        if not records:
            return
        data_twitter = pd.DataFrame(records)
        referenceHash = data_twitter.short_message.map(
            self.exportReferenceHashtag)
        # Messages without hashtags map to NaN; drop them before flattening.
        referenceHash = referenceHash.dropna()
        referenceHash_list = list(
            itertools.chain.from_iterable(referenceHash))
        count_referenceHash = pd.Series(referenceHash_list).value_counts()
        top_hashtags = count_referenceHash.iloc[:20]
        # NOTE(review): no path separator is inserted between currdir and
        # the file name; kept as in the original — confirm this is intended.
        plot_file = self.currdir + 'mentionsHashtag.png'
        fig = plt.figure(figsize=(12, 8))
        try:
            sns.barplot(y=top_hashtags.index, x=top_hashtags.values)
            fig.savefig(plot_file)
        finally:
            # Release the figure so repeated runs do not accumulate them.
            plt.close(fig)
        os.chmod(plot_file, 0o777)
コード例 #10
0
class TwitterHarvester(BaseHarvester):
    """Harvester that collects tweets via twarc and post-processes WARC files.

    The harvest request is read from ``self.message`` (keys used here:
    ``type``, ``credentials``, ``seeds`` and ``options``).  ``self.result``,
    ``self.state_store`` and ``self.stop_harvest_seeds_event`` are used but
    not set in this class; presumably they are provided by ``BaseHarvester``.
    """

    def __init__(self,
                 working_path,
                 stream_restart_interval_secs=30 * 60,
                 mq_config=None,
                 debug=False,
                 connection_errors=5,
                 http_errors=5,
                 debug_warcprox=False,
                 tries=3):
        """Initialise the base harvester and the Twitter-specific settings.

        ``connection_errors`` and ``http_errors`` are forwarded to the
        ``Twarc`` client when it is created; every other argument is passed
        through to ``BaseHarvester.__init__``.
        """
        BaseHarvester.__init__(
            self,
            working_path,
            mq_config=mq_config,
            stream_restart_interval_secs=stream_restart_interval_secs,
            debug=debug,
            debug_warcprox=debug_warcprox,
            tries=tries)
        # Created per harvest in _create_twarc().
        self.twarc = None
        self.connection_errors = connection_errors
        self.http_errors = http_errors
        # Extraction switches; overwritten from the message options in
        # harvest_seeds().
        self.extract_media = False
        self.extract_web_resources = False
        self.extract_user_profile_images = False

    def harvest_seeds(self):
        """Create the twarc client and run the harvest named by the message.

        Raises:
            KeyError: If the message ``type`` is not a supported harvest type.
        """
        # Create a twarc
        self._create_twarc()

        # Get harvest extract options.
        options = self.message.get("options", {})
        self.extract_media = options.get("media", False)
        self.extract_web_resources = options.get("web_resources", False)
        self.extract_user_profile_images = options.get("user_images", False)

        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.search()
        elif harvest_type == "twitter_filter":
            self.filter()
        elif harvest_type == "twitter_sample":
            self.sample()
        elif harvest_type == "twitter_user_timeline":
            self.user_timeline()
        else:
            raise KeyError(
                "Unsupported harvest type: {}".format(harvest_type))

    def _create_twarc(self):
        """Build ``self.twarc`` from the credentials in the message."""
        credentials = self.message["credentials"]
        self.twarc = Twarc(credentials["consumer_key"],
                           credentials["consumer_secret"],
                           credentials["access_token"],
                           credentials["access_token_secret"],
                           http_errors=self.http_errors,
                           connection_errors=self.connection_errors)

    def search(self):
        """Harvest the search results for the single seed's query.

        When the ``incremental`` option is set, the search resumes from the
        ``since_id`` stored for this query.
        """
        assert len(self.message.get("seeds", [])) == 1

        incremental = self.message.get("options", {}).get("incremental", False)
        query = self.message["seeds"][0]["token"]

        since_id = self.state_store.get_state(
            __name__, u"{}.since_id".format(query)) if incremental else None

        self._harvest_tweets(self.twarc.search(query, since_id=since_id))

    def filter(self):
        """Harvest the filter stream described by the single seed's token."""
        assert len(self.message.get("seeds", [])) == 1

        token = self.message["seeds"][0]["token"]
        track = token.get("track")
        follow = token.get("follow")
        locations = token.get("locations")

        self._harvest_tweets(
            self.twarc.filter(track=track,
                              follow=follow,
                              locations=locations,
                              event=self.stop_harvest_seeds_event))

    def sample(self):
        """Harvest the sample stream until the stop event is set."""
        self._harvest_tweets(self.twarc.sample(self.stop_harvest_seeds_event))

    def user_timeline(self):
        """Harvest the timeline of every seed in the message.

        Each seed carries a screen name (``token``) and/or a user id
        (``uid``).  The missing or current counterpart is looked up first;
        seeds whose account cannot be resolved get a warning in
        ``self.result.warnings`` and are skipped.
        """
        incremental = self.message.get("options", {}).get("incremental", False)

        for seed in self.message.get("seeds", []):
            seed_id = seed["id"]
            screen_name = seed.get("token")
            user_id = seed.get("uid")
            log.debug(
                "Processing seed (%s) with screen name %s and user id %s",
                seed_id, screen_name, user_id)
            assert screen_name or user_id

            # If there is not a user_id, look it up.
            if screen_name and not user_id:
                user_id = self._lookup_user_id(screen_name)
                if user_id:
                    # Report back if nsid found
                    self.result.uids[seed_id] = user_id
                else:
                    msg = "User id not found for user {} because account is not found or suspended".format(
                        screen_name)
                    # No exception is active here, so log.exception would
                    # only append a meaningless "NoneType: None" traceback.
                    log.warning(msg)
                    self.result.warnings.append(
                        Msg(CODE_TOKEN_NOT_FOUND, msg, seed_id=seed_id))
            # Otherwise, get the current screen_name
            else:
                new_screen_name = self._lookup_screen_name(user_id)
                # if can't find the screen_name, ignore get timeline
                if not new_screen_name:
                    msg = "Screen name not found for user id {} because account is not found or suspended".format(
                        user_id)
                    log.warning(msg)
                    self.result.warnings.append(
                        Msg(CODE_TOKEN_NOT_FOUND, msg, seed_id=seed_id))
                    # reset the user_id, ignore the get timeline
                    user_id = None
                if new_screen_name and new_screen_name != screen_name:
                    self.result.token_updates[seed_id] = new_screen_name
                    screen_name = new_screen_name

            if user_id:
                try:
                    # Get since_id from state_store
                    since_id = self.state_store.get_state(
                        __name__, "timeline.{}.since_id".format(
                            user_id)) if incremental else None

                    self._harvest_tweets(
                        self.twarc.timeline(user_id=user_id,
                                            since_id=since_id))

                except HTTPError as e:
                    if e.response.status_code == 401:
                        account = "user {} (User ID: {})".format(
                            screen_name, user_id
                        ) if screen_name else "user ID: {}".format(user_id)
                        msg = "Unauthorized for {} because account is suspended or protected".format(
                            account)
                        log.exception(msg)
                        self.result.warnings.append(
                            Msg(CODE_TOKEN_UNAUTHORIZED, msg, seed_id=seed_id))
                    else:
                        raise

    def _lookup_screen_name(self, user_id):
        """
        Lookup a screen name given a user id.

        Returns None when no user is returned or the lookup answers 404;
        any other HTTPError is re-raised.
        """
        try:
            users = list(self.twarc.user_lookup(user_ids=(user_id, )))
            assert len(users) in (0, 1)
            if users:
                return users[0]["screen_name"]
        except HTTPError as e:
            if e.response.status_code != 404:
                raise
        return None

    def _lookup_user_id(self, screen_name):
        """
        Lookup a user id given a screen name.

        Returns None when no user is returned or the lookup answers 404;
        any other HTTPError is re-raised.
        """
        try:
            users = list(self.twarc.user_lookup(screen_names=(screen_name, )))
            assert len(users) in (0, 1)
            if users:
                return users[0]["id_str"]
        except HTTPError as e:
            if e.response.status_code != 404:
                raise
        return None

    def _harvest_tweets(self, tweets):
        """Consume an iterable of tweets, counting them until told to stop.

        The tweets themselves are not inspected here; each one only bumps
        ``self.result.harvest_counter["tweets"]``.
        """
        # max_tweet_id = None
        for count, tweet in enumerate(tweets):
            if not count % 100:
                log.debug("Harvested %s tweets", count)
            self.result.harvest_counter["tweets"] += 1
            if self.stop_harvest_seeds_event.is_set():
                log.debug("Stopping since stop event set.")
                break

    def _process_entities(self, entities):
        """Collect URLs from a tweet ``entities`` dict into the result.

        Expanded URLs are added when web-resource extraction is enabled
        (those matching ``status_re`` are excluded); media URLs are added
        when media extraction is enabled.
        """
        if self.extract_web_resources:
            for url in entities.get("urls", []):
                # Exclude links for tweets
                if url["expanded_url"] and not status_re.match(
                        url["expanded_url"]):
                    self.result.urls.append(url["expanded_url"])
        if self.extract_media:
            for media in entities.get("media", []):
                if media["media_url"]:
                    self.result.urls.append(media["media_url"])

    def process_warc(self, warc_filepath):
        """Process a WARC file according to the harvest type of the message.

        Raises:
            KeyError: If the message ``type`` is not a supported harvest type.
        """
        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.process_search_warc(warc_filepath)
        elif harvest_type == "twitter_filter":
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_sample":
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_user_timeline":
            self.process_user_timeline_warc(warc_filepath)
        else:
            raise KeyError(
                "Unsupported harvest type: {}".format(harvest_type))

    def process_search_warc(self, warc_filepath):
        """Process a search WARC and advance the stored ``since_id``.

        The state is only written for incremental harvests and only when a
        tweet id larger than the stored one was seen.
        """
        incremental = self.message.get("options", {}).get("incremental", False)
        query = self.message["seeds"][0]["token"]

        since_id = self.state_store.get_state(
            __name__, u"{}.since_id".format(query)) if incremental else None

        max_tweet_id = self._process_tweets(TwitterRestWarcIter(warc_filepath))

        # Update state store
        # Either side may be None (no tweets / no stored state); comparing
        # None with an int raises TypeError on Python 3, so treat it as 0.
        if incremental and (max_tweet_id or 0) > (since_id or 0):
            self.state_store.set_state(__name__, u"{}.since_id".format(query),
                                       max_tweet_id)

    def process_user_timeline_warc(self, warc_filepath):
        """Process a user-timeline WARC, tracking ``since_id`` per user.

        For incremental harvests the stored ``timeline.<user_id>.since_id``
        is raised to the largest tweet id seen for that user.
        """
        incremental = self.message.get("options", {}).get("incremental", False)

        for count, status in enumerate(TwitterRestWarcIter(warc_filepath)):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet:
                user_id = tweet["user"]["id_str"]
                if incremental:
                    # Update state
                    key = "timeline.{}.since_id".format(user_id)
                    # A missing state is None; fall back to 0 so max() does
                    # not compare None with an int (TypeError on Python 3).
                    self.state_store.set_state(
                        __name__, key,
                        max(self.state_store.get_state(__name__, key) or 0,
                            tweet.get("id")))
                self._process_tweet(tweet)

    def _process_tweets(self, warc_iter):
        """Process every tweet yielded by ``warc_iter``.

        Items without a ``text`` key are skipped.

        Returns:
            The largest tweet id processed, or None when there was none.
        """
        max_tweet_id = None
        for count, status in enumerate(warc_iter):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet:
                # `or 0` keeps the first comparison away from None.
                max_tweet_id = max(max_tweet_id or 0, tweet.get("id"))
                self._process_tweet(tweet)
        return max_tweet_id

    def _process_tweet(self, tweet):
        """Count a tweet and collect the URLs selected by the extract options.

        Entities of the tweet and of its retweeted status (or, failing
        that, its quoted status) are scanned.  Profile image URLs of the
        tweet's author are added when profile-image extraction is enabled.
        """
        self.result.increment_stats("tweets")
        # For more info, see https://dev.twitter.com/overview/api/entities-in-twitter-objects
        statuses = [tweet]
        if "retweeted_status" in tweet:
            statuses.append(tweet["retweeted_status"])
        elif "quoted_status" in tweet:
            statuses.append(tweet["quoted_status"])
        for status in statuses:
            self._process_entities(status.get("entities", {}))
            self._process_entities(status.get("extended_entities", {}))
        if self.extract_user_profile_images:
            self.result.urls.append(tweet["user"]["profile_image_url"])
            self.result.urls.append(
                tweet["user"]["profile_background_image_url"])
            if "profile_banner_url" in tweet["user"]:
                self.result.urls.append(tweet["user"]["profile_banner_url"])
コード例 #11
0
class TwitterHarvester(BaseHarvester):
    """Harvester that collects tweets via twarc and post-processes WARC files.

    The harvest request is read from ``self.message`` (keys used here:
    ``type``, ``credentials``, ``seeds`` and ``options``).  ``self.result``,
    ``self.state_store`` and ``self.stop_harvest_seeds_event`` are used but
    not set in this class; presumably they are provided by ``BaseHarvester``.
    """

    def __init__(self,
                 working_path,
                 stream_restart_interval_secs=30 * 60,
                 mq_config=None,
                 debug=False,
                 connection_errors=5,
                 http_errors=5,
                 debug_warcprox=False,
                 tries=3):
        """Initialise the base harvester and the Twitter-specific settings.

        ``connection_errors`` and ``http_errors`` are forwarded to the
        ``Twarc`` client when it is created; every other argument is passed
        through to ``BaseHarvester.__init__``.
        """
        BaseHarvester.__init__(
            self,
            working_path,
            mq_config=mq_config,
            stream_restart_interval_secs=stream_restart_interval_secs,
            debug=debug,
            debug_warcprox=debug_warcprox,
            tries=tries)
        # Created per harvest in _create_twarc().
        self.twarc = None
        self.connection_errors = connection_errors
        self.http_errors = http_errors

    def harvest_seeds(self):
        """Create the twarc client and run the harvest named by the message.

        Raises:
            KeyError: If the message ``type`` is not a supported harvest type.
        """
        # Create a twarc
        self._create_twarc()

        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.search()
        elif harvest_type == "twitter_filter":
            self.filter()
        elif harvest_type == "twitter_sample":
            self.sample()
        elif harvest_type == "twitter_user_timeline":
            self.user_timeline()
        else:
            raise KeyError(
                "Unsupported harvest type: {}".format(harvest_type))

    def _create_twarc(self):
        """Build ``self.twarc`` (extended tweet mode) from the credentials."""
        credentials = self.message["credentials"]
        self.twarc = Twarc(credentials["consumer_key"],
                           credentials["consumer_secret"],
                           credentials["access_token"],
                           credentials["access_token_secret"],
                           http_errors=self.http_errors,
                           connection_errors=self.connection_errors,
                           tweet_mode="extended")

    def search(self):
        """Harvest the search results for the single seed.

        When the ``incremental`` option is set, the search resumes from the
        ``since_id`` stored under this search's id (see ``_search_id``).
        """
        assert len(self.message.get("seeds", [])) == 1

        incremental = self.message.get("options", {}).get("incremental", False)

        since_id = self.state_store.get_state(
            __name__, u"{}.since_id".format(
                self._search_id())) if incremental else None

        query, geocode = self._search_parameters()
        self._harvest_tweets(
            self.twarc.search(query, geocode=geocode, since_id=since_id))

    def _search_parameters(self):
        """Return ``(query, geocode)`` for the first seed.

        The seed token is either a dict with optional ``query`` and
        ``geocode`` keys, or the query itself (then geocode is None).
        """
        token = self.message["seeds"][0]["token"]
        # isinstance also accepts dict subclasses (e.g. OrderedDict).
        if isinstance(token, dict):
            query = token.get("query")
            geocode = token.get("geocode")
        else:
            query = token
            geocode = None
        return query, geocode

    def _search_id(self):
        """Return the identifier used in the state-store key of the search.

        This is the query, the geocode, or ``"<query>:<geocode>"`` when
        both are present.
        """
        query, geocode = self._search_parameters()
        if query and not geocode:
            return query
        if geocode and not query:
            return geocode
        return ":".join([query, geocode])

    def filter(self):
        """Harvest the filter stream described by the single seed's token."""
        assert len(self.message.get("seeds", [])) == 1

        token = self.message["seeds"][0]["token"]
        track = token.get("track")
        follow = token.get("follow")
        locations = token.get("locations")
        language = token.get("language")

        self._harvest_tweets(
            self.twarc.filter(track=track,
                              follow=follow,
                              locations=locations,
                              lang=language,
                              event=self.stop_harvest_seeds_event))

    def sample(self):
        """Harvest the sample stream until the stop event is set."""
        self._harvest_tweets(self.twarc.sample(self.stop_harvest_seeds_event))

    def user_timeline(self):
        """Harvest the timeline of every seed in the message.

        Each seed carries a screen name (``token``) and/or a user id
        (``uid``).  The account is looked up first; seeds whose lookup does
        not return "OK" get a warning in ``self.result.warnings`` and,
        unless a user id was already resolved, are skipped.
        """
        incremental = self.message.get("options", {}).get("incremental", False)

        for seed in self.message.get("seeds", []):
            seed_id = seed["id"]
            screen_name = seed.get("token")
            user_id = seed.get("uid")
            log.debug(
                "Processing seed (%s) with screen name %s and user id %s",
                seed_id, screen_name, user_id)
            assert screen_name or user_id

            # If there is not a user_id, look it up.
            if screen_name and not user_id:
                result, user = self._lookup_user(screen_name, "screen_name")
                if result == "OK":
                    user_id = user["id_str"]
                    self.result.uids[seed_id] = user_id
                else:
                    msg = u"User id not found for {} because account is {}".format(
                        screen_name, self._result_to_reason(result))
                    # No exception is active here, so log.exception would
                    # only append a meaningless "NoneType: None" traceback.
                    log.warning(msg)
                    self.result.warnings.append(
                        Msg("token_{}".format(result), msg, seed_id=seed_id))
            # Otherwise, get the current screen_name
            else:
                result, user = self._lookup_user(user_id, "user_id")
                if result == "OK":
                    new_screen_name = user["screen_name"]
                    if new_screen_name and new_screen_name != screen_name:
                        self.result.token_updates[seed_id] = new_screen_name
                else:
                    msg = u"User {} (User ID: {}) not found because account is {}".format(
                        screen_name, user_id, self._result_to_reason(result))
                    log.warning(msg)
                    self.result.warnings.append(
                        Msg("uid_{}".format(result), msg, seed_id=seed_id))
                    # Skip the timeline request for this seed.
                    user_id = None

            if user_id:
                # Get since_id from state_store
                since_id = self.state_store.get_state(
                    __name__, "timeline.{}.since_id".format(
                        user_id)) if incremental else None

                self._harvest_tweets(
                    self.twarc.timeline(user_id=user_id, since_id=since_id))

    def _lookup_user(self, id, id_type):
        """Look up one user via the ``users/show`` endpoint.

        Args:
            id: Screen name or user id to look up.
            id_type: Request parameter name for ``id`` (the callers in this
                class pass ``"screen_name"`` or ``"user_id"``).

        Returns:
            ``(result, user)`` where ``result`` is one of ``"OK"``,
            ``"unauthorized"`` (protected account), ``"not_found"`` or
            ``"suspended"``, and ``user`` is the decoded response body, or
            None when the request raised an HTTP error.

        Raises:
            requests.exceptions.HTTPError: For any HTTP error that is not
                one of the recognised not-found / suspended responses.
        """
        url = "https://api.twitter.com/1.1/users/show.json"
        params = {id_type: id}

        # USER_DELETED: 404 and {"errors": [{"code": 50, "message": "User not found."}]}
        # USER_PROTECTED: 200 and user object with "protected": true
        # USER_SUSPENDED: 403 and {"errors":[{"code":63,"message":"User has been suspended."}]}
        result = "OK"
        user = None
        try:
            resp = self.twarc.get(url, params=params, allow_404=True)
            user = resp.json()
            if user['protected']:
                result = "unauthorized"
        except requests.exceptions.HTTPError as e:
            try:
                resp_json = e.response.json()
            except json.decoder.JSONDecodeError:
                # Body is not JSON, so the error cannot be classified;
                # surface the original HTTP error.
                raise e
            if e.response.status_code == 404 and self._has_error_code(
                    resp_json, 50):
                result = "not_found"
            elif e.response.status_code == 403 and self._has_error_code(
                    resp_json, 63):
                result = "suspended"
            else:
                raise e
        return result, user

    @staticmethod
    def _has_error_code(resp, code):
        """Return True if any entry of ``resp['errors']`` has a given code.

        ``code`` may be a single int or a collection of ints.
        """
        if isinstance(code, int):
            code = (code, )
        for error in resp['errors']:
            if error['code'] in code:
                return True
        return False

    @staticmethod
    def _result_to_reason(result):
        """Translate a ``_lookup_user`` result into a phrase for messages."""
        if result == "unauthorized":
            return "protected"
        elif result == "suspended":
            return "suspended"
        return "not found or deleted"

    def _harvest_tweets(self, tweets):
        """Consume an iterable of tweets, counting them until told to stop.

        The tweets themselves are not inspected here; each one only bumps
        ``self.result.harvest_counter["tweets"]``.
        """
        # max_tweet_id = None
        for count, tweet in enumerate(tweets):
            if not count % 100:
                log.debug("Harvested %s tweets", count)
            self.result.harvest_counter["tweets"] += 1
            if self.stop_harvest_seeds_event.is_set():
                log.debug("Stopping since stop event set.")
                break

    def process_warc(self, warc_filepath):
        """Process a WARC file according to the harvest type of the message.

        Raises:
            KeyError: If the message ``type`` is not a supported harvest type.
        """
        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.process_search_warc(warc_filepath)
        elif harvest_type == "twitter_filter":
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_sample":
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_user_timeline":
            self.process_user_timeline_warc(warc_filepath)
        else:
            raise KeyError(
                "Unsupported harvest type: {}".format(harvest_type))

    def process_search_warc(self, warc_filepath):
        """Process a search WARC and advance the stored ``since_id``.

        The state is only written for incremental harvests and only when a
        tweet id larger than the stored one was seen.
        """
        incremental = self.message.get("options", {}).get("incremental", False)

        since_id = self.state_store.get_state(
            __name__, u"{}.since_id".format(
                self._search_id())) if incremental else None

        max_tweet_id = self._process_tweets(TwitterRestWarcIter(warc_filepath))

        # Update state store
        # `or 0` guards against None on either side (no tweets / no state).
        if incremental and (max_tweet_id or 0) > (since_id or 0):
            self.state_store.set_state(
                __name__, u"{}.since_id".format(self._search_id()),
                max_tweet_id)

    def process_user_timeline_warc(self, warc_filepath):
        """Process a user-timeline WARC, tracking ``since_id`` per user.

        For incremental harvests the stored ``timeline.<user_id>.since_id``
        is raised to the largest tweet id seen for that user.
        """
        incremental = self.message.get("options", {}).get("incremental", False)

        for count, status in enumerate(TwitterRestWarcIter(warc_filepath)):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            # Items count as tweets when they carry "text" or "full_text".
            if "text" in tweet or "full_text" in tweet:
                user_id = tweet["user"]["id_str"]
                if incremental:
                    # Update state
                    key = "timeline.{}.since_id".format(user_id)
                    self.state_store.set_state(
                        __name__, key,
                        max(
                            self.state_store.get_state(__name__, key) or 0,
                            tweet.get("id")))
                self._process_tweet(tweet)

    def _process_tweets(self, warc_iter):
        """Process every tweet yielded by ``warc_iter``.

        Items with neither a ``text`` nor a ``full_text`` key are skipped.

        Returns:
            The largest tweet id processed, or None when there was none.
        """
        max_tweet_id = None

        for count, status in enumerate(warc_iter):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet or "full_text" in tweet:
                max_tweet_id = max(max_tweet_id or 0, tweet.get("id"))
                self._process_tweet(tweet)
        return max_tweet_id

    def _process_tweet(self, _):
        """Count one processed tweet; the tweet content itself is unused."""
        self.result.increment_stats("tweets")
コード例 #12
0
def user_info_crawler(screen_name, user_dir, user_profile_f, user_profileimg_f, user_tweets_f, user_clean_tweets_f):
    """Fetch and cache a user's profile, profile image, tweets and cleaned tweets.

    Every artefact is stored as a file inside ``user_dir`` and is only
    (re)created when that file does not exist yet.

    Args:
        screen_name: Twitter screen name of the user.
        user_dir: Directory that holds the user's files.
        user_profile_f: File name of the profile JSON.
        user_profileimg_f: File name of the converted (RGB) profile image.
        user_tweets_f: File name of the raw timeline (one JSON tweet per line).
        user_clean_tweets_f: File name of the cleaned tweet texts.

    Returns:
        The profile as a dict, or None when any step failed (a message is
        printed in that case).
    """
    profile_path = os.path.join(user_dir, user_profile_f)
    image_path = os.path.join(user_dir, user_profileimg_f)
    tweets_path = os.path.join(user_dir, user_tweets_f)
    clean_tweets_path = os.path.join(user_dir, user_clean_tweets_f)

    # The client is created on first use. The original bound it only while
    # fetching the profile, so a cached profile with a missing timeline
    # file failed with an unbound `t`.
    t = None

    def image_converter(user_profileimg_url):
        # Download to a temporary file, convert to RGB and store the result.
        tmp_file = '../data/user/tmp' + user_profileimg_url[-4:]
        if sys.version_info[0] == 2:
            urllib.urlretrieve(user_profileimg_url, tmp_file)
        elif sys.version_info[0] == 3:
            urlretrieve(user_profileimg_url, tmp_file)
        from PIL import Image
        im = Image.open(tmp_file)
        rgb_im = im.convert('RGB')
        rgb_im.save(image_path)
        os.remove(tmp_file)

    try:
        # crawl user profile
        if not os.path.exists(profile_path):
            t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret)

            user_profile_data = t.user_lookup(ids=[screen_name], id_type="screen_name")

            for user_profile in user_profile_data:
                with open(profile_path, 'w') as outfile:
                    json.dump(user_profile, outfile)

        with open(profile_path, 'r') as rf:
            user_profile_json = json.load(rf)

        # crawl user profile image
        if not os.path.exists(image_path):
            # extract user profile image url
            user_profileimg_url = user_profile_json['profile_image_url']

            if user_profileimg_url:
                user_profileimg_url = user_profileimg_url.replace('_normal', '_bigger')

            image_converter(user_profileimg_url)

        # crawl user tweets
        if not os.path.exists(tweets_path):
            if t is None:
                t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret)
            user_timeline_data = t.timeline(screen_name=screen_name)
            with open(tweets_path, 'a') as outfile:
                for user_timeline in user_timeline_data:
                    json.dump(user_timeline, outfile)
                    outfile.write('\n')

        # clean user tweets
        if not os.path.exists(clean_tweets_path):
            with open(tweets_path, 'r') as rf:
                tweet_raw_lines = [json.loads(line)['full_text'] for line in rf]

            clean_tweets = process_raw_tweets(tweet_raw_lines)

            with open(clean_tweets_path, 'w') as wf:
                for tweet in clean_tweets:
                    if len(tweet) > 0:
                        wf.write(tweet + '\n')

        return user_profile_json

    except Exception:
        # Deliberate best effort: any failure is reported to the user and
        # signalled to the caller through the None return value.
        print("Could not predict user's role. Check account info, few tweets, incorrect image format...")
        return None
コード例 #13
0
def crawl_feed(feed_dict, credentials):
    """Crawl one feed with twarc and archive its tweets as line-delimited JSON.

    Args:
        feed_dict: Feed description with the keys ``crawl_name``,
            ``crawl_type``, ``short_name``, ``search_string`` and
            ``feed_dir``.  A ``crawl_type`` of ``"timeline"`` crawls the
            timeline of the screen name in ``search_string``; any other
            value runs a search for ``search_string``.
        credentials: Dict with ``consumer_key``, ``consumer_secret``,
            ``access_token`` and ``access_token_secret``.

    Returns:
        Tuple ``(base_filename, tweet_count, crawl_time_html)``: the file
        name stem of the crawl, the number of items written and the crawl
        date formatted for display.
    """
    twarc = Twarc(credentials['consumer_key'], credentials['consumer_secret'],
                  credentials['access_token'],
                  credentials['access_token_secret'])
    crawl_time = datetime.datetime.now()
    # NOTE(review): %I is the 12-hour clock, so an AM and a PM crawl of the
    # same feed can produce the same file name — kept for compatibility
    # with existing file names; confirm whether %H was intended.
    crawl_time_filename = crawl_time.strftime('%Y%m%d%I%M%S')
    crawl_time_html = crawl_time.strftime('%B %d, %Y')
    crawl_name = feed_dict['crawl_name']
    crawl_type = feed_dict['crawl_type']
    short_name = feed_dict['short_name']
    search_string = feed_dict['search_string']

    feed_dir = feed_dict['feed_dir']
    json_dir = join(feed_dir, 'json')
    html_dir = join(feed_dir, 'html')
    media_dir = join(feed_dir, 'media')
    logs_dir = join(feed_dir, 'logs')

    for directory in [feed_dir, json_dir, html_dir, media_dir, logs_dir]:
        if not os.path.exists(directory):
            os.makedirs(directory)

    log_file = join(logs_dir, 'twarc.log')

    logger = logging.getLogger(crawl_name)
    # getLogger returns the same logger for the same crawl name, so only
    # attach the file handler once; otherwise every repeated crawl would
    # add another handler and duplicate each log line.
    log_path = os.path.abspath(log_file)
    already_attached = any(
        isinstance(existing, logging.FileHandler)
        and existing.baseFilename == log_path
        for existing in logger.handlers)
    if not already_attached:
        handler = logging.FileHandler(log_file)
        handler.setFormatter(
            logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
        logger.addHandler(handler)
    logger.setLevel(logging.INFO)

    base_filename = short_name + '-' + crawl_time_filename
    json_file = join(json_dir, base_filename + '.json')

    print("Searching Twitter API for {0}".format(search_string))
    print("Writing JSON and HTML files...")

    logger.info("starting search for %s", search_string)
    tweet_count = 0

    # Both crawl types are archived identically; only the source differs.
    if crawl_type == "timeline":
        tweets = twarc.timeline(screen_name=search_string)
    else:
        tweets = twarc.search(search_string)

    # Open the output once instead of reopening it for every tweet.
    with open(json_file, 'a') as json_out:
        for tweet in tweets:
            json_out.write("{}\n".format(json.dumps(tweet)))

            if "id_str" in tweet:
                logger.info("archived https://twitter.com/%s/status/%s",
                            tweet['user']['screen_name'], tweet["id_str"])
            elif 'limit' in tweet:
                logger.warning("%s tweets undelivered",
                               tweet["limit"]["track"])
            elif 'warning' in tweet:
                logger.warning(tweet['warning']['message'])
            else:
                logger.warning(json.dumps(tweet))

            tweet_count += 1

    if tweet_count == 0:
        logger.info("no new tweets matching %s", search_string)

        # Write an empty json file. Maybe don't do this?
        # Mode 'w' also truncates a pre-existing file of the same name,
        # exactly as before.
        with open(json_file, 'w'):
            pass

    return base_filename, tweet_count, crawl_time_html
コード例 #14
0
class TwitterHarvester(BaseHarvester):
    def __init__(self, working_path, stream_restart_interval_secs=30 * 60, mq_config=None, debug=False,
                 connection_errors=5, http_errors=5, debug_warcprox=False, tries=3):
        """
        Initialise the base harvester and this class's own defaults.

        All arguments except connection_errors and http_errors are forwarded
        to BaseHarvester.__init__. Those two are stored on the instance and
        handed to Twarc when the client is built in _create_twarc.
        """
        base_options = dict(
            mq_config=mq_config,
            stream_restart_interval_secs=stream_restart_interval_secs,
            debug=debug,
            debug_warcprox=debug_warcprox,
            tries=tries,
        )
        BaseHarvester.__init__(self, working_path, **base_options)

        # The Twarc client is only created once a harvest starts.
        self.twarc = None

        # Error tolerances passed through to Twarc.
        self.http_errors = http_errors
        self.connection_errors = connection_errors

        # Extraction flags; harvest_seeds overwrites them from the message options.
        self.extract_media = False
        self.extract_web_resources = False
        self.extract_user_profile_images = False

    def harvest_seeds(self):
        # Create a twarc
        self._create_twarc()

        # Get harvest extract options.
        self.extract_media = self.message.get("options", {}).get("media", False)
        self.extract_web_resources = self.message.get("options", {}).get("web_resources", False)
        self.extract_user_profile_images = self.message.get("options", {}).get("user_images", False)

        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.search()
        elif harvest_type == "twitter_filter":
            self.filter()
        elif harvest_type == "twitter_sample":
            self.sample()
        elif harvest_type == "twitter_user_timeline":
            self.user_timeline()
        else:
            raise KeyError

    def _create_twarc(self):
        self.twarc = Twarc(self.message["credentials"]["consumer_key"],
                           self.message["credentials"]["consumer_secret"],
                           self.message["credentials"]["access_token"],
                           self.message["credentials"]["access_token_secret"],
                           http_errors=self.http_errors,
                           connection_errors=self.connection_errors)

    def search(self):
        assert len(self.message.get("seeds", [])) == 1

        incremental = self.message.get("options", {}).get("incremental", False)
        query = self.message["seeds"][0]["token"]

        since_id = self.state_store.get_state(__name__, u"{}.since_id".format(query)) if incremental else None

        self._harvest_tweets(self.twarc.search(query, since_id=since_id))

    def filter(self):
        assert len(self.message.get("seeds", [])) == 1

        track = self.message["seeds"][0]["token"].get("track")
        follow = self.message["seeds"][0]["token"].get("follow")
        locations = self.message["seeds"][0]["token"].get("locations")

        self._harvest_tweets(self.twarc.filter(track=track, follow=follow, locations=locations))

    def sample(self):
        self._harvest_tweets(self.twarc.sample())

    def user_timeline(self):
        incremental = self.message.get("options", {}).get("incremental", False)

        for seed in self.message.get("seeds", []):
            seed_id = seed["id"]
            screen_name = seed.get("token")
            user_id = seed.get("uid")
            log.debug("Processing seed (%s) with screen name %s and user id %s", seed_id, screen_name, user_id)
            assert screen_name or user_id

            # If there is not a user_id, look it up.
            if screen_name and not user_id:
                user_id = self._lookup_user_id(screen_name)
                if user_id:
                    # Report back if nsid found
                    self.result.uids[seed_id] = user_id
                else:
                    msg = "User id not found for user {}".format(screen_name)
                    log.exception(msg)
                    self.result.warnings.append(Msg(CODE_TOKEN_NOT_FOUND, msg))
            # Otherwise, get the current screen_name
            else:
                new_screen_name = self._lookup_screen_name(user_id)
                if new_screen_name != screen_name:
                    self.result.token_updates[seed_id] = new_screen_name
                    screen_name = new_screen_name

            if user_id:
                try:
                    # Get since_id from state_store
                    since_id = self.state_store.get_state(__name__,
                                                          "timeline.{}.since_id".format(
                                                              user_id)) if incremental else None

                    self._harvest_tweets(self.twarc.timeline(user_id=user_id, since_id=since_id))

                except HTTPError as e:
                    if e.response.status_code == 401:
                        msg = "Unauthorized for user {} (User ID: {}) because account is suspended or private".format(screen_name, user_id)
                        log.exception(msg)
                        self.result.warnings.append(Msg(CODE_TOKEN_UNAUTHORIZED, msg))
                    else:
                        raise e

    def _lookup_screen_name(self, user_id):
        """
        Lookup a screen name given a user id.
        """
        users = list(self.twarc.user_lookup(user_ids=(user_id,)))
        assert len(users) in (0, 1)
        if users:
            return users[0]["screen_name"]
        return None

    def _lookup_user_id(self, screen_name):
        """
        Lookup a user id given a screen name.
        """
        users = list(self.twarc.user_lookup(screen_names=(screen_name,)))
        assert len(users) in (0, 1)
        if users:
            return users[0]["id_str"]
        return None

    def _harvest_tweets(self, tweets):
        # max_tweet_id = None
        for count, tweet in enumerate(tweets):
            if not count % 100:
                log.debug("Harvested %s tweets", count)
            self.result.harvest_counter["tweets"] += 1
            if self.stop_harvest_seeds_event.is_set():
                log.debug("Stopping since stop event set.")
                break

    def _process_entities(self, entities):
        if self.extract_web_resources:
            for url in entities.get("urls", []):
                # Exclude links for tweets
                if url["expanded_url"] and not status_re.match(url["expanded_url"]):
                    self.result.urls.append(url["expanded_url"])
        if self.extract_media:
            for media in entities.get("media", []):
                if media["media_url"]:
                    self.result.urls.append(media["media_url"])

    def process_warc(self, warc_filepath):
        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.process_search_warc(warc_filepath)
        elif harvest_type == "twitter_filter":
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_sample":
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_user_timeline":
            self.process_user_timeline_warc(warc_filepath)
        else:
            raise KeyError

    def process_search_warc(self, warc_filepath):
        incremental = self.message.get("options", {}).get("incremental", False)
        query = self.message["seeds"][0]["token"]

        since_id = self.state_store.get_state(__name__, u"{}.since_id".format(query)) if incremental else None

        max_tweet_id = self._process_tweets(TwitterRestWarcIter(warc_filepath))

        # Update state store
        if incremental and max_tweet_id > since_id:
            self.state_store.set_state(__name__, u"{}.since_id".format(query), max_tweet_id)

    def process_user_timeline_warc(self, warc_filepath):
        incremental = self.message.get("options", {}).get("incremental", False)

        for count, status in enumerate(TwitterRestWarcIter(warc_filepath)):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet:
                user_id = tweet["user"]["id_str"]
                if incremental:
                    # Update state
                    key = "timeline.{}.since_id".format(user_id)
                    self.state_store.set_state(__name__, key,
                                               max(self.state_store.get_state(__name__, key), tweet.get("id")))
                self._process_tweet(tweet)

    def _process_tweets(self, warc_iter):
        max_tweet_id = None
        for count, status in enumerate(warc_iter):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet:
                max_tweet_id = max(max_tweet_id, tweet.get("id"))
                self._process_tweet(tweet)
        return max_tweet_id

    def _process_tweet(self, tweet):
        self.result.increment_stats("tweets")
        # For more info, see https://dev.twitter.com/overview/api/entities-in-twitter-objects
        statuses = [tweet]
        if "retweeted_status" in tweet:
            statuses.append(tweet["retweeted_status"])
        elif "quoted_status" in tweet:
            statuses.append(tweet["quoted_status"])
        for status in statuses:
            self._process_entities(status.get("entities", {}))
            self._process_entities(status.get("extended_entities", {}))
        if self.extract_user_profile_images:
            self.result.urls.append(tweet["user"]["profile_image_url"])
            self.result.urls.append(tweet["user"]["profile_background_image_url"])
            if "profile_banner_url" in tweet["user"]:
                self.result.urls.append(tweet["user"]["profile_banner_url"])