コード例 #1
0
def stream_city(cf, city, keywords=None):
    """Stream tweets located inside a city's bounding box and store each one.

    Args:
        cf: Configuration mapping. ``cf['account']`` is unpacked as keyword
            arguments to ``Twarc``; ``cf["search_words"]`` is the fallback
            for ``keywords``.
        city: Key into the bounding-box table below (``"great_syd"``,
            ``"great_mel"``, ``"great_brisbane"`` or ``"great_ald"``). A
            directory with this name is created if it does not exist.
        keywords: Optional keywords; defaults to ``cf["search_words"]``.
            They are resolved but not passed to the stream, which filters
            by location only.

    Raises:
        KeyError: If ``city`` has no bounding box or a required key is
            missing from ``cf``.
    """
    # Four numbers per city, joined with commas for the ``locations`` filter.
    # NOTE(review): presumably [west, south, east, north] -- confirm.
    bbox = {
        "great_syd": [149.971885992, -34.33117400499998, 151.63054702400007, -32.99606922499993],
        "great_mel": [144.33363404800002, -38.50298801599996, 145.8784120140001, -37.17509899299995],
        "great_brisbane": [152.07339276400012, -28.363962911999977, 153.54670756200005, -26.452339004999942],
        "great_ald": [138.435645001, -35.350296029999974, 139.04403010400003, -34.50022530299998]
    }

    if keywords is None:
        keywords = cf["search_words"]

    t = Twarc(**cf['account'])

    # no keyword restriction but from a specific city
    # reason see this https://stackoverflow.com/questions/22889122/how-to-add-a-location-filter-to-tweepy-module
    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(city, exist_ok=True)

    locations = ",".join(str(coordinate) for coordinate in bbox[city])

    for tweet in t.filter(locations=locations):
        print("get one tweet") #TODO
        send_to_db(tweet)
コード例 #2
0
def main(get_method=None, input_hashtags=None, storage_location=None):
    """Collect tweets for a set of hashtags and write each one to a JSON file.

    Every tweet is dumped to ``<storage_location>/tweet<id>.json``. The
    process always terminates through ``sys.exit(0)`` when collection ends,
    is interrupted or fails.

    Args:
        get_method: ``"populate"`` to collect through ``Twarc.search`` or
            ``"track"`` to collect through ``Twarc.filter``. Any other value
            prints a message and collects nothing.
        input_hashtags: Comma-separated hashtags; commas are replaced by
            ``"+OR+"`` to build the query.
        storage_location: Directory receiving the tweet files; created if
            it does not exist.

    The Twitter credentials and ``language`` are read from module-level
    names that are defined outside this function.
    """
    if not os.path.exists(storage_location):
        os.makedirs(storage_location, exist_ok=True)

    hashtag_query = input_hashtags.strip().replace(",", "+OR+")

    tweets = 0
    try:
        t = Twarc(
            consumer_key,
            consumer_secret,
            access_token,
            access_token_secret,
            tweet_mode="extended",
        )

        print(
            "Started storing tweets related to "
            + input_hashtags
            + " at "
            + storage_location
            + " since "
            + str(datetime.datetime.now())
        )

        if get_method == "populate":
            tweet_source = t.search(hashtag_query, lang=language)
        elif get_method == "track":
            # NOTE(review): the streaming ``track`` parameter treats commas
            # as OR; confirm that the "+OR+" form is intended here.
            tweet_source = t.filter(hashtag_query)
        else:
            print("No method defined, exiting...")
            tweet_source = ()

        for tweet in tweet_source:
            # Join directory and file name as separate components. The
            # populate branch used to concatenate them without a separator,
            # which wrote the files next to the directory instead of in it.
            tweet_path = os.path.join(
                storage_location, "tweet" + str(tweet["id"]) + ".json"
            )
            with open(tweet_path, "w", encoding="utf8") as file:
                json.dump(tweet, file)
            tweets += 1

    except KeyboardInterrupt:
        print("Shutdown requested...successfully stored " + str(tweets) + " tweets")
    except Exception:
        # Exception rather than BaseException, so SystemExit and
        # GeneratorExit are not swallowed.
        traceback.print_exc(file=sys.stdout)

    sys.exit(0)
コード例 #3
0
class TwitterHarvester(BaseHarvester):
    """Harvester that collects tweets through a ``Twarc`` client.

    ``harvest_seeds`` dispatches on the harvest message's ``"type"`` to the
    search, filter, sample or user-timeline collector. ``process_warc``
    dispatches the same way to the routines that read tweets back from a
    WARC file, count them and collect the URLs selected by the extract
    options.
    """

    def __init__(self,
                 working_path,
                 stream_restart_interval_secs=30 * 60,
                 mq_config=None,
                 debug=False,
                 connection_errors=5,
                 http_errors=5,
                 debug_warcprox=False,
                 tries=3):
        """Initialise the harvester.

        ``connection_errors`` and ``http_errors`` are kept and later handed
        to ``Twarc``; every other argument is forwarded to ``BaseHarvester``.
        """
        BaseHarvester.__init__(
            self,
            working_path,
            mq_config=mq_config,
            stream_restart_interval_secs=stream_restart_interval_secs,
            debug=debug,
            debug_warcprox=debug_warcprox,
            tries=tries)
        self.twarc = None
        self.connection_errors = connection_errors
        self.http_errors = http_errors
        # Extract options; refreshed from the message in harvest_seeds().
        self.extract_media = False
        self.extract_web_resources = False
        self.extract_user_profile_images = False

    def harvest_seeds(self):
        """Create the client, read the extract options and run the harvest.

        Raises:
            KeyError: If the message type is not a supported harvest type.
        """
        # Create a twarc
        self._create_twarc()

        # Get harvest extract options.
        self.extract_media = self.message.get("options",
                                              {}).get("media", False)
        self.extract_web_resources = self.message.get("options", {}).get(
            "web_resources", False)
        self.extract_user_profile_images = self.message.get("options", {}).get(
            "user_images", False)

        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.search()
        elif harvest_type == "twitter_filter":
            self.filter()
        elif harvest_type == "twitter_sample":
            self.sample()
        elif harvest_type == "twitter_user_timeline":
            self.user_timeline()
        else:
            raise KeyError(
                "Unsupported harvest type: {}".format(harvest_type))

    def _create_twarc(self):
        """Build ``self.twarc`` from the credentials in the message."""
        self.twarc = Twarc(self.message["credentials"]["consumer_key"],
                           self.message["credentials"]["consumer_secret"],
                           self.message["credentials"]["access_token"],
                           self.message["credentials"]["access_token_secret"],
                           http_errors=self.http_errors,
                           connection_errors=self.connection_errors)

    def search(self):
        """Harvest the search results for the single seed's query.

        When the ``incremental`` option is set, the stored ``since_id`` for
        the query is passed to the search.
        """
        assert len(self.message.get("seeds", [])) == 1

        incremental = self.message.get("options", {}).get("incremental", False)
        query = self.message["seeds"][0]["token"]

        since_id = self.state_store.get_state(
            __name__, u"{}.since_id".format(query)) if incremental else None

        self._harvest_tweets(self.twarc.search(query, since_id=since_id))

    def filter(self):
        """Harvest the filter stream described by the single seed's token.

        The token is read with ``.get`` for the optional ``track``,
        ``follow`` and ``locations`` entries.
        """
        assert len(self.message.get("seeds", [])) == 1

        track = self.message["seeds"][0]["token"].get("track")
        follow = self.message["seeds"][0]["token"].get("follow")
        locations = self.message["seeds"][0]["token"].get("locations")

        self._harvest_tweets(
            self.twarc.filter(track=track,
                              follow=follow,
                              locations=locations,
                              event=self.stop_harvest_seeds_event))

    def sample(self):
        """Harvest the sample stream, passing the stop event to twarc."""
        self._harvest_tweets(self.twarc.sample(self.stop_harvest_seeds_event))

    def user_timeline(self):
        """Harvest the timeline of every seed's user.

        Each seed supplies a screen name (``token``) and/or a user id
        (``uid``). A missing user id is looked up and reported in
        ``result.uids``; for a known user id the current screen name is
        looked up and a change is reported in ``result.token_updates``. A
        failed lookup, or a 401 while fetching the timeline, appends a
        warning to ``result.warnings`` and moves on to the next seed.
        """
        incremental = self.message.get("options", {}).get("incremental", False)

        for seed in self.message.get("seeds", []):
            seed_id = seed["id"]
            screen_name = seed.get("token")
            user_id = seed.get("uid")
            log.debug(
                "Processing seed (%s) with screen name %s and user id %s",
                seed_id, screen_name, user_id)
            assert screen_name or user_id

            # If there is not a user_id, look it up.
            if screen_name and not user_id:
                user_id = self._lookup_user_id(screen_name)
                if user_id:
                    # Report back the user id that was found.
                    self.result.uids[seed_id] = user_id
                else:
                    msg = "User id not found for user {} because account is not found or suspended".format(
                        screen_name)
                    # No exception is being handled here, so log.exception
                    # would attach an empty traceback; log.error keeps the
                    # same level without it.
                    log.error(msg)
                    self.result.warnings.append(
                        Msg(CODE_TOKEN_NOT_FOUND, msg, seed_id=seed_id))
            # Otherwise, get the current screen_name
            else:
                new_screen_name = self._lookup_screen_name(user_id)
                # Without a screen name the timeline is not fetched.
                if not new_screen_name:
                    msg = "Screen name not found for user id {} because account is not found or suspended".format(
                        user_id)
                    log.error(msg)
                    self.result.warnings.append(
                        Msg(CODE_TOKEN_NOT_FOUND, msg, seed_id=seed_id))
                    # Reset the user_id so the timeline fetch is skipped.
                    user_id = None
                if new_screen_name and new_screen_name != screen_name:
                    self.result.token_updates[seed_id] = new_screen_name
                    screen_name = new_screen_name

            if user_id:
                try:
                    # Get since_id from state_store
                    since_id = self.state_store.get_state(
                        __name__, "timeline.{}.since_id".format(
                            user_id)) if incremental else None

                    self._harvest_tweets(
                        self.twarc.timeline(user_id=user_id,
                                            since_id=since_id))

                except HTTPError as e:
                    if e.response.status_code == 401:
                        account = "user {} (User ID: {})".format(
                            screen_name, user_id
                        ) if screen_name else "user ID: {}".format(user_id)
                        msg = "Unauthorized for {} because account is suspended or protected".format(
                            account)
                        log.exception(msg)
                        self.result.warnings.append(
                            Msg(CODE_TOKEN_UNAUTHORIZED, msg, seed_id=seed_id))
                    else:
                        raise

    def _lookup_screen_name(self, user_id):
        """
        Lookup a screen name given a user id.

        Returns None when no user is returned or the lookup answers 404;
        any other HTTP error is re-raised.
        """
        try:
            users = list(self.twarc.user_lookup(user_ids=(user_id, )))
            assert len(users) in (0, 1)
            if users:
                return users[0]["screen_name"]
        except HTTPError as e:
            if e.response.status_code != 404:
                raise
        return None

    def _lookup_user_id(self, screen_name):
        """
        Lookup a user id given a screen name.

        Returns None when no user is returned or the lookup answers 404;
        any other HTTP error is re-raised.
        """
        try:
            users = list(self.twarc.user_lookup(screen_names=(screen_name, )))
            assert len(users) in (0, 1)
            if users:
                return users[0]["id_str"]
        except HTTPError as e:
            if e.response.status_code != 404:
                raise
        return None

    def _harvest_tweets(self, tweets):
        """Consume ``tweets``, counting each one, until the stop event is set."""
        for count, tweet in enumerate(tweets):
            if not count % 100:
                log.debug("Harvested %s tweets", count)
            self.result.harvest_counter["tweets"] += 1
            if self.stop_harvest_seeds_event.is_set():
                log.debug("Stopping since stop event set.")
                break

    def _process_entities(self, entities):
        """Append the URLs selected by the extract options to ``result.urls``."""
        if self.extract_web_resources:
            for url in entities.get("urls", []):
                # Exclude links for tweets
                if url["expanded_url"] and not status_re.match(
                        url["expanded_url"]):
                    self.result.urls.append(url["expanded_url"])
        if self.extract_media:
            for media in entities.get("media", []):
                if media["media_url"]:
                    self.result.urls.append(media["media_url"])

    def process_warc(self, warc_filepath):
        """Process a WARC file according to the message's harvest type.

        Raises:
            KeyError: If the message type is not a supported harvest type.
        """
        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.process_search_warc(warc_filepath)
        elif harvest_type == "twitter_filter":
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_sample":
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_user_timeline":
            self.process_user_timeline_warc(warc_filepath)
        else:
            raise KeyError(
                "Unsupported harvest type: {}".format(harvest_type))

    def process_search_warc(self, warc_filepath):
        """Process search results and, if incremental, advance ``since_id``."""
        incremental = self.message.get("options", {}).get("incremental", False)
        query = self.message["seeds"][0]["token"]

        since_id = self.state_store.get_state(
            __name__, u"{}.since_id".format(query)) if incremental else None

        max_tweet_id = self._process_tweets(TwitterRestWarcIter(warc_filepath))

        # Update state store. Either side may be None (no tweets in the
        # WARC, or no state stored yet); ``or 0`` keeps the comparison
        # valid on Python 3.
        if incremental and (max_tweet_id or 0) > (since_id or 0):
            self.state_store.set_state(__name__, u"{}.since_id".format(query),
                                       max_tweet_id)

    def process_user_timeline_warc(self, warc_filepath):
        """Process timeline tweets and, if incremental, track each user's max id."""
        incremental = self.message.get("options", {}).get("incremental", False)

        for count, status in enumerate(TwitterRestWarcIter(warc_filepath)):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet:
                user_id = tweet["user"]["id_str"]
                if incremental:
                    # Update state; a missing stored value counts as 0 so
                    # max() never compares None with an int.
                    key = "timeline.{}.since_id".format(user_id)
                    self.state_store.set_state(
                        __name__, key,
                        max(self.state_store.get_state(__name__, key) or 0,
                            tweet.get("id")))
                self._process_tweet(tweet)

    def _process_tweets(self, warc_iter):
        """Process every item that has a ``"text"`` key.

        Returns:
            The largest tweet id seen, or None if no tweet was processed.
        """
        max_tweet_id = None
        for count, status in enumerate(warc_iter):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet:
                # ``or 0`` covers the initial None, which cannot be
                # compared with an int on Python 3.
                max_tweet_id = max(max_tweet_id or 0, tweet.get("id"))
                self._process_tweet(tweet)
        return max_tweet_id

    def _process_tweet(self, tweet):
        """Count the tweet and collect URLs from it and its embedded status."""
        self.result.increment_stats("tweets")
        # For more info, see https://dev.twitter.com/overview/api/entities-in-twitter-objects
        statuses = [tweet]
        if "retweeted_status" in tweet:
            statuses.append(tweet["retweeted_status"])
        elif "quoted_status" in tweet:
            statuses.append(tweet["quoted_status"])
        for status in statuses:
            self._process_entities(status.get("entities", {}))
            self._process_entities(status.get("extended_entities", {}))
        if self.extract_user_profile_images:
            self.result.urls.append(tweet["user"]["profile_image_url"])
            self.result.urls.append(
                tweet["user"]["profile_background_image_url"])
            if "profile_banner_url" in tweet["user"]:
                self.result.urls.append(tweet["user"]["profile_banner_url"])
コード例 #4
0
class TwitterHarvester(BaseHarvester):
    """Harvester that collects extended-mode tweets through ``Twarc``.

    ``harvest_seeds`` dispatches on the harvest message's ``"type"`` to the
    search, filter, sample or user-timeline collector. ``process_warc``
    dispatches the same way to the routines that read tweets back from a
    WARC file and count them.
    """

    def __init__(self,
                 working_path,
                 stream_restart_interval_secs=30 * 60,
                 mq_config=None,
                 debug=False,
                 connection_errors=5,
                 http_errors=5,
                 debug_warcprox=False,
                 tries=3):
        """Initialise the harvester.

        ``connection_errors`` and ``http_errors`` are kept and later handed
        to ``Twarc``; every other argument is forwarded to ``BaseHarvester``.
        """
        BaseHarvester.__init__(
            self,
            working_path,
            mq_config=mq_config,
            stream_restart_interval_secs=stream_restart_interval_secs,
            debug=debug,
            debug_warcprox=debug_warcprox,
            tries=tries)
        self.twarc = None
        self.connection_errors = connection_errors
        self.http_errors = http_errors

    def harvest_seeds(self):
        """Create the client and run the harvest named by the message type.

        Raises:
            KeyError: If the message type is not a supported harvest type.
        """
        # Create a twarc
        self._create_twarc()

        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.search()
        elif harvest_type == "twitter_filter":
            self.filter()
        elif harvest_type == "twitter_sample":
            self.sample()
        elif harvest_type == "twitter_user_timeline":
            self.user_timeline()
        else:
            raise KeyError(
                "Unsupported harvest type: {}".format(harvest_type))

    def _create_twarc(self):
        """Build ``self.twarc`` from the message credentials in extended mode."""
        self.twarc = Twarc(self.message["credentials"]["consumer_key"],
                           self.message["credentials"]["consumer_secret"],
                           self.message["credentials"]["access_token"],
                           self.message["credentials"]["access_token_secret"],
                           http_errors=self.http_errors,
                           connection_errors=self.connection_errors,
                           tweet_mode="extended")

    def search(self):
        """Harvest the search results for the single seed.

        When the ``incremental`` option is set, the ``since_id`` stored
        under the search id is passed to the search.
        """
        assert len(self.message.get("seeds", [])) == 1

        incremental = self.message.get("options", {}).get("incremental", False)

        since_id = self.state_store.get_state(
            __name__, u"{}.since_id".format(
                self._search_id())) if incremental else None

        query, geocode = self._search_parameters()
        self._harvest_tweets(
            self.twarc.search(query, geocode=geocode, since_id=since_id))

    def _search_parameters(self):
        """Return ``(query, geocode)`` from the first seed's token.

        A dict token supplies both through its ``"query"`` and ``"geocode"``
        entries; any other token is used as the query with no geocode.
        """
        token = self.message["seeds"][0]["token"]
        if isinstance(token, dict):
            query = token.get("query")
            geocode = token.get("geocode")
        else:
            query = token
            geocode = None
        return query, geocode

    def _search_id(self):
        """Return the state-store key fragment identifying this search.

        This is the query or the geocode when only one of them is set,
        otherwise the two joined with ``":"``.
        """
        query, geocode = self._search_parameters()
        if query and not geocode:
            return query
        if geocode and not query:
            return geocode
        return ":".join([query, geocode])

    def filter(self):
        """Harvest the filter stream described by the single seed's token.

        The token is read with ``.get`` for the optional ``track``,
        ``follow``, ``locations`` and ``language`` entries.
        """
        assert len(self.message.get("seeds", [])) == 1

        track = self.message["seeds"][0]["token"].get("track")
        follow = self.message["seeds"][0]["token"].get("follow")
        locations = self.message["seeds"][0]["token"].get("locations")
        language = self.message["seeds"][0]["token"].get("language")

        self._harvest_tweets(
            self.twarc.filter(track=track,
                              follow=follow,
                              locations=locations,
                              lang=language,
                              event=self.stop_harvest_seeds_event))

    def sample(self):
        """Harvest the sample stream, passing the stop event to twarc."""
        self._harvest_tweets(self.twarc.sample(self.stop_harvest_seeds_event))

    def user_timeline(self):
        """Harvest the timeline of every seed's user.

        Each seed supplies a screen name (``token``) and/or a user id
        (``uid``). The user is looked up first: a missing user id is filled
        in and reported in ``result.uids``, and a changed screen name is
        reported in ``result.token_updates``. When the lookup result is not
        ``"OK"`` a warning is appended to ``result.warnings`` and the
        timeline of that seed is not fetched.
        """
        incremental = self.message.get("options", {}).get("incremental", False)

        for seed in self.message.get("seeds", []):
            seed_id = seed["id"]
            screen_name = seed.get("token")
            user_id = seed.get("uid")
            log.debug(
                "Processing seed (%s) with screen name %s and user id %s",
                seed_id, screen_name, user_id)
            assert screen_name or user_id

            # If there is not a user_id, look it up.
            if screen_name and not user_id:
                result, user = self._lookup_user(screen_name, "screen_name")
                if result == "OK":
                    user_id = user["id_str"]
                    self.result.uids[seed_id] = user_id
                else:
                    msg = u"User id not found for {} because account is {}".format(
                        screen_name, self._result_to_reason(result))
                    # No exception is being handled here, so log.exception
                    # would attach an empty traceback; log.error keeps the
                    # same level without it.
                    log.error(msg)
                    self.result.warnings.append(
                        Msg("token_{}".format(result), msg, seed_id=seed_id))
            # Otherwise, get the current screen_name
            else:
                result, user = self._lookup_user(user_id, "user_id")
                if result == "OK":
                    new_screen_name = user["screen_name"]
                    if new_screen_name and new_screen_name != screen_name:
                        self.result.token_updates[seed_id] = new_screen_name
                else:
                    msg = u"User {} (User ID: {}) not found because account is {}".format(
                        screen_name, user_id, self._result_to_reason(result))
                    log.error(msg)
                    self.result.warnings.append(
                        Msg("uid_{}".format(result), msg, seed_id=seed_id))
                    # Reset the user_id so the timeline fetch is skipped.
                    user_id = None

            if user_id:
                # Get since_id from state_store
                since_id = self.state_store.get_state(
                    __name__, "timeline.{}.since_id".format(
                        user_id)) if incremental else None

                self._harvest_tweets(
                    self.twarc.timeline(user_id=user_id, since_id=since_id))

    def _lookup_user(self, id, id_type):
        """Look a user up through ``users/show`` and classify the outcome.

        Args:
            id: Screen name or user id, sent as the ``id_type`` parameter.
            id_type: Request parameter name (``"screen_name"`` or
                ``"user_id"`` at the call sites in this class).

        Returns:
            ``(result, user)`` where ``result`` is ``"OK"``,
            ``"unauthorized"`` (protected account), ``"not_found"`` or
            ``"suspended"``, and ``user`` is the decoded response or None
            when the request raised.

        Raises:
            requests.exceptions.HTTPError: For any HTTP error other than
                the 404/code 50 and 403/code 63 cases listed below.
        """
        url = "https://api.twitter.com/1.1/users/show.json"
        params = {id_type: id}

        # USER_DELETED: 404 and {"errors": [{"code": 50, "message": "User not found."}]}
        # USER_PROTECTED: 200 and user object with "protected": true
        # USER_SUSPENDED: 403 and {"errors":[{"code":63,"message":"User has been suspended."}]}
        result = "OK"
        user = None
        try:
            resp = self.twarc.get(url, params=params, allow_404=True)
            user = resp.json()
            if user['protected']:
                result = "unauthorized"
        except requests.exceptions.HTTPError as e:
            try:
                resp_json = e.response.json()
            except json.decoder.JSONDecodeError:
                # Explicit ``raise e``: a bare ``raise`` here would re-raise
                # the decode error instead of the original HTTP error.
                raise e
            if e.response.status_code == 404 and self._has_error_code(
                    resp_json, 50):
                result = "not_found"
            elif e.response.status_code == 403 and self._has_error_code(
                    resp_json, 63):
                result = "suspended"
            else:
                raise
        return result, user

    @staticmethod
    def _has_error_code(resp, code):
        """Return True if any error in ``resp`` carries one of the codes.

        ``code`` may be a single int or a collection of ints. A payload
        without an ``"errors"`` list yields False instead of raising, so
        the caller can re-raise the original HTTP error.
        """
        if isinstance(code, int):
            code = (code, )
        errors = resp.get('errors', []) if isinstance(resp, dict) else []
        return any(error.get('code') in code for error in errors)

    @staticmethod
    def _result_to_reason(result):
        """Map a lookup result to the wording used in warning messages."""
        if result == "unauthorized":
            return "protected"
        elif result == "suspended":
            return "suspended"
        return "not found or deleted"

    def _harvest_tweets(self, tweets):
        """Consume ``tweets``, counting each one, until the stop event is set."""
        for count, tweet in enumerate(tweets):
            if not count % 100:
                log.debug("Harvested %s tweets", count)
            self.result.harvest_counter["tweets"] += 1
            if self.stop_harvest_seeds_event.is_set():
                log.debug("Stopping since stop event set.")
                break

    def process_warc(self, warc_filepath):
        """Process a WARC file according to the message's harvest type.

        Raises:
            KeyError: If the message type is not a supported harvest type.
        """
        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.process_search_warc(warc_filepath)
        elif harvest_type == "twitter_filter":
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_sample":
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_user_timeline":
            self.process_user_timeline_warc(warc_filepath)
        else:
            raise KeyError(
                "Unsupported harvest type: {}".format(harvest_type))

    def process_search_warc(self, warc_filepath):
        """Process search results and, if incremental, advance ``since_id``."""
        incremental = self.message.get("options", {}).get("incremental", False)

        since_id = self.state_store.get_state(
            __name__, u"{}.since_id".format(
                self._search_id())) if incremental else None

        max_tweet_id = self._process_tweets(TwitterRestWarcIter(warc_filepath))

        # Update state store; ``or 0`` covers a missing stored or seen id.
        if incremental and (max_tweet_id or 0) > (since_id or 0):
            self.state_store.set_state(
                __name__, u"{}.since_id".format(self._search_id()),
                max_tweet_id)

    def process_user_timeline_warc(self, warc_filepath):
        """Process timeline tweets and, if incremental, track each user's max id."""
        incremental = self.message.get("options", {}).get("incremental", False)

        for count, status in enumerate(TwitterRestWarcIter(warc_filepath)):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet or "full_text" in tweet:
                user_id = tweet["user"]["id_str"]
                if incremental:
                    # Update state; a missing stored value counts as 0.
                    key = "timeline.{}.since_id".format(user_id)
                    self.state_store.set_state(
                        __name__, key,
                        max(
                            self.state_store.get_state(__name__, key) or 0,
                            tweet.get("id")))
                self._process_tweet(tweet)

    def _process_tweets(self, warc_iter):
        """Process every item that has a ``"text"`` or ``"full_text"`` key.

        Returns:
            The largest tweet id seen, or None if no tweet was processed.
        """
        max_tweet_id = None

        for count, status in enumerate(warc_iter):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet or "full_text" in tweet:
                max_tweet_id = max(max_tweet_id or 0, tweet.get("id"))
                self._process_tweet(tweet)
        return max_tweet_id

    def _process_tweet(self, _):
        """Count one processed tweet; the tweet itself is not inspected."""
        self.result.increment_stats("tweets")
コード例 #5
0
import json
import datetime
from twarc import Twarc

# Collection end
period_end = datetime.datetime(2017, 12, 31, 23, 59, 59, 999999)

# Twitter API keys - geobgu2
# NOTE(review): credentials are hard-coded in source. They should be rotated
# and loaded from configuration or the environment instead.
t = Twarc('JA5KZiEuU8HDIFDtLXwkHCpdx',
          'NdGoBYXuYHbHOAInNHHumjz0xeCp8zEYfbm0RW0dzpvcRY8Ovc',
          '2782755278-ARD36i5dPBU6fxRdgvomZoxuCOI3ewVVGPizZCf',
          'ceN8O8yIVV2C7o6CJyLYYo3CNIm48Tnojpxj69pqqv36u')

# Twitter stream request
# The value must start with the bare number: the former "\-" was an
# unrecognised escape sequence that left a literal backslash in the string.
# From here on ``t`` is the tweet iterator returned by filter(), not the
# client; the collection loop below iterates over it.
t = t.filter(locations="-72.21437,41.19034,-69.64939,43.30924")

# Collect tweets
while datetime.datetime.now(
) < period_end:  # Loop until collection period ends
    day_start = datetime.datetime.now()
    day_end = datetime.datetime(day_start.year, day_start.month, day_start.day,
                                day_start.hour, 59, 59, 999999)
    fh = open(
        "boston_geobgu2_" + day_start.strftime("%Y-%m-%d_%H:%M:%S") + ".json",
        "w")
    #fh.write("[")
    for tweet in t:  # Loop until hour ends
        x = tweet
        try:
            if x["geo"] != None:
                print(x["text"])
コード例 #6
0
# Resume numbering after the highest-numbered output file already present.
# The numbers are compared as integers: a lexicographic sort ranks
# "<prefix>9.json" after "<prefix>10.json" and would reuse an existing index.
# The prefix is escaped so that regex metacharacters in it match literally.
file_name_pattern = re.compile(r'%s(\d+)\.json' % re.escape(config.file_prefix))
existing_numbers = [
    int(match.group(1))
    for match in map(file_name_pattern.search, existing_files or ())
    if match
]
if existing_numbers:
    file_count = max(existing_numbers) + 1

tweet_count = 0
# Tweets collected since the last save; flushed by the streaming loop below.
queue = []
twarc = Twarc(config.consumer_key, config.consumer_secret, config.access_token, config.access_token_secret)
start_time = time.time()
log("Running... Press Ctrl-C to stop.")

while True:
    try:
        for tweet in twarc.filter(track=",".join(config.word_filter),
                                  locations=",".join(config.location_filter),
                                  lang=config.language_filter):
            queue.append(tweet)
            tweet_count += 1
            if config.print_tweets:
                log("Tweet: " + tweet.get("extended_tweet", {}).get("full_text", tweet.get("text", "")))

            # Print progress every 1000 tweets
            if tweet_count % 1000 == 0:
                log("%d tweets collected" % tweet_count)

            if len(queue) >= config.tweets_per_file:
                save(queue, file_number=file_count)
                file_count += 1
                queue = []
    except Exception as e:
コード例 #7
0
from datetime import datetime, timedelta

# Twitter API credentials, defined here as empty placeholders.
consumer_key, consumer_secret = "", ""
access_token, access_token_secret = "", ""

# Ids of the users already passed to isSuspicious by the main loop.
already_checked = list()
# Progress bar advanced once per tweet by the main loop.
pbar = tqdm.tqdm(unit='tweets', desc='tweet analysed')


def isSuspicious(user, max_age_days=90):
    """Report whether an account was created within the last ``max_age_days`` days.

    A flagged account is also announced through ``tqdm.tqdm.write``.

    Args:
        user: Mapping with ``created_at`` (for example
            ``"Mon Jan 01 00:00:00 +0000 2007"``), ``screen_name`` and ``id``.
        max_age_days: Size of the "recently created" window in days.

    Returns:
        True if the account is newer than the cutoff, otherwise False.
    """
    # Parse the UTC offset with %z and take "now" in the same timezone. The
    # previous naive local now() was compared against a UTC timestamp, which
    # shifted the cutoff by the machine's UTC offset.
    created = datetime.strptime(user['created_at'],
                                '%a %b %d %H:%M:%S %z %Y')
    cutoff = datetime.now(created.tzinfo) - timedelta(days=max_age_days)
    if created <= cutoff:
        return False

    tqdm.tqdm.write("[!] Suspicious: "
                    "@{} ({}), created : {}".format(
                        user["screen_name"], user["id"],
                        created.strftime("%d/%m/%Y")))
    return True


if __name__ == "__main__":
    # Follow the "Strasbourg" track stream and inspect each author once.
    t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret)

    for tweet in t.filter(track="Strasbourg"):
        user = tweet["user"]
        is_new_author = user["id"] not in already_checked
        if is_new_author:
            isSuspicious(user)
            already_checked.append(user["id"])
        # The bar counts tweets, not authors, so it advances every time.
        pbar.update(1)
コード例 #8
0
# **Uniqueness Constraints:**
session = driver.session()

# Add uniqueness constraints, one statement per node label/property pair,
# run in the order listed.
for constraint_statement in (
    "CREATE CONSTRAINT ON (t:Tweet) ASSERT t.id IS UNIQUE;",
    "CREATE CONSTRAINT ON (u:User) ASSERT u.screen_name IS UNIQUE;",
    "CREATE CONSTRAINT ON (h:Hashtag) ASSERT h.name IS UNIQUE;",
    "CREATE CONSTRAINT ON (l:Link) ASSERT l.url IS UNIQUE;",
    "CREATE CONSTRAINT ON (s:Source) ASSERT s.name IS UNIQUE;",
):
    session.run(constraint_statement)


# **Twitter Follow List:**
# NOTE(review): this name shadows the builtin ``list``. It is kept because
# the streaming loop below passes it as ``follow=list``.
list = '16796735,820954235357593602'


for tweets in t.filter(follow=list):

    try:
        query = '''
        UNWIND {tweets} AS t
        WITH t,
             t.entities AS e,
             t.user AS u,
             t.retweeted_status AS retweet
        WHERE t.id is not null
        MERGE (tweet:Tweet {id:t.id_str})
        SET tweet.text = t.text,
            tweet.created = t.created_at,
            tweet.favorites = t.favorite_count
        MERGE (user:User {screen_name:u.screen_name})
        SET user.name = u.name,
コード例 #9
0
class TwitterHarvester(BaseHarvester):
    """Harvest tweets with Twarc and extract URLs from the recorded WARC files.

    ``harvest_seeds`` dispatches on the ``type`` of the harvest message to the
    search / filter / sample / user-timeline collectors, and ``process_warc``
    walks the recorded tweets to collect URLs and to update the state used by
    incremental harvests.
    """

    def __init__(self, working_path, stream_restart_interval_secs=30 * 60, mq_config=None, debug=False,
                 connection_errors=5, http_errors=5, debug_warcprox=False, tries=3):
        """Set up the harvester.

        ``connection_errors`` and ``http_errors`` are handed to Twarc when the
        client is created; every other argument is forwarded to
        ``BaseHarvester.__init__``.
        """
        BaseHarvester.__init__(self, working_path, mq_config=mq_config,
                               stream_restart_interval_secs=stream_restart_interval_secs,
                               debug=debug, debug_warcprox=debug_warcprox, tries=tries)
        self.twarc = None
        self.connection_errors = connection_errors
        self.http_errors = http_errors
        # Extraction switches; overwritten from the message options in
        # harvest_seeds().
        self.extract_media = False
        self.extract_web_resources = False
        self.extract_user_profile_images = False

    @staticmethod
    def _newest_id(current, candidate):
        """Return the larger of two tweet ids, treating None as "no id yet"."""
        if current is None:
            return candidate
        if candidate is None:
            return current
        return max(current, candidate)

    def harvest_seeds(self):
        """Create the Twarc client and run the harvest named by the message type.

        Raises:
            KeyError: If the message ``type`` is not a supported harvest type.
        """
        # Create a twarc
        self._create_twarc()

        # Get harvest extract options.
        options = self.message.get("options", {})
        self.extract_media = options.get("media", False)
        self.extract_web_resources = options.get("web_resources", False)
        self.extract_user_profile_images = options.get("user_images", False)

        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        handlers = {
            "twitter_search": self.search,
            "twitter_filter": self.filter,
            "twitter_sample": self.sample,
            "twitter_user_timeline": self.user_timeline,
        }
        handler = handlers.get(harvest_type)
        if handler is None:
            raise KeyError("Unsupported harvest type: {}".format(harvest_type))
        handler()

    def _create_twarc(self):
        """Build the Twarc client from the credentials in the harvest message."""
        credentials = self.message["credentials"]
        self.twarc = Twarc(credentials["consumer_key"],
                           credentials["consumer_secret"],
                           credentials["access_token"],
                           credentials["access_token_secret"],
                           http_errors=self.http_errors,
                           connection_errors=self.connection_errors)

    def search(self):
        """Harvest the search results for the single seed's query.

        For incremental harvests the stored ``since_id`` for the query is
        passed to the search.
        """
        assert len(self.message.get("seeds", [])) == 1

        incremental = self.message.get("options", {}).get("incremental", False)
        query = self.message["seeds"][0]["token"]

        since_id = self.state_store.get_state(__name__, u"{}.since_id".format(query)) if incremental else None

        self._harvest_tweets(self.twarc.search(query, since_id=since_id))

    def filter(self):
        """Harvest the filter stream described by the single seed's token."""
        assert len(self.message.get("seeds", [])) == 1

        token = self.message["seeds"][0]["token"]
        track = token.get("track")
        follow = token.get("follow")
        locations = token.get("locations")

        self._harvest_tweets(self.twarc.filter(track=track, follow=follow, locations=locations))

    def sample(self):
        """Harvest the sample stream."""
        self._harvest_tweets(self.twarc.sample())

    def user_timeline(self):
        """Harvest the timeline of every seed.

        Each seed must carry a screen name (``token``) or a user id (``uid``);
        the missing one is looked up. Found user ids and changed screen names
        are reported on ``self.result``. A 401 response for a timeline is
        recorded as a warning; any other HTTP error is re-raised.
        """
        incremental = self.message.get("options", {}).get("incremental", False)

        for seed in self.message.get("seeds", []):
            seed_id = seed["id"]
            screen_name = seed.get("token")
            user_id = seed.get("uid")
            log.debug("Processing seed (%s) with screen name %s and user id %s", seed_id, screen_name, user_id)
            assert screen_name or user_id

            # If there is not a user_id, look it up.
            if screen_name and not user_id:
                user_id = self._lookup_user_id(screen_name)
                if user_id:
                    # Report back if nsid found
                    self.result.uids[seed_id] = user_id
                else:
                    msg = "User id not found for user {}".format(screen_name)
                    # No exception is being handled here, so log a plain
                    # warning rather than an (empty) traceback.
                    log.warning(msg)
                    self.result.warnings.append(Msg(CODE_TOKEN_NOT_FOUND, msg))
            # Otherwise, get the current screen_name
            else:
                new_screen_name = self._lookup_screen_name(user_id)
                # A failed lookup returns None; do not record that as a
                # token update.
                if new_screen_name and new_screen_name != screen_name:
                    self.result.token_updates[seed_id] = new_screen_name
                    screen_name = new_screen_name

            if not user_id:
                continue

            try:
                # Get since_id from state_store
                since_id = self.state_store.get_state(__name__,
                                                      "timeline.{}.since_id".format(
                                                          user_id)) if incremental else None

                self._harvest_tweets(self.twarc.timeline(user_id=user_id, since_id=since_id))

            except HTTPError as e:
                if e.response.status_code != 401:
                    raise
                msg = "Unauthorized for user {} (User ID: {}) because account is suspended or private".format(screen_name, user_id)
                log.exception(msg)
                self.result.warnings.append(Msg(CODE_TOKEN_UNAUTHORIZED, msg))

    def _lookup_screen_name(self, user_id):
        """
        Lookup a screen name given a user id.

        Returns None when the lookup yields no user.
        """
        users = list(self.twarc.user_lookup(user_ids=(user_id,)))
        assert len(users) in (0, 1)
        if users:
            return users[0]["screen_name"]
        return None

    def _lookup_user_id(self, screen_name):
        """
        Lookup a user id given a screen name.

        Returns None when the lookup yields no user.
        """
        users = list(self.twarc.user_lookup(screen_names=(screen_name,)))
        assert len(users) in (0, 1)
        if users:
            return users[0]["id_str"]
        return None

    def _harvest_tweets(self, tweets):
        """Consume ``tweets``, counting each item, until exhausted or stopped.

        The stop event is checked after every item; progress is logged every
        100 items.
        """
        for count, _ in enumerate(tweets):
            if not count % 100:
                log.debug("Harvested %s tweets", count)
            self.result.harvest_counter["tweets"] += 1
            if self.stop_harvest_seeds_event.is_set():
                log.debug("Stopping since stop event set.")
                break

    def _process_entities(self, entities):
        """Append the URLs found in an entities mapping to ``self.result.urls``.

        Expanded URLs are collected when web-resource extraction is enabled
        (skipping those matched by ``status_re``); media URLs are collected
        when media extraction is enabled. Missing or empty URLs are skipped.
        """
        if self.extract_web_resources:
            for url in entities.get("urls", []):
                expanded_url = url.get("expanded_url")
                # Exclude links for tweets
                if expanded_url and not status_re.match(expanded_url):
                    self.result.urls.append(expanded_url)
        if self.extract_media:
            for media in entities.get("media", []):
                media_url = media.get("media_url")
                if media_url:
                    self.result.urls.append(media_url)

    def process_warc(self, warc_filepath):
        """Process a recorded WARC according to the message's harvest type.

        Raises:
            KeyError: If the message ``type`` is not a supported harvest type.
        """
        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.process_search_warc(warc_filepath)
        elif harvest_type in ("twitter_filter", "twitter_sample"):
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_user_timeline":
            self.process_user_timeline_warc(warc_filepath)
        else:
            raise KeyError("Unsupported harvest type: {}".format(harvest_type))

    def process_search_warc(self, warc_filepath):
        """Process a search WARC and, if incremental, advance the query's since_id."""
        incremental = self.message.get("options", {}).get("incremental", False)
        query = self.message["seeds"][0]["token"]

        since_id = self.state_store.get_state(__name__, u"{}.since_id".format(query)) if incremental else None

        max_tweet_id = self._process_tweets(TwitterRestWarcIter(warc_filepath))

        # Update state store. Either value may be None (no tweets processed /
        # no state stored yet), which cannot be ordered against an id.
        if incremental and max_tweet_id is not None and (since_id is None or max_tweet_id > since_id):
            self.state_store.set_state(__name__, u"{}.since_id".format(query), max_tweet_id)

    def process_user_timeline_warc(self, warc_filepath):
        """Process a user-timeline WARC, tracking the newest tweet id per user.

        For incremental harvests the per-user ``since_id`` state is raised to
        the largest tweet id seen for that user.
        """
        incremental = self.message.get("options", {}).get("incremental", False)

        for count, status in enumerate(TwitterRestWarcIter(warc_filepath)):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" not in tweet:
                continue
            if incremental:
                # Update state
                key = "timeline.{}.since_id".format(tweet["user"]["id_str"])
                newest = self._newest_id(self.state_store.get_state(__name__, key), tweet.get("id"))
                self.state_store.set_state(__name__, key, newest)
            self._process_tweet(tweet)

    def _process_tweets(self, warc_iter):
        """Process every tweet from ``warc_iter``.

        Items without a ``text`` key are skipped.

        Returns:
            The largest tweet id processed, or None if there was none.
        """
        max_tweet_id = None
        for count, status in enumerate(warc_iter):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet:
                max_tweet_id = self._newest_id(max_tweet_id, tweet.get("id"))
                self._process_tweet(tweet)
        return max_tweet_id

    def _process_tweet(self, tweet):
        """Count one tweet and collect the URLs selected by the extract options.

        Entities of the tweet and of its retweeted (or else quoted) status are
        processed; when profile-image extraction is enabled, the author's
        profile image URLs that are present are collected too.
        """
        self.result.increment_stats("tweets")
        # For more info, see https://dev.twitter.com/overview/api/entities-in-twitter-objects
        statuses = [tweet]
        if "retweeted_status" in tweet:
            statuses.append(tweet["retweeted_status"])
        elif "quoted_status" in tweet:
            statuses.append(tweet["quoted_status"])
        for status in statuses:
            self._process_entities(status.get("entities", {}))
            self._process_entities(status.get("extended_entities", {}))
        if self.extract_user_profile_images:
            user = tweet["user"]
            # Skip missing or None values so result.urls only holds real URLs.
            for key in ("profile_image_url", "profile_background_image_url", "profile_banner_url"):
                image_url = user.get(key)
                if image_url:
                    self.result.urls.append(image_url)
コード例 #10
0
import datetime
from twarc import Twarc

# Collection end
period_end = datetime.datetime(2017, 12, 31, 23, 59, 59, 999999)

# Twitter API keys - geobgu2
# NOTE(review): these credentials are hard-coded in the source; they should be
# loaded from the environment or a config file kept out of version control.
t = Twarc(
  'JA5KZiEuU8HDIFDtLXwkHCpdx',
  'NdGoBYXuYHbHOAInNHHumjz0xeCp8zEYfbm0RW0dzpvcRY8Ovc',
  '2782755278-ARD36i5dPBU6fxRdgvomZoxuCOI3ewVVGPizZCf',
  'ceN8O8yIVV2C7o6CJyLYYo3CNIm48Tnojpxj69pqqv36u'
  )

# Twitter stream request
# The leading backslash is only needed to escape the minus sign on a shell
# command line; inside a Python string it would be sent as part of the
# bounding box, so the plain coordinates are passed here.
t = t.filter(locations="-72.21437,41.19034,-69.64939,43.30924")

# Collect tweets
while datetime.datetime.now() < period_end: # Loop until collection period ends
    day_start = datetime.datetime.now()
    day_end = datetime.datetime(day_start.year, day_start.month, day_start.day, day_start.hour, 59, 59, 999999)
    fh = open("boston_geobgu2_" + day_start.strftime("%Y-%m-%d_%H:%M:%S") + ".json", "w")
    #fh.write("[")
    for tweet in t: # Loop until hour ends
        x = tweet
        try:
            if x["geo"] != None:
                print(x["text"])
                fh.write(json.dumps(x))
                fh.write("\n")
        except:
コード例 #11
0
import sys

print(" # Loading keys")

consumer_key = 'INSERT YOUR CONSUMER KEY HERE'
consumer_secret = 'INSERT YOUR CONSUMER SECRET HERE'
access_token = 'INSERT YOUR TOKEN HERE'
access_token_secret = 'INSERT YOUR TOKEN SECRET HERE'
twarc_auth = Twarc(consumer_key, consumer_secret, access_token, access_token_secret)

print(" # Reading search terms")

# One search term per line; blank lines are dropped so they do not end up as
# empty terms in the track query. The file is closed as soon as it is read.
with open('tweet_terms.txt', 'r') as tweet_terms_file_content:
    my_tweet_terms = [line.strip() for line in tweet_terms_file_content if line.strip()]
print(" # Search terms loaded")

if my_tweet_terms:
    twitter_query = ",".join(my_tweet_terms)
    print(" # Search terms: " + twitter_query)

    for tweet in twarc_auth.filter(track=twitter_query):
        # Re-open in append mode for each tweet so every record is written
        # out and the file is closed even if the stream is interrupted.
        with open('data_dump.json', 'a') as json_output_file:
            json.dump(tweet, json_output_file, indent=4, sort_keys=True)
else:
    print("No search terms provided, printing generic stream")
    # Use the authenticated client created above.
    for tweet in twarc_auth.sample():
        print(tweet)

# Both files are managed by ``with`` blocks, so no explicit close() is needed.
print(" # Authentication successful, dumping results")
コード例 #12
0
ファイル: track.py プロジェクト: RenNedz/third-year-project
import os.path
from twarc import Twarc
from File_manager import File_manager
# Usage: reads a file named "tags" that lists one name or hashtag to track per line.

# NOTE(review): these credentials are hard-coded in the source; they should be
# loaded from the environment or a config file kept out of version control.
t = Twarc('4lJGm5YUrXgtwfMUmlo9L4KgH',
          'YIYeIiZGCJpolIASa56eqLJsEa54vGKFt07CkTai3SWKoCPx3w',
          '276679861-dgKpojdDSWRutxEG7NH2A2ZgD7xHtzcuwOisMo1T',
          '4ckJoGnRm5defgDegfV4opPETmarGuNQr9U6pAVEwR9sT')
out_file = File_manager.open_for_download("track")
try:
    # Read every tag up front, without the trailing newline, skipping blanks.
    with open("tags", "r", 1) as in_file:
        tags = [line.strip() for line in in_file if line.strip()]
    for tag in tags:
        print(tag)

    if tags:
        # The filter stream is not expected to end on its own, so a separate
        # stream per tag would never get past the first tag. Track all tags
        # in one request instead (comma-separated terms are OR-ed).
        for tweet in t.filter(track=",".join(tags)):
            # NOTE(review): this writes the Python repr of the tweet, not JSON.
            out_file.write(str(tweet) + "\n")
finally:
    # Close the output file even if the stream fails or is interrupted.
    out_file.close()