Example #1
0
    def on_data(self, raw_data):
        """Dispatch one raw stream message to the matching ``on_*`` handler.

        HTML entities in ``raw_data`` are unescaped, the result is decoded as
        JSON, and the first recognised top-level key selects the handler.

        Override this method if you wish to manually handle the stream data.

        Returns False as soon as a handler returns False (which stops the
        stream and closes the connection); otherwise returns None.
        """
        try:
            # HTMLParser.unescape() was removed in Python 3.9; html.unescape
            # is the supported replacement.
            from html import unescape
        except ImportError:
            # Python 2 has no ``html`` module: keep the original call there.
            unescape = HTMLParser().unescape

        data = json.loads(unescape(raw_data))

        if 'in_reply_to_status_id' in data:
            status = Status.parse(self.api, data)
            if self.on_status(status) is False:
                return False
        elif 'delete' in data:
            delete = data['delete']['status']
            if self.on_delete(delete['id'], delete['user_id']) is False:
                return False
        elif 'event' in data:
            status = Status.parse(self.api, data)
            if self.on_event(status) is False:
                return False
        elif 'direct_message' in data:
            status = Status.parse(self.api, data)
            if self.on_direct_message(status) is False:
                return False
        elif 'limit' in data:
            if self.on_limit(data['limit']['track']) is False:
                return False
        elif 'disconnect' in data:
            if self.on_disconnect(data['disconnect']) is False:
                return False
        else:
            logging.error("Unknown message type: " + str(raw_data))
 def process(self, tweet):
     """Print a one-line summary of a tweet given as a JSON string.

     Every separator listed in ``UNICODE_LINES`` is replaced by a space in
     the tweet text before printing.
     """
     status = Status.parse(api, json.loads(tweet))
     # Accumulate the replacements: restarting from status.text on every
     # iteration would keep only the last one (and leave ``text`` undefined
     # when UNICODE_LINES is empty).
     text = status.text
     for lf in UNICODE_LINES:
         text = text.replace(lf, ' ')
     # A single parenthesised argument prints identically on Python 2 and 3.
     print("@%s (%s, %s, %s, %s): %s" % (
         status.user.screen_name,
         status.user.lang,
         status.user.statuses_count,
         status.user.friends_count,
         status.user.followers_count,
         text))
Example #3
0
    def on_data(self, data):
        """Called when raw data is received from connection.

        ``data`` is the raw JSON text; the message type is detected by
        substring match and the matching ``on_*`` handler is invoked.

        Override this method if you wish to manually handle
        the stream data. Return False to stop stream and close connection.
        Returns None when the handler did not return False or when no
        message type matched.
        """

        if '{"delete"' in data:
            deleted = json.loads(data)['delete']
            # Pick the handler from the payload shape instead of a bare
            # ``except``, which also swallowed errors raised by on_delete.
            if 'status' in deleted:
                target = deleted['status']
                if self.on_delete(target['id'], target['user_id']) is False:
                    return False
            else:
                target = deleted['direct_message']
                if self.on_direct_message_delete(target['id'], target['user_id']) is False:
                    return False
        elif '{"direct_message"' in data:
            message = DirectMessage.parse(self.api, json.loads(data)['direct_message'])
            if self.on_direct_message(message) is False:
                return False
        elif '{"target"' in data:
            event = json.loads(data)
            if self.on_event(event) is False:
                return False
        elif '{"limit"' in data:
            if self.on_limit(json.loads(data)['limit']['track']) is False:
                return False
        elif '"in_reply_to_user_id_str"' in data:
            status = Status.parse(self.api, json.loads(data))
            if self.on_status(status) is False:
                return False
Example #4
0
def test_end_to_end(filename, connections, expected, tmpdir):
    """Feed one status fixture through a LessListener and check the outcome.

    The same status is delivered twice, then once more with a bumped id;
    at most one reply may be posted, and the follow state recorded by the
    mock API must mirror the ``followed_by`` state it started with.
    """
    api = MockAPI(connections=connections)

    with open(filename, 'r') as fixture:
        status = Status.parse(api, json.load(fp=fixture))

    listener = LessListener(
        api=api, post_replies=True, gather='tweets', state_dir=str(tmpdir))

    # Force the festive greeting on: 100% probability throughout December.
    listener.december_greetings = ('It is cold outside.',)
    listener.festive_probability = 1.
    assert listener.get_festive_probability(dt.date(2016, 12, 5)) == 1.

    # First delivery, then a duplicate that must not be answered again.
    listener.on_status(status)
    listener.on_status(status)

    # A different status id with the same content exercises the per-word
    # reply rate limit.
    status.id = status.id + 1
    listener.on_status(status)

    if expected is None:
        assert api._updates == []
    else:
        assert len(api._updates) == 1
        posted = api._updates[0]
        assert posted['status'] == expected

    for name, before in connections.items():
        after = api._connections[name]
        assert ('following' in after) == ('followed_by' in before), \
            (name, before, after)
Example #5
0
 def on_data(self, data):
     """Classify one raw stream message by language, or finish the run.

     Once ``self.duration`` seconds have passed since ``self.started``, a
     statistics file named ``<started>-sample.stats`` is written and False
     is returned. Before that, a message containing
     ``in_reply_to_status_id`` is parsed as a status, classified with
     ``langid`` and its raw text appended to the above-threshold,
     below-threshold or excluded output; True is returned. Any other
     message returns None.
     """
     if time.time() >= self.started + self.duration:
         # ``with`` guarantees the statistics file is flushed and closed,
         # even if one of the writes below raises.
         with open('{0}-sample.stats'.format(int(self.started)), 'w+') as stats:
             stats.write("================= STATISTICS =================" + "\n")
             stats.write("Start time: " + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(self.started)) + "\n")
             stats.write("End time: " + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) + "\n")
             stats.write("First Tweet ID: " + self.first_tweet_id + "\n")
             stats.write("Last Tweet ID: " + self.last_tweet_id + "\n")
             stats.write("Language: " + self.lang + "\n")
             stats.write("Language classification threshold: " + str(self.lang_threshold) + "\n")
             stats.write("Above threshold: " + str(self.counter[self.lang + '-above']) + "\n")
             stats.write("Below threshold: " + str(self.counter[self.lang + '-below']) + "\n")
             stats.write("Exluded: " + str(self.counter['excluded']) + "\n")
         return False
     elif 'in_reply_to_status_id' in data:
         status = Status.parse(self.api, json.loads(data))
         langclass = langid.classify(status.text)

         # All counters still at zero means this is the first status seen.
         if self.counter == {self.lang + '-above': 0, self.lang + '-below': 0, 'excluded': 0}:
             self.first_tweet_id = str(status.id)
         self.last_tweet_id = str(status.id)

         if langclass[0] == self.lang:
             if langclass[1] >= self.lang_threshold:
                 self.above_output.write(data)
                 self.counter[self.lang + '-above'] += 1
             else:
                 self.below_output.write(data)
                 self.counter[self.lang + '-below'] += 1
         else:
             self.excl_output.write(data)
             self.counter['excluded'] += 1

         return True
Example #6
0
 def on_data(self, data):
     """Call ``on_mention`` for tweets that mention ``testeMagazine``.

     Raw messages whose text does not contain ``entities`` are ignored.
     Always returns None.
     """
     if "entities" not in data:
         return
     tweet = json.loads(data)
     mentioned = [
         mention["screen_name"]
         for mention in tweet["entities"]["user_mentions"]
     ]
     if "testeMagazine" not in mentioned:
         return
     status = Tweet.parse(self.api, tweet)
     self.on_mention(status)
Example #7
0
  def on_data(self, raw_data):
    """Called when raw data is received from connection.

    This is where all the data comes first. Normally we could use (inherit)
    the on_data() in tweepy.StreamListener, but it unnecessarily and naively
    reports unknown event types as errors (to simple log); also, we might want
    to tweak it further later on.

    But for now, this is basically taken from tweepy's on_data().

    ``self.processing_data`` is True while a message is being handled and is
    always reset to False on exit, including when a handler stops the stream
    or raises.

    Return False to stop stream and close connection.
    """

    self.processing_data = True
    try:
      data = json.loads(raw_data)

      if 'in_reply_to_status_id' in data:
        status = Status.parse(self.api, data)
        if self.on_status(status) is False:
          return False
      elif 'delete' in data:
        delete = data['delete']['status']
        if self.on_delete(delete['id'], delete['user_id']) is False:
          return False
      elif 'event' in data:
        status = Status.parse(self.api, data)
        if self.on_event(status) is False:
          return False
      elif 'direct_message' in data:
        status = Status.parse(self.api, data)
        if self.on_direct_message(status) is False:
          return False
      elif 'limit' in data:
        if self.on_limit(data['limit']['track']) is False:
          return False
      elif 'disconnect' in data:
        if self.on_disconnect(data['disconnect']) is False:
          return False
      else:
        log.debug('TwitterBotStreamListener::on_data(): got event/stream data of'
            ' unknown type. Raw data follows:\n%s', data)
    finally:
      # Previously the early ``return False`` paths (and any exception)
      # left the flag stuck at True.
      self.processing_data = False
Example #8
0
def test_sanitize(filename, expected):
    """Check get_sanitized_text() against a status fixture under ``tests/``.

    The sanitized text must contain no ``&`` and no ``http`` and must equal
    ``expected``.
    """
    api = NonCallableMock()
    fixture_path = os.path.join('tests', filename)

    with open(fixture_path, 'r') as fixture:
        status = Status.parse(api, json.load(fixture))

    sanitized = get_sanitized_text(status)
    assert '&' not in sanitized
    assert 'http' not in sanitized
    assert sanitized == expected
Example #9
0
 def _read_from_table(self):
     """Replay every row of ``self.table_name`` through the listener.

     Each row is parsed with ``Status.parse`` and passed to
     ``self.listener.on_status``. ``self.running`` is set to True at the
     start; the loop stops early once it is found to be false after a row
     has been delivered.
     """
     self.running = True
     conn = StatusSource.engine.connect()
     try:
         meta = MetaData()
         # Reflect the table definition from the database.
         table = Table(self.table_name, meta, autoload=True, autoload_with=StatusSource.engine)
         cmd = select([table])
         results = conn.execute(cmd)
         for result in results:
             status = Status.parse(None, result)
             self.listener.on_status(status)
             if not self.running:
                 break
     finally:
         # Always hand the connection back, including on early break or
         # when a listener callback raises.
         conn.close()
Example #10
0
    def on_data(self, raw_data):
        """Decode a raw stream message and hand it to the matching handler.

        Returns False when the selected handler returns False; otherwise
        (including for unrecognised messages) returns None.
        """
        message = json.loads(raw_data)
        outcome = None

        # The first matching top-level key decides the message type.
        if 'in_reply_to_status_id' in message:
            parsed = Status.parse(self.api, message)
            outcome = self.on_status(parsed)
        elif 'delete' in message:
            removed = message['delete']['status']
            outcome = self.on_delete(removed['id'], removed['user_id'])
        elif 'event' in message:
            parsed = Status.parse(self.api, message)
            outcome = self.on_event(parsed)
        elif 'direct_message' in message:
            parsed = Status.parse(self.api, message)
            outcome = self.on_direct_message(parsed)

        if outcome is False:
            return False
Example #11
0
def test_save_tweet(tmpdir, id_, expected_filename):
    """Check that save_tweet() creates ``expected_filename`` under ``foo``."""
    api = MockAPI(connections={})
    gather_dir = tmpdir.join('foo')

    listener = LessListener(api=api, gather=str(gather_dir), state_dir=str(tmpdir))
    payload = {
        'id': int(id_),
        'id_str': id_,
    }
    status = Status.parse(api=api, json=payload)
    listener.save_tweet(status)

    saved = tmpdir.join('foo', expected_filename)
    assert saved.check()
Example #12
0
    def test_patched_status(self):
        """After patch(), a parsed Status exposes the raw JSON as ``_raw``.

        The parsed fields must still be available as attributes.
        """
        from tweepy.models import Status
        from crawler.tweepy_patch import patch
        patch()
        payload = {'a': 1, 'b': 2}
        parsed = Status.parse('test_api', payload)
        # pylint: disable=E1101,W0212
        self.assertEqual(parsed._raw, '{"a": 1, "b": 2}')
        self.assertEqual(parsed.a, 1)
        self.assertEqual(parsed.b, 2)
Example #13
0
    def on_data(self, raw_data):
        """Decode a raw stream message and dispatch statuses and events.

        When ``self.verbose`` is true the decoded message and a separator
        line are printed first. ``friends``, ``delete`` and ``user_suspend``
        messages are deliberately ignored; anything else unrecognised is
        logged as an error.

        Returns False when ``on_status``/``on_event`` returns False,
        otherwise None.
        """
        data = json.loads(raw_data)
        if self.verbose:
            # Single-argument print(...) behaves the same on Python 2 and 3,
            # unlike the print statement, which is a syntax error on 3.
            print(data)
            print('-' * 60)

        if 'in_reply_to_status_id' in data:
            status = Status.parse(self.api, data)
            if self.on_status(status) is False:
                return False
        elif 'event' in data:
            status = Status.parse(self.api, data)
            if self.on_event(status) is False:
                return False
        elif 'friends' in data:
            pass  # ignore
        elif 'delete' in data:
            pass  # ignore
        elif 'user_suspend' in data:
            pass  # ignore
        else:
            logging.error("Unknown message type: " + str(raw_data))
Example #14
0
 def on_data(self, data):
     """Route a raw JSON message to ``on_status``/``on_delete``/``on_limit``.

     The message type is detected by substring match on the raw text.
     ``on_status`` receives both the parsed status and the raw text.
     Returns False when the handler returns False, otherwise None.
     """
     outcome = None

     if 'in_reply_to_status_id' in data:
         parsed = Status.parse(self.api, json.loads(data))
         outcome = self.on_status(parsed, data)
     elif 'delete' in data:
         removed = json.loads(data)['delete']['status']
         outcome = self.on_delete(removed['id'], removed['user_id'])
     elif 'limit' in data:
         track = json.loads(data)['limit']['track']
         outcome = self.on_limit(track)

     if outcome is False:
         return False
Example #15
0
    def save_status(self, data):
        """Store a geotagged tweet as a Post, creating its Author if needed.

        ``data`` is the raw JSON text of the tweet. Nothing is stored when
        the tweet has no ``geo`` or when an Author linked to a user profile
        with the same twitter id already exists. Always returns None.
        """
        status = Status.parse(self.api, json.loads(data))

        if not status.geo:
            # _datafile.write(data+'\n')
            return

        registered = Author.objects.filter(
            owner__userprofile__twitter_id=status.user.id_str)
        if registered.exists():
            # this tweet's author is on stargazer
            return

        # Reuse the Author row for this twitter account, or create it.
        try:
            author = Author.objects.filter(
                source=Author.T_TWITTER,
                external_id=status.user.id_str,
            ).get()
        except Author.DoesNotExist:
            author = Author(
                name=status.user.screen_name,
                avatar_uri=status.user.profile_image_url,
                source=Author.T_TWITTER,
                external_id=status.user.id_str,
            )
            author.save()

        # Only create the Post when this tweet has not been stored before.
        try:
            Post.objects.filter(
                source=Post.T_TWITTER, external_id=status.id_str).get()
        except Post.DoesNotExist:
            coordinates = status.geo["coordinates"]
            lat = float(coordinates[0])
            lng = float(coordinates[1])

            # Reverse geocoding is best effort: fall back to an empty address.
            try:
                addr = self._latlng2addr.get(lat, lng)
            except (LatLng2Addr.ConnectionFailed, LatLng2Addr.GeocodingFailed):
                addr = ""

            Post(
                content=status.text,
                author=author,
                latitude=lat,
                longitude=lng,
                address=addr,
                source=Post.T_TWITTER,
                external_id=status.id_str,
                external_data=data,
                # twitter api response in UTC; shifted by +8 hours
                created=status.created_at + timedelta(hours=8),
            ).save()
Example #16
0
    def on_data(self, raw_data):
        """Decode a raw stream message and invoke the matching handler.

        The first recognised top-level key of the decoded JSON selects the
        handler; unrecognised messages are logged as errors.

        Override this method to handle the stream data manually. Returns
        False when the handler returns False (stop the stream and close the
        connection), otherwise None.
        """
        message = json.loads(raw_data)
        verdict = None

        if "in_reply_to_status_id" in message:
            parsed = Status.parse(self.api, message)
            verdict = self.on_status(parsed)
        elif "delete" in message:
            removed = message["delete"]["status"]
            verdict = self.on_delete(removed["id"], removed["user_id"])
        elif "event" in message:
            parsed = Status.parse(self.api, message)
            verdict = self.on_event(parsed)
        elif "direct_message" in message:
            parsed = Status.parse(self.api, message)
            verdict = self.on_direct_message(parsed)
        elif "friends" in message:
            verdict = self.on_friends(message["friends"])
        elif "limit" in message:
            verdict = self.on_limit(message["limit"]["track"])
        elif "disconnect" in message:
            verdict = self.on_disconnect(message["disconnect"])
        elif "warning" in message:
            verdict = self.on_warning(message["warning"])
        else:
            logging.error("Unknown message type: " + str(raw_data))

        if verdict is False:
            return False
 def get_place(status: "Status") -> dict:
     """Extract place information from a status as a plain dict.

     Returns a dict with the keys ``coordinates``, ``country``,
     ``country_code`` and ``full_name``. When the status has no ``place``
     (attribute missing or None) the defaults are returned: an empty
     coordinate list and ``"N/A"`` for the three text fields. A place
     without a ``bounding_box`` keeps its country and name and only gets
     empty coordinates.
     """
     place: dict = {
         "coordinates": [],
         "country": "N/A",
         "country_code": "N/A",
         "full_name": "N/A"
     }
     status_place = getattr(status, "place", None)
     if status_place is None:
         return place

     # Explicit lookups replace the former blanket ``except Exception:
     # pass``, which threw away every field when bounding_box was missing.
     bounding_box = getattr(status_place, "bounding_box", None)
     if bounding_box is not None:
         place["coordinates"] = getattr(bounding_box, "coordinates", [])
     place["country"] = getattr(status_place, "country", "N/A")
     place["country_code"] = getattr(status_place, "country_code", "N/A")
     place["full_name"] = getattr(status_place, "full_name", "N/A")
     return place
Example #18
0
    def on_data(self, raw_data):
        """Decode a raw stream message and invoke the matching handler.

        Unrecognised messages are reported through ``self.bot._log``.

        Override this method to handle the stream data manually. Returns
        False when the handler returns False (stop the stream and close the
        connection), otherwise None.
        """
        from tweepy.models import Status
        message = json.loads(raw_data)
        verdict = None

        if 'in_reply_to_status_id' in message:
            parsed = Status.parse(self.api, message)
            verdict = self.on_status(parsed)
        elif 'delete' in message:
            removed = message['delete']['status']
            verdict = self.on_delete(removed['id'], removed['user_id'])
        elif 'event' in message:
            parsed = Status.parse(self.api, message)
            verdict = self.on_event(parsed)
        elif 'direct_message' in message:
            parsed = Status.parse(self.api, message)
            verdict = self.on_direct_message(parsed)
        elif 'friends' in message:
            verdict = self.on_friends(message['friends'])
        elif 'limit' in message:
            verdict = self.on_limit(message['limit']['track'])
        elif 'disconnect' in message:
            verdict = self.on_disconnect(message['disconnect'])
        else:
            self.bot._log("Unknown message type: " + str(raw_data))

        if verdict is False:
            return False
Example #19
0
    def on_data(self, data):
        """Route a raw JSON message to ``on_status``/``on_delete``/``on_limit``.

        The message type is detected by substring match on the raw text.

        Override this method to handle the stream data manually. Returns
        False when the handler returns False (stop the stream and close the
        connection), otherwise None.
        """
        outcome = None

        if 'in_reply_to_status_id' in data:
            parsed = Status.parse(self.api, json.loads(data))
            outcome = self.on_status(parsed)
        elif 'delete' in data:
            removed = json.loads(data)['delete']['status']
            outcome = self.on_delete(removed['id'], removed['user_id'])
        elif 'limit' in data:
            track = json.loads(data)['limit']['track']
            outcome = self.on_limit(track)

        if outcome is False:
            return False
Example #20
0
 def __init__(self, tweetDict):
     """Build the wrapper from a dict holding a tweet and its tokens.

     ``tweetDict["tweet"]`` is parsed into ``self.tweet``. The optional
     ``keywords`` and ``groups`` entries are copied to attributes only when
     present. Every entry of ``tweetDict["tokens"]`` becomes a ``Token`` in
     ``self.tokens``; those for which ``filter_token()`` is false are also
     collected in ``self.filt_tokens``.
     """
     self.tweet = Status.parse(API(), tweetDict["tweet"])

     # Optional entries: the attribute is left unset when the key is absent.
     for optional in ("keywords", "groups"):
         try:
             value = tweetDict[optional]
         except KeyError:
             continue
         setattr(self, optional, value)

     self.tokens = []
     self.filt_tokens = []
     for raw_token in tweetDict["tokens"]:
         parsed = Token(raw_token)
         self.tokens.append(parsed)
         if parsed.filter_token():
             continue
         self.filt_tokens.append(parsed)
Example #21
0
    def on_data(self, data):
        """Handle one raw message received from the connection.

        Status, delete and limit messages (detected by substring match on
        the raw text) are forwarded to their handlers; other messages are
        ignored.

        Override this method to handle the stream data manually. Returns
        False when the handler returns False (stop the stream and close the
        connection), otherwise None.
        """
        if "in_reply_to_status_id" in data:
            verdict = self.on_status(Status.parse(self.api, json.loads(data)))
        elif "delete" in data:
            removed = json.loads(data)["delete"]["status"]
            verdict = self.on_delete(removed["id"], removed["user_id"])
        elif "limit" in data:
            verdict = self.on_limit(json.loads(data)["limit"]["track"])
        else:
            return None

        return False if verdict is False else None
Example #22
0
    def on_data(self, data):
        """Handle one raw message received from the connection.

        The raw text is searched for a status, delete or limit marker, in
        that order, and the first match decides which handler runs.

        Override this method to handle the stream data manually. Returns
        False when the handler returns False (stop the stream and close the
        connection), otherwise None.
        """
        if 'in_reply_to_status_id' in data:
            parsed = Status.parse(self.api, json.loads(data))
            if self.on_status(parsed) is False:
                return False
            return None

        if 'delete' in data:
            removed = json.loads(data)['delete']['status']
            if self.on_delete(removed['id'], removed['user_id']) is False:
                return False
            return None

        if 'limit' in data and self.on_limit(json.loads(data)['limit']['track']) is False:
            return False
        return None
Example #23
0
    def on_data(self, data):
        """Classify one raw stream message by language, or finish the run.

        Once ``self.duration`` seconds have passed since ``self.started``, a
        statistics file named ``<started>-sample.stats`` is written and
        False is returned. Before that, a message containing
        ``in_reply_to_status_id`` is parsed as a status, classified with
        ``langid`` and its raw text appended to the above-threshold,
        below-threshold or excluded output; True is returned. Any other
        message returns None.
        """
        if time.time() >= self.started + self.duration:
            # ``with`` guarantees the statistics file is flushed and closed,
            # even if one of the writes below raises.
            with open('{0}-sample.stats'.format(int(self.started)), 'w+') as stats:
                stats.write("================= STATISTICS =================" +
                            "\n")
                stats.write("Start time: " + time.strftime(
                    '%Y-%m-%d %H:%M:%S', time.localtime(self.started)) + "\n")
                stats.write("End time: " + time.strftime(
                    '%Y-%m-%d %H:%M:%S', time.localtime(time.time())) + "\n")
                stats.write("First Tweet ID: " + self.first_tweet_id + "\n")
                stats.write("Last Tweet ID: " + self.last_tweet_id + "\n")
                stats.write("Language: " + self.lang + "\n")
                stats.write("Language classification threshold: " +
                            str(self.lang_threshold) + "\n")
                stats.write("Above threshold: " +
                            str(self.counter[self.lang + '-above']) + "\n")
                stats.write("Below threshold: " +
                            str(self.counter[self.lang + '-below']) + "\n")
                stats.write("Exluded: " + str(self.counter['excluded']) + "\n")
            return False
        elif 'in_reply_to_status_id' in data:
            status = Status.parse(self.api, json.loads(data))
            langclass = langid.classify(status.text)

            # All counters still at zero means this is the first status seen.
            if self.counter == {
                    self.lang + '-above': 0,
                    self.lang + '-below': 0,
                    'excluded': 0
            }:
                self.first_tweet_id = str(status.id)
            self.last_tweet_id = str(status.id)

            if langclass[0] == self.lang:
                if langclass[1] >= self.lang_threshold:
                    self.above_output.write(data)
                    self.counter[self.lang + '-above'] += 1
                else:
                    self.below_output.write(data)
                    self.counter[self.lang + '-below'] += 1
            else:
                self.excl_output.write(data)
                self.counter['excluded'] += 1

            return True
 def process(self, tweet):
     """Append a trimmed JSON record of a tweet to the current gzip file.

     ``tweet`` is the raw JSON text. A new output file, named
     ``<base>-<timestamp>.txt.gz`` under ``self.path``, is opened whenever
     ``time.strftime(self.fmt)`` differs from the stored ``self.time``.
     """
     status = Status.parse(api, json.loads(tweet))
     author = status.user
     out = {"screen_name": author.screen_name,
         "id": status.id,
         "lang": author.lang,
         "statuses_count": author.statuses_count,
         "friend_count": author.friends_count,
         "followers_count": author.followers_count,
         "profile_image_url": author.profile_image_url,
         "text": status.text.encode('utf8'),
         "entities": status.entities,
         "created_at": status.created_at.strftime("%Y-%m-%d %H:%M:%S"),
         "geo": status.geo,
         "location": author.location,
         "timezone": author.time_zone}

     # Rotate the output file when the formatted timestamp changes.
     stamp = time.strftime(self.fmt)
     if stamp != self.time:
         self.time = str(stamp)
         self.fid.close()
         filename = self.base + '-' + self.time + '.txt.gz'
         self.fid = gzip.open(os.path.join(self.path, filename), 'ab')

     record = json.dumps(out) + '\n'
     self.fid.write(record)
Example #25
0
    def test_sending_images(self):
        """send_image() downloads from S3, tweets the file and cleans up.

        Both the S3 download and the Twitter upload are replaced by mocks.
        """
        # The mocked download writes nothing, so place the image beforehand.
        shutil.copy('./image.jpg', '/tmp/image.jpg')

        s3_client = boto3.client('s3')
        s3_client.download_file = MagicMock(return_value=None)

        twitter_api = tweepy.API(tweepy.OAuthHandler('foo', 'bar'))
        twitter_api.update_with_media = MagicMock(return_value=Status())

        uploader = TweetS3Images(twitter_api, s3_client)
        uploader.send_image('test_bucket', 'image.jpg', cleanup=True)

        s3_client.download_file.assert_called_with(
            'test_bucket', 'image.jpg', '/tmp/image.jpg')
        twitter_api.update_with_media.assert_called_with(
            filename='image.jpg',
            status='New image image.jpg brought to you by lambda-tweet',
            file=uploader.get_file())
        # NOTE(review): this checks '/tmp/image-test.jpg' although the file
        # copied above is '/tmp/image.jpg' - confirm the intended path.
        self.assertFalse(os.path.exists('/tmp/image-test.jpg'),
                         'The image was not cleaned up correctly.')
Example #26
0
    def _get_status(self, data):
        """Parse raw JSON text and return the status to be processed.

        The retweeted status is used when there is one, and the quoted
        status when the result is a quote and ``self.quoted`` is set.

        Raises TweepError when the author is listed in ``self.block_users``,
        when the tweet is not a retweet and ``self.original`` is false, or
        when it is a quote and ``self.quoted`` is false.
        """
        status = Status.parse(self.api, self.json.loads(data))

        author = status.user.screen_name
        if author in self.block_users:
            raise TweepError(">> User ignored: @%s" % author)

        try:
            status = status.retweeted_status
        except AttributeError:
            # No retweeted_status attribute: this is an original tweet.
            if not self.original:
                text = self._proccess_status(status.text)
                if len(text) > 75:
                    trunc_text = text[:72] + '...'
                else:
                    trunc_text = text
                raise TweepError(">> Original tweet ignored: %s" % trunc_text)

        if status.is_quote_status:
            if not self.quoted:
                text = self._proccess_status(status.text)
                raise TweepError(">> Quoted tweet ignored: %s" % text)
            status = status.quoted_status

        return status
Example #27
0
    async def on_data(self, raw_data):
        """|coroutine|

        Decode a raw stream message and await the handler that matches its
        type, returning whatever that handler returns. Messages of an
        unknown type are logged as a warning and yield None.

        Parameters
        ----------
        raw_data : JSON
            The raw data from the stream

        References
        ----------
        https://developer.twitter.com/en/docs/twitter-api/v1/tweets/filter-realtime/guides/streaming-message-types
        """
        payload = json.loads(raw_data)

        if "in_reply_to_status_id" in payload:
            parsed = Status.parse(None, payload)
            return await self.on_status(parsed)
        if "delete" in payload:
            removed = payload["delete"]["status"]
            return await self.on_delete(removed["id"], removed["user_id"])
        if "disconnect" in payload:
            return await self.on_disconnect_message(payload["disconnect"])
        if "limit" in payload:
            return await self.on_limit(payload["limit"]["track"])

        # The remaining types all pass the value stored under their own key.
        # Handlers are looked up lazily, and only for the key that matched.
        simple_handlers = (
            ("scrub_geo", "on_scrub_geo"),
            ("status_withheld", "on_status_withheld"),
            ("user_withheld", "on_user_withheld"),
            ("warning", "on_warning"),
        )
        for key, handler_name in simple_handlers:
            if key in payload:
                return await getattr(self, handler_name)(payload[key])

        log.warning("Received unknown message type: %s", raw_data)
Example #28
0
  def on_data(self, data):
    """Handle one raw message received from the connection.

    The message type is detected by substring match on the raw text.
    Statuses, direct messages and follow events return whatever their
    handler returns; delete and limit messages return False only when the
    handler returns False. Everything else returns None.

    Override this method to handle the stream data manually. Returning
    False stops the stream and closes the connection.
    """
    if 'in_reply_to_status_id' in data:
      status = Status.parse(self.api, json.loads(data))
      return self.on_status(status)

    if 'delete' in data:
      removed = json.loads(data)['delete']['status']
      if self.on_delete(removed['id'], removed['user_id']) is False:
        return False
      return None

    if 'limit' in data:
      if self.on_limit(json.loads(data)['limit']['track']) is False:
        return False
      return None

    if 'sender_id' in data and 'recipient_id' in data:
      dm = DirectMessage.parse(self.api, json.loads(data))
      return self.on_dm(dm)

    if 'event' in data and 'follow' in data:
      # The substring test is only a cheap pre-filter; confirm on the
      # decoded message that this really is a follow event.
      content = json.loads(data)
      if 'event' in content and content['event'] == 'follow':
        return self.on_follow(content)

    return None
Example #29
0
 # Per-tweet feature flags, reset to 0 before the tweet is inspected.
 # NOTE(review): this excerpt looks like the body of a loop over tweets;
 # the loop header and the running ``tweets_*`` counters are defined
 # outside the visible code - confirm against the full file.
 hashtag = 0
 url = 0
 question = 0
 exclamation = 0
 pos_term = 0
 neg_term = 0
 pos_emoticon = 0
 neg_emoticon = 0
 reply = 0
 moment_morning = 0
 moment_afternoon = 0
 moment_evening = 0
 moment_night = 0
 retweeted = 0
 
 # tweet[0] holds the raw JSON text of the tweet.
 status = Status.parse(api, json.loads(tweet[0]))
 
 # Tweets with a known-bad id and manual retweets ("RT @...") are only
 # counted as discarded; every other tweet is scanned for features.
 if status.id in error_list_tweet_ids:
     tweets_discarded_error += 1
 elif status.text.startswith("RT @"):
     tweets_discarded_retweet += 1     
 else:
     tweets_considered += 1
     # NOTE(review): ``username`` is not reset with the flags above;
     # presumably it is initialised before this excerpt - TODO confirm.
     if regex_username.search(status.text) != None:
         tweets_username += 1
         username = 1
     if regex_hashtag.search(status.text) != None:
         tweets_hashtag += 1
         hashtag = 1
     if regex_url.search(status.text) != None:
         tweets_url += 1
def make_mock_statuses(json_text):
    """Build ``Status`` objects from a JSON document given as text.

    ``json_text`` is decoded with ``json.loads`` and the decoded value is
    handed to ``Status.parse_list`` with ``None`` in place of an API handle.
    Returns whatever ``Status.parse_list`` returns.
    """
    decoded = json.loads(json_text)
    return Status.parse_list(None, decoded)
Example #31
0
    def on_data(self, data):
        """Decode a raw stream payload and dispatch it to the matching handler.

        ``data`` is the raw JSON text received from the connection. When the
        payload carries an ``extended_tweet`` or ``retweeted_status`` entry
        with a ``full_text`` field, that text is printed and passed on to
        ``on_status``; if both are present the ``retweeted_status`` text wins
        because it is examined last.

        Returns False as soon as a handler returns False (which stops the
        stream); otherwise returns None. Unknown message types are logged.
        """
        # Decode once and keep ``data`` untouched so the fallback branch can
        # log the raw text (it used to reference an undefined ``raw_data``).
        payload = json.loads(data)

        full_text = ""
        for container in ("extended_tweet", "retweeted_status"):
            if container in payload and 'full_text' in payload[container]:
                full_text = str(payload[container]["full_text"])
                print(
                    'FUL TEXT *******************************************************************************'
                )
                print(full_text)

        if 'in_reply_to_status_id' in payload:
            status = Status.parse(self.api, payload)
            if self.on_status(status, full_text) is False:
                return False
        elif 'delete' in payload:
            delete = payload['delete']['status']
            if self.on_delete(delete['id'], delete['user_id']) is False:
                return False
        elif 'event' in payload:
            status = Status.parse(self.api, payload)
            if self.on_event(status) is False:
                return False
        elif 'direct_message' in payload:
            status = Status.parse(self.api, payload)
            if self.on_direct_message(status) is False:
                return False
        elif 'friends' in payload:
            if self.on_friends(payload['friends']) is False:
                return False
        elif 'limit' in payload:
            if self.on_limit(payload['limit']['track']) is False:
                return False
        elif 'disconnect' in payload:
            if self.on_disconnect(payload['disconnect']) is False:
                return False
        elif 'warning' in payload:
            if self.on_warning(payload['warning']) is False:
                return False
        else:
            logging.error("Unknown message type: %s", data)
Example #32
0
 def on_data(self, data):
     """
     Dispatch one raw site-stream payload to the matching ``on_*`` callback.

     ``data`` is the raw payload text. Nothing happens unless it contains
     the substring ``for_user``; in that case it is decoded as JSON and,
     when the raw text also contains ``message``, the decoded ``message``
     entry is routed by its content: friends list, follow / unfollow /
     favorite / unfavorite events, retweets of the user, statuses by or
     mentioning the user, and direct messages. A message matching none of
     these is printed.

     The callbacks are meant to be overridden to actually process the
     actions. Returns False as soon as a callback returns False; otherwise
     the method falls off the end and returns None.
     """
     # Cheap substring test on the raw text before paying for JSON decoding.
     if 'for_user' in data:
         parsed_data = json.loads(data)
         user_id = parsed_data['for_user']
         # Again a substring test on the raw text, not a key lookup.
         if 'message' in data:
             message = parsed_data['message']
             if u'friends' in message:
                 if self.on_friends(user_id, message['friends']) is False:
                     return False
             elif u'event' in message:
                 # Event values other than the four below are silently ignored.
                 if message[u'event'] == u'follow':
                     if self.on_follow(
                         user_id=user_id,
                         source=message[u'source'],
                         target=message[u'target'],
                         time=message[u'created_at']
                     ) is False:
                         return False
                 elif message[u'event'] == u'unfollow':
                     if self.on_unfollow(
                         user_id,
                         source=message[u'source'],
                         target=message[u'target'],
                         time=message[u'created_at']
                     ) is False:
                         return False
                 elif message[u'event'] == u'favorite':
                     if self.on_favorite(
                         user_id,
                         source=message[u'source'],
                         favorited=message[u'target_object'],
                         time=message[u'created_at']
                     ) is False:
                         return False
                 elif message[u'event'] == u'unfavorite':
                     # Unlike the other events, no ``time`` is passed here.
                     if self.on_unfavorite(
                         user_id,
                         source=message[u'source'],
                         favorited=message[u'target_object']
                     ) is False:
                         return False
             # Need this second check - could be a retweet of
             # a tweet mentioning the user of interest
             elif (u'retweeted_status' in message and
                 int(message[u'retweeted_status'][u'user'][u'id']) ==
                 int(user_id)
             ):
                 if self.on_retweet(user_id, message) is False:
                     return False
             elif u'text' in message:
                 status = Status.parse(self.api, message)
                 # tweet from the user of interest
                 # NOTE(review): the retweet branch compares ids through int();
                 # this comparison does not - confirm both sides have the same type.
                 if status.author.id == user_id:
                     if self.on_user_status(user_id, status) is False:
                         return False
                 else:   # tweet mentioning the user of interest
                     if self.on_user_mention(user_id, status) is False:
                         return False
             elif u'direct_message' in message:
                 if self.on_direct_message(
                     user_id, message[u'direct_message']
                 ) is False:
                     return False
             else:
                 # Unrecognised message shape: dump the whole decoded payload.
                 print parsed_data
Example #33
0
 def __init__(self, status: Status):
     """Copy the fields of interest from ``status`` onto this object.

     Plain attribute access replaces the original direct
     ``__getattribute__("name")`` calls, and ``getattr`` with a default
     replaces the ``hasattr`` check for ``possibly_sensitive``.

     :param status: object exposing ``created_at``, ``id``, ``entities``
         (a mapping), ``retweet_count``, ``retweeted``, ``user`` (with
         ``id``, ``profile_image_url`` and ``screen_name``),
         ``favorite_count``, ``favorited`` and ``source``;
         ``possibly_sensitive`` is optional and defaults to False.
     :raises AttributeError: if a required attribute is missing.
     """
     entities = status.entities
     author = status.user

     self.created_at: str = preprocess_date(status.created_at)
     self.id: int = status.id
     self.hashtags: list = entities.get("hashtags", [])
     self.user_mentions: list = entities.get("user_mentions", [])
     # The "urls" and "media" entity lists are deliberately not copied.
     self.text: str = self.get_text(status=status)
     self.retweet_count: int = status.retweet_count
     self.retweeted: bool = status.retweeted
     self.user_id: int = author.id
     self.profile_image_url: str = author.profile_image_url
     self.screen_name: str = author.screen_name
     self.possibly_sensitive: bool = getattr(
         status, "possibly_sensitive", False)
     self.favorite_count: int = status.favorite_count
     self.favorited: bool = status.favorited
     # Language is detected from the extracted text.
     self.lang: str = detect(self.text)
     # Starts empty; nothing in this constructor fills it.
     self.sentiment_analysis: dict = {}
     self.source: str = status.source
     self.geolocation: str = self.get_geolocation(status=status)
     self.place: dict = self.get_place(status=status)
     self.url: str = f"https://twitter.com/user/status/{self.id}"
     self.uuid: str = self.get_128_uuid(data_str=str(self.id))
     self.user_uuid: str = self.get_128_uuid(data_str=str(self.user_id))
Example #34
0
 def load_status():
     """Parse the sample-tweet cassette file into a ``Status`` and return it."""
     with open('./tests/cassettes/sample-tweet.json') as cassette:
         return Status.parse(api=None, json=load(cassette))
Example #35
0
	def update_tweets(self):
		print "Updating tweets"

		statuses = []
		try:
			while True:
				item = self.incoming.pop() # It's gonna throw up someday!
				if "in_reply_to_status_id" in item:
					statuses.append(Status.parse(self.stream.api, json.loads(item)))
				# Ignore anything other than status updates for now
				#else:
				#	statuses.append(json.loads(item))
		except IndexError:
			pass
		
		broadcast = {}
		broadcast['general'] = {}
		broadcast['channels'] = {}
				
		for s in statuses:
			tags = re.findall("#([\w]+)(?iu)", s.text) # Case-insensitive, Unicode matching
			print "Tags: "
			print tags
			self.db.execute("INSERT INTO tweets (id, user_id, screen_name, profile_image_url, created_at, text) VALUES (%s,%s,%s,%s,%s,%s)", s.id, s.user.id, s.user.screen_name, s.user.profile_image_url, s.created_at, s.text)

			# Establish HABTM relationships, tweets with tags
			for t in tags:
				t = t.lower() # Force all to lowercase
				print "Inserting tag: %s" % t
				self.db.execute('''INSERT INTO hashtags (tag) VALUES (%s) ON DUPLICATE KEY UPDATE id=LAST_INSERT_ID(id), tag=%s; 
					INSERT INTO hashtags_tweets (hash_id, tweet_id) VALUES (LAST_INSERT_ID(), %s)''', t, t, s.id)
				
				# Count the votes while we're at it
				if t in campboard['sessions']:
					
					# Attach the tweet to the broadcast channel
					if not broadcast['channels'].has_key(t):
						broadcast['channels'][t] = {}
					
					broadcast['channels'][t]['recent_tweets'] = []
					broadcast['channels'][t]['recent_tweets'].append(
						{
							'text': s.text, 'created_at': unicode(s.created_at), 'id': s.id,
							'user': {
								'id': s.user.id,
								'screen_name': s.user.screen_name,
								'profile_image_url': s.user.profile_image_url
							}				
						}
					)
			
					vote_type = None
					if re.search('\+1', s.text):
						#vote_type = "positive"
						self.db.execute("INSERT INTO session_votes (`session`, positive) VALUES (%s, 1) ON DUPLICATE KEY UPDATE positive=positive+1", t)
					elif re.search('\-1', s.text):
						#vote_type = "negative"
						self.db.execute("INSERT INTO session_votes (`session`, negative) VALUES (%s, 1) ON DUPLICATE KEY UPDATE negative=negative+1", t)
		

		broadcast['general']['recent_tweets'] = [
			{
				'text': s.text, 'created_at': unicode(s.created_at), 'id': s.id,
				'user': {
					'id': s.user.id,
					'screen_name': s.user.screen_name,
					'profile_image_url': s.user.profile_image_url
				}
			}
			for s in statuses
		]
	
		return broadcast
Example #36
0
def bulk_load(listkey, tweets):
    with open("/home/marcua/data/tweets/%s" % (listkey), "w") as tmpfile:
        print "file %s" % (tmpfile.name)
        for jsontweet in tweets:
            tweet = Status.parse(api, json.loads(jsontweet))
            tmpfile.write(convert_to_utf8_str(tweet.text) + "\n")
import unittest
import logging
import sys
from tweepy.models import Status
from TwitterWatcher.tweet_tracker import TweetTracker
from tests.database.mock_database import MockDatabase


# Fixture: a bare Status whose raw JSON describes a stand-alone tweet.
dummy_status = Status()
dummy_status._json = dict(
    id=1,
    id_str='1',
    text='test',
    user=dict(screen_name='test_user'),
)

# Fixture: a second Status whose raw JSON marks it as a reply to tweet 1.
dummy_reply = Status()
dummy_reply._json = dict(
    id=2,
    id_str='2',
    text='test reply',
    user=dict(screen_name='test_reply_user'),
    in_reply_to_status_id=1,
)

class TwitterWatcherDatabaseTests(unittest.TestCase):
		def setUp(self):
    def on_data(self, raw_data):
        """Count, decode and dispatch one raw stream payload.

        Each call increments ``self.count``. Once the count exceeds 50000
        the three CSV outputs (status, user, delete) are closed, the counter
        is reset and fresh timestamped files are opened with their header
        rows written.

        Override this method if you wish to manually handle the stream
        data. Returns False to stop stream and close connection: that
        happens when a handler returns False or the message type is
        unknown. Returns True otherwise.
        """
        self.count += 1

        data = json.loads(raw_data)

        if self.count > 50000:
            # Roll over to a new set of output files.
            for handle in (self.statusf, self.userf, self.deletef):
                handle.close()
            self.count = 0
            prefix = time.strftime("./data/%Y%m%d%H%M")

            status_columns = [
                'id', 'created_at', 'coordinates',
                'hashtags', 'user_mentions', 'symbols', 'urls',
                'media',
                'in_reply_to_screen_name',
                'in_reply_to_user_id_str',
                'in_reply_to_status_id_str',
                'place', 'retweeted_status_id', 'source',
                'text', 'user id',
            ]
            self.statusf = open(prefix + '_status.csv', 'w', newline='')
            self.statusw = csv.writer(self.statusf)
            self.statusw.writerow(status_columns)

            user_columns = [
                'created_at', 'default_profile',
                'description',
                'favourites_count',
                'followers_count', 'friends_count',
                'geo_enabled', 'id_str', 'is_translator',
                'lang', 'listed_count', 'location',
                'name',
                'protected', 'screen_name',
                'statuses_count', 'time_zone', 'user.url',
                'verified',
            ]
            self.userf = open(prefix + '_user.csv', 'w', newline='')
            self.userw = csv.writer(self.userf)
            self.userw.writerow(user_columns)

            self.deletef = open(prefix + '_delete.csv', 'w', newline='')
            self.deletew = csv.writer(self.deletef)
            self.deletew.writerow(['status_id', 'user_id'])

        # Pick the handler from the first recognised key; only an explicit
        # False from the handler stops the stream.
        if 'in_reply_to_status_id' in data:
            outcome = self.on_status(Status.parse(self.api, data))
        elif 'delete' in data:
            removed = data['delete']['status']
            outcome = self.on_delete(removed['id'], removed['user_id'])
        elif 'event' in data:
            outcome = self.on_event(Status.parse(self.api, data))
        elif 'limit' in data:
            outcome = self.on_limit(data['limit']['track'])
        elif 'disconnect' in data:
            outcome = self.on_disconnect(data['disconnect'])
        elif 'warning' in data:
            outcome = self.on_warning(data['warning'])
        else:
            logging.error("Unknown message type: " + str(raw_data))
            return False

        return outcome is not False
Example #39
0
    hashtag = 0
    url = 0
    question = 0
    exclamation = 0
    pos_term = 0
    neg_term = 0
    pos_emoticon = 0
    neg_emoticon = 0
    reply = 0
    moment_morning = 0
    moment_afternoon = 0
    moment_evening = 0
    moment_night = 0
    retweeted = 0

    status = Status.parse(api, tweet)

    if tweet['id'] in error_list_tweet_ids:
        tweets_discarded_error += 1
    elif tweet['text'].startswith("RT @"):
        tweets_discarded_retweet += 1
    else:
        tweets_considered += 1
        if regex_username.search(tweet['text']) != None:
            tweets_username += 1
            username = 1
        if regex_hashtag.search(tweet['text']) != None:
            tweets_hashtag += 1
            hashtag = 1
        if regex_url.search(tweet['text']) != None:
            tweets_url += 1
Example #40
0
 hashtag = 0
 url = 0
 question = 0
 exclamation = 0
 pos_term = 0
 neg_term = 0
 pos_emoticon = 0
 neg_emoticon = 0
 reply = 0
 moment_morning = 0
 moment_afternoon = 0
 moment_evening = 0
 moment_night = 0
 retweeted = 0
 
 status = Status.parse(api, tweet)
 
 if tweet['id'] in error_list_tweet_ids:
     tweets_discarded_error += 1
 elif tweet['text'].startswith("RT @"):
     tweets_discarded_retweet += 1     
 else:
     tweets_considered += 1
     if regex_username.search(tweet['text']) != None:
         tweets_username += 1
         username = 1
     if regex_hashtag.search(tweet['text']) != None:
         tweets_hashtag += 1
         hashtag = 1
     if regex_url.search(tweet['text']) != None:
         tweets_url += 1
Example #41
0
    def setUp(self):
        def load_status():
            """Parse the sample-tweet cassette file into a ``Status``."""
            with open('./tests/cassettes/sample-tweet.json') as cassette:
                return Status.parse(api=None, json=load(cassette))

        self._status = Status.parse(
            api=None,
            json={
                'created_at': 'Fri Dec 01 01:53:45 +0000 2017',
                'id': 936412976520876032,
                'id_str': '936412976520876032',
                'text': '@realDonaldTrump https://t.co/0BW86RBIRH',
                'display_text_range': [17, 40],
                'source':
                '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
                'truncated': False,
                'in_reply_to_status_id': 936395008139198464,
                'in_reply_to_status_id_str': '936395008139198464',
                'in_reply_to_user_id': 25073877,
                'in_reply_to_user_id_str': '25073877',
                'in_reply_to_screen_name': 'realDonaldTrump',
                'user': {
                    'id': 29363354,
                    'id_str': '29363354',
                    'name': 'Kate',
                    'screen_name': 'k8_doo',
                    'location': 'United States',
                    'url': None,
                    'description':
                    'Follow me if you want to know how far I walked, hiked or ran today for #charitymiles',
                    'translator_type': 'none',
                    'protected': False,
                    'verified': False,
                    'followers_count': 322,
                    'friends_count': 943,
                    'listed_count': 3,
                    'favourites_count': 26916,
                    'statuses_count': 3334,
                    'created_at': 'Tue Apr 07 02:56:52 +0000 2009',
                    'utc_offset': -18000,
                    'time_zone': 'Eastern Time (US & Canada)',
                    'geo_enabled': True,
                    'lang': 'en',
                    'contributors_enabled': False,
                    'is_translator': False,
                    'profile_background_color': 'EBEBEB',
                    'profile_background_image_url':
                    'http://abs.twimg.com/images/themes/theme7/bg.gif',
                    'profile_background_image_url_https':
                    'https://abs.twimg.com/images/themes/theme7/bg.gif',
                    'profile_background_tile': False,
                    'profile_link_color': '990000',
                    'profile_sidebar_border_color': 'DFDFDF',
                    'profile_sidebar_fill_color': 'F3F3F3',
                    'profile_text_color': '333333',
                    'profile_use_background_image': True,
                    'profile_image_url':
                    'http://pbs.twimg.com/profile_images/823305825297006593/LhjPdILK_normal.jpg',
                    'profile_image_url_https':
                    'https://pbs.twimg.com/profile_images/823305825297006593/LhjPdILK_normal.jpg',
                    'profile_banner_url':
                    'https://pbs.twimg.com/profile_banners/29363354/1485126381',
                    'default_profile': False,
                    'default_profile_image': False,
                    'following': None,
                    'follow_request_sent': None,
                    'notifications': None
                },
                'geo': None,
                'coordinates': None,
                'place': {
                    'bounding_box': {
                        'coordinates': [[1, 2], [3, 2, 1]]
                    }
                },
                'contributors': None,
                'quoted_status_id': 936379603651883008,
                'quoted_status_id_str': '936379603651883008',
                'quoted_status': {
                    'created_at': 'Thu Nov 30 23:41:09 +0000 2017',
                    'id': 936379603651883008,
                    'id_str': '936379603651883008',
                    'text':
                    'On the left: @BarackObama’s National Tree Lighting\nOn the right: @realDonaldTrump’s National Tree Lighting… https://t.co/PcsatAL7Lu',
                    'display_text_range': [0, 140],
                    'source':
                    '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
                    'truncated': True,
                    'in_reply_to_status_id': None,
                    'in_reply_to_status_id_str': None,
                    'in_reply_to_user_id': None,
                    'in_reply_to_user_id_str': None,
                    'in_reply_to_screen_name': None,
                    'user': {
                        'id': 329433192,
                        'id_str': '329433192',
                        'name': 'Jeremy Dickey',
                        'screen_name': 'JeremyDDickey',
                        'location': 'Washington, D.C.',
                        'url': 'https://medium.com/@JeremyDDickey',
                        'description':
                        'City Government Media Specialist. Aspiring CJ Cregg. Graduate of @MercyhurstU & @LCCLondon. RTs = you got my attention. Tweets are my own. Sarcasm also my own.',
                        'translator_type': 'none',
                        'protected': False,
                        'verified': False,
                        'followers_count': 1860,
                        'friends_count': 2452,
                        'listed_count': 129,
                        'favourites_count': 5864,
                        'statuses_count': 64253,
                        'created_at': 'Tue Jul 05 02:20:11 +0000 2011',
                        'utc_offset': -18000,
                        'time_zone': 'Eastern Time (US & Canada)',
                        'geo_enabled': True,
                        'lang': 'en',
                        'contributors_enabled': False,
                        'is_translator': False,
                        'profile_background_color': '1A1B1F',
                        'profile_background_image_url':
                        'http://pbs.twimg.com/profile_background_images/474534472373649408/gaee5mbF.png',
                        'profile_background_image_url_https':
                        'https://pbs.twimg.com/profile_background_images/474534472373649408/gaee5mbF.png',
                        'profile_background_tile': False,
                        'profile_link_color': '3B94D9',
                        'profile_sidebar_border_color': 'FFFFFF',
                        'profile_sidebar_fill_color': '252429',
                        'profile_text_color': '666666',
                        'profile_use_background_image': False,
                        'profile_image_url':
                        'http://pbs.twimg.com/profile_images/932429063280627713/HnHFID4p_normal.jpg',
                        'profile_image_url_https':
                        'https://pbs.twimg.com/profile_images/932429063280627713/HnHFID4p_normal.jpg',
                        'profile_banner_url':
                        'https://pbs.twimg.com/profile_banners/329433192/1443752276',
                        'default_profile': False,
                        'default_profile_image': False,
                        'following': None,
                        'follow_request_sent': None,
                        'notifications': None
                    },
                    'geo': None,
                    'coordinates': None,
                    'place': {
                        'id': '6417871953fa5e86',
                        'url':
                        'https://api.twitter.com/1.1/geo/id/6417871953fa5e86.json',
                        'place_type': 'city',
                        'name': 'Silver Spring',
                        'full_name': 'Silver Spring, MD',
                        'country_code': 'US',
                        'country': 'United States',
                        'bounding_box': {
                            'type':
                            'Polygon',
                            'coordinates': [[[-77.064086, 38.979735],
                                             [-77.064086, 39.036964],
                                             [-76.97162, 39.036964],
                                             [-76.97162, 38.979735]]]
                        },
                        'attributes': {}
                    },
                    'contributors': None,
                    'is_quote_status': False,
                    'extended_tweet': {
                        'full_text':
                        'On the left: @BarackObama’s National Tree Lighting\nOn the right: @realDonaldTrump’s National Tree Lighting #Christmas https://t.co/wYoLJRO2r6',
                        'display_text_range': [0, 117],
                        'entities': {
                            'hashtags': [{
                                'text': 'Christmas',
                                'indices': [107, 117]
                            }],
                            'urls': [],
                            'user_mentions': [{
                                'screen_name': 'BarackObama',
                                'name': 'Barack Obama',
                                'id': 813286,
                                'id_str': '813286',
                                'indices': [13, 25]
                            }, {
                                'screen_name': 'realDonaldTrump',
                                'name': 'Donald J. Trump',
                                'id': 25073877,
                                'id_str': '25073877',
                                'indices': [65, 81]
                            }],
                            'symbols': [],
                            'media': [{
                                'id': 936379576682450944,
                                'id_str': '936379576682450944',
                                'indices': [118, 141],
                                'media_url':
                                'http://pbs.twimg.com/media/DP6wQ4sWkAAvTDD.jpg',
                                'media_url_https':
                                'https://pbs.twimg.com/media/DP6wQ4sWkAAvTDD.jpg',
                                'url': 'https://t.co/wYoLJRO2r6',
                                'display_url': 'pic.twitter.com/wYoLJRO2r6',
                                'expanded_url':
                                'https://twitter.com/JeremyDDickey/status/936379603651883008/photo/1',
                                'type': 'photo',
                                'sizes': {
                                    'medium': {
                                        'w': 1200,
                                        'h': 800,
                                        'resize': 'fit'
                                    },
                                    'small': {
                                        'w': 680,
                                        'h': 453,
                                        'resize': 'fit'
                                    },
                                    'thumb': {
                                        'w': 150,
                                        'h': 150,
                                        'resize': 'crop'
                                    },
                                    'large': {
                                        'w': 1752,
                                        'h': 1168,
                                        'resize': 'fit'
                                    }
                                }
                            }, {
                                'id': 936379575839358977,
                                'id_str': '936379575839358977',
                                'indices': [118, 141],
                                'media_url':
                                'http://pbs.twimg.com/media/DP6wQ1jWAAE7CdA.jpg',
                                'media_url_https':
                                'https://pbs.twimg.com/media/DP6wQ1jWAAE7CdA.jpg',
                                'url': 'https://t.co/wYoLJRO2r6',
                                'display_url': 'pic.twitter.com/wYoLJRO2r6',
                                'expanded_url':
                                'https://twitter.com/JeremyDDickey/status/936379603651883008/photo/1',
                                'type': 'photo',
                                'sizes': {
                                    'small': {
                                        'w': 680,
                                        'h': 680,
                                        'resize': 'fit'
                                    },
                                    'thumb': {
                                        'w': 150,
                                        'h': 150,
                                        'resize': 'crop'
                                    },
                                    'medium': {
                                        'w': 1200,
                                        'h': 1200,
                                        'resize': 'fit'
                                    },
                                    'large': {
                                        'w': 2048,
                                        'h': 2048,
                                        'resize': 'fit'
                                    }
                                }
                            }]
                        },
                        'extended_entities': {
                            'media': [{
                                'id': 936379576682450944,
                                'id_str': '936379576682450944',
                                'indices': [118, 141],
                                'media_url':
                                'http://pbs.twimg.com/media/DP6wQ4sWkAAvTDD.jpg',
                                'media_url_https':
                                'https://pbs.twimg.com/media/DP6wQ4sWkAAvTDD.jpg',
                                'url': 'https://t.co/wYoLJRO2r6',
                                'display_url': 'pic.twitter.com/wYoLJRO2r6',
                                'expanded_url':
                                'https://twitter.com/JeremyDDickey/status/936379603651883008/photo/1',
                                'type': 'photo',
                                'sizes': {
                                    'medium': {
                                        'w': 1200,
                                        'h': 800,
                                        'resize': 'fit'
                                    },
                                    'small': {
                                        'w': 680,
                                        'h': 453,
                                        'resize': 'fit'
                                    },
                                    'thumb': {
                                        'w': 150,
                                        'h': 150,
                                        'resize': 'crop'
                                    },
                                    'large': {
                                        'w': 1752,
                                        'h': 1168,
                                        'resize': 'fit'
                                    }
                                }
                            }, {
                                'id': 936379575839358977,
                                'id_str': '936379575839358977',
                                'indices': [118, 141],
                                'media_url':
                                'http://pbs.twimg.com/media/DP6wQ1jWAAE7CdA.jpg',
                                'media_url_https':
                                'https://pbs.twimg.com/media/DP6wQ1jWAAE7CdA.jpg',
                                'url': 'https://t.co/wYoLJRO2r6',
                                'display_url': 'pic.twitter.com/wYoLJRO2r6',
                                'expanded_url':
                                'https://twitter.com/JeremyDDickey/status/936379603651883008/photo/1',
                                'type': 'photo',
                                'sizes': {
                                    'small': {
                                        'w': 680,
                                        'h': 680,
                                        'resize': 'fit'
                                    },
                                    'thumb': {
                                        'w': 150,
                                        'h': 150,
                                        'resize': 'crop'
                                    },
                                    'medium': {
                                        'w': 1200,
                                        'h': 1200,
                                        'resize': 'fit'
                                    },
                                    'large': {
                                        'w': 2048,
                                        'h': 2048,
                                        'resize': 'fit'
                                    }
                                }
                            }]
                        }
                    },
                    'quote_count': 56,
                    'reply_count': 44,
                    'retweet_count': 326,
                    'favorite_count': 385,
                    'entities': {
                        'hashtags': [],
                        'urls': [{
                            'url': 'https://t.co/PcsatAL7Lu',
                            'expanded_url':
                            'https://twitter.com/i/web/status/936379603651883008',
                            'display_url': 'twitter.com/i/web/status/9…',
                            'indices': [108, 131]
                        }],
                        'user_mentions': [{
                            'screen_name': 'BarackObama',
                            'name': 'Barack Obama',
                            'id': 813286,
                            'id_str': '813286',
                            'indices': [13, 25]
                        }, {
                            'screen_name': 'realDonaldTrump',
                            'name': 'Donald J. Trump',
                            'id': 25073877,
                            'id_str': '25073877',
                            'indices': [65, 81]
                        }],
                        'symbols': []
                    },
                    'favorited': False,
                    'retweeted': False,
                    'possibly_sensitive': False,
                    'filter_level': 'low',
                    'lang': 'en'
                },
                'is_quote_status': True,
                'quote_count': 0,
                'reply_count': 0,
                'retweet_count': 0,
                'favorite_count': 0,
                'entities': {
                    'hashtags': [],
                    'urls': [{
                        'url': 'https://t.co/0BW86RBIRH',
                        'expanded_url':
                        'https://twitter.com/jeremyddickey/status/936379603651883008',
                        'display_url': 'twitter.com/jeremyddickey/…',
                        'indices': [17, 40]
                    }],
                    'user_mentions': [{
                        'screen_name': 'realDonaldTrump',
                        'name': 'Donald J. Trump',
                        'id': 25073877,
                        'id_str': '25073877',
                        'indices': [0, 16]
                    }],
                    'symbols': []
                },
                'favorited': False,
                'retweeted': False,
                'possibly_sensitive': False,
                'filter_level': 'low',
                'lang': 'und',
                'timestamp_ms': '1512093225971'
            })

        self._status_backup = deepcopy(self._status)
Example #42
0
from tweepy.models import Status

from teebr.text.utils import normalize_text
from teebr.features import filter_status

CLUSTERS = 40
DIMS = 100

# Normalized text of every tweet from the dump that passes ``filter_status``.
tweets = []

#tw_count = 0

# Every line of the dump is decoded with ``loads`` and parsed into a Status
# (no API object is attached, hence the ``None``).
with open("raw_tweets.jsons") as f:
    for line in f:
        t = Status.parse(None, loads(line))
        if filter_status(t):
            tweets.append(normalize_text(t.text))
            #tw_count += 1
            #if tw_count >= 2000:
            #    break

# less tweets for the tests
#tweets = tweets[:10000]

# Parenthesized form: a plain ``print`` statement is a SyntaxError on
# Python 3, while this single-argument call prints the same on Python 2.
print("tweets: %d" % len(tweets))

#hasher = HashingVectorizer(stop_words='english', non_negative=True, norm=None)
#vectorizer = make_pipeline(hasher, TfidfTransformer())
Example #43
0
def gen_tuple(jsontweet):
    """Convert one JSON-encoded tweet into a flat tuple.

    jsontweet: JSON text of a single tweet.

    Returns ``(author id, created_at, text, retweeted)``.  The text is passed
    through ``convert_to_utf8_str``; ``retweeted`` is True when the parsed
    status has a ``retweeted_status`` attribute that is not None.
    """
    tweet = Status.parse(api, json.loads(jsontweet))
    # Identity test on purpose: ``!= None`` would be routed through the
    # object's overloaded comparison operators.
    is_retweet = getattr(tweet, 'retweeted_status', None) is not None
    return (tweet.author.id, tweet.created_at,
            convert_to_utf8_str(tweet.text), is_retweet)
    def on_data(self, data):
        '''Parse raw data from twitter and pass the status object to on_status()

        Called when raw data is passed from twitter.
        If this function returns False, it stops listening to the stream.

        data: raw JSON text of one stream message.

        gSave_raw_json: if true, write json raw text to the ../json/
                        Set it to true only if you would like to debug.

        Returns False only when self.running == False (see
        StreamingCrawler.stop_listner()).  Returns True in every other case,
        including after an error, which is logged instead of being raised.
        '''
        try:
            # Signals that we are in the middle of processing data; it is
            # cleared again in the ``finally`` clause on every exit path.
            self.on_data_running = True
            self.log("Get raw data from Twitter", screen_only=True)

            if gSave_raw_json:
                ### save the json into disk ###
                parsed_data = tweepy.utils.import_simplejson().loads(data)

                # May be {"limit":{"track":73}} or {delete...}: such data has
                # no "id", so there is nothing to save and nothing to parse.
                if "id" not in parsed_data:
                    return True

                tweet_id = parsed_data["id"]
                # Files are spread over sub-folders named after id % 1000.
                folder = "../json/" + str(tweet_id % 1000)

                try:
                    if not os.path.exists(folder):
                        os.makedirs(folder)
                except OSError:
                    # Best effort: if the folder really is unusable, the
                    # open() below fails and is reported by the outer handler.
                    self.log("OS ERROR")

                filename = folder + "/" + str(tweet_id) + ".json"
                # ``with`` closes the file even when a write fails.
                with open(filename, "w") as output:
                    output.write(data)
                    output.write('\n')
                ### done ###

            # Chucheng 4/25/2011:
            #   We must override the method, because the original one might
            #   return False and thereby stop the listener.
            #   In short, you cannot simply call:
            #       tweepy.StreamListener.on_data(self, data)
            if 'in_reply_to_status_id' in data:
                status = Status.parse(self.api, json.loads(data))
                if self.on_status(status) is False:  # Trigger on_status now!!
                    self.log('in_reply_to_status_id in data: on_status() returns False. (this line should never be reached)')
            # Anything else (e.g. 'delete' or 'limit' notices) is not what we
            # need, so on_delete()/on_limit() are deliberately never called.

            if self.running == False:  # see: StreamingCrawler.stop_listner()
                return False  # stop the listener while catching a SIGTERM

        except Exception as e:
            # sys.exc_traceback no longer exists on Python 3; sys.exc_info()
            # gives the same traceback object on both Python 2 and 3.
            self.log("Error:" + str(e), sys.exc_info()[2])
        finally:
            self.on_data_running = False

        return True
Example #45
0
 def __init__(self, status: Status):
     """Copy the fields of interest from ``status`` onto this object.

     Attributes that may be missing from ``status`` fall back to a default:
     ``quote_count`` and ``reply_count`` to 0, ``possibly_sensitive`` to
     False and ``coordinates`` to ``{}``.  ``text``, ``place`` and the two
     uuid fields come from this class's ``get_text``, ``get_place`` and
     ``get_128_uuid`` helpers; ``lang`` is the result of ``detect`` on the
     extracted text.

     Args:
         status: the ``Status`` object to read from.

     Raises:
         AttributeError: if ``status`` lacks one of the attributes read
             without a default (e.g. ``created_at``, ``id``, ``entities``).
     """
     self.created_at: datetime = get_datetime_from_date(status.created_at)
     self.id: int = status.id
     entities = status.entities
     self.hashtags: list = entities.get("hashtags", [])
     self.user_mentions: list = entities.get("user_mentions", [])
     self.urls: list = entities.get("urls", [])
     self.media: list = entities.get("media", [])
     self.is_quote_status: bool = status.is_quote_status
     self.quote_count: int = getattr(status, "quote_count", 0)
     self.text: str = self.get_text(status=status)
     self.retweet_count: int = status.retweet_count
     self.retweeted: bool = status.retweeted
     self.user_id: int = status.user.id
     self.possibly_sensitive: bool = getattr(
         status, "possibly_sensitive", False)
     self.favorite_count: int = status.favorite_count
     self.favorited: bool = status.favorited
     self.lang: str = detect(self.text)
     self.url: str = f"https://twitter.com/user/status/{status.id}"
     self.sentiment_analysis: dict = {}
     self.source: str = status.source
     self.coordinates: dict = getattr(status, "coordinates", {})
     self.place: dict = self.get_place(status=status)
     # Defaulted on its own presence; the guard used to test 'quote_count',
     # which raised AttributeError whenever only reply_count was missing.
     self.reply_count: int = getattr(status, "reply_count", 0)
     self.uuid: str = self.get_128_uuid(data_str=str(self.id))
     self.user_uuid: str = self.get_128_uuid(data_str=str(self.user_id))
Example #46
0
    # Indicator variables start at 0; the checks further down set one to 1
    # when the tweet text matches (``username`` is presumably initialized
    # just above this excerpt -- TODO confirm).
    hashtag = 0
    url = 0
    question = 0
    exclamation = 0
    pos_term = 0
    neg_term = 0
    pos_emoticon = 0
    neg_emoticon = 0
    reply = 0
    moment_morning = 0
    moment_afternoon = 0
    moment_evening = 0
    moment_night = 0
    retweeted = 0

    # tweet[0] holds the JSON text, which is decoded and parsed into a Status.
    status = Status.parse(api, json.loads(tweet[0]))

    if status.id in error_list_tweet_ids:
        # Ids listed in error_list_tweet_ids only bump the error-discard counter.
        tweets_discarded_error += 1
    elif status.text.startswith("RT @"):
        # Text starting with "RT @" is counted as a discarded retweet.
        tweets_discarded_retweet += 1
    else:
        # Every other tweet is counted as considered and run through the
        # regex checks, each bumping its own counter on a match.
        tweets_considered += 1
        if regex_username.search(status.text) != None:
            tweets_username += 1
            username = 1
        if regex_hashtag.search(status.text) != None:
            tweets_hashtag += 1
            hashtag = 1
        if regex_url.search(status.text) != None:
            tweets_url += 1
Example #47
0
def test_skip_check():
    """With an empty skip list, the filter passes a '#nowplaying' tweet."""
    check = skip_check([])
    status = Status()
    status.text = 'This is a test #nowplaying'
    outcome = check(status)
    assert outcome is True
Example #48
0
def test_skip_check_custom(text, passed):
    """The filter built from a custom skip list returns ``passed`` for ``text``."""
    check = skip_check(['#nowplaying', '@boring'])
    status = Status()
    status.text = text
    outcome = check(status)
    assert outcome is passed
Example #49
0
def bulk_load(listkey, tweets):
    """Write the text of every JSON tweet to a per-list file, one per line.

    listkey: name of the output file under /home/marcua/data/tweets/.
    tweets: iterable of JSON-encoded tweets.
    """
    with open('/home/marcua/data/tweets/%s' % (listkey), 'w') as tmpfile:
        # Parenthesized form: a plain ``print`` statement is a SyntaxError on
        # Python 3, while this single-argument call prints the same on
        # Python 2.
        print("file %s" % (tmpfile.name))
        for jsontweet in tweets:
            tweet = Status.parse(api, json.loads(jsontweet))
            tmpfile.write(convert_to_utf8_str(tweet.text) + "\n")
Example #50
0
    def save_tweets(self):
        """Consume raw stream messages from ``self.q`` forever and store matches.

        Each message is decoded from JSON; messages without an
        'in_reply_to_status_id' key are skipped.  For a retweet the text of
        the retweeted status is used.  For every entry of
        ``self.keyword_obj_list`` whose keyword occurs (case-insensitively)
        in the tweet text or in the quoted tweet's text, a ``Tweet`` row is
        created, followed by one ``Knowledge`` row per triple returned by
        ``knowledge_graph_extract.extract_entity``.

        The loop has no exit condition, so this method never returns.
        ``self.q.task_done()`` is called once per message taken off the queue.
        """

        def full_text(tweet):
            # Use extended_tweet['full_text'] when the attribute exists,
            # otherwise fall back to the plain text attribute.
            if hasattr(tweet, 'extended_tweet'):
                return tweet.extended_tweet['full_text']
            return tweet.text

        while True:
            raw_data = self.q.get()

            data = json.loads(raw_data)

            if 'in_reply_to_status_id' in data:
                status = Status.parse(self.api, data)

                if hasattr(status, 'retweeted_status'):
                    retweeted_id = status.retweeted_status.id
                    text = full_text(status.retweeted_status)
                else:
                    retweeted_id = 0
                    text = full_text(status)

                if hasattr(status, "quoted_status"):
                    quoted_id = status.quoted_status.id
                    quoted_text = full_text(status.quoted_status)
                else:
                    quoted_id = 0
                    quoted_text = ""

                # Lower-cased once; both are invariant across the keyword loop.
                text_lower = text.lower()
                quoted_lower = quoted_text.lower()

                for keyword_obj in self.keyword_obj_list:
                    keyword = keyword_obj.keyword
                    needle = keyword.lower()

                    if needle not in text_lower and needle not in quoted_lower:
                        continue

                    tweet_obj = Tweet.objects.create(
                        keyword=keyword_obj,
                        tweet_id=status.id,
                        created_at=make_aware(status.created_at),
                        user_id=status.user.id,
                        retweeted_id=retweeted_id,
                        quoted_id=quoted_id,
                        text=text,
                        quoted_text=quoted_text)

                    # The language is detected from the keyword itself.
                    lang = detect(keyword)

                    # Keep the pre-processed text in its own variable.
                    # Overwriting ``text`` here made every later keyword get
                    # matched against, and stored with, the pre-processed text
                    # instead of the original tweet text.
                    if lang == 'en':
                        extract_input = text_utils.pre_process(text)
                    else:
                        extract_input = text

                    triple_list = knowledge_graph_extract.extract_entity(
                        extract_input, lang=lang)
                    for triple in triple_list:
                        Knowledge.objects.create(tweet=tweet_obj,
                                                 k_subject=triple[0],
                                                 k_predicate=triple[1],
                                                 k_object=triple[2],
                                                 subject_type=triple[3],
                                                 object_type=triple[4])

            self.q.task_done()
 def on_data(self, data):
     """Decode one raw JSON message into a Status and pass it to ``self.handler``."""
     payload = json.loads(data)
     self.handler(Status.parse(tweepy_api, payload))
Example #52
0
 def parse_tweet(tweet):
     """Build a tweepy object from a JSON tweet and set its missing author."""
     parsed = Status.parse(self.api, tweet)
     # The author is not in the payload, so it is filled in with current_user.
     parsed.author = current_user
     return parsed
Example #53
0
    def on_data(self, raw_data):
        """Called when raw data is received from connection.

        Override this method if you wish to manually handle
        the stream data. Return False to stop stream and close connection.

        """
        data = json.loads(raw_data)

        if 'in_reply_to_status_id' in data:
            status = Status.parse(self.api, data)
            if self.on_status(status) is False:
                return False

        elif 'delete' in data:
            delete = data['delete']['status']
            if self.on_delete(delete['id'], delete['user_id']) is False:
                return False

        elif 'event' in data:
            status = Status.parse(self.api, data)
            if self.on_event(status) is False:
                return False

        elif 'direct_message' in data:
            status = Status.parse(self.api, data)
            if self.on_direct_message(status) is False:
                return False

        elif 'friends' in data:
            if self.on_friends(data['friends']) is False:
                return False

        elif 'limit' in data:
            if self.on_limit(data['limit']['track']) is False:
                return False

        elif 'disconnect' in data:
            if self.on_disconnect(data['disconnect']) is False:
                return False

        elif 'warning' in data:
            if self.on_warning(data['warning']) is False:
                return False

        else:
            return False

        # If this tweet contains text.
        if "user" in list(data.keys()):

            # --------------------------------------------------------------- #
            # Stupid print for fun.
            uname = data["user"]["screen_name"]
            umsg = data["text"]
            nspc = (20 - len(uname))
            if nspc < 1:
                nspc = 1
            spc = " " * nspc
            if not umsg.startswith("RT"):
                print("<tweet>", uname, spc, umsg.replace("\n", ""))
            # --------------------------------------------------------------- #

            # Write the tweet to the buffer.
            self.buffer.write(raw_data)

            # Running counter.
            self.count += 1

            # If the buffer is full, then cycle the buffer.
            if self.count % self.save_interval == 0:
                self.swap_buffer()

            # If the counter is a check-in interval, do all the check-in tasks.
            if self.count % check_in_interval == 0:

                # Shutdown if the `runtime` `run` value is False.
                if checkin_killstream():
                    return False

                # pause if there are too many files in the new tweet directory.
                if not checkin_pausestream():
                    return False