Exemple #1
    def on_data(self, raw_data):
        """Called when raw data is received from connection.

        Override this method if you wish to manually handle
        the stream data. Return False to stop stream and close connection.

        The payload is HTML-unescaped, decoded as JSON and routed to the
        handler for the first recognised top-level key; a payload with no
        recognised key is logged as an unknown message type.

        :param raw_data: JSON text received from the connection.
        :returns: ``False`` when the selected handler returned ``False``,
            otherwise ``None``.
        """
        try:
            from html import unescape
        except ImportError:
            # Interpreters without the ``html`` module still offer the
            # parser method (it was removed in Python 3.9).
            unescape = HTMLParser().unescape
        data = json.loads(unescape(raw_data))

        if 'in_reply_to_status_id' in data:
            outcome = self.on_status(Status.parse(self.api, data))
        elif 'delete' in data:
            deleted = data['delete']['status']
            outcome = self.on_delete(deleted['id'], deleted['user_id'])
        elif 'event' in data:
            outcome = self.on_event(Status.parse(self.api, data))
        elif 'direct_message' in data:
            outcome = self.on_direct_message(Status.parse(self.api, data))
        elif 'friends' in data:
            outcome = self.on_friends(data['friends'])
        elif 'limit' in data:
            outcome = self.on_limit(data['limit']['track'])
        elif 'disconnect' in data:
            outcome = self.on_disconnect(data['disconnect'])
        else:
            logging.error("Unknown message type: " + str(raw_data))
            outcome = None

        # Only an explicit False from a handler stops the stream.
        if outcome is False:
            return False
Exemple #2
    def on_data(self, raw_data):
        """Called when raw data is received from connection.

        Override this method if you wish to manually handle
        the stream data. Return False to stop stream and close connection.

        :param raw_data: JSON text received from the connection; it is
            HTML-unescaped before being decoded.
        :returns: ``False`` if the handler chosen for the message returned
            ``False``; ``None`` in every other case.
        """
        try:
            from html import unescape as _unescape
        except ImportError:
            # ``HTMLParser.unescape`` no longer exists on Python 3.9+, so it
            # is only used where the ``html`` module is unavailable.
            _unescape = HTMLParser().unescape
        data = json.loads(_unescape(raw_data))

        if 'in_reply_to_status_id' in data:
            status = Status.parse(self.api, data)
            if self.on_status(status) is False:
                return False
        elif 'delete' in data:
            delete = data['delete']['status']
            if self.on_delete(delete['id'], delete['user_id']) is False:
                return False
        elif 'event' in data:
            status = Status.parse(self.api, data)
            if self.on_event(status) is False:
                return False
        elif 'direct_message' in data:
            status = Status.parse(self.api, data)
            if self.on_direct_message(status) is False:
                return False
        elif 'limit' in data:
            if self.on_limit(data['limit']['track']) is False:
                return False
        elif 'disconnect' in data:
            if self.on_disconnect(data['disconnect']) is False:
                return False
        else:
            # No recognised top-level key.
            logging.error("Unknown message type: " + str(raw_data))
Exemple #3
    def on_data(self, raw_data):
        """Called when raw data is received from connection.

        Override this method if you wish to manually handle
        the stream data. Return False to stop stream and close connection.

        :param raw_data: JSON text received from the connection.
        :returns: ``False`` when the selected handler returned ``False``,
            ``True`` when an ``IncompleteRead`` was raised while handling
            the message, otherwise ``None``.
        """
        try:
            data = json.loads(raw_data)

            if 'in_reply_to_status_id' in data:
                verdict = self.on_status(Status.parse(self.api, data))
            elif 'delete' in data:
                removed = data['delete']['status']
                verdict = self.on_delete(removed['id'], removed['user_id'])
            elif 'event' in data:
                verdict = self.on_event(Status.parse(self.api, data))
            elif 'direct_message' in data:
                verdict = self.on_direct_message(Status.parse(self.api, data))
            elif 'friends' in data:
                verdict = self.on_friends(data['friends'])
            elif 'limit' in data:
                verdict = self.on_limit(data['limit']['track'])
            elif 'disconnect' in data:
                verdict = self.on_disconnect(data['disconnect'])
            elif 'warning' in data:
                verdict = self.on_warning(data['warning'])
            elif 'scrub_geo' in data:
                verdict = self.on_scrub_geo(data['scrub_geo'])
            elif 'status_withheld' in data:
                verdict = self.on_status_withheld(data['status_withheld'])
            elif 'user_withheld' in data:
                verdict = self.on_user_withheld(data['user_withheld'])
            else:
                insert_logger.error("Unknown message type: %s", raw_data)
                verdict = None

            if verdict is False:
                return False
        except IncompleteRead:
            # A truncated read is swallowed and reported as "keep going".
            return True
Exemple #4
    def on_data(self, raw_data):
        """Decode a raw stream message and route it to the matching handler.

        Messages authored by ``self.my_screen_name`` are ignored.  Before
        dispatching, ``data['tweet_text']`` is filled from the extended
        tweet's ``full_text``, falling back to ``text`` and finally to an
        empty string.

        :param raw_data: JSON text received from the connection.
        :returns: ``True`` for the listener's own messages, ``False`` when
            the selected handler returned ``False``, otherwise ``None``.
        """
        data = json.loads(raw_data)

        # Non-tweet messages (delete, limit, ...) carry no 'user' object, so
        # the author lookup must not raise for them.
        author = data.get('user') or {}
        if 'screen_name' in author and author['screen_name'] == self.my_screen_name:
            return True

        try:
            tweet_text = data['extended_tweet']['full_text']
        except KeyError:
            tweet_text = data.get('text', u'')
        data['tweet_text'] = tweet_text

        if 'retweeted_status' in data:
            self.logger.info('retweet detected')
            verdict = self.on_status(Status.parse(self.api, data), is_retweet=True)
        elif 'in_reply_to_status_id' in data:
            verdict = self.on_status(Status.parse(self.api, data))
        elif 'delete' in data:
            removed = data['delete']['status']
            verdict = self.on_delete(removed['id'], removed['user_id'])
        elif 'event' in data:
            verdict = self.on_event(Status.parse(self.api, data))
        elif 'direct_message' in data:
            verdict = self.on_direct_message(Status.parse(self.api, data))
        elif 'friends' in data:
            verdict = self.on_friends(data['friends'])
        elif 'limit' in data:
            verdict = self.on_limit(data['limit']['track'])
        elif 'disconnect' in data:
            verdict = self.on_disconnect(data['disconnect'])
        elif 'warning' in data:
            verdict = self.on_warning(data['warning'])
        else:
            self.logger.error('Unknown message type: %s', str(raw_data))
            verdict = None

        if verdict is False:
            return False
Exemple #5
    def on_data(self, raw_data):
        """This is called when raw data is received from the stream.
        This method handles sending the data to other methods, depending on the
        message type.

        :param raw_data: JSON text received from the stream.
        :returns: whatever the selected ``on_*`` handler returns, or ``None``
            after logging an unrecognised message type.
        """
        data = json.loads(raw_data)

        if "in_reply_to_status_id" in data:
            return self.on_status(Status.parse(None, data))
        if "delete" in data:
            removed = data["delete"]["status"]
            return self.on_delete(removed["id"], removed["user_id"])
        if "disconnect" in data:
            return self.on_disconnect_message(data["disconnect"])
        if "limit" in data:
            return self.on_limit(data["limit"]["track"])
        if "scrub_geo" in data:
            return self.on_scrub_geo(data["scrub_geo"])
        if "status_withheld" in data:
            return self.on_status_withheld(data["status_withheld"])
        if "user_withheld" in data:
            return self.on_user_withheld(data["user_withheld"])
        if "warning" in data:
            return self.on_warning(data["warning"])

        log.error("Received unknown message type: %s", raw_data)
Exemple #6
 def on_data(self, data):
     """Count incoming tweets by language until the sampling window ends.

     Once ``self.duration`` seconds have passed since ``self.started`` a
     ``<started>-sample.stats`` file is written and ``False`` is returned.
     Before that, every status message is classified with ``langid`` and
     tallied in ``self.counter``.

     :param data: raw JSON text received from the stream.
     :returns: ``False`` once the window has elapsed, ``True`` after a
         status was counted, ``None`` for any other message.
     """
     if time.time() >= self.started + self.duration:
         # ``with`` guarantees the report is flushed and closed.
         with open('{0}-sample.stats'.format(int(self.started)), 'w+') as stats:
             stats.write("================= STATISTICS =================" + "\n")
             stats.write("Start time: " + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(self.started)) + "\n")
             stats.write("End time: " + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) + "\n")
             stats.write("First Tweet ID: " + self.first_tweet_id + "\n")
             stats.write("Last Tweet ID: " + self.last_tweet_id + "\n")
             stats.write("Language: " + self.lang + "\n")
             stats.write("Language classification threshold: " + str(self.lang_threshold) + "\n")
             stats.write("Above threshold: " + str(self.counter[self.lang + '-above']) + "\n")
             stats.write("Below threshold: " + str(self.counter[self.lang + '-below']) + "\n")
             stats.write("Exluded: " + str(self.counter['excluded']) + "\n")
         return False
     elif 'in_reply_to_status_id' in data:
         status = Status.parse(self.api, json.loads(data))
         langclass = langid.classify(status.text)
         # All counters still at zero means this is the first status seen.
         if self.counter == {self.lang + '-above': 0, self.lang + '-below': 0, 'excluded': 0}:
             self.first_tweet_id = str(status.id)
         self.last_tweet_id = str(status.id)
         if langclass[0] == self.lang:
             # Exactly one bucket is incremented per status.
             if langclass[1] >= self.lang_threshold:
                 self.counter[self.lang + '-above'] += 1
             else:
                 self.counter[self.lang + '-below'] += 1
         else:
             self.counter['excluded'] += 1
         return True
Exemple #7
    def on_data(self, raw_data):
        """Called when raw data is received from connection.

        Override this method if you wish to manually handle
        the stream data. Return False to stop stream and close connection.

        :param raw_data: JSON text received from the connection.
        :returns: the result of the selected ``on_*`` handler, or ``None``
            after logging an unknown message type.
        """
        data = json.loads(raw_data)

        if 'in_reply_to_status_id' in data:
            return self.on_status(Status.parse(self.api, data))
        if 'delete' in data:
            removed = data['delete']['status']
            return self.on_delete(removed['id'], removed['user_id'])
        if 'limit' in data:
            return self.on_limit(data['limit']['track'])

        # The remaining message types hand their payload to ``on_<key>``
        # unchanged; the tuple order is the precedence between them.
        passthrough = ('disconnect', 'warning', 'scrub_geo',
                       'status_withheld', 'user_withheld')
        for key in passthrough:
            if key in data:
                return getattr(self, 'on_' + key)(data[key])

        log.error("Unknown message type: %s", raw_data)
Exemple #8
    def on_data(self, data):
        """Called when raw data is received from connection.

        Override this method if you wish to manually handle
        the stream data. Return False to stop stream and close connection.

        The message type is recognised from markers in the raw text before
        the JSON is decoded.

        :param data: raw JSON text received from the connection.
        :returns: ``False`` when the selected handler returned ``False``,
            otherwise ``None``.
        """
        if '{"delete"' in data:
            # NOTE(review): the original selector between the two delete
            # kinds was lost; choosing on the key present in the payload.
            removed = json.loads(data)['delete']
            if 'status' in removed:
                target = removed['status']
                if self.on_delete(target['id'], target['user_id']) is False:
                    return False
            else:
                target = removed['direct_message']
                if self.on_direct_message_delete(target['id'], target['user_id']) is False:
                    return False
        elif '{"direct_message"' in data:
            message = DirectMessage.parse(self.api, json.loads(data)['direct_message'])
            if self.on_direct_message(message) is False:
                return False
        elif '{"target"' in data:
            event = json.loads(data)
            if self.on_event(event) is False:
                return False
        elif '{"limit"' in data:
            if self.on_limit(json.loads(data)['limit']['track']) is False:
                return False
        elif '"in_reply_to_user_id_str"' in data:
            status = Status.parse(self.api, json.loads(data))
            if self.on_status(status) is False:
                return False
Exemple #9
    def on_data(self, data):
        """Decode ``data`` with ``self.json.loads``, build a ``Status`` and pass it to ``self._print_status``."""

        # NOTE(review): the body of this ``if`` is missing from the snippet;
        # it cannot be told from here what ``print_data`` is meant to guard.
        if self.print_data:

        self._print_status(Status.parse(self.api, self.json.loads(data)))
Exemple #10
def test_end_to_end(filename, connections, expected, tmpdir):
    """Check the listener's posted update and follow state for one fixture.

    :param filename: path of a JSON file holding the status to parse.
    :param connections: mapping handed to ``MockAPI`` as the initial
        connection state.
    :param expected: text of the single expected update, or ``None`` when
        no update must be posted.
    :param tmpdir: directory used as the listener's ``state_dir``.
    """
    api = MockAPI(connections=connections)

    with open(filename, 'r') as f:
        status = Status.parse(api, json.load(fp=f))

    listener = LessListener(api=api, post_replies=True, gather='tweets', state_dir=str(tmpdir))

    # 100% festivity for all of December
    listener.december_greetings = ('It is cold outside.',)
    listener.festive_probability = 1.
    assert listener.get_festive_probability(dt.date(2016, 12, 5)) == 1.

    # NOTE(review): the statements that fed ``status`` to the listener under
    # the two headings below are missing from this snippet.

    # Never reply to the same toot twice

    # Rate-limit replies for same word
    status.id = status.id + 1

    if expected is None:
        assert api._updates == []
    else:
        assert len(api._updates) == 1
        u = api._updates[0]
        assert u['status'] == expected

    for k, before in connections.items():
        after = api._connections[k]
        assert ('following' in after) == ('followed_by' in before), \
            (k, before, after)
 def process(self, tweet):
     """Print a one-line summary of a tweet.

     Every entry of ``UNICODE_LINES`` found in the tweet text is replaced
     by a space before printing.

     :param tweet: JSON text describing a single status.
     """
     status = Status.parse(api, json.loads(tweet))
     # Accumulate the replacements; restarting from ``status.text`` on each
     # pass would keep only the last one (and leave ``text`` unbound when
     # ``UNICODE_LINES`` is empty).
     text = status.text
     for lf in UNICODE_LINES:
         text = text.replace(lf, ' ')
     # A single parenthesised argument prints the same on Python 2 and 3.
     print("@%s (%s, %s, %s, %s): %s" % (
         status.user.screen_name,
         status.user.lang,
         status.user.statuses_count,
         status.user.friends_count,
         status.user.followers_count,
         text,
     ))
Exemple #12
 def on_data(self, data):
     """Parse the message as a ``Tweet`` when it mentions ``testeMagazine``.

     :param data: raw JSON text received from the stream.
     """
     # Cheap textual pre-filter before paying for JSON decoding.
     if "entities" not in data:
         return
     payload = json.loads(data)
     mentioned = [entry["screen_name"]
                  for entry in payload["entities"]["user_mentions"]]
     if "testeMagazine" not in mentioned:
         return
     status = Tweet.parse(self.api, payload)
Exemple #13
def get(name, mx=-1):
    """Load statuses from the line-delimited JSON file for ``name``.

    The file path is ``_prefix + name + _suffix``; each line is decoded and
    parsed into a ``Status``.

    :param name: middle part of the file name.
    :param mx: when positive, stop reading once the line index exceeds it;
        the default ``-1`` reads the whole file.
    :returns: list of parsed statuses in file order.
    """
    ss = []
    with open("%s%s%s" % (_prefix, name, _suffix)) as f:
        for i, line in enumerate(f):
            # NOTE(review): ``i > mx`` lets ``mx + 1`` lines through; kept
            # as written, confirm whether ``>=`` was intended.
            if mx > 0 and i > mx:
                break
            ss.append(Status.parse(None, loads(line)))
    return ss
Exemple #14
  def on_data(self, raw_data):
    """Called when raw data is received from connection.

    This is where all the data comes first. Normally we could use (inherit)
    the on_data() in tweepy.StreamListener, but it unnecessarily and naively
    reports unknown event types as errors (to simple log); also, we might want
    to tweak it further later on.

    But for now, this is basically taken from tweepy's on_data().

    Return False to stop stream and close connection.

    ``self.processing_data`` is True while a message is being handled and is
    reset on every exit path.
    """
    self.processing_data = True
    try:
      data = json.loads(raw_data)

      if 'in_reply_to_status_id' in data:
        status = Status.parse(self.api, data)
        if self.on_status(status) is False:
          return False
      elif 'delete' in data:
        delete = data['delete']['status']
        if self.on_delete(delete['id'], delete['user_id']) is False:
          return False
      elif 'event' in data:
        status = Status.parse(self.api, data)
        if self.on_event(status) is False:
          return False
      elif 'direct_message' in data:
        status = Status.parse(self.api, data)
        if self.on_direct_message(status) is False:
          return False
      elif 'limit' in data:
        if self.on_limit(data['limit']['track']) is False:
          return False
      elif 'disconnect' in data:
        if self.on_disconnect(data['disconnect']) is False:
          return False
      else:
        log.debug('TwitterBotStreamListener::on_data(): got event/stream data of'
            ' unknown type. Raw data follows:\n%s', data)
    finally:
      # Also runs on the early ``return False`` paths and on exceptions,
      # so the flag can never be left stuck at True.
      self.processing_data = False
Exemple #15
def test_sanitize(filename, expected):
    """Sanitized text of a fixture status has no ``&`` or ``http`` and equals ``expected``."""
    fixture_path = os.path.join('tests', filename)
    with open(fixture_path, 'r') as fixture:
        payload = json.load(fixture)

    parsed = Status.parse(NonCallableMock(), payload)
    sanitized = get_sanitized_text(parsed)

    for forbidden in ('&', 'http'):
        assert forbidden not in sanitized
    assert sanitized == expected
Exemple #16
 def _read_from_table(self):
     """Select every row of ``self.table_name`` and parse each one into a ``Status``.

     Sets ``self.running`` to True before reading.
     """
     self.running = True
     conn = StatusSource.engine.connect()
     meta = MetaData()
     # The table definition is reflected from the database, not declared here.
     table = Table(self.table_name, meta, autoload=True, autoload_with=StatusSource.engine)
     cmd = select([table])
     results = conn.execute(cmd)
     for result in results:
         status = Status.parse(None, result)
         # NOTE(review): the body of this check is missing from the snippet;
         # presumably it leaves the loop once ``running`` is cleared - confirm.
         if self.running == False:
Exemple #17
    def on_data(self, raw_data):
        """Decode a raw message and forward it to the matching handler.

        Returns ``False`` when that handler returned ``False``; otherwise
        (including for unrecognised messages) returns ``None``.
        """
        message = json.loads(raw_data)

        # Pick the handler from the first recognised top-level key.
        if 'in_reply_to_status_id' in message:
            keep_going = self.on_status(Status.parse(self.api, message))
        elif 'delete' in message:
            removed = message['delete']['status']
            keep_going = self.on_delete(removed['id'], removed['user_id'])
        elif 'event' in message:
            keep_going = self.on_event(Status.parse(self.api, message))
        elif 'direct_message' in message:
            keep_going = self.on_direct_message(Status.parse(self.api, message))
        else:
            return None

        if keep_going is False:
            return False
    def test_patched_status(self):
        """A parsed ``Status`` exposes its fields and the JSON text in ``_raw``."""
        from tweepy.models import Status
        # NOTE(review): ``patch`` is imported but never called in this
        # snippet; a call may have been lost - confirm.
        from crawler.tweepy_patch import patch
        s = Status.parse('test_api', {'a': 1, 'b': 2})
        # pylint: disable=E1101,W0212
        self.assertEqual(s._raw, '{"a": 1, "b": 2}')
        self.assertEqual(s.a, 1)
        self.assertEqual(s.b, 2)
Exemple #19
def test_save_tweet(tmpdir, id_, expected_filename):
    """Check that a file named ``expected_filename`` exists under ``tmpdir/foo``."""
    api = MockAPI(connections={})
    foo = tmpdir.join('foo')

    l = LessListener(api=api, gather=str(foo), state_dir=str(tmpdir))
    # NOTE(review): the rest of this dict literal, its closing brackets and
    # whatever statements followed it are missing from the snippet.
    s = Status.parse(api=api, json={
        'id': int(id_),
        'id_str': id_,

    j = tmpdir.join('foo', expected_filename)
    assert j.check()
    def on_data(self, data):
        """Route a raw stream message to ``on_status``, ``on_delete`` or ``on_limit``.

        The message type is recognised from the raw text; ``on_status`` also
        receives that raw text.  Returns ``False`` when the handler returned
        ``False``, otherwise ``None``.
        """
        if 'in_reply_to_status_id' in data:
            verdict = self.on_status(Status.parse(self.api, json.loads(data)), data)
        elif 'delete' in data:
            gone = json.loads(data)['delete']['status']
            verdict = self.on_delete(gone['id'], gone['user_id'])
        elif 'limit' in data:
            verdict = self.on_limit(json.loads(data)['limit']['track'])
        else:
            return None

        return False if verdict is False else None
Exemple #21
 def on_data(self, data):
     """Dispatch a raw message to the status, delete or limit handler.

     Returns ``False`` if the chosen handler returned ``False``; ``None``
     otherwise.
     """
     halted = False
     if 'in_reply_to_status_id' in data:
         parsed = Status.parse(self.api, json.loads(data))
         halted = self.on_status(parsed, data) is False
     elif 'delete' in data:
         target = json.loads(data)['delete']['status']
         halted = self.on_delete(target['id'], target['user_id']) is False
     elif 'limit' in data:
         track = json.loads(data)['limit']['track']
         halted = self.on_limit(track) is False

     if halted:
         return False
Exemple #22
    def on_data(self, raw_data):
        """Decode a raw message and forward statuses and events.

        ``friends``, ``delete`` and ``user_suspend`` messages are ignored
        on purpose; any other unrecognised message is logged as an error.

        :param raw_data: JSON text received from the connection.
        :returns: ``False`` when the selected handler returned ``False``,
            otherwise ``None``.
        """
        data = json.loads(raw_data)
        if self.verbose:
            # Single-argument form prints the same on Python 2 and 3.
            print(data)
            print('-' * 60)

        if 'in_reply_to_status_id' in data:
            status = Status.parse(self.api, data)
            if self.on_status(status) is False:
                return False
        elif 'event' in data:
            status = Status.parse(self.api, data)
            if self.on_event(status) is False:
                return False
        elif 'friends' in data:
            pass  # ignore
        elif 'delete' in data:
            pass  # ignore
        elif 'user_suspend' in data:
            pass  # ignore
        else:
            logging.error("Unknown message type: " + str(raw_data))
Exemple #23
    def on_data(self, raw_data):
        """Forward statuses and events; ignore a fixed set of other messages.

        :param raw_data: JSON text received from the connection.
        :returns: ``False`` when ``on_status`` / ``on_event`` returned
            ``False``, otherwise ``None``.
        """
        data = json.loads(raw_data)
        if self.verbose:
            # Parenthesised single argument works on Python 2 and 3 alike.
            print(data)
            print('-'*60)

        if 'in_reply_to_status_id' in data:
            verdict = self.on_status(Status.parse(self.api, data))
        elif 'event' in data:
            verdict = self.on_event(Status.parse(self.api, data))
        elif 'friends' in data or 'delete' in data or 'user_suspend' in data:
            verdict = None  # deliberately ignored message types
        else:
            logging.error("Unknown message type: " + str(raw_data))
            verdict = None

        if verdict is False:
            return False
Exemple #24
    def save_status(self, data):
        """Parse ``data`` into a ``Status`` and look up its ``Author`` and ``Post`` records.

        NOTE(review): several statements of this method (``try:`` lines,
        branch bodies and constructor arguments) are missing from the
        snippet, so the flow below is incomplete.
        """
        status = Status.parse(self.api, json.loads(data))

        if not status.geo:
            # _datafile.write(data+'\n')

        if Author.objects.filter(owner__userprofile__twitter_id=status.user.id_str).exists():
            # this tweet's author is on stargazer

            author = Author.objects.filter(source=Author.T_TWITTER, external_id=status.user.id_str).get()
        except Author.DoesNotExist:
            author = Author(

            post = Post.objects.filter(source=Post.T_TWITTER, external_id=status.id_str).get()
        except Post.DoesNotExist:
            # Indices 0 and 1 of the coordinates are read as lat and lng.
            lat = float(status.geo["coordinates"][0])
            lng = float(status.geo["coordinates"][1])

                addr = self._latlng2addr.get(lat, lng)
            except (LatLng2Addr.ConnectionFailed, LatLng2Addr.GeocodingFailed) as e:
                # A failed address lookup falls back to an empty address.
                addr = ""

            # twitter api response in UTC
            # (shifted by a fixed +8 hours below)
            created = status.created_at + timedelta(hours=8)

            post = Post(

Exemple #25
    def on_data(self, raw_data):
        """Called when raw data is received from connection.

        Override this method if you wish to manually handle
        the stream data. Return False to stop stream and close connection.

        :param raw_data: JSON text received from the connection.
        :returns: ``False`` when the selected handler returned ``False``,
            otherwise ``None``.
        """
        data = json.loads(raw_data)

        if "in_reply_to_status_id" in data:
            answer = self.on_status(Status.parse(self.api, data))
        elif "delete" in data:
            removed = data["delete"]["status"]
            answer = self.on_delete(removed["id"], removed["user_id"])
        elif "event" in data:
            answer = self.on_event(Status.parse(self.api, data))
        elif "direct_message" in data:
            answer = self.on_direct_message(Status.parse(self.api, data))
        elif "friends" in data:
            answer = self.on_friends(data["friends"])
        elif "limit" in data:
            answer = self.on_limit(data["limit"]["track"])
        elif "disconnect" in data:
            answer = self.on_disconnect(data["disconnect"])
        elif "warning" in data:
            answer = self.on_warning(data["warning"])
        else:
            # Reached only when no known top-level key is present.
            logging.error("Unknown message type: " + str(raw_data))
            answer = None

        if answer is False:
            return False
Exemple #26
    def post_tweet(self, media_id, status, in_reply_to_status_id):
        """Post to ``POST_TWEET_URL`` and return the response parsed as a ``Status``.

        NOTE(review): the closing brace of ``request_data`` and the keyword
        that receives the filtered mapping in ``self.post`` are missing
        from the snippet.
        """
        request_data = {
            'status': status,
            'media_ids': media_id,
            'in_reply_to_status_id': in_reply_to_status_id

        req = self.post(url=POST_TWEET_URL,
                            # Entries whose value is None are left out.
                            key: val
                            for key, val in request_data.items()
                            if val is not None
        return Status.parse(self.api, req.json())
Exemple #27
 def __init__(self, tweetDict):
     """Build the object from ``tweetDict``: the parsed tweet, optional keywords/groups, and tokens.

     NOTE(review): the ``try:`` lines and the ``except KeyError`` bodies
     are missing from the snippet, as is the body of the final ``if``.
     """
     self.tweet = Status.parse(API(), tweetDict["tweet"])
         self.keywords = tweetDict["keywords"]
     except KeyError:
         self.groups = tweetDict["groups"]
     except KeyError:
     self.tokens = []
     self.filt_tokens = []
     for token in tweetDict["tokens"]:
         t = Token(token)
         if not t.filter_token():
Exemple #28
    def on_data(self, data):
        """Called when raw data is received from connection.

        Override this method if you wish to manually handle
        the stream data. Return False to stop stream and close connection.

        :param data: raw JSON text; the message type is recognised from the
            text itself before it is decoded.
        :returns: ``False`` when the selected handler returned ``False``,
            otherwise ``None``.
        """
        if 'in_reply_to_status_id' in data:
            status = Status.parse(self.api, json.loads(data))
            if self.on_status(status) is False:
                return False
        elif 'delete' in data:
            delete = json.loads(data)['delete']['status']
            if self.on_delete(delete['id'], delete['user_id']) is False:
                return False
        elif 'limit' in data:
            if self.on_limit(json.loads(data)['limit']['track']) is False:
                return False
Exemple #29
    def on_data(self, data):
        """Called when raw data is received from connection.

        Override this method if you wish to manually handle
        the stream data. Return False to stop stream and close connection.

        :param data: raw JSON text received from the connection.
        :returns: ``False`` if the handler chosen for the message returned
            ``False``; ``None`` otherwise, including for unhandled messages.
        """
        # The type is detected on the raw text, so JSON is decoded only for
        # messages that are actually handled.
        if 'in_reply_to_status_id' in data:
            verdict = self.on_status(Status.parse(self.api, json.loads(data)))
        elif 'delete' in data:
            removed = json.loads(data)['delete']['status']
            verdict = self.on_delete(removed['id'], removed['user_id'])
        elif 'limit' in data:
            verdict = self.on_limit(json.loads(data)['limit']['track'])
        else:
            return None

        if verdict is False:
            return False
Exemple #30
    def on_data(self, data):
        """Called when raw data is received from connection.

        Override this method if you wish to manually handle
        the stream data. Return False to stop stream and close connection.

        :param data: raw JSON text received from the connection.
        :returns: ``False`` when the selected handler returned ``False``,
            otherwise ``None``.
        """
        stop_requested = False

        if "in_reply_to_status_id" in data:
            status = Status.parse(self.api, json.loads(data))
            stop_requested = self.on_status(status) is False
        elif "delete" in data:
            target = json.loads(data)["delete"]["status"]
            stop_requested = self.on_delete(target["id"], target["user_id"]) is False
        elif "limit" in data:
            track = json.loads(data)["limit"]["track"]
            stop_requested = self.on_limit(track) is False

        if stop_requested:
            return False
Exemple #31
    def on_data(self, data):
        """Tally tweets by detected language for a fixed sampling period.

        When ``self.duration`` seconds have elapsed since ``self.started``
        the counters are written to ``<started>-sample.stats`` and
        ``False`` is returned.  Until then each status is classified with
        ``langid`` and counted in ``self.counter``.

        :param data: raw JSON text received from the stream.
        :returns: ``False`` once the period is over, ``True`` after a
            status was counted, ``None`` for any other message.
        """
        if time.time() >= self.started + self.duration:
            stamp = '%Y-%m-%d %H:%M:%S'
            report = [
                "================= STATISTICS =================" + "\n",
                "Start time: " + time.strftime(
                    stamp, time.localtime(self.started)) + "\n",
                "End time: " + time.strftime(
                    stamp, time.localtime(time.time())) + "\n",
                "First Tweet ID: " + self.first_tweet_id + "\n",
                "Last Tweet ID: " + self.last_tweet_id + "\n",
                "Language: " + self.lang + "\n",
                "Language classification threshold: " +
                str(self.lang_threshold) + "\n",
                "Above threshold: " +
                str(self.counter[self.lang + '-above']) + "\n",
                "Below threshold: " +
                str(self.counter[self.lang + '-below']) + "\n",
                "Exluded: " + str(self.counter['excluded']) + "\n",
            ]
            # The context manager closes the file even if a write fails.
            with open('{0}-sample.stats'.format(int(self.started)), 'w+') as stats:
                stats.writelines(report)
            return False
        elif 'in_reply_to_status_id' in data:
            status = Status.parse(self.api, json.loads(data))
            langclass = langid.classify(status.text)

            untouched = {
                self.lang + '-above': 0,
                self.lang + '-below': 0,
                'excluded': 0,
            }
            # Nothing counted yet, so this status is the first of the sample.
            if self.counter == untouched:
                self.first_tweet_id = str(status.id)
            self.last_tweet_id = str(status.id)

            if langclass[0] != self.lang:
                bucket = 'excluded'
            elif langclass[1] >= self.lang_threshold:
                bucket = self.lang + '-above'
            else:
                bucket = self.lang + '-below'
            self.counter[bucket] += 1

            return True
 def process(self, tweet):
     """Append a reduced JSON record of the tweet to a time-stamped gzip file.

     The output file name is built from ``self.base`` and the current time
     formatted with ``self.fmt``; a new file is opened whenever that
     formatted time changes.

     :param tweet: JSON text describing a single status.
     """
     status = Status.parse(api, json.loads(tweet))
     # NOTE(review): the closing brace was missing from the snippet; further
     # keys may have been lost with it.
     out = {
         "screen_name": status.user.screen_name,
         "id": status.id,
         "lang": status.user.lang,
         "statuses_count": status.user.statuses_count,
         "friend_count": status.user.friends_count,
         "profile_image_url": status.user.profile_image_url,
         "text": status.text.encode('utf8'),
         "entities": status.entities,
         "created_at": status.created_at.strftime("%Y-%m-%d %H:%M:%S"),
     }
     now = time.strftime(self.fmt)
     if now != self.time:
         # Close the file of the previous period before rotating so the
         # handle is not leaked.
         previous = getattr(self, 'fid', None)
         if previous is not None:
             previous.close()
         self.time = str(now)
         self.fid = gzip.open(os.path.join(self.path, self.base + '-' + self.time + '.txt.gz'), 'ab')
     self.fid.write(json.dumps(out) + '\n')
Exemple #33
    def _get_status(self, data):
        """Parse ``data`` and return the status that should be processed.

        A retweet is replaced by the status it retweets, and a quote is
        replaced by the quoted status when ``self.quoted`` is set.

        :param data: raw JSON text of one status.
        :returns: the selected ``Status`` object.
        :raises TweepError: when the author is in ``self.block_users``, when
            an original (non-retweet) status arrives while ``self.original``
            is false, or when a quote arrives while ``self.quoted`` is false.
        """
        status = Status.parse(self.api, self.json.loads(data))

        if status.user.screen_name in self.block_users:
            raise TweepError(">> User ignored: @%s" % status.user.screen_name)

        try:
            status = status.retweeted_status
        except AttributeError:
            # No ``retweeted_status`` attribute: this is an original tweet.
            if not self.original:
                text = self._proccess_status(status.text)
                trunc_text = (text[:72] + '...') if len(text) > 75 else text
                raise TweepError(">> Original tweet ignored: %s" % trunc_text)

        if status.is_quote_status:
            if self.quoted:
                status = status.quoted_status
            else:
                text = self._proccess_status(status.text)
                raise TweepError(">> Quoted tweet ignored: %s" % text)

        return status
Exemple #34
  def on_data(self, data):
    """Called when raw data is received from connection.

    Override this method if you wish to manually handle
    the stream data. Return False to stop stream and close connection.

    Status, direct-message and follow messages return their handler's
    result unchanged; delete and limit messages return ``False`` only when
    their handler did.  Anything else returns ``None``.
    """
    if 'in_reply_to_status_id' in data:
      status = Status.parse(self.api, json.loads(data))
      return self.on_status(status)
    elif 'delete' in data:
      delete = json.loads(data)['delete']['status']
      if self.on_delete(delete['id'], delete['user_id']) is False:
        return False
    elif 'limit' in data:
      if self.on_limit(json.loads(data)['limit']['track']) is False:
        return False
    elif 'sender_id' in data and 'recipient_id' in data:
      dm = DirectMessage.parse(self.api, json.loads(data))
      return self.on_dm(dm)
    elif 'event' in data and 'follow' in data:
      # The textual match above is only a pre-filter; confirm on the
      # decoded message that the event really is a follow.
      content = json.loads(data)
      if 'event' in content and content['event'] == 'follow':
        return self.on_follow(content)
Exemple #35
def gen_tuple(jsontweet):
    """Turn one JSON-encoded tweet into a flat tuple.

    Returns ``(author id, created_at, UTF-8 text, retweeted)`` where
    ``retweeted`` is True when the parsed status has a non-None
    ``retweeted_status`` attribute.
    """
    tweet = Status.parse(api, json.loads(jsontweet))
    # Identity test: an equality test against None could be redirected by
    # a custom __eq__/__ne__ on the status object.
    retweeted = getattr(tweet, 'retweeted_status', None) is not None
    return (tweet.author.id, tweet.created_at, convert_to_utf8_str(tweet.text), retweeted)
Exemple #36
	def update_tweets(self):
		"""Drain ``self.incoming``, persist the statuses and build a broadcast dict.

		Each queued item that contains "in_reply_to_status_id" is parsed into a
		Status. Every status is inserted into the ``tweets`` table, its hashtags
		are linked through ``hashtags`` / ``hashtags_tweets``, and for tags found
		in ``campboard['sessions']`` a "+1" or "-1" in the text is tallied in
		``session_votes``. Returns a dict with 'general' and 'channels' keys.

		NOTE(review): this snippet has lost several lines (a ``try:``, the body
		of the ``except``, and dict/list delimiters), so it does not parse as
		shown.
		"""
		print "Updating tweets"

		statuses = []
		# NOTE(review): a `try:` line appears to be missing here; the loop below
		# is indented one level deeper and is followed by `except IndexError`.
			while True:
				item = self.incoming.pop() # It's gonna throw up someday!
				if "in_reply_to_status_id" in item:
					statuses.append(Status.parse(self.stream.api, json.loads(item)))
				# Ignore anything other than status updates for now
				#	statuses.append(json.loads(item))
		# pop() on an empty queue raises IndexError, which is what ends the loop.
		# NOTE(review): the handler body (presumably `pass`) is not visible.
		except IndexError:
		broadcast = {}
		broadcast['general'] = {}
		broadcast['channels'] = {}
		for s in statuses:
			tags = re.findall("#([\w]+)(?iu)", s.text) # Case-insensitive, Unicode matching
			print "Tags: "
			print tags
			self.db.execute("INSERT INTO tweets (id, user_id, screen_name, profile_image_url, created_at, text) VALUES (%s,%s,%s,%s,%s,%s)", s.id, s.user.id, s.user.screen_name, s.user.profile_image_url, s.created_at, s.text)

			# Establish HABTM relationships, tweets with tags
			for t in tags:
				t = t.lower() # Force all to lowercase
				print "Inserting tag: %s" % t
				self.db.execute('''INSERT INTO hashtags (tag) VALUES (%s) ON DUPLICATE KEY UPDATE id=LAST_INSERT_ID(id), tag=%s; 
					INSERT INTO hashtags_tweets (hash_id, tweet_id) VALUES (LAST_INSERT_ID(), %s)''', t, t, s.id)
				# Count the votes while we're at it
				if t in campboard['sessions']:
					# Attach the tweet to the broadcast channel
					if not broadcast['channels'].has_key(t):
						broadcast['channels'][t] = {}
					# NOTE(review): the list is reset for every matching tweet and the
					# lines that wrap the entry below in a dict are missing - confirm
					# against the original source.
					broadcast['channels'][t]['recent_tweets'] = []
							'text': s.text, 'created_at': unicode(s.created_at), 'id': s.id,
							'user': {
								'id': s.user.id,
								'screen_name': s.user.screen_name,
								'profile_image_url': s.user.profile_image_url
					# vote_type is assigned but never read in the visible code.
					vote_type = None
					if re.search('\+1', s.text):
						#vote_type = "positive"
						self.db.execute("INSERT INTO session_votes (`session`, positive) VALUES (%s, 1) ON DUPLICATE KEY UPDATE positive=positive+1", t)
					elif re.search('\-1', s.text):
						#vote_type = "negative"
						self.db.execute("INSERT INTO session_votes (`session`, negative) VALUES (%s, 1) ON DUPLICATE KEY UPDATE negative=negative+1", t)

		# One entry per status; NOTE(review): the braces and closing bracket of
		# this comprehension are missing from the snippet.
		broadcast['general']['recent_tweets'] = [
				'text': s.text, 'created_at': unicode(s.created_at), 'id': s.id,
				'user': {
					'id': s.user.id,
					'screen_name': s.user.screen_name,
					'profile_image_url': s.user.profile_image_url
			for s in statuses
		return broadcast
Exemple #37
    # Feature flags for the tweet being inspected, all starting at 0.
    hashtag = 0
    url = 0
    question = 0
    exclamation = 0
    pos_term = 0
    neg_term = 0
    pos_emoticon = 0
    neg_emoticon = 0
    reply = 0
    moment_morning = 0
    moment_afternoon = 0
    moment_evening = 0
    moment_night = 0
    retweeted = 0

    status = Status.parse(api, json.loads(tweet[0]))

    if status.id in error_list_tweet_ids:
        tweets_discarded_error += 1
    elif status.text.startswith("RT @"):
        tweets_discarded_retweet += 1
    else:
        # Only tweets that were not discarded above count as considered
        # and contribute to the per-feature tallies.
        tweets_considered += 1
        if regex_username.search(status.text) is not None:
            tweets_username += 1
            username = 1
        if regex_hashtag.search(status.text) is not None:
            tweets_hashtag += 1
            hashtag = 1
        if regex_url.search(status.text) is not None:
            tweets_url += 1
Exemple #38
from tweepy.models import Status

from teebr.text.utils import normalize_text
from teebr.features import filter_status

DIMS = 100

# Normalized text of every tweet that passes filter_status.
tweets = []

with open("raw_tweets.jsons") as f:
    # One JSON document per line.
    for line in f:
        j = loads(line)
        t = Status.parse(None, j)
        if filter_status(t):
            tweet = normalize_text(t.text)
            # Keep the result; without this the list stays empty and the
            # count printed below is always 0.
            tweets.append(tweet)

# Single-argument call form works as a statement on Python 2 and as a
# function call on Python 3.
print("tweets: %d" % len(tweets))

#hasher = HashingVectorizer(stop_words='english', non_negative=True, norm=None)
#vectorizer = make_pipeline(hasher, TfidfTransformer())
Exemple #39
    # Feature flags for the tweet being inspected, all starting at 0.
    hashtag = 0
    url = 0
    question = 0
    exclamation = 0
    pos_term = 0
    neg_term = 0
    pos_emoticon = 0
    neg_emoticon = 0
    reply = 0
    moment_morning = 0
    moment_afternoon = 0
    moment_evening = 0
    moment_night = 0
    retweeted = 0

    status = Status.parse(api, tweet)

    if tweet['id'] in error_list_tweet_ids:
        tweets_discarded_error += 1
    elif tweet['text'].startswith("RT @"):
        tweets_discarded_retweet += 1
    else:
        # Only tweets that were not discarded above count as considered
        # and contribute to the per-feature tallies.
        tweets_considered += 1
        if regex_username.search(tweet['text']) is not None:
            tweets_username += 1
            username = 1
        if regex_hashtag.search(tweet['text']) is not None:
            tweets_hashtag += 1
            hashtag = 1
        if regex_url.search(tweet['text']) is not None:
            tweets_url += 1
Exemple #40
 def on_data(self, data):
     # Dispatch one raw message to the matching on_* handler and return False
     # as soon as a handler returns False. Only messages containing 'for_user'
     # are decoded; the per-user payload is read from parsed_data['message'].
     # NOTE(review): the three prose lines below look like docstring text whose
     # triple-quote delimiters were lost, and several call arguments further
     # down are missing, so this snippet does not parse as shown.
     Generic class for site streams that just print each
     action that comes in - override these methods to actually
     process them
     if 'for_user' in data:
         parsed_data = json.loads(data)
         user_id = parsed_data['for_user']
         # Substring test on the raw text, not a key lookup on parsed_data.
         if 'message' in data:
             message = parsed_data['message']
             if u'friends' in message:
                 if self.on_friends(user_id, message['friends']) is False:
                     return False
             elif u'event' in message:
                 # NOTE(review): the arguments of the four event handler calls
                 # below are not visible in this snippet.
                 if message[u'event'] == u'follow':
                     if self.on_follow(
                     ) is False:
                         return False
                 elif message[u'event'] == u'unfollow':
                     if self.on_unfollow(
                     ) is False:
                         return False
                 elif message[u'event'] == u'favorite':
                     if self.on_favorite(
                     ) is False:
                         return False
                 elif message[u'event'] == u'unfavorite':
                     if self.on_unfavorite(
                     ) is False:
                         return False
             # Need this second check - could be a retweet of
             # a tweet mentioning the user of interest
             # NOTE(review): the right-hand side of the comparison and the
             # closing parenthesis of this condition are missing.
             elif (u'retweeted_status' in message and
                 int(message[u'retweeted_status'][u'user'][u'id']) ==
                 if self.on_retweet(user_id, message) is False:
                     return False
             elif u'text' in message:
                 status = Status.parse(self.api, message)
                 # tweet from the user of interest
                 if status.author.id == user_id:
                     if self.on_user_status(user_id, status) is False:
                         return False
                 else:   # tweet mentioning the user of interest
                     if self.on_user_mention(user_id, status) is False:
                         return False
             elif u'direct_message' in message:
                 if self.on_direct_message(
                     user_id, message[u'direct_message']
                 ) is False:
                     return False
                 # NOTE(review): presumably the body of a dropped `else:` that
                 # prints unrecognised messages - confirm against the original.
                 print parsed_data
Exemple #41
 def parse_tweet(tweet):
     """Build a tweepy Status from a JSON tweet and set its author.

     The author is overwritten with ``current_user`` taken from the
     enclosing scope.
     """
     parsed = Status.parse(self.api, tweet)
     parsed.author = current_user
     return parsed
Exemple #42
    def save_tweets(self):
        """Consume raw messages from ``self.q`` forever and store matching tweets.

        Each message is JSON-decoded; messages carrying 'in_reply_to_status_id'
        are parsed into a Status, their text (and quoted text) is compared
        case-insensitively against every keyword in ``self.keyword_obj_list``,
        and matches are saved through ``Tweet.objects.create``.

        NOTE(review): the snippet is truncated and has lost lines (subscript
        continuations, ``else:`` branches, the ``create`` arguments), so it does
        not parse as shown.
        """
        while True:
            raw_data = self.q.get()

            data = json.loads(raw_data)

            if 'in_reply_to_status_id' in data:
                status = Status.parse(self.api, data)

                is_retweet = False
                retweeted_id = 0
                if hasattr(status, 'retweeted_status'):
                    is_retweet = True
                    retweeted_id = status.retweeted_status.id

                    # NOTE(review): the subscript continuation and an `else:`
                    # appear to be missing between the next two assignments.
                    if hasattr(status.retweeted_status, 'extended_tweet'):
                        text = status.retweeted_status.extended_tweet[
                        text = status.retweeted_status.text

                    # NOTE(review): as shown, the second assignment always
                    # overwrites the first - presumably an `else:` was dropped.
                    if hasattr(status, 'extended_tweet'):
                        text = status.extended_tweet['full_text']
                        text = status.text

                is_quote = hasattr(status, "quoted_status")
                quoted_text = ""
                quoted_id = 0
                if is_quote:
                    quoted_id = status.quoted_status.id

                    if hasattr(status.quoted_status, "extended_tweet"):
                        quoted_text = status.quoted_status.extended_tweet[
                        quoted_text = status.quoted_status.text

                for keyword_obj in self.keyword_obj_list:
                    keyword = keyword_obj.keyword

                    # Case-insensitive substring match on the tweet text or on
                    # the quoted tweet's text.
                    if keyword.lower() in text.lower() or keyword.lower(
                    ) in quoted_text.lower():
                        # NOTE(review): the create() arguments are not visible.
                        tweet_obj = Tweet.objects.create(

                        # The language is detected from the keyword, not the tweet.
                        lang = detect(keyword)
                        if lang == 'en':
                            text = text_utils.pre_process(text)

                        triple_list = knowledge_graph_extract.extract_entity(
                            text, lang=lang)
                        for triple in triple_list:

 def on_data(self, data):
     """Decode the raw JSON payload and parse it into a tweepy Status."""
     payload = json.loads(data)
     tweet = Status.parse(tweepy_api, payload)
    def on_data(self, raw_data):
        """Called when raw data is received from connection.

        Override this method if you wish to manually handle
        the stream data. Return False to stop stream and close connection.

        Every call increments ``self.count``. Once it exceeds 50000 the
        counter is reset and a new set of timestamped CSV files (status,
        user, delete) is opened under ``./data/``; the status and user files
        get a header row. The decoded message is then dispatched to the
        matching ``on_*`` handler.

        Returns:
            False if the handler returned False or the message type is
            unknown, True otherwise.
        """
        self.count += 1
        data = json.loads(raw_data)
        if self.count > 50000:
            self.count = 0
            # Close the previous batch's files before replacing them so the
            # handles are flushed and not leaked on every rotation.
            for attr in ('statusf', 'userf', 'deletef'):
                previous = getattr(self, attr, None)
                if previous is not None:
                    previous.close()

            ts = time.strftime("./data/%Y%m%d%H%M")
            self.statusf = open(ts+'_status.csv','w',newline='')
            self.statusw = csv.writer(self.statusf)
            # Further status attributes exist (withheld_*, truncated,
            # retweet/favorite counts, lang, ...) but are not recorded.
            self.statusw.writerow([
                'id', 'created_at', 'coordinates',
                'hashtags', 'user_mentions', 'symbols', 'urls',
                'media',
                'in_reply_to_screen_name',
                'in_reply_to_user_id_str',
                'in_reply_to_status_id_str',
                'place', 'retweeted_status_id', 'source',
                'text', 'user id',
            ])
            self.userf = open(ts+'_user.csv','w',newline='')
            self.userw = csv.writer(self.userf)
            # Profile styling and relationship fields are not recorded.
            self.userw.writerow([
                'created_at', 'default_profile',
                'description',
                'favourites_count',
                'followers_count', 'friends_count',
                'geo_enabled', 'id_str', 'is_translator',
                'lang', 'listed_count', 'location',
                'name',
                'protected', 'screen_name',
                'statuses_count', 'time_zone', 'user.url',
            ])
            self.deletef = open(ts+'_delete.csv','w',newline='')
            self.deletew = csv.writer(self.deletef)

        if 'in_reply_to_status_id' in data:
            status = Status.parse(self.api, data)
            if self.on_status(status) is False:
                return False
        elif 'delete' in data:
            delete = data['delete']['status']
            if self.on_delete(delete['id'], delete['user_id']) is False:
                return False
        elif 'event' in data:
            status = Status.parse(self.api, data)
            if self.on_event(status) is False:
                return False
        elif 'limit' in data:
            if self.on_limit(data['limit']['track']) is False:
                return False
        elif 'disconnect' in data:
            if self.on_disconnect(data['disconnect']) is False:
                return False
        elif 'warning' in data:
            if self.on_warning(data['warning']) is False:
                return False
        else:
            logging.error("Unknown message type: " + str(raw_data))
            return False
        return True
Exemple #45
    def on_data(self, data):
        """Decode one raw stream message and dispatch it to an ``on_*`` handler.

        Before dispatching, the untruncated text is looked up under
        'extended_tweet' and 'retweeted_status' so it can be handed to
        ``on_status`` together with the parsed Status. Returns False as soon as
        a handler returns False.

        NOTE(review): the arguments of both ``bytes(`` calls and the statements
        around the 'FUL TEXT' literals are missing, so this snippet does not
        parse as shown.
        """
        full_text = ""

        data2 = json.loads(data)

        if 'extended_tweet' in data2:
            if ('full_text' in data2["extended_tweet"]):

                full_text = bytes(
                full_text = full_text.decode('utf-8')
                    'FUL TEXT *******************************************************************************'

            #print(self.find_between( data, '"extended_tweet":{"full_text":"','",'))
        if ("retweeted_status" in data2):
            if ('full_text' in data2["retweeted_status"]):
                full_text = bytes(
                full_text = full_text.decode('utf-8')
                    'FUL TEXT *******************************************************************************'

        # The same payload is decoded a second time; from here on `data` is the
        # parsed message rather than the raw text.
        data = json.loads(data)

        if 'in_reply_to_status_id' in data:
            status = Status.parse(self.api, data)
            if self.on_status(status, full_text) is False:
                return False
        elif 'delete' in data:
            delete = data['delete']['status']
            if self.on_delete(delete['id'], delete['user_id']) is False:
                return False
        elif 'event' in data:
            status = Status.parse(self.api, data)
            if self.on_event(status) is False:
                return False
        elif 'direct_message' in data:
            status = Status.parse(self.api, data)
            if self.on_direct_message(status) is False:
                return False
        elif 'friends' in data:
            if self.on_friends(data['friends']) is False:
                return False
        elif 'limit' in data:
            if self.on_limit(data['limit']['track']) is False:
                return False
        elif 'disconnect' in data:
            if self.on_disconnect(data['disconnect']) is False:
                return False
        elif 'warning' in data:
            if self.on_warning(data['warning']) is False:
                return False
            # NOTE(review): presumably the body of a dropped `else:`. Also,
            # `raw_data` is not defined in this method (the parameter is named
            # `data`), so this line would raise NameError when reached.
            logging.error("Unknown message type: " + str(raw_data))
Exemple #46
    def setUp(self):
        """Prepare the Status fixtures used by the tests.

        Defines a local helper that loads ./tests/cassettes/sample-tweet.json,
        parses an inline tweet payload (a reply that quotes another tweet) into
        ``self._status``, and stores a deep copy in ``self._status_backup``.

        NOTE(review): the inline payload has lost lines (the leading arguments
        of ``Status.parse``, closing braces/brackets and some keys such as
        'source', 'description' and 'text'), so it does not parse as shown.
        """
        def load_status():
            # Parse the recorded sample tweet from disk; no API object is attached.
            with open('./tests/cassettes/sample-tweet.json') as infile:
                status = Status.parse(api=None, json=load(infile))
                return status

        # Inline fixture: the outer tweet, with the quoted tweet nested under
        # 'quoted_status'.
        self._status = Status.parse(
                'created_at': 'Fri Dec 01 01:53:45 +0000 2017',
                'id': 936412976520876032,
                'id_str': '936412976520876032',
                'text': '@realDonaldTrump https://t.co/0BW86RBIRH',
                'display_text_range': [17, 40],
                '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
                'truncated': False,
                'in_reply_to_status_id': 936395008139198464,
                'in_reply_to_status_id_str': '936395008139198464',
                'in_reply_to_user_id': 25073877,
                'in_reply_to_user_id_str': '25073877',
                'in_reply_to_screen_name': 'realDonaldTrump',
                'user': {
                    'id': 29363354,
                    'id_str': '29363354',
                    'name': 'Kate',
                    'screen_name': 'k8_doo',
                    'location': 'United States',
                    'url': None,
                    'Follow me if you want to know how far I walked, hiked or ran today for #charitymiles',
                    'translator_type': 'none',
                    'protected': False,
                    'verified': False,
                    'followers_count': 322,
                    'friends_count': 943,
                    'listed_count': 3,
                    'favourites_count': 26916,
                    'statuses_count': 3334,
                    'created_at': 'Tue Apr 07 02:56:52 +0000 2009',
                    'utc_offset': -18000,
                    'time_zone': 'Eastern Time (US & Canada)',
                    'geo_enabled': True,
                    'lang': 'en',
                    'contributors_enabled': False,
                    'is_translator': False,
                    'profile_background_color': 'EBEBEB',
                    'profile_background_tile': False,
                    'profile_link_color': '990000',
                    'profile_sidebar_border_color': 'DFDFDF',
                    'profile_sidebar_fill_color': 'F3F3F3',
                    'profile_text_color': '333333',
                    'profile_use_background_image': True,
                    'default_profile': False,
                    'default_profile_image': False,
                    'following': None,
                    'follow_request_sent': None,
                    'notifications': None
                'geo': None,
                'coordinates': None,
                'place': {
                    'bounding_box': {
                        'coordinates': [[1, 2], [3, 2, 1]]
                'contributors': None,
                'quoted_status_id': 936379603651883008,
                'quoted_status_id_str': '936379603651883008',
                'quoted_status': {
                    'created_at': 'Thu Nov 30 23:41:09 +0000 2017',
                    'id': 936379603651883008,
                    'id_str': '936379603651883008',
                    'On the left: @BarackObama’s National Tree Lighting\nOn the right: @realDonaldTrump’s National Tree Lighting… https://t.co/PcsatAL7Lu',
                    'display_text_range': [0, 140],
                    '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
                    'truncated': True,
                    'in_reply_to_status_id': None,
                    'in_reply_to_status_id_str': None,
                    'in_reply_to_user_id': None,
                    'in_reply_to_user_id_str': None,
                    'in_reply_to_screen_name': None,
                    'user': {
                        'id': 329433192,
                        'id_str': '329433192',
                        'name': 'Jeremy Dickey',
                        'screen_name': 'JeremyDDickey',
                        'location': 'Washington, D.C.',
                        'url': 'https://medium.com/@JeremyDDickey',
                        'City Government Media Specialist. Aspiring CJ Cregg. Graduate of @MercyhurstU & @LCCLondon. RTs = you got my attention. Tweets are my own. Sarcasm also my own.',
                        'translator_type': 'none',
                        'protected': False,
                        'verified': False,
                        'followers_count': 1860,
                        'friends_count': 2452,
                        'listed_count': 129,
                        'favourites_count': 5864,
                        'statuses_count': 64253,
                        'created_at': 'Tue Jul 05 02:20:11 +0000 2011',
                        'utc_offset': -18000,
                        'time_zone': 'Eastern Time (US & Canada)',
                        'geo_enabled': True,
                        'lang': 'en',
                        'contributors_enabled': False,
                        'is_translator': False,
                        'profile_background_color': '1A1B1F',
                        'profile_background_tile': False,
                        'profile_link_color': '3B94D9',
                        'profile_sidebar_border_color': 'FFFFFF',
                        'profile_sidebar_fill_color': '252429',
                        'profile_text_color': '666666',
                        'profile_use_background_image': False,
                        'default_profile': False,
                        'default_profile_image': False,
                        'following': None,
                        'follow_request_sent': None,
                        'notifications': None
                    'geo': None,
                    'coordinates': None,
                    'place': {
                        'id': '6417871953fa5e86',
                        'place_type': 'city',
                        'name': 'Silver Spring',
                        'full_name': 'Silver Spring, MD',
                        'country_code': 'US',
                        'country': 'United States',
                        'bounding_box': {
                            'coordinates': [[[-77.064086, 38.979735],
                                             [-77.064086, 39.036964],
                                             [-76.97162, 39.036964],
                                             [-76.97162, 38.979735]]]
                        'attributes': {}
                    'contributors': None,
                    'is_quote_status': False,
                    'extended_tweet': {
                        'On the left: @BarackObama’s National Tree Lighting\nOn the right: @realDonaldTrump’s National Tree Lighting #Christmas https://t.co/wYoLJRO2r6',
                        'display_text_range': [0, 117],
                        'entities': {
                            'hashtags': [{
                                'text': 'Christmas',
                                'indices': [107, 117]
                            'urls': [],
                            'user_mentions': [{
                                'screen_name': 'BarackObama',
                                'name': 'Barack Obama',
                                'id': 813286,
                                'id_str': '813286',
                                'indices': [13, 25]
                            }, {
                                'screen_name': 'realDonaldTrump',
                                'name': 'Donald J. Trump',
                                'id': 25073877,
                                'id_str': '25073877',
                                'indices': [65, 81]
                            'symbols': [],
                            'media': [{
                                'id': 936379576682450944,
                                'id_str': '936379576682450944',
                                'indices': [118, 141],
                                'url': 'https://t.co/wYoLJRO2r6',
                                'display_url': 'pic.twitter.com/wYoLJRO2r6',
                                'type': 'photo',
                                'sizes': {
                                    'medium': {
                                        'w': 1200,
                                        'h': 800,
                                        'resize': 'fit'
                                    'small': {
                                        'w': 680,
                                        'h': 453,
                                        'resize': 'fit'
                                    'thumb': {
                                        'w': 150,
                                        'h': 150,
                                        'resize': 'crop'
                                    'large': {
                                        'w': 1752,
                                        'h': 1168,
                                        'resize': 'fit'
                            }, {
                                'id': 936379575839358977,
                                'id_str': '936379575839358977',
                                'indices': [118, 141],
                                'url': 'https://t.co/wYoLJRO2r6',
                                'display_url': 'pic.twitter.com/wYoLJRO2r6',
                                'type': 'photo',
                                'sizes': {
                                    'small': {
                                        'w': 680,
                                        'h': 680,
                                        'resize': 'fit'
                                    'thumb': {
                                        'w': 150,
                                        'h': 150,
                                        'resize': 'crop'
                                    'medium': {
                                        'w': 1200,
                                        'h': 1200,
                                        'resize': 'fit'
                                    'large': {
                                        'w': 2048,
                                        'h': 2048,
                                        'resize': 'fit'
                        'extended_entities': {
                            'media': [{
                                'id': 936379576682450944,
                                'id_str': '936379576682450944',
                                'indices': [118, 141],
                                'url': 'https://t.co/wYoLJRO2r6',
                                'display_url': 'pic.twitter.com/wYoLJRO2r6',
                                'type': 'photo',
                                'sizes': {
                                    'medium': {
                                        'w': 1200,
                                        'h': 800,
                                        'resize': 'fit'
                                    'small': {
                                        'w': 680,
                                        'h': 453,
                                        'resize': 'fit'
                                    'thumb': {
                                        'w': 150,
                                        'h': 150,
                                        'resize': 'crop'
                                    'large': {
                                        'w': 1752,
                                        'h': 1168,
                                        'resize': 'fit'
                            }, {
                                'id': 936379575839358977,
                                'id_str': '936379575839358977',
                                'indices': [118, 141],
                                'url': 'https://t.co/wYoLJRO2r6',
                                'display_url': 'pic.twitter.com/wYoLJRO2r6',
                                'type': 'photo',
                                'sizes': {
                                    'small': {
                                        'w': 680,
                                        'h': 680,
                                        'resize': 'fit'
                                    'thumb': {
                                        'w': 150,
                                        'h': 150,
                                        'resize': 'crop'
                                    'medium': {
                                        'w': 1200,
                                        'h': 1200,
                                        'resize': 'fit'
                                    'large': {
                                        'w': 2048,
                                        'h': 2048,
                                        'resize': 'fit'
                    'quote_count': 56,
                    'reply_count': 44,
                    'retweet_count': 326,
                    'favorite_count': 385,
                    'entities': {
                        'hashtags': [],
                        'urls': [{
                            'url': 'https://t.co/PcsatAL7Lu',
                            'display_url': 'twitter.com/i/web/status/9…',
                            'indices': [108, 131]
                        'user_mentions': [{
                            'screen_name': 'BarackObama',
                            'name': 'Barack Obama',
                            'id': 813286,
                            'id_str': '813286',
                            'indices': [13, 25]
                        }, {
                            'screen_name': 'realDonaldTrump',
                            'name': 'Donald J. Trump',
                            'id': 25073877,
                            'id_str': '25073877',
                            'indices': [65, 81]
                        'symbols': []
                    'favorited': False,
                    'retweeted': False,
                    'possibly_sensitive': False,
                    'filter_level': 'low',
                    'lang': 'en'
                'is_quote_status': True,
                'quote_count': 0,
                'reply_count': 0,
                'retweet_count': 0,
                'favorite_count': 0,
                'entities': {
                    'hashtags': [],
                    'urls': [{
                        'url': 'https://t.co/0BW86RBIRH',
                        'display_url': 'twitter.com/jeremyddickey/…',
                        'indices': [17, 40]
                    'user_mentions': [{
                        'screen_name': 'realDonaldTrump',
                        'name': 'Donald J. Trump',
                        'id': 25073877,
                        'id_str': '25073877',
                        'indices': [0, 16]
                    'symbols': []
                'favorited': False,
                'retweeted': False,
                'possibly_sensitive': False,
                'filter_level': 'low',
                'lang': 'und',
                'timestamp_ms': '1512093225971'

        # Keep an independent copy of the freshly parsed fixture.
        self._status_backup = deepcopy(self._status)
def bulk_load(listkey, tweets):
    """Dump the text of each JSON-encoded tweet into a per-list file.

    Args:
        listkey: Name of the file created under ``/home/marcua/data/tweets/``.
        tweets: Iterable of JSON strings; each one is decoded and turned
            into a ``Status`` using the module-level ``api`` object.
    """
    with open("/home/marcua/data/tweets/%s" % (listkey), "w") as tmpfile:
        # A parenthesised single-argument print produces the same output
        # whether print is a statement (Python 2) or a function (Python 3).
        print("file %s" % (tmpfile.name))
        for jsontweet in tweets:
            tweet = Status.parse(api, json.loads(jsontweet))
            # One tweet per line; the text goes through convert_to_utf8_str.
            tmpfile.write(convert_to_utf8_str(tweet.text) + "\n")
Exemple #48
 def load_status():
     """Return the sample-tweet cassette parsed into a ``Status`` object.

     The JSON fixture at ``./tests/cassettes/sample-tweet.json`` is decoded
     with ``load`` and handed to ``Status.parse`` without an API object
     (``api=None``).
     """
     with open('./tests/cassettes/sample-tweet.json') as cassette:
         payload = load(cassette)
     return Status.parse(api=None, json=payload)
Exemple #49
 # Per-tweet feature flags, all cleared before the tweet is inspected.
 hashtag = 0
 url = 0
 question = 0
 exclamation = 0
 pos_term = 0
 neg_term = 0
 pos_emoticon = 0
 neg_emoticon = 0
 reply = 0
 moment_morning = 0
 moment_afternoon = 0
 moment_evening = 0
 moment_night = 0
 retweeted = 0
 status = Status.parse(api, json.loads(tweet[0]))
 if status.id in error_list_tweet_ids:
     # Tweets on the error list are counted and otherwise ignored.
     tweets_discarded_error += 1
 elif status.text.startswith("RT @"):
     # Retweets are counted and otherwise ignored.
     tweets_discarded_retweet += 1
 else:
     # Only tweets that were not discarded are counted as considered and
     # scanned for features; without this branch a retweet would be tallied
     # as both discarded and considered.
     tweets_considered += 1
     if regex_username.search(status.text) is not None:
         tweets_username += 1
         username = 1
     if regex_hashtag.search(status.text) is not None:
         tweets_hashtag += 1
         hashtag = 1
     if regex_url.search(status.text) is not None:
         tweets_url += 1
Exemple #50
 # Per-tweet feature flags, all cleared before the tweet is inspected.
 hashtag = 0
 url = 0
 question = 0
 exclamation = 0
 pos_term = 0
 neg_term = 0
 pos_emoticon = 0
 neg_emoticon = 0
 reply = 0
 moment_morning = 0
 moment_afternoon = 0
 moment_evening = 0
 moment_night = 0
 retweeted = 0
 status = Status.parse(api, tweet)
 if tweet['id'] in error_list_tweet_ids:
     # Tweets on the error list are counted and otherwise ignored.
     tweets_discarded_error += 1
 elif tweet['text'].startswith("RT @"):
     # Retweets are counted and otherwise ignored.
     tweets_discarded_retweet += 1
 else:
     # Only tweets that were not discarded are counted as considered and
     # scanned for features; without this branch a retweet would be tallied
     # as both discarded and considered.
     tweets_considered += 1
     if regex_username.search(tweet['text']) is not None:
         tweets_username += 1
         username = 1
     if regex_hashtag.search(tweet['text']) is not None:
         tweets_hashtag += 1
         hashtag = 1
     if regex_url.search(tweet['text']) is not None:
         tweets_url += 1
Exemple #51
    def on_data(self, raw_data):
        """Called when raw data is received from connection.

        Override this method if you wish to manually handle
        the stream data. Return False to stop stream and close connection.

        The decoded message is first dispatched to the matching ``on_*``
        handler. Messages that carry a ``user`` key are then echoed to
        stdout, counted, and used to trigger the periodic check-in tasks.

        Args:
            raw_data: JSON text of a single stream message.

        Returns:
            False when a handler or a check-in asks for the stream to stop,
            or when the message type is not recognised; None otherwise.
        """
        data = json.loads(raw_data)

        if 'in_reply_to_status_id' in data:
            status = Status.parse(self.api, data)
            if self.on_status(status) is False:
                return False

        elif 'delete' in data:
            delete = data['delete']['status']
            if self.on_delete(delete['id'], delete['user_id']) is False:
                return False

        elif 'event' in data:
            status = Status.parse(self.api, data)
            if self.on_event(status) is False:
                return False

        elif 'direct_message' in data:
            status = Status.parse(self.api, data)
            if self.on_direct_message(status) is False:
                return False

        elif 'friends' in data:
            if self.on_friends(data['friends']) is False:
                return False

        elif 'limit' in data:
            if self.on_limit(data['limit']['track']) is False:
                return False

        elif 'disconnect' in data:
            if self.on_disconnect(data['disconnect']) is False:
                return False

        elif 'warning' in data:
            # A warning stops the stream only when the handler says so.
            if self.on_warning(data['warning']) is False:
                return False

        else:
            # Unrecognised message type: stop the stream.
            return False

        # If this tweet contains text.
        if "user" in data:

            # --------------------------------------------------------------- #
            # Stupid print for fun.
            uname = data["user"]["screen_name"]
            umsg = data["text"]
            # Pad the screen name towards a 20-character column, always
            # leaving at least one space before the message.
            spc = " " * max(1, 20 - len(uname))
            if not umsg.startswith("RT"):
                print("<tweet>", uname, spc, umsg.replace("\n", ""))
            # --------------------------------------------------------------- #

            # Write the tweet to the buffer.
            # TODO(review): no statement performs this write here; confirm
            # against the full listener implementation.

            # Running counter.
            self.count += 1

            # If the buffer is full, then cycle the buffer.
            if self.count % self.save_interval == 0:
                # TODO(review): the buffer-cycling call is absent; the
                # placeholder keeps the method syntactically valid.
                pass

            # If the counter is a check-in interval, do all the check-in tasks.
            if self.count % check_in_interval == 0:

                # Shutdown if the `runtime` `run` value is False.
                if checkin_killstream():
                    return False

                # pause if there are too many files in the new tweet directory.
                if not checkin_pausestream():
                    return False
Exemple #52
def bulk_load(listkey, tweets):
    """Write the text of every JSON-encoded tweet to a file named after the list.

    Args:
        listkey: Name of the file created under ``/home/marcua/data/tweets/``.
        tweets: Iterable of JSON strings; each one is decoded and turned
            into a ``Status`` using the module-level ``api`` object.
    """
    with open('/home/marcua/data/tweets/%s' % (listkey), 'w') as tmpfile:
        # A parenthesised single-argument print produces the same output
        # whether print is a statement (Python 2) or a function (Python 3).
        print("file %s" % (tmpfile.name))
        for jsontweet in tweets:
            tweet = Status.parse(api, json.loads(jsontweet))
            # One tweet per line; the text goes through convert_to_utf8_str.
            tmpfile.write(convert_to_utf8_str(tweet.text) + "\n")
    def on_data(self, data):
        '''Parse raw data from twitter and pass the status object to on_status().

        Called when raw data is passed from twitter.
        If this function returns False, the stream stops listening.

        gSave_raw_json: if true, write json raw text to the ../json/
                        Set it to true only if you would like to debug.

        Returns True to keep listening; returns False only when
        ``self.running`` has been set to False.
        '''
        # This variable signals whether we are in the middle of processing
        # data; the ``finally`` clause clears it on every exit path,
        # including the early return for messages without an id.
        self.on_data_running = True
        try:
            self.log("Get raw data from Twitter", screen_only=True)
            if gSave_raw_json:
                ### save the json into disk ###
                parsed_data = tweepy.utils.import_simplejson().loads(data)
                # may return {"limit":{"track":73}} or {delete...}, ignore this data
                if "id" not in parsed_data:
                    # chucheng: equivalent to checking for 'delete'/'limit' in data
                    return True
                # Spread the files over sub-folders keyed by id modulo 1000.
                folder_name = parsed_data["id"] % 1000
                folder_path = "../json/" + str(folder_name)
                try:
                    if not os.path.exists(folder_path):
                        os.makedirs(folder_path)
                except OSError:
                    self.log("OS ERROR")
                filename = folder_path + "/" + str(parsed_data["id"]) + ".json"
                #print filename # for debug
                # The context manager closes the file even if the write fails.
                with open(filename, "w") as output:
                    output.write(data)
                ### done ###
            # Chucheng 4/25/2011:
            #   We must override the method, because the original one might
            #   return false, causing the listener to stop.
            #   In short, you cannot simply call:
            #       tweepy.StreamListener.on_data(self, data)
            # ``data`` is still the raw text here, so this is a substring test.
            if 'in_reply_to_status_id' in data:
                status = Status.parse(self.api, json.loads(data))
                if self.on_status(status) is False:  # Trigger on_status now!!
                    self.log('in_reply_to_status_id in data: on_status() returns False. (this line should never be reached)')
            # Anything else (for example delete or limit notices) is not what
            # we need: on_delete() and on_limit() are deliberately not called.
            if self.running == False:  # see: StreamingCrawler.stop_listner()
                return False  # stop the listener while catching a SIGTERM
        except Exception as e:
            # sys.exc_info() works on Python 2 and 3, whereas
            # sys.exc_traceback exists only on Python 2.
            self.log("Error:" + str(e), sys.exc_info()[2])
        finally:
            self.on_data_running = False

        return True