Beispiel #1
0
    def __init__(self, project_dir=None):
        """
        Constructor for Twicorder class. Sets up the logger, verifies the
        setup, and creates the task manager, worker thread and query types.

        The config, task list and auth session are each loaded once up front.
        If any of them raises a TwicorderException, the error is logged and
        the process exits before the task manager or worker thread is created.

        Keyword Args:
            project_dir (str): Path to Twicorder project directory

        Raises:
            SystemExit: With exit code 1 if the setup check raises a
                TwicorderException.

        """
        if project_dir:
            ProjectManager.project_dir = project_dir

        # Todo: Only import logger after project dir is set, to ensure logging
        #       to project dir. This is ugly and needs a better solution.
        from twicorder.utils import TwiLogger
        global logger
        logger = TwiLogger()

        # Test setup before continuing. The imports stay inside the try block
        # so that a TwicorderException raised while importing is handled too.
        try:
            from twicorder.config import Config
            from twicorder.tasks import TaskManager
            from twicorder.auth import Auth
            Config.get()
            TaskManager.load()
            Auth.session()
        except TwicorderException as error:
            logger.critical(error)
            # sys.exit raises SystemExit, so nothing below runs on failure.
            sys.exit(1)

        # TaskManager is already bound by the import in the try block above.
        self._task_manager = TaskManager()
        self._worker_thread = WorkerThread()
        self._query_types = {}
Beispiel #2
0
    def _read_loop(self, resp):
        """
        Read length-delimited messages from a streaming response until the
        stream is closed or this object stops running.

        Each message is expected to be preceded by a line containing only its
        length in digits. Blank lines are treated as keep-alive signals and
        reported to the listener. Any other content where a length is expected
        is treated as an error, which is logged before reading resumes.

        Args:
            resp: Streaming response exposing ``headers`` and ``raw``
                (presumably a requests response; confirm against caller).

        """
        # Use the charset announced in the content-type header, if any.
        content_type = resp.headers.get('content-type', default='')
        enc_search = re.search(r'charset=(?P<enc>\S*)', content_type)
        if enc_search is not None:
            encoding = enc_search.group('enc')
        else:
            encoding = 'utf-8'

        buf = ReadBuffer(resp.raw, self.chunk_size, encoding=encoding)

        while self.running and not resp.raw.closed:
            length = 0
            try:
                # Consume lines until one announces the next message length.
                while not resp.raw.closed:
                    line = buf.read_line() or ''
                    stripped_line = line.strip()
                    if not stripped_line:
                        # keep-alive new lines are expected
                        self.listener.keep_alive()
                    elif stripped_line.isdigit():
                        length = int(stripped_line)
                        break
                    else:
                        raise TweepError('Expecting length, unexpected value found')

                next_status_obj = buf.read_len(length)
            except Exception:
                TwiLogger.exception('Unable to process response: \n')
                # Skip this message; the outer loop re-checks the stream state.
                continue
            if self.running and next_status_obj:
                self._data(next_status_obj)

        if resp.raw.closed:
            self.on_closed(resp)
Beispiel #3
0
 def id_to_screenname(self):
     """
     Return the mapping of followed user IDs to "@screen_name" strings.

     The mapping is cached on the instance. A non-empty cache is returned
     as-is while the last lookup is at most 15 minutes old. Otherwise every
     ID in ``self.follow`` is looked up through the API, the lookup time is
     recorded and the refreshed mapping is logged.

     Returns:
         dict: User ID to screen name, each prefixed with "@"

     """
     cache_lifetime = timedelta(minutes=15)
     current_time = datetime.now()
     # A cache that was never timestamped counts as freshly looked up.
     last_lookup = self._id_to_screenname_time or current_time
     cache_is_fresh = current_time - last_lookup <= cache_lifetime
     if self._id_to_screenname and cache_is_fresh:
         return self._id_to_screenname
     for user_id in self.follow:
         screen_name = self.api.get_user(user_id).screen_name
         self._id_to_screenname[user_id] = '@{}'.format(screen_name)
     self._id_to_screenname_time = datetime.now()
     TwiLogger.info(self._id_to_screenname)
     return self._id_to_screenname
Beispiel #4
0
    def on_data(self, json_data):
        """
        Defines the actions to take on data capture. Caching all available user
        data and writing tweet data to disk.

        For payloads carrying a "created_at" value, every user object found in
        the data is cached, mentions are optionally expanded, and the tweet is
        upserted into MongoDB when a collection is available. Every payload is
        appended to the in-memory data list and written to the output file as
        one JSON line. Payloads with tweet text are also logged on one line.

        Args:
            json_data (str): String containing tweet data on JSON format

        Returns:
            bool: True if successful

        """
        self._rate_limit_retry_count = 0
        os.makedirs(self.output_dir, exist_ok=True)
        file_path = os.path.join(self.output_dir, self.file_name)
        data = json.loads(json_data)
        if data.get('created_at'):
            users = utils.collect_key_values('user', data)
            for user in users:
                user['recorded_at'] = data['created_at']
                self.users[user['id_str']] = user
            if self.config.get('full_user_mentions', False):
                self.update_mentions(data)

            # Add tweet to MongoDB. The collection is compared against None
            # explicitly: PyMongo 4 collections raise NotImplementedError
            # when truth-tested, which would abort on_data before the write.
            if (self.config.get('use_mongo', True)
                    and self.mongo_collection is not None):
                try:
                    # Work on a copy so the data written to disk is unchanged.
                    mongo_data = copy.deepcopy(data)
                    mongo_data = utils.timestamp_to_datetime(mongo_data)
                    mongo_data = utils.stream_to_search(mongo_data)
                    self.mongo_collection.replace_one(
                        {'id': mongo_data['id']},
                        mongo_data,
                        upsert=True
                    )
                except Exception:
                    # Best effort: a MongoDB failure must not stop the capture.
                    TwiLogger.exception(
                        'Twicorder Listener: Unable to connect to MongoDB: '
                    )

        self._data.append(data)
        utils.write(json.dumps(data) + '\n', file_path)
        timestamp = '{:%d %b %Y %H:%M:%S}'.format(datetime.now())
        tweet = self.get_full_text(data)
        if not tweet:
            return True
        user = data.get('user', {}).get('screen_name', '-')
        oneline_tweet = tweet.replace('\n', ' ')
        TwiLogger.info(f'{timestamp}, @{user}: {oneline_tweet}')
        return True
Beispiel #5
0
def stats():
    """
    Render the stats page with the total tweet count and the tweet count for
    each tracked account.

    Counts use ``estimated_document_count`` and ``count_documents`` because
    ``Collection.count`` and ``Cursor.count`` are deprecated since PyMongo 3.7
    and removed in PyMongo 4.

    Returns:
        The rendered "stats.html" template, or a redirect to the index page
        if anything fails (the error is logged).

    """
    from collections import Counter

    try:
        collection = mongo.create_collection()
        # Unfiltered total: the metadata-based estimate stands in for the old
        # count() call with no filter.
        data = {
            'All Tweets': f'{collection.estimated_document_count():,}',
        }
        accounts = {
            'slpng_giants',
            'slpng_giants_be',
            'slpng_giants_bg',
            'slpng_giants_br',
            'slpng_giants_ca',
            'slpng_giants_ch',
            'slpng_giants_de',
            'slpng_giants_es',
            'slpng_giants_eu',
            'slpng_giants_fr',
            'slpng_giants_it',
            'slpng_giants_nl',
            'slpng_giants_no',
            'slpng_giants_nz',
            'slpng_giants_oz',
            'slpng_giants_se',
        }
        for account in sorted(accounts):
            account_total = collection.count_documents(
                {'user.screen_name': account})
            data[f'@{account}'] = f'{account_total:,}'

        # NOTE: per-day counting is disabled below, so the counter stays empty
        # and date_count is currently always an empty string.
        counter = Counter()
        # for tweet in collection.find({"user.screen_name": 'slpng_giants'}):
        #     try:
        #         d = tweet['created_at']
        #         counter[f'Date({d.year}, {d.month - 1}, {d.day})'] += 1
        #     except Exception:
        #         continue
        date_count = sorted([f'[ new {k}, {v} ],' for k, v in counter.items()])
        return render_template('stats.html',
                               title='Stats',
                               data=data,
                               date_count='\n'.join(date_count))
    except Exception:
        TwiLogger.exception('TwiBrowser stats error: ')
        return redirect(url_for('index'))
Beispiel #6
0
def create_collection(db_name='slpng_giants', collection_name='tweets'):
    """
    Return the named collection of the given database, creating its indexes
    if the collection does not exist yet. An existing collection is returned
    early, without touching its indexes.

    Args:
        db_name (str): Database name
        collection_name (str): Collection name

    Returns:
        Collection: The existing or newly indexed collection. None if the
            client is not connected or an error occurs (the error is logged).

    """
    # Fields that get a plain single-field index, in creation order.
    plain_index_fields = (
        'created_at',
        'retweet_count',
        'favorite_count',
        'in_reply_to_status_id',
        'in_reply_to_user_id',
        'in_reply_to_screen_name',
        'entities.hashtags',
        'user.created_at',
        'user.screen_name',
        'user.id',
        'user.followers_count',
        'user.favourites_count',
        'user.verified',
        'user.statuses_count',
    )
    try:
        client = MongoClient()
        if not is_connected(client):
            return
        database = client[db_name]
        already_exists = collection_name in database.list_collection_names()
        tweets = database[collection_name]
        if already_exists:
            return tweets
        tweets.create_index('id', unique=True)
        for field_name in plain_index_fields:
            tweets.create_index(field_name)
        tweets.create_index([('full_text', TEXT)],
                            default_language='english')
        return tweets
    except Exception:
        TwiLogger.exception('Unable to connect to MongoDB: ')
        return
Beispiel #7
0
def backfill(path=None, db_name='slpng_giants', collection_name='tweets'):
    """
    Load recorded stream files from disk into a MongoDB collection.

    Searches the save directory recursively for files matching "*.t*" and
    processes only those whose parent directory is named "stream". Each line
    is parsed as JSON, converted with the stream-to-search and timestamp
    helpers, and upserted by tweet ID. Lines that fail to parse are logged and
    skipped, as are delete notices. Progress and an estimate of the remaining
    time are logged after each processed file.

    Args:
        path (str): Directory to read from. Defaults to the configured
            "output_dir".
        db_name (str): Database name
        collection_name (str): Collection name

    """
    tweets = create_collection(db_name, collection_name)
    if tweets is None:
        # Without a collection every upsert would fail and be reported as an
        # unreadable file, so stop here with one clear message instead.
        TwiLogger.info('Backfill: No MongoDB collection available, aborting.')
        return

    config = Config.get()
    save_dir = os.path.expanduser(path or config['output_dir'])

    file_paths = glob.glob(
        os.path.join(save_dir, '**', '*.t*'), recursive=True)
    total = len(file_paths)
    t0 = datetime.now()
    for idx, file_path in enumerate(file_paths):
        # Only files recorded by the streaming listener are backfilled.
        if os.path.basename(os.path.dirname(file_path)) != 'stream':
            continue
        try:
            for lidx, line in enumerate(utils.readlines(file_path)):
                try:
                    data = json.loads(line)
                except Exception:
                    TwiLogger.exception(
                        f'Backfill: Unable to read line {file_path}:{lidx + 1}')
                    continue
                if data.get('delete'):
                    continue
                data = utils.stream_to_search(data)
                data = utils.timestamp_to_datetime(data)
                tweets.replace_one({'id': data['id']}, data, upsert=True)
            # The estimate averages over all paths seen so far, skipped ones
            # included.
            t_delta = datetime.now() - t0
            average = t_delta / (idx + 1)
            remaining = str((total - (idx + 1)) * average).split('.')[0]

            TwiLogger.info(f'{idx + 1}/{total} '
                           f'{remaining} '
                           f'{os.sep.join(file_path.split(os.sep)[-3:])}')
        except Exception:
            TwiLogger.exception(f'Backfill: Unable to read file: {file_path}')
Beispiel #8
0
    def add(self, query):
        """
        Queue the given query on the queue belonging to its end point.

        Nothing is added if an equal query is already waiting in that queue,
        or if the thread registered for the end point is currently working on
        an equal query. Both cases are logged.

        Args:
            query (BaseQuery): Query object

        """
        endpoint = query.endpoint
        pending = self.get_queue(endpoint)
        if query in pending.queue:
            TwiLogger.info(f'Query with ID {query.uid} is already in the queue.')
            return
        worker = self.threads.get(endpoint)
        is_running = worker and worker.query == query
        if is_running:
            TwiLogger.info(f'Query with ID {query.uid} is already running.')
            return
        pending.put(query)
        TwiLogger.info(query)
Beispiel #9
0
 def run(self):
     """
     Fetches queries from the queue and executes them until a None sentinel
     is received.

     Each query is run repeatedly until it reports being done, with its log
     output written after every successful run. If a run raises, the
     traceback is logged and the query is abandoned. Every item taken from
     the queue, the sentinel included, is marked as done.
     """
     while True:
         self._query = self.queue.get()
         # NOTE(review): self.query presumably exposes self._query; confirm.
         if self.query is None:
             TwiLogger.info(f'Terminating thread "{self.name}"')
             # The sentinel was taken with get() too; acknowledge it so that
             # a queue.join() elsewhere does not block forever.
             self.queue.task_done()
             break
         while not self.query.done:
             try:
                 self.query.run()
             except Exception:
                 import traceback
                 TwiLogger.exception(traceback.format_exc())
                 break
             TwiLogger.info(self.query.fetch_log())
             time.sleep(.2)
         time.sleep(.5)
         self.queue.task_done()
Beispiel #10
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import time

from datetime import datetime
from queue import Queue
from threading import Thread

from twicorder.utils import TwiLogger

logger = TwiLogger()


class RateLimitCentral:
    """
    Class keeping track of end points and their rate limits.
    """
    _limits = {}

    @classmethod
    def update(cls, endpoint, header):
        """
        Update endpoint with latest rate limit information.

        Args:
            endpoint (str): Endpoint
            header (dict): Query response header

        """
        limit_keys = {
Beispiel #11
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from twicorder.web.browser import app
from twicorder.utils import TwiLogger

def main():
    """
    Run the browser app on localhost, logging any exception that escapes it.
    """
    try:
        app.run('localhost')
    except Exception:
        TwiLogger.exception('TwiBrowser Error: ')


if __name__ == '__main__':
    main()
Beispiel #12
0
 def track(self):
     """
     Return the list of terms to track.

     Empty entries in the configured "track" list are dropped. If any terms
     remain and ``follow_also_tracks`` is set, the screen names of followed
     users are appended. The resulting list is logged.

     Returns:
         list: Terms to track, or None if no terms are configured

     """
     track_list = [t for t in self.config.get('track') or [] if t] or None
     if track_list and self.follow_also_tracks:
         track_list += self.id_to_screenname.values()
     # Log a single formatted string, as the other log calls do. The list was
     # previously passed as a format argument with no placeholder for it.
     TwiLogger.info(f'Tracking: {track_list}')
     return track_list