import json
import logging
import queue
import sys

from sqlalchemy import create_engine, text
from sqlalchemy.engine.url import URL
from sqlalchemy.exc import OperationalError, SQLAlchemyError
from sqlalchemy.orm import scoped_session, sessionmaker

# Project-specific names used in these examples (Parser, CONF, Session,
# init_tables, get_platform_id, N_PLATFORM_TWITTER, chunked_iterable,
# xopen) are assumed to be importable from the surrounding package.
logger = logging.getLogger(__name__)


def consumer_queue(q):
    engine = create_engine(URL(**CONF['database']['connect_args']),
                           pool_size=1,
                           pool_recycle=CONF['database']['pool_recycle'],
                           client_encoding='utf8')
    Session = scoped_session(sessionmaker(bind=engine))
    session = Session()
    parser = Parser(session,
                    platform_id=1,
                    saved_tweet=True,
                    file_save_null_byte_tweet='null_byte_tweet.txt')
    e_fp = open('exception_tweet.txt', 'w')
    while True:
        jd = q.get()
        if jd == 'STOP':
            logger.info('Task finished!')
            # Put the sentinel back so sibling consumers also stop.
            q.put('STOP')
            break
        try:
            parser.parse(jd)
        except Exception as e:
            logger.warning('Exception: %s, when parsing: %s', e, jd['id'])
            # Tweet ids are integers; format before writing.
            e_fp.write('%s\n' % jd['id'])
    e_fp.close()
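
consumer_queue is written to run as a worker process: a producer feeds decoded
tweets into a shared queue and finally enqueues the 'STOP' sentinel, which each
worker puts back so its siblings terminate too. A minimal driver sketch (the
input path and the worker count of 4 are illustrative, not from the original
code):

from multiprocessing import Process, Queue

def produce(q, fp):
    # Feed decoded tweets to the workers, then enqueue the sentinel once;
    # consumer_queue re-enqueues it so every worker eventually sees it.
    for line in fp:
        q.put(json.loads(line))
    q.put('STOP')

if __name__ == '__main__':
    q = Queue(maxsize=10000)
    workers = [Process(target=consumer_queue, args=(q,)) for _ in range(4)]
    for w in workers:
        w.start()
    with open('tweets.json') as fp:
        produce(q, fp)
    for w in workers:
        w.join()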
def main_test(engine,
              session,
              min_id=None,
              max_id=None,
              window_size=1000,
              drop_first=False):
    parser = Parser(session,
                    platform_id=1,
                    saved_tweet=True,
                    file_save_null_byte_tweet='null_byte_tweet.txt')
    # init tables
    init_tables(engine, drop_first)
    if min_id is None:
        # Resume from the newest tweet that already has a parsed network
        # edge; fall back to 0 when the edge table is empty.
        q = """ SELECT tw.id
                FROM tweet AS tw
                JOIN twitter_network_edge AS te ON te.tweet_raw_id=tw.raw_id
                ORDER BY te.id DESC LIMIT 1"""
        min_id = engine.execute(text(q)).scalar()
        if min_id is None:
            min_id = 0
    if max_id is None:
        q = """ SELECT MAX(id) FROM tweet"""
        max_id = engine.execute(text(q)).scalar()
        if max_id is None:
            max_id = 0
            logger.error('No data in tweet table!')
            return None
    w_open_left = min_id
    w_close_right = min_id + window_size
    counter = min_id
    while True:
        logger.info('Currently parsing tweet id %s ...', counter)
        q = """
            SELECT tw.json_data
            FROM tweet AS tw
            WHERE tw.id>:l AND tw.id<=:r
            ORDER BY tw.id
            """
        if w_close_right > max_id:
            w_close_right = max_id
        if w_open_left >= max_id:
            logger.info('Max tweet id reached, Done!')
            break
        for jd, in engine.execute(
                text(q).bindparams(l=w_open_left, r=w_close_right)):
            try:
                parser.parse(jd)
            except Exception:
                logger.error('Tweet raw id is: %s', jd['id'])
                raise
            counter += 1
        w_open_left = w_close_right
        w_close_right += window_size
    if parser.fp:
        parser.fp.close()
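
The while loop above walks the tweet table in fixed-size id windows instead of
streaming one huge result set. The same pattern can be factored into a small
generator; this is a sketch of the idea, not code from the project:

def id_windows(min_id, max_id, size):
    # Yield half-open (left, right] id ranges covering (min_id, max_id].
    left = min_id
    while left < max_id:
        right = min(left + size, max_id)
        yield left, right
        left = right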
Example #3
    def reparse_db(cls, session, args):
        """Load tweets from file into database.
        """
        def iter_rows_0(rs):
            """Return iterable for row[0] in rs"""
            for row in rs:
                yield row[0]

        parser = Parser()
        # CLI arguments arrive as strings; coerce the window size.
        bucket_size = int(args['--window-size'])
        plain_sql = args['--plain-sql']
        delete_tables = args['--delete-tables']
        ignore_tables = args['--ignore-tables']
        counter = 0
        table_deletes_sql = dict(
            ass_tweet="""\
                DELETE FROM ass_tweet AS atw
                USING tweet AS tw, UNNEST(:ids) AS t(raw_id)
                WHERE tw.raw_id=t.raw_id AND atw.id=tw.id
                """,
            ass_tweet_url="""\
                DELETE FROM ass_tweet_url AS atu
                USING tweet AS tw, UNNEST(:ids) AS t(raw_id)
                WHERE tw.raw_id=t.raw_id AND atu.id=tw.id
                """,
            ass_tweet_hashtag="""\
                DELETE FROM ass_tweet_hashtag AS ath
                USING tweet AS tw, UNNEST(:ids) AS t(raw_id)
                WHERE tw.raw_id=t.raw_id AND ath.id=tw.id
                """,
            twitter_network_edge="""\
                DELETE FROM twitter_network_edge AS tne
                USING UNNEST(:ids) AS t(raw_id)
                WHERE tne.tweet_raw_id=t.raw_id
                """)

        for tn in delete_tables:
            if tn not in table_deletes_sql:
                raise ValueError('Unsupported deletion of table %s' % tn)
        platform_id = get_platform_id(session, N_PLATFORM_TWITTER)
        logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)
        rs = session.execute(text(plain_sql))
        affected_ids = [row[0] for row in rs]
        logger.info('Total number of tweets to reparse: %s', len(affected_ids))
        w_query = """\
        SELECT tw.json_data AS jd
        FROM UNNEST(:ids) AS t(raw_id)
        JOIN tweet AS tw ON tw.raw_id=t.raw_id
        """
        for chunk in chunked_iterable(affected_ids, bucket_size):
            for tn in delete_tables:
                del_tn = table_deletes_sql[tn]
                try:
                    session.execute(text(del_tn).bindparams(ids=chunk))
                    session.commit()
                    logger.info('Table %s deleted successfully!', tn)
                except SQLAlchemyError as err:
                    logger.exception(err)
                    raise
            rs = session.execute(text(w_query).bindparams(ids=chunk))
            jds = iter_rows_0(rs)
            parser.bulk_parse_and_save(jds,
                                       session,
                                       platform_id,
                                       multiprocesses=True,
                                       ignore_tables=ignore_tables)
            counter += len(chunk)
            logger.info('Current Number of reparsed tweets: %s', counter)
        logger.info('Total number of reparsed tweets: %s!', counter)
        logger.info('Reparse done, exit!')
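
reparse_db depends on a chunked_iterable helper that is not shown in this
example. The standard itertools.islice recipe below matches how it is used
here (a sketch, not necessarily the project's own implementation):

import itertools

def chunked_iterable(iterable, size):
    # Yield successive tuples with at most `size` items from `iterable`.
    it = iter(iterable)
    while True:
        chunk = tuple(itertools.islice(it, size))
        if not chunk:
            break
        yield chunk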
Example #4
    def load_tweets(cls, session, args):
        """Load tweets from file into database.
        """
        parser = Parser()
        # CLI arguments arrive as strings; coerce the numeric ones.
        ntweets = args['--number-of-tweets']
        if ntweets is not None:
            ntweets = int(ntweets)
        bucket_size = int(args['--window-size'])
        strict_on_error = args['--strict-on-error']
        true_counter = 0
        counter = 0
        jds = []
        f = xopen(args['<filepath>'])
        platform_id = get_platform_id(session, N_PLATFORM_TWITTER)
        try:
            for line in f:
                counter += 1
                if line:
                    try:
                        jd = json.loads(line)
                        if 'in_reply_to_status_id' in jd and 'user' in jd \
                                and 'text' in jd:
                            jds.append(jd)
                            true_counter += 1
                        else:
                            logger.error('Not a tweet at line %s, raw data %r',
                                         counter, jd)
                            if strict_on_error:
                                sys.exit(1)
                            continue
                    except Exception as e:
                        msg = 'JSON loads error at line %s: %r, raw data: %r'
                        logger.error(msg, counter, e, line)
                        if strict_on_error:
                            sys.exit(1)
                        continue
                else:
                    logger.error('Empty line at line %s', counter)
                if ntweets is not None and ntweets == true_counter:
                    logger.warning(
                        'Reached the requested number of tweets %s at line %s',
                        ntweets, counter)
                    break
                # flush a full bucket
                if jds and true_counter % bucket_size == 0:
                    logger.warning('Reading %s lines, %s tweets parsed',
                                   counter, true_counter)
                    parser.bulk_parse_and_save(jds,
                                               session,
                                               platform_id,
                                               multiprocesses=True)
                    jds = []
        except Exception as err:
            logger.exception(err)
            logger.info('Saving successfully read tweets ...')
        # save whatever remains in the last, possibly partial bucket
        if jds:
            parser.bulk_parse_and_save(jds,
                                       session,
                                       platform_id,
                                       multiprocesses=True)
            jds = []
            logger.warning('Reading %s lines, %s tweets parsed', counter,
                           true_counter)
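
load_tweets opens its input with xopen, which is assumed to handle both plain
and compressed files transparently (the xopen package on PyPI behaves this
way). A minimal gzip-only stand-in would be:

import gzip

def xopen(path, mode='rt'):
    # Open .gz files through gzip, everything else through plain open().
    if path.endswith('.gz'):
        return gzip.open(path, mode)
    return open(path, mode)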
Example #5
    def _monitor(self):
        """Monitor the queue for tweet incoming and then parse and save it into
        the database.

        This method runs on a separate, internal thread.
        The thread will terminate if it sees a sentinel object in the queue.
        """
        # Session itself is not thread safe, so we use scoped_session:
        # each thread gets its own session object. Because this method
        # never deletes anything from the database, expire_on_commit=False
        # avoids re-fetching already-persisted objects after each commit.
        session = Session(expire_on_commit=False)
        parser = Parser(**self.parser_kwargs)
        platform_id = get_platform_id(session, name=N_PLATFORM_TWITTER)
        has_task_done = hasattr(self.queue, 'task_done')
        while not self._stop.is_set():
            if self.is_connection_failed:
                self.on_db_server_down(session)
                continue
            # normal bulk insert process
            try:
                # fill the bucket
                for i in range(self.bucket_size):
                    # dequeue with block=True
                    jd = self.queue.get(True)
                    if has_task_done:
                        self.queue.task_done()
                    if jd is not self._sentinel:
                        self.global_counter += 1
                        self.bucket.append(jd)
                    else:
                        break
                # consume this bucket
                self.consume_this_bucket(parser, session, platform_id)
                self.bucket = []
            # database is shutdown unexpectedly
            except OperationalError as err:
                session.rollback()
                msg = repr(err)
                if ('server closed the connection unexpectedly' in msg
                        or 'could not connect to server' in msg):
                    logger.critical('Caution: database server is down!')
                    self.is_connection_failed = True
                else:
                    logger.error(err)
                    self.on_db_bulk_save_error()
            except SQLAlchemyError as err:
                session.rollback()
                logger.exception(err)
                self.on_db_bulk_save_error()
            except BaseException as err:
                # unexpected exception, logging (will exit)
                logger.exception(err)
                raise
        # There might still be records in the queue.
        while True:
            try:
                jd = self.queue.get(False)
                if has_task_done:
                    self.queue.task_done()
                if jd is self._sentinel:
                    break
                self.bucket.append(jd)
            except queue.Empty:
                break
        if self.bucket:
            try:
                self.consume_this_bucket(parser, session, platform_id)
                self.bucket = []
            except SQLAlchemyError as err:
                session.rollback()
                logger.exception('Consumer thread: %s', err)
                self.on_db_bulk_save_error()
        if self._fp_db_down is not None:
            self._fp_db_down.close()
        if self._fp_db_bulk_save is not None:
            self._fp_db_bulk_save.close()
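
A thread built around _monitor is normally shut down by setting the stop event
and enqueueing the sentinel so the blocking get() call returns. A hedged
sketch of such a stop() method, assuming the thread object is stored on
self._thread (an attribute not shown above):

    def stop(self):
        # Wake the blocked get(), let _monitor drain the queue, then join.
        self._stop.set()
        self.queue.put(self._sentinel)
        self._thread.join()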