def consumer_queue(q):
    """Consume tweet JSON objects from queue `q` and parse them into the
    database. A 'STOP' sentinel terminates the worker; it is re-queued so
    that sibling workers also stop.
    """
    engine = create_engine(
        URL(**CONF['database']['connect_args']),
        pool_size=1,
        pool_recycle=CONF['database']['pool_recycle'],
        client_encoding='utf8')
    Session = scoped_session(sessionmaker(bind=engine))
    session = Session()
    parser = Parser(
        session,
        platform_id=1,
        saved_tweet=True,
        file_save_null_byte_tweet='null_byte_tweet.txt')
    # IDs of tweets that raise exceptions are appended to this file
    e_fp = open('exception_tweet.txt', 'w')
    while True:
        jd = q.get()
        if jd == 'STOP':
            logger.info('Task finished!')
            # put the sentinel back so other consumers also stop
            q.put('STOP')
            break
        try:
            parser.parse(jd)
        except Exception as e:
            logger.warning('Exception: %s, when parsing: %s', e, jd['id'])
            # tweet ids are integers; format before writing to the text file
            e_fp.write('%s\n' % jd['id'])
    e_fp.close()
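# Illustrative sketch (assumption, not part of the original code) of wiring
# consumer_queue to a multiprocessing producer/consumer pipeline. The file
# path and worker count are hypothetical; a single 'STOP' sentinel is enough
# because consumer_queue re-queues it before exiting.
def _example_run_consumer_queue(filepath='tweets.json', n_workers=2):
    import json
    from multiprocessing import Process, Queue

    q = Queue(maxsize=10000)
    workers = [Process(target=consumer_queue, args=(q,))
               for _ in range(n_workers)]
    for w in workers:
        w.start()
    # producer: one decoded tweet per non-empty line, then the sentinel
    with open(filepath) as f:
        for line in f:
            line = line.strip()
            if line:
                q.put(json.loads(line))
    q.put('STOP')
    for w in workers:
        w.join()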
def main_test(engine, session, min_id=None, max_id=None, window_size=1000,
              drop_first=False):
    """Parse tweets stored in the `tweet` table, walking ids from `min_id`
    to `max_id` in windows of `window_size`.
    """
    parser = Parser(
        session,
        platform_id=1,
        saved_tweet=True,
        file_save_null_byte_tweet='null_byte_tweet.txt')
    # init tables
    init_tables(engine, drop_first)
    if min_id is None:
        q = """
            SELECT tw.id
            FROM tweet AS tw
            JOIN twitter_network_edge AS te ON te.tweet_raw_id=tw.raw_id
            ORDER BY te.id DESC LIMIT 1"""
        min_id = engine.execute(text(q)).scalar()
        if min_id is None:
            min_id = 0
    if max_id is None:
        q = """ SELECT MAX(id) FROM tweet"""
        max_id = engine.execute(text(q)).scalar()
        if max_id is None:
            logger.error('No data in tweet table!')
            return None
    w_open_left = min_id
    w_close_right = min_id + window_size
    counter = min_id
    while True:
        logger.info('Current parsing tweet id is %s ...', counter)
        q = """
            SELECT tw.json_data
            FROM tweet AS tw
            WHERE tw.id>:l AND tw.id<=:r
            ORDER BY tw.id
            """
        if w_close_right > max_id:
            w_close_right = max_id
        if w_open_left >= max_id:
            logger.info('Max tweet id reached, Done!')
            break
        for jd, in engine.execute(
                text(q).bindparams(l=w_open_left, r=w_close_right)):
            try:
                parser.parse(jd)
            except Exception:
                logger.error('Tweet raw id is: %s', jd['id'])
                raise
            counter += 1
        w_open_left = w_close_right
        w_close_right += window_size
    if parser.fp:
        parser.fp.close()
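# Illustrative sketch (assumption, not part of the original code) of how
# main_test could be invoked, reusing the same CONF-based engine setup as
# consumer_queue. The window size and the choice not to drop tables are
# hypothetical.
def _example_run_main_test():
    engine = create_engine(
        URL(**CONF['database']['connect_args']),
        pool_size=1,
        pool_recycle=CONF['database']['pool_recycle'],
        client_encoding='utf8')
    Session = scoped_session(sessionmaker(bind=engine))
    session = Session()
    # walk the whole tweet table in windows of 5000 ids, keeping tables
    main_test(engine, session, window_size=5000, drop_first=False)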
def reparse_db(cls, session, args):
    """Re-parse tweets that are already stored in the database.

    The tweets to re-parse are selected by the plain SQL query given in
    `args['--plain-sql']`; existing derived rows in the tables listed in
    `args['--delete-tables']` are deleted first, then the raw JSON is fed
    back through the parser in buckets of `args['--window-size']`.
    """

    def iter_rows_0(rs):
        """Return an iterable over row[0] for each row in rs."""
        for row in rs:
            yield row[0]

    parser = Parser()
    bucket_size = args['--window-size']
    plain_sql = args['--plain-sql']
    delete_tables = args['--delete-tables']
    ignore_tables = args['--ignore-tables']
    counter = 0
    table_deletes_sql = dict(
        ass_tweet="""\
            DELETE FROM ass_tweet AS atw
            USING tweet AS tw, UNNEST(:ids) AS t(raw_id)
            WHERE tw.raw_id=t.raw_id AND atw.id=tw.id
            """,
        ass_tweet_url="""\
            DELETE FROM ass_tweet_url AS atu
            USING tweet AS tw, UNNEST(:ids) AS t(raw_id)
            WHERE tw.raw_id=t.raw_id AND atu.id=tw.id
            """,
        ass_tweet_hashtag="""\
            DELETE FROM ass_tweet_hashtag AS ath
            USING tweet AS tw, UNNEST(:ids) AS t(raw_id)
            WHERE tw.raw_id=t.raw_id AND ath.id=tw.id
            """,
        twitter_network_edge="""\
            DELETE FROM twitter_network_edge AS tne
            USING UNNEST(:ids) AS t(raw_id)
            WHERE tne.tweet_raw_id=t.raw_id
            """)
    # validate the requested deletions before touching anything
    for tn in delete_tables:
        del_tn = table_deletes_sql.get(tn)
        if del_tn is None:
            raise ValueError('Unsupported deletion of table %s' % tn)
    platform_id = get_platform_id(session, N_PLATFORM_TWITTER)
    logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)
    rs = session.execute(text(plain_sql))
    affected_ids = [row[0] for row in rs]
    logger.info('Total number of tweets to reparse: %s', len(affected_ids))
    w_query = """\
        SELECT tw.json_data AS jd
        FROM UNNEST(:ids) AS t(raw_id)
        JOIN tweet AS tw ON tw.raw_id=t.raw_id
        """
    for chunk in chunked_iterable(affected_ids, bucket_size):
        for tn in delete_tables:
            del_tn = table_deletes_sql[tn]
            try:
                session.execute(text(del_tn).bindparams(ids=chunk))
                session.commit()
                logger.info('Table %s deleted successfully!', tn)
            except SQLAlchemyError as err:
                logger.exception(err)
                raise
        rs = session.execute(text(w_query).bindparams(ids=chunk))
        jds = iter_rows_0(rs)
        parser.bulk_parse_and_save(
            jds,
            session,
            platform_id,
            multiprocesses=True,
            ignore_tables=ignore_tables)
        counter += len(chunk)
        logger.info('Current number of reparsed tweets: %s', counter)
    logger.info('Total number of reparsed tweets: %s!', counter)
    logger.info('Reparse done, exit!')
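# reparse_db relies on a chunked_iterable helper to split affected_ids into
# fixed-size pieces. That helper is provided elsewhere in the project; the
# function below is only a sketch of the usual itertools-based pattern and is
# an assumption, not the project's actual implementation.
def _chunked_iterable_sketch(iterable, size):
    """Yield successive tuples of at most `size` items from `iterable`."""
    import itertools
    it = iter(iterable)
    while True:
        chunk = tuple(itertools.islice(it, size))
        if not chunk:
            break
        yield chunk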
def load_tweets(cls, session, args):
    """Load tweets from file into database. """
    parser = Parser()
    ntweets = args['--number-of-tweets']
    strict_on_error = args['--strict-on-error']
    true_counter = 0
    counter = 0
    jds = []
    f = xopen(args['<filepath>'])
    platform_id = get_platform_id(session, N_PLATFORM_TWITTER)
    bucket_size = args['--window-size']
    try:
        for line in f:
            counter += 1
            if line:
                try:
                    jd = json.loads(line)
                    if 'in_reply_to_status_id' in jd and 'user' in jd and\
                            'text' in jd:
                        jds.append(jd)
                        true_counter += 1
                    else:
                        logger.error('Not a tweet at line %s, raw data %r',
                                     counter, jd)
                        if strict_on_error:
                            sys.exit(1)
                        continue
                except Exception as e:
                    msg = 'JSON loads error at line %s: %r, raw data: %r'
                    logger.error(msg, counter, e, line)
                    if strict_on_error:
                        sys.exit(1)
                    continue
            else:
                logger.error('Empty line at line %s', counter)
            if ntweets is not None and ntweets == true_counter:
                logger.warning('Reached the number of tweets %s at line %s',
                               ntweets, counter)
                # break the loop
                break
            if true_counter % bucket_size == 0:
                logger.warning('Reading %s lines, %s tweets parsed', counter,
                               true_counter)
                parser.bulk_parse_and_save(jds, session, platform_id,
                                           multiprocesses=True)
                jds = []
    except Exception as err:
        logger.exception(err)
    logger.info('Saving successfully read tweets ...')
    if jds:
        parser.bulk_parse_and_save(jds, session, platform_id,
                                   multiprocesses=True)
        jds = []
    logger.warning('Reading %s lines, %s tweets parsed', counter,
                   true_counter)
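# load_tweets reads its input through xopen, which is expected to open both
# plain-text and compressed tweet dumps transparently. xopen is provided
# elsewhere; the function below is only an assumed sketch of that behaviour,
# dispatching on file extension for gzip and bzip2.
def _xopen_sketch(filepath, mode='rt'):
    """Open `filepath`, decompressing .gz/.bz2 files on the fly (sketch)."""
    import bz2
    import gzip
    if filepath.endswith('.gz'):
        return gzip.open(filepath, mode)
    if filepath.endswith('.bz2'):
        return bz2.open(filepath, mode)
    return open(filepath, mode)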
def _monitor(self):
    """Monitor the queue for incoming tweets, then parse and save them into
    the database.

    This method runs on a separate, internal thread. The thread will
    terminate if it sees a sentinel object in the queue.
    """
    # scoped_session
    # Session itself is not thread safe, so we use scoped_session.
    # Each thread uses only one scoped_session object.
    # We never delete anything from the database in this function.
    # Set expire_on_commit=False to avoid re-fetching these existing objects.
    session = Session(expire_on_commit=False)
    parser = Parser(**self.parser_kwargs)
    platform_id = get_platform_id(session, name=N_PLATFORM_TWITTER)
    has_task_done = hasattr(self.queue, 'task_done')
    while not self._stop.isSet():
        if self.is_connection_failed is True:
            self.on_db_server_down(session)
            continue
        # normal bulk insert process
        try:
            # fill the bucket
            for i in range(self.bucket_size):
                # dequeue with block=True
                jd = self.queue.get(True)
                if has_task_done is True:
                    self.queue.task_done()
                if jd is not self._sentinel:
                    self.global_counter += 1
                    self.bucket.append(jd)
                else:
                    break
            # consume this bucket
            self.consume_this_bucket(parser, session, platform_id)
            self.bucket = []
        # database is shut down unexpectedly
        except OperationalError as err:
            session.rollback()
            if 'server closed the connection unexpectedly' in repr(err) \
                    or 'could not connect to server' in repr(err):
                logger.critical('Caution: database server is down!')
                self.is_connection_failed = True
            else:
                logger.error(err)
                self.on_db_bulk_save_error()
        except SQLAlchemyError as err:
            session.rollback()
            logger.exception(err)
            self.on_db_bulk_save_error()
        except BaseException as err:
            # unexpected exception, log it (the thread will exit)
            logger.exception(err)
            raise
    # There might still be records in the queue.
    while True:
        try:
            jd = self.queue.get(False)
            if has_task_done:
                self.queue.task_done()
            if jd is self._sentinel:
                break
            self.bucket.append(jd)
        except queue.Empty:
            break
    if self.bucket:
        try:
            self.consume_this_bucket(parser, session, platform_id)
            self.bucket = []
        except SQLAlchemyError as err:
            session.rollback()
            logger.exception('Consumer thread: %s', err)
            self.on_db_bulk_save_error()
    if self._fp_db_down is not None:
        self._fp_db_down.close()
    if self._fp_db_bulk_save is not None:
        self._fp_db_bulk_save.close()
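# Self-contained sketch (assumption, not part of the original class) of the
# stop-event plus sentinel protocol that _monitor expects from its owner:
# the owner sets the stop event and enqueues the sentinel, and the consumer
# drains whatever is left before exiting. All names here are local to the
# example; only the protocol mirrors _monitor.
def _example_sentinel_consumer_pattern():
    import queue as queue_mod
    import threading

    q = queue_mod.Queue()
    stop = threading.Event()
    sentinel = object()
    seen = []

    def monitor():
        # main loop: block on the queue until stopped or the sentinel arrives
        while not stop.is_set():
            item = q.get(True)
            if item is sentinel:
                break
            seen.append(item)
        # drain remaining items, mirroring the tail loop of _monitor
        while True:
            try:
                item = q.get(False)
                if item is sentinel:
                    break
                seen.append(item)
            except queue_mod.Empty:
                break

    t = threading.Thread(target=monitor)
    t.start()
    for i in range(10):
        q.put(i)
    # shutdown: set the stop flag, then enqueue the sentinel to unblock get()
    stop.set()
    q.put(sentinel)
    t.join()
    assert seen == list(range(10))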