def saver_queue(q2, number_of_workers):
    """Consume parsed user records from q2 and upsert them until every worker
    has reported STOP."""
    engine = create_engine(
        URL(**CONF['database']['connect_args']),
        pool_size=1,
        pool_recycle=CONF['database']['pool_recycle'],
        client_encoding='utf8')
    Session = scoped_session(sessionmaker(bind=engine))
    session = Session()
    # One status flag per worker; a flag is cleared when that worker's STOP
    # sign arrives on q2.
    workers_status = [1 for i in range(number_of_workers)]
    while True:
        pid, status, uusers = q2.get()
        if status == 'STOP':
            logger.info(
                'Saver process: STOP sign of worker process %s received from q2',
                pid)
            workers_status[pid] = 0
            if sum(workers_status) == 0:
                logger.warning('All STOP signs received from q2.')
                logger.warning('Data saving task done!')
                break
        else:
            logger.info('Saver process: size of uusers is %s', len(uusers))
            # Skip empty batches to avoid an INSERT statement with no values.
            if uusers:
                stmt_do_nothing = insert(TwitterUserUnion).values(
                    uusers).on_conflict_do_nothing(index_elements=['raw_id'])
                session.execute(stmt_do_nothing)
                session.commit()
def run(cls, args):
    """Validate command-line args and dispatch to Lucene indexing or
    searching."""
    try:
        # print(args)
        args = cls.args_schema.validate(args)
    except SchemaError as e:
        sys.exit(e)
    session = Session()
    # Make sure the Lucene VM is initialized and attached to the current thread.
    lucene.initVM()
    lucene.getVMEnv().attachCurrentThread()
    if args['--index'] is True:
        configure_logging(
            'lucene.index', console_level=args['--console-log-level'])
        mgid = get_or_create_m(
            session,
            MetaInfo,
            data=dict(
                name='article_group_id_lucene_index',
                value='0',
                value_type='int',
                description='article.group_id used for lucene index'),
            fb_uk='name')
        if args['--mode'] == 'create':
            mgid.set_value(0)
            session.commit()
        logger.debug('Indexing started.. Getting articles..')
        q = """
        SELECT DISTINCT ON (a.group_id) a.id, a.group_id,
            a.canonical_url,
            a.title,
            a.meta,
            a.content,
            coalesce(a.date_published, a.date_captured) AS pd,
            s.domain,
            s.site_type
        FROM article AS a
            JOIN site AS s ON s.id=a.site_id
        WHERE a.site_id IS NOT NULL AND s.is_enabled IS TRUE
            AND a.group_id>:gid
        ORDER BY group_id, pd ASC
        """
        articles_iter = session.execute(
            sqlalchemy.text(q).bindparams(gid=mgid.get_value()))
        cls.index(session, args['--mode'], articles_iter, mgid)
    elif args['--search'] is True:
        configure_logging(
            'lucene.search', console_level=args['--console-log-level'])
        cls.search(args['--query'], args['--top'])
    else:
        print("Unrecognized command!")
        sys.exit(2)
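# A hypothetical invocation sketch (not part of the original source): run()
# expects a docopt-style args mapping whose exact shape is defined by
# cls.args_schema. The owning class name `LuceneCmd`, the key values shown
# here, and the 'info' log level are assumptions for illustration only.
if __name__ == '__main__':
    example_args = {
        '--index': True,
        '--search': False,
        '--mode': 'create',  # 'create' is checked in run(); other modes depend on args_schema
        '--query': None,
        '--top': None,
        '--console-log-level': 'info',
    }
    LuceneCmd.run(example_args)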
def workers_queue(pid, q1, q2):
    """Receive parameters from q1, compute, and finally put the results into
    q2."""
    engine = create_engine(
        URL(**CONF['database']['connect_args']),
        pool_size=1,
        pool_recycle=CONF['database']['pool_recycle'],
        client_encoding='utf8')
    Session = scoped_session(sessionmaker(bind=engine))
    session = Session()
    parser = BulkParser(platform_id=1, save_none_url_tweet=True)
    while True:
        try:
            data = q1.get(timeout=1)
        except Empty:
            logger.info('Worker process %s: queue has been empty for 1 second',
                        pid)
            q2.put((pid, 'STOP', None))
            break
        if data == 'STOP':
            logger.info('Worker process %s: STOP sign received from q1!', pid)
            # Re-enqueue the sentinel so the other workers also see it.
            q1.put('STOP')
            q2.put((pid, 'STOP', None))
            break
        else:
            logger.info('Worker process %s: data=%s received', pid, data)
            w_open_left, w_close_right = data
            jds = dict()
            g_urls_map = dict()
            query = """
            SELECT tw.id, tw.json_data, u.id, u.raw
            FROM tweet AS tw
            LEFT JOIN ass_tweet_url AS atu ON atu.tweet_id=tw.id
            LEFT JOIN url AS u ON u.id=atu.url_id
            WHERE tw.id>:l AND tw.id<=:r
            """
            for tw_id, jd, url_id, url in engine.execute(
                    text(query).bindparams(l=w_open_left, r=w_close_right)):
                jds[tw_id] = jd
                if url_id is not None:
                    g_urls_map[url] = url_id
            g_uusers_set = set()
            g_edges_set = set()
            for tw_id, jd in jds.items():
                parser.parse_existed_one(
                    tw_id,
                    jd,
                    session,
                    g_urls_map=g_urls_map,
                    g_uusers_set=g_uusers_set,
                    g_edges_set=g_edges_set)
            edges = [
                dict(
                    tweet_raw_id=t0,
                    from_raw_id=t1,
                    to_raw_id=t2,
                    url_id=t3,
                    is_quoted_url=t4,
                    is_mention=t5,
                    tweet_type=t6)
                for t0, t1, t2, t3, t4, t5, t6 in g_edges_set if t3 != -1
            ]
            uusers = [
                dict(raw_id=t1, screen_name=t2) for t1, t2 in g_uusers_set
            ]
            # session.bulk_insert_mappings(TwitterNetworkEdge, edges)
            # Skip empty batches to avoid an INSERT statement with no values.
            if edges:
                stmt_do_nothing = insert(TwitterNetworkEdge).values(
                    edges).on_conflict_do_nothing(index_elements=[
                        'tweet_raw_id', 'from_raw_id', 'to_raw_id', 'url_id',
                        'is_quoted_url', 'is_mention', 'tweet_type'
                    ])
                session.execute(stmt_do_nothing)
                session.commit()
            q2.put((pid, 'RUN', uusers))
            logger.info('Worker process %s: tweets from %s to %s done', pid,
                        w_open_left + 1, w_close_right)
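# A minimal wiring sketch (not part of the original module) showing how the
# two queue functions above might be run together: q1 carries half-open
# (w_open_left, w_close_right] tweet-id windows plus a final 'STOP' sentinel,
# and q2 carries per-window results consumed by saver_queue. The function
# name, the id bounds, the window size, and the worker count are assumptions
# for illustration only.
def run_pipeline_sketch(min_id, max_id, number_of_workers=4, window=10000):
    from multiprocessing import Process, Queue

    q1 = Queue()
    q2 = Queue()
    # Enqueue the id windows; they match the worker's WHERE tw.id>:l AND tw.id<=:r filter.
    for left in range(min_id, max_id, window):
        q1.put((left, min(left + window, max_id)))
    # Single sentinel: each worker that receives it re-enqueues it before exiting.
    q1.put('STOP')
    workers = [
        Process(target=workers_queue, args=(pid, q1, q2))
        for pid in range(number_of_workers)
    ]
    saver = Process(target=saver_queue, args=(q2, number_of_workers))
    for p in workers:
        p.start()
    saver.start()
    for p in workers:
        p.join()
    saver.join()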