def query_network():
    """Handle API request '/network'.

    API Request Parameters
    ----------------------
        ids : list of int
        nodes_limit : int
        edges_limit : int
        include_user_mentions : bool

    API Response Keys
    -----------------
        status : string
        num_of_entries : int
        edges : dict
            canonical_url : string
            date_published : string formatted datetime
            domain : string
            from_user_id : string
            from_user_screen_name : string
            id : int
            is_mention : bool
            site_type : {'claim', 'fact_checking'}
            title : string
            to_user_id : string
            to_user_screen_name : string
            tweet_created_at : string formatted datetime
            tweet_id : string
            tweet_type : {'origin', 'retweet', 'quote', 'reply'}
    """
    lucene.getVMEnv().attachCurrentThread()
    q_network_schema = Schema({
        'ids': Use(flask.json.loads),
        Optional('nodes_limit', default=1000): And(Use(int), lambda i: i > 0),
        Optional('edges_limit', default=12500): And(Use(int), lambda i: i > 0),
        Optional('include_user_mentions', default=True): And(
            unicode,
            Use(lambda s: s.lower()),
            lambda s: s in ('true', 'false'),
            Use(lambda s: s == 'true')),
    })
    q_kwargs = copy_req_args(request.args)
    try:
        q_kwargs = q_network_schema.validate(q_kwargs)
        df = db_query_network(engine, **q_kwargs)
        if len(df) == 0:
            raise APINoResultError('No edge could be built!')
        response = dict(
            status='OK',
            num_of_entries=len(df),
            edges=flask.json.loads(df.to_json(**TO_JSON_KWARGS)))
    except SchemaError as e:
        response = dict(status='ERROR', error=str(e))
    except APINoResultError as e:
        response = dict(status='No result error', error=str(e))
    except Exception as e:
        logger.exception(e)
        response = dict(status='ERROR', error='Server error, query failed')
    return flask.jsonify(response)
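
# A minimal client sketch for the '/network' endpoint above. The helper name,
# the base URL, and the `requests` dependency are assumptions for illustration
# only; the request parameters mirror the schema validated in query_network().
def example_network_request(ids, base_url='http://localhost:5000'):
    """Fetch the diffusion network for `ids` from a running API instance."""
    import json

    import requests

    r = requests.get(
        base_url + '/network',
        params={
            # 'ids' must be a JSON-encoded list, matching Use(flask.json.loads)
            'ids': json.dumps(ids),
            'nodes_limit': 500,
            'edges_limit': 5000,
            'include_user_mentions': 'true',
        })
    return r.json()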
def test_api_flow(n1=N1,
                  n2=N2,
                  min_score=MIN_SCORE,
                  min_date_published=STRAMING_START_AT,
                  query='',
                  sort_by='relevant',
                  use_lucene_syntax=False):
    try:
        t0 = time.time()
        logger.debug('Starting lucene query %r at %r', query, t0)
        # Pass the function parameters through, rather than the module-level
        # constants, so callers can actually override them.
        n, df = searcher.search(
            n1=n1,
            n2=n2,
            min_score_of_recent_sorting=min_score,
            min_date_published=min_date_published,
            query=query,
            sort_by=sort_by,
            use_lucene_syntax=use_lucene_syntax)
        t1 = time.time()
        logger.debug('Starting filtering disabled sites at %r', t1)
        df = db_query_filter_disabled_site(engine, df)
        t2 = time.time()
        logger.debug('Starting querying number of tweet shares at %r', t2)
        df = db_query_twitter_shares(engine, df)
        if len(df) == 0:
            raise APINoResultError('No article found!')
        # sort dataframe by 'number_of_tweets' before picking the top 10
        df = df.sort_values('number_of_tweets', ascending=False)
        t3 = time.time()
        ids = df.iloc[:10].id.tolist()
        logger.debug('Starting querying network %s by old api at %r', ids, t3)
        df = db_query_network_old(engine, ids=ids)
        t4 = time.time()
        logger.debug('Starting querying network %s by new api at %r', ids, t4)
        df = db_query_network(engine, ids=ids)
        t5 = time.time()
    except Exception as e:
        logger.error(e)
        return None
    return dict(
        t0_lucene_query=(t1 - t0),
        t1_article_filtering=(t2 - t1),
        t2_article_sharing=(t3 - t2),
        t3_network_building_old=(t4 - t3),
        t4_network_building_new=(t5 - t4))
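
# A small driver sketch for the benchmark above, assuming pandas is available:
# run test_api_flow() over a few queries and tabulate per-stage timings. The
# helper name and the query strings are placeholders for illustration.
def run_benchmarks(queries=('some query', 'another query')):
    import pandas as pd

    rows = []
    for q in queries:
        timings = test_api_flow(query=q)
        if timings is not None:
            timings['query'] = q
            rows.append(timings)
    if not rows:
        return None
    # One row per query; columns are the t0..t4 stage timings in seconds.
    return pd.DataFrame(rows).set_index('query')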
from hoaxy.database import ENGINE as engine
from hoaxy.ir.search import db_query_network
import logging

logger = logging.getLogger(__name__)

gids = [1, 2, 3, 4, 5]
df1 = db_query_network(engine, ids=gids)
# df2 = db_query_network_old(engine, ids=gids)

# Number of unweighted retweet edges, i.e. one edge per distinct
# (tweet, from_user, to_user) triple.
unweighted_retweet_cols = ['tweet_id', 'from_user_id', 'to_user_id']
logger.info('New api, number of unweighted retweet edges is %s',
            len(df1[unweighted_retweet_cols].drop_duplicates()))
# logger.info('Old api, number of unweighted retweet edges is %s',
#             len(df2[unweighted_retweet_cols].drop_duplicates()))

# Build comparable edge sets, excluding 'url_id', which the two APIs
# are not expected to agree on.
cols = list(df1.columns.values)
cols.remove('url_id')
s1 = set(tuple(x) for x in df1[cols].values)
# s2 = set(tuple(x) for x in df2[cols].values)
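
# A sketch of the remaining old-vs-new check, assuming db_query_network_old is
# still importable: after uncommenting the df2/s2 lines above, the set
# differences show exactly which edges disagree (url_id already excluded).
# s1_only = s1 - s2
# s2_only = s2 - s1
# logger.info('Edges only in new api: %s', len(s1_only))
# logger.info('Edges only in old api: %s', len(s2_only))
# assert s1 == s2, 'new and old api disagree on %s edges' % (
#     len(s1_only) + len(s2_only))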