def test_query(self):
    """Request bodies produced by Vespa.query match the expected payloads.

    Covers three cases: a raw body forwarded untouched, a query built from
    a match phase plus rank profile, and the same query with a recall
    restriction added.
    """
    app = Vespa(url="http://localhost", port=8080)

    # Case 1: a raw body is passed straight through.
    raw_body = {"yql": "select * from sources * where test"}
    self.assertDictEqual(
        app.query(body=raw_body, debug_request=True).request_body,
        raw_body,
    )

    # Case 2: body assembled from the query model.
    expected = {
        "yql": 'select * from sources * where ([{"grammar": "any"}]userInput("this is a test"));',
        "ranking": {"profile": "default", "listFeatures": "false"},
        "hits": 10,
    }
    self.assertDictEqual(
        app.query(
            query="this is a test",
            query_model=Query(match_phase=OR(), rank_profile=RankProfile()),
            debug_request=True,
            hits=10,
        ).request_body,
        expected,
    )

    # Case 3: as above, plus a recall filter over document ids.
    expected_with_recall = dict(expected, recall="+(id:1 id:5)")
    self.assertDictEqual(
        app.query(
            query="this is a test",
            query_model=Query(match_phase=OR(), rank_profile=RankProfile()),
            debug_request=True,
            hits=10,
            recall=("id", [1, 5]),
        ).request_body,
        expected_with_recall,
    )
def test_query_with_body_function(self):
    """A QueryModel body_function seeds the request body, and query-time
    arguments (hits, recall) are merged on top of it."""
    app = Vespa(url="http://localhost", port=8080)

    def body_function(query):
        # Base request body built around the user query string.
        return {
            "yql": "select * from sources * where userQuery();",
            "query": query,
            "type": "any",
            "ranking": {"profile": "bm25", "listFeatures": "true"},
        }

    query_model = QueryModel(body_function=body_function)

    expected = {
        "yql": "select * from sources * where userQuery();",
        "query": "this is a test",
        "type": "any",
        "ranking": {"profile": "bm25", "listFeatures": "true"},
        "hits": 10,
        "recall": "+(id:1 id:5)",
    }
    self.assertDictEqual(
        app.query(
            query="this is a test",
            query_model=query_model,
            debug_request=True,
            hits=10,
            recall=("id", [1, 5]),
        ).request_body,
        expected,
    )
class TwitterInserter:
    """Enriches indexed news articles with Twitter engagement counts.

    Pulls recent tweets from a fixed list of news-outlet accounts, resolves
    each tweet's first link to a canonical article URL, and updates the
    matching Vespa document with favourite/retweet counts and a tweet link.
    """

    def run(self):
        """Fetch recent tweets per account and update matching articles.

        Side effects: reads the ``twitter-secrets`` Kubernetes secret,
        queries the Twitter API, and writes updates to Vespa.
        """
        # Prefer in-cluster config; fall back to local kubeconfig when
        # running outside Kubernetes.  Narrowed from a bare `except:` so
        # KeyboardInterrupt/SystemExit are not swallowed.
        try:
            config.load_incluster_config()
        except Exception:
            config.load_kube_config()
        v1 = client.CoreV1Api()
        twitter_secrets = v1.read_namespaced_secret(
            name='twitter-secrets', namespace='default').data
        # Secret values arrive base64-encoded.
        api_key = base64.b64decode(twitter_secrets["api-key"]).decode('utf-8')
        api_secret = base64.b64decode(
            twitter_secrets["api-secret"]).decode('utf-8')
        self.vespa = Vespa(url="http://vespa-search", port=8080)
        auth = tweepy.AppAuthHandler(api_key, api_secret)
        self.api = tweepy.API(auth)
        updated = 0
        for userid in [
                'abcnews', 'GuardianAus', 'smh', 'iTnews_au', 'theage',
                'canberratimes', 'zdnetaustralia', 'newscomauHQ',
                'westaustralian', 'SBSNews', 'australian', 'crikey_news',
                '9NewsAUS', 'BBCNewsAus'
        ]:
            # Best effort per account: one failing timeline must not abort
            # the whole run.
            try:
                for status in tweepy.Cursor(self.api.user_timeline,
                                            id=userid,
                                            include_entities=True,
                                            tweet_mode="extended").items(60):
                    if len(status.entities['urls']) == 0:
                        continue
                    url = status.entities['urls'][0]['expanded_url']
                    url = self.get_url(url)
                    # Skip links that cannot be news articles.
                    if (url.startswith("https://twitter.com")
                            or url.startswith("https://www.reddit.com")):
                        continue
                    article = self.get_article(url)
                    if article:
                        self.update_document(article, status)
                        updated += 1
            except Exception as e:
                print("exception! \n{}".format(e))
                continue
        print("Completed run, updated {} tweets".format(updated))

    def get_url(self, url):
        """Resolve known shortener links and strip the query string.

        Recursively follows redirects for zd.net / trib.al / bit.ly /
        bbc.in links until a non-shortened URL is reached, then returns it
        without its query parameters.
        """
        # The dot in `zd\.net` is escaped so the pattern no longer matches
        # unrelated hosts such as `zdznet...`.
        if (re.match(r'https?://zd\.net', url)
                or url.startswith("https://trib.al")
                or url.startswith("https://bit.ly")
                or url.startswith("https://bbc.in")):
            url = urlopen(url).geturl()
            return self.get_url(url)
        return url.split('?')[0]

    def update_document(self, article, status):
        """Write the tweet's engagement counts onto the Vespa document."""
        vespa_fields = {
            'twitter_favourite_count': status.favorite_count,
            'twitter_retweet_count': status.retweet_count,
            'twitter_link': 'https://twitter.com/{}/status/{}'.format(
                status.user.screen_name, status.id),
        }
        # Document ids are the SHA-256 hex digest of the article URL.
        response = self.vespa.update_data(
            schema="newsarticle",
            data_id=hashlib.sha256(
                article['fields']['url'].encode()).hexdigest(),
            fields=vespa_fields)
        print("Updated {} with {} {}: {}".format(article['fields']['url'],
                                                 status.favorite_count,
                                                 status.retweet_count,
                                                 response))

    def get_article(self, url):
        """Return the indexed article hit for *url*, or None if absent."""
        body = {
            'yql': 'select url from sources newsarticle where userQuery();',
            'query': "url:{}".format(url),
            'hits': 1,
        }
        results = self.vespa.query(body=body)
        if len(results.hits) > 0:
            return results.hits[0]
        return None
class TwitterInserter:
    """Updates indexed news articles with Twitter engagement counts.

    Iterates recent tweets from a fixed set of news-outlet accounts and
    writes favourite/retweet counts onto the matching Vespa documents.
    """

    # NOTE(review): placeholder credentials — presumably substituted at
    # deploy time; confirm these are not meant to be read from config.
    api_key = "TWITTER_API_KEY"
    api_secret = "TWITTER_API_SECRET"

    # Maps article-URL prefixes to the outlet's Twitter handle.
    _TWITTER_USERS = (
        ("https://www.abc.net.au", "abcnews"),
        ("https://www.theguardian.com/", "GuardianAus"),
        ("https://www.smh.com.au", "smh"),
        ("https://www.itnews.com.au", "iTnews_au"),
        ("https://www.theage.com.au", "theage"),
        ("https://www.canberratimes.com.au", "canberratimes"),
        ("https://www.zdnet.com", "zdnetaustralia"),
        ("https://www.news.com.au", "newscomauHQ"),
        ("https://thewest.com.au", "westaustralian"),
    )

    def run(self):
        """Fetch recent tweets per account and update matching articles."""
        self.vespa = Vespa(url="http://vespa-search", port=8080)
        auth = tweepy.AppAuthHandler(self.api_key, self.api_secret)
        self.api = tweepy.API(auth)
        updated = 0
        for userid in [
                'abcnews', 'GuardianAus', 'smh', 'iTnews_au', 'theage',
                'canberratimes', 'zdnetaustralia', 'newscomauHQ',
                'westaustralian'
        ]:
            # Best effort per account: one failing timeline must not abort
            # the whole run.
            try:
                for status in tweepy.Cursor(self.api.user_timeline,
                                            id=userid,
                                            include_entities=True).items(60):
                    if len(status.entities['urls']) == 0:
                        continue
                    url = status.entities['urls'][0]['expanded_url']
                    url = url.split('?')[0]
                    if (url.startswith("https://twitter.com")):
                        continue
                    # Resolve known shortener links to the final article URL.
                    if (url.startswith("https://zd.net")
                            or url.startswith("https://bit.ly")):
                        url = urlopen(url).geturl()
                    article = self.get_article(url)
                    if article:
                        self.update_document(article, status)
                        updated += 1
            except Exception as e:
                logger.error(e)
        print("Completed run, updated {} tweets".format(updated))

    def update_document(self, article, status):
        """Write the tweet's engagement counts onto the Vespa document."""
        vespa_fields = {
            'twitter_favourite_count': status.favorite_count,
            'twitter_retweet_count': status.retweet_count,
        }
        # Document ids are the SHA-256 hex digest of the article URL.
        response = self.vespa.update_data(
            schema="newsarticle",
            data_id=hashlib.sha256(
                article['fields']['url'].encode()).hexdigest(),
            fields=vespa_fields)

    def get_article(self, url):
        """Return the indexed article hit for *url*, or None if absent."""
        body = {
            'yql': 'select url from sources newsarticle where userQuery();',
            'query': "url:{}".format(url),
            'hits': 1,
        }
        results = self.vespa.query(body=body)
        if len(results.hits) > 0:
            return results.hits[0]
        return None

    def get_twitter_user(self, url):
        """Return the Twitter handle for a known outlet URL, else None."""
        for prefix, handle in self._TWITTER_USERS:
            if url.startswith(prefix):
                return handle
        return None
def test_workflow(self):
    """End-to-end pyvespa workflow against the live cord19 application.

    Exercises: querying with an ANN+WeakAnd query model, collecting
    training data for labelled queries, and evaluating query models with
    several metrics.  Requires network access to api.cord19.vespa.ai.
    """
    # Connect to a running Vespa application.
    app = Vespa(url="https://api.cord19.vespa.ai")
    # Define a query model: WeakAnd union'd with an ANN search over the
    # title embedding, fed by a random 768-dim query vector (values are
    # irrelevant here — only result counts/shapes are asserted).
    match_phase = Union(
        WeakAnd(hits=10),
        ANN(
            doc_vector="title_embedding",
            query_vector="title_vector",
            hits=10,
            label="title",
        ),
    )
    rank_profile = Ranking(name="bm25", list_features=True)
    query_model = QueryModel(
        name="ANN_bm25",
        query_properties=[
            QueryRankingFeature(
                name="title_vector",
                mapping=lambda x: [random() for x in range(768)],
            )
        ],
        match_phase=match_phase,
        rank_profile=rank_profile,
    )
    # Query the Vespa app and sanity-check the result set.
    query_result = app.query(
        query="Is remdesivir an effective treatment for COVID-19?",
        query_model=query_model,
    )
    self.assertTrue(query_result.number_documents_retrieved > 0)
    self.assertEqual(len(query_result.hits), 10)
    # Labelled data: two queries with two relevant docs each.
    labeled_data = [
        {
            "query_id": 0,
            "query": "Intrauterine virus infections and congenital heart disease",
            "relevant_docs": [{
                "id": 0,
                "score": 1
            }, {
                "id": 3,
                "score": 1
            }],
        },
        {
            "query_id": 1,
            "query": "Clinical and immunologic studies in identical twins discordant for systemic lupus erythematosus",
            "relevant_docs": [{
                "id": 1,
                "score": 1
            }, {
                "id": 5,
                "score": 1
            }],
        },
    ]
    # Equivalent labelled data in DataFrame format.
    labeled_data_df = DataFrame(
        data={
            "qid": [0, 0, 1, 1],
            "query":
            ["Intrauterine virus infections and congenital heart disease"] *
            2 + [
                "Clinical and immunologic studies in identical twins discordant for systemic lupus erythematosus"
            ] * 2,
            "doc_id": [0, 3, 1, 5],
            "relevance": [1, 1, 1, 1],
        })
    # Collect training data: each labelled doc plus 2 additional docs,
    # with rank features attached.
    training_data_batch = app.collect_training_data(
        labeled_data=labeled_data,
        id_field="id",
        query_model=query_model,
        number_additional_docs=2,
        fields=["rankfeatures"],
    )
    self.assertTrue(training_data_batch.shape[0] > 0)
    self.assertEqual(
        len({"document_id", "query_id",
             "label"}.intersection(set(training_data_batch.columns))),
        3,
    )
    # Evaluate a query model with several metrics.
    eval_metrics = [MatchRatio(), Recall(at=10), ReciprocalRank(at=10)]
    evaluation = app.evaluate(
        labeled_data=labeled_data,
        eval_metrics=eval_metrics,
        query_model=query_model,
        id_field="id",
    )
    self.assertEqual(evaluation.shape, (9, 1))
    # AssertionError - two models with the same (default) name must be
    # rejected.
    with self.assertRaises(AssertionError):
        _ = app.evaluate(
            labeled_data=labeled_data,
            eval_metrics=eval_metrics,
            query_model=[QueryModel(), QueryModel(), query_model],
            id_field="id",
        )
    # Evaluating two distinct models yields one column per model.
    evaluation = app.evaluate(
        labeled_data=labeled_data,
        eval_metrics=eval_metrics,
        query_model=[QueryModel(), query_model],
        id_field="id",
    )
    self.assertEqual(evaluation.shape, (9, 2))
    # Detailed metrics add rows per metric component.
    evaluation = app.evaluate(
        labeled_data=labeled_data_df,
        eval_metrics=eval_metrics,
        query_model=query_model,
        id_field="id",
        detailed_metrics=True,
    )
    self.assertEqual(evaluation.shape, (15, 1))
    # Per-query evaluation: one row per labelled query.
    evaluation = app.evaluate(
        labeled_data=labeled_data_df,
        eval_metrics=eval_metrics,
        query_model=query_model,
        id_field="id",
        detailed_metrics=True,
        per_query=True,
    )
    self.assertEqual(evaluation.shape, (2, 7))