Esempio n. 1
0
    def test_query(self):
        """Check the request bodies produced by ``Vespa.query`` in debug mode.

        Three requests are issued with ``debug_request=True`` and the
        ``request_body`` of each response is compared with the expected dict:

        1. a raw ``body`` is expected back unchanged;
        2. a ``Query`` built from ``OR()`` and a default ``RankProfile()``
           plus ``hits=10``;
        3. the same request with an additional ``recall`` tuple.
        """
        vespa_app = Vespa(url="http://localhost", port=8080)

        # 1. A hand-written body is compared with itself after the round trip.
        raw_body = {"yql": "select * from sources * where test"}
        raw_response = vespa_app.query(body=raw_body, debug_request=True)
        self.assertDictEqual(raw_response.request_body, raw_body)

        # 2. Query model + hits.
        expected_without_recall = {
            "yql":
            'select * from sources * where ([{"grammar": "any"}]userInput("this is a test"));',
            "ranking": {
                "profile": "default",
                "listFeatures": "false"
            },
            "hits": 10,
        }
        model_response = vespa_app.query(
            query="this is a test",
            query_model=Query(match_phase=OR(), rank_profile=RankProfile()),
            debug_request=True,
            hits=10,
        )
        self.assertDictEqual(model_response.request_body,
                             expected_without_recall)

        # 3. Same request, now restricted through ``recall``.
        expected_with_recall = {
            "yql":
            'select * from sources * where ([{"grammar": "any"}]userInput("this is a test"));',
            "ranking": {
                "profile": "default",
                "listFeatures": "false"
            },
            "hits": 10,
            "recall": "+(id:1 id:5)",
        }
        recall_response = vespa_app.query(
            query="this is a test",
            query_model=Query(match_phase=OR(), rank_profile=RankProfile()),
            debug_request=True,
            hits=10,
            recall=("id", [1, 5]),
        )
        self.assertDictEqual(recall_response.request_body,
                             expected_with_recall)
Esempio n. 2
0
    def test_query_with_body_function(self):
        """Check the debug request body when the model wraps a body function.

        A ``QueryModel`` is created from ``body_function`` and queried with
        ``hits`` and ``recall``; the resulting ``request_body`` is expected
        to hold the function's dict together with those two extra entries.
        """
        vespa_app = Vespa(url="http://localhost", port=8080)

        def body_function(query):
            # Builds the whole request body from the query string alone.
            return {
                "yql": "select * from sources * where userQuery();",
                "query": query,
                "type": "any",
                "ranking": {
                    "profile": "bm25",
                    "listFeatures": "true"
                },
            }

        model = QueryModel(body_function=body_function)

        expected_body = {
            "yql": "select * from sources * where userQuery();",
            "query": "this is a test",
            "type": "any",
            "ranking": {
                "profile": "bm25",
                "listFeatures": "true"
            },
            "hits": 10,
            "recall": "+(id:1 id:5)",
        }
        response = vespa_app.query(
            query="this is a test",
            query_model=model,
            debug_request=True,
            hits=10,
            recall=("id", [1, 5]),
        )
        self.assertDictEqual(response.request_body, expected_body)
Esempio n. 3
0
class TwitterInserter:
    """Copy Twitter engagement counts onto news articles stored in Vespa.

    ``run`` walks the recent timeline of each account in ``NEWS_ACCOUNTS``,
    takes the first URL attached to every tweet, looks that URL up in the
    ``newsarticle`` source and, when a hit is found, updates the document
    with the tweet's favourite count, retweet count and permalink.
    """

    # Accounts whose timelines are scanned on every run.
    NEWS_ACCOUNTS = (
        'abcnews', 'GuardianAus', 'smh', 'iTnews_au', 'theage',
        'canberratimes', 'zdnetaustralia', 'newscomauHQ',
        'westaustralian', 'SBSNews', 'australian', 'crikey_news',
        '9NewsAUS', 'BBCNewsAus',
    )
    # Links pointing at these sites are never looked up.
    SKIPPED_PREFIXES = ("https://twitter.com", "https://www.reddit.com")
    # Link shorteners that have to be resolved to reach the article URL.
    SHORTENER_PREFIXES = ("https://trib.al", "https://bit.ly",
                          "https://bbc.in")
    # zd.net is served over both http and https; the dot is escaped so that
    # hosts such as "zdXnet" are not mistaken for the shortener.
    SHORTENER_PATTERN = r'https?://zd\.net'

    def run(self):
        """Scan every configured timeline and update matching articles.

        Sets ``self.vespa`` and ``self.api`` as a side effect. A failure
        while processing one account is printed and the remaining accounts
        are still processed.
        """
        api_key, api_secret = self._load_twitter_credentials()
        self.vespa = Vespa(url="http://vespa-search", port=8080)
        auth = tweepy.AppAuthHandler(api_key, api_secret)
        self.api = tweepy.API(auth)
        updated = 0
        for userid in self.NEWS_ACCOUNTS:
            try:
                timeline = tweepy.Cursor(self.api.user_timeline,
                                         id=userid,
                                         include_entities=True,
                                         tweet_mode="extended").items(60)
                for status in timeline:
                    if self._update_from_status(status):
                        updated += 1
            except Exception as e:
                # Best effort: report the problem and move to the next user.
                print("exception! {}".format(e))
        print("Completed run, updated {} tweets".format(updated))

    def _load_twitter_credentials(self):
        """Return ``(api_key, api_secret)`` read from the Kubernetes secret."""
        try:
            config.load_incluster_config()
        except Exception:
            # In-cluster configuration failed; fall back to the local kube
            # config. ``Exception`` (rather than a bare ``except``) lets
            # KeyboardInterrupt and SystemExit propagate.
            config.load_kube_config()
        v1 = client.CoreV1Api()
        twitter_secrets = v1.read_namespaced_secret(name='twitter-secrets',
                                                    namespace='default').data
        api_key = base64.b64decode(twitter_secrets["api-key"]).decode('utf-8')
        api_secret = base64.b64decode(
            twitter_secrets["api-secret"]).decode('utf-8')
        return api_key, api_secret

    def _update_from_status(self, status):
        """Update the article linked from ``status``; return True if updated."""
        urls = status.entities['urls']
        if not urls:
            return False
        # Only the first attached link is considered.
        url = self.get_url(urls[0]['expanded_url'])
        if url.startswith(self.SKIPPED_PREFIXES):
            return False
        article = self.get_article(url)
        if not article:
            # NOTE: articles that are not indexed yet are simply skipped.
            return False
        self.update_document(article, status)
        return True

    def get_url(self, url, max_redirects=10):
        """Resolve known link shorteners and drop the query string.

        Args:
            url: The URL taken from a tweet.
            max_redirects: Upper bound on the number of shortener hops that
                are followed, so that shorteners redirecting to each other
                cannot loop forever.

        Returns:
            The resolved URL without its query string. If the hop limit is
            reached, the last URL obtained is returned (query stripped).

        Raises:
            Whatever ``urlopen`` raises when a shortener cannot be fetched.
        """
        for _ in range(max_redirects):
            is_short_link = (re.match(self.SHORTENER_PATTERN, url)
                             or url.startswith(self.SHORTENER_PREFIXES))
            if not is_short_link:
                break
            # The context manager closes the HTTP response; the timeout
            # keeps one unresponsive shortener from stalling the whole run.
            with urlopen(url, timeout=10) as response:
                url = response.geturl()
        return url.split('?')[0]

    def update_document(self, article, status):
        """Write the tweet's counts and permalink to the article document.

        The document id is the SHA-256 hex digest of the article URL.
        """
        article_url = article['fields']['url']
        vespa_fields = {
            'twitter_favourite_count': status.favorite_count,
            'twitter_retweet_count': status.retweet_count,
            'twitter_link': 'https://twitter.com/{}/status/{}'.format(
                status.user.screen_name, status.id),
        }
        response = self.vespa.update_data(
            schema="newsarticle",
            data_id=hashlib.sha256(article_url.encode()).hexdigest(),
            fields=vespa_fields)
        print("Updated {} with {} {}: {}".format(article_url,
                                                 status.favorite_count,
                                                 status.retweet_count,
                                                 response))

    def get_article(self, url):
        """Return the first ``newsarticle`` hit for ``url``, or None."""
        body = {
            'yql': 'select url from sources newsarticle where userQuery();',
            'query': "url:{}".format(url),
            'hits': 1,
        }
        results = self.vespa.query(body=body)
        if len(results.hits) > 0:
            return results.hits[0]
        return None
Esempio n. 4
0
class TwitterInserter:
    """Copy Twitter engagement counts onto news articles stored in Vespa.

    ``run`` walks the recent timeline of each account in ``NEWS_ACCOUNTS``,
    takes the first URL attached to every tweet, looks it up in the
    ``newsarticle`` source and, when a hit is found, updates the document
    with the tweet's favourite and retweet counts.
    """

    # NOTE(review): these look like placeholders that are substituted before
    # deployment -- confirm.
    api_key = "TWITTER_API_KEY"
    api_secret = "TWITTER_API_SECRET"

    # Accounts whose timelines are scanned on every run.
    NEWS_ACCOUNTS = (
        'abcnews', 'GuardianAus', 'smh', 'iTnews_au', 'theage',
        'canberratimes', 'zdnetaustralia', 'newscomauHQ',
        'westaustralian',
    )
    # Link shorteners that have to be resolved to reach the article URL.
    SHORTENER_PREFIXES = ("https://zd.net", "https://bit.ly")
    # (site prefix, twitter account) pairs, checked in this order.
    SITE_ACCOUNTS = (
        ("https://www.abc.net.au", "abcnews"),
        ("https://www.theguardian.com/", "GuardianAus"),
        ("https://www.smh.com.au", "smh"),
        ("https://www.itnews.com.au", "iTnews_au"),
        ("https://www.theage.com.au", "theage"),
        ("https://www.canberratimes.com.au", "canberratimes"),
        ("https://www.zdnet.com", "zdnetaustralia"),
        ("https://www.news.com.au", "newscomauHQ"),
        ("https://thewest.com.au", "westaustralian"),
    )

    def run(self):
        """Scan every configured timeline and update matching articles.

        Sets ``self.vespa`` and ``self.api`` as a side effect. A failure
        while processing one account is logged and the remaining accounts
        are still processed.
        """
        self.vespa = Vespa(url="http://vespa-search", port=8080)
        auth = tweepy.AppAuthHandler(self.api_key, self.api_secret)
        self.api = tweepy.API(auth)
        updated = 0
        for userid in self.NEWS_ACCOUNTS:
            try:
                timeline = tweepy.Cursor(self.api.user_timeline,
                                         id=userid,
                                         include_entities=True).items(60)
                for status in timeline:
                    urls = status.entities['urls']
                    if not urls:
                        continue
                    # Only the first attached link is considered.
                    url = self._canonical_url(urls[0]['expanded_url'])
                    if url.startswith("https://twitter.com"):
                        continue
                    article = self.get_article(url)
                    if article:
                        self.update_document(article, status)
                        updated += 1
            except Exception as e:
                logger.error(e)
        print("Completed run, updated {} tweets".format(updated))

    def _canonical_url(self, url):
        """Expand a known short link, then drop the query string.

        The query string is removed after expansion: the address a
        shortener resolves to may carry its own query parameters, and
        ``get_article`` is given the URL without them.
        """
        if url.startswith(self.SHORTENER_PREFIXES):
            # The context manager closes the HTTP response once the final
            # address has been read.
            with urlopen(url) as response:
                url = response.geturl()
        return url.split('?')[0]

    def update_document(self, article, status):
        """Write the tweet's favourite and retweet counts to the document.

        The document id is the SHA-256 hex digest of the article URL.
        """
        vespa_fields = {
            'twitter_favourite_count': status.favorite_count,
            'twitter_retweet_count': status.retweet_count,
        }
        self.vespa.update_data(
            schema="newsarticle",
            data_id=hashlib.sha256(
                article['fields']['url'].encode()).hexdigest(),
            fields=vespa_fields)

    def get_article(self, url):
        """Return the first ``newsarticle`` hit for ``url``, or None."""
        body = {
            'yql': 'select url from sources newsarticle where userQuery();',
            'query': "url:{}".format(url),
            'hits': 1,
        }
        results = self.vespa.query(body=body)
        if len(results.hits) > 0:
            return results.hits[0]
        return None

    def get_twitter_user(self, url):
        """Return the Twitter account for the site ``url`` belongs to.

        Returns None when ``url`` starts with none of the known prefixes.
        """
        for prefix, account in self.SITE_ACCOUNTS:
            if url.startswith(prefix):
                return account
        return None
Esempio n. 5
0
    def test_workflow(self):
        """Exercise query, training-data collection and evaluation in turn.

        The test talks to the application at ``https://api.cord19.vespa.ai``
        and checks only result sizes and shapes, not their contents.
        """
        first_query = (
            "Intrauterine virus infections and congenital heart disease")
        second_query = (
            "Clinical and immunologic studies in identical twins discordant for systemic lupus erythematosus"
        )

        # Connect to a running Vespa application.
        app = Vespa(url="https://api.cord19.vespa.ai")

        # Define a query model: weakAnd combined with an ANN operator on the
        # title embedding, ranked with the "bm25" profile.
        ann_operator = ANN(
            doc_vector="title_embedding",
            query_vector="title_vector",
            hits=10,
            label="title",
        )
        title_vector_feature = QueryRankingFeature(
            name="title_vector",
            # The query text is ignored; a random 768-float vector is sent.
            mapping=lambda x: [random() for _ in range(768)],
        )
        query_model = QueryModel(
            name="ANN_bm25",
            query_properties=[title_vector_feature],
            match_phase=Union(WeakAnd(hits=10), ann_operator),
            rank_profile=Ranking(name="bm25", list_features=True),
        )

        # Query the Vespa app.
        query_result = app.query(
            query="Is remdesivir an effective treatment for COVID-19?",
            query_model=query_model,
        )
        self.assertTrue(query_result.number_documents_retrieved > 0)
        self.assertEqual(len(query_result.hits), 10)

        # Define labelled data, once as a list of dicts ...
        labeled_data = [
            {
                "query_id": 0,
                "query": first_query,
                "relevant_docs": [
                    {"id": 0, "score": 1},
                    {"id": 3, "score": 1},
                ],
            },
            {
                "query_id": 1,
                "query": second_query,
                "relevant_docs": [
                    {"id": 1, "score": 1},
                    {"id": 5, "score": 1},
                ],
            },
        ]
        # ... and once as the equivalent data frame.
        labeled_data_df = DataFrame(
            data={
                "qid": [0, 0, 1, 1],
                "query":
                [first_query, first_query, second_query, second_query],
                "doc_id": [0, 3, 1, 5],
                "relevance": [1, 1, 1, 1],
            })

        # Collect training data.
        training_data_batch = app.collect_training_data(
            labeled_data=labeled_data,
            id_field="id",
            query_model=query_model,
            number_additional_docs=2,
            fields=["rankfeatures"],
        )
        self.assertTrue(training_data_batch.shape[0] > 0)
        required_columns = {"document_id", "query_id", "label"}
        self.assertEqual(
            len(required_columns & set(training_data_batch.columns)),
            3,
        )

        # Evaluate a query model.
        eval_metrics = [MatchRatio(), Recall(at=10), ReciprocalRank(at=10)]
        shared_kwargs = {"eval_metrics": eval_metrics, "id_field": "id"}

        evaluation = app.evaluate(
            labeled_data=labeled_data,
            query_model=query_model,
            **shared_kwargs,
        )
        self.assertEqual(evaluation.shape, (9, 1))

        # AssertionError - two models with the same name.
        with self.assertRaises(AssertionError):
            app.evaluate(
                labeled_data=labeled_data,
                query_model=[QueryModel(), QueryModel(), query_model],
                **shared_kwargs,
            )

        # Two differently named models.
        evaluation = app.evaluate(
            labeled_data=labeled_data,
            query_model=[QueryModel(), query_model],
            **shared_kwargs,
        )
        self.assertEqual(evaluation.shape, (9, 2))

        # Data-frame input with detailed metrics.
        evaluation = app.evaluate(
            labeled_data=labeled_data_df,
            query_model=query_model,
            detailed_metrics=True,
            **shared_kwargs,
        )
        self.assertEqual(evaluation.shape, (15, 1))

        # Data-frame input with detailed metrics, reported per query.
        evaluation = app.evaluate(
            labeled_data=labeled_data_df,
            query_model=query_model,
            detailed_metrics=True,
            per_query=True,
            **shared_kwargs,
        )
        self.assertEqual(evaluation.shape, (2, 7))