Ejemplo n.º 1
0
    def testDownloadingOfArticleUris(self):
        """
        download the uri list of the matching articles page by page - first in
        a random order of pages and then, after clearing the cache, in the
        normal order. in both passes every page has to report the same total
        number of results and all pages together have to contain exactly
        that many distinct uris
        """
        pageSize = 10000
        # resolve the concept once and reuse the uri in every query
        conceptUri = self.er.getConceptUri("Trump")
        artIter = QueryArticlesIter(conceptUri=conceptUri,
                                    dateStart="2017-02-04",
                                    dateEnd="2017-02-06")
        # number of matches
        count = artIter.count(self.er)
        self.assertGreater(count, 0)
        print("\nFound %d articles by uris\nDownloading page:" % count, end="")

        # divide as floats: where "/" is integer division (Python 2) the
        # quotient would be floored before math.ceil sees it and the last,
        # partially filled page would never be requested
        pageCount = int(math.ceil(count / float(pageSize)))

        def downloadAndCheck(pages):
            # request the given pages in the given order and check the totals
            uriSet = set()
            totArts = 0
            for page in pages:
                print("%d" % page, end=", ")
                q = QueryArticles(conceptUri=conceptUri,
                                  dateStart="2017-02-04",
                                  dateEnd="2017-02-06")
                q.setRequestedResult(
                    RequestArticlesUriWgtList(page=page, count=pageSize))
                res = self.er.execQuery(q)
                uriWgtInfo = res.get("uriWgtList", {})
                # every page has to report the same total as the iterator
                self.assertEqual(uriWgtInfo.get("totalResults", -1), count)
                arts = uriWgtInfo.get("results", [])
                self.assertLessEqual(len(arts), pageSize)
                # keep only the part before the first ":" of each item
                # (presumably the uri of a "uri:wgt" pair)
                uriSet.update(art.split(":")[0] for art in arts)
                totArts += len(arts)
            # no uri may be missing or returned on more than one page
            self.assertEqual(len(uriSet), count)
            self.assertEqual(totArts, count)

        # try with a randomized order of pages
        shuffledPages = list(range(1, pageCount + 1))
        random.shuffle(shuffledPages)
        downloadAndCheck(shuffledPages)

        # clear the cache before repeating the download
        erAdmin = EventRegistryAdmin(self.er._host)
        erAdmin.clearCache()

        # try again with the normal order of pages
        downloadAndCheck(range(1, pageCount + 1))
Ejemplo n.º 2
0
    def testDownloadingOfArticlePages(self):
        """
        download article pages in random order of pages and in the normal order

        in both passes every page has to report the same total number of
        results and all pages together have to contain exactly that many
        distinct article uris. the cache is cleared between the two passes
        """
        pageSize = 100
        artIter = QueryArticlesIter(sourceUri="bbc.co.uk",
                                    dateStart="2017-02-04",
                                    dateEnd="2017-02-06")
        # number of matches
        count = artIter.count(self.er)
        self.assertGreater(count, 0)
        print("\nFound %d articles" % count)

        # divide as floats: where "/" is integer division (Python 2) the
        # quotient would be floored before math.ceil sees it and the last,
        # partially filled page would never be requested
        pageCount = int(math.ceil(count / float(pageSize)))

        def downloadAndCheck(pages):
            # request the given pages in the given order and check the totals
            uriSet = set()
            totArts = 0
            for page in pages:
                q = QueryArticles(sourceUri="bbc.co.uk",
                                  dateStart="2017-02-04",
                                  dateEnd="2017-02-06")
                q.setRequestedResult(
                    RequestArticlesInfo(page=page, count=pageSize))
                res = self.er.execQuery(q)
                artInfo = res.get("articles", {})
                # every page has to report the same total as the iterator
                self.assertEqual(artInfo.get("totalResults", -1), count)
                arts = artInfo.get("results", [])
                self.assertLessEqual(len(arts), pageSize)
                uriSet.update(art["uri"] for art in arts)
                totArts += len(arts)
            # no article may be missing or returned on more than one page
            self.assertEqual(len(uriSet), count)
            self.assertEqual(totArts, count)

        # try with a randomized order of pages
        shuffledPages = list(range(1, pageCount + 1))
        random.shuffle(shuffledPages)
        downloadAndCheck(shuffledPages)

        # clear the cache before repeating the download
        erAdmin = EventRegistryAdmin(self.er._host)
        erAdmin.clearCache()

        # try again with the normal order of pages
        downloadAndCheck(range(1, pageCount + 1))
Ejemplo n.º 3
0
    def testAllPagesArt1(self):
        """
        download all pages of results through articles directly and using uriWgtList - in both cases should be the same

        pages are requested one after another, starting with page 1, until a
        page without results is returned. the cache is cleared between the
        two downloads
        """
        # first pass: collect the uris from the full article info
        q = QueryArticles(sourceUri="bbc.co.uk",
                          dateStart="2017-02-04",
                          dateEnd="2017-02-06")
        uriList = []
        page = 1
        while True:
            q.setRequestedResult(RequestArticlesInfo(page=page, count=100))
            res = self.er.execQuery(q)
            arr = res.get("articles", {}).get("results", [])
            if not arr:
                # an empty page marks the end of the results
                break
            uriList.extend(art["uri"] for art in arr)
            page += 1

        erAdmin = EventRegistryAdmin(self.er._host)
        erAdmin.clearCache()

        # second pass: collect the uris from the uriWgtList result
        q = QueryArticles(sourceUri="bbc.co.uk",
                          dateStart="2017-02-04",
                          dateEnd="2017-02-06")
        uriList2 = []
        page = 1
        while True:
            q.setRequestedResult(
                RequestArticlesUriWgtList(page=page, count=100))
            res = self.er.execQuery(q)
            arr = res.get("uriWgtList", {}).get("results", [])
            if not arr:
                break
            uriList2.extend(self.er.getUriFromUriWgt(arr))
            page += 1

        self.assertGreater(len(uriList), 0)
        self.assertEqual(len(uriList), len(uriList2))
        # the order of the results does not matter, only the set of uris;
        # comparing the sorted lists reports the differing items on failure
        self.assertEqual(sorted(uriList), sorted(uriList2))
Ejemplo n.º 4
0
    def testPagingUri1(self):
        """
        test pages 1 and 2, download uriwgtlist and then test in reverse

        the uris obtained from pages 1 + 2 have to be the same as the uris
        obtained, after clearing the cache, from pages 2 + 1
        """
        def fetchUris(query, page):
            # request one page of the uriWgtList and return the uris in it
            query.setRequestedResult(
                RequestArticlesUriWgtList(page=page, count=1000))
            res = self.er.execQuery(query)
            arr = res.get("uriWgtList", {}).get("results", [])
            return self.er.getUriFromUriWgt(arr)

        # first page 1, then page 2 using the same query object
        q = QueryArticles(sourceUri="bbc.co.uk",
                          dateStart="2017-02-04",
                          dateEnd="2017-02-06")
        uriList = fetchUris(q, 1)
        self.assertGreater(len(uriList), 0)
        uriList.extend(fetchUris(q, 2))

        erAdmin = EventRegistryAdmin(self.er._host)
        erAdmin.clearCache()

        # with a fresh query object ask for the pages in the reverse order
        q = QueryArticles(sourceUri="bbc.co.uk",
                          dateStart="2017-02-04",
                          dateEnd="2017-02-06")
        uriList2 = fetchUris(q, 2)
        uriList2.extend(fetchUris(q, 1))

        self.assertEqual(len(uriList), len(uriList2))
        # the pages were collected in a different order, so compare the
        # sorted lists; assertEqual reports the differing items on failure
        self.assertEqual(sorted(uriList), sorted(uriList2))
from __future__ import print_function
import unittest, math, random
from eventregistry import *
from DataValidator import DataValidator
from eventregistryadmin import EventRegistryAdmin

# Module-level admin client used by the tests below (e.g. erAdmin.clearCache()).
# NOTE: this runs at import time - it instantiates a DataValidator just to read
# the host of its `er` client and passes that host to EventRegistryAdmin.
erAdmin = EventRegistryAdmin(host=DataValidator().er._host)


class TestQueryPaging(DataValidator):
    def testPagingUri1(self):
        """
        test pages 1 and 2, download uriwgtlist and then test in reverse
        """
        q = QueryArticles(sourceUri="bbc.co.uk",
                          dateStart="2018-04-22",
                          dateEnd="2018-04-25")
        q.setRequestedResult(RequestArticlesUriWgtList(page=1, count=1000))
        res = self.er.execQuery(q)
        arr = res.get("uriWgtList", {}).get("results", [])
        uriList = self.er.getUriFromUriWgt(arr)

        q.setRequestedResult(RequestArticlesUriWgtList(page=2, count=1000))
        res = self.er.execQuery(q)
        arr = res.get("uriWgtList", {}).get("results", [])
        uriList.extend(self.er.getUriFromUriWgt(arr))

        erAdmin.clearCache()

        q = QueryArticles(sourceUri="bbc.co.uk",
                          dateStart="2018-04-22",