Example #1
    def test_replacement(self):
        s = [0] * 50 + [1] * 50
        c1 = np.array(s).reshape((100, 1))
        s = [0] * 5 + [1] * 5 + [2] * 90
        c2 = np.array(s).reshape((100, 1))
        x = np.hstack([c1, c2])
        domain = data.Domain(
            [
                data.ContinuousVariable("a"),
                data.DiscreteVariable("b", values="ABC")
            ],
            data.ContinuousVariable("c"),
        )
        table = Table(domain, x, c1)
        for col, computed_value in ((0, 0.5), (1, 2)):
            var1 = preprocess.Average()(table, col)
            self.assertIsInstance(var1.compute_value,
                                  preprocess.ReplaceUnknowns)
            self.assertEqual(var1.compute_value.value, computed_value)
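The same average-based replacement can be applied to a whole table through the Impute preprocessor. A minimal sketch (not from the test file above; it only uses the Orange 3 calls already shown in these examples):

import numpy as np
from Orange.data import Domain, ContinuousVariable, Table
from Orange.preprocess import Impute, impute

# two continuous columns, each with one missing value
domain = Domain([ContinuousVariable("a"), ContinuousVariable("b")])
X = np.array([[1.0, np.nan],
              [3.0, 2.0],
              [np.nan, 4.0]])
table = Table.from_numpy(domain, X)

# replace NaNs with the per-column means (2.0 for "a", 3.0 for "b")
imputed = Impute(method=impute.Average())(table)
print(imputed.X)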
Example #2
    def test_replacement(self):
        nan = np.nan
        X = [
            [1.0, nan, 0.0],
            [2.0, 1.0, 3.0],
            [nan, nan, nan]
        ]
        domain = data.Domain(
            (data.DiscreteVariable("A", values=["0", "1", "2"]),
             data.ContinuousVariable("B"),
             data.ContinuousVariable("C"))
        )
        table = data.Table.from_numpy(domain, np.array(X))

        v1 = impute.AsValue()(table, domain[0])
        self.assertTrue(np.all(np.isfinite(v1.compute_value(table))))
        self.assertTrue(np.all(v1.compute_value(table) == [1., 2., 3.]))
        self.assertEqual([v1.str_val(v) for v in v1.compute_value(table)],
                         ["1", "2", "N/A"])

        v1, v2 = impute.AsValue()(table, domain[1])
        self.assertTrue(np.all(np.isfinite(v1.compute_value(table))))
        self.assertTrue(np.all(np.isfinite(v2.compute_value(table))))
        self.assertTrue(np.all(v2.compute_value(table) == [0., 1., 0.]))
        self.assertEqual([v2.str_val(v) for v in v2.compute_value(table)],
                         ["undef", "def", "undef"])

        vars = reduce(lambda acc, v:
                          acc + (list(v) if isinstance(v, (tuple, list))
                                 else [v]),
                      [impute.AsValue()(table, var) for var in table.domain],
                      [])
        domain = data.Domain(vars)
        idata = table.from_table(domain, table)

        np.testing.assert_allclose(
            idata.X,
            [[1, 1.0, 0, 0.0, 1],
             [2, 1.0, 1, 3.0, 1],
             [3, 1.0, 0, 1.5, 0]]
        )
Example #3
    def test_replacement(self):
        nan = np.nan
        X = [[1.0, nan, 0.0], [2.0, 1.0, 3.0], [nan, nan, nan]]
        unknowns = np.isnan(X)

        domain = data.Domain(
            (data.DiscreteVariable("A", values=("0", "1", "2")),
             data.ContinuousVariable("B"), data.ContinuousVariable("C")))
        table = data.Table.from_numpy(domain, np.array(X))

        for i in range(0, 3):
            v = impute.Random()(table, domain[i])
            self.assertTrue(np.all(np.isfinite(v.compute_value(table))))

        imputer = preprocess.Impute(method=impute.Random())
        itable = imputer(table)
        self.assertTrue(np.all(np.isfinite(itable.X)))

        # Original data should keep unknowns
        self.assertTrue(np.all(unknowns == np.isnan(table.X)))
        self.assertTrue(np.all(itable.X[~unknowns] == table.X[~unknowns]))
Example #4
    def test_find_compatible_unordered(self):
        gend = data.DiscreteVariable("gend", values=["F", "M"])

        find_comp = data.DiscreteVariable.find_compatible
        self.assertIs(find_comp("gend"), gend)
        self.assertIs(find_comp("gend", values=["F"]), gend)
        self.assertIs(find_comp("gend", values=["F", "M"]), gend)
        self.assertIs(find_comp("gend", values=["M", "F"]), gend)

        # Incompatible since it is ordered
        self.assertIsNone(find_comp("gend", values=["M", "F"], ordered=True))
        self.assertIsNone(find_comp("gend", values=["F", "M"], ordered=True))
        self.assertIsNone(find_comp("gend", values=["F"], ordered=True))
        self.assertIsNone(find_comp("gend", values=["M"], ordered=True))
        self.assertIsNone(find_comp("gend", values=["N"], ordered=True))

        # Incompatible due to empty intersection
        self.assertIsNone(find_comp("gend", values=["N"]))

        # Compatible, adds values
        self.assertIs(find_comp("gend", values=["F", "N", "R"]), gend)
        self.assertEqual(gend.values, ["F", "M", "N", "R"])
Example #5
    def test_replacement(self):
        from Orange.classification import MajorityLearner, SimpleTreeLearner
        from Orange.regression import MeanLearner

        nan = np.nan
        X = [[1.0, nan, 0.0], [2.0, 1.0, 3.0], [nan, nan, nan]]
        unknowns = np.isnan(X)

        domain = data.Domain(
            (data.DiscreteVariable("A", values=["0", "1", "2"]),
             data.ContinuousVariable("B"), data.ContinuousVariable("C")))
        table = data.Table.from_numpy(domain, np.array(X))

        v = impute.Model(MajorityLearner())(table, domain[0])
        self.assertTrue(np.all(np.isfinite(v.compute_value(table))))
        self.assertTrue(
            np.all(v.compute_value(table) == [1., 2., 1.])
            or np.all(v.compute_value(table) == [1., 2., 2.]))
        v = impute.Model(MeanLearner())(table, domain[1])
        self.assertTrue(np.all(np.isfinite(v.compute_value(table))))
        self.assertTrue(np.all(v.compute_value(table) == [1., 1., 1.]))

        imputer = preprocess.Impute(impute.Model(SimpleTreeLearner()))
        itable = imputer(table)

        # Original data should keep unknowns
        self.assertTrue(np.all(np.isnan(table.X) == unknowns))
        self.assertTrue(np.all(itable.X[~unknowns] == table.X[~unknowns]))

        Aimp = itable.domain["A"].compute_value
        self.assertIsInstance(Aimp, impute.ReplaceUnknownsModel)

        col = Aimp(table)
        self.assertEqual(col.shape, (len(table), ))
        self.assertTrue(np.all(np.isfinite(col)))

        v = Aimp(table[-1])
        self.assertEqual(v.shape, (1, ))
        self.assertTrue(np.all(np.isfinite(v)))
Example #6
    def test_sparse_get_distributions(self):
        def assert_dist_and_unknowns(computed, goal_dist):
            nonlocal d
            goal_dist = np.array(goal_dist)
            sum_dist = np.sum(goal_dist[1, :] if goal_dist.ndim ==
                              2 else goal_dist)
            n_all = np.sum(d.W) if d.has_weights() else len(d)

            assert_dist_almost_equal(computed, goal_dist)
            self.assertEqual(computed.unknowns, n_all - sum_dist)

        domain = data.Domain([
            data.DiscreteVariable("d%i" % i, values=tuple("abc"))
            for i in range(10)
        ] + [data.ContinuousVariable("c%i" % i) for i in range(10)])

        # pylint: disable=bad-whitespace
        X = sp.csr_matrix(
            # 0  1  2  3       4       5       6  7  8  9 10 11 12   13 14 15 16      17 18 19
            # --------------------------------------------------------------------------------
            [[0, 2, 0, 2, 1, 1, 2, 0, 0, 1, 0, 0, 0, 1, 1, 0, 2, np.nan, 2, 0],
             [0, 0, 1, 1, np.nan, np.nan, 1, 0, 2, 0, 0, 0, 0, 0, 2, 0, 1, np.nan, 0, 0],
             [0, 0, 0, 1, 0, 2, np.nan, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
             [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
             [0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1.1, 0, 0, 0, 0, 0, 0]])
        warnings.filterwarnings("ignore", ".*", sp.SparseEfficiencyWarning)
        X[0, 0] = 0

        d = data.Table.from_numpy(domain, X)
        ddist = distribution.get_distributions(d)

        self.assertEqual(len(ddist), 20)
        zeros = [5, 0, 0]
        assert_dist_and_unknowns(ddist[0], zeros)
        assert_dist_and_unknowns(ddist[1], [4, 0, 1])
        assert_dist_and_unknowns(ddist[2], [3, 1, 1])
        assert_dist_and_unknowns(ddist[3], [2, 2, 1])
        assert_dist_and_unknowns(ddist[4], [3, 1, 0])
        assert_dist_and_unknowns(ddist[5], [2, 1, 1])
        assert_dist_and_unknowns(ddist[6], [1, 2, 1])
        assert_dist_and_unknowns(ddist[7], zeros)
        assert_dist_and_unknowns(ddist[8], [4, 0, 1])
        assert_dist_and_unknowns(ddist[9], [4, 1, 0])

        zeros = [[0], [5]]
        assert_dist_and_unknowns(ddist[10], zeros)
        assert_dist_and_unknowns(ddist[11], zeros)
        assert_dist_and_unknowns(ddist[12], zeros)
        assert_dist_and_unknowns(ddist[13], [[0, 1, 1.1], [3, 1, 1]])
        assert_dist_and_unknowns(ddist[14], [[0, 1, 2], [3, 1, 1]])
        assert_dist_and_unknowns(ddist[15], zeros)
        assert_dist_and_unknowns(ddist[16], [[0, 1, 2], [3, 1, 1]])
        assert_dist_and_unknowns(ddist[17], [[0], [3]])
        assert_dist_and_unknowns(ddist[18], [[0, 2], [4, 1]])
        assert_dist_and_unknowns(ddist[19], zeros)

        with d.unlocked():
            d.set_weights(np.array([1, 2, 3, 4, 5]))
        ddist = distribution.get_distributions(d)

        self.assertEqual(len(ddist), 20)
        assert_dist_and_unknowns(ddist[0], [15, 0, 0])
        assert_dist_and_unknowns(ddist[1], [14, 0, 1])
        assert_dist_and_unknowns(ddist[2], [8, 2, 5])
        assert_dist_and_unknowns(ddist[3], [9, 5, 1])
        assert_dist_and_unknowns(ddist[4], [12, 1, 0])
        assert_dist_and_unknowns(ddist[5], [9, 1, 3])
        assert_dist_and_unknowns(ddist[6], [4, 7, 1])
        assert_dist_and_unknowns(ddist[7], [15, 0, 0])
        assert_dist_and_unknowns(ddist[8], [13, 0, 2])
        assert_dist_and_unknowns(ddist[9], [14, 1, 0])

        zeros = [[0], [15]]
        assert_dist_and_unknowns(ddist[10], zeros)
        assert_dist_and_unknowns(ddist[11], zeros)
        assert_dist_and_unknowns(ddist[12], zeros)
        assert_dist_and_unknowns(ddist[13], [[0, 1, 1.1], [9, 1, 5]])
        assert_dist_and_unknowns(ddist[14], [[0, 1, 2], [12, 1, 2]])
        assert_dist_and_unknowns(ddist[15], zeros)
        assert_dist_and_unknowns(ddist[16], [[0, 1, 2], [12, 2, 1]])
        assert_dist_and_unknowns(ddist[17], [[0], [12]])
        assert_dist_and_unknowns(ddist[18], [[0, 2], [14, 1]])
        assert_dist_and_unknowns(ddist[19], zeros)
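For reference, a small dense sketch of the same API (assuming `distribution` here is `Orange.statistics.distribution`, as in Orange 3):

import numpy as np
from Orange.data import Domain, DiscreteVariable, ContinuousVariable, Table
from Orange.statistics import distribution

domain = Domain([DiscreteVariable("d", values=("a", "b", "c")),
                 ContinuousVariable("x")])
table = Table.from_numpy(domain, np.array([[0, 1.0],
                                           [1, 1.0],
                                           [np.nan, 2.5]]))

dists = distribution.get_distributions(table)
print(dists[0])           # discrete: counts per value of "d" -> [1, 1, 0]
print(dists[0].unknowns)  # one missing value in column "d"
print(dists[1])           # continuous: rows are [distinct values], [counts] -> [[1.0, 2.5], [2, 1]]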
Example #7
        self.assertTrue(math.isnan(var.to_val("?")))

        # TODO: with self.assertRaises(ValueError): var.to_val(2)
        with self.assertRaises(ValueError):
            var.to_val("G")


PickleContinuousVariable = create_pickling_tests(
    "PickleContinuousVariable",
    ("variable", lambda: data.ContinuousVariable()),
    ("with_name", lambda: data.ContinuousVariable(name="Feature 0")),
)

PickleDiscreteVariable = create_pickling_tests(
    "PickleDiscreteVariable",
    ("variable", lambda: data.DiscreteVariable()),
    ("with_name", lambda: data.DiscreteVariable(name="Feature 0")),
    ("with_int_values",
     lambda: data.DiscreteVariable(name="Feature 0", values=[1, 2, 3])),
    ("with_str_value",
     lambda: data.DiscreteVariable(name="Feature 0", values=["F", "M"])),
    ("ordered", lambda: data.DiscreteVariable(
        name="Feature 0", values=["F", "M"], ordered=True)),
    ("with_base_value", lambda: data.DiscreteVariable(
        name="Feature 0", values=["F", "M"], base_value=0)),
)

PickleStringVariable = create_pickling_tests(
    "PickleStringVariable",
    ("variable", lambda: data.StringVariable()),
    ("with_name", lambda: data.StringVariable(name="Feature 0")),
Example #8
class WikipediaAPI:
    """ Wraps Wikipedia API.

    Examples:
        >>> api = WikipediaAPI()
        >>> corpus = api.search('en', ['Barack Obama', 'Hillary Clinton'])
    """
    metas = [
        (data.StringVariable('Title'), lambda doc: getattr(doc, 'title')),
        (data.StringVariable('Content'), lambda doc: getattr(doc, 'content')),
        (data.StringVariable('Summary'), lambda doc: getattr(doc, 'summary')),
        (data.StringVariable('URL'), lambda doc: getattr(doc, 'url')),
        (data.ContinuousVariable('Article ID', number_of_decimals=0),
         lambda doc: int(getattr(doc, 'pageid'))),
        (data.ContinuousVariable('Revision ID', number_of_decimals=0),
         lambda doc: int(getattr(doc, 'revision_id'))),
        (data.DiscreteVariable('Query'), lambda doc: getattr(doc, 'query')),
    ]

    attributes = []
    class_vars = []
    text_features = [m for m, _ in metas]
    string_attributes = [
        m for m, _ in metas if isinstance(m, data.StringVariable)
    ]

    def __init__(self, on_error=None):
        super().__init__()
        self.on_error = on_error or (lambda x: x)

    def search(self,
               lang,
               queries,
               articles_per_query=10,
               should_break=None,
               on_progress=None):
        """ Searches for articles.

        Args:
            lang (str): A language code in ISO 639-1 format.
            queries (list of str): A list of queries.
            articles_per_query (int): The number of articles to fetch per query.
            should_break (callable): Callback for stopping the computation early.
                If it evaluates to True, downloading stops and the documents
                downloaded so far are returned in a Corpus.
            on_progress (callable): Callback for reporting progress.
        """
        wikipedia.set_lang(lang)

        results = []
        for i, query in enumerate(queries):
            try:
                articles = wikipedia.search(query, results=articles_per_query)
                for j, article in enumerate(articles):
                    if callable(should_break) and should_break():
                        break

                    results.extend(self._get(article, query, should_break))

                    if callable(on_progress):
                        on_progress((i * articles_per_query + j + 1) /
                                    (len(queries) * articles_per_query),
                                    len(results))
            except (wikipedia.exceptions.HTTPTimeoutError, IOError) as e:
                self.on_error(str(e))
                break

            if callable(should_break) and should_break():
                break

        return Corpus.from_documents(results,
                                     'Wikipedia',
                                     self.attributes,
                                     self.class_vars,
                                     self.metas,
                                     title_indices=[-1])

    def _get(self, article, query, should_break, recursive=True):
        try:
            article = wikipedia.page(article)
            article.query = query
            return [article]
        except wikipedia.exceptions.DisambiguationError:
            res = []
            if recursive:
                for article in wikipedia.search(article, 10):
                    if callable(should_break) and should_break():
                        break
                    res.extend(
                        self._get(article,
                                  query,
                                  should_break,
                                  recursive=False))
            return res

        except wikipedia.exceptions.PageError:
            return []
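A hedged usage sketch for the class above, wiring in the optional callbacks; the stop flag and print-based callbacks are illustrative assumptions:

stop_requested = False

api = WikipediaAPI(on_error=print)
corpus = api.search(
    'en', ['Barack Obama', 'Hillary Clinton'],
    articles_per_query=5,
    should_break=lambda: stop_requested,
    on_progress=lambda progress, n_docs: print(progress, n_docs))
print(len(corpus))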
Example #9
    def _construct_sparse():
        domain = data.Domain(
            [
                data.DiscreteVariable("d%i" % i, values=list("abc"))
                for i in range(10)
            ] + [data.ContinuousVariable("c%i" % i) for i in range(10)],
            data.DiscreteVariable("y", values=list("abc")),
        )

        #  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19
        # ------------------------------------------------------------
        #     2     2  1  1  2        1           1  1     2  0  2
        #        1  1  0  0  1     2                 2     1  0
        #           1     2  0
        #
        #        2        0  1                   1.1
        #
        sdata = np.array([2, 2, 1, 1, 2, 1, 1, 1, 2, 0, 2,
                          1, 1, 0, 0, 1, 2, 2, 1, 0,
                          1, 2, 0,
                          2, 0, 1, 1.1])
        indices = [1, 3, 4, 5, 6, 9, 13, 14, 16, 17, 18,
                   2, 3, 4, 5, 6, 8, 14, 16, 17,
                   3, 5, 6,
                   2, 5, 6, 13]
        indptr = [0, 11, 20, 23, 23, 27]
        X = sp.csr_matrix((sdata, indices, indptr), shape=(5, 20))
        Y = np.array([[1, 2, 1, 0, 0]]).T
        return data.Table.from_numpy(domain, X, Y)
Example #10
class NYT:
    """ Class for fetching records from the NYT API. """

    @staticmethod
    def keywords(doc, name):
        return ', '.join([kw.get('value')
                          for kw in doc.get('keywords', [])
                          if kw['name'] == name])

    attributes = []

    class_vars = [
        (data.DiscreteVariable('Section'), lambda doc: doc.get('section_name', None)),
    ]

    tv = data.TimeVariable('Publication Date')
    metas = [
        (data.StringVariable('Headline'), lambda doc: doc.get('headline', {}).get('main') or ''),
        (data.StringVariable('Abstract'), lambda doc: doc.get('abstract') or ''),
        (data.StringVariable('Snippet'), lambda doc: doc.get('snippet') or ''),
        (data.StringVariable('Lead Paragraph'), lambda doc: doc.get('lead_paragraph') or ''),
        (data.StringVariable('Subject Keywords'), lambda doc: NYT.keywords(doc, 'subject')),
        (data.StringVariable('URL'), lambda doc: doc.get('web_url') or ''),
        (data.StringVariable('Locations'), lambda doc: NYT.keywords(doc, 'glocations')),
        (data.StringVariable('Persons'), lambda doc: NYT.keywords(doc, 'persons')),
        (data.StringVariable('Organizations'), lambda doc: NYT.keywords(doc, 'organizations')),
        (data.StringVariable('Creative Works'), lambda doc: NYT.keywords(doc, 'creative_works')),
        (tv, lambda doc: NYT.tv.parse(doc.get('pub_date'))),
        (data.DiscreteVariable('Article Type'), lambda doc: doc.get('type_of_material', None)),
        (data.ContinuousVariable('Word Count', number_of_decimals=0), lambda doc: doc.get('word_count', None)),
    ]

    text_features = [metas[0][0], metas[1][0]]  # headline + abstract

    def __init__(self, api_key):
        """
        Args:
            api_key (str): NY Times API key.
        """
        self.api_key = api_key
        self.on_error = None
        self.on_rate_limit = None
        self.on_no_connection = None
        self.cache_path = None
        self._cache_init()

    def api_key_valid(self):
        """ Checks whether api key given at initialization is valid. """
        url = self._encode_url('test')
        try:
            with request.urlopen(url) as connection:
                if connection.getcode() == 200:
                    return True
        except (HTTPError, URLError, HTTPException):
            return False

    def search(self, query, date_from=None, date_to=None, max_docs=None,
               on_progress=None, should_break=None):
        """
        Args:
            query (str): Search query.
            date_from (date): Start date limit.
            date_to (date): End date limit.
            max_docs (int): Maximal number of documents returned.
            on_progress (callback): Called after every iteration of downloading.
            should_break (callback): Callback for stopping the computation early.
                If it evaluates to True, downloading stops and the documents
                downloaded so far are returned in a Corpus.

        Returns:
            Corpus: Search results.
        """
        if max_docs is None or max_docs > MAX_DOCS:
            max_docs = MAX_DOCS

        # TODO create corpus on the fly and extend, so it stops faster.
        records = []
        data, go_sleep = self._fetch_page(query, date_from, date_to, 0)
        if data is None:
            return None
        records.extend(data['response']['docs'])
        max_docs = min(data['response']['meta']['hits'], max_docs)
        if callable(on_progress):
            on_progress(len(records), max_docs)

        for page in range(1, math.ceil(max_docs/BATCH_SIZE)):
            if callable(should_break) and should_break():
                break

            if go_sleep:
                sleep(SLEEP)

            data, go_sleep = self._fetch_page(query, date_from, date_to, page)

            if data is None:
                break

            records.extend(data['response']['docs'])
            if callable(on_progress):
                on_progress(len(records), max_docs)

        if len(records) > max_docs:
            records = records[:max_docs]

        return Corpus.from_documents(records, 'NY Times', self.attributes,
                                     self.class_vars, self.metas, title_indices=[-1])

    def _cache_init(self):
        """ Initialize cache in Orange environment buffer dir. """
        path = os.path.join(environ.cache_dir(), "nytcache")
        try:
            if not os.path.exists(path):
                os.makedirs(path)
            self.cache_path = os.path.join(path, "query_cache")
        except OSError as e:
            warnings.warn('Could not initialize NYT cache: {}'.format(str(e)), RuntimeWarning)

    def _cache_fetch(self, url):
        """ Fetch URL from cache if present. """
        with shelve.open(self.cache_path) as cache:
            if url in cache.keys():
                return cache[url]
            else:
                return None

    def _cache_store(self, url, data):
        """ Store data for URL in cache. """
        with shelve.open(self.cache_path) as cache:
            cache[url] = data

    def _fetch_page(self, query, date_from, date_to, page):
        """ Fetch one page either from cache or web. """
        cache_url = self._encode_url(query, date_from, date_to, page, for_caching=True)
        data = self._cache_fetch(cache_url)
        if data:
            return data, False
        else:
            url = self._encode_url(query, date_from, date_to, page, for_caching=False)
            try:
                with request.urlopen(url, timeout=TIMEOUT) as conn:
                    data = conn.read().decode('utf-8')
            except HTTPError as e:
                if e.code == 403 and page > 0:
                    # occasionally some pages return error 403 (Forbidden)
                    # while all other page numbers seem to work just fine.
                    # Skip such pages and don't break loading!
                    warnings.warn('NYT api returned HTTPError with code 403 '
                                  '(Forbidden)! Skipping this page ...')
                    return {'response': {'docs': []}}, True
                if e.code == 429 and callable(self.on_rate_limit):
                    self.on_rate_limit()
                elif callable(self.on_error):
                    self.on_error(str(e))
                return None, False
            except URLError:
                if callable(self.on_no_connection):
                    self.on_no_connection()
                    return None, False
                raise
            data = json.loads(data)
            self._cache_store(cache_url, data)
            return data, True

    def _encode_url(self, query, date_from=None, date_to=None, page=0, for_caching=False):
        """
        Encode url for given query, date restrictions and page number.

        Args:
            query (str): Search query.
            date_from (date): Date restriction.
            date_to (date): Date restriction.
            page (int): Page number.
            for_caching (bool): Whether the URL will be used for caching. If set,
                BASE_URL and the API key are excluded.

        Returns:
            str: An encoded URL.
        """
        params = [   # list required to preserve order - important for caching
            ('fq', 'The New York Times'),
            ('api-key', self.api_key),
            ('q', query),
            ('page', page),
        ]
        if date_from:
            params.append(('begin_date', date_from.strftime('%Y%m%d')))
        if date_to:
            params.append(('end_date', date_to.strftime('%Y%m%d')))

        if for_caching:     # remove api key, return only params
            del params[0]
            return parse.urlencode(params)
        else:
            return '{}?{}'.format(BASE_URL, parse.urlencode(params))
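A hedged usage sketch for the NYT client above; the API key, query, and callbacks are placeholders and illustrative assumptions:

from datetime import date

nyt = NYT(api_key='YOUR-NYT-API-KEY')
nyt.on_error = print    # optional error callback used by _fetch_page

if nyt.api_key_valid():
    corpus = nyt.search(
        'ljubljana',
        date_from=date(2016, 1, 1),
        date_to=date(2016, 12, 31),
        max_docs=100,
        on_progress=lambda n_done, n_total: print(n_done, '/', n_total))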
Example #11
    def test_sparse_get_distributions(self):
        domain = data.Domain(
            [data.DiscreteVariable("d%i" % i, values=list("abc"))
             for i in range(10)] +
            [data.ContinuousVariable("c%i" % i) for i in range(10)])

        #  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19
        # ------------------------------------------------------------
        #     2     2  1  1  2        1           1  1     2  0  2
        #        1  1  0  0  1     2                 2     1  0
        #           1     2  0
        #
        #        2        0  1                   1.1
        #
        sdata = np.array([2, 2, 1, 1, 2, 1, 1, 1, 2, 0, 2,
                          1, 1, 0, 0, 1, 2, 2, 1, 0,
                          1, 2, 0,
                          2, 0, 1, 1.1])
        indices = [1, 3, 4, 5, 6, 9, 13, 14, 16, 17, 18,
                   2, 3, 4, 5, 6, 8, 14, 16, 17,
                   3, 5, 6,
                   2, 5, 6, 13]
        indptr = [0, 11, 20, 23, 23, 27]
        X = sp.csr_matrix((sdata, indices, indptr), shape=(5, 20))
        d = data.Table.from_numpy(domain, X)

        ddist = distribution.get_distributions(d)

        self.assertEqual(len(ddist), 20)
        np.testing.assert_almost_equal(ddist[0], [0, 0, 0])
        np.testing.assert_almost_equal(ddist[1], [0, 0, 1])
        np.testing.assert_almost_equal(ddist[2], [0, 1, 1])
        np.testing.assert_almost_equal(ddist[3], [0, 2, 1])
        np.testing.assert_almost_equal(ddist[4], [1, 1, 0])
        np.testing.assert_almost_equal(ddist[5], [2, 1, 1])
        np.testing.assert_almost_equal(ddist[6], [1, 2, 1])
        np.testing.assert_almost_equal(ddist[7], [0, 0, 0])
        np.testing.assert_almost_equal(ddist[8], [0, 0, 1])
        np.testing.assert_almost_equal(ddist[9], [0, 1, 0])

        z = np.zeros((2, 0))
        np.testing.assert_almost_equal(ddist[10], z)
        np.testing.assert_almost_equal(ddist[11], z)
        np.testing.assert_almost_equal(ddist[12], z)
        np.testing.assert_almost_equal(ddist[13], [[1, 1.1], [1, 1]])
        np.testing.assert_almost_equal(ddist[14], [[1, 2], [1, 1]])
        np.testing.assert_almost_equal(ddist[15], z)
        np.testing.assert_almost_equal(ddist[16], [[1, 2], [1, 1]])
        np.testing.assert_almost_equal(ddist[17], [[0], [2]])
        np.testing.assert_almost_equal(ddist[18], [[2], [1]])
        np.testing.assert_almost_equal(ddist[19], z)

        d.set_weights(np.array([1, 2, 3, 4, 5]))

        ddist = distribution.get_distributions(d)

        self.assertEqual(len(ddist), 20)
        np.testing.assert_almost_equal(ddist[0], [0, 0, 0])
        np.testing.assert_almost_equal(ddist[1], [0, 0, 1])
        np.testing.assert_almost_equal(ddist[2], [0, 2, 5])
        np.testing.assert_almost_equal(ddist[3], [0, 5, 1])
        np.testing.assert_almost_equal(ddist[4], [2, 1, 0])
        np.testing.assert_almost_equal(ddist[5], [7, 1, 3])
        np.testing.assert_almost_equal(ddist[6], [3, 7, 1])
        np.testing.assert_almost_equal(ddist[7], [0, 0, 0])
        np.testing.assert_almost_equal(ddist[8], [0, 0, 2])
        np.testing.assert_almost_equal(ddist[9], [0, 1, 0])

        z = np.zeros((2, 0))
        np.testing.assert_almost_equal(ddist[10], z)
        np.testing.assert_almost_equal(ddist[11], z)
        np.testing.assert_almost_equal(ddist[12], z)
        np.testing.assert_almost_equal(ddist[13], [[1, 1.1], [1, 5]])
        np.testing.assert_almost_equal(ddist[14], [[1, 2], [1, 2]])
        np.testing.assert_almost_equal(ddist[15], z)
        np.testing.assert_almost_equal(ddist[16], [[1, 2], [2, 1]])
        np.testing.assert_almost_equal(ddist[17], [[0], [3]])
        np.testing.assert_almost_equal(ddist[18], [[2], [1]])
        np.testing.assert_almost_equal(ddist[19], z)
Example #12
import unittest

from Orange.testing import create_pickling_tests
from Orange import data

age = data.ContinuousVariable(name="AGE")
gender = data.DiscreteVariable(name="Gender",
                               values=["M", "F"])
incomeA = data.ContinuousVariable(name="AGE")
income = data.ContinuousVariable(name="income")
education = data.DiscreteVariable(name="education",
                                  values=["GS", "HS", "C"])
ssn = data.StringVariable(name="SSN")
race = data.DiscreteVariable(name="race",
                             values=["White", "Hispanic", "African", "Other"])

PickleDomain = create_pickling_tests(
    "PickleDomain",
    ("empty_domain", lambda: data.Domain([])),
    ("with_continuous_variable", lambda: data.Domain([age])),
    ("with_discrete_variable", lambda: data.Domain([gender])),
    ("with_mixed_variables", lambda: data.Domain([age, gender])),
    ("with_continuous_class", lambda: data.Domain([age, gender],
                                                  [incomeA])),
    ("with_discrete_class", lambda: data.Domain([age, gender],
                                                [education])),
    ("with_multiple_classes", lambda: data.Domain([age, gender],
                                                  [incomeA, education])),
    ("with_metas", lambda: data.Domain([age, gender], metas=[ssn])),
    ("with_class_and_metas", lambda: data.Domain([age, gender],
                                                 [incomeA, education],
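A short sketch of what these generated tests verify, reusing the module-level variables defined above (not part of the original file): a Domain survives a pickle round trip with its variables intact.

import pickle

domain = data.Domain([age, gender], [education], metas=[ssn])
restored = pickle.loads(pickle.dumps(domain))
assert [v.name for v in restored.variables] == ["AGE", "Gender", "education"]
assert [m.name for m in restored.metas] == ["SSN"]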
Example #13
class TheGuardianAPI:
    attributes = []

    class_vars = [
        (data.DiscreteVariable('Section'), lambda doc: doc['sectionName']),
    ]

    tv = data.TimeVariable('Publication Date')
    metas = [
        (data.StringVariable('Headline'), lambda doc: doc['fields']['headline']),
        (data.StringVariable('Content'), lambda doc: doc['fields']['bodyText']),
        (data.StringVariable('Trail Text'), lambda doc: doc['fields']['trailText']),
        (data.StringVariable('HTML'), lambda doc: doc['fields']['body']),
        (tv, lambda doc: TheGuardianAPI.tv.parse(doc['webPublicationDate'])),
        (data.DiscreteVariable('Type'), lambda doc: doc['type']),
        (data.DiscreteVariable('Language'), lambda doc: doc['fields']['lang']),
        (data.StringVariable('Tags'),
            lambda doc: ', '.join(tag['webTitle'] for tag in doc['tags'])),
        (data.StringVariable('URL'), lambda doc: doc['webUrl']),
        (data.ContinuousVariable('Word Count', number_of_decimals=0),
            lambda doc: doc['fields']['wordcount']),
    ]

    text_features = [metas[0][0], metas[1][0]]  # Headline + Content
    title_indices = [-1]    # Headline

    def __init__(self, credentials, on_progress=None, should_break=None):
        """
        Args:
            credentials (:class:`TheGuardianCredentials`): The Guardian credentials.
            on_progress (callable): Function for progress reporting.
            should_break (callable): Function for early stopping.
        """
        self.per_page = ARTICLES_PER_PAGE
        self.pages = 0
        self.credentials = credentials
        self.on_progress = on_progress or (lambda x, y: None)
        self.should_break = should_break or (lambda: False)

        self.results = []

    def _search(self, query, from_date, to_date, page=1):
        data = self._build_query(query, from_date, to_date, page)

        response = requests.get(BASE_URL, data)
        parsed = json.loads(response.text)

        if page == 1:   # store number of pages
            self.pages = parsed['response']['pages']

        self.results.extend(parsed['response']['results'])

    def _build_query(self, query, from_date=None, to_date=None, page=1):
        data = {
            'q': query,
            'api-key': self.credentials.key,
            'page': str(page),
            'show-fields': 'headline,trailText,body,bodyText,lang,wordcount',
            'show-tags': 'all',
        }
        if from_date is not None:
            data['from-date'] = from_date
        if to_date is not None:
            data['to-date'] = to_date

        return data

    def search(self, query, from_date=None, to_date=None, max_documents=None,
               accumulate=False):
        """
        Search The Guardian API for articles.

        Args:
            query (str): A query to search the articles by.
            from_date (str): Search only articles newer than the date provided.
                Date should be in ISO format; e.g. '2016-12-31'.
            to_date (str): Search only articles older than the date provided.
                Date should be in ISO format; e.g. '2016-12-31'.
            max_documents (int): Maximum number of documents to retrieve.
                When not given, retrieve all documents.
            accumulate (bool): A flag indicating whether to accumulate results
                of multiple consequent search calls.

        Returns:
            :ref:`Corpus`
        """
        if not accumulate:
            self.results = []

        self._search(query, from_date, to_date)

        pages = math.ceil(max_documents/self.per_page) if max_documents else self.pages
        self.on_progress(self.per_page, pages * self.per_page)

        for p in range(2, pages+1):     # page numbers are one-based
            if self.should_break():
                break
            self._search(query, from_date, to_date, p)
            self.on_progress(p*self.per_page, pages * self.per_page)

        c = Corpus.from_documents(
            self.results, 'The Guardian', self.attributes, self.class_vars,
            self.metas, title_indices=self.title_indices)
        c.text_features = self.text_features
        return c
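A hedged usage sketch for the class above; `TheGuardianCredentials` is assumed to come from the same orange3-text module and to accept the API key, which is a placeholder here:

credentials = TheGuardianCredentials('YOUR-GUARDIAN-API-KEY')

api = TheGuardianAPI(
    credentials,
    on_progress=lambda n_done, n_total: print(n_done, '/', n_total))
corpus = api.search('slovenia', from_date='2016-01-01',
                    to_date='2016-12-31', max_documents=50)
print(len(corpus))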
Example #14
class TwitterAPI:
    """ Fetch tweets from the Tweeter API.

    Notes:
        Results across multiple searches are aggregated. To remove tweets from
        previous searches and only return results from the last search, either
        call the `reset` method before searching or pass `collecting=False`
        to the search method.
    """
    attributes = []
    class_vars = [
        (data.DiscreteVariable('Author'), lambda doc: '@' + doc.author.screen_name),
    ]

    tv = data.TimeVariable('Date')
    metas = [
        (data.StringVariable('Content'), lambda doc: doc.text),
        (tv, lambda doc: TwitterAPI.tv.parse(doc.created_at.isoformat())),
        (data.DiscreteVariable('Language'), lambda doc: doc.lang),
        (data.DiscreteVariable('Location'), lambda doc: getattr(doc.place, 'country_code', None)),
        (data.ContinuousVariable('Number of Likes', number_of_decimals=0),
         lambda doc: doc.favorite_count),
        (data.ContinuousVariable('Number of Retweets', number_of_decimals=0),
         lambda doc: doc.retweet_count),
        (data.DiscreteVariable('In Reply To'),
            lambda doc: '@' + doc.in_reply_to_screen_name if doc.in_reply_to_screen_name else ''),
        (data.DiscreteVariable('Author Name'), lambda doc: doc.author.name),
        (data.StringVariable('Author Description'), lambda doc: doc.author.description),
        (data.ContinuousVariable('Author Statuses Count', number_of_decimals=0),
         lambda doc: doc.author.statuses_count),
        (data.ContinuousVariable('Author Favourites Count', number_of_decimals=0),
         lambda doc: doc.author.favourites_count),
        (data.ContinuousVariable('Author Friends Count', number_of_decimals=0),
         lambda doc: doc.author.friends_count),
        (data.ContinuousVariable('Author Followers Count', number_of_decimals=0),
         lambda doc: doc.author.followers_count),
        (data.ContinuousVariable('Author Listed Count', number_of_decimals=0),
         lambda doc: doc.author.listed_count),
        (data.DiscreteVariable('Author Verified'), lambda doc: str(doc.author.verified)),
        (data.ContinuousVariable('Longitude'),
            lambda doc: coordinates_geoJSON(doc.coordinates)[0]),
        (data.ContinuousVariable('Latitude'),
            lambda doc: coordinates_geoJSON(doc.coordinates)[1]),
    ]

    text_features = [metas[0][0]]       # Content
    string_attributes = [m for m, _ in metas
                         if isinstance(m, data.StringVariable)]

    def __init__(self, credentials,
                 on_progress=None, should_break=None,
                 on_error=None, on_rate_limit=None):
        self.key = credentials
        self.api = tweepy.API(credentials.auth)
        self.container = OrderedDict()
        self.search_history = []

        # Callbacks:
        self.on_error = on_error
        self.on_rate_limit = on_rate_limit
        self.on_progress = on_progress or (lambda *args: args)
        self.should_break = should_break or (lambda *args: False)

    @property
    def tweets(self):
        return self.container.values()

    def search_content(self, content, *, max_tweets=0,
                       lang=None, allow_retweets=True,
                       collecting=False):
        """ Search by content.

        Args:
            content (list of str): A list of key words to search for.
            max_tweets (int): If greater than zero limits the number of
                downloaded tweets.
            lang (str): A language's code (either ISO 639-1 or ISO 639-3
                formats).
            allow_retweets(bool): Whether to download retweets.
            collecting (bool): Whether to collect results across multiple
                search calls.

        Returns:
            Corpus
        """
        if not collecting:
            self.reset()

        if max_tweets == 0:
            max_tweets = float('Inf')

        def build_query():
            nonlocal content
            if not content:
                q = 'from: '
            else:
                if not isinstance(content, list):
                    content = [content]
                q = ' OR '.join(['"{}"'.format(q) for q in content])
            if not allow_retweets:
                q += ' -filter:retweets'
            return q

        query = build_query()
        cursor = tweepy.Cursor(self.api.search, q=query, lang=lang)
        corpus, count = self.fetch(cursor, max_tweets)
        self.append_history('Content', content, lang if lang else 'Any',
                            str(allow_retweets), count)
        return corpus

    def search_authors(self, authors, *, max_tweets=0, collecting=False):
        """ Search by authors.

        Args:
            authors (list of str): A list of authors to search for.
            max_tweets (int): If greater than zero limits the number of
                downloaded tweets.
            collecting (bool): Whether to collect results across multiple
                search calls.

        Returns:
            Corpus
        """
        if not collecting:
            self.reset()

        if max_tweets == 0:     # set to max allowed for progress
            max_tweets = 3200

        if not isinstance(authors, list):
            authors = [authors]

        cursors = [tweepy.Cursor(self.api.user_timeline, screen_name=a)
                   for a in authors]
        corpus, count = self.fetch(cursors, max_tweets)
        self.append_history('Author', authors, None, None, count)
        return corpus

    def fetch(self, cursors, max_tweets):
        if not isinstance(cursors, list):
            cursors = [cursors]

        count = 0
        try:
            for i, cursor in enumerate(cursors):
                for j, tweet in enumerate(cursor.items(max_tweets), start=1):
                    if self.should_break():
                        break
                    if tweet.id not in self.container:
                        count += 1
                    self.container[tweet.id] = tweet
                    if j % 20 == 0:
                        self.on_progress(len(self.container),
                                         (i*max_tweets + j)/
                                         (len(cursors)*max_tweets))
                if self.should_break():
                    break
        except tweepy.TweepError as e:
            if e.response.status_code == 429 and self.on_rate_limit:
                self.on_rate_limit()
            elif self.on_error:
                self.on_error(str(e))
                return None, 0
        return self.create_corpus(), count

    def create_corpus(self):
        return Corpus.from_documents(self.tweets, 'Twitter', self.attributes,
                                     self.class_vars, self.metas,
                                     title_indices=[-1])

    def reset(self):
        """ Removes all downloaded tweets. """
        self.search_history = []
        self.container = OrderedDict()

    def append_history(self, mode, query, lang, allow_retweets, n_tweets):
        query = ', '.join(query) if isinstance(query, Iterable) else query
        if lang in code2lang.keys():
            lang = code2lang[lang]
        self.search_history.append((
            ('Query', query),
            ('Search by', mode),
            ('Language', lang),
            ('Allow retweets', allow_retweets),
            ('Tweets count', n_tweets),
        ))

    def report(self):
        return self.search_history
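A hedged usage sketch for the TwitterAPI class above. The credentials object only needs an `auth` attribute holding a tweepy auth handler (the constructor uses `credentials.auth`); the keys are placeholders, and `tweepy.OAuthHandler` from older tweepy releases is assumed:

from types import SimpleNamespace

import tweepy

auth = tweepy.OAuthHandler('CONSUMER-KEY', 'CONSUMER-SECRET')
auth.set_access_token('ACCESS-TOKEN', 'ACCESS-SECRET')
credentials = SimpleNamespace(auth=auth)

api = TwitterAPI(credentials)
corpus = api.search_content(['data mining'], max_tweets=100,
                            lang='en', allow_retweets=False)
print(api.report())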
Example #15
class FacebookOrangeAPI:
    attributes = []
    class_vars = []
    image_var = data.StringVariable.make("image")
    image_var.attributes["type"] = "image"
    post_metas = [
        (data.StringVariable('Message'), lambda doc: doc['status_message']),
        (data.DiscreteVariable('From'), lambda doc: doc['from_name']),
        (data.ContinuousVariable('likes'), lambda doc: doc['like']),
        (data.ContinuousVariable('comments'), lambda doc: doc['comments']),
        (data.ContinuousVariable('shares'), lambda doc: doc['shares']),
        (data.DiscreteVariable('top emotion'),
         lambda doc: doc['top_reaction']),
        (data.StringVariable('Link name'), lambda doc: doc['link_name']),
        (image_var, lambda doc: doc['picture']),
        (data.StringVariable('link'), lambda doc: doc['status_link']),
        (data.DiscreteVariable('From ID'), lambda doc: doc['from_id']),
        (data.StringVariable('Post ID'), lambda doc: doc['status_id']),
        (data.DiscreteVariable('Post type'), lambda doc: doc['status_type']),
        (data.TimeVariable('Publication Date'),
         lambda doc: doc['status_published']),
        (data.TimeVariable('Publication Date UTC'),
         lambda doc: doc['status_published_utc']),
        (data.ContinuousVariable('emotion angry'), lambda doc: doc['angry']),
        (data.ContinuousVariable('emotion love'), lambda doc: doc['love']),
        (data.ContinuousVariable('emotion haha'), lambda doc: doc['haha']),
        (data.ContinuousVariable('emotion wow'), lambda doc: doc['wow']),
        (data.ContinuousVariable('emotion sad'), lambda doc: doc['sad'])
    ]
    text_features = [post_metas[0][0]]
    title_indices = [-1]

    def __init__(self, credentials, on_progress=None, should_break=None):
        self.utc_datecor = datetime.utcnow() - datetime.now()
        self.pages = 0
        self.credentials = credentials
        self.on_progress = on_progress or (lambda x, y: None)
        self.should_break = should_break or (lambda: False)

    def buildUrl(self, node, version='v2.11'):
        return BASE_URL + '/' + version + '/' + node

    def getData(self, url, params=None):
        while True:
            if self.should_break():
                return {}
            try:
                headers = {'Authorization': 'Bearer ' + self.credentials.token}
                p = requests.get(url, params=params, headers=headers)
                return p.json()
            except:
                print('retry in 5 sec')
                for i in range(50):
                    if self.should_break():
                        return {}
                    time.sleep(0.1)

    def localToUtc(self, date):
        return date + self.utc_datecor

    def utcToLocal(self, date):
        return date - self.utc_datecor

    def processDate(self, created_time):
        return datetime.strptime(created_time, '%Y-%m-%dT%H:%M:%S+0000')

    def processStatus(self, status, engagement=True):
        d = {}
        d['status_id'] = status['id']
        d['from_id'] = status['from']['id'] if 'from' in status else ''
        d['from_name'] = status['from']['name'] if 'from' in status else ''
        d['status_message'] = status.get('message', '')
        d['status_type'] = status['type']
        d['link_name'] = status.get('name', '')

        d['status_published_utc'] = self.processDate(status['created_time'])
        d['status_published'] = self.utcToLocal(d['status_published_utc'])
        d['status_link'] = status.get('link', '')
        d['picture'] = status.get('full_picture', '')

        d['like'] = (status['like']['summary']['total_count']
                     if engagement else '')
        d['comments'] = (status['comments']['summary']['total_count']
                         if engagement else '')
        d['shares'] = status['shares']['count'] if 'shares' in status else ''

        topscore = 0
        d['top_reaction'] = ''
        for score in ['love', 'haha', 'wow', 'sad', 'angry']:
            if engagement:
                d[score] = status[score]['summary']['total_count']
                if int(d[score]) > topscore:
                    topscore = int(d[score])
                    d['top_reaction'] = score
            else:
                d[score] = ''

        return d

    def fieldString(self, engagement=True):
        field_string = 'message,from,link,created_time,type,name,id,full_picture'

        if engagement:
            field_string += ',' + 'comments.limit(0).summary(true),shares.limit(0).summary(true)'
            for r in ['like', 'love', 'haha', 'wow', 'sad', 'angry']:
                field_string += ',' + 'reactions.type({}).limit(0).summary(true).as({})'.format(
                    r.upper(), r.lower())
        return field_string

    def getStatuses(self,
                    page_id,
                    mode='posts',
                    since=None,
                    until=None,
                    engagement=True,
                    comments=True):
        # mode can be "posts" (posts by the page), "feed" (all posts on the
        # page) or "tagged" (all public posts in which the page is tagged)
        node = page_id + '/' + mode + '/'
        url = self.buildUrl(node)

        params = {}
        params['fields'] = self.fieldString(engagement)
        params['limit'] = 100

        if since is not None:
            params['since'] = (
                self.localToUtc(since)).strftime('%Y-%m-%dT%H:%M:%S')
        if until is not None:
            params['until'] = (
                self.localToUtc(until)).strftime('%Y-%m-%dT%H:%M:%S')
        while True:
            statuses = self.getData(url, params=params)
            if 'data' not in statuses:
                break

            proc_statuses = [
                self.processStatus(s, engagement) for s in statuses['data']
            ]
            yield proc_statuses

            if 'paging' not in statuses:
                break
            if 'next' not in statuses['paging']:
                break
            url = statuses['paging']['next']

    def _search(self,
                page_ids,
                mode,
                since,
                until,
                max_documents,
                sub_progress=(0, 1)):
        since = since.strftime('%Y-%m-%d')
        until = until.strftime('%Y-%m-%d')
        since = datetime.strptime(since, '%Y-%m-%d')
        until = datetime.strptime(until + 'T23:59:59', '%Y-%m-%dT%H:%M:%S')
        total_sec = float((until - since).total_seconds())
        n_pages = len(page_ids)

        progress_pct = 1 / float(n_pages)

        for page_i in range(0, n_pages):
            page_id = page_ids[page_i].strip()
            if page_id == '': return
            if '/' in page_id: page_id = page_id.split('/')[-1]
            page_progress = progress_pct * page_i
            n = 0
            for d in self.getStatuses(page_id, mode, since, until):
                if self.should_break():
                    return
                earliest_date = d[-1]['status_published']
                sec_to_go = (until - earliest_date).total_seconds()
                date_progress = ((sec_to_go / total_sec) * progress_pct)
                progress = math.ceil((page_progress + date_progress) * 100)
                self.on_progress(progress_scale(progress, sub_progress), 100)
                for doc in d:
                    n += 1
                    if max_documents is not None:
                        if n > max_documents:
                            break
                    yield doc
                if max_documents is not None:
                    if n > max_documents:
                        break
        self.on_progress(progress_scale(100, sub_progress), 100)

    def search(self,
               page_ids,
               mode='posts',
               since=datetime.now() - timedelta(10),
               until=datetime.now(),
               max_documents=None,
               sub_progress=(0, 1)):
        results = []
        for doc in self._search(page_ids, mode, since, until, max_documents,
                                sub_progress):
            doc['status_published'] = doc['status_published'].strftime(
                '%Y-%m-%dT%H:%M:%S')
            doc['status_published_utc'] = doc['status_published_utc'].strftime(
                '%Y-%m-%dT%H:%M:%S')
            results.append(doc)

        c = Corpus.from_documents(results, 'Facebook', self.attributes,
                                  self.class_vars, self.post_metas,
                                  self.title_indices)
        c.set_text_features(self.text_features)
        return c

    def _search_posts(self, post_ids, sub_progress=(0, 1), engagement=True):
        for i, post_id in enumerate(post_ids):
            node = post_id
            url = self.buildUrl(node)

            params = {}
            params['fields'] = self.fieldString(engagement)
            params['limit'] = 100

            status = self.getData(url, params=params)
            status = self.processStatus(status)
            yield status

            progress = ((i + 1) / len(post_ids)) * 100
            self.on_progress(progress_scale(progress, sub_progress), 100)
        self.on_progress(progress_scale(100, sub_progress), 100)

    def search_posts(self, post_ids, sub_progress=(0, 1)):
        results = []
        for doc in self._search_posts(post_ids, sub_progress):
            doc['status_published'] = doc['status_published'].strftime(
                '%Y-%m-%dT%H:%M:%S')
            doc['status_published_utc'] = doc['status_published_utc'].strftime(
                '%Y-%m-%dT%H:%M:%S')
            results.append(doc)

        c = Corpus.from_documents(results, 'Facebook', self.attributes,
                                  self.class_vars, self.post_metas,
                                  self.title_indices)
        c.set_text_features(self.text_features)
        return c

    def processComment(self, comment):
        has_comment_replies = 'comments' in comment.keys()
        parent = {
            'type': 'comment',
            'comment_id': comment['id'],
            'likes': comment['like']['summary']['total_count'],
            'comment_replies': None,
            'message': comment['message'],
            'parent_comment_id': ''
        }
        parent['status_published_utc'] = self.processDate(
            comment['created_time'])
        parent['status_published'] = self.utcToLocal(
            parent['status_published_utc'])
        if has_comment_replies:
            parent['comment_replies'] = comment['comments']['summary'][
                'total_count']
        yield parent

        if has_comment_replies:
            comment_replies = comment['comments']
            while True:
                for cr in comment_replies['data']:
                    child = {
                        'type': 'comment_reply',
                        'comment_id': comment['id'],
                        'likes': cr['like']['summary']['total_count'],
                        'message': cr['message'],
                        'parent_comment_id': cr['id'],
                        'comment_replies': None
                    }
                    child['status_published_utc'] = self.processDate(
                        cr['created_time'])
                    child['status_published'] = self.utcToLocal(
                        child['status_published_utc'])
                    yield child

                if 'paging' not in comment_replies:
                    break
                if 'next' not in comment_replies['paging']:
                    break
                url = comment_replies['paging']['next']
                comment_replies = self.getData(url)

    def _getComments(self,
                     post_ids,
                     comment_replies=True,
                     sub_progress=(0, 1)):
        for i, post_id in enumerate(post_ids):
            node = post_id + '/comments'
            url = self.buildUrl(node)

            params = {}
            params[
                'fields'] = 'message,created_time,reactions.type(LIKE).summary(true).as(like)'
            if comment_replies:
                params[
                    'fields'] += ',comments.summary(true){message,created_time,reactions.type(LIKE).summary(true).as(like)}'
            params['limit'] = 100

            while True:
                comments = self.getData(url, params=params)
                if len(comments['data']) == 0:
                    break

                for comment in comments['data']:
                    for proc_comment in self.processComment(comment):
                        proc_comment['post_id'] = post_id
                        yield proc_comment

                if 'paging' not in comments:
                    break
                if 'next' not in comments['paging']:
                    break
                url = comments['paging']['next']
            progress = ((i + 1) / len(post_ids)) * 100
            self.on_progress(progress_scale(progress, sub_progress), 100)
        self.on_progress(progress_scale(100, sub_progress), 100)

    def getComments(self, post_ids, comment_replies=True, sub_progress=(0, 1)):
        attributes = []
        class_vars = []
        metas = [(data.StringVariable('Message'), lambda doc: doc['message']),
                 (data.DiscreteVariable('Type'), lambda doc: doc['type']),
                 (data.StringVariable('Post ID'), lambda doc: doc['post_id']),
                 (data.StringVariable('Comment ID'),
                  lambda doc: doc['comment_id']),
                 (data.StringVariable('Parent comment ID'),
                  lambda doc: doc['parent_comment_id']),
                 (data.ContinuousVariable('likes'), lambda doc: doc['likes']),
                 (data.ContinuousVariable('comment replies'),
                  lambda doc: doc['comment_replies']),
                 (data.TimeVariable('Publication Date'),
                  lambda doc: doc['status_published']),
                 (data.TimeVariable('Publication Date UTC'),
                  lambda doc: doc['status_published_utc'])]
        text_features = [metas[0][0]]
        title_indices = [-1]

        results = []
        for doc in self._getComments(post_ids, comment_replies, sub_progress):
            doc['status_published'] = doc['status_published'].strftime(
                '%Y-%m-%dT%H:%M:%S')
            doc['status_published_utc'] = doc['status_published_utc'].strftime(
                '%Y-%m-%dT%H:%M:%S')
            results.append(doc)

        c = Corpus.from_documents(results, 'Facebook comments', attributes,
                                  class_vars, metas, title_indices)
        c.set_text_features(text_features)
        return c
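A hedged usage sketch for FacebookOrangeAPI above; the credentials object only needs a `token` attribute (as used by `getData`), and the page id and token are placeholders:

from datetime import datetime, timedelta
from types import SimpleNamespace

credentials = SimpleNamespace(token='YOUR-PAGE-ACCESS-TOKEN')

api = FacebookOrangeAPI(
    credentials,
    on_progress=lambda done, total: print(done, '/', total))
corpus = api.search(['somepage'], mode='posts',
                    since=datetime.now() - timedelta(days=30),
                    max_documents=200)
print(len(corpus))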