Example #1
0
    def __init__(self, primary_dimension, secondary_dimension=None):
        """
        Build a DataTable over a primary and an optional secondary dimension.

        Either argument may be given as a string key, in which case it is
        resolved through ``registry.get_dimension``. A secondary dimension
        whose ``key`` is ``"groups"`` is discarded, as if it had been omitted.

        :type primary_dimension: registry.models.CategoricalDimension
        :type secondary_dimension: registry.models.CategoricalDimension

        :return:
        """

        # Resolve string keys into dimension objects.
        # (None is never a string, so no separate None check is required.)
        if isinstance(primary_dimension, basestring):
            primary_dimension = registry.get_dimension(primary_dimension)
        if isinstance(secondary_dimension, basestring):
            secondary_dimension = registry.get_dimension(secondary_dimension)

        # HACK: a "groups" secondary dimension is dropped entirely
        has_key = hasattr(secondary_dimension, 'key')
        if has_key and secondary_dimension.key == "groups":
            secondary_dimension = None

        self.primary_dimension = primary_dimension
        self.secondary_dimension = secondary_dimension
        self.mode = "default"
Example #2
0
    def test_double_quantitative_one_wide(self):
        """Two quantitative dimensions render correctly when only one needs binning."""
        pairs = [(0, 1), (2, 3), (6, 59999), (6, 60000)]
        raw_distribution = self.get_distribution(pairs)

        dataset = self.generate_messages_for_multi_distribution(
            ('shared_count', 'replied_to_count'), raw_distribution)

        # Expected output: the two largest reply counts share one bin,
        # while the small reply counts land in the bin starting at 0.
        expected = {}
        expected[(0, 0)] = raw_distribution[pairs[0]]
        expected[(2, 0)] = raw_distribution[pairs[1]]
        expected[(6, 59995)] = (raw_distribution[pairs[2]] +
                                raw_distribution[pairs[3]])

        shares = registry.get_dimension('shares')
        replies = registry.get_dimension('replies')

        table = models.DataTable(shares, replies)
        rendered = table.render(dataset.message_set.all(),
                                desired_secondary_bins=5)

        self.assertMultiDistributionsEqual(
            rendered, expected, ('shares', 'replies'), measure_key='value')
Example #3
0
    def __init__(self, primary_dimension, secondary_dimension=None):
        """
        Create a DataTable for one dimension, or for a pair of dimensions.

        Each dimension may be supplied either as a dimension object or as
        a string key; string keys are looked up with
        ``registry.get_dimension``. If the secondary dimension has a
        ``key`` equal to ``"groups"`` it is replaced by ``None``.

        :type primary_dimension: registry.models.CategoricalDimension
        :type secondary_dimension: registry.models.CategoricalDimension

        :return:
        """

        # Swap string keys for the registered dimension objects.
        if isinstance(primary_dimension, basestring):
            primary_dimension = registry.get_dimension(primary_dimension)

        # isinstance() is already False for None, so one check suffices.
        if isinstance(secondary_dimension, basestring):
            secondary_dimension = registry.get_dimension(secondary_dimension)

        # a dirty way: ignore the "groups" dimension when it comes second
        if hasattr(secondary_dimension, 'key'):
            if secondary_dimension.key == "groups":
                secondary_dimension = None

        self.mode = "default"
        self.primary_dimension = primary_dimension
        self.secondary_dimension = secondary_dimension
Example #4
0
    def test_render_single_quantitative_wide(self):
        """
        A single quantitative dimension renders correctly when its values
        are spread so widely that binning has to be applied.
        """

        values = [0, 2, 3, 4, 60000]
        raw_distribution = self.get_distribution(values)

        dataset = self.generate_messages_for_distribution(
            field_name='shared_count',
            distribution=raw_distribution,
        )

        # The four small values are expected to collapse into the 0 bin;
        # the outlier keeps a bin of its own.
        small_values, outlier = values[:4], values[4]
        expected = {
            0: sum(raw_distribution[v] for v in small_values),
            60000: raw_distribution[outlier],
        }

        shares = registry.get_dimension('shares')
        table = models.DataTable(shares)
        rendered = table.render(dataset.message_set.all(),
                                desired_primary_bins=5)

        self.assertDistributionsEqual(
            rendered, expected, level_key='shares', measure_key='value')
Example #5
0
    def setUp(self):
        """Create a dataset and a time filter, plus their serialized and deserialized forms."""
        time_dimension = dimensions.get_dimension('time')
        self.dimension = time_dimension
        self.dataset = corpus_models.Dataset.objects.create(
            name="test dataset", description='description')

        # A five-minute window starting now, in internal (object) form
        internal_filter = dict(
            dimension=time_dimension,
            min_time=now(),
            max_time=now() + timedelta(minutes=5),
        )
        serialized_filter = serializers.FilterSerializer(internal_filter).data

        # External form: the dataset id and the dimension key
        self.serialized_representation = dict(
            dataset=self.dataset.id,
            dimensions=[time_dimension.key],
            filters=[serialized_filter],
        )

        # Internal form: should lookup exactly the same dimension
        self.deserialized_representation = dict(
            dataset=self.dataset,
            dimensions=[time_dimension],
            filters=[internal_filter],
        )
Example #6
0
    def test_boolean_domain(self):
        """
        The domain of the boolean ``contains_url`` dimension has exactly
        two entries and matches ``dimension.domain``, even when the
        dataset contains no messages.
        """
        dataset = self.create_empty_dataset()

        dimension = registry.get_dimension("contains_url")
        result = list(dimension.get_domain(dataset.message_set.all()))

        # assertEqual replaces the deprecated assertEquals alias
        self.assertEqual(len(result), 2)
        self.assertEqual(result, dimension.domain)
Example #7
0
    def test_double_quantitative_narrow(self):
        """Two quantitative dimensions render as-is when no binning is needed."""
        pairs = [(0, 1), (2, 3), (3, 2), (4, 5), (6, 7)]
        expected = self.get_distribution(pairs)

        dataset = self.generate_messages_for_multi_distribution(
            ('shared_count', 'replied_to_count'), expected)

        shares = registry.get_dimension('shares')
        replies = registry.get_dimension('replies')

        table = models.DataTable(shares, replies)
        rendered = table.render(dataset.message_set.all())

        # The rendered table should reproduce the input distribution exactly
        self.assertMultiDistributionsEqual(
            rendered, expected, ('shares', 'replies'), measure_key='value')
Example #8
0
    def setUp(self):
        """Store the time dimension with its serialized and deserialized forms."""
        time_dimension = dimensions.get_dimension('time')
        self.dimension = time_dimension

        # Expected external form: key, name and description
        self.serialized_representation = dict(
            key=time_dimension.key,
            name=time_dimension.name,
            description=time_dimension.description,
        )

        # Deserializing should lookup exactly the same dimension
        self.deserialized_representation = time_dimension
Example #9
0
    def test_quantitative_domain(self):
        """
        The domain of the ``replies`` dimension for reply counts 1 and
        2001, requested with ``bins=10``, is a list of bin starts spaced
        200 apart from 0 to 2200.
        """
        reply_values = [1, 2001]
        distribution = self.get_distribution(reply_values)
        dataset = self.generate_messages_for_distribution(
            "replied_to_count", distribution)

        dimension = registry.get_dimension("replies")
        result = list(dimension.get_domain(dataset.message_set.all(), bins=10))

        expected = [0, 200, 400, 600, 800, 1000,
                    1200, 1400, 1600, 1800, 2000, 2200]
        # assertEqual replaces the deprecated assertEquals alias
        self.assertEqual(result, expected)
Example #10
0
    def setUp(self):
        """Prepare the time dimension and the two representations under test."""
        self.dimension = dimensions.get_dimension('time')

        # External representation: the dimension's key, name and description
        serialized = {}
        serialized['key'] = self.dimension.key
        serialized['name'] = self.dimension.name
        serialized['description'] = self.dimension.description
        self.serialized_representation = serialized

        # Internal representation: should lookup exactly the same dimension
        self.deserialized_representation = self.dimension
Example #11
0
    def generate_some_messages(self, dataset):
        """
        Add two messages to ``dataset`` and store three dimensions on ``self``.

        The first message has no hashtag and a ``shared_count`` of 0. The
        second mentions, and is linked to, a newly created "OurPriorities"
        hashtag and has a ``shared_count`` of 10. The 'time', 'hashtags'
        and 'shares' dimensions are then looked up from the registry.
        """
        message_time = "2015-02-02T01:19:02Z"

        # Message without any hashtag
        corpus_models.Message.objects.create(
            dataset=dataset,
            text="blah blah blah",
            time=message_time,
            shared_count=0,
        )

        # Message tagged with a single hashtag
        hashtag = corpus_models.Hashtag.objects.create(text="OurPriorities")
        tagged_text = "blah blah blah #%s" % hashtag.text
        tagged_message = corpus_models.Message.objects.create(
            dataset=dataset,
            text=tagged_text,
            time=message_time,
            shared_count=10,
        )
        tagged_message.hashtags.add(hashtag)

        self.dimension_time = registry.get_dimension('time')
        self.dimension_hashtags = registry.get_dimension('hashtags')
        self.dimension_shared = registry.get_dimension('shares')
Example #12
0
    def doDistributionTest(self, dimension_key, dataset, distribution,
                           **kwargs):
        """
        Render a one-dimensional DataTable and compare it to ``distribution``.

        :param dimension_key: registry key of the dimension; also used as
            the level key when comparing results
        :param dataset: object whose ``message_set`` supplies the messages
        :param distribution: the expected distribution
        :param kwargs: forwarded unchanged to ``DataTable.render``
        """
        table = models.DataTable(registry.get_dimension(dimension_key))

        # Calculate the distribution over every message in the dataset
        rendered = table.render(dataset.message_set.all(), **kwargs)

        self.assertDistributionsEqual(
            rendered, distribution,
            level_key=dimension_key, measure_key='value')
Example #13
0
    def setUp(self):
        """
        Prepare a categorical (sentiment) filter in internal and external form.

        The internal form references the dimension object; the external
        form references the dimension by its key. Both share the same
        ``levels`` list.
        """
        # Single assignment; the original repeated the target
        # (``self.dimension = self.dimension = ...``) to no effect.
        self.dimension = dimensions.get_dimension('sentiment')

        self.internal_filter = {
            'dimension': self.dimension,
            'levels': ['a', 'b', 'c'],
        }

        self.external_filter = {
            'dimension': self.dimension.key,
            'levels': self.internal_filter['levels'],
        }
Example #14
0
    def setUp(self):
        """
        Build matching internal and external sentiment filters.

        ``internal_filter`` holds the dimension object, while
        ``external_filter`` holds the dimension's key; the ``levels``
        list is the same object in both.
        """
        # The original wrote ``self.dimension = self.dimension = ...``;
        # the repeated target was redundant, so it is assigned once here.
        self.dimension = dimensions.get_dimension('sentiment')

        self.internal_filter = {
            'dimension': self.dimension,
            'levels': ['a', 'b', 'c'],
        }

        self.external_filter = {
            'dimension': self.dimension.key,
            'levels': self.internal_filter['levels'],
        }
Example #15
0
    def test_render_double_categorical(self):
        """A datatable can be rendered over two categorical dimensions."""

        field_names = ('contains_url', 'contains_mention')

        # Every combination of the two boolean fields
        combinations = [(True, True), (True, False),
                        (False, True), (False, False)]
        expected = self.get_distribution(combinations)

        dataset = self.generate_messages_for_multi_distribution(
            field_names=field_names,
            distribution=expected,
        )

        first_name, second_name = field_names[0], field_names[1]
        first = registry.get_dimension(first_name)
        second = registry.get_dimension(second_name)

        table = models.DataTable(first, second)
        rendered = table.render(dataset.message_set.all())

        self.assertMultiDistributionsEqual(
            rendered, expected, field_names, measure_key='value')
Example #16
0
    def test_render_two_related_categorical(self):
        """A datatable can be rendered over two related categorical dimensions."""

        # Create some language labels and five authors
        language_ids = self.create_test_languages()
        usernames = ['username_%d' % d for d in xrange(5)]
        dataset = self.create_authors_with_values('username', usernames)
        author_ids = dataset.person_set.values_list('id', flat=True).distinct()

        # Pair every language with every author, but leave out pairs where
        # both ids are even so that the distribution has gaps.
        value_pairs = [
            (lang, author)
            for lang in language_ids
            for author in author_ids
            if not (lang % 2 == 0 and author % 2 == 0)
        ]

        # Distribute some messages over those pairs
        id_distribution = self.get_distribution(value_pairs)
        self.generate_messages_for_multi_distribution(
            ('language_id', 'sender_id'), id_distribution, dataset=dataset)

        # Convert the id-keyed distribution into the expected one
        related_models = (corpus_models.Language, corpus_models.Person)
        expected = self.convert_id_distribution_to_related(
            id_distribution, related_models, ('name', 'username'))

        language = registry.get_dimension('language')
        sender = registry.get_dimension('sender')

        table = models.DataTable(language, sender)
        rendered = table.render(dataset.message_set.all())

        self.assertMultiDistributionsEqual(
            rendered, expected, ('language', 'sender'), measure_key='value')
Example #17
0
    def setUp(self):
        """Prepare a five-minute time filter in internal and external form."""
        self.dimension = dimensions.get_dimension('time')

        # Internal form: dimension object and datetime bounds
        window_start = now()
        window_end = now() + timedelta(minutes=5)
        self.internal_filter = dict(
            dimension=self.dimension,
            min_time=window_start,
            max_time=window_end,
        )

        # External form: dimension key and API-formatted bounds
        self.external_filter = dict(
            dimension=self.dimension.key,
            min_time=api_time_format(window_start),
            max_time=api_time_format(window_end),
        )
Example #18
0
    def setUp(self):
        """Prepare a min/max filter on ``replies`` in internal and external form."""
        replies = dimensions.get_dimension('replies')
        self.dimension = replies

        lower, upper = 5, 10

        # Internal form refers to the dimension object itself
        self.internal_filter = dict(dimension=replies, min=lower, max=upper)

        # External form refers to the dimension by key
        self.external_filter = dict(dimension=replies.key, min=lower, max=upper)
Example #19
0
    def setUp(self):
        """Build matching internal and external range filters for ``replies``."""
        self.dimension = dimensions.get_dimension('replies')

        # Both forms share the same numeric bounds
        bounds = {'min': 5, 'max': 10}

        # Internal form: holds the dimension object
        self.internal_filter = dict(bounds, dimension=self.dimension)

        # External form: holds the dimension key
        self.external_filter = dict(bounds, dimension=self.dimension.key)
Example #20
0
    def setUp(self):
        """Build matching internal and external time-range filters."""
        time_dimension = dimensions.get_dimension('time')
        self.dimension = time_dimension

        # Internal form: the bounds are datetimes five minutes apart
        internal = {}
        internal['dimension'] = time_dimension
        internal['min_time'] = now()
        internal['max_time'] = now() + timedelta(minutes=5)
        self.internal_filter = internal

        # External form: the same bounds passed through api_time_format
        external = {}
        external['dimension'] = time_dimension.key
        external['min_time'] = api_time_format(internal['min_time'])
        external['max_time'] = api_time_format(internal['max_time'])
        self.external_filter = external
Example #21
0
    def test_excludes_all_data(self):
        """
        If the filters exclude all the data, an empty result set should be produced.
        """

        field_names = ('shared_count', 'replied_to_count')
        values = [(1, 1), (1, 4), (1, 3), (2, 1), (2, 2)]
        bi_distribution = self.get_distribution(values)

        dataset = self.generate_messages_for_multi_distribution(
            field_names, bi_distribution)

        d1 = registry.get_dimension('shares')
        d2 = registry.get_dimension('replies')

        datatable = models.DataTable(d1, d2)

        # None of the generated value pairs satisfies both ranges at once
        filtered = dataset.message_set.filter(
            shared_count__range=(2, 5),
            replied_to_count__range=(3, 5),
        )

        result = datatable.render(filtered)
        # assertEqual replaces the deprecated assertEquals alias
        self.assertEqual(result.count(), 0)
Example #22
0
    def test_time_domain(self):
        """
        For two messages one day apart and ``bins=24``, the time domain
        has 26 entries: it starts at the first message's hour, reaches
        the second message's hour at index 24, and ends one hour later.
        """
        base_time = tz.datetime(2012, 5, 2, 20, 10, 2, 0)
        if settings.USE_TZ:
            base_time = base_time.replace(tzinfo=tz.utc)

        time_values = [base_time, base_time + timedelta(days=1)]
        distribution = self.get_distribution(time_values)
        dataset = self.generate_messages_for_distribution("time", distribution)
        dimension = registry.get_dimension("time")
        result = dimension.get_domain(dataset.message_set.all(), bins=24)

        # Both endpoints truncated to the start of their hour
        first_hour = base_time.replace(minute=0, second=0)
        last_hour = time_values[1].replace(minute=0, second=0)

        # assertEqual replaces the deprecated assertEquals alias
        self.assertEqual(len(result), 26)
        self.assertEqual(result[0], first_hour)
        self.assertEqual(result[24], last_hour)
        self.assertEqual(result[25], last_hour + timedelta(hours=1))
Example #23
0
    def test_time_domain_just_over(self):
        """If the domain is a little over a convenient bin size, rounds down"""

        # Four minutes and 10 seconds is a perfect 50 bins of 5 seconds
        # so this is 4 seconds extra.
        start_time = dateparse.parse_datetime("2014-03-21T00:00:00Z")
        end_time = dateparse.parse_datetime("2014-03-21T00:04:14Z")

        time_values = [start_time, end_time]
        distribution = self.get_distribution(time_values)

        dataset = self.generate_messages_for_distribution("time", distribution)

        dimension = registry.get_dimension("time")
        result = dimension.get_domain(dataset.message_set.all(), bins=50)

        # Should have decided to use 5 second increments
        # (assertEqual replaces the deprecated assertEquals alias)
        self.assertEqual(result[0], start_time)
        self.assertEqual(result[1], start_time + timedelta(seconds=5))
Example #24
0
    def test_categorical_domain(self):
        """
        Checks that the domain of a categorical model field,
        in this case Sentiment, can be calculated correctly.
        """

        # Split the sentiment choices into values and labels;
        # only the values are needed here.
        sentiment_values, _labels = zip(*corpus_models.Message.SENTIMENT_CHOICES)

        sentiment_distribution = self.get_distribution(sentiment_values)

        dataset = self.generate_messages_for_distribution(
            field_name="sentiment", distribution=sentiment_distribution)

        dimension = registry.get_dimension("sentiment")

        # Calculate the domain over all of the dataset's messages
        result = dimension.get_domain(dataset.message_set.all())

        # in order of CHOICES
        # (assertEqual replaces the deprecated assertEquals alias)
        self.assertEqual(result, sentiment_values)
Example #25
0
    def test_render_single_categorical(self):
        """A datatable can be rendered over one categorical dimension."""

        field = 'contains_url'
        expected = self.get_distribution([True, False])

        dataset = self.generate_messages_for_distribution(
            field_name='contains_url',
            distribution=expected,
        )

        table = models.DataTable(registry.get_dimension(field))
        rendered = table.render(dataset.message_set.all())

        # The rendered levels should reproduce the input distribution
        self.assertDistributionsEqual(
            rendered, expected, level_key=field, measure_key='value')
Example #26
0
    def test_render_single_quantitative_narrow(self):
        """
        A single quantitative dimension renders correctly when the values
        are close enough together that no binning is needed.
        """

        expected = self.get_distribution([0, 2, 3, 4, 6])

        dataset = self.generate_messages_for_distribution(
            field_name='shared_count',
            distribution=expected,
        )

        shares = registry.get_dimension('shares')
        table = models.DataTable(shares)
        rendered = table.render(dataset.message_set.all())

        # Without binning the output should equal the input distribution
        self.assertDistributionsEqual(
            rendered, expected, level_key='shares', measure_key='value')
Example #27
0
    def test_render_single_related_categorical(self):
        """A datatable can be rendered over one related categorical dimension."""

        # Create some language labels and a distribution keyed by their ids
        language_ids = self.create_test_languages()
        id_distribution = self.get_distribution(language_ids)

        # The expected result is keyed by the Language 'name' field instead
        expected = self.recover_related_field_distribution(
            id_distribution, corpus_models.Language, 'name')

        dataset = self.generate_messages_for_distribution(
            field_name='language_id',
            distribution=id_distribution,
        )

        language = registry.get_dimension('language')
        table = models.DataTable(language)
        rendered = table.render(dataset.message_set.all())

        self.assertDistributionsEqual(
            rendered, expected, level_key='language', measure_key='value')
Example #28
0
    def setUp(self):
        """Set up a dataset, a time filter, and both representations of them."""
        self.dimension = dimensions.get_dimension('time')
        self.dataset = corpus_models.Dataset.objects.create(
            name="test dataset",
            description='description',
        )

        # Internal filter: a five-minute window on the time dimension
        internal_filter = {}
        internal_filter['dimension'] = self.dimension
        internal_filter['min_time'] = now()
        internal_filter['max_time'] = now() + timedelta(minutes=5)

        serializer = serializers.FilterSerializer(internal_filter)
        serialized_filter = serializer.data

        # Serialized form uses the dataset id and the dimension key
        self.serialized_representation = {
            'dataset': self.dataset.id,
            'dimensions': [self.dimension.key],
            'filters': [serialized_filter],
        }

        # Should lookup exactly the same dimension
        self.deserialized_representation = {
            'dataset': self.dataset,
            'dimensions': [self.dimension],
            'filters': [internal_filter],
        }
Example #29
0
    def test_related_categorical_domain(self):
        """
        Checks that the domain of a categorical related model field,
        in this case Language, can be calculated correctly.
        """

        # Create some language labels
        languages = self.create_test_languages(model=True)
        language_ids = [lang.id for lang in languages]
        language_names = [lang.name for lang in languages]
        dimension = registry.get_dimension("language")

        # Generate a distribution where messages increase with each lang id
        language_distribution = self.get_distribution(language_ids)
        dataset = self.generate_messages_for_distribution(
            field_name="language_id", distribution=language_distribution)
        result = dimension.get_domain(dataset.message_set.all())
        # results are in descending frequency order
        # (assertEqual replaces the deprecated assertEquals alias)
        self.assertEqual(result, list(reversed(language_names)))

        # Generate another dataset with the distribution going the other way
        language_distribution = self.get_distribution(reversed(language_ids))
        dataset = self.generate_messages_for_distribution(
            field_name="language_id", distribution=language_distribution)
        result = dimension.get_domain(dataset.message_set.all())
        self.assertEqual(result, language_names)
Example #30
0
 def run_time_bin_test(self, delta, desired_bins, expected_bin_size):
     """
     Run a generic time bin test.

     Asserts that the time dimension's ``_get_bin_size`` returns
     ``expected_bin_size`` for the span from ``self.base_time`` to
     ``self.base_time + delta`` when ``desired_bins`` bins are requested.
     """
     t0 = self.base_time
     t1 = t0 + delta
     dimension = registry.get_dimension('time')
     actual_bin_size = dimension._get_bin_size(t0, t1, desired_bins)
     # assertEqual replaces the deprecated assertEquals alias
     self.assertEqual(actual_bin_size, expected_bin_size)
Example #31
0
 def to_internal_value(self, data):
     """Return the registered dimension named by ``data['key']``."""
     dimension_key = data['key']
     return registry.get_dimension(dimension_key)
Example #32
0
 def test_registry_rejects_unknown_keys(self):
     """Trying to get a dimension for a nonexistent key raises an exception"""
     unknown_key = 'made_up_dimension_key'
     self.assertRaises(KeyError, registry.get_dimension, unknown_key)
Example #33
0
 def test_registry_contains_dimension(self):
     """The registry should return a TimeDimension for the 'time' key"""
     time_dimension = registry.get_dimension('time')

     # Check for presence first, then for the concrete type
     self.assertIsNotNone(time_dimension)
     self.assertIsInstance(time_dimension, models.TimeDimension)
Example #34
0
 def test_registry_rejects_unknown_keys(self):
     """Looking up a key that was never registered raises KeyError"""
     lookup = registry.get_dimension
     self.assertRaises(KeyError, lookup, 'made_up_dimension_key')
Example #35
0
 def test_registry_contains_dimension(self):
     """Looking up 'time' in the registry yields a TimeDimension instance"""
     found = registry.get_dimension('time')
     self.assertIsNotNone(found)

     expected_type = models.TimeDimension
     self.assertIsInstance(found, expected_type)