Example #1
0
    def get_advanced_search_results(self, keywords_text, include_types):
        """
        Search this object's messages with a comma-separated keyword query.

        ``keywords_text`` is split on commas into clauses, and every clause
        is split on whitespace into words.  The words of a clause are turned
        into filter objects by ``utils.get_word_objs`` (looked up against
        ``self.tweet_words``).

        A plain clause selects the messages that pass *all* of its filter
        objects; the selections of all plain clauses are unioned.  A clause
        that starts with ``NOT `` contributes its filter objects to an
        exclusion list instead, and every message matching any of them is
        removed from the union.

        ``include_types`` is a collection of objects with a ``name``
        attribute; when it is non-empty only messages whose ``type__name``
        is one of those names are searched.

        Returns a distinct queryset of messages.
        """
        word_queryset = self.tweet_words.all()
        message_queryset = self.message_set.all()
        if include_types:
            # A concrete list (not a lazy ``map`` object) so the helper can
            # iterate it more than once on Python 3.
            type_names = [message_type.name for message_type in include_types]
            message_queryset = message_queryset.filter(
                utils.levels_or('type__name', type_names))

        def word_filters(text):
            # ``split()`` drops the empty tokens that ``split(' ')`` would
            # produce for repeated or trailing spaces.
            words = text.split()
            if not words:
                return []
            return utils.get_word_objs(
                queryset=word_queryset,
                text_field_name='original_text',
                related_field_name="tweet_words__id",
                words=words)

        matches = self.message_set.none()
        excluded_filters = []
        for raw_clause in keywords_text.split(','):
            # Leading whitespace after a comma ("a, NOT b") must not hide the
            # NOT prefix.
            clause = raw_clause.lstrip()
            if clause.startswith("NOT "):
                # TODO: make this a real AND across the words of the clause
                excluded_filters.extend(word_filters(clause[4:]))
                continue

            clause_filters = word_filters(clause)
            if not clause_filters:
                continue

            # Chained filter() calls AND the filter objects of one clause.
            clause_matches = message_queryset
            for word_filter in clause_filters:
                clause_matches = clause_matches.filter(word_filter)

            # Clauses are OR-ed together.
            matches |= clause_matches

        for word_filter in excluded_filters:
            matches = matches.exclude(word_filter)

        return matches.distinct()
Example #2
0
    def get_advanced_search_results(self, keywords_text, include_types):
        """
        Run a comma-separated keyword query against this object's messages.

        The query text is cut into clauses at each comma and each clause is
        cut into words at whitespace.  ``utils.get_word_objs`` converts the
        words of a clause into filter objects, using ``self.tweet_words`` as
        the word source.

        * A plain clause keeps the messages that satisfy every one of its
          filter objects.  The results of the plain clauses are unioned.
        * A clause beginning with ``NOT `` adds its filter objects to an
          exclusion list; messages matching any excluded filter object are
          dropped from the union.

        When ``include_types`` is non-empty, only messages whose
        ``type__name`` equals the ``name`` of one of the given objects are
        considered.

        Returns a distinct queryset of messages.
        """
        word_queryset = self.tweet_words.all()
        message_queryset = self.message_set.all()
        if include_types:
            # Build a real list: on Python 3 ``map`` would hand the helper a
            # single-use iterator.
            type_names = [message_type.name for message_type in include_types]
            message_queryset = message_queryset.filter(utils.levels_or('type__name', type_names))

        def word_filters(text):
            # Whitespace-split so that doubled or trailing spaces do not
            # produce empty "words".
            words = text.split()
            if not words:
                return []
            return utils.get_word_objs(queryset=word_queryset, text_field_name='original_text', related_field_name="tweet_words__id", words=words)

        matches = self.message_set.none()
        excluded_filters = []
        for raw_clause in keywords_text.split(','):
            # "a, NOT b" leaves a leading space on the second clause; strip it
            # so the NOT prefix is still recognised.
            clause = raw_clause.lstrip()
            if clause.startswith("NOT "):
                # TODO: make this a real AND across the words of the clause
                excluded_filters.extend(word_filters(clause[4:]))
                continue

            clause_filters = word_filters(clause)
            if not clause_filters:
                continue

            # Successive filter() calls combine the clause's filters with AND.
            clause_matches = message_queryset
            for word_filter in clause_filters:
                clause_matches = clause_matches.filter(word_filter)

            # Separate clauses are combined with OR.
            matches |= clause_matches

        for word_filter in excluded_filters:
            matches = matches.exclude(word_filter)

        return matches.distinct()
Example #3
0
 def all_messages(self):
     """
     Return the dataset's messages that match any of the related features.

     The message set comes from ``self.dataset.get_message_set()`` and is
     filtered on ``tweet_words__id`` against the ids of
     ``self.related_features``.
     """
     queryset = self.dataset.get_message_set()
     # A concrete list rather than ``map(...)``: on Python 3 ``map`` returns
     # a one-shot iterator, which is fragile to hand to a helper.
     feature_ids = [feature.id for feature in self.related_features]
     return queryset.filter(utils.levels_or("tweet_words__id", feature_ids))
Example #4
0
    def generate(self, dataset, filters=None, exclude=None, page_size=100, page=None, search_key=None, groups=None):
        """
        Generate a complete data group table response.

        This includes 'table', which provides the non-zero
        message frequency for each combination of primary and secondary dimension values,
        respecting the filters.

        It also includes 'domains', which provides, for both
        primary and secondary dimensions, the levels of the
        dimension irrespective of filters (except on those actual dimensions).

        Arguments:

        dataset -- the dataset whose ``message_set`` is tabulated.  Messages
            without a time are dropped, and when the dataset has both a
            start and an end time only messages within that span (widened by
            10% on each side) are kept.
        filters, exclude -- optional lists of dicts.  Each dict has a
            'dimension' entry and is passed as keyword arguments to that
            dimension's ``filter()`` / ``exclude()``.
        page, page_size, search_key -- paging of the primary domain.  Paging
            only happens when ``page`` is given, there is no secondary
            dimension and no filter on the primary dimension.
        groups -- optional list of group ids.  When given, one table is
            computed per group instead of one for the whole dataset.

        Returns a dict with 'domains' and 'domain_labels' plus:

        * 'table' when ``groups`` is None;
        * 'table' when ``groups`` is given and there is no secondary
          dimension; each row additionally carries a 'groups' entry and the
          groups are added to 'domains' / 'domain_labels';
        * 'tables' (a list of dicts with 'group_id', 'group_name', 'table')
          when ``groups`` is given and there is a secondary dimension.

        'max_page' is added when paging happened.  Returns None when paging
        was requested but the requested page has no levels.
        """
        if groups is None:
            return self._generate_for_dataset(dataset, filters, exclude,
                                              page_size, page, search_key)
        return self._generate_for_groups(dataset, filters, exclude,
                                         page_size, page, search_key, groups)

    def _restrict_to_dataset_window(self, queryset, dataset):
        """Drop messages without a time and those outside the dataset's padded time span."""
        queryset = queryset.exclude(time__isnull=True)
        if dataset.start_time and dataset.end_time:
            span = dataset.end_time - dataset.start_time
            # Allow 10% of the span as slack on either side.
            margin = timedelta(seconds=span.total_seconds() * 0.1)
            queryset = queryset.filter(time__gte=dataset.start_time - margin,
                                       time__lte=dataset.end_time + margin)
        return queryset

    def _apply_filters_and_excludes(self, queryset, filters, exclude):
        """
        Apply every filter and exclude to ``queryset``.

        Returns ``(queryset, primary_filter, secondary_filter,
        primary_exclude, secondary_exclude)`` where the last four are the
        (last) filter/exclude dicts that target the primary/secondary
        dimension, or None.
        """
        primary_filter = None
        secondary_filter = None
        if filters is not None:
            for dimension_filter in filters:
                dimension = dimension_filter['dimension']
                queryset = dimension.filter(queryset, **dimension_filter)

                if dimension == self.primary_dimension:
                    primary_filter = dimension_filter
                if dimension == self.secondary_dimension:
                    secondary_filter = dimension_filter

        primary_exclude = None
        secondary_exclude = None
        if exclude is not None:
            for exclude_filter in exclude:
                dimension = exclude_filter['dimension']
                queryset = dimension.exclude(queryset, **exclude_filter)

                if dimension == self.primary_dimension:
                    primary_exclude = exclude_filter
                if dimension == self.secondary_dimension:
                    secondary_exclude = exclude_filter

        return (queryset, primary_filter, secondary_filter,
                primary_exclude, secondary_exclude)

    def _paginate_domain(self, domain, labels, page, page_size, search_key):
        """
        Cut one page out of ``domain`` (and ``labels``, when present).

        Returns ``(domain, labels, max_page)``, or None when no level is left
        for the requested page.
        """
        if search_key is not None:
            domain, labels = self.filter_search_key(domain, labels, search_key)
        start = (page - 1) * page_size
        end = min(start + page_size, len(domain))
        # Floor division keeps max_page an int on Python 3 as well.
        # NOTE(review): this reports one page too many when len(domain) is an
        # exact multiple of page_size; kept as-is for existing callers.
        max_page = (len(domain) // page_size) + 1

        # no level left
        if len(domain) == 0 or start > len(domain):
            return None

        domain = domain[start:end]
        if labels is not None:
            labels = labels[start:end]
        return domain, labels, max_page

    def _exceeds_categorical_limit(self, dimension, domain):
        """True when 'others' handling is on and a categorical domain has more than MAX_CATEGORICAL_LEVELS levels."""
        return (self.mode in ('enable_others', 'omit_others')
                and dimension.is_categorical()
                and len(domain) > MAX_CATEGORICAL_LEVELS)

    def _generate_for_dataset(self, dataset, filters, exclude, page_size, page, search_key):
        """Build the response for the whole dataset (no groups)."""
        queryset = self._restrict_to_dataset_window(dataset.message_set.all(), dataset)

        # Domains are computed irrespective of the filters.
        unfiltered_queryset = queryset

        (queryset, primary_filter, secondary_filter,
         primary_exclude, secondary_exclude) = self._apply_filters_and_excludes(queryset, filters, exclude)

        domains = {}
        domain_labels = {}
        max_page = None
        queryset_for_others = None

        # flag is true if the dimension is categorical and has more than MAX_CATEGORICAL_LEVELS levels
        primary_flag = False
        secondary_flag = False

        # Include the domains for primary and (secondary) dimensions
        domain, labels = self.domain(self.primary_dimension,
                                     unfiltered_queryset,
                                     primary_filter, primary_exclude)

        # paging the first dimension, this is for the filter distribution
        if primary_filter is None and self.secondary_dimension is None and page is not None:
            paged = self._paginate_domain(domain, labels, page, page_size, search_key)
            if paged is None:
                return None
            domain, labels, max_page = paged

            queryset = queryset.filter(utils.levels_or(self.primary_dimension.field_name, domain))
        elif self._exceeds_categorical_limit(self.primary_dimension, domain):
            primary_flag = True
            domain = domain[:MAX_CATEGORICAL_LEVELS]

            # Keep the untruncated messages so the "others" can be counted.
            queryset_for_others = queryset
            queryset = queryset.filter(utils.levels_or(self.primary_dimension.field_name, domain))

            if labels is not None:
                labels = labels[:MAX_CATEGORICAL_LEVELS]

        domains[self.primary_dimension.key] = domain
        if labels is not None:
            domain_labels[self.primary_dimension.key] = labels

        if self.secondary_dimension:
            domain, labels = self.domain(self.secondary_dimension,
                                         unfiltered_queryset,
                                         secondary_filter, secondary_exclude)

            if self._exceeds_categorical_limit(self.secondary_dimension, domain):
                secondary_flag = True
                domain = domain[:MAX_CATEGORICAL_LEVELS]

                if queryset_for_others is None:
                    queryset_for_others = queryset
                queryset = queryset.filter(utils.levels_or(self.secondary_dimension.field_name, domain))

                if labels is not None:
                    labels = labels[:MAX_CATEGORICAL_LEVELS]

            domains[self.secondary_dimension.key] = domain
            if labels is not None:
                domain_labels[self.secondary_dimension.key] = labels

        # Render a table
        table = self.render(queryset)

        if self.mode == "enable_others" and queryset_for_others is not None:
            # adding others to the results
            table_for_others = self.render_others(queryset_for_others, domains, primary_flag, secondary_flag)
            table = list(table)
            if table_for_others:
                table.extend(table_for_others)

        results = {
            'table': table,
            'domains': domains,
            'domain_labels': domain_labels
        }
        if max_page is not None:
            results['max_page'] = max_page

        return results

    def _generate_for_groups(self, dataset, filters, exclude, page_size, page, search_key, groups):
        """Build the response with one table per group id in ``groups``."""
        domains = {}
        domain_labels = {}
        max_page = None

        # flag is true if the dimension is categorical and has more than MAX_CATEGORICAL_LEVELS levels
        primary_flag = False
        secondary_flag = False

        # The filtered dataset as a whole is handed to groups_domain().
        queryset_all = self._restrict_to_dataset_window(dataset.message_set.all(), dataset)
        queryset_all, primary_filter = self._apply_filters_and_excludes(queryset_all, filters, exclude)[:2]

        group_querysets = []
        group_labels = []
        for group in groups:
            group_obj = groups_models.Group.objects.get(id=group)
            if group_obj.order > 0:
                group_labels.append("#%d %s"%(group_obj.order, group_obj.name))
            else:
                group_labels.append("%s"%(group_obj.name))

            # Each group's messages get the same time window and filters.
            group_queryset = self._restrict_to_dataset_window(group_obj.messages, dataset)
            group_queryset = self._apply_filters_and_excludes(group_queryset, filters, exclude)[0]
            group_querysets.append(group_queryset)

        # deal with union distribution
        # This is due to union of queries in django does not work...

        # Include the domains for primary and (secondary) dimensions
        domain, labels = self.groups_domain(self.primary_dimension,
                                            queryset_all, group_querysets)

        # paging the first dimension, this is for the filter distribution
        if primary_filter is None and self.secondary_dimension is None and page is not None:
            paged = self._paginate_domain(domain, labels, page, page_size, search_key)
            if paged is None:
                return None
            domain, labels, max_page = paged
        elif self._exceeds_categorical_limit(self.primary_dimension, domain):
            primary_flag = True
            domain = domain[:MAX_CATEGORICAL_LEVELS]

            if labels is not None:
                labels = labels[:MAX_CATEGORICAL_LEVELS]

        domains[self.primary_dimension.key] = domain
        if labels is not None:
            domain_labels[self.primary_dimension.key] = labels

        if self.secondary_dimension:
            domain, labels = self.groups_domain(self.secondary_dimension,
                                                queryset_all, group_querysets)

            if self._exceeds_categorical_limit(self.secondary_dimension, domain):
                secondary_flag = True
                domain = domain[:MAX_CATEGORICAL_LEVELS]

                if labels is not None:
                    labels = labels[:MAX_CATEGORICAL_LEVELS]

            domains[self.secondary_dimension.key] = domain
            if labels is not None:
                domain_labels[self.secondary_dimension.key] = labels

        others_mode = self.mode in ('enable_others', 'omit_others')

        group_tables = []
        for group_queryset in group_querysets:
            # Unrestricted copy, used to count the levels folded into "others".
            queryset_for_others = group_queryset

            queryset = group_queryset
            if others_mode and self.primary_dimension.is_categorical():
                queryset = queryset.filter(utils.levels_or(self.primary_dimension.field_name, domains[self.primary_dimension.key]))
            if self.secondary_dimension:
                if others_mode and self.secondary_dimension.is_categorical():
                    queryset = queryset.filter(utils.levels_or(self.secondary_dimension.field_name, domains[self.secondary_dimension.key]))

            # Render a table
            if self.primary_dimension.key == "words":
                table = group_messages_by_words_with_raw_query(utils.quote(str(queryset.query)), fetchall_table)
            else:
                table = self.render(queryset)

            if self.mode == "enable_others":
                # adding others to the results
                table_for_others = self.render_others(queryset_for_others, domains, primary_flag, secondary_flag)
                table = list(table)
                # render_others() returns None when no dimension was
                # truncated; extending with None would raise TypeError.
                if table_for_others:
                    table.extend(table_for_others)

            group_tables.append(table)

        if self.secondary_dimension is None:
            # One flat table; each row is tagged with the id of its group.
            final_table = []
            for group_id, group_table in zip(groups, group_tables):
                for item in group_table:
                    item['groups'] = group_id
                final_table.extend(group_table)

            domains['groups'] = groups
            domain_labels['groups'] = group_labels

            results = {
                'table': final_table,
                'domains': domains,
                'domain_labels': domain_labels
            }
        else:
            # One separate two-dimensional table per group.
            tables = []
            for group_id, group_label, group_table in zip(groups, group_labels, group_tables):
                tables.append({
                    'group_id': group_id,
                    'group_name': group_label,
                    'table': group_table
                })
            results = {
                'tables': tables,
                'domains': domains,
                'domain_labels': domain_labels
            }

        if max_page is not None:
            results['max_page'] = max_page

        return results
Example #5
0
    def render_others(self, queryset, domains, primary_flag, secondary_flag, desired_primary_bins=None, desired_secondary_bins=None):
        """
        Given a set of messages (already filtered as necessary),
        calculate the rows for the levels that were cut from a truncated
        categorical dimension, reported under an "Other <dimension name>" level.

        ``domains`` maps each dimension key to the list of levels that were
        kept.  ``primary_flag`` / ``secondary_flag`` say which dimension was
        truncated.  For every truncated dimension the "Other ..." label is
        appended to its list in ``domains`` (in place, at most once, so the
        method can be called repeatedly with the same ``domains``).

        Optionally, a number of primary and secondary bins may be given;
        they are passed to ``group_by()`` of the quantitative dimension.

        The result is a list of dictionaries. Each
        dictionary contains a key for each dimension
        and a value key for the count.  None is returned when neither flag
        is set or when no case below applies.
        """

        # check if any of the dimensions is categorical
        if not primary_flag and not secondary_flag:
            return None

        primary = self.primary_dimension
        secondary = self.secondary_dimension

        def other_level(dimension, truncated):
            # Returns the "Other ..." label of the dimension and its real
            # levels.  The synthetic label is registered in ``domains`` only
            # once and is never part of the levels used for querying.
            label = u'Other ' + dimension.name
            levels = domains[dimension.key]
            real_levels = [level for level in levels if level != label]
            if truncated and label not in levels:
                levels.append(label)
            return label, real_levels

        def counts_tagged(rows, group_dimension, tag_key, tag, **group_by_options):
            # Count ``rows`` per level of ``group_dimension`` and stamp every
            # result row with the "Other ..." label of the other dimension.
            rows = group_dimension.group_by(rows,
                                            grouping_key=group_dimension.key,
                                            **group_by_options)
            rows = rows.annotate(value=models.Count('id'))
            results = list(rows)
            for row in results:
                row[tag_key] = tag
            return results

        if not secondary and primary.is_categorical() and primary_flag:
            # If there is only one dimension, a single count is enough.
            other_primary, primary_levels = other_level(primary, True)
            queryset = queryset.exclude(utils.levels_or(primary.field_name, primary_levels))

            return [{primary.key: other_primary, 'value': queryset.count()}]

        elif secondary:

            # both dimensions are categorical
            if primary.is_categorical() and secondary.is_categorical():
                other_primary, primary_levels = other_level(primary, primary_flag)
                other_secondary, secondary_levels = other_level(secondary, secondary_flag)
                others_results = []

                # primary others x secondary others
                if primary_flag and secondary_flag:
                    both_others = queryset.exclude(utils.levels_or(primary.field_name, primary_levels))
                    both_others = both_others.exclude(utils.levels_or(secondary.field_name, secondary_levels))

                    others_results.append({primary.key: other_primary,
                                           secondary.key: other_secondary,
                                           'value': both_others.count()})

                # primary top ones x secondary others
                if secondary_flag:
                    rows = queryset.filter(utils.levels_or(primary.field_name, primary_levels))
                    rows = rows.exclude(utils.levels_or(secondary.field_name, secondary_levels))
                    others_results.extend(
                        counts_tagged(rows, primary, secondary.key, other_secondary))

                # primary others x secondary top ones
                if primary_flag:
                    rows = queryset.exclude(utils.levels_or(primary.field_name, primary_levels))
                    rows = rows.filter(utils.levels_or(secondary.field_name, secondary_levels))
                    others_results.extend(
                        counts_tagged(rows, secondary, primary.key, other_primary))

                return others_results

            # primary categorical and secondary quantitative
            elif primary.is_categorical() and primary_flag and not secondary.is_categorical():
                other_primary, primary_levels = other_level(primary, True)
                rows = queryset.exclude(utils.levels_or(primary.field_name, primary_levels))
                return counts_tagged(rows, secondary, primary.key, other_primary,
                                     bins=desired_secondary_bins)

            # primary quantitative and secondary categorical
            elif not primary.is_categorical() and secondary.is_categorical() and secondary_flag:
                other_secondary, secondary_levels = other_level(secondary, True)
                rows = queryset.exclude(utils.levels_or(secondary.field_name, secondary_levels))
                return counts_tagged(rows, primary, secondary.key, other_secondary,
                                     bins=desired_primary_bins)

        # No truncated categorical dimension matched any case above.
        return None
Example #6
0
    def generate(self,
                 dataset,
                 filters=None,
                 exclude=None,
                 page_size=100,
                 page=None,
                 search_key=None,
                 groups=None):
        """
        Generate a complete data group table response.

        This includes 'table', which provides the non-zero
        message frequency for each combination of primary and secondary dimension values,
        respecting the filters.

        It also includes 'domains', which provides, for both
        primary and secondary dimensions, the levels of the
        dimension irrespective of filters (except on those actual dimensions).
        """

        if (groups is None):
            queryset = dataset.message_set.all()

            # Filter out null time
            queryset = queryset.exclude(time__isnull=True)
            if dataset.start_time and dataset.end_time:
                range = dataset.end_time - dataset.start_time
                buffer = timedelta(seconds=range.total_seconds() * 0.1)
                queryset = queryset.filter(time__gte=dataset.start_time -
                                           buffer,
                                           time__lte=dataset.end_time + buffer)

            unfiltered_queryset = queryset

            # Filter the data (look for filters on the primary/secondary dimensions at the same time
            primary_filter = None
            secondary_filter = None
            if filters is not None:
                for filter in filters:
                    dimension = filter['dimension']
                    queryset = dimension.filter(queryset, **filter)

                    if dimension == self.primary_dimension:
                        primary_filter = filter
                    if dimension == self.secondary_dimension:
                        secondary_filter = filter

            primary_exclude = None
            secondary_exclude = None
            if exclude is not None:
                for exclude_filter in exclude:
                    dimension = exclude_filter['dimension']
                    queryset = dimension.exclude(queryset, **exclude_filter)

                    if dimension == self.primary_dimension:
                        primary_exclude = exclude_filter
                    if dimension == self.secondary_dimension:
                        secondary_exclude = exclude_filter

            domains = {}
            domain_labels = {}
            max_page = None
            queryset_for_others = None

            # flag is true if the dimension is categorical and has more than MAX_CATEGORICAL_LEVELS levels
            primary_flag = False
            secondary_flag = False

            # Include the domains for primary and (secondary) dimensions
            domain, labels = self.domain(self.primary_dimension,
                                         unfiltered_queryset, primary_filter,
                                         primary_exclude)

            # paging the first dimension, this is for the filter distribution
            if primary_filter is None and self.secondary_dimension is None and page is not None:

                if search_key is not None:
                    domain, labels = self.filter_search_key(
                        domain, labels, search_key)
                start = (page - 1) * page_size
                end = min(start + page_size, len(domain))
                max_page = (len(domain) / page_size) + 1

                # no level left
                if len(domain) == 0 or start > len(domain):
                    return None

                domain = domain[start:end]
                if labels is not None:
                    labels = labels[start:end]

                queryset = queryset.filter(
                    utils.levels_or(self.primary_dimension.field_name, domain))
            else:
                if (self.mode == 'enable_others' or self.mode == 'omit_others') and \
                    self.primary_dimension.is_categorical() and len(domain) > MAX_CATEGORICAL_LEVELS:
                    primary_flag = True
                    domain = domain[:MAX_CATEGORICAL_LEVELS]

                    queryset_for_others = queryset
                    queryset = queryset.filter(
                        utils.levels_or(self.primary_dimension.field_name,
                                        domain))

                    if labels is not None:
                        labels = labels[:MAX_CATEGORICAL_LEVELS]

            domains[self.primary_dimension.key] = domain
            if labels is not None:
                domain_labels[self.primary_dimension.key] = labels

            if self.secondary_dimension:
                domain, labels = self.domain(self.secondary_dimension,
                                             unfiltered_queryset,
                                             secondary_filter,
                                             secondary_exclude)

                if (self.mode == 'enable_others' or self.mode == 'omit_others') and \
                    self.secondary_dimension.is_categorical() and \
                        len(domain) > MAX_CATEGORICAL_LEVELS:
                    secondary_flag = True
                    domain = domain[:MAX_CATEGORICAL_LEVELS]

                    if queryset_for_others is None:
                        queryset_for_others = queryset
                    queryset = queryset.filter(
                        utils.levels_or(self.secondary_dimension.field_name,
                                        domain))

                    if labels is not None:
                        labels = labels[:MAX_CATEGORICAL_LEVELS]

                domains[self.secondary_dimension.key] = domain
                if labels is not None:
                    domain_labels[self.secondary_dimension.key] = labels

            # Render a table
            table = self.render(queryset)

            if self.mode == "enable_others" and queryset_for_others is not None:
                # adding others to the results
                table_for_others = self.render_others(queryset_for_others,
                                                      domains, primary_flag,
                                                      secondary_flag)
                table = list(table)
                table.extend(table_for_others)

            results = {
                'table': table,
                'domains': domains,
                'domain_labels': domain_labels
            }
            if max_page is not None:
                results['max_page'] = max_page

        else:
            domains = {}
            domain_labels = {}
            max_page = None
            queryset_for_others = None

            # flag is true if the dimension is categorical and has more than MAX_CATEGORICAL_LEVELS levels
            primary_flag = False
            secondary_flag = False
            primary_filter = None
            secondary_filter = None
            primary_exclude = None
            secondary_exclude = None

            queryset = dataset.message_set.all()
            queryset = queryset.exclude(time__isnull=True)
            if dataset.start_time and dataset.end_time:
                range = dataset.end_time - dataset.start_time
                buffer = timedelta(seconds=range.total_seconds() * 0.1)
                queryset = queryset.filter(time__gte=dataset.start_time -
                                           buffer,
                                           time__lte=dataset.end_time + buffer)
            if filters is not None:
                for filter in filters:
                    dimension = filter['dimension']
                    queryset = dimension.filter(queryset, **filter)

                    if dimension == self.primary_dimension:
                        primary_filter = filter
                    if dimension == self.secondary_dimension:
                        secondary_filter = filter

            if exclude is not None:
                for exclude_filter in exclude:
                    dimension = exclude_filter['dimension']
                    queryset = dimension.exclude(queryset, **exclude_filter)

                    if dimension == self.primary_dimension:
                        primary_exclude = exclude_filter
                    if dimension == self.secondary_dimension:
                        secondary_exclude = exclude_filter

            queryset_all = queryset

            #queryset = corpus_models.Message.objects.none()
            group_querysets = []
            group_labels = []
            #message_list = set()

            for group in groups:
                group_obj = groups_models.Group.objects.get(id=group)
                if group_obj.order > 0:
                    group_labels.append("#%d %s" %
                                        (group_obj.order, group_obj.name))
                else:
                    group_labels.append("%s" % (group_obj.name))
                queryset = group_obj.messages

                # Filter out null time
                queryset = queryset.exclude(time__isnull=True)
                if dataset.start_time and dataset.end_time:
                    range = dataset.end_time - dataset.start_time
                    buffer = timedelta(seconds=range.total_seconds() * 0.1)
                    queryset = queryset.filter(
                        time__gte=dataset.start_time - buffer,
                        time__lte=dataset.end_time + buffer)

                unfiltered_queryset = queryset

                # Filter the data (look for filters on the primary/secondary dimensions at the same time

                if filters is not None:
                    for filter in filters:
                        dimension = filter['dimension']
                        queryset = dimension.filter(queryset, **filter)

                if exclude is not None:
                    for exclude_filter in exclude:
                        dimension = exclude_filter['dimension']
                        queryset = dimension.exclude(queryset,
                                                     **exclude_filter)

                group_querysets.append(queryset)

#########################################################################################################################

# deal with union distribution
# This is due to union of queries in django does not work...
# super ugly. Refactoring is required.

# Include the domains for primary and (secondary) dimensions
            domain, labels = self.groups_domain(self.primary_dimension,
                                                queryset_all, group_querysets)

            # paging the first dimension, this is for the filter distribution
            if primary_filter is None and self.secondary_dimension is None and page is not None:

                if search_key is not None:
                    domain, labels = self.filter_search_key(
                        domain, labels, search_key)
                start = (page - 1) * page_size
                end = min(start + page_size, len(domain))
                max_page = (len(domain) / page_size) + 1

                # no level left
                if len(domain) == 0 or start > len(domain):
                    return None

                domain = domain[start:end]
                if labels is not None:
                    labels = labels[start:end]

            else:
                if (self.mode == 'enable_others' or self.mode == 'omit_others') and \
                    self.primary_dimension.is_categorical() and len(domain) > MAX_CATEGORICAL_LEVELS:
                    primary_flag = True
                    domain = domain[:MAX_CATEGORICAL_LEVELS]

                    if labels is not None:
                        labels = labels[:MAX_CATEGORICAL_LEVELS]

            domains[self.primary_dimension.key] = domain
            if labels is not None:
                domain_labels[self.primary_dimension.key] = labels

            if self.secondary_dimension:
                domain, labels = self.groups_domain(self.secondary_dimension,
                                                    queryset_all,
                                                    group_querysets)

                if (self.mode == 'enable_others' or self.mode == 'omit_others') and \
                    self.secondary_dimension.is_categorical() and \
                        len(domain) > MAX_CATEGORICAL_LEVELS:
                    secondary_flag = True
                    domain = domain[:MAX_CATEGORICAL_LEVELS]

                    if labels is not None:
                        labels = labels[:MAX_CATEGORICAL_LEVELS]

                domains[self.secondary_dimension.key] = domain
                if labels is not None:
                    domain_labels[self.secondary_dimension.key] = labels
#########################################################################################################################

            group_tables = []
            for queryset in group_querysets:
                queryset_for_others = queryset
                if (self.mode == 'enable_others' or self.mode == 'omit_others') and \
                    self.primary_dimension.is_categorical():
                    queryset = queryset.filter(
                        utils.levels_or(self.primary_dimension.field_name,
                                        domains[self.primary_dimension.key]))
                if self.secondary_dimension:
                    if (self.mode == 'enable_others' or self.mode == 'omit_others') and \
                    self.secondary_dimension.is_categorical():
                        if queryset_for_others is None:
                            queryset_for_others = queryset
                        queryset = queryset.filter(
                            utils.levels_or(
                                self.secondary_dimension.field_name,
                                domains[self.secondary_dimension.key]))

                # Render a table
                if self.primary_dimension.key == "words":
                    table = group_messages_by_words_with_raw_query(
                        utils.quote(str(queryset.query)), fetchall_table)
                else:
                    table = self.render(queryset)

                if self.mode == "enable_others" and queryset_for_others is not None:
                    # adding others to the results
                    table_for_others = self.render_others(
                        queryset_for_others, domains, primary_flag,
                        secondary_flag)
                    table = list(table)
                    table.extend(table_for_others)

                group_tables.append(table)

            if self.secondary_dimension is None:
                final_table = []
                for idx, group_table in enumerate(group_tables):
                    for item in group_table:
                        item['groups'] = groups[idx]
                    final_table.extend(group_table)

                domains['groups'] = groups
                domain_labels['groups'] = group_labels

                results = {
                    'table': final_table,
                    'domains': domains,
                    'domain_labels': domain_labels
                }

            else:
                tables = []
                for idx, group_table in enumerate(group_tables):
                    tables.append({
                        'group_id': groups[idx],
                        'group_name': group_labels[idx],
                        'table': group_table
                    })
                results = {
                    'tables': tables,
                    'domains': domains,
                    'domain_labels': domain_labels
                }

            if max_page is not None:
                results['max_page'] = max_page

        return results
Example #7
0
    def render_others(self,
                      queryset,
                      domains,
                      primary_flag,
                      secondary_flag,
                      desired_primary_bins=None,
                      desired_secondary_bins=None):
        """
        Given a set of messages (already filtered as necessary),
        calculate the "Other" rows of the data table: the counts of
        messages whose level on a truncated categorical dimension is
        not among the levels listed in ``domains``.

        ``domains`` maps a dimension key to the list of levels being
        displayed. ``primary_flag`` / ``secondary_flag`` tell whether the
        corresponding dimension's domain was truncated. For each flagged
        dimension handled here, the label ``u'Other ' + dimension.name``
        is appended to ``domains[dimension.key]`` in place. The label is
        appended at most once, even when this method is called several
        times with the same ``domains`` (e.g. once per group), and it is
        never used as a level when filtering messages.

        Optionally, a number of primary and secondary bins may be given;
        they are passed as ``bins`` to ``group_by()`` of the
        non-categorical dimension when the other one is categorical.

        The result is a list of dictionaries. Each
        dictionary contains a key for each dimension
        and a value key for the count. An empty list is returned when
        there is nothing to add, so callers can always ``extend()``
        their table with the result.
        """

        # Nothing was truncated, so there are no "Other" rows. Return an
        # empty list (not None) because callers extend their table with it.
        if not primary_flag and not secondary_flag:
            return []

        primary = self.primary_dimension
        secondary = self.secondary_dimension

        def split_levels(dimension, truncated):
            # Return (real levels, "Other" label) for the dimension and,
            # when it was truncated, register the label in ``domains``.
            # The levels are captured without the label so the synthetic
            # entry never ends up in a filter/exclude condition.
            label = u'Other ' + dimension.name
            domain = domains[dimension.key]
            levels = [level for level in domain if level != label]
            if truncated and label not in domain:
                domain.append(label)
            return levels, label

        def matching(dimension, levels):
            # Condition selecting messages whose value on ``dimension``
            # is one of ``levels`` (built by utils.levels_or).
            return utils.levels_or(dimension.field_name, levels)

        def labelled_counts(messages, dimension, other_key, other_label,
                            **group_options):
            # Group ``messages`` by ``dimension``, count them, and tag
            # every resulting row as "Other" on the opposite dimension.
            grouped = dimension.group_by(
                messages, grouping_key=dimension.key, **group_options)
            rows = list(grouped.annotate(value=models.Count('id')))
            for row in rows:
                row[other_key] = other_label
            return rows

        if not secondary:
            # Single dimension: one row counting everything outside the
            # displayed levels.
            if primary.is_categorical() and primary_flag:
                levels, label = split_levels(primary, True)
                remainder = queryset.exclude(matching(primary, levels))
                return [{primary.key: label, 'value': remainder.count()}]
            return []

        # both dimensions are categorical
        if primary.is_categorical() and secondary.is_categorical():
            primary_levels, primary_label = split_levels(
                primary, primary_flag)
            secondary_levels, secondary_label = split_levels(
                secondary, secondary_flag)
            others_results = []

            # primary others x secondary others
            if primary_flag and secondary_flag:
                remainder = queryset.exclude(
                    matching(primary, primary_levels))
                remainder = remainder.exclude(
                    matching(secondary, secondary_levels))
                others_results.append({
                    primary.key: primary_label,
                    secondary.key: secondary_label,
                    'value': remainder.count()
                })

            # primary top ones x secondary others
            if secondary_flag:
                remainder = queryset.filter(
                    matching(primary, primary_levels))
                remainder = remainder.exclude(
                    matching(secondary, secondary_levels))
                others_results.extend(
                    labelled_counts(remainder, primary, secondary.key,
                                    secondary_label))

            # primary others x secondary top ones
            if primary_flag:
                remainder = queryset.exclude(
                    matching(primary, primary_levels))
                remainder = remainder.filter(
                    matching(secondary, secondary_levels))
                others_results.extend(
                    labelled_counts(remainder, secondary, primary.key,
                                    primary_label))

            return others_results

        # primary categorical and secondary quantitative
        if primary.is_categorical() and primary_flag \
                and not secondary.is_categorical():
            levels, label = split_levels(primary, True)
            remainder = queryset.exclude(matching(primary, levels))
            return labelled_counts(remainder, secondary, primary.key, label,
                                   bins=desired_secondary_bins)

        # primary quantitative and secondary categorical
        if not primary.is_categorical() and secondary.is_categorical() \
                and secondary_flag:
            levels, label = split_levels(secondary, True)
            remainder = queryset.exclude(matching(secondary, levels))
            return labelled_counts(remainder, primary, secondary.key, label,
                                   bins=desired_primary_bins)

        # No combination of flags and dimension types produced "Other" rows.
        return []
Example #8
0
 def all_messages(self):
     """
     Return the messages of ``self.dataset`` filtered on the ids of
     ``self.related_words``.

     The filter condition is built by ``utils.levels_or`` from the
     ``tweet_words__id`` lookup and the related words' ids; presumably
     it matches a message linked to any of those words (confirm against
     ``utils.levels_or``).
     """
     messages = self.dataset.message_set.all()
     word_ids = map(lambda word: word.id, self.related_words)
     return messages.filter(utils.levels_or("tweet_words__id", word_ids))