Esempio n. 1
0
    def __init__(self, series, top_n_term=None, top_n=None, pareto_term=None,
                 sortf_mapf_mts=None):
        """ Creates a PivotDataPool object.

        :Arguments:

        - **series** (**required**) - a list of dicts that specifies what
          data to retrieve, where to retrieve it from and how to pivot the
          data. It is of the form ::

            [{'options': {
                'source': django Model, Manager or QuerySet ,
                'categories': ['a_valid_field', ...],
                'legend_by': ['a_valid_field', ...] (optional),
                'top_n_per_cat': a number (optional),
              },
              'terms': {
                'any_name_here': django Aggregate,
                'some_other_name':{
                  'func': django Aggregate,
                  #any options to override
                  ...
                },
              ...
              }
             },
             ... #repeat dicts with 'options' & 'terms'
            ]

          Where

          - **options** - is a dict that specifies the common options for all
            the terms.

            + **source** (**required**) - is either a ``Model``, ``Manager``
              or a ``QuerySet``.
            + **categories** (**required**) - is a list of model fields by
              which the data needs to be pivoted. If there is only a single
              item, ``categories`` can just be a string instead of a list
              with a single element.

              For example if you have a model with ``country``, ``state``,
              ``county``, ``city``, ``date``, ``rainfall``, ``temperature``
              and you want to pivot the data by ``country`` and ``state``,
              then ``categories = ['country', 'state']`` .

              .. note:: Order of elements in the ``categories`` list matters!

              ``categories = ['country', 'state']`` groups your data first by
              ``country`` and then by ``state`` when running the SQL query.
              This obviously is not the same as grouping by ``state`` first
              and then by ``country``.

            + **legend_by** (*optional*) - is a list of model fields by which
              the data needs to be legended. For example, in the above case,
              if you want to legend by ``county`` and ``city``, then
              ``legend_by = ['county', 'city']``

              .. note:: Order of elements in the ``legend_by`` list matters!

              See the note in ``categories`` above.

            + **top_n_per_cat** (*optional*) - The number of top items that
              the legended entries need to be limited to in each category. For
              example, in the above case, if you wanted only the top 3
              ``county/cities`` with highest rainfall for each of the
              ``country/state``, then ``top_n_per_cat = 3``.

          - **terms** - is a ``dict``. The keys can be any strings (but helps
            if they are meaningful aliases for the field). The values can
            either be

            + a django ``Aggregate`` : of a valid field in corresponding model.
              For example, ``Avg('temperature')``, ``Sum('price')``, etc. or
            + a ``dict``: In this case the ``func`` must specify relevant
              django aggregate to retrieve. For example
              ``'func': Avg('price')``. The dict can also have any additional
              entries from the options dict. Any entries here will override
              the entries in the ``options`` dict.

        - **top_n_term** (*optional*) - a string. Must be one of the keys in
          the corresponding ``terms`` in the ``series`` argument. A value
          that is not a key of the cleaned series is silently replaced by
          ``None``.

        - **top_n** (*optional*) - an integer. The number of items for the
          corresponding ``top_n_term`` that need to be retained. It is stored
          as ``0`` when ``top_n_term`` is not valid or when ``top_n`` is not
          an ``int``.

          If ``top_n_term`` and ``top_n`` are present, only the ``top_n``
          number of items are going to be displayed in the pivot chart. For
          example, if you want to plot only the top 5 states with highest
          average rainfall, you can do something like this. ::

            PivotDataPool(
              series = [
                 {'options': {
                    'source': RainfallData.objects.all(),
                    'categories': 'state'},
                  'terms': {
                    'avg_rain': Avg('rainfall')}}],
              top_n_term = 'avg_rain',
              top_n = 5)

          Note that the ``top_n_term`` is ``'avg_rain'`` and **not** ``state`` ;
          because we want to limit by the average rainfall.

        - **pareto_term** (*optional*) - the term with respect to which the
          pivot chart needs to be paretoed. A value that is not a key of the
          cleaned series is silently replaced by ``None``.

          For example, if you want to plot the average rainfall on the y-axis
          w.r.t the state on the x-axis and want to pareto by the average
          rainfall, you can do something like this. ::

            PivotDataPool(
              series = [
                 {'options': {
                    'source': RainfallData.objects.all(),
                    'categories': 'state'},
                  'terms': {
                    'avg_rain': Avg('rainfall')}}],
              pareto_term = 'avg_rain')

        - **sortf_mapf_mts** (*optional*) - a ``tuple`` with three elements of
          the form ``(sortf, mapf, mts)`` where

          + **sortf** - is a function (or a callable) that is used as a `key`
            when sorting the category values.

            For example, if ``categories = 'month_num'`` and if the months
            need to be sorted in reverse order, then ``sortf`` can be ::

              sortf = lambda *x: (-1*x[0],)

            .. note:: ``sortf`` is passed the category values as tuples and
               must return tuples!

            If ``categories`` is ``['city', 'state']`` and if the category
            values returned need to be sorted with state first and then city,
            then ``sortf`` can be ::

              sortf = lambda *x: (x[1], x[0])

            The above ``sortf`` is passed tuples like
            ``('San Francisco', 'CA')``, ``('New York', 'NY')``, ``...`` and
            it returns tuples like ``('CA', 'San Francisco')``,
            ``('NY', 'New York')``, ``...`` which when used as keys to sort the
            category values will obviously first sort by state and then by
            city.

          + **mapf** - is a function (or a callable) that defines how the
            category values need to be mapped.

            For example, let's say ``categories`` is ``'month_num'`` and that
            the category values that are retrieved from your database are
            ``1``, ``2``, ``3``, etc. If you want month *names* as the
            category values instead of month numbers, you can define a
            ``mapf`` to transform the month numbers to month names like so ::

              def month_name(*t):
                  names ={1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr',
                          5: 'May', 6: 'Jun', 7: 'Jul', 8: 'Aug',
                          9: 'Sep', 10: 'Oct', 11: 'Nov', 12: 'Dec'}
                  month_num = t[0]
                  return (names[month_num], )

              mapf = month_name

            .. note:: ``mapf`` like ``sortf`` is passed the category values
               as tuples and must return tuples.

          + **mts** - *map then sort* ; a ``bool``. If ``True``, the
            category values are mapped first and then sorted, and if
            ``False`` category values are sorted first and then mapped.

            In the above example of month names, ``mts`` must be ``False``
            because the months must first be sorted based on their number
            and then mapped to their names. If ``mts`` is ``True``, the
            month numbers would be transformed to the month names, and then
            sorted, which would yield an order like ``Apr``, ``Aug``,
            ``Dec``, etc. (not what we want).

        :Raises:

        - **APIInputError** - if the ``series`` argument has any invalid
          parameters.

        Here is a full example of a ``series`` term that retrieves the
        average temperature of the top 3 cities in each country/state and
        the average rainfall of the top 2 cities in each country/state. ::

          [{'options': {
              'source': Weather.objects.all(),
              'categories': ['country', 'state'],
              'legend_by': 'city',
              'top_n_per_cat': 3},
            'terms': {
              'avg_temp': Avg('temperature'),
              'avg_rain': {
                'func': Avg('rainfall'),
                'top_n_per_cat': 2}}}]

        The ``'top_n_per_cat': 2`` term in ``avg_rain`` dict overrides
        ``'top_n_per_cat': 3`` from the common options dict. Effectively,
        the above ``series`` retrieves the *top 2*  ``cities`` with
        highest ``avg_rain`` in each ``country/state`` and *top 3* ``cities``
        with highest ``avg_temp`` in each ``country/state``.

        A single ``PivotDataPool`` can hold data from multiple Models.
        If there are more models or QuerySets to retrieve the data from,
        just add more dicts to the series list with different ``source``
        values.

        .. warning:: The ``keys`` for the ``terms`` must be **unique across
           all the dictionaries** in the ``series`` list! If there are
           multiple terms with same ``key``, the latter ones will just
           overwrite the previous ones.

        For instance, the following example is **wrong**. ::

          [{'options': {
              'source': EuropeWeather.objects.all(),
              'categories': ['country', 'state']},
            'terms': {
              'avg_temp': Avg('temperature')}},
           {'options': {
               'source': AsiaWeather.objects.all(),
               'categories': ['country', 'state']},
            'terms': {
              'avg_temp': Avg('temperature')}}]

        The second ``avg_temp`` will overwrite the first one. Instead just
        use different names for each of the keys in all the dictionaries.
        Here is the **right** format. ::

          [{'options': {
              'source': EuropeWeather.objects.all(),
              'categories': ['country', 'state']},
            'terms': {
              'europe_avg_temp': Avg('temperature')}},
           {'options': {
               'source': AsiaWeather.objects.all(),
               'categories': ['country', 'state']},
            'terms': {
              'asia_avg_temp': Avg('temperature')}}]
        """
        # Save user input to a separate dict. Can be used for debugging.
        # Store a *copy* of locals(): on CPython releases before 3.13 the
        # mapping returned by locals() is the frame's own dict, which must
        # not be modified and which the interpreter may re-sync from the real
        # local variables (e.g. while a debugger or tracer is active),
        # silently undoing the 'series' replacement below.
        self.user_input = dict(locals())
        # Deep copy so the stored input does not alias the caller's object.
        self.user_input['series'] = copy.deepcopy(series)

        self.series = clean_pdps(series)
        # top_n_term is kept only when it is a key of the cleaned series.
        self.top_n_term = top_n_term if top_n_term in self.series else None
        # top_n only applies together with a valid top_n_term; otherwise 0.
        self.top_n = (top_n if (self.top_n_term is not None
                                and isinstance(top_n, int)) else 0)
        # pareto_term is validated the same way as top_n_term.
        self.pareto_term = pareto_term if pareto_term in self.series else None
        self.sortf, self.mapf, self.mts = clean_sortf_mapf_mts(sortf_mapf_mts)
        # query groups and data
        self.query_groups = self._group_terms_by_query(
            'top_n_per_cat', 'categories', 'legend_by')
        self._get_data()
Esempio n. 2
0
    def __init__(self,
                 series,
                 top_n_term=None,
                 top_n=None,
                 pareto_term=None,
                 sortf_mapf_mts=None):
        """ Creates a PivotDataPool object.

        :Arguments:

        - **series** (**required**) - a list of dicts that specifies what
          data to retrieve, where to retrieve it from and how to pivot the
          data. It is of the form ::

            [{'options': {
                'source': django Model, Manager or QuerySet ,
                'categories': ['a_valid_field', ...],
                'legend_by': ['a_valid_field', ...] (optional),
                'top_n_per_cat': a number (optional),
              },
              'terms': {
                'any_name_here': django Aggregate,
                'some_other_name':{
                  'func': django Aggregate,
                  #any options to override
                  ...
                },
              ...
              }
             },
             ... #repeat dicts with 'options' & 'terms'
            ]

          Where

          - **options** - is a dict that specifies the common options for all
            the terms.

            + **source** (**required**) - is either a ``Model``, ``Manager``
              or a ``QuerySet``.
            + **categories** (**required**) - is a list of model fields by
              which the data needs to be pivoted. If there is only a single
              item, ``categories`` can just be a string instead of a list
              with a single element.

              For example if you have a model with ``country``, ``state``,
              ``county``, ``city``, ``date``, ``rainfall``, ``temperature``
              and you want to pivot the data by ``country`` and ``state``,
              then ``categories = ['country', 'state']`` .

              .. note:: Order of elements in the ``categories`` list matters!

              ``categories = ['country', 'state']`` groups your data first by
              ``country`` and then by ``state`` when running the SQL query.
              This obviously is not the same as grouping by ``state`` first
              and then by ``country``.

            + **legend_by** (*optional*) - is a list of model fields by which
              the data needs to be legended. For example, in the above case,
              if you want to legend by ``county`` and ``city``, then
              ``legend_by = ['county', 'city']``

              .. note:: Order of elements in the ``legend_by`` list matters!

              See the note in ``categories`` above.

            + **top_n_per_cat** (*optional*) - The number of top items that
              the legended entries need to be limited to in each category. For
              example, in the above case, if you wanted only the top 3
              ``county/cities`` with highest rainfall for each of the
              ``country/state``, then ``top_n_per_cat = 3``.

          - **terms** - is a ``dict``. The keys can be any strings (but helps
            if they are meaningful aliases for the field). The values can
            either be

            + a django ``Aggregate`` : of a valid field in corresponding model.
              For example, ``Avg('temperature')``, ``Sum('price')``, etc. or
            + a ``dict``: In this case the ``func`` must specify relevant
              django aggregate to retrieve. For example
              ``'func': Avg('price')``. The dict can also have any additional
              entries from the options dict. Any entries here will override
              the entries in the ``options`` dict.

        - **top_n_term** (*optional*) - a string. Must be one of the keys in
          the corresponding ``terms`` in the ``series`` argument. A value
          that is not a key of the cleaned series is silently replaced by
          ``None``.

        - **top_n** (*optional*) - an integer. The number of items for the
          corresponding ``top_n_term`` that need to be retained. It is stored
          as ``0`` when ``top_n_term`` is not valid or when ``top_n`` is not
          an ``int``.

          If ``top_n_term`` and ``top_n`` are present, only the ``top_n``
          number of items are going to be displayed in the pivot chart. For
          example, if you want to plot only the top 5 states with highest
          average rainfall, you can do something like this. ::

            PivotDataPool(
              series = [
                 {'options': {
                    'source': RainfallData.objects.all(),
                    'categories': 'state'},
                  'terms': {
                    'avg_rain': Avg('rainfall')}}],
              top_n_term = 'avg_rain',
              top_n = 5)

          Note that the ``top_n_term`` is ``'avg_rain'`` and **not** ``state``;
          because we want to limit by the average rainfall.

        - **pareto_term** (*optional*) - the term with respect to which the
          pivot chart needs to be paretoed. A value that is not a key of the
          cleaned series is silently replaced by ``None``.

          For example, if you want to plot the average rainfall on the y-axis
          w.r.t the state on the x-axis and want to pareto by the average
          rainfall, you can do something like this. ::

            PivotDataPool(
              series = [
                 {'options': {
                    'source': RainfallData.objects.all(),
                    'categories': 'state'},
                  'terms': {
                    'avg_rain': Avg('rainfall')}}],
              pareto_term = 'avg_rain')

        - **sortf_mapf_mts** (*optional*) - a ``tuple`` with three elements of
          the form ``(sortf, mapf, mts)`` where

          + **sortf** - is a function (or a callable) that is used as a `key`
            when sorting the category values.

            For example, if ``categories = 'month_num'`` and if the months
            need to be sorted in reverse order, then ``sortf`` can be ::

              sortf = lambda *x: (-1*x[0],)

            .. note:: ``sortf`` is passed the category values as tuples and
               must return tuples!

            If ``categories`` is ``['city', 'state']`` and if the category
            values returned need to be sorted with state first and then city,
            then ``sortf`` can be ::

              sortf = lambda *x: (x[1], x[0])

            The above ``sortf`` is passed tuples like
            ``('San Francisco', 'CA')``, ``('New York', 'NY')``, ``...`` and
            it returns tuples like ``('CA', 'San Francisco')``,
            ``('NY', 'New York')``, ``...`` which when used as keys to sort the
            category values will obviously first sort by state and then by
            city.

          + **mapf** - is a function (or a callable) that defines how the
            category values need to be mapped.

            For example, let's say ``categories`` is ``'month_num'`` and that
            the category values that are retrieved from your database are
            ``1``, ``2``, ``3``, etc. If you want month *names* as the
            category values instead of month numbers, you can define a
            ``mapf`` to transform the month numbers to month names like so ::

              def month_name(*t):
                  names ={1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr',
                          5: 'May', 6: 'Jun', 7: 'Jul', 8: 'Aug',
                          9: 'Sep', 10: 'Oct', 11: 'Nov', 12: 'Dec'}
                  month_num = t[0]
                  return (names[month_num], )

              mapf = month_name

            .. note:: ``mapf`` like ``sortf`` is passed the category values
               as tuples and must return tuples.

          + **mts** - *map then sort* ; a ``bool``. If ``True``, the
            category values are mapped first and then sorted, and if
            ``False`` category values are sorted first and then mapped.

            In the above example of month names, ``mts`` must be ``False``
            because the months must first be sorted based on their number
            and then mapped to their names. If ``mts`` is ``True``, the
            month numbers would be transformed to the month names, and then
            sorted, which would yield an order like ``Apr``, ``Aug``,
            ``Dec``, etc. (not what we want).

        :Raises:

        - **APIInputError** - if the ``series`` argument has any invalid
          parameters.

        Here is a full example of a ``series`` term that retrieves the
        average temperature of the top 3 cities in each country/state and
        the average rainfall of the top 2 cities in each country/state. ::

          [{'options': {
              'source': Weather.objects.all(),
              'categories': ['country', 'state'],
              'legend_by': 'city',
              'top_n_per_cat': 3},
            'terms': {
              'avg_temp': Avg('temperature'),
              'avg_rain': {
                'func': Avg('rainfall'),
                'top_n_per_cat': 2}}}]

        The ``'top_n_per_cat': 2`` term in ``avg_rain`` dict overrides
        ``'top_n_per_cat': 3`` from the common options dict. Effectively,
        the above ``series`` retrieves the *top 2*  ``cities`` with
        highest ``avg_rain`` in each ``country/state`` and *top 3* ``cities``
        with highest ``avg_temp`` in each ``country/state``.

        A single ``PivotDataPool`` can hold data from multiple Models.
        If there are more models or QuerySets to retrieve the data from,
        just add more dicts to the series list with different ``source``
        values.

        .. warning:: The ``keys`` for the ``terms`` must be **unique across
           all the dictionaries** in the ``series`` list! If there are
           multiple terms with same ``key``, the latter ones will just
           overwrite the previous ones.

        For instance, the following example is **wrong**. ::

          [{'options': {
              'source': EuropeWeather.objects.all(),
              'categories': ['country', 'state']},
            'terms': {
              'avg_temp': Avg('temperature')}},
           {'options': {
               'source': AsiaWeather.objects.all(),
               'categories': ['country', 'state']},
            'terms': {
              'avg_temp': Avg('temperature')}}]

        The second ``avg_temp`` will overwrite the first one. Instead just
        use different names for each of the keys in all the dictionaries.
        Here is the **right** format. ::

          [{'options': {
              'source': EuropeWeather.objects.all(),
              'categories': ['country', 'state']},
            'terms': {
              'europe_avg_temp': Avg('temperature')}},
           {'options': {
               'source': AsiaWeather.objects.all(),
               'categories': ['country', 'state']},
            'terms': {
              'asia_avg_temp': Avg('temperature')}}]
        """
        # Save user input to a separate dict. Can be used for debugging.
        # Store a *copy* of locals(): on CPython releases before 3.13 the
        # mapping returned by locals() is the frame's own dict, which must
        # not be modified and which the interpreter may re-sync from the real
        # local variables (e.g. while a debugger or tracer is active),
        # silently undoing the 'series' replacement below.
        self.user_input = dict(locals())
        # Deep copy so the stored input does not alias the caller's object.
        self.user_input['series'] = copy.deepcopy(series)

        self.series = clean_pdps(series)
        # top_n_term is kept only when it is a key of the cleaned series.
        self.top_n_term = top_n_term if top_n_term in self.series else None
        # top_n only applies together with a valid top_n_term; otherwise 0.
        self.top_n = (top_n if (self.top_n_term is not None
                                and isinstance(top_n, int)) else 0)
        # pareto_term is validated the same way as top_n_term.
        self.pareto_term = pareto_term if pareto_term in self.series else None
        self.sortf, self.mapf, self.mts = clean_sortf_mapf_mts(sortf_mapf_mts)
        # query groups and data
        self.query_groups = self._group_terms_by_query(
            'top_n_per_cat', 'categories', 'legend_by')
        self._get_data()