Example #1
def graph_definition(edge_string):
    """Parse a graph definition string

    >>> graph_definition('Origin,3,9_I_5.6~Origin_II~I_III')
    [{'source': 0, 'target': 1, 'value': 5.6}, {'source': 0, 'target': 2, 'value': 4}, {'source': 1, 'target': 3, 'value': 4}]
    """
    edge_definitions = listify(edge_string, delim=graph_definition.delim)
    edge_list, node_list, node_names = [], [], []
    for i, edge_definition in enumerate(edge_definitions):
        edge = {}
        node_string_pair = listify(edge_definition, delim=graph_definition.node_pair_delim)
        for j, field in enumerate(graph_definition.schema):
            try:
                obj = field['type'](node_string_pair[j])
                # if the field holds a node (source or target) then its name must be retrieved
                if isinstance(obj, Mapping) and 'name' in obj:
                    # is this node name already in our list (and other node properties already defined)
                    if obj['name'] in node_names:
                        node_index = node_names.index(obj['name'])
                        # merge any new properties of this node into the existing node dict
                        node_list[node_index].update(obj)
                        obj = node_index
                    else:
                        # reprocess/recast the object (which is now node dict) to add default values to the node
                        obj = field['type'](obj)
                        node_list += [obj]  # obj is a fully populated node dict
                        node_names += [obj['name']]
                        # since this is a new node, its node index is one less than the len of the node list (or node name list)
                        obj = len(node_list) - 1
                edge[field['key']] = obj
            except (IndexError, ValueError, TypeError, KeyError):
                # missing or unparseable field: fall back to the default
                edge[field['key']] = field['default']
        edge_list += [edge]
    return edge_list, node_list
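
Every example on this page leans on a `listify` helper that is never shown. Below is a plausible minimal sketch, inferred only from the call sites here (`listify(x)` to coerce, `listify(x, N)` to repeat a lone value out to length N, `listify(s, delim=...)` to split a string); the real pug/pugnlp implementation may differ:

def listify(value, N=1, delim=None):
    """Coerce `value` into a list.

    Strings are split on `delim` (or wrapped whole when `delim` is None),
    non-iterable scalars are wrapped in a one-element list, and a length-1
    result is repeated out to length `N`.
    """
    if value is None:
        return []
    if isinstance(value, str):
        value = value.split(delim) if delim else [value]
    else:
        try:
            value = list(value)
        except TypeError:  # a non-iterable scalar (int, float, ...)
            value = [value]
    if N > 1 and len(value) == 1:
        value = value * N  # repeat a lone value to the requested length
    return value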
Example #2
def lagged_series(series, lags=1, pads=None):
    """
    Delay each time series in a set of time series by the lags (number of samples) indicated.

    Pad any gaps opened in each resulting series with the corresponding `pads`
    value; if `pads` is None the shifted values wrap around (a circular shift).

    TODO: Allow fractional sample lags (interpolation)
    TODO: Allow time value lags instead of sample counts
    TODO: Incorporate into the nlp.db.Columns class
    
    >>> lagged_series([[-1, 0, 1, 2, 3], [2, 7, 1, 8, 2], [8, 1, 8, 2, 8]], lags=3)
    [[-1, 0, 1, 2, 3], [1, 8, 2, 2, 7], [8, 2, 8, 8, 1]]
    >>> lagged_series([[-1, 0, 1, 2, 3], [2, 7, 1, 8, 2], [8, 1, 8, 2, 8]], lags=[2, 1], pads=0)
    [[-1, 0, 1, 2, 3], [0, 0, 2, 7, 1], [0, 8, 1, 8, 2]]
    >>> lagged_series([[-1, 0, 1, 2, 3], [2, 7, 1, 8, 2], [8, 1, 8, 2, 8]], lags=[-1, 3], pads=[-9, -5])
    [[-1, 0, 1, 2, 3], [7, 1, 8, 2, -9], [-5, -5, -5, 8, 1]]
    """
    N = len(series) - 1
    pads = [None] * N if pads is None else util.listify(pads, N)
    pads = [None] + pads
    lags = [None] * N if lags is None else util.listify(lags, N)
    lags = [None] + lags

    ans = [series[0]]

    for i in range(1, min(len(lags) + 1, len(pads) + 1, N + 1)):
        ans += [lagged_seq(series[i], lags[i], pads[i])]

    return ans
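
`lagged_series` delegates the per-series work to a `lagged_seq` helper that is not shown. Here is a sketch consistent with the doctests above (positive lag shifts right, negative shifts left, `pad=None` wraps); it is hypothetical and the real helper may differ:

def lagged_seq(seq, lag=1, pad=None):
    """Shift `seq` right by `lag` samples (left when `lag` is negative).

    When `pad` is None the shifted-out values wrap around (circular shift);
    otherwise the gap opened by the shift is filled with `pad`.

    >>> lagged_seq([2, 7, 1, 8, 2], 3)
    [1, 8, 2, 2, 7]
    >>> lagged_seq([2, 7, 1, 8, 2], -1, pad=-9)
    [7, 1, 8, 2, -9]
    """
    lag = int(lag or 0)
    n = len(seq)
    if pad is None:
        k = lag % n if n else 0
        if not k:
            return list(seq)
        return list(seq[-k:]) + list(seq[:-k])
    if lag >= 0:
        return [pad] * min(lag, n) + list(seq[:max(n - lag, 0)])
    return list(seq[-lag:]) + [pad] * min(-lag, n)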
Example #3
def make_time_series(x,
                     t=pd.Timestamp(datetime.datetime(1970, 1, 1)),
                     freq=None):
    """Convert a 2-D array of time/value pairs (or pair of time/value vectors) into a pd.Series time-series

    >>> make_time_series(range(3), freq='15min')  # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
    1970-01-01 00:00:00   NaN
    1970-01-01 00:15:00   NaN
    1970-01-01 00:30:00   NaN
    dtype: float64
    """
    if isinstance(x, pd.DataFrame):
        x = pd.Series(x[x.columns[0]])
    elif not isinstance(x, pd.Series) and (
            not isinstance(t, (pd.Series, pd.Index, list, tuple)) or not len(t)):
        #warnings.warn("Coercing a non-Series")
        if len(x) == 2:
            t, x = listify(x[0]), listify(x[1])
        elif len(x) >= 2:
            try:
                t, x = zip(*x)
            except (ValueError, IndexError, TypeError):
                pass
        x = pd.Series(x)
    else:
        if isinstance(t, (datetime.datetime, pd.Timestamp)):
            t = pd.Timestamp(t)
        else:
            x = pd.Series(listify(x), index=listify(t))
    if not isinstance(x, pd.Series):
        raise TypeError(
            "`pug.invest.util.make_time_series(x, t)` expects x to be a type that"
            " can be coerced to a Series object, but it's type is: {0}".format(
                type(x)))
    # By this point x must be a Series, only question is whether its index needs to be converted to a DatetimeIndex
    if x.index[0] != 0 and isinstance(
            x.index[0], (datetime.date, datetime.datetime, pd.Timestamp,
                         basestring, float, np.int64, int)):
        t = x.index
    elif isinstance(t, (datetime.date, datetime.datetime, pd.Timestamp,
                        basestring, float, np.int64, int)):
        if not freq:
            freq = '15min'
            warnings.warn(
                'Assumed time series freq to be {0} though no freq argument was provided!'
                .format(freq), RuntimeWarning)
        t = pd.date_range(t, periods=len(x), freq=freq)
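    # NOTE: when x is already a Series, pd.Series(x, index=t) label-aligns
    # (reindexes) x against the new index rather than simply relabeling it,
    # which is why the doctest above shows NaN for integer-indexed input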
    x = pd.Series(x, index=t)
    if isinstance(x, pd.Series):
        x.index = pd.DatetimeIndex(x.index.values)
    return x
Example #4
 def __init__(self, start_urls=None, *args, **kwargs):
     self.start_urls = [
         r'http://www.google.com/googlebooks/uspto-patents-applications-biblio.html'
     ]
     if start_urls:
         self.start_urls = listify(start_urls)
     super(PatentBiblioSpider, self).__init__(*args, **kwargs)
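
`listify` is what lets callers pass either one start URL as a string or a whole list of them. A hypothetical instantiation (URL invented for illustration):

spider = PatentBiblioSpider(start_urls='http://example.com/patent-biblio.html')
assert spider.start_urls == ['http://example.com/patent-biblio.html']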
Example #5
def node_name(name, use_defaults=False):
    """
    >>> sorted(node_name('Origin,2.7, 3 ')[1].items())
    [('charge', 2.7), ('group', 3), ('name', 'Origin')]
    >>> node_name('Origin,2.7, 3 ')[0]
    'Origin'
    """
    # if the name is not a string, but a dict defining a node, then just set the defaults and return it
    if isinstance(name, Mapping):
        ans = dict(name)
        for j, field in enumerate(node_name.schema):
            if field['key'] not in ans:
                ans[field['key']] = field['default']
        return ans.get('name'), ans  # same (name, node_dict) shape as below
    seq = listify(name, delim=',')
    ans = {}
    for j, field in enumerate(node_name.schema):
        if 'default' in field:
            try:
                ans[field['key']] = field['type'](seq[j])
            except (IndexError, ValueError, TypeError):
                if use_defaults:
                    ans[field['key']] = field['default']
        else:
            try:
                ans[field['key']] = ans.get(field['key'],
                                            field['type'](seq[j]))
            except (IndexError, ValueError, TypeError):
                pass
    # return the bare name alongside the full node dict, matching the doctest
    return ans.get('name'), ans
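
`node_name` reads its field definitions from a `node_name.schema` attribute attached elsewhere in the module. A hypothetical schema consistent with the doctest above ('Origin,2.7, 3 ' -> name, charge, group); the real one may differ:

node_name.schema = [
    {'key': 'name', 'type': str, 'default': ''},
    {'key': 'charge', 'type': float, 'default': 0.0},
    {'key': 'group', 'type': int, 'default': 0},
]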
Example #6
def dropna(x):
    """Delete all NaNs and and infinities in a sequence of real values

    Returns:
        list: Array of all values in x that are between -inf and +inf, exclusive
    """
    return [x_i for x_i in listify(x) if float('-inf') < x_i < float('inf')]
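
The chained comparison does double duty: NaN compares false against everything, so it is rejected along with both infinities. A quick sanity check:

>>> dropna([1, float('nan'), float('inf'), float('-inf'), 2.5])
[1, 2.5]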
Example #7
def args_tptnfpfn(*args, **kwargs):
    """Convert kwargs for tp, tn, fp, fn to ordered tuple of args
    If a single tuple/list is passed as the first arg, it is assumed to be the desired tuple of args
    >>> args_tptnfpfn(1, 2, 3, 4)
    (1, 2, 3, 4)
    >>> args_tptnfpfn((1, 2, 3, 4))
    (1, 2, 3, 4)
    >>> args_tptnfpfn([1, 2, 3, 4])
    (1, 2, 3, 4)
    >>> args_tptnfpfn(3, 4, tp=1, tn=2)
    (1, 2, 3, 4)
    >>> args_tptnfpfn(tp=1, tn=2)
    (1, 2, 0, 0)
    >>> args_tptnfpfn(tp=1, tn=2, fp=3, fn=4)
    (1, 2, 3, 4)
    >>> args_tptnfpfn(1)
    (1, 0, 0, 0)
    """
    if len(args) == 4:
        tp, tn, fp, fn = args
    elif len(kwargs) == 0:
        if len(args) == 1:
            args = listify(args[0])
        tp, tn, fp, fn = list(list(args) + [0] * (4 - len(args)))
    else:
        args = list(args)
        tp = kwargs['tp'] if 'tp' in kwargs else args.pop(0) if len(args) else 0
        tn = kwargs['tn'] if 'tn' in kwargs else args.pop(0) if len(args) else 0
        fp = kwargs['fp'] if 'fp' in kwargs else args.pop(0) if len(args) else 0
        fn = kwargs['fn'] if 'fn' in kwargs else args.pop(0) if len(args) else 0
    return tp, tn, fp, fn
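
A typical consumer of this argument normalizer might look like the following sketch (`precision` is hypothetical, not part of the source module):

def precision(*args, **kwargs):
    """Precision = tp / (tp + fp), accepting the same flexible arguments.

    >>> precision(tp=3, fp=1)
    0.75
    """
    tp, tn, fp, fn = args_tptnfpfn(*args, **kwargs)
    return tp / float(tp + fp) if (tp + fp) else 0.0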
Example #8
def append_app_urls(local, app_names):
    app_names = listify(app_names)  # or local.get('local.settings.INSTALLED_APPS') ;)
    urlpatterns = local.get('urlpatterns', patterns(''))

    for app_name in app_names:
        urlpatterns += patterns('', url(r'^', include('%s.urls' % app_name)))
    local['urlpatterns'] = urlpatterns
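
Hypothetical usage at the bottom of a project-level urls.py (old-style Django `patterns` urlconf; the app names are invented):

append_app_urls(locals(), ['orders', 'inventory'])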
Example #9
def augment_model_meta(model, db_alias, model_meta, column_name_filters=None, count=0, verbosity=0):
    """Fields are keyed by their db_column name rather than field name (like model_meta)"""
    if settings.DEBUG and verbosity > 2:
        print 'Augmenting model meta data for %r...' % model

    column_name_filters = util.listify(column_name_filters)
    queryset = djdb.get_queryset(model)

    if db_alias:
        queryset = queryset.using(db_alias)
    for field_name in model._meta.get_all_field_names():
        field = None
        try:
            field = model._meta.get_field(field_name)
            db_column = field.db_column
        # Django creates reverse ForeignKey relationship fields that may not have a database column in this table
        # This happens if you make existing fields/columns in other tables a ForeignKey referencing this table
        except FieldDoesNotExist:
            db_column = None
        if not field:
            if verbosity:
                print "WARNING: Skipped 'phantom' field named '%s'.  This is likely because of a ForeignKey relationship elsewhere back to this model (%r). No field found in the model '%s' for database '%s'." % (field_name, model, model.__name__, db_alias)
            continue
        if not db_column:
            if field.name in model_meta:
                db_column = field.name
            elif field.name.lower() in model_meta:
                db_column = field.name.lower()
            elif field.name.upper() in model_meta:
                db_column = field.name.upper()
        if not db_column:
            if verbosity:
                print "WARNING: Skipped field named '%s'. No column found in the database.table '%s.%s'." % (field.name, db_alias, model.__name__)
            continue
        if column_name_filters:
            if not any(((callable(cnf) and cnf(db_column)) or (db_column == cnf)) for cnf in column_name_filters):
                if verbosity:
                    print "WARNING: Skipped field named '%s' for table '%s.%s' because it didn't match any filters: %r." % (field.name, db_alias, model.__name__, column_name_filters)
                continue
        if (field.name == 'id' and isinstance(field, models.fields.AutoField)
                and field.primary_key and (not model_meta[db_column]['primary_key'])):
            print "WARNING: Skipped field named '%s' for table '%s.%s' because it is an AutoField and no primary_key is defined for this table." % (field.name, db_alias, model.__name__)
            continue

        model_meta[db_column] = augment_field_meta(field, queryset, model_meta[db_column], count=count, verbosity=verbosity)
        if verbosity > 1:
            print '%s (%s of type %s) has %s / %s (%3.1f%%) distinct values between %s and %s, excluding %s nulls.' % (field.name, db_column, 
                                                        model_meta[db_column]['type'],
                                                        model_meta[db_column]['num_distinct'], 
                                                        count,
                                                        100. * (model_meta[db_column]['num_distinct'] or 0) / (count or 1),
                                                        repr(model_meta[db_column]['min']),
                                                        repr(model_meta[db_column]['max']),
                                                        model_meta[db_column]['num_null'])
    return model_meta
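
The `column_name_filters` argument mixes callables and literal column names (see the `any(...)` test above). A hypothetical call (`MyModel` and the filter values are invented for illustration):

model_meta = augment_model_meta(MyModel, 'default', model_meta,
                                column_name_filters=[lambda c: c.endswith('_id'), 'price'],
                                verbosity=1)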
Example #10
def append_urls(local, app_names=None):
    app_names = listify(app_names or basename(dirname(local.get('__file__', None))))
    urlpatterns = local.get('urlpatterns', patterns(''))

    for app_name in app_names:
        print 'Composing REST API URLs (Django urlconf entries) for app named %r' % app_name
        views_name = app_name + '.views'
        app_module = __import__(views_name)
        app = get_app(app_name)
        for Model in get_models(app):
            # print Model
            model_name = Model.__name__
            View = app_module.views.__dict__[model_name + 'List']
            urlpatterns += patterns('', url(r'^' + app_name + r'/' + model_name, View.as_view()))

    local['urlpatterns'] = urlpatterns
Example #11
def create_model_viewsets(local, app_names=None):
    app_names = listify(
        app_names
        or os.path.basename(os.path.dirname(local.get('__file__', None))))

    for app_name in app_names:
        app = get_app(app_name)
        for Model in get_models(app):

            class KitchenSinkFilter(more_django_filters.FilterSet):
                class Meta:
                    model = Model
                    # fields = tuple(f.name for f in model._meta.fields)

            # KitchenSinkFilter.__doc__ = "Filter (query) for records the database.table %s.%s\n%s" % (app_name, Model.__name__, Model.__doc__)

            class KitchenSinkSerializer(serializers.ModelSerializer):
                class Meta:
                    model = KitchenSinkFilter.Meta.model

            class KitchenSinkList(generics.ListAPIView):
                __doc__ = "Filtered list of database records (table rows) for the database.table <strong>%s.%s</strong>\n<br>\n%s" % (
                    app_name, Model.__name__, Model.__doc__)
                model = KitchenSinkFilter.Meta.model
                serializer_class = KitchenSinkSerializer
                #filter_fields = ('acctno','whse','status','partno','date_time','reference','return_days')
                filter_class = KitchenSinkFilter

                class Meta:
                    model = Model
                    fields = tuple(f.name for f in model._meta.fields)

            KitchenSinkList.__name__ = Model.__name__ + 'List'
            # KitchenSinkList.__doc__ = "Filtered list of database records (table rows) for the database.table %s.%s\n%s" % (app_name, Model.__name__, Model.__doc__)
            local[KitchenSinkList.__name__] = KitchenSinkList

            class KitchenSinkViewSet(viewsets.ModelViewSet):
                serializer_class = KitchenSinkSerializer
                model = KitchenSinkFilter.Meta.model
                filter_fields = tuple(f.name for f in model._meta.fields)
                order_by = tuple(f.name for f in model._meta.fields)

            KitchenSinkViewSet.__name__ = Model.__name__ + 'ViewSet'
            # KitchenSinkViewSet.__doc__ = "A ViewSet for the database.table %s.%s\n%s" % (app_name, Model.__name__, Model.__doc__)
            local[KitchenSinkViewSet.__name__] = KitchenSinkViewSet
Example #12
def animate_panel(panel, keys=None, columns=None, interval=1000, blit=False, titles='', path='animate_panel', xlabel='Time', ylabel='Value', ext='gif', 
                  replot=False, linewidth=3, close=False, fontsize=24, background_color='white', alpha=1, figsize=(12,8), xlabel_rotation=-25, plot_kwargs=(('rotation', 30),), 
                  verbosity=1, **video_kwargs):
    """Animate a pandas.Panel by flipping through plots of the data in each dataframe

    Arguments:
      panel (pandas.Panel): Pandas Panel of DataFrames to animate (each DataFrame is an animation video frame)
      keys (list of str): ordered list of panel keys (pages) to animate
      columns (list of str): ordered list of data series names to include in the plot for each video frame
      interval (int): number of milliseconds between video frames
      titles (str or list of str): titles to place on the plot for each data frame.
        default = `keys` so that the title changes with each frame
      path (str): path and base file name to save *.mp4 animation video ('' to not save) 
      kwargs (dict): pass-through kwargs for `animation.FuncAnimation(...).save(path, **kwargs)`
        (Not used if `not path`)

    TODO: 
      - Work with other 3-D data formats:
          - dict (sorted by key) or OrderedDict
          - list of 2-D arrays/lists
          - 3-D arrays/lists
          - generators of 2-D arrays/lists
          - generators of generators of lists/arrays?
      - Write json and html5 files for d3 SVG line plots with transitions!

    >>> x = np.arange(0, 2*np.pi, 0.05)
    >>> panel = pd.Panel(dict((i, pd.DataFrame({
    ...        'T=10': np.sin(x + i/10.),
    ...        'T=7': np.sin(x + i/7.),
    ...        'beat': np.sin(x + i/10.) + np.sin(x + i/7.),
    ...        }, index=x)
    ...    ) for i in range(50)))
    >>> animate_panel(panel, interval=200, path='animate_panel_test')  # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
    Drawing frames for a ".gif" animation...
    Saving video to animate_panel_test.gif...
              T=10       T=7      beat
    0.00  0.000000  0.000000  0.000000
    0.05  0.049979  0.049979  0.099958
    ...

    [126 rows x 3 columns]
    """
    plot_kwargs = plot_kwargs or {}
    plot_kwargs = dict(plot_kwargs)
    ext_kwargs = {
        'mp4': {'writer': 'ffmpeg', 'codec': 'mpeg4', 'dpi': 100, 'bitrate': 2000},
        'gif': {'writer': 'imagemagick', 'dpi': 100, 'bitrate': 2000},
        'imagemagic.gif': {'writer': 'imagemagick_gif', 'dpi': 100, 'bitrate': 2000},
        }
    ext = str(ext).lower().strip() or 'gif'
    default_kwargs = ext_kwargs.get(ext, {})

    keys = keys or list(panel.keys())
    if titles:
        titles = listify(titles)
        if len(titles) == 1:
            titles *= len(keys)
    else:
        titles = keys
    titles = dict((k, title) for k, title in zip(keys, titles))
    columns = columns or list(panel[keys[0]].columns)
    
    fig, ax = plt.subplots(figsize=figsize)

    fig.patch.set_facecolor(background_color)
    fig.patch.set_alpha(alpha)


    i = 0
    df = panel[keys[i]]
    x = df.index.values
    y = df[columns].values
    lines = ax.plot(x, y)
    ax.grid('on')
    ax.patch.set_facecolor(background_color)
    ax.patch.set_alpha(alpha)
    ax.title.set_text(titles[keys[0]])
    ax.title.set_fontsize(fontsize)
    ax.title.set_fontweight('bold')
    ax.xaxis.label.set_text(xlabel)
    plt.setp(ax.get_xticklabels(), rotation=xlabel_rotation)
    ax.yaxis.label.set_text(ylabel)
    ax.legend(columns)


    def animate(k):
        df = panel[k]
        x = df.index.values
        y = df[columns].values.T
        if replot:
            # plt.cla()
            # fig, ax = plt.subplots(figsize=figsize)
            fig = ax.figure
            fig.patch.set_facecolor(background_color)
            fig.patch.set_alpha(alpha)
            lines = ax.plot(x, y.T, linewidth=linewidth)
            ax.grid('on')
            ax.patch.set_facecolor(background_color)
            ax.patch.set_alpha(alpha)
            ax.title.set_text(titles[k])
            ax.title.set_fontsize(fontsize)
            ax.title.set_fontweight('bold')
            ax.xaxis.label.set_text(xlabel)
            plt.setp(ax.get_xticklabels(), rotation=xlabel_rotation)
            ax.yaxis.label.set_text(ylabel)
            ax.legend(columns)
        else:
            lines = ax.lines
            fig = ax.figure

            for i in range(len(lines)):
                lines[i].set_xdata(x)  # all lines have to share the same x-data
                lines[i].set_ydata(y[i])  # update the data, don't replot a new line
                lines[i].set_linewidth(linewidth)
                lines[i].figure.set_facecolor(background_color)
                lines[i].figure.set_alpha(alpha)
                lines[i].axes.patch.set_facecolor(background_color)
                lines[i].axes.patch.set_alpha(alpha)
            ax.patch.set_facecolor(background_color)
            ax.figure.patch.set_alpha(alpha)
            ax.title.set_text(titles[k])
            ax.title.set_fontsize(fontsize)
            ax.title.set_fontweight('bold')
        if blit:
            return lines

    # FIXME: doesn't work with ext=mp4
    # init_func to mask out pixels to be redrawn/cleared which speeds redrawing of plot
    def mask_lines():
        if verbosity > 0:
            print('Initializing mask_lines. . .')
        df = panel[0]
        x = df.index.values
        y = df[columns].values.T
        for i in range(len(lines)):
            # FIXME: why are x-values used to set the y-data coordinates of the mask?
            lines[i].set_xdata(np.ma.array(x, mask=True))
            lines[i].set_ydata(np.ma.array(y[i], mask=True))
        return lines

    if verbosity > 0:
        print('Drawing frames for a ".{0}" animation{1}. . .'.format(ext, ' with blitting' if blit else ''))
    animate(keys[0])
    ani = animation.FuncAnimation(fig, animate, keys, interval=interval, blit=blit) #, init_func=mask_lines, blit=True)

    kwargs = dict(default_kwargs)
    for k, v in six.iteritems(default_kwargs):
        kwargs[k] = video_kwargs.get(k, v)
    # if 'bitrate' in kwargs:
    #     kwargs['bitrate'] = min(kwargs['bitrate'], int(8e5 / interval))  # a low information rate (long interval) can make a higher bitrate unachievable
    if path and isinstance(path, basestring):
        path += '.{0}'.format(ext)
        if verbosity > 0:
            print('Saving video to {0}. . .'.format(path))
        ani.save(path, **kwargs)

    if close:
        plt.close(fig)
    return df
Example #13
def index_model_field_batches(model_or_queryset,
                              key_fields=['model_number', 'serial_number'],
                              value_fields=['pk'],
                              key_formatter=lambda x: str.lstrip(str.strip(str(x or '')), '0'),
                              value_formatter=lambda x: str.strip(str(x)),
                              batch_len=10000, limit=100000000, verbosity=1):
    '''Like index_model_field except uses 50x less memory and 10x more processing cycles

    Returns 2 dicts where both the keys and values are tuples:

    target_index = {(<key_fields[0]>, <key_fields[1]>, ...): (<value_fields[0]>,)} for all distinct model-serial pairs in the Sales queryset
    target_dupes = {(<key_fields[0]>, <key_fields[1]>, ...): [(<value_fields[1]>,), (<value_fields[2]>,), ...]}  with all the duplicates except the first pk already listed above
    '''

    qs = djdb.get_queryset(model_or_queryset)

    N = qs.count()
    if verbosity:
        print 'Indexing %d rows (database records) to aid in finding record %r values using the field %r.' % (N, value_fields, key_fields)

    index, dupes, rownum = {}, {}, 0

    pbar, rownum = None, 0
    if verbosity and N > min(1000000, max(0, 100000**(1./verbosity))):
        widgets = [pb.Counter(), '/%d rows: ' % N, pb.Percentage(), ' ', pb.RotatingMarker(), ' ', pb.Bar(),' ', pb.ETA()]
        pbar = pb.ProgressBar(widgets=widgets, maxval=N).start()


    # to determine the type of the field value and decide whether to strip() or normalize in any way
    #obj0 = qs.filter(**{field + '__isnull': False}).all()[0]

    value_fields = util.listify(value_fields)
    key_fields = util.listify(key_fields)

    for batch in djdb.generate_queryset_batches(qs, batch_len=batch_len, verbosity=verbosity):
        for obj in batch:
            # print obj
            # normalize the key
            keys = []
            for kf in key_fields:
                k = getattr(obj, kf)
                keys += [key_formatter(k or '')]
            values = []
            keys = tuple(keys)
            for vf in value_fields:
                v = getattr(obj, vf)
                values += [value_formatter(v or '')]
            values = tuple(values)           

            if keys in index:
                dupes[keys] = dupes.get(keys, []) + [values]
            else:
                index[keys] = values
            # print rownum  / float(N)
            if pbar:
                pbar.update(rownum)
            rownum += 1
            if rownum >= limit:
                break
    if pbar:
        pbar.finish()
    if verbosity:
        print 'Found %d duplicate %s values among the %d records or %g%%' % (len(dupes), key_fields, len(index), len(dupes)*100./(len(index) or 1.))
    return index, dupes
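
Hypothetical usage, resolving (model, serial) key pairs to primary keys and inspecting collisions (`Sale` stands in for any Django model, and the key values are invented):

index, dupes = index_model_field_batches(Sale, key_fields=['model_number', 'serial_number'])
first = index.get(('R123', '4567'))       # first value-tuple seen for that key pair
extras = dupes.get(('R123', '4567'), [])  # later value-tuples that collided on the same key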
Example #14
def simulate(t=1000, poly=(0., ), sinusoids=None, sigma=0, rw=0, irw=0, rrw=0):
    """Simulate a random signal with seasonal (sinusoids), linear and quadratic trend, RW, IRW, and RRW

    Arguments:
      t (int or list of float): number of samples or time vector, default = 1000
      poly (list of float): polynomial coefficients (in decreasing "order") passed to `numpy.polyval`
         i.e. poly[0]*x**(N-1) + ... + poly[N-1]
      sinusoids (list of list): each entry is [period], [amplitude, period], or [amplitude, period, phase]

    >>> len(simulate(poly=(0,),rrw=1))
    1000
    >>> simulate(t=range(3), poly=(1,2))  # doctest: +NORMALIZE_WHITESPACE
    0    2
    1    3
    2    4
    dtype: float64
    >>> all(simulate(t=50, sinusoids=((1,2,3),)) == simulate(t=range(50), sinusoids=((1,2,3),)))
    True
    >>> any(simulate(t=100))
    False
    >>> abs(simulate(sinusoids=42.42).values[1] + simulate(sinusoids=42.42).values[-1]) < 1e-10
    True
    >>> simulate(t=17,sinusoids=[42, 16]).min()
    -42.0
    >>> all((simulate(t=range(10), sinusoids=(1, 9, 4.5))+simulate(t=10, sinusoids=(1,9))).abs() < 1e-10)
    True
    """
    if t and isinstance(t, int):
        t = np.arange(t, dtype=np.float64)
    else:
        t = np.array(t, dtype=np.float64)
    N = len(t)
    poly = poly or (0., )
    poly = listify(poly)
    y = np.polyval(poly, t)
    sinusoids = listify(sinusoids or [])
    if any(isinstance(ATP, (int, float)) for ATP in sinusoids):
        sinusoids = [sinusoids]
    for ATP in sinusoids:
        # default period is 1 more than the length of the simulated series (no values of the cycle are repeated)
        T = (t[-1] - t[0]) * N / (N - 1.)
        # default amplitude is 1 and phase is 0
        A, P = 1., 0
        try:
            A, T, P = ATP
        except (TypeError, ValueError):
            try:
                A, T = ATP
            except (TypeError, ValueError):
                # default period is 1 more than the length of the simulated series
                # (no values of the cycle are repeated)
                A = ATP[0]
        # print(A, T, P)
        # print(t[1] - t[0])
        y += A * np.sin(2 * np.pi * (t - P) / T)
    if sigma:
        y += np.random.normal(0.0, float(sigma), N)
    if rw:
        y += np.random.normal(0.0, float(rw), N).cumsum()
    if irw:
        y += np.random.normal(0.0, float(irw), N).cumsum().cumsum()
    if rrw:
        y += np.random.normal(0.0, float(rrw), N).cumsum().cumsum().cumsum()
    return pd.Series(y, index=t)
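
For a feel of the components, a short hypothetical session (matplotlib assumed; every parameter value is illustrative):

import matplotlib.pyplot as plt

# one seasonal component (amplitude 2, period 100 samples) plus white noise
# and a gentle random walk
y = simulate(t=500, sinusoids=[(2., 100.)], sigma=0.1, rw=0.05)
y.plot()
plt.show()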