Example 1
def fuzzy_index_match(possibilities, label, **kwargs):
    """Find the closest matching column label, key, or integer indexed value

    Returns:
      type(label): the best match(es) to `label` among `possibilities`
        if label is an int, returns the object (value) in the list of possibilities at that index
        if label is a str, returns the closest str match in possibilities
        if label is a list, returns a list containing the best match for each label in it

    >>> from collections import OrderedDict as odict
    >>> fuzzy_index_match(pd.DataFrame(np.random.randn(9, 4), columns=list('ABCD'), index=range(9)), 'b')
    'B'
    >>> fuzzy_index_match(odict(zip('12345','ABCDE')), 'r2d2')
    '2'
    >>> fuzzy_index_match(odict(zip('12345','ABCDE')), 1)
    '2'
    >>> fuzzy_index_match(odict(zip('12345','ABCDE')), -1)
    '5'
    >>> fuzzy_index_match(odict(zip(range(4),'FOUR')), -4)
    0
    """
    possibilities = list(possibilities)
    if isinstance(label, str):  # `basestring` in the original Python 2 code
        return fuzzy_get(possibilities, label, **kwargs)
    if isinstance(label, int):
        return possibilities[label]
    if isinstance(label, list):
        return [fuzzy_get(possibilities, lbl) for lbl in label]
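A minimal usage sketch (assumes `fuzzy_index_match` and its `fuzzy_get` helper are importable from the module above; the grocery keys are made up for illustration):

from collections import OrderedDict

prices = OrderedDict([('apples', 1.50), ('bananas', 0.25), ('cherries', 3.00)])

print(fuzzy_index_match(prices, 'banan'))            # closest key match, likely 'bananas'
print(fuzzy_index_match(prices, -1))                 # integer index into the keys: 'cherries'
print(fuzzy_index_match(prices, ['appl', 'cherr']))  # one best match per label in the list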
Example 2
def api(feature='conditions', city='Portland', state='OR', key=None):
    """Use the wunderground API to get current conditions instead of scraping

    Please be kind and use your own key (they're FREE!):
    http://www.wunderground.com/weather/api/d/login.html

    References:
        http://www.wunderground.com/weather/api/d/terms.html

    Examples:
        >>> api('hurric', 'Boise', 'ID')  # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
        {u'currenthurricane': ...}}}

        >>> features = 'alerts astronomy conditions currenthurricane forecast forecast10day geolookup history hourly hourly10day planner rawtide satellite tide webcams yesterday'.split(' ')

        >> everything = [api(f, 'Portland') for f in features]
        >> js = api('alerts', 'Portland', 'OR')
        >> js = api('condit', 'Sacramento', 'CA')
        >> js = api('forecast', 'Mobile', 'AL')
        >> js = api('10day', 'Fairhope', 'AL')
        >> js = api('geo', 'Decatur', 'AL')
        >> js = api('hist', 'history', 'AL')
        >> js = api('astro')
    """

    features = 'alerts astronomy conditions currenthurricane forecast forecast10day geolookup history hourly hourly10day planner rawtide satellite tide webcams yesterday'.split(' ')
    feature = util.fuzzy_get(features, feature)
    # Please be kind and use your own key (they're FREE!):
    # http://www.wunderground.com/weather/api/d/login.html
    key = key or env.get('WUNDERGROUND', None, verbosity=-1) or env.get('WUNDERGROUND_KEY', 'c45a86c2fc63f7d0', verbosity=-1)
    url = 'http://api.wunderground.com/api/{key}/{feature}/q/{state}/{city}.json'.format(
        key=key, feature=feature, state=state, city=city)
    # `urllib.urlopen` in the original Python 2 code; on Python 3 use `urllib.request.urlopen`
    return json.load(urllib.request.urlopen(url))
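A hedged usage sketch (assumes network access, a valid key in the WUNDERGROUND environment variable, and the documented Wunderground JSON layout for the 'conditions' feature):

js = api('condit', 'Portland', 'OR')      # 'condit' fuzzy-matches the 'conditions' feature
obs = js.get('current_observation', {})   # key name per the Wunderground API docs
print(obs.get('temp_f'), obs.get('weather'))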
Example 3
def make_dataframe(obj, columns=None, exclude=None, limit=int(1e8)):
    """Coerce an iterable, QuerySet, list of rows, dict of columns, etc. into a Pandas DataFrame"""
    limit = int(limit)  # a float limit would raise TypeError when used as a slice bound
    try:
        obj = obj.objects.all()[:limit]
    except Exception:
        pass
    if isinstance(obj, (pd.Series, list, tuple)):
        return make_dataframe(pd.DataFrame(obj), columns, exclude, limit)
    # if the obj is a named tuple, DataFrame, dict of columns, django QuerySet, sql alchemy query result
    # retrieve the "include"d field/column names from its keys/fields/attributes
    if columns is None:
        columns = get_column_labels(obj)
    if exclude and columns:
        columns = [i for i in columns if i not in exclude]
    try:
        return pd.DataFrame(list(obj.values(*columns)[:limit]))
    except Exception:
        pass
    try:
        return pd.DataFrame(obj)[fuzzy_get(obj, columns)]
    except Exception:
        pass
    return pd.DataFrame(obj)
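A minimal usage sketch (the Django model name `Observation` is hypothetical; a plain list of dicts needs no ORM at all):

rows = [{'city': 'Portland', 'temp_f': 54.0},
        {'city': 'Boise', 'temp_f': 41.0}]
df = make_dataframe(rows)          # list of dicts -> DataFrame
# df = make_dataframe(Observation, columns=['city', 'temp_f'], limit=1000)  # hypothetical Django model
print(df.shape)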
Example 4
def dataset_from_dataframe(df,
                           delays=(1, 2, 3),
                           inputs=(1, 2, -1),
                           outputs=(-1, ),
                           normalize=False,
                           verbosity=1):
    """Compose a pybrain.dataset from a pandas DataFrame

    Arguments:
      delays (list of int): sample delays to use for the input tapped delay line
        Positive and negative values are treated the same: both are sample counts into the past.
        default: [1, 2, 3], in z-transform notation: z^-1 + z^-2 + z^-3
      inputs (list of int or list of str): column indices or labels for the inputs
      outputs (list of int or list of str): column indices or labels for the outputs
      normalize (bool): whether to scale each input so it is approximately normally distributed about 0 with std 1

    Returns:
      3-tuple: tuple(dataset, list of means, list of stds)
        means and stds allow normalization of new inputs and denormalization of the outputs

    TODO:

        Detect categorical variables with low dimensionality and split into separate bits
            Vowpal Wabbit hashes strings into an int?
        Detect ordinal variables and convert to continuous int sequence
        SEE: http://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm
    """
    if isinstance(delays, int):
        if delays:
            delays = range(1, delays + 1)
        else:
            delays = [0]
    delays = np.abs(np.array([int(i) for i in delays]))
    inputs = [
        df.columns[int(inp)] if isinstance(inp, (float, int)) else str(inp)
        for inp in inputs
    ]
    outputs = [
        df.columns[int(out)] if isinstance(out, (float, int)) else str(out)
        for out in (outputs or [])
    ]

    inputs = [fuzzy_get(df.columns, i) for i in inputs]
    outputs = [fuzzy_get(df.columns, o) for o in outputs]

    N_inp = len(inputs)
    N_out = len(outputs)

    inp_outs = inputs + outputs
    if verbosity > 0:
        print("inputs: {}\noutputs: {}\ndelays: {}\n".format(
            inputs, outputs, delays))
    means, stds = np.zeros(len(inp_outs)), np.ones(len(inp_outs))
    if normalize:
        means, stds = df[inp_outs].mean(), df[inp_outs].std()

    if normalize and verbosity > 0:
        print("Input mean values (used to normalize input biases): {}".format(
            means[:N_inp]))
        print(
            "Output mean values (used to normalize output biases): {}".format(
                means[N_inp:]))
    ds = pb.datasets.SupervisedDataSet(N_inp * len(delays), N_out)
    if verbosity > 0:
        print(
            "Dataset dimensions are {}x{}x{} (records x indim x outdim) for {} delays, {} inputs, {} outputs"
            .format(len(df), ds.indim, ds.outdim, len(delays), len(inputs),
                    len(outputs)))
    # FIXME: normalize the whole matrix at once and add it quickly rather than one sample at a time
    # `delays == np.array([0])` is an element-wise comparison, so use array_equal for a scalar truth value
    if np.array_equal(delays, [0]) and not normalize:
        if verbosity > 0:
            print(
                "No tapped delay lines (delays) were requested, so using undelayed features for the dataset."
            )
        assert (df[inputs].values.shape[0] == df[outputs].values.shape[0])
        ds.setField('input', df[inputs].values)
        ds.setField('target', df[outputs].values)
        ds.linkFields(['input', 'target'])
        # for inp, outp in zip(df[inputs].values, df[outputs].values):
        #     ds.appendLinked(inp, outp)
        assert (len(ds['input']) == len(ds['target']))
    else:
        for i, out_vec in enumerate(df[outputs].values):
            if verbosity > 0 and i % 100 == 0:
                print("{}%".format(i / .01 / len(df)))
            elif verbosity > 1:
                print('sample[{i}].target={out_vec}'.format(i=i,
                                                            out_vec=out_vec))
            if i < max(delays):
                continue
            inp_vec = []
            for delay in delays:
                inp_vec += list(
                    (df[inputs].values[i - delay] - means[:N_inp]) /
                    stds[:N_inp])
            ds.addSample(inp_vec, (out_vec - means[N_inp:]) / stds[N_inp:])
    if verbosity > 0:
        print("Dataset now has {} samples".format(len(ds)))
    if normalize:
        return ds, means, stds
    else:
        return ds
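A hedged usage sketch (assumes pybrain is importable as `pb` and numpy/pandas as `np`/`pd`, as in the module above; the column names and the synthetic sine-wave data are made up for illustration):

t = np.arange(200)
df = pd.DataFrame({'load': np.sin(t / 10.), 'temp': np.cos(t / 7.), 'price': np.sin(t / 10. + 1.)})

# 3-tap delay line on 'load' and 'temp' to predict 'price', with normalization
ds, means, stds = dataset_from_dataframe(df, delays=[1, 2, 3],
                                          inputs=['load', 'temp'], outputs=['price'],
                                          normalize=True, verbosity=0)
print(len(ds), ds.indim, ds.outdim)  # roughly 197 samples, indim=6 (2 columns x 3 delays), outdim=1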