Example No. 1
def correct_commodities():
	src_dir = path.join(data_dir, 'agmarknet/by_commodity')
	init_dir = os.getcwd()
	os.chdir(src_dir)
	folders = glob.glob('*')
	csv_dir = os.getcwd()
	for folder in folders:
		os.chdir(path.join(csv_dir, folder))
		files = glob.glob('*_all.csv')
		for file in files:
			csvr = odo.resource(path.join(csv_dir, folder, file))  # Have to use resource to discover URIs
			num_col = len(odo.discover(csvr)[1].types)	
			ds = None
			if num_col == 9:
				ds = bz.dshape("var * {date: datetime, state: ?string, market: ?string, commodity: ?string, variety: ?string, arrival: ?string, min: ?string, max: ?string, modal: ?string}")
			elif num_col == 10:	
				ds = bz.dshape("var * {date: datetime, state: ?string, market: ?string, commodity: ?string, variety: ?string, arrival: ?string, grade: ?string, min: ?string, max: ?string, modal: ?string}")
			else:
				ds = odo.discover(csvr)

			d = bz.Data(path.join(csv_dir, folder, file), dshape=ds)
			print(d.dshape.measure)
			d = bz.transform(d, commodity=d.commodity.map(lambda x: x.strip(), 'string'))
			d = bz.transform(d, commodity=d.commodity.map(lambda x: spelling_dict[x] if x in spelling_dict else x, 'string'))
			print(d.dshape.measure)
			print(list(bz.compute(d.commodity)))
	os.chdir(init_dir)
Example No. 2
def _ensure_timestamp_field(dataset_expr, deltas):
    """Verify that the baseline and deltas expressions have a timestamp field.

    If there is not a ``TS_FIELD_NAME`` on either of the expressions, it will
    be copied from the ``AD_FIELD_NAME``. If one is provided, then we will
    verify that it is the correct dshape.

    Parameters
    ----------
    dataset_expr : Expr
        The baseline expression.
    deltas : Expr or None
        The deltas expression if any was provided.

    Returns
    -------
    dataset_expr, deltas : Expr
        The new baseline and deltas expressions to use.
    """
    measure = dataset_expr.dshape.measure
    if TS_FIELD_NAME not in measure.names:
        dataset_expr = bz.transform(
            dataset_expr,
            **{TS_FIELD_NAME: dataset_expr[AD_FIELD_NAME]}
        )
        if deltas is not None:
            deltas = bz.transform(
                deltas,
                **{TS_FIELD_NAME: deltas[AD_FIELD_NAME]}
            )
    else:
        _check_datetime_field(TS_FIELD_NAME, measure)

    return dataset_expr, deltas
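For reference, the timestamp fix-up above reduces to a single `bz.transform` call that aliases the `asof_date` column as `timestamp`. A minimal standalone sketch, assuming the zipline constants resolve to 'asof_date' and 'timestamp' and using a toy DataFrame:

import blaze as bz
import pandas as pd

# assumed to match zipline's AD_FIELD_NAME / TS_FIELD_NAME constants
AD_FIELD_NAME = 'asof_date'
TS_FIELD_NAME = 'timestamp'

df = pd.DataFrame({'value': [1.0, 2.0],
                   'asof_date': pd.to_datetime(['2014-01-01', '2014-01-02'])})
expr = bz.data(df)  # bz.Data(df) in older Blaze releases
if TS_FIELD_NAME not in expr.dshape.measure.names:
    # copy the asof_date column into a new timestamp column, as the helper does
    expr = bz.transform(expr, **{TS_FIELD_NAME: expr[AD_FIELD_NAME]})
print(bz.compute(expr))  # value, asof_date and timestamp columns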
Example No. 3
def test_nested_transform():
    d = {'timestamp': [1379613528, 1379620047], 'platform': ["Linux",
                                                             "Windows"]}
    df = DataFrame(d)
    t = symbol('t', discover(df))
    t = transform(t, timestamp=t.timestamp.map(datetime.fromtimestamp,
                                               schema='datetime'))
    expr = transform(t, date=t.timestamp.map(lambda x: x.date(),
                                             schema='datetime'))
    result = compute(expr, df)
    df['timestamp'] = df.timestamp.map(datetime.fromtimestamp)
    df['date'] = df.timestamp.map(lambda x: x.date())
    assert str(result) == str(df)
Example No. 4
def test_nested_transform():
    d = {'timestamp': [1379613528, 1379620047], 'platform': ["Linux",
                                                             "Windows"]}
    df = DataFrame(d)
    t = symbol('t', discover(df))
    t = transform(t, timestamp=t.timestamp.map(datetime.fromtimestamp,
                                               schema='datetime'))
    expr = transform(t, date=t.timestamp.map(lambda x: x.date(),
                                             schema='datetime'))
    result = compute(expr, df)
    df['timestamp'] = df.timestamp.map(datetime.fromtimestamp)
    df['date'] = df.timestamp.map(lambda x: x.date())
    tm.assert_frame_equal(result, df)
Example No. 5
def test_dist(nyc):
    def distance(lat1, lon1, lat2, lon2, R=3959):
        # http://andrew.hedges.name/experiments/haversine/
        dlon = radians(lon2 - lon1)
        dlat = radians(lat2 - lat1)
        a = sin(dlat / 2.0) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2.0) ** 2
        return R * 2 * atan2(sqrt(a), sqrt(1 - a))

    t = symbol('t', discover(nyc))

    filtered = t[
        (t.pickup_latitude >= 40.477399) &
        (t.pickup_latitude <= 40.917577) &
        (t.dropoff_latitude >= 40.477399) &
        (t.dropoff_latitude <= 40.917577) &
        (t.pickup_longitude >= -74.259090) &
        (t.pickup_longitude <= -73.700272) &
        (t.dropoff_longitude >= -74.259090) &
        (t.dropoff_longitude <= -73.700272) &
        (t.passenger_count < 6)
    ]
    dist = distance(filtered.pickup_latitude, filtered.pickup_longitude,
                    filtered.dropoff_latitude, filtered.dropoff_longitude)
    transformed = transform(filtered, dist=dist)
    assert (
        odo(compute(transformed.dist.max(), nyc), float) ==
        odo(compute(transformed.dist, nyc), pd.Series).max().item()
    )
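The `distance` helper in this test is a haversine computation applied to Blaze column expressions. A standalone sketch of the same reference formula on plain floats (function name and coordinates are illustrative, not taken from the test):

from math import atan2, cos, radians, sin, sqrt

def haversine_miles(lat1, lon1, lat2, lon2, R=3959):
    # standard haversine; latitudes are converted to radians before the cosine terms
    dlat = radians(lat2 - lat1)
    dlon = radians(lon2 - lon1)
    a = sin(dlat / 2.0) ** 2 + cos(radians(lat1)) * cos(radians(lat2)) * sin(dlon / 2.0) ** 2
    return R * 2 * atan2(sqrt(a), sqrt(1 - a))

# roughly Times Square to JFK, about 13.5 miles
print(haversine_miles(40.7580, -73.9855, 40.6413, -73.7781))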
Example No. 6
def test_str_does_not_repr():
    # see GH issue #1240.
    d = Data([('aa', 1), ('b', 2)], name="ZZZ",
             dshape='2 * {a: string, b: int64}')
    expr = transform(d, c=d.a.strlen() + d.b)
    assert str(
        expr) == "Merge(_child=ZZZ, children=(ZZZ, label(strlen(_child=ZZZ.a) + ZZZ.b, 'c')))"
Example No. 7
def test_str_does_not_repr():
    # see GH issue #1240.
    d = Data([('aa', 1), ('b', 2)], name="ZZZ",
             dshape='2 * {a: string, b: int64}')
    expr = transform(d, c=d.a.strlen() + d.b)
    assert str(
        expr) == "Merge(_child=ZZZ, children=(ZZZ, label(strlen(_child=ZZZ.a) + ZZZ.b, 'c')))"
Example No. 8
    def plot(self, output_file="termite.html"):
        import blaze as blz
        from odo import into
        import pandas as pd
        import bokeh.plotting as plt
        from bokeh.models.sources import ColumnDataSource

        t = blz.Data(self.input_file)

        MAX = blz.compute(t.weight.max())
        MIN = blz.compute(t.weight.min())

        # Create a size variable to define the size of the circle for the plot.
        t = blz.transform(t, size=blz.sqrt((t.weight - MIN)/(MAX - MIN))*50)

        WORDS = t['word'].distinct()
        WORDS = into(list, WORDS)
        topics = t['topic'].distinct()
        topics = into(list, topics)
        # Convert topics to strings
        TOPICS = [str(i) for i in topics]

        source = into(pd.DataFrame, t)

        plt.output_file(output_file)

        data_source = ColumnDataSource(source)

        p = plt.figure(x_range=TOPICS, y_range=WORDS,
                       plot_width=1000, plot_height=1700,
                       title=self.title)

        p.circle(x="topic", y="word", size="size", fill_alpha=0.6, source=data_source)
        plt.show(p)
Example No. 9
    def plot(self, output_file="termite.html"):
        t = blz.Data(self.input_file)
        df = pd.read_csv(self.input_file)

        MAX = blz.compute(t.weight.max())
        MIN = blz.compute(t.weight.min())

        # Create a size variable to define the size of the circle for the plot.
        t = blz.transform(t, size=blz.sqrt((t.weight - MIN)/(MAX - MIN))*50)

        WORDS = t['word'].distinct()
        WORDS = into(list, WORDS)
        topics = t['topic'].distinct()
        topics = into(list, topics)
        # Convert topics to strings
        TOPICS = [str(i) for i in topics]

        source = into(pd.DataFrame, t)

        plt.output_file(output_file)

        data_source = ColumnDataSource(source)

        p = plt.figure(x_range=TOPICS, y_range=WORDS,
                       plot_width=1000, plot_height=1700,
                       title=self.title)

        p.circle(x="topic", y="word", size="size", fill_alpha=0.6, source=data_source)
        #p.xaxis().major_label_orientation = np.pi/3
        logging.info("generating termite plot for file %s" % self.input_file)
        plt.show(p)
Example No. 10
def test_dist(nyc):
    def distance(lat1, lon1, lat2, lon2, R=3959):
        # http://andrew.hedges.name/experiments/haversine/
        dlon = radians(lon2 - lon1)
        dlat = radians(lat2 - lat1)
        a = sin(dlat / 2.0) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2.0) ** 2
        return R * 2 * atan2(sqrt(a), sqrt(1 - a))

    t = symbol('t', discover(nyc))

    filtered = t[
        (t.pickup_latitude >= 40.477399) &
        (t.pickup_latitude <= 40.917577) &
        (t.dropoff_latitude >= 40.477399) &
        (t.dropoff_latitude <= 40.917577) &
        (t.pickup_longitude >= -74.259090) &
        (t.pickup_longitude <= -73.700272) &
        (t.dropoff_longitude >= -74.259090) &
        (t.dropoff_longitude <= -73.700272) &
        (t.passenger_count < 6)
    ]
    dist = distance(filtered.pickup_latitude, filtered.pickup_longitude,
                    filtered.dropoff_latitude, filtered.dropoff_longitude)
    transformed = transform(filtered, dist=dist)
    assert (
        compute(transformed.dist.max(), nyc, return_type=float) ==
        compute(transformed.dist, nyc, return_type=pd.Series).max()
    )
Example No. 11
    def plot(self):
        t = blz.Data(self.input_file)
        df = pd.read_csv(self.input_file)

        MAX = blz.compute(t.weight.max())
        MIN = blz.compute(t.weight.min())

        # Create a size variable to define the size of the circle for the plot.
        t = blz.transform(t, size=blz.sqrt((t.weight - MIN)/(MAX - MIN))*50)

        WORDS = t['word'].distinct()
        WORDS = into(list, WORDS)
        topics = t['topic'].distinct()
        topics = into(list, topics)
        # Convert topics to strings
        TOPICS = [str(i) for i in topics]

        source = into(pd.DataFrame, t)

        data_source = ColumnDataSource(source)

        p = plt.figure(x_range=TOPICS, y_range=WORDS,
                       plot_width=1000, plot_height=1700, title=None)

        p.circle(x="topic", y="word", size="size", fill_alpha=0.6, source=data_source)
        #p.xaxis().major_label_orientation = np.pi/3
        logging.info("generating termite plot for file %s" % self.input_file)

        script, div = components(p, CDN)

        return script, div
Example No. 12
    def test_novel_deltas_macro(self):
        asset_info = asset_infos[0][0]
        base_dates = pd.DatetimeIndex([
            pd.Timestamp('2014-01-01'),
            pd.Timestamp('2014-01-04')
        ])
        baseline = pd.DataFrame({
            'value': (0, 1),
            'asof_date': base_dates,
            'timestamp': base_dates,
        })
        expr = bz.Data(baseline, name='expr', dshape=self.macro_dshape)
        deltas = bz.Data(baseline, name='deltas', dshape=self.macro_dshape)
        deltas = bz.transform(
            deltas,
            value=deltas.value + 10,
            timestamp=deltas.timestamp + timedelta(days=1),
        )

        nassets = len(asset_info)
        expected_views = keymap(pd.Timestamp, {
            '2014-01-03': repeat_last_axis(
                np.array([10.0, 10.0, 10.0]),
                nassets,
            ),
            '2014-01-06': repeat_last_axis(
                np.array([10.0, 10.0, 11.0]),
                nassets,
            ),
        })

        cal = pd.DatetimeIndex([
            pd.Timestamp('2014-01-01'),
            pd.Timestamp('2014-01-02'),
            pd.Timestamp('2014-01-03'),
            # omitting the 4th and 5th to simulate a weekend
            pd.Timestamp('2014-01-06'),
        ])
        with tmp_asset_finder(equities=asset_info) as finder:
            expected_output = pd.DataFrame(
                list(concatv([10] * nassets, [11] * nassets)),
                index=pd.MultiIndex.from_product((
                    sorted(expected_views.keys()),
                    finder.retrieve_all(asset_info.index),
                )),
                columns=('value',),
            )
            self._run_pipeline(
                expr,
                deltas,
                expected_views,
                expected_output,
                finder,
                calendar=cal,
                start=cal[2],
                end=cal[-1],
                window_length=3,
                compute_fn=op.itemgetter(-1),
            )
Example No. 13
def insert_commodities(df, mongo_str, batch=False):
    ### TODO: load this list from commodities/selected_commodities.json
    selected_commodities = json.load(open(path.join(data_dir, 'commodities', 'selected_commodities.json')))
    to_strip = ''
    init_dir = os.getcwd()
    if batch:
        agmarknet_dir = path.join(data_dir, 'agmarknet', 'by_commodity')
        to_strip = '_stacked_localized.csv'
    else:
        agmarknet_dir = path.join(data_dir, 'agmarknet', 'by_date_and_commodity')
        to_strip = '_localized.csv'
    for cat, comm_list in selected_commodities.items():
        ### NOTE: following is just for testing
        if cat != 'Cereals':
            continue
        cat_folder = name_to_fs(cat).replace('-', '_')
        src_folder = path.join(agmarknet_dir, cat_folder, 'integrated')
        os.chdir(src_folder)
        files = glob.glob('*.csv')
        print(files)
        selected_files = []
        # TODO: test for Coffee and Tea
        print(comm_list)
        for comm in comm_list:
            # need to filter with every commodity name selected :/
            print(name_to_fs(comm))
            selected_files+=list(filter(lambda x: name_to_fs(comm) == x.replace(to_strip, ''), files))
            print(selected_files)
            # map commodity name to filename?
            # filter with comm_list
        for filename in selected_files:
            print(filename)
            coll = filename.replace(to_strip, '')
            coll = coll.replace('-', '.')
            coll = coll.lower()
            coll = 'market.'+coll+'.varieties'
            print(coll)
            print('Inserting {0} into collection \"{1}\"..'.format(filename, coll))
            target = mongo_str+coll
            print(target)
            d = bz.Data(filename)
            #print('shape', d.dshape)
            #nrows = bz.compute(d.count())
            #print('rows', nrows)
            # renaming columns in blaze
            d = d.relabel(min='minPrice', max='maxPrice', modal='modalPrice')#, arrival='commodityTonnage')#, commodity_translated='commodityTranslated')
            # NOTE: assumption of normal distribution since underlying price discovery process not known:
            # -> modal price = mean price
            """
            df.rename(columns={'modal': 'minPrice'}, inplace=True)
            df.rename(columns={'modal': 'maxPrice'}, inplace=True)
            df.rename(columns={'modal': 'modalPrice'}, inplace=True)
            """
            d = bz.transform(d, varietyTonnage=d.commodityTonnage.map(lambda x: np.nan, 'float64'))
            odo.odo(d, target)
        os.chdir(init_dir)
    return
Example No. 14
def test_dplyr_transform():
    df = DataFrame({'timestamp': pd.date_range('now', periods=5)})
    t = symbol('t', discover(df))
    expr = transform(t, date=t.timestamp.map(lambda x: x.date(),
                                             schema='datetime'))
    lhs = compute(expr, df)
    rhs = pd.concat([df, Series(df.timestamp.map(lambda x: x.date()),
                                name='date').to_frame()], axis=1)
    tm.assert_frame_equal(lhs, rhs)
Example No. 15
def test_dplyr_transform():
    df = DataFrame({'timestamp': pd.date_range('now', periods=5)})
    t = symbol('t', discover(df))
    expr = transform(t, date=t.timestamp.map(lambda x: x.date(),
                                             schema='datetime'))
    lhs = compute(expr, df)
    rhs = pd.concat([df, Series(df.timestamp.map(lambda x: x.date()),
                                name='date').to_frame()], axis=1)
    assert str(lhs) == str(rhs)
Example No. 16
    def test_novel_deltas(self, asset_info):
        base_dates = pd.DatetimeIndex([pd.Timestamp("2014-01-01"), pd.Timestamp("2014-01-04")])
        repeated_dates = base_dates.repeat(3)
        baseline = pd.DataFrame(
            {
                "sid": self.sids * 2,
                "value": (0, 1, 2, 1, 2, 3),
                "asof_date": repeated_dates,
                "timestamp": repeated_dates,
            }
        )
        expr = bz.Data(baseline, name="expr", dshape=self.dshape)
        deltas = bz.Data(baseline, name="deltas", dshape=self.dshape)
        deltas = bz.transform(deltas, value=deltas.value + 10, timestamp=deltas.timestamp + timedelta(days=1))
        expected_views = keymap(
            pd.Timestamp,
            {
                "2014-01-03": np.array([[10.0, 11.0, 12.0], [10.0, 11.0, 12.0], [10.0, 11.0, 12.0]]),
                "2014-01-06": np.array([[10.0, 11.0, 12.0], [10.0, 11.0, 12.0], [11.0, 12.0, 13.0]]),
            },
        )
        if len(asset_info) == 4:
            expected_views = valmap(lambda view: np.c_[view, [np.nan, np.nan, np.nan]], expected_views)
            expected_output_buffer = [10, 11, 12, np.nan, 11, 12, 13, np.nan]
        else:
            expected_output_buffer = [10, 11, 12, 11, 12, 13]

        cal = pd.DatetimeIndex(
            [
                pd.Timestamp("2014-01-01"),
                pd.Timestamp("2014-01-02"),
                pd.Timestamp("2014-01-03"),
                # omitting the 4th and 5th to simulate a weekend
                pd.Timestamp("2014-01-06"),
            ]
        )

        with tmp_asset_finder(equities=asset_info) as finder:
            expected_output = pd.DataFrame(
                expected_output_buffer,
                index=pd.MultiIndex.from_product(
                    (sorted(expected_views.keys()), finder.retrieve_all(asset_info.index))
                ),
                columns=("value",),
            )
            self._run_pipeline(
                expr,
                deltas,
                expected_views,
                expected_output,
                finder,
                calendar=cal,
                start=cal[2],
                end=cal[-1],
                window_length=3,
                compute_fn=op.itemgetter(-1),
            )
Example No. 17
    def test_deltas(self, asset_info):
        expr = bz.Data(self.df, name='expr', dshape=self.dshape)
        deltas = bz.Data(self.df, dshape=self.dshape)
        deltas = bz.Data(
            odo(
                bz.transform(
                    deltas,
                    value=deltas.value + 10,
                    timestamp=deltas.timestamp + timedelta(days=1),
                ),
                pd.DataFrame,
            ),
            name='delta',
            dshape=self.dshape,
        )

        expected_views = keymap(pd.Timestamp, {
            '2014-01-02': np.array([[10.0, 11.0, 12.0],
                                    [1.0, 2.0, 3.0]]),
            '2014-01-03': np.array([[11.0, 12.0, 13.0],
                                    [2.0, 3.0, 4.0]]),
            '2014-01-04': np.array([[12.0, 13.0, 14.0],
                                    [12.0, 13.0, 14.0]]),
        })

        nassets = len(asset_info)
        if nassets == 4:
            expected_views = valmap(
                lambda view: np.c_[view, [np.nan, np.nan]],
                expected_views,
            )

        with tmp_asset_finder(equities=asset_info) as finder:
            expected_output = pd.DataFrame(
                list(concatv([12] * nassets, [13] * nassets, [14] * nassets)),
                index=pd.MultiIndex.from_product((
                    sorted(expected_views.keys()),
                    finder.retrieve_all(asset_info.index),
                )),
                columns=('value',),
            )
            dates = self.dates
            dates = dates.insert(len(dates), dates[-1] + timedelta(days=1))
            self._run_pipeline(
                expr,
                deltas,
                expected_views,
                expected_output,
                finder,
                calendar=dates,
                start=dates[1],
                end=dates[-1],
                window_length=2,
                compute_fn=np.nanmax,
            )
Example No. 18
def test_transform_with_common_subexpression():
    df = DataFrame(np.random.rand(5, 2), columns=list('ab'))
    t = symbol('t', discover(df))
    expr = transform(t, c=t.a - t.a % 3, d=t.a % 3)
    result = compute(expr, df)
    expected = pd.concat(
        [df[c] for c in df.columns] +
        [pd.Series(df.a - df.a % 3, name='c'),
         pd.Series(df.a % 3, name='d')],
        axis=1)
    tm.assert_frame_equal(result, expected)
Example No. 19
    def test_deltas(self, asset_info):
        expr = bz.Data(self.df, name='expr', dshape=self.dshape)
        deltas = bz.Data(self.df, dshape=self.dshape)
        deltas = bz.Data(
            odo(
                bz.transform(
                    deltas,
                    value=deltas.value + 10,
                    timestamp=deltas.timestamp + timedelta(days=1),
                ),
                pd.DataFrame,
            ),
            name='delta',
            dshape=self.dshape,
        )

        expected_views = keymap(
            pd.Timestamp, {
                '2014-01-02': np.array([[10.0, 11.0, 12.0], [1.0, 2.0, 3.0]]),
                '2014-01-03': np.array([[11.0, 12.0, 13.0], [2.0, 3.0, 4.0]]),
                '2014-01-04': np.array([[12.0, 13.0, 14.0], [12.0, 13.0, 14.0]
                                        ]),
            })

        nassets = len(asset_info)
        if nassets == 4:
            expected_views = valmap(
                lambda view: np.c_[view, [np.nan, np.nan]],
                expected_views,
            )

        with tmp_asset_finder(equities=asset_info) as finder:
            expected_output = pd.DataFrame(
                list(concatv([12] * nassets, [13] * nassets, [14] * nassets)),
                index=pd.MultiIndex.from_product((
                    sorted(expected_views.keys()),
                    finder.retrieve_all(asset_info.index),
                )),
                columns=('value', ),
            )
            dates = self.dates
            dates = dates.insert(len(dates), dates[-1] + timedelta(days=1))
            self._run_pipeline(
                expr,
                deltas,
                expected_views,
                expected_output,
                finder,
                calendar=dates,
                start=dates[1],
                end=dates[-1],
                window_length=2,
                compute_fn=np.nanmax,
            )
Example No. 20
def test_transform_with_common_subexpression():
    df = DataFrame(np.random.rand(5, 2), columns=list('ab'))
    t = symbol('t', discover(df))
    expr = transform(t, c=t.a - t.a % 3, d=t.a % 3)
    result = compute(expr, df)
    expected = pd.concat(
        [df[c] for c in df.columns] + [
            pd.Series(df.a - df.a % 3, name='c'),
            pd.Series(df.a % 3, name='d')
        ],
        axis=1
    )
    tm.assert_frame_equal(result, expected)
Example No. 21
def test_coerce_on_select(nyc):
    t = symbol('t', discover(nyc))
    t = t[(t.pickup_latitude >= 40.477399) & (t.pickup_latitude <= 40.917577) &
          (t.dropoff_latitude >= 40.477399) & (t.dropoff_latitude <= 40.917577)
          & (t.pickup_longitude >= -74.259090) &
          (t.pickup_longitude <= -73.700272) &
          (t.dropoff_longitude >= -74.259090) &
          (t.dropoff_longitude <= -73.700272) & (t.passenger_count < 6)]
    t = transform(t, pass_count=t.passenger_count + 1)
    result = compute(t.pass_count.coerce('float64'), nyc, return_type='native')
    s = odo(result, pd.Series)
    expected = compute(t, nyc, return_type=pd.DataFrame) \
                      .passenger_count.astype('float64') + 1.0
    assert list(s) == list(expected)
Example No. 22
def termite(modeled_corpus, plot_title="Termite plot", topn=15):
    """A Bokeh Termite Visualization for LDA results analysis.

    Parameters
    ----------
    input_file : str or pandas DataFrame
        A pandas dataframe from a topik model get_termite_data() containing columns "word", "topic" and "weight".
        May also be a string, in which case the string is a filename of a csv file with the above columns.
    title : str
        The title for your termite plot

    Examples
    --------
    >>> plot = termite(test_model_output, plot_title="My model results", topn=5)

    """
    prepared_model_vis_data = _termite_data(modeled_corpus, topn)

    t = blz.Data(prepared_model_vis_data)

    MAX = blz.compute(t.weight.max())
    MIN = blz.compute(t.weight.min())

    # Create a size variable to define the size of the circle for the plot.
    t = blz.transform(t, size=blz.sqrt((t.weight - MIN) / (MAX - MIN)) * 50)

    WORDS = t['word'].distinct()
    WORDS = into(list, WORDS)
    topics = t['topic'].distinct()
    topics = into(list, topics)
    # Convert topics to strings
    TOPICS = [str(i) for i in topics]

    source = into(pd.DataFrame, t)

    data_source = sources.ColumnDataSource(source)

    p = plt.figure(x_range=TOPICS,
                   y_range=WORDS,
                   plot_width=1000,
                   plot_height=1700,
                   title=plot_title)

    p.circle(x="topic",
             y="word",
             size="size",
             fill_alpha=0.6,
             source=data_source)
    return p
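The `size` column built above is a min-max normalisation of the weights followed by a square root and a scale factor of 50. The same computation in plain pandas/NumPy on a toy word/topic/weight table (illustrative values only):

import numpy as np
import pandas as pd

df = pd.DataFrame({'word': ['alpha', 'beta', 'gamma'],
                   'topic': [0, 1, 2],
                   'weight': [0.1, 0.5, 0.9]})

w_min, w_max = df.weight.min(), df.weight.max()
# min-max normalise, take the square root and scale to a 0..50 circle size
df['size'] = np.sqrt((df.weight - w_min) / (w_max - w_min)) * 50
print(df)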
Example No. 23
File: core.py Project: yu68/zipline
def _ad_as_ts(expr):
    """Duplicate the asof_date column as the timestamp column.

    Parameters
    ----------
    expr : Expr or None
        The expression to change the columns of.

    Returns
    -------
    transformed : Expr or None
        The transformed expression or None if ``expr`` is None.
    """
    return (None if expr is None else bz.transform(
        expr, **{TS_FIELD_NAME: expr[AD_FIELD_NAME]}))
Example No. 24
def test_multiple_columns_in_transform(nyc):
    t = symbol('t', discover(nyc))
    t = t[(t.pickup_latitude >= 40.477399) & (t.pickup_latitude <= 40.917577) &
          (t.dropoff_latitude >= 40.477399) & (t.dropoff_latitude <= 40.917577)
          & (t.pickup_longitude >= -74.259090) &
          (t.pickup_longitude <= -73.700272) &
          (t.dropoff_longitude >= -74.259090) &
          (t.dropoff_longitude <= -73.700272) & (t.passenger_count < 6)]
    hours = t.trip_time_in_secs.coerce('float64') / 3600.0
    avg_speed_in_mph = t.trip_distance / hours
    d = transform(t,
                  avg_speed_in_mph=avg_speed_in_mph,
                  mycol=avg_speed_in_mph + 1)
    df = compute(d[d.avg_speed_in_mph <= 200], nyc, return_type=pd.DataFrame)
    assert not df.empty
Example No. 25
def test_multiple_columns_in_transform(nyc):
    t = symbol('t', discover(nyc))
    t = t[
        (t.pickup_latitude >= 40.477399) &
        (t.pickup_latitude <= 40.917577) &
        (t.dropoff_latitude >= 40.477399) &
        (t.dropoff_latitude <= 40.917577) &
        (t.pickup_longitude >= -74.259090) &
        (t.pickup_longitude <= -73.700272) &
        (t.dropoff_longitude >= -74.259090) &
        (t.dropoff_longitude <= -73.700272) &
        (t.passenger_count < 6)
    ]
    hours = t.trip_time_in_secs.coerce('float64') / 3600.0
    avg_speed_in_mph = t.trip_distance / hours
    d = transform(t, avg_speed_in_mph=avg_speed_in_mph, mycol=avg_speed_in_mph + 1)
    df = compute(d[d.avg_speed_in_mph <= 200], nyc, return_type=pd.DataFrame)
    assert not df.empty
Example No. 26
def _ad_as_ts(expr):
    """Duplicate the asof_date column as the timestamp column.

    Parameters
    ----------
    expr : Expr or None
        The expression to change the columns of.

    Returns
    -------
    transformed : Expr or None
        The transformed expression or None if ``expr`` is None.
    """
    return (
        None
        if expr is None else
        bz.transform(expr, **{TS_FIELD_NAME: expr[AD_FIELD_NAME]})
    )
Example No. 27
    def test_deltas_macro(self):
        asset_info = asset_infos[0][0]
        expr = bz.Data(self.macro_df, name='expr', dshape=self.macro_dshape)
        deltas = bz.Data(
            self.macro_df.iloc[:-1],
            name='deltas',
            dshape=self.macro_dshape,
        )
        deltas = bz.transform(
            deltas,
            value=deltas.value + 10,
            timestamp=deltas.timestamp + timedelta(days=1),
        )

        nassets = len(asset_info)
        expected_views = keymap(
            pd.Timestamp, {
                '2014-01-02': repeat_last_axis(np.array([10.0, 1.0]), nassets),
                '2014-01-03': repeat_last_axis(np.array([11.0, 2.0]), nassets),
            })

        with tmp_asset_finder(equities=asset_info) as finder:
            expected_output = pd.DataFrame(
                list(concatv([10] * nassets, [11] * nassets)),
                index=pd.MultiIndex.from_product((
                    sorted(expected_views.keys()),
                    finder.retrieve_all(asset_info.index),
                )),
                columns=('value', ),
            )
            dates = self.dates
            self._run_pipeline(
                expr,
                deltas,
                expected_views,
                expected_output,
                finder,
                calendar=dates,
                start=dates[1],
                end=dates[-1],
                window_length=2,
                compute_fn=np.nanmax,
            )
Example No. 28
def termite(modeled_corpus, plot_title="Termite plot", topn=15):
    """A Bokeh Termite Visualization for LDA results analysis.

    Parameters
    ----------
    input_file : str or pandas DataFrame
        A pandas dataframe from a topik model get_termite_data() containing columns "word", "topic" and "weight".
        May also be a string, in which case the string is a filename of a csv file with the above columns.
    title : str
        The title for your termite plot

    Examples
    --------
    >>> plot = termite(test_model_output, plot_title="My model results", topn=5)

    """
    prepared_model_vis_data = _termite_data(modeled_corpus, topn)

    t = blz.Data(prepared_model_vis_data)

    MAX = blz.compute(t.weight.max())
    MIN = blz.compute(t.weight.min())

    # Create a size variable to define the size of the circle for the plot.
    t = blz.transform(t, size=blz.sqrt((t.weight - MIN)/(MAX - MIN))*50)

    WORDS = t['word'].distinct()
    WORDS = into(list, WORDS)
    topics = t['topic'].distinct()
    topics = into(list, topics)
    # Convert topics to strings
    TOPICS = [str(i) for i in topics]

    source = into(pd.DataFrame, t)

    data_source = sources.ColumnDataSource(source)

    p = plt.figure(x_range=TOPICS, y_range=WORDS,
                   plot_width=1000, plot_height=1700,
                   title=plot_title)

    p.circle(x="topic", y="word", size="size", fill_alpha=0.6, source=data_source)
    return p
Example No. 29
def test_coerce_on_select(nyc):
    t = symbol('t', discover(nyc))
    t = t[
        (t.pickup_latitude >= 40.477399) &
        (t.pickup_latitude <= 40.917577) &
        (t.dropoff_latitude >= 40.477399) &
        (t.dropoff_latitude <= 40.917577) &
        (t.pickup_longitude >= -74.259090) &
        (t.pickup_longitude <= -73.700272) &
        (t.dropoff_longitude >= -74.259090) &
        (t.dropoff_longitude <= -73.700272) &
        (t.passenger_count < 6)
    ]
    t = transform(t, pass_count=t.passenger_count + 1)
    result = compute(t.pass_count.coerce('float64'), nyc, return_type='native')
    s = odo(result, pd.Series)
    expected = compute(t, nyc, return_type=pd.DataFrame) \
                      .passenger_count.astype('float64') + 1.0
    assert list(s) == list(expected)
Example No. 30
    def test_deltas_macro(self):
        asset_info = asset_infos[0][0]
        expr = bz.Data(self.macro_df, name='expr', dshape=self.macro_dshape)
        deltas = bz.Data(
            self.macro_df.iloc[:-1],
            name='deltas',
            dshape=self.macro_dshape,
        )
        deltas = bz.transform(
            deltas,
            value=deltas.value + 10,
            timestamp=deltas.timestamp + timedelta(days=1),
        )

        nassets = len(asset_info)
        expected_views = keymap(pd.Timestamp, {
            '2014-01-02': repeat_last_axis(np.array([10.0, 1.0]), nassets),
            '2014-01-03': repeat_last_axis(np.array([11.0, 2.0]), nassets),
        })

        with tmp_asset_finder(equities=asset_info) as finder:
            expected_output = pd.DataFrame(
                list(concatv([10] * nassets, [11] * nassets)),
                index=pd.MultiIndex.from_product((
                    sorted(expected_views.keys()),
                    finder.retrieve_all(asset_info.index),
                )),
                columns=('value',),
            )
            dates = self.dates
            self._run_pipeline(
                expr,
                deltas,
                expected_views,
                expected_output,
                finder,
                calendar=dates,
                start=dates[1],
                end=dates[-1],
                window_length=2,
                compute_fn=np.nanmax,
            )
Example No. 31
def test_str_does_not_repr():
    # see GH issue #1240.
    d = data(
        [('aa', 1), ('b', 2)],
        name="ZZZ",
        dshape='2 * {a: string, b: int64}',
    )
    expr = transform(d, c=d.a.str.len() + d.b)
    assert (
        normalize(str(expr)) ==
        normalize("""
            Merge(
                args=(ZZZ, label(len(_child=ZZZ.a) + ZZZ.b, 'c')),
                _varargsexpr=VarArgsExpr(
                    _inputs=(ZZZ, label(len(_child=ZZZ.a) + ZZZ.b, 'c'))
                ),
                _shape=(2,)
            )
        """)
    )
Example No. 32
    def test_complex_expr(self):
        expr = bz.data(self.df, dshape=self.dshape)
        # put an Add in the table
        expr_with_add = bz.transform(expr, value=expr.value + 1)

        # Test that we can have complex expressions with no deltas
        from_blaze(
            expr_with_add,
            deltas=None,
            loader=self.garbage_loader,
            missing_values=self.missing_values,
        )

        with self.assertRaises(TypeError):
            from_blaze(
                expr.value + 1,  # put an Add in the column
                deltas=None,
                loader=self.garbage_loader,
                missing_values=self.missing_values,
            )

        deltas = bz.data(
            pd.DataFrame(columns=self.df.columns),
            dshape=self.dshape,
        )
        with self.assertRaises(TypeError):
            from_blaze(
                expr_with_add,
                deltas=deltas,
                loader=self.garbage_loader,
                missing_values=self.missing_values,
            )

        with self.assertRaises(TypeError):
            from_blaze(
                expr.value + 1,
                deltas=deltas,
                loader=self.garbage_loader,
                missing_values=self.missing_values,
            )
Example No. 33
    def test_complex_expr(self):
        expr = bz.data(self.df, dshape=self.dshape)
        # put an Add in the table
        expr_with_add = bz.transform(expr, value=expr.value + 1)

        # Test that we can have complex expressions with no deltas
        from_blaze(
            expr_with_add,
            deltas=None,
            loader=self.garbage_loader,
            missing_values=self.missing_values,
        )

        with self.assertRaises(TypeError):
            from_blaze(
                expr.value + 1,  # put an Add in the column
                deltas=None,
                loader=self.garbage_loader,
                missing_values=self.missing_values,
            )

        deltas = bz.data(
            pd.DataFrame(columns=self.df.columns),
            dshape=self.dshape,
        )
        with self.assertRaises(TypeError):
            from_blaze(
                expr_with_add,
                deltas=deltas,
                loader=self.garbage_loader,
                missing_values=self.missing_values,
            )

        with self.assertRaises(TypeError):
            from_blaze(
                expr.value + 1,
                deltas=deltas,
                loader=self.garbage_loader,
                missing_values=self.missing_values,
            )
Example No. 34
    def test_novel_deltas(self, asset_info):
        base_dates = pd.DatetimeIndex([
            pd.Timestamp('2014-01-01'),
            pd.Timestamp('2014-01-04')
        ])
        repeated_dates = base_dates.repeat(3)
        baseline = pd.DataFrame({
            'sid': self.sids * 2,
            'value': (0., 1., 2., 1., 2., 3.),
            'int_value': (0, 1, 2, 1, 2, 3),
            'asof_date': repeated_dates,
            'timestamp': repeated_dates,
        })
        expr = bz.data(baseline, name='expr', dshape=self.dshape)
        deltas = bz.data(
            odo(
                bz.transform(
                    expr,
                    value=expr.value + 10,
                    timestamp=expr.timestamp + timedelta(days=1),
                ),
                pd.DataFrame,
            ),
            name='delta',
            dshape=self.dshape,
        )
        expected_views = keymap(pd.Timestamp, {
            '2014-01-03': np.array([[10.0, 11.0, 12.0],
                                    [10.0, 11.0, 12.0],
                                    [10.0, 11.0, 12.0]]),
            '2014-01-06': np.array([[10.0, 11.0, 12.0],
                                    [10.0, 11.0, 12.0],
                                    [11.0, 12.0, 13.0]]),
        })
        if len(asset_info) == 4:
            expected_views = valmap(
                lambda view: np.c_[view, [np.nan, np.nan, np.nan]],
                expected_views,
            )
            expected_output_buffer = [10, 11, 12, np.nan, 11, 12, 13, np.nan]
        else:
            expected_output_buffer = [10, 11, 12, 11, 12, 13]

        cal = pd.DatetimeIndex([
            pd.Timestamp('2014-01-01'),
            pd.Timestamp('2014-01-02'),
            pd.Timestamp('2014-01-03'),
            # omitting the 4th and 5th to simulate a weekend
            pd.Timestamp('2014-01-06'),
        ])

        with tmp_asset_finder(equities=asset_info) as finder:
            expected_output = pd.DataFrame(
                expected_output_buffer,
                index=pd.MultiIndex.from_product((
                    sorted(expected_views.keys()),
                    finder.retrieve_all(asset_info.index),
                )),
                columns=('value',),
            )
            self._run_pipeline(
                expr,
                deltas,
                expected_views,
                expected_output,
                finder,
                calendar=cal,
                start=cal[2],
                end=cal[-1],
                window_length=3,
                compute_fn=op.itemgetter(-1),
            )
Example No. 35
# coding: utf-8
import pandas as pd
import blaze as bz
df = pd.DataFrame.from_csv('Tea_stacked_cleaned.csv', index_col=None)
df.head()
d = bz.Data('Tea_stacked_cleaned.csv')
d.shape
d.dshape
d = bz.transform(d, year=d.date.year, month=d.date.month)
d.head(5)
d.tail(5)
import odo
df = odo.odo(d, pd.DataFrame)
df.head(2)
df.groupby(['year', 'month'])
df.groupby(['year', 'month']).arrival
groups = df.groupby(['year', 'month'])
print(groups)
df.groupby(['year', 'month']).arrival.sum()
df.loc[df.year == 15]
df.loc[:, df.year==15]
grouped = df.groupby(['year', 'month'])
for name, group in grouped:
    print(name)
    print(group)
    
get_ipython().magic('save trying.py 1-23')
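The year/month roll-up done here by round-tripping through pandas can also be expressed directly in Blaze with `by`. A minimal sketch on a toy frame with numeric arrivals (values are illustrative):

import blaze as bz
import pandas as pd

toy = pd.DataFrame({'date': pd.to_datetime(['2015-01-03', '2015-01-20', '2015-02-02']),
                    'arrival': [10.0, 5.0, 7.0]})
d = bz.data(toy)  # bz.Data(toy) in older Blaze releases
d = bz.transform(d, year=d.date.year, month=d.date.month)
# group on the derived columns and sum arrivals, mirroring the pandas groupby above
print(bz.compute(bz.by(bz.merge(d.year, d.month), arrival=d.arrival.sum())))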
Example No. 36
def clean(files, mode, commodity_corrections, commodity_name_mapping):
    if mode == 'batch':
        #commodities = list(set(list(map(lambda x: x.split('_')[0], files))))
        #for commodity in commodities:
        for filename in files:
            #commodity = commodity.replace('(', '\(').replace(')', '\)')
            #filename = '{}_stacked.csv'.format(commodity)
            # Have to use resource to discover URIs
            csvr = bz.resource(filename)
            num_col = len(bz.discover(csvr)[1].types)
            print(num_col)
            ds = None
            if num_col == 12:
                ds = bz.dshape("var * {date: ?string, state: ?string, market: ?string, category: ?string, commodity: ?string, variety: ?string, arrival: ?float64, min: ?float64, max: ?float64, modal: ?float64, originState: ?string, originMarket: ?string}")
            elif num_col == 13:
                ds = bz.dshape("var * {date: ?string, state: ?string, market: ?string, category: ?string, commodity: ?string, variety: ?string, arrival: ?float64, grade: ?string,  min: ?float64, max: ?float64, originState: ?string, originMarket: ?string, modal: ?float64 }")
            else:
                ds = bz.discover(csvr)
            d = bz.Data(filename, dshape=ds)
            ### Use Dask if Data loads index_col and/or header:
            # http://stackoverflow.com/questions/32716093/how-do-i-read-tabulator-separated-csv-in-blaze
            ### Fixes issue with two added months on date
            d = bz.transform(d, date=d.date.map(lambda x: datetime.strptime(x, '%d/%m/%Y').date(), 'date'))

            commodity = list(d.commodity.distinct())[0]
            varieties = list(d.variety.distinct())
            ### Removing unnecessary varieties
            ### TODO: test effectiveness
            if [commodity, 'Other'] == varieties:
                d = bz.transform(d, variety=d.variety.map(lambda x: commodity, 'string'))

            ### TODO: merge related filenames
            # if price > 100: divide by 100
            #d = bz.transform(d, min=d.min.map(lambda x: x/100 if x > 100 else x, 'float64'))
            #d = bz.transform(d, max=d.max.map(lambda x: x/100 if x > 100 else x, 'float64'))
            #d = bz.transform(d, max=d.modal.map(lambda x: x/100 if x > 100 else x, 'float64'))

            d = bz.transform(d, commodity=d.commodity.map(lambda x: commodity_corrections[x] if x in commodity_corrections else x, 'string'))
            d = bz.transform(d, commodityTranslated=d.commodity.map(lambda x: commodity_name_mapping[x] if x in commodity_name_mapping else x, 'string'))
            d = bz.transform(d, state=d.state.map(lambda x: state_corrections[x] if x in state_corrections else x, 'string'))
            ### TODO: there must be a better solution for this
            #for key, val in market_corrections.items():
            # d = bz.transform(d, market=d.market.map(lambda x: re.sub(key, val, x), 'string'))
            cleaned_fn = filename.replace('.csv', '_cleaned.csv')# {}_stacked_cleaned.csv'.format(commodity)
            print(cleaned_fn)
            if not path.isdir('cleaned'):
                os.makedirs('cleaned')
            outpath = path.join('cleaned', cleaned_fn)
            print(outpath)
            if path.isfile(outpath):
                os.remove(outpath)
            odo.odo(d, outpath)
            #df_cleaned = odo.odo(d, pd.DataFrame)
            #df_cleaned.to_csv(outpath, index=False)
            # df = pd.DataFrame.from_csv('cleaned/Tea_stacked_cleaned.csv', header=False, index_col=None)
            # df['year'] = df['date'].apply(lambda x: datetime.strptime(x, "%d/%m/%Y").year)
            # df['month'] = df['date'].apply(lambda x: datetime.strptime(x, "%d/%m/%Y").month)

            # os.remove(filename)
    else:
        for filename in files:
            df = pd.DataFrame.from_csv(filename, header=False, index_col=False)
            num_col = len(df.columns)
            if num_col == 10:
                df.columns = ['date', 'state', 'market', 'category', 'commodity', 'variety', 'arrival', 'min', 'max', 'modal']
            elif num_col == 11:
                df.columns = ['date', 'state', 'market', 'category', 'commodity', 'variety', 'arrival', 'grade', 'min', 'max', 'modal']

            df['state'] = df['state'].replace(state_corrections)
            df['commodity'] = df['commodity'].replace(commodity_corrections)
            df['commodity'] = df['commodity'].replace(commodity_name_mapping)
            cleaned_fn = '{}_cleaned.csv'.format(filename.replace('.csv', ''))
            if not path.isdir('cleaned'):
                os.makedirs('cleaned')
            outpath = path.join('cleaned', cleaned_fn)
            if path.isfile(outpath):
                os.remove(outpath)
            df.to_csv(outpath, index=False)
        ### TODO: use blaze to load commodity files, process them and save the cleaned version to disk
    #### "online" --> use pandas
    return
Example No. 37
    def test_novel_deltas_macro(self):
        asset_info = asset_infos[0][0]
        base_dates = pd.DatetimeIndex(
            [pd.Timestamp('2014-01-01'),
             pd.Timestamp('2014-01-04')])
        baseline = pd.DataFrame({
            'value': (0, 1),
            'asof_date': base_dates,
            'timestamp': base_dates,
        })
        expr = bz.Data(baseline, name='expr', dshape=self.macro_dshape)
        deltas = bz.Data(baseline, name='deltas', dshape=self.macro_dshape)
        deltas = bz.transform(
            deltas,
            value=deltas.value + 10,
            timestamp=deltas.timestamp + timedelta(days=1),
        )

        nassets = len(asset_info)
        expected_views = keymap(
            pd.Timestamp, {
                '2014-01-03':
                repeat_last_axis(
                    np.array([10.0, 10.0, 10.0]),
                    nassets,
                ),
                '2014-01-06':
                repeat_last_axis(
                    np.array([10.0, 10.0, 11.0]),
                    nassets,
                ),
            })

        cal = pd.DatetimeIndex([
            pd.Timestamp('2014-01-01'),
            pd.Timestamp('2014-01-02'),
            pd.Timestamp('2014-01-03'),
            # omitting the 4th and 5th to simulate a weekend
            pd.Timestamp('2014-01-06'),
        ])
        with tmp_asset_finder(equities=asset_info) as finder:
            expected_output = pd.DataFrame(
                list(concatv([10] * nassets, [11] * nassets)),
                index=pd.MultiIndex.from_product((
                    sorted(expected_views.keys()),
                    finder.retrieve_all(asset_info.index),
                )),
                columns=('value', ),
            )
            self._run_pipeline(
                expr,
                deltas,
                expected_views,
                expected_output,
                finder,
                calendar=cal,
                start=cal[2],
                end=cal[-1],
                window_length=3,
                compute_fn=op.itemgetter(-1),
            )
Example No. 38
def blaze_tutorial():
    accounts = bl.Symbol('accounts',
                         'var * {id: int, name: string, amount: int}')
    deadbeats = accounts[accounts.amount < 0].name

    list_ = [[1, 'Alice', 100],
             [2, 'Bob', -200],
             [3, 'Charlie', 300],
             [4, 'Denis', 400],
             [5, 'Edith', -500]]

    print(list(bl.compute(deadbeats, list_)))

    df_ = bl.DataFrame(list_, columns=['id', 'name', 'amount'])
    print(bl.compute(deadbeats, df_))
    bl_df_dir = dir(df_)

    df_ = pd.DataFrame(list_, columns=['id', 'name', 'amount'])
    print(df_[df_.amount < 0].name)
    pd_df_dir = dir(df_)

    print(len(bl_df_dir), len(pd_df_dir))
    print(len([d for d in bl_df_dir if d in pd_df_dir]))
    print([d for d in bl_df_dir if d not in pd_df_dir])
    print([d for d in pd_df_dir if d not in bl_df_dir])

    df_ = bl.Data([(1, 'Alice', 100),
                   (2, 'Bob', -200),
                   (3, 'Charlie', 300),
                   (4, 'Denis', 400),
                   (5, 'Edith', -500)],
                  fields=['id', 'name', 'balance'])

    print(repr(df_))
    print(repr(df_[df_.balance < 0]))

    print(repr(df_[df_.balance < 0].name))
    print(list(df_[df_.balance < 0].name))

    iris = bl.Data('examples/iris.csv')
    print(repr(iris))

    iris = bl.Data('sqlite:///examples/iris.db::iris')
    print(repr(iris))

    print(repr(bl.by(iris.species, min=iris.petal_width.min(),
                     max=iris.petal_width.max())))

    result = bl.by(iris.species, min=iris.petal_width.min(),
                   max=iris.petal_width.max())

    print(odo(result, bl.DataFrame))
    print(odo(result, pd.DataFrame))

    ### odo has weird issue with unicode filenames, apparently...
    name = 'output.csv'
    print(odo(result, bl.CSV(name)))

    print(repr(iris.sepal_length.mean()))
    print(repr(bl.mean(iris.sepal_length)))

    print(repr(bl.by(iris.species, shortest=iris.petal_length.min(),
                     longest=iris.petal_length.max(),
                     average=iris.petal_length.mean())))

    print(repr(iris.head()))

    iris = bl.transform(iris, sepal_ratio=iris.sepal_length / iris.sepal_width,
                        petal_ratio=iris.petal_length / iris.petal_width)
    print(repr(iris.head()))

    versicolor = iris[iris.species.like('%versicolor')]
    print(repr(versicolor))

    print((len(versicolor), len(versicolor.fields)))

    print(repr(iris.relabel(petal_length='PETAL-LENGTH',
                            petal_width='PETAL-WIDTH')))

    pd_df = pd.DataFrame({'name': ['Alice', 'Bob', 'Joe', 'Bob'],
                          'amount': [100, 200, 300, 400],
                          'id': [1, 2, 3, 4]})

    # put the `df` DataFrame into a Blaze Data object
    bl_df = bl.DataFrame(pd_df)
    bl_dt = bl.Data(pd_df)

    print(repr(pd_df.amount * 2))
    print(repr(bl_df.amount * 2))
    print(repr(bl_dt.amount * 2))

    print(repr(pd_df[['id', 'amount']]))
    print(repr(bl_df[['id', 'amount']]))
    print(repr(bl_dt[['id', 'amount']]))

    print(repr(pd_df[pd_df.amount > 300]))
    print(repr(bl_df[bl_df.amount > 300]))
    print(repr(bl_dt[bl_dt.amount > 300]))

    print(repr(pd_df.groupby('name').amount.mean()))
    print(repr(pd_df.groupby(['name', 'id']).amount.mean()))

    print(repr(bl_df.groupby('name').amount.mean()))
    print(repr(bl_df.groupby(['name', 'id']).amount.mean()))

    print(repr(bl.by(bl_dt.name, amount=bl_dt.amount.mean())))
    print(repr(bl.by(bl.merge(bl_dt.name, bl_dt.id),
                     amount=bl_dt.amount.mean())))

    #pd.merge(pd_df, pd_df2, on='name')
    #bl.join(bl_dt, bl_dt2, 'name')

    print(repr(pd_df.amount.map(lambda x: x + 1)))
    print(repr(bl_df.amount.map(lambda x: x + 1)))
    print(repr(bl_dt.amount.map(lambda x: x + 1, 'int64')))

    print(repr(pd_df.rename(columns={'name': 'alias', 'amount': 'dollars'})))
    print(repr(bl_df.rename(columns={'name': 'alias', 'amount': 'dollars'})))
    print(repr(bl_dt.relabel(name='alias', amount='dollars')))

    print(repr(pd_df.drop_duplicates()))
    print(repr(bl_df.drop_duplicates()))
    print(repr(bl_dt.distinct()))

    print(repr(pd_df.name.drop_duplicates()))
    print(repr(bl_df.name.drop_duplicates()))
    print(repr(bl_dt.name.distinct()))

    print(repr(pd_df.amount.mean()))
    print(repr(bl_df.amount.mean()))
    print(repr(bl_dt.amount.mean()))

    print(repr(pd_df))
    print(repr(bl_df))
    print(repr(bl_dt))

    print(repr(pd_df.amount.value_counts()), '\n')
    print(repr(bl_df.amount.value_counts()), '\n')
    print(repr(bl_dt.amount.count_values()), '\n')

    print(repr(pd_df.dtypes), '\n')
    print(repr(bl_df.dtypes), '\n')
    print(repr(bl_df.columns), '\n')
    print(repr(bl_dt.dshape), '\n')

    print(repr(pd_df.amount.dtypes), '\n')
    print(repr(bl_df.amount.dtypes), '\n')
    print(repr(bl_dt.amount.dshape), '\n')

    print(type(pd_df), type(bl_df), type(bl_dt), '\n')

    os.remove('output.csv')
    for fn_ in glob.glob('*.csv.gz'):
        os.remove(fn_)

    return
Example No. 39
def compute_stats(data_dir, filename, category, commodity):
    ### PROBLEM: some commodities are still spread over multiple files => solve by gathering stats over multiple files
    csvr = bz.resource(filename)
    ds = bz.discover(csvr)
    d = bz.Data(filename, dshape=ds)
    category = list(d.category)[0]
    #commodity = list(d.commodity)[0]
    d = bz.transform(d, year=d.date.year, month=d.date.month)
    outdir = path.join(data_dir, 'stats', category, commodity)
    if not path.isdir(outdir):
        os.makedirs(outdir)

    nas_by_commodity(d, commodity, outdir)

    ### NOTE: transform: add date and month column
    # problem here is that it's not simply calling a predefined function
    ### NEED PANDAS FOR THIS
    #bz.by(bz.merge(d.year, d.month), commodityTonnage=d.commodityTonnage.sum())
    start = time.time()
    df = odo.odo(d, pd.DataFrame)
    elapsed = time.time() - start
    print('{0} took {1} secs to be loaded into pandas df with odo'.format(commodity, np.round(elapsed, 2)))

    # TODO: read the latest date from config/or use yesterdays date
    date_range = pd.date_range('1/1/2002', time.strftime('%m/%d/%Y'))
    #df = fill_records(df, date_range)

    ### In what particular periods do NAs occur?
    # does data quality improve over time? na ratio per (year, month) => further group by commodity before saving final dataframe
    get_loc_nas(df, commodity, [], ['state'], outdir)
    #get_loc_nas(df, commodity, [], ['state', 'district'], outdir)
    #get_loc_nas(df, commodity, [], ['state', 'district', 'market'], outdir)

    get_loc_nas(df, commodity, ['year', 'month'], ['state'], outdir)
    #get_loc_nas(df, commodity, ['year', 'month'], ['state', 'district'], outdir)
    #get_loc_nas(df, commodity, ['year', 'month'], ['state', 'district', 'market'], outdir)

    get_loc_nas(df, commodity, ['year'], ['state'], outdir)
    #get_loc_nas(df, commodity, ['year'], ['state', 'district'], outdir)
    #get_loc_nas(df, commodity, ['year'], ['state', 'district', 'market'], outdir)

    get_loc_nas(df, commodity, ['month'], ['state'], outdir)
    #get_loc_nas(df, commodity, ['month'], ['state', 'district'], outdir)
    #get_loc_nas(df, commodity, ['month'], ['state', 'district', 'market'], outdir)

    nas_over_time(df, commodity, ['year', 'month'], [], outdir)
    nas_over_time(df, commodity, ['year'], [], outdir)
    nas_over_time(df, commodity, ['month'], [], outdir)

    ### NOTE: taking into account that commodityTonnages are repeated:
    commodityTonnages = d[['date', 'state', 'district', 'market', 'category', 'commodity', 'commodityTonnage', 'year', 'month']].distinct()
    content, commodityTonnage_by_year_month = commodityTonnage_over_time(commodityTonnages, outdir)
    commodityTonnage_by_year(commodityTonnages, outdir)
    if content:
        commodityTonnage_by_month(commodityTonnage_by_year_month, outdir)

    #commodityTonnage_by_level(commodityTonnages, outdir, ['state', 'district', 'market'])
    #commodityTonnage_by_level(commodityTonnages, outdir, ['state', 'district'])
    commodityTonnage_by_level(commodityTonnages, outdir, ['state'])
    """
    commodityTonnage_by_month(commodityTonnage_by_year_month, outdir)
    commodityTonnage_by_year(commodityTonnages, outdir)
    # commodityTonnage by state,( district,) market
    commodityTonnage_by_market(commodityTonnages, outdir)
    commodityTonnage_by_district(commodityTonnages, outdir)
    commodityTonnage_by_state(commodityTonnages, outdir)
    """

    get_coverages(df, date_range, outdir, commodity)
    ### TODO:
    # display some of these statistics in the application
    """
    make nice printout
    to compute commodityTonnage stats by commodity, first have to drop variety column and then remove all duplicates
    df.drop_duplicates() --> df.distinct()
    trick to work with blaze: fill unwanted columns (variety) with unique string/float and call drop duplicates on dataframe

    Questions per commodity (can be answered with single files):
        What is the variety with the most commodityTonnages per commodity? => commodityTonnages per variety
        What markets have the most commodityTonnages per commodity?
        What months have the most commodityTonnages per commodity?
        What weekdays have the most commodityTonnages per commodity?
        Average price per commodity?
            - by year
            - by month (seasonal phenomena?)
            - by year, month
        TODO:
        What is the NA ratio of commodityTonnages? Are there any particular patterns to this ratio?
            - by month? => DONE
            - by market? --> COULD USE THIS TO VISUALIZE DATA QUALITY!
            - by state?

    Questions that can't be answered with single files => use pymongo or odo to load multiple collections into dataframe?
    - commodityTonnage tonnage by commodity by(commodity, ==> commodityTonnage.sum()) (total)
    - commodityTonnage tonnage by category?
    - avg price by commodity by(commodity, ==> modal.mean())
    - minimum modal price by commodity
    - maximum modal price by commodity
    - commodityTonnage tonnages by years (total)
    - commodityTonnage tonnages by months (accumulative)
    - commodityTonnage tonnages by commodity,years (total)
    - commodityTonnage tonnages by commodity, month (accumulative)
    - commodityTonnage tonnages by commodity, month, year

    TODO: test all of these in ipython first
    by(df.commodity, total_commodityTonnages = df.commodityTonnage.sum())

    - NA percentage for tonnages of commodity

    - NA percentage for tonnages of commodity
    """
    return
Example No. 40
    def test_novel_deltas(self, asset_info):
        base_dates = pd.DatetimeIndex(
            [pd.Timestamp('2014-01-01'),
             pd.Timestamp('2014-01-04')])
        repeated_dates = base_dates.repeat(3)
        baseline = pd.DataFrame({
            'sid': self.sids * 2,
            'value': (0, 1, 2, 1, 2, 3),
            'asof_date': repeated_dates,
            'timestamp': repeated_dates,
        })
        expr = bz.Data(baseline, name='expr', dshape=self.dshape)
        deltas = bz.Data(baseline, name='deltas', dshape=self.dshape)
        deltas = bz.transform(
            deltas,
            value=deltas.value + 10,
            timestamp=deltas.timestamp + timedelta(days=1),
        )
        expected_views = keymap(
            pd.Timestamp, {
                '2014-01-03':
                np.array([[10.0, 11.0, 12.0], [10.0, 11.0, 12.0],
                          [10.0, 11.0, 12.0]]),
                '2014-01-06':
                np.array([[10.0, 11.0, 12.0], [10.0, 11.0, 12.0],
                          [11.0, 12.0, 13.0]]),
            })
        if len(asset_info) == 4:
            expected_views = valmap(
                lambda view: np.c_[view, [np.nan, np.nan, np.nan]],
                expected_views,
            )
            expected_output_buffer = [10, 11, 12, np.nan, 11, 12, 13, np.nan]
        else:
            expected_output_buffer = [10, 11, 12, 11, 12, 13]

        cal = pd.DatetimeIndex([
            pd.Timestamp('2014-01-01'),
            pd.Timestamp('2014-01-02'),
            pd.Timestamp('2014-01-03'),
            # omitting the 4th and 5th to simulate a weekend
            pd.Timestamp('2014-01-06'),
        ])

        with tmp_asset_finder(equities=asset_info) as finder:
            expected_output = pd.DataFrame(
                expected_output_buffer,
                index=pd.MultiIndex.from_product((
                    sorted(expected_views.keys()),
                    finder.retrieve_all(asset_info.index),
                )),
                columns=('value', ),
            )
            self._run_pipeline(
                expr,
                deltas,
                expected_views,
                expected_output,
                finder,
                calendar=cal,
                start=cal[2],
                end=cal[-1],
                window_length=3,
                compute_fn=op.itemgetter(-1),
            )