Example #1
    def test_join_multiindex(self):
        index1 = MultiIndex.from_arrays([["a", "a", "a", "b", "b", "b"], [1, 2, 3, 1, 2, 3]], names=["first", "second"])

        index2 = MultiIndex.from_arrays([["b", "b", "b", "c", "c", "c"], [1, 2, 3, 1, 2, 3]], names=["first", "second"])

        df1 = DataFrame(data=np.random.randn(6), index=index1, columns=["var X"])
        df2 = DataFrame(data=np.random.randn(6), index=index2, columns=["var Y"])

        df1 = df1.sortlevel(0)
        df2 = df2.sortlevel(0)

        joined = df1.join(df2, how="outer")
        ex_index = index1._tuple_index.union(index2._tuple_index)
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names
        assert_frame_equal(joined, expected)
        self.assertEqual(joined.index.names, index1.names)

        df1 = df1.sortlevel(1)
        df2 = df2.sortlevel(1)

        joined = df1.join(df2, how="outer").sortlevel(0)
        ex_index = index1._tuple_index.union(index2._tuple_index)
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names

        assert_frame_equal(joined, expected)
        self.assertEqual(joined.index.names, index1.names)
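A minimal sketch of what this test asserts, written against a recent pandas (sort_index stands in for the long-deprecated sortlevel; the values are illustrative):

import numpy as np
import pandas as pd

ix1 = pd.MultiIndex.from_arrays([["a", "a", "b"], [1, 2, 1]], names=["first", "second"])
ix2 = pd.MultiIndex.from_arrays([["b", "b", "c"], [1, 2, 1]], names=["first", "second"])
x = pd.DataFrame({"var X": np.random.randn(3)}, index=ix1).sort_index()
y = pd.DataFrame({"var Y": np.random.randn(3)}, index=ix2).sort_index()

# Outer join aligns on the full (first, second) index; the result covers
# the union of both indexes, with NaN where a side has no matching row.
print(x.join(y, how="outer"))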
Example #2
    def test_join_index_mixed(self):
        df1 = DataFrame({"A": 1.0, "B": 2, "C": "foo", "D": True}, index=np.arange(10), columns=["A", "B", "C", "D"])
        self.assertEqual(df1["B"].dtype, np.int64)
        self.assertEqual(df1["D"].dtype, np.bool_)

        df2 = DataFrame(
            {"A": 1.0, "B": 2, "C": "foo", "D": True}, index=np.arange(0, 10, 2), columns=["A", "B", "C", "D"]
        )

        # overlap
        joined = df1.join(df2, lsuffix="_one", rsuffix="_two")
        expected_columns = ["A_one", "B_one", "C_one", "D_one", "A_two", "B_two", "C_two", "D_two"]
        df1.columns = expected_columns[:4]
        df2.columns = expected_columns[4:]
        expected = _join_by_hand(df1, df2)
        assert_frame_equal(joined, expected)

        # no overlapping blocks
        df1 = DataFrame(index=np.arange(10))
        df1["bool"] = True
        df1["string"] = "foo"

        df2 = DataFrame(index=np.arange(5, 15))
        df2["int"] = 1
        df2["float"] = 1.0

        for kind in ["inner", "outer", "left", "right"]:

            joined = df1.join(df2, how=kind)
            expected = _join_by_hand(df1, df2, how=kind)
            assert_frame_equal(joined, expected)

            joined = df2.join(df1, how=kind)
            expected = _join_by_hand(df2, df1, how=kind)
            assert_frame_equal(joined, expected)
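A hedged sketch of the overlap case above: when both frames share column names, join requires lsuffix/rsuffix to disambiguate (frame contents illustrative):

import numpy as np
import pandas as pd

left = pd.DataFrame({"A": 1.0, "B": 2}, index=np.arange(4))
right = pd.DataFrame({"A": 9.0, "B": 8}, index=np.arange(0, 4, 2))

# Without suffixes this join raises a ValueError for the shared names.
joined = left.join(right, lsuffix="_one", rsuffix="_two")
print(list(joined.columns))  # ['A_one', 'B_one', 'A_two', 'B_two']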
Example #3
    def test_join_on(self):
        target = self.target
        source = self.source

        merged = target.join(source, on="C")
        self.assert_series_equal(merged["MergedA"], target["A"], check_names=False)
        self.assert_series_equal(merged["MergedD"], target["D"], check_names=False)

        # join with duplicates (fix regression from DataFrame/Matrix merge)
        df = DataFrame({"key": ["a", "a", "b", "b", "c"]})
        df2 = DataFrame({"value": [0, 1, 2]}, index=["a", "b", "c"])
        joined = df.join(df2, on="key")
        expected = DataFrame({"key": ["a", "a", "b", "b", "c"], "value": [0, 0, 1, 1, 2]})
        assert_frame_equal(joined, expected)

        # Test when some are missing
        df_a = DataFrame([[1], [2], [3]], index=["a", "b", "c"], columns=["one"])
        df_b = DataFrame([["foo"], ["bar"]], index=[1, 2], columns=["two"])
        df_c = DataFrame([[1], [2]], index=[1, 2], columns=["three"])
        joined = df_a.join(df_b, on="one")
        joined = joined.join(df_c, on="one")
        self.assertTrue(np.isnan(joined["two"]["c"]))
        self.assertTrue(np.isnan(joined["three"]["c"]))

        # merge column not present
        self.assertRaises(KeyError, target.join, source, on="E")

        # overlap
        source_copy = source.copy()
        source_copy["A"] = 0
        self.assertRaises(ValueError, target.join, source_copy, on="A")
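A short sketch of the duplicate-key behavior the test relies on (keys and values illustrative):

import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b", "d"]})
lookup = pd.DataFrame({"value": [0, 1, 2]}, index=["a", "b", "c"])

# on="key" looks each row's key up in lookup's index: duplicated keys
# repeat the matched value, and unmatched keys ("d") produce NaN.
print(df.join(lookup, on="key"))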
Example #4
 def test_join_segfault(self):
     # GH #1532
     df1 = DataFrame({"a": [1, 1], "b": [1, 2], "x": [1, 2]})
     df2 = DataFrame({"a": [2, 2], "b": [1, 2], "y": [1, 2]})
     df1 = df1.set_index(["a", "b"])
     df2 = df2.set_index(["a", "b"])
     # it works!
     for how in ["left", "right", "outer"]:
         df1.join(df2, how=how)
Example #5
    def test_join_on_singlekey_list(self):
        df = DataFrame({"key": ["a", "a", "b", "b", "c"]})
        df2 = DataFrame({"value": [0, 1, 2]}, index=["a", "b", "c"])

        # corner cases
        joined = df.join(df2, on=["key"])
        expected = df.join(df2, on="key")

        assert_frame_equal(joined, expected)
Example #6
    def test_join_unconsolidated(self):
        # GH #331
        a = DataFrame(randn(30, 2), columns=["a", "b"])
        c = Series(randn(30))
        a["c"] = c
        d = DataFrame(randn(30, 1), columns=["q"])

        # it works!
        a.join(d)
        d.join(a)
Example #7
    def test_join_on_inner(self):
        df = DataFrame({"key": ["a", "a", "d", "b", "b", "c"]})
        df2 = DataFrame({"value": [0, 1]}, index=["a", "b"])

        joined = df.join(df2, on="key", how="inner")

        expected = df.join(df2, on="key")
        expected = expected[expected["value"].notnull()]
        self.assert_series_equal(joined["key"], expected["key"], check_dtype=False)
        self.assert_series_equal(joined["value"], expected["value"], check_dtype=False)
        self.assert_index_equal(joined.index, expected.index)
Example #8
    def test_join_mixed_non_unique_index(self):
        # GH 12814, unorderable types in py3 with a non-unique index
        df1 = DataFrame({"a": [1, 2, 3, 4]}, index=[1, 2, 3, "a"])
        df2 = DataFrame({"b": [5, 6, 7, 8]}, index=[1, 3, 3, 4])
        result = df1.join(df2)
        expected = DataFrame({"a": [1, 2, 3, 3, 4], "b": [5, np.nan, 6, 7, np.nan]}, index=[1, 2, 3, 3, "a"])
        tm.assert_frame_equal(result, expected)

        df3 = DataFrame({"a": [1, 2, 3, 4]}, index=[1, 2, 2, "a"])
        df4 = DataFrame({"b": [5, 6, 7, 8]}, index=[1, 2, 3, 4])
        result = df3.join(df4)
        expected = DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 6, np.nan]}, index=[1, 2, 2, "a"])
        tm.assert_frame_equal(result, expected)
Example #9
    def test_join_many_non_unique_index(self):
        df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]})
        df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]})
        df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]})
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])

        result = idf1.join([idf2, idf3], how="outer")

        df_partially_merged = merge(df1, df2, on=["a", "b"], how="outer")
        expected = merge(df_partially_merged, df3, on=["a", "b"], how="outer")

        result = result.reset_index()
        expected = expected[result.columns]
        expected["a"] = expected.a.astype("int64")
        expected["b"] = expected.b.astype("int64")
        assert_frame_equal(result, expected)

        df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]})
        df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]})
        df3 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "e": [1000, 2000, 3000]})
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])
        result = idf1.join([idf2, idf3], how="inner")

        df_partially_merged = merge(df1, df2, on=["a", "b"], how="inner")
        expected = merge(df_partially_merged, df3, on=["a", "b"], how="inner")

        result = result.reset_index()

        assert_frame_equal(result, expected.ix[:, result.columns])

        # GH 11519
        df = DataFrame(
            {
                "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
                "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
                "C": np.random.randn(8),
                "D": np.random.randn(8),
            }
        )
        s = Series(np.repeat(np.arange(8), 2), index=np.repeat(np.arange(8), 2), name="TEST")
        inner = df.join(s, how="inner")
        outer = df.join(s, how="outer")
        left = df.join(s, how="left")
        right = df.join(s, how="right")
        assert_frame_equal(inner, outer)
        assert_frame_equal(inner, left)
        assert_frame_equal(inner, right)
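A minimal sketch of the list form of join used above (index tuples illustrative):

import pandas as pd

mi = pd.MultiIndex.from_tuples
a = pd.DataFrame({"c": [10, 20]}, index=mi([(1, 1), (1, 2)], names=["a", "b"]))
b = pd.DataFrame({"d": [100]}, index=mi([(1, 1)], names=["a", "b"]))
c = pd.DataFrame({"e": [1000]}, index=mi([(1, 2)], names=["a", "b"]))

# Passing a list joins all frames on the index in one call, equivalent to
# the chained merge(..., on=["a", "b"]) the test builds by hand.
print(a.join([b, c], how="outer"))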
Example #10
    def test_join_sort(self):
        left = DataFrame({"key": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 4]})
        right = DataFrame({"value2": ["a", "b", "c"]}, index=["bar", "baz", "foo"])

        joined = left.join(right, on="key", sort=True)
        expected = DataFrame(
            {"key": ["bar", "baz", "foo", "foo"], "value": [2, 3, 1, 4], "value2": ["a", "b", "c", "c"]},
            index=[1, 2, 0, 3],
        )
        assert_frame_equal(joined, expected)

        # smoke test
        joined = left.join(right, on="key", sort=False)
        self.assert_index_equal(joined.index, pd.Index(lrange(4)))
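A sketch isolating the sort flag (data illustrative):

import pandas as pd

left = pd.DataFrame({"key": ["foo", "bar"], "value": [1, 2]})
right = pd.DataFrame({"value2": ["a", "b"]}, index=["bar", "foo"])

# sort=True orders the result by the join key; sort=False preserves the
# calling frame's row order.
print(left.join(right, on="key", sort=True).index.tolist())   # [1, 0]
print(left.join(right, on="key", sort=False).index.tolist())  # [0, 1]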
Example #11
    def test_join_inner_multiindex(self):
        key1 = ["bar", "bar", "bar", "foo", "foo", "baz", "baz", "qux", "qux", "snap"]
        key2 = ["two", "one", "three", "one", "two", "one", "two", "two", "three", "one"]

        data = np.random.randn(len(key1))
        data = DataFrame({"key1": key1, "key2": key2, "data": data})

        index = MultiIndex(
            levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
            labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
            names=["first", "second"],
        )
        to_join = DataFrame(np.random.randn(10, 3), index=index, columns=["j_one", "j_two", "j_three"])

        joined = data.join(to_join, on=["key1", "key2"], how="inner")
        expected = merge(
            data, to_join.reset_index(), left_on=["key1", "key2"], right_on=["first", "second"], how="inner", sort=False
        )

        expected2 = merge(to_join, data, right_on=["key1", "key2"], left_index=True, how="inner", sort=False)
        assert_frame_equal(joined, expected2.reindex_like(joined))

        expected2 = merge(to_join, data, right_on=["key1", "key2"], left_index=True, how="inner", sort=False)

        expected = expected.drop(["first", "second"], axis=1)
        expected.index = joined.index

        self.assertTrue(joined.index.is_monotonic)
        assert_frame_equal(joined, expected)
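A sketch of the column-to-MultiIndex alignment at the core of this test (keys illustrative):

import pandas as pd

data = pd.DataFrame({"key1": ["bar", "foo"], "key2": ["one", "two"], "v": [1, 2]})
ix = pd.MultiIndex.from_tuples([("bar", "one"), ("foo", "two")], names=["first", "second"])
to_join = pd.DataFrame({"j_one": [1.0, 2.0]}, index=ix)

# on=[...] matches the listed columns against the right frame's
# MultiIndex levels, level by level.
print(data.join(to_join, on=["key1", "key2"], how="inner"))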
Example #12
 def test_join_on_series_buglet(self):
     # GH #638
     df = DataFrame({"a": [1, 1]})
     ds = Series([2], index=[1], name="b")
     result = df.join(ds, on="a")
     expected = DataFrame({"a": [1, 1], "b": [2, 2]}, index=df.index)
     tm.assert_frame_equal(result, expected)
Example #13
    def to_dataframe(self, selected_fields=None, excluded_fields=None):
        from ..services import locations

        if excluded_fields:
            qs = self.exclude(*excluded_fields)
        else:
            qs = self.exclude(*self.DEFAULT_EXCLUDED_FIELDS)
        if selected_fields:
            qs = self.only(*selected_fields)

        df = DataFrame(list(qs.as_pymongo())).convert_objects(convert_numeric=True)
        if df.empty:
            return df

        # add fields with no values
        fields = filter(
            lambda f: f not in df.columns,
            map(lambda field: field.name, [field for group in self.first().form.groups for field in group.fields]),
        )

        for field in fields:
            df[field] = Series(np.nan, index=df.index)

        # do cleanup of subdocument fields
        for field in self.SUBDOCUMENT_FIELDS:
            temp = df.pop(field).tolist()
            temp2 = [i if not isnull(i) else {} for i in temp]
            df = df.join(DataFrame(temp2))

        rv_map = locations.registered_voters_map()

        df["registered_voters"] = df.location.apply(lambda i: rv_map.get(i, 0))

        return df
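A hedged sketch of the subdocument-cleanup step above (column names illustrative; the real code expands the dict-valued columns listed in SUBDOCUMENT_FIELDS):

import pandas as pd

df = pd.DataFrame({"location": [1, 2], "extra": [{"a": 1}, {"b": 2}]})

# Pop the dict-valued column, expand it into its own frame (dict keys
# become columns), then join it back on the shared row index.
expanded = pd.DataFrame(df.pop("extra").tolist())
print(df.join(expanded))  # columns: location, a, b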
Example #14
    def test_join_aware(self):
        rng = date_range("1/1/2011", periods=10, freq="H")
        ts = Series(np.random.randn(len(rng)), index=rng)

        ts_utc = ts.tz_localize("utc")

        self.assertRaises(Exception, ts.__add__, ts_utc)
        self.assertRaises(Exception, ts_utc.__add__, ts)

        test1 = DataFrame(
            np.zeros((6, 3)), index=date_range("2012-11-15 00:00:00", periods=6, freq="100L", tz="US/Central")
        )
        test2 = DataFrame(
            np.zeros((3, 3)),
            index=date_range("2012-11-15 00:00:00", periods=3, freq="250L", tz="US/Central"),
            columns=lrange(3, 6),
        )

        result = test1.join(test2, how="outer")
        ex_index = test1.index.union(test2.index)

        self.assertTrue(result.index.equals(ex_index))
        self.assertTrue(result.index.tz.zone == "US/Central")

        # non-overlapping
        rng = date_range("2012-11-15 00:00:00", periods=6, freq="H", tz="US/Central")

        rng2 = date_range("2012-11-15 12:00:00", periods=6, freq="H", tz="US/Eastern")

        result = rng.union(rng2)
        self.assertTrue(result.tz.zone == "UTC")
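A sketch of the two timezone behaviors checked here, assuming a recent pandas (lowercase frequency aliases):

import pandas as pd

i1 = pd.date_range("2012-11-15", periods=3, freq="h", tz="US/Central")
i2 = pd.date_range("2012-11-15 12:00", periods=3, freq="h", tz="US/Eastern")

x = pd.DataFrame(0.0, index=i1, columns=["x"])
y = pd.DataFrame(1.0, index=i1[::2], columns=["y"])

# Joining indexes that share a timezone keeps that timezone...
print(x.join(y, how="outer").index.tz)  # US/Central
# ...while unioning indexes with different timezones coerces to UTC.
print(i1.union(i2).tz)  # UTC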
Example #15
def parse_GDS_columns(lines, subsets):
    """Parse a list of lines with column descriptions from the SOFT file
    of a GDS (GEO Dataset).

    :param lines: iterable -- iterator over lines
    :param subsets: dict -- mapping of subset names to GDS subset objects
    :returns: pandas.DataFrame -- column descriptions

    """
    data = []
    index = []
    for line in lines:
        line = line.rstrip()
        if line.startswith("#"):
            tmp = __parse_entry(line)
            data.append(tmp[1])
            index.append(tmp[0])

    df = DataFrame(data, index=index, columns=["description"])
    subset_ids = {"disease_state": {}, "individual": {}}
    for subsetname, subset in subsets.iteritems():
        for expid in subset.metadata["sample_id"][0].split(","):
            if subset.get_type() == "disease state":
                subset_ids["disease_state"][expid] = subset.metadata["description"][0]
            elif subset.get_type() == "individual":
                subset_ids["individual"][expid] = subset.metadata["description"][0]
            else:
                stderr("Unknown subset type: %s for subset %s\n" % (subset.get_type(), subsetname))

    return df.join(DataFrame(subset_ids))
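A minimal sketch of the final join (sample ids and labels illustrative):

import pandas as pd

df = pd.DataFrame({"description": ["x", "y"]}, index=["GSM1", "GSM2"])
subset_ids = {"disease_state": {"GSM1": "control"}, "individual": {"GSM2": "p1"}}

# A dict of {column: {index: value}} builds a frame keyed by sample id,
# which left-joins onto the description frame on the shared index.
print(df.join(pd.DataFrame(subset_ids)))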
Example #16
def foreach_dataframe(self, func, force_dict=False, *args, **kwargs):
    """
    Really just does a foreach over the DataFrames in a panel.
    """
    d = {}
    for key, df in self.iteritems():
        d[key] = func(df, *args, **kwargs)
    container = PanelDict
    for key, result in d.items():
        if isinstance(result, Series):
            container = DataFrame
            break
        if isinstance(result, DataFrame):
            container = Panel
            break

    index = []
    for key, result in d.items():
        if not isinstance(result, (DataFrame, Series)):
            continue
        result.name = key
        ind = result.index
        index = set(index).union(ind)

    if force_dict:
        return PanelDict(d)

    res = DataFrame(None, index=index)
    for key, result in d.items():
        res = res.join(result)

    res = res.sort()
    return res
Example #17
 def dataframe(self):
     tss = self.eval()
     df = DataFrame()
     # FIXME: should do something about potential for dupe names
     for ts, h in zip(tss, self.hidden):
         if not h and not isinstance(ts, str):
             df = df.join(ts, how="outer")
     return df
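The loop above folds series into the frame one join at a time; pd.concat is the usual single-pass equivalent. A sketch (series contents illustrative):

import pandas as pd

tss = [pd.Series([1, 2], index=[0, 1], name="a"),
       pd.Series([3, 4], index=[1, 2], name="b")]

# concat outer-aligns all the indexes in one pass, one column per series.
print(pd.concat(tss, axis=1))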
Example #18
    def test_join_str_datetime(self):
        str_dates = ["20120209", "20120222"]
        dt_dates = [datetime(2012, 2, 9), datetime(2012, 2, 22)]

        A = DataFrame(str_dates, index=lrange(2), columns=["aa"])
        C = DataFrame([[1, 2], [3, 4]], index=str_dates, columns=dt_dates)

        tst = A.join(C, on="aa")

        self.assertEqual(len(tst.columns), 3)
Example #19
def get_results_df(db, rev):
    """Takes a git commit hash and returns a DataFrame of benchmark results.
    """
    bench = DataFrame(db.get_benchmarks())
    results = DataFrame(map(list, db.get_rev_results(rev).values()))

    # Since vbench.db._reg_rev_results returns an unlabeled dict,
    # we have to break encapsulation a bit.
    results.columns = db._results.c.keys()
    results = results.join(bench["name"], on="checksum").set_index("checksum")
    return results
Example #20
File: tank.py Project: cpcloud/span
    def _read_tsq(self, event_name):
        """Read the metadata (TSQ) file of a TDT Tank.

        Returns
        -------
        tsq : pandas.DataFrame
            Recording metadata for the rows matching event_name
        row : pandas.Series
            Boolean mask of the metadata rows that matched
        """
        # create the path name
        tsq_name = self.path + os.extsep + self.header_ext

        # read in the raw data as a numpy rec array and convert to DataFrame
        b = DataFrame(np.fromfile(tsq_name, dtype=self.tsq_dtype))

        # zero based indexing
        b.channel -= 1
        b.channel = b.channel.astype(f8)

        # -1s are invalid
        b.channel[b.channel == -1] = np.nan

        b.type = EventTypes[b.type].reset_index(drop=True)
        b.format = DataTypes[b.format].reset_index(drop=True)

        b.timestamp[np.logical_not(b.timestamp)] = np.nan
        b.fs[np.logical_not(b.fs)] = np.nan

        # fragile subtraction (i.e., what if TDT changes this value?)
        b.size -= 10

        # create some new indices based on the electrode array
        srt = Indexer.sort("channel").reset_index(drop=True)
        shank = srt.shank[b.channel].reset_index(drop=True)

        tsq = b.join(shank)

        # convert the event_name to a number
        name = name2num(event_name)

        # get the row of the metadata where its value equals the name-number
        row = tsq.name == name

        # make sure there's at least one event
        assert row.any(), "no event named %s in tank: %s" % (event_name, self.path)

        # get all the metadata for those events
        tsq = tsq[row]

        # convert to integer where possible
        tsq.channel = tsq.channel.astype(int)
        tsq.shank = tsq.shank.astype(int)

        return tsq, row
Example #21
class InfoTable(DataFrameWidget):
    def __init__(self, samples=None):
        self.initVars()
        super(InfoTable, self).__init__(self.table)

    def initVars(self):
        """Initialises variables."""
        self.columns = ["Plate ID", "Plate Name", "Plate Kea", "Well"]
        self.table = DataFrame(columns=self.columns)

    ########################################################################
    def update(self):
        plateID = self.table["Plate ID"]
        plateName = self.table["Plate Name"]
        plateKea = self.table["Plate Kea"]
        well = self.table["Well"]
        self.table = self.table.drop(labels=["Plate ID", "Plate Name", "Plate Kea", "Well"], axis=1)
        self.table.insert(0, "Plate ID", plateID)
        self.table.insert(1, "Plate Name", plateName)
        self.table.insert(2, "Plate Kea", plateKea)
        self.table.insert(3, "Well", well)
        self.setDataFrame(self.table)

    def append(self, appendage):
        self.table = self.table.append(appendage, ignore_index=True)
        self.update()

    def editPlates(self, edits):
        self.table = self.table.set_index("Plate ID")
        edits = edits.set_index("ID")
        self.table.update(edits)
        self.table = self.table.reset_index()

    def importPlateData(self, plateData, key):
        plateData = plateData.set_index(key)
        self.table = self.table.set_index(key)
        self.table.update(plateData)
        self.table = self.table.reset_index()

    def importSampleData(self, sampleData, tableKey, importKey):
        sampleData[tableKey] = sampleData[importKey]
        sampleData = sampleData.set_index(tableKey)
        self.table = self.table.set_index(tableKey)
        self.table = self.table.join(sampleData, rsuffix="_new")
        self.table = self.table.reset_index()

    def getKeaSexTestingData(self):
        table = self.table[["Plate ID", "Well", "Sample ID", "Plant Alt Names"]]
        table = table.set_index(["Plate ID", "Well"])
        table.rename(columns={"Plant Alt Names": "Plant AltName"}, inplace=True)
        return table
Example #22
def rolling_mean(data, window, min_periods=1, center=False):
    """ Function that computes a rolling mean

    Parameters
    ----------
    data : DataFrame or Series
           If a DataFrame is passed, the rolling_mean is computed for all columns.
    window : int or string
             If int is passed, window is the number of observations used for calculating
             the statistic, as defined by the function pd.rolling_mean()
             If a string is passed, it must be a frequency string, e.g. '90S'. This is
             internally converted into a DateOffset object, representing the window size.
    min_periods : int
                  Minimum number of observations in window required to have a value.

    Returns
    -------
    Series or DataFrame, if more than one column
    """
    if len(data) < 2:
        return data

    def f(x):
        """Function to apply that actually computes the rolling mean"""
        offset = pd.datetools.to_offset(window)

        if center == False:
            dslice = col[x - offset.delta + timedelta(0, 0, 1) : x]
            # adding a microsecond because when slicing with labels start and endpoint
            # are inclusive
        else:
            dslice = col[x - offset.delta / 2 + timedelta(0, 0, 1) : x + pd.datetools.to_offset(window).delta / 2]
        if dslice.size < min_periods:
            return np.nan
        else:
            return dslice.mean()

    data = DataFrame(data.copy())
    dfout = DataFrame()
    if isinstance(window, int):
        dfout = pd.rolling_mean(data, window, min_periods=min_periods, center=center)
    elif isinstance(window, basestring):
        idx = Series(pd.to_datetime(data.index), index=data.index)
        for colname, col in data.iterkv():
            result = idx.apply(f)
            result.name = colname
            dfout = dfout.join(result, how="outer")
    if dfout.columns.size == 1:
        dfout = dfout.ix[:, 0]
    return dfout
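This helper predates pandas' built-in time-based windows; with a recent pandas the frequency-string case is native. A minimal sketch (data illustrative):

import numpy as np
import pandas as pd

idx = pd.date_range("2020-01-01", periods=6, freq="30s")
s = pd.Series(np.arange(6, dtype=float), index=idx)

# Trailing 90-second mean over a datetime index; min_periods=1 mirrors
# the helper's default.
print(s.rolling("90s", min_periods=1).mean())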
Example #23
    def summary_frame(self):
        """
        Creates a DataFrame with all available influence results.

        Returns
        -------
        frame : DataFrame
            A DataFrame with all results.

        Notes
        -----
        The resultant DataFrame contains six variables in addition to the
        DFBETAS. These are:

        * cooks_d : Cook's Distance defined in `Influence.cooks_distance`
        * standard_resid : Standardized residuals defined in
          `Influence.resid_studentized_internal`
        * hat_diag : The diagonal of the projection, or hat, matrix defined in
          `Influence.hat_matrix_diag`
        * dffits_internal : DFFITS statistics using internally Studentized
          residuals defined in `Influence.dffits_internal`
        * dffits : DFFITS statistics using externally Studentized residuals
          defined in `Influence.dffits`
        * student_resid : Externally Studentized residuals defined in
          `Influence.resid_studentized_external`
        """
        from pandas import DataFrame

        # row and column labels
        data = self.results.model.data
        row_labels = data.row_labels
        beta_labels = ["dfb_" + i for i in data.xnames]

        # grab the results
        summary_data = DataFrame(
            dict(
                cooks_d=self.cooks_distance[0],
                standard_resid=self.resid_studentized_internal,
                hat_diag=self.hat_matrix_diag,
                dffits_internal=self.dffits_internal[0],
                student_resid=self.resid_studentized_external,
                dffits=self.dffits[0],
            ),
            index=row_labels,
        )
        # NOTE: if we don't give columns, order of above will be arbitrary
        dfbeta = DataFrame(self.dfbetas, columns=beta_labels, index=row_labels)

        return dfbeta.join(summary_data)
Example #24
    def _make_wiserf_csvs(X, Y):
        infile = tempfile().name
        descfile = tempfile().name

        tmpX = DataFrame(X).add_prefix("x")
        if Y is not None:
            tmpY = DataFrame(Y).add_prefix("y")
            tmpX = tmpX.join(tmpY, how="outer")
        columns = tmpX.columns
        tmpX.to_csv(infile, header=False, index=False)
        with open(descfile, "w") as outfile:
            outfile.write("label-type regression\n")
            outfile.write("num-features %d\n" % WiseRFRegressor._max_fte(X))
            for indx, column in enumerate(columns):
                if column.startswith("y"):
                    outfile.write("class-column %d\n" % indx)
        return [infile, descfile]
Example #25
def compute_one(t, df, **kwargs):
    assert isinstance(t.apply, Reduction)
    grouper = DataFrame(compute(t.grouper, {t.child: df}))
    pregrouped = DataFrame(compute(t.apply.child, {t.child: df}))

    full = grouper.join(pregrouped)
    groups = full.groupby(unpack(grouper.columns))[unpack(pregrouped.columns)]

    g = TableSymbol("group", t.apply.child.schema)
    reduction = t.apply.subs({t.apply.child: g})
    result = compute(reduction, {g: groups})

    if isinstance(result, Series):
        result.name = unpack(pregrouped.columns)
        result = DataFrame(result)

    return result[list(pregrouped.columns)].reset_index()
Example #26
def main():

    # Get links to survey pages
    home_url = "http://www.igmchicago.org/igm-economic-experts-panel"
    home_contents = get_page_contents(home_url)
    urls = re.findall(r"<h2><a href=\"(\S+?results\?SurveyID=\S+?)\"", home_contents)
    urls = ["http://www.igmchicago.org" + url for url in urls]

    # Loop through survey pages
    df = DataFrame()
    question_count = 0
    for url in reversed(urls):

        contents = get_page_contents(url)

        questions = re.findall(r"surveyQuestion\">([\s\S]+?)</h3>", contents)
        responder_list = re.findall(r"\?id=([\d]+)?\">([\s\w.]+?)</a>", contents)

        responses = re.findall(r"<span class=\"option-[\d]+?\">([\s\w.]+?)</span>", contents)
        num_responders = len(responses) // len(questions)  # integer division: responses per question

        # Loop through sub-questions (A, B, etc) within each page
        for i, question in enumerate(questions):
            question = clean_string(question)
            question_count += 1
            print(question)

            # Restrict range to responses for this sub-question
            rng = (i * num_responders, (i + 1) * num_responders)

            # Collect sub-question, its url suffix, and the responses
            prefix = "(%03d" % question_count + ") "
            q_responses = Series(responses[rng[0] : rng[1]], index=responder_list[rng[0] : rng[1]])
            q_url_suffix = re.findall("=(.+)", url)[0]
            q_responses = q_responses.append(Series([q_url_suffix], index=["q_url_suffix"]))
            q_responses.name = prefix + question.strip()

            # Add question data to dataframe
            df = df.join(q_responses, how="outer")

    # Move responder id from index to column, only after all joins are complete
    df["responder_id"] = [pair[0] for pair in df.index]
    df.index = [pair[1] if type(pair) == tuple else pair for pair in df.index]

    # Write to file
    df.to_json("survey_results.json")
Example #27
    def test_pandas_join(self):

        multi_index = MultiIndex.from_product([[1, 2], ["A", "B", "C"]], names=["REALIZATION", "LABEL"])

        data = DataFrame(data=[[1, 2, 3], [2, 4, 6], [4, 8, 12]] * 2, index=multi_index, columns=["C1", "C2", "C3"])

        new_column = DataFrame(data=[4.0, 4.4, 4.8], index=[1, 2, 3], columns=["C4"])
        new_column.index.name = "REALIZATION"

        result = data.join(new_column, how="inner")

        self.assertFloatEqual(result["C4"][1]["A"], 4.0)
        self.assertFloatEqual(result["C4"][1]["B"], 4.0)
        self.assertFloatEqual(result["C4"][1]["C"], 4.0)

        self.assertFloatEqual(result["C4"][2]["A"], 4.4)
        self.assertFloatEqual(result["C4"][2]["B"], 4.4)
        self.assertFloatEqual(result["C4"][2]["C"], 4.4)
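A sketch of the level-based alignment this test exercises (shapes taken from the test, trimmed):

import pandas as pd

mi = pd.MultiIndex.from_product([[1, 2], ["A", "B"]], names=["REALIZATION", "LABEL"])
data = pd.DataFrame({"C1": [1, 2, 3, 4]}, index=mi)
col = pd.DataFrame({"C4": [4.0, 4.4]}, index=pd.Index([1, 2], name="REALIZATION"))

# The right frame's index name matches one level of the left frame's
# MultiIndex, so join aligns on that level and broadcasts C4 across LABEL.
print(data.join(col, how="inner"))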
Example #28
 def merge_variables(self, to_merge):
     """Merges time series variables into new time series variables.
     :param to_merge: dictionary mapping new variable name to list of variables to be merged.
     :return:
     """
     dold = self._data.copy()
     s = Series(data=np.zeros((dold.shape[0],)), index=dold.index).replace(0, np.nan)
     dnew = DataFrame(
         dict([(k, s) for k in to_merge.keys() if len(set(to_merge[k]).intersection(dold.columns)) > 0])
     )
     for newvar in dnew.columns:
         for oldvar in to_merge[newvar]:
             if oldvar in dold.columns:
                 dnew[newvar][dold[oldvar].notnull()] = dold[oldvar][dold[oldvar].notnull()]
                 del dold[oldvar]
     dnew = dnew.join(dold, how="outer")
     dnew.sort(axis=1, inplace=True)
     dnew.sort(axis=0, inplace=True)
     self._data = dnew
Example #29
    def parse_data(self, articles):
        """ Responsible to parse articles in order to extract data.
        Data is extracted as a DataFrame containing the following columns:
        - Article metadata: only the metadata defined in self.metadata_columns are extracted
        - Article tags: all tags are extracted; the names defined in self.tag_columns are used to rename columns
        Data is indexed by a generated ID (integer).

        :param articles: The articles to parse.
        """
        tags = []
        metadata = []
        # TODO: probably not the most efficient way to do this.
        for article in articles:
            if hasattr(article, "tags"):
                # Extracting all tags name from an article and putting them in a Series
                tags.append(
                    Series([tag.name for tag in article.tags], ["tag_" + str(x) for x in range(len(article.tags))])
                )
            # Selecting metadata, only the ones specified in the columns
            metadata.append(
                Series(
                    dict([(i, article.metadata[i]) for i in self.metadata_columns if i in article.metadata]),
                    self.metadata_columns,
                )
            )
        # Creating the tags DataFrame
        tags_data_frame = DataFrame(tags)
        # Renaming columns, leaving the remaining ones with the generated name "tag_"
        # Mapping current column names to the new ones in order to make a replacement
        if self.tag_columns is not None:
            replacement = dict(zip(tags_data_frame.columns.get_values()[: len(self.tag_columns)], self.tag_columns))
            # Inplace means no copy
            tags_data_frame.rename(columns=replacement, inplace=True)
        # Creating the metadata DataFrame
        metadata_data_frame = DataFrame(metadata)
        # Replacing data in column category by its string value
        # TODO: there may be a better way to do this; it seems a bit ugly
        metadata_data_frame["category"] = metadata_data_frame["category"].apply(lambda x: str(x))
        # Merging the two DataFrame together
        self.data = metadata_data_frame.join(tags_data_frame)
Example #30
    def test_join_float64_float32(self):

        a = DataFrame(randn(10, 2), columns=["a", "b"], dtype=np.float64)
        b = DataFrame(randn(10, 1), columns=["c"], dtype=np.float32)
        joined = a.join(b)
        self.assertEqual(joined.dtypes["a"], "float64")
        self.assertEqual(joined.dtypes["b"], "float64")
        self.assertEqual(joined.dtypes["c"], "float32")

        a = np.random.randint(0, 5, 100).astype("int64")
        b = np.random.random(100).astype("float64")
        c = np.random.random(100).astype("float32")
        df = DataFrame({"a": a, "b": b, "c": c})
        xpdf = DataFrame({"a": a, "b": b, "c": c})
        s = DataFrame(np.random.random(5).astype("float32"), columns=["md"])
        rs = df.merge(s, left_on="a", right_index=True)
        self.assertEqual(rs.dtypes["a"], "int64")
        self.assertEqual(rs.dtypes["b"], "float64")
        self.assertEqual(rs.dtypes["c"], "float32")
        self.assertEqual(rs.dtypes["md"], "float32")

        xp = xpdf.merge(s, left_on="a", right_index=True)
        assert_frame_equal(rs, xp)
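A minimal sketch of the dtype-preservation behavior asserted above (random data illustrative):

import numpy as np
import pandas as pd

a = pd.DataFrame(np.random.randn(4, 1).astype("float64"), columns=["a"])
b = pd.DataFrame(np.random.randn(4, 1).astype("float32"), columns=["c"])

# With fully aligned indexes no NaNs are introduced, so each column keeps
# its original dtype: "a" stays float64 and "c" stays float32.
print(a.join(b).dtypes)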