Example #1
def compute_fscore(data_set_df, user_info_df, label="gender", min_not_nan=-1):
    df_filtered, y_v = pc.get_filtered_x_y(data_set_df, user_info_df, label)
    feature_fs = DataFrame(np.zeros(len(df_filtered)), index=df_filtered.index, columns=["importance"])
    i = 0
    for index, values in df_filtered.iterrows():
        try:
            if min_not_nan < 0:
                # f_classif needs a 2-D feature matrix, so reshape the single feature to (n_samples, 1)
                f_score, p_val = f_classif(values.fillna(values.mean()).values[:, np.newaxis], y_v)
                feature_fs.loc[index] = f_score if f_score != np.inf and f_score != -np.inf else np.nan
            else:
                nan_removed = values.dropna()
                if len(nan_removed) < min_not_nan:
                    feature_fs.loc[index] = np.nan
                else:
                    f_score, p_val = f_classif(nan_removed.values[:, np.newaxis], y_v[nan_removed.index.astype(int)])
                    feature_fs.loc[index] = f_score if f_score != np.inf and f_score != -np.inf else np.nan
            if i % 10000 == 0 and i > 0:
                print("\t\t\t%s features are done" % i)
            i += 1
            # print index, feature_fs.loc[index].values[0]
        except ValueError:
            # print "value error occurs during processing %r" % index
            continue
    feature_fs.sort_values("importance", ascending=False, inplace=True, na_position="last")
    return feature_fs
Example #2
    def test_sort_index_multicolumn(self):
        import random

        A = np.arange(5).repeat(20)
        B = np.tile(np.arange(5), 20)
        random.shuffle(A)
        random.shuffle(B)
        frame = DataFrame({"A": A, "B": B, "C": np.random.randn(100)})

        # use .sort_values #9816
        with tm.assert_produces_warning(FutureWarning):
            frame.sort_index(by=["A", "B"])
        result = frame.sort_values(by=["A", "B"])
        indexer = np.lexsort((frame["B"], frame["A"]))
        expected = frame.take(indexer)
        assert_frame_equal(result, expected)

        # use .sort_values #9816
        with tm.assert_produces_warning(FutureWarning):
            frame.sort_index(by=["A", "B"], ascending=False)
        result = frame.sort_values(by=["A", "B"], ascending=False)
        indexer = np.lexsort((frame["B"].rank(ascending=False), frame["A"].rank(ascending=False)))
        expected = frame.take(indexer)
        assert_frame_equal(result, expected)

        # use .sort_values #9816
        with tm.assert_produces_warning(FutureWarning):
            frame.sort_index(by=["B", "A"])
        result = frame.sort_values(by=["B", "A"])
        indexer = np.lexsort((frame["A"], frame["B"]))
        expected = frame.take(indexer)
        assert_frame_equal(result, expected)
Example #3
def compute_mics(data_set_df, user_info_df, label="gender", min_not_nan=-1):
    df_filtered, y_v = pc.get_filtered_x_y(data_set_df, user_info_df, label)
    feature_mics = DataFrame(np.zeros(len(df_filtered)), index=df_filtered.index, columns=["importance"])
    i = 0
    for index, values in df_filtered.iterrows():
        # if len(feature_mics) > 1000:
        #     break
        m = minepy.MINE()
        try:
            if min_not_nan < 0:
                m.compute_score(values, y_v)
                feature_mics.loc[index] = m.mic()
            else:
                nan_removed = values.dropna()
                if len(nan_removed) < min_not_nan:
                    feature_mics.loc[index] = np.nan
                else:
                    m.compute_score(nan_removed, y_v[nan_removed.index.astype(int)])
                    feature_mics.loc[index] = m.mic()
            # if len(feature_mics) > 1000:
            #     break
            # if float(i) % 10000 == 0 and i > 0:
            #     print "\t\t\t%s features are done" % i
            i += 1
            # print index, feature_mics.loc[index].values[0]
        except ValueError:
            # print "value error occurs during processing %r" % index
            continue
    feature_mics.sort_values("importance", ascending=False, inplace=True, na_position="last")
    return feature_mics
Example #4
    def test_sort_datetimes(self):

        # GH 3461, argsort / lexsort differences for a datetime column
        df = DataFrame(
            ["a", "a", "a", "b", "c", "d", "e", "f", "g"], columns=["A"], index=date_range("20130101", periods=9)
        )
        dts = [
            Timestamp(x)
            for x in [
                "2004-02-11",
                "2004-01-21",
                "2004-01-26",
                "2005-09-20",
                "2010-10-04",
                "2009-05-12",
                "2008-11-12",
                "2010-09-28",
                "2010-09-28",
            ]
        ]
        df["B"] = dts[::2] + dts[1::2]
        df["C"] = 2.0
        df["A1"] = 3.0

        df1 = df.sort_values(by="A")
        df2 = df.sort_values(by=["A"])
        assert_frame_equal(df1, df2)

        df1 = df.sort_values(by="B")
        df2 = df.sort_values(by=["B"])
        assert_frame_equal(df1, df2)
Example #5
    def test_stable_descending_multicolumn_sort(self):
        nan = np.nan
        df = DataFrame({"A": [1, 2, nan, 1, 6, 8, 4], "B": [9, nan, 5, 2, 5, 4, 5]})
        # test stable mergesort
        expected = DataFrame({"A": [nan, 8, 6, 4, 2, 1, 1], "B": [5, 4, 5, 5, nan, 2, 9]}, index=[2, 5, 4, 6, 1, 3, 0])
        sorted_df = df.sort_values(["A", "B"], ascending=[0, 1], na_position="first", kind="mergesort")
        assert_frame_equal(sorted_df, expected)

        expected = DataFrame({"A": [nan, 8, 6, 4, 2, 1, 1], "B": [5, 4, 5, 5, nan, 9, 2]}, index=[2, 5, 4, 6, 1, 0, 3])
        sorted_df = df.sort_values(["A", "B"], ascending=[0, 0], na_position="first", kind="mergesort")
        assert_frame_equal(sorted_df, expected)
Example #6
def compute_randomized_lr_score(data_set_df, user_info_df, label="gender"):
    # print "\t\t\tfilling nan values..."
    df_filtered, y_v = pc.get_filtered_x_y(data_set_df, user_info_df, label)
    x = df_filtered.dropna(how="all")
    x_imp = pc.fill_nan_features(x) if x.isnull().any().any() else x.values

    clf = RandomizedLogisticRegression()
    # print "\t\t\tfitting LR model..."
    clf.fit(x_imp.T, y_v)
    feature_importances = DataFrame(clf.scores_, index=df_filtered.index, columns=["importance"])
    feature_importances.sort_values("importance", ascending=False, inplace=True, na_position="last")
    return feature_importances
Example #7
 def getData(self, params):
     top = int(params["top"])
     regex = re.compile(r"^<.*>(\w+.*)</.>")
     df = createDataframe()
     source = [str(regex.findall(line)).strip("[]") for line in df["source"] if line is not None]
     source = dict(Counter(source))
     appSource = source.keys()
     count = source.values()
     tweetSource = DataFrame({"AppSource": appSource, "Count": count})
     tweetSource = tweetSource[["AppSource", "Count"]]
     tweetSource.sort_values(by="Count", ascending=False, inplace=True)
     return tweetSource[:top]
Example #8
    def test_sort_index_duplicates(self):

        # with 9816, these are all translated to .sort_values

        df = DataFrame([lrange(5, 9), lrange(4)], columns=["a", "a", "b", "b"])

        with assertRaisesRegexp(ValueError, "duplicate"):
            # use .sort_values #9816
            with tm.assert_produces_warning(FutureWarning):
                df.sort_index(by="a")
        with assertRaisesRegexp(ValueError, "duplicate"):
            df.sort_values(by="a")

        with assertRaisesRegexp(ValueError, "duplicate"):
            # use .sort_values #9816
            with tm.assert_produces_warning(FutureWarning):
                df.sort_index(by=["a"])
        with assertRaisesRegexp(ValueError, "duplicate"):
            df.sort_values(by=["a"])

        with assertRaisesRegexp(ValueError, "duplicate"):
            # use .sort_values #9816
            with tm.assert_produces_warning(FutureWarning):
                # multi-column 'by' is separate codepath
                df.sort_index(by=["a", "b"])
        with assertRaisesRegexp(ValueError, "duplicate"):
            # multi-column 'by' is separate codepath
            df.sort_values(by=["a", "b"])

        # with multi-index
        # GH4370
        df = DataFrame(np.random.randn(4, 2), columns=MultiIndex.from_tuples([("a", 0), ("a", 1)]))
        with assertRaisesRegexp(ValueError, "levels"):
            # use .sort_values #9816
            with tm.assert_produces_warning(FutureWarning):
                df.sort_index(by="a")
        with assertRaisesRegexp(ValueError, "levels"):
            df.sort_values(by="a")

        # convert tuples to a list of tuples
        # use .sort_values #9816
        with tm.assert_produces_warning(FutureWarning):
            df.sort_index(by=[("a", 1)])
        expected = df.sort_values(by=[("a", 1)])

        # use .sort_values #9816
        with tm.assert_produces_warning(FutureWarning):
            df.sort_index(by=("a", 1))
        result = df.sort_values(by=("a", 1))
        assert_frame_equal(result, expected)
Example #9
def thread_participation_evolution(pm_frame, project, n=2, skip_anon=True, research_only=False):
    """Assembles data on participation to threads in project with n as thresh.
    Returns DataFrame, index, selection and title for data for use
    by stacked bar-plot and heatmap functions."""
    if not research_only:
        thread_type = "all threads"
        title = "Participation per thread in {} (threshold = {})".format(project, n)
    else:
        thread_type = "research threads"
        title = "Participation per thread in {}\
                 (threshold = {}, only research-threads)".format(
            project, n
        )
    data = pm_frame.loc[project][["basic", thread_type]]
    data = data.dropna()
    all_authors = set().union(*data[thread_type, "authors"])
    author_thread = DataFrame(columns=all_authors)
    for author in author_thread.columns:
        author_thread[author] = data[thread_type, "authors"].apply(lambda thread, author=author: author in thread)
    author_thread = author_thread.T
    author_thread = author_thread.sort_values(by=data.index.tolist(), ascending=False)
    author_thread = author_thread.drop("Anonymous") if skip_anon else author_thread
    author_thread.columns.name = "Threads"
    select = author_thread.sum(axis=1) >= n
    return author_thread, data.index, select, title
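
# Hypothetical usage sketch (not part of the example above): the docstring says the returned
# values feed stacked bar-plot and heatmap functions, which are not shown here. `pm_frame`,
# the project label "Polymath 8", and the matplotlib dependency are assumptions for illustration.
import matplotlib.pyplot as plt

author_thread, threads, select, title = thread_participation_evolution(pm_frame, "Polymath 8", n=2)
active = author_thread[select]                  # keep authors who meet the participation threshold
active.T.astype(int).plot(kind="bar", stacked=True, legend=False, title=title)
plt.ylabel("participating authors")
plt.show()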
Example #10
def project_participation_evolution(pm_frame, all_authors, n=2, skip_anon=True, research_only=False):
    """Assembles data on participation to projects with n as thresh.
    Returns DataFrame, index, selection and title for data for use
    by stacked bar-plot and heatmap functions."""
    if not research_only:
        thread_type = "all threads"
        data, _ = get_last(pm_frame, thread_type)
        all_authors = list(all_authors)
        title = "Participation per project in Polymath\
                 (threshold = {})".format(
            n
        )
    else:
        thread_type = "research threads"
        data, _ = get_last(pm_frame, thread_type)
        all_authors = set().union(*data["research threads", "authors (accumulated)"])
        title = "Participation per project in Polymath\
                 (threshold = {}, only research-threads)".format(
            n
        )
    data.index = data.index.droplevel(1)
    author_project = DataFrame(columns=all_authors)
    for author in author_project.columns:
        author_project[author] = data[thread_type, "authors (accumulated)"].apply(
            lambda project, author=author: author in project
        )
    author_project = author_project.T
    author_project = author_project.sort_values(by=data.index.tolist(), ascending=False)
    author_project = author_project.drop("Anonymous") if skip_anon else author_project
    select = author_project.sum(axis=1) >= n
    return author_project, data.index, select, title
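
# Hypothetical usage sketch (not part of the example above): a heatmap view of the returned
# author-by-project matrix. `pm_frame`, `all_authors`, and matplotlib are assumed to be available.
import matplotlib.pyplot as plt

author_project, projects, select, title = project_participation_evolution(pm_frame, all_authors, n=2)
active = author_project[select].astype(int)     # authors participating in at least n projects, as 0/1
fig, ax = plt.subplots()
ax.pcolormesh(active.values, cmap="Greys")      # dark cells mark participation
ax.set_title(title)
plt.show()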
Example #11
    def test_sort_index_different_sortorder(self):
        A = np.arange(20).repeat(5)
        B = np.tile(np.arange(5), 20)

        indexer = np.random.permutation(100)
        A = A.take(indexer)
        B = B.take(indexer)

        df = DataFrame({"A": A, "B": B, "C": np.random.randn(100)})

        # use .sort_values #9816
        with tm.assert_produces_warning(FutureWarning):
            df.sort_index(by=["A", "B"], ascending=[1, 0])
        result = df.sort_values(by=["A", "B"], ascending=[1, 0])

        ex_indexer = np.lexsort((df.B.max() - df.B, df.A))
        expected = df.take(ex_indexer)
        assert_frame_equal(result, expected)

        # test with multiindex, too
        idf = df.set_index(["A", "B"])

        result = idf.sort_index(ascending=[1, 0])
        expected = idf.take(ex_indexer)
        assert_frame_equal(result, expected)

        # also, Series!
        result = idf["C"].sort_index(ascending=[1, 0])
        assert_series_equal(result, expected["C"])
Example #12
    def __init__(self, column, baseline, adjustments=None):
        self.column = column
        self.baseline = baseline.values.astype(self.column.dtype)
        self.dates = baseline.index
        self.assets = baseline.columns

        if adjustments is None:
            adjustments = DataFrame(index=DatetimeIndex([]), columns=ADJUSTMENT_COLUMNS)
        else:
            # Ensure that columns are in the correct order.
            adjustments = adjustments.reindex_axis(ADJUSTMENT_COLUMNS, axis=1)
            adjustments.sort_values(["apply_date", "sid"], inplace=True)

        self.adjustments = adjustments
        self.adjustment_apply_dates = DatetimeIndex(adjustments.apply_date)
        self.adjustment_end_dates = DatetimeIndex(adjustments.end_date)
        self.adjustment_sids = Int64Index(adjustments.sid)
Example #13
    def test_sort_inplace(self):
        frame = DataFrame(np.random.randn(4, 4), index=[1, 2, 3, 4], columns=["A", "B", "C", "D"])

        sorted_df = frame.copy()
        sorted_df.sort_values(by="A", inplace=True)
        expected = frame.sort_values(by="A")
        assert_frame_equal(sorted_df, expected)

        sorted_df = frame.copy()
        sorted_df.sort_values(by="A", ascending=False, inplace=True)
        expected = frame.sort_values(by="A", ascending=False)
        assert_frame_equal(sorted_df, expected)

        sorted_df = frame.copy()
        sorted_df.sort_values(by=["A", "B"], ascending=False, inplace=True)
        expected = frame.sort_values(by=["A", "B"], ascending=False)
        assert_frame_equal(sorted_df, expected)
Example #14
 def __repr__(self):
     repr_table = DataFrame(self.cluster_info, columns=["Cluster_no", "Cluster_size", "Description", "Non_red_size"])
     repr_table = repr_table[["Cluster_no", "Cluster_size", "Non_red_size", "Description"]]
     repr_table = repr_table.sort_values("Non_red_size", ascending=False)[:20]
     repr_string = "Id value for clustering: {}\n\n".format(self.id_val)
     repr_string += repr_table.to_string(index=False)
     if len(self.cluster_info) > 20:
         repr_string += "\n... {} more entries...".format(len(self.cluster_info) - 20)
     return repr_string
Example #15
def get_sorted_data():
    temp_year = start
    while temp_year != end:
        for temp_quarter in [1, 2, 3, 4]:
            item = str(temp_year) + "-" + str(temp_quarter)
            data_total_raw[item] = sql.read_sql(item, engine)
            print(item + " loaded.")
        temp_year += 1
    for item in data_total_raw.keys():
        data_total_sorted[item] = DataFrame.sort_values(data_total_raw[item], axis=0, ascending=False, by="eps")
Example #16
    def test_sort_nat_values_in_int_column(self):

        # GH 14922: "sorting with large float and multiple columns incorrect"

        # cause was that the int64 value NaT was considered as "na". Which is
        # only correct for datetime64 columns.

        int_values = (2, int(NaT))
        float_values = (2.0, -1.797693e308)

        df = DataFrame(dict(int=int_values, float=float_values), columns=["int", "float"])

        df_reversed = DataFrame(
            dict(int=int_values[::-1], float=float_values[::-1]), columns=["int", "float"], index=[1, 0]
        )

        # NaT is not a "na" for int64 columns, so na_position must not
        # influence the result:
        df_sorted = df.sort_values(["int", "float"], na_position="last")
        assert_frame_equal(df_sorted, df_reversed)

        df_sorted = df.sort_values(["int", "float"], na_position="first")
        assert_frame_equal(df_sorted, df_reversed)

        # reverse sorting order
        df_sorted = df.sort_values(["int", "float"], ascending=False)
        assert_frame_equal(df_sorted, df)

        # and now check if NaT is still considered as "na" for datetime64
        # columns:
        df = DataFrame(dict(datetime=[Timestamp("2016-01-01"), NaT], float=float_values), columns=["datetime", "float"])

        df_reversed = DataFrame(
            dict(datetime=[NaT, Timestamp("2016-01-01")], float=float_values[::-1]),
            columns=["datetime", "float"],
            index=[1, 0],
        )

        df_sorted = df.sort_values(["datetime", "float"], na_position="first")
        assert_frame_equal(df_sorted, df_reversed)

        df_sorted = df.sort_values(["datetime", "float"], na_position="last")
        assert_frame_equal(df_sorted, df_reversed)
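
# A quick standalone check of the behaviour tested above (an illustrative sketch, assuming
# pandas is importable): in an int64 column NaT is just the minimum int64 value, not "na".
import pandas as pd

print(pd.NaT.value)                              # -9223372036854775808, the int64 minimum
s = pd.Series([2, pd.NaT.value], dtype="int64")
print(s.sort_values())                           # the NaT integer sorts first; na_position does not apply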
Example #17
def compute_importances(data_set_df, user_info_df, label="gender", split_modal=False, n_est=10, max_depth=None):
    print "\t\t\tfilling nan values..."
    df_filtered, y_v = pc.get_filtered_x_y(data_set_df, user_info_df, label)
    feature_importances = DataFrame(np.zeros(len(df_filtered)), index=df_filtered.index, columns=["importance"])
    modalities = data_set_df.index.levels[0]

    def compute(x):
        x_imp = pc.fill_nan_features(x)
        try:
            m = (
                ExtraTreesClassifier(n_estimators=n_est)
                if max_depth is None
                else ExtraTreesClassifier(n_estimators=n_est, max_depth=max_depth)
            )
            print "\t\t\tfitting RF model..."
            m.fit(x_imp.T, y_v)

            # if len(feature_mics) > 1000:
            #     break
            # print m.feature_importances_
            for order, index in enumerate(x.index):
                feature_importances.loc[index] = m.feature_importances_[order]
                if order % 10000 == 0 and order > 0:
                    print("\t\t\t%s features are done" % order)
        except ValueError as e:
            # print "value error occurs during processing %r" % index
            pass

    if split_modal is True:
        for modal in modalities:
            x = df_filtered.loc[modal].dropna(how="all")
            compute(x)
    else:
        x = df_filtered.dropna(how="all")
        compute(x)

    feature_importances.sort_values("importance", ascending=False, inplace=True, na_position="last")
    return feature_importances
Example #18
    def test_sort_values(self):
        frame = DataFrame([[1, 1, 2], [3, 1, 0], [4, 5, 6]], index=[1, 2, 3], columns=list("ABC"))

        # by column (axis=0)
        sorted_df = frame.sort_values(by="A")
        indexer = frame["A"].argsort().values
        expected = frame.ix[frame.index[indexer]]
        assert_frame_equal(sorted_df, expected)

        sorted_df = frame.sort_values(by="A", ascending=False)
        indexer = indexer[::-1]
        expected = frame.ix[frame.index[indexer]]
        assert_frame_equal(sorted_df, expected)

        sorted_df = frame.sort_values(by="A", ascending=False)
        assert_frame_equal(sorted_df, expected)

        # GH4839
        sorted_df = frame.sort_values(by=["A"], ascending=[False])
        assert_frame_equal(sorted_df, expected)

        # multiple bys
        sorted_df = frame.sort_values(by=["B", "C"])
        expected = frame.loc[[2, 1, 3]]
        assert_frame_equal(sorted_df, expected)

        sorted_df = frame.sort_values(by=["B", "C"], ascending=False)
        assert_frame_equal(sorted_df, expected[::-1])

        sorted_df = frame.sort_values(by=["B", "A"], ascending=[True, False])
        assert_frame_equal(sorted_df, expected)

        self.assertRaises(ValueError, lambda: frame.sort_values(by=["A", "B"], axis=2, inplace=True))

        # by row (axis=1): GH 10806
        sorted_df = frame.sort_values(by=3, axis=1)
        expected = frame
        assert_frame_equal(sorted_df, expected)

        sorted_df = frame.sort_values(by=3, axis=1, ascending=False)
        expected = frame.reindex(columns=["C", "B", "A"])
        assert_frame_equal(sorted_df, expected)

        sorted_df = frame.sort_values(by=[1, 2], axis="columns")
        expected = frame.reindex(columns=["B", "A", "C"])
        assert_frame_equal(sorted_df, expected)

        sorted_df = frame.sort_values(by=[1, 3], axis=1, ascending=[True, False])
        assert_frame_equal(sorted_df, expected)

        sorted_df = frame.sort_values(by=[1, 3], axis=1, ascending=False)
        expected = frame.reindex(columns=["C", "B", "A"])
        assert_frame_equal(sorted_df, expected)

        msg = r"Length of ascending \(5\) != length of by \(2\)"
        with assertRaisesRegexp(ValueError, msg):
            frame.sort_values(by=["A", "B"], axis=0, ascending=[True] * 5)
Example #19
    def test_sort_nan(self):
        # GH3917
        nan = np.nan
        df = DataFrame({"A": [1, 2, nan, 1, 6, 8, 4], "B": [9, nan, 5, 2, 5, 4, 5]})

        # sort one column only
        expected = DataFrame({"A": [nan, 1, 1, 2, 4, 6, 8], "B": [5, 9, 2, nan, 5, 5, 4]}, index=[2, 0, 3, 1, 6, 4, 5])
        sorted_df = df.sort_values(["A"], na_position="first")
        assert_frame_equal(sorted_df, expected)

        expected = DataFrame({"A": [nan, 8, 6, 4, 2, 1, 1], "B": [5, 4, 5, 5, nan, 9, 2]}, index=[2, 5, 4, 6, 1, 0, 3])
        sorted_df = df.sort_values(["A"], na_position="first", ascending=False)
        assert_frame_equal(sorted_df, expected)

        expected = df.reindex(columns=["B", "A"])
        sorted_df = df.sort_values(by=1, axis=1, na_position="first")
        assert_frame_equal(sorted_df, expected)

        # na_position='last', order
        expected = DataFrame({"A": [1, 1, 2, 4, 6, 8, nan], "B": [2, 9, nan, 5, 5, 4, 5]}, index=[3, 0, 1, 6, 4, 5, 2])
        sorted_df = df.sort_values(["A", "B"])
        assert_frame_equal(sorted_df, expected)

        # na_position='first', order
        expected = DataFrame({"A": [nan, 1, 1, 2, 4, 6, 8], "B": [5, 2, 9, nan, 5, 5, 4]}, index=[2, 3, 0, 1, 6, 4, 5])
        sorted_df = df.sort_values(["A", "B"], na_position="first")
        assert_frame_equal(sorted_df, expected)

        # na_position='first', not order
        expected = DataFrame({"A": [nan, 1, 1, 2, 4, 6, 8], "B": [5, 9, 2, nan, 5, 5, 4]}, index=[2, 0, 3, 1, 6, 4, 5])
        sorted_df = df.sort_values(["A", "B"], ascending=[1, 0], na_position="first")
        assert_frame_equal(sorted_df, expected)

        # na_position='last', not order
        expected = DataFrame({"A": [8, 6, 4, 2, 1, 1, nan], "B": [4, 5, 5, nan, 2, 9, 5]}, index=[5, 4, 6, 1, 3, 0, 2])
        sorted_df = df.sort_values(["A", "B"], ascending=[0, 1], na_position="last")
        assert_frame_equal(sorted_df, expected)

        # Test DataFrame with nan label
        df = DataFrame({"A": [1, 2, nan, 1, 6, 8, 4], "B": [9, nan, 5, 2, 5, 4, 5]}, index=[1, 2, 3, 4, 5, 6, nan])

        # NaN label, ascending=True, na_position='last'
        sorted_df = df.sort_index(kind="quicksort", ascending=True, na_position="last")
        expected = DataFrame(
            {"A": [1, 2, nan, 1, 6, 8, 4], "B": [9, nan, 5, 2, 5, 4, 5]}, index=[1, 2, 3, 4, 5, 6, nan]
        )
        assert_frame_equal(sorted_df, expected)

        # NaN label, ascending=True, na_position='first'
        sorted_df = df.sort_index(na_position="first")
        expected = DataFrame(
            {"A": [4, 1, 2, nan, 1, 6, 8], "B": [5, 9, nan, 5, 2, 5, 4]}, index=[nan, 1, 2, 3, 4, 5, 6]
        )
        assert_frame_equal(sorted_df, expected)

        # NaN label, ascending=False, na_position='last'
        sorted_df = df.sort_index(kind="quicksort", ascending=False)
        expected = DataFrame(
            {"A": [8, 6, 1, nan, 2, 1, 4], "B": [4, 5, 2, 5, nan, 9, 5]}, index=[6, 5, 4, 3, 2, 1, nan]
        )
        assert_frame_equal(sorted_df, expected)

        # NaN label, ascending=False, na_position='first'
        sorted_df = df.sort_index(kind="quicksort", ascending=False, na_position="first")
        expected = DataFrame(
            {"A": [4, 8, 6, 1, nan, 2, 1], "B": [5, 4, 5, 2, 5, nan, 9]}, index=[nan, 6, 5, 4, 3, 2, 1]
        )
        assert_frame_equal(sorted_df, expected)
Example #20
def f(x):
    return Series([x.min(), x.max()], index=["min", "max"])


frame.apply(f)

f = lambda x: "%.2f" % x
frame.applymap(f)  # element-wise operation


# sort index and values
obj = Series([4, 7, -3, 2])
obj.sort_values()  # Series.order() was the old (pre-0.17) name for sort_values()
frame = DataFrame(np.random.randn(5, 4), index=["three", "one", "two", "five", "four"], columns=["d", "a", "b", "c"])
frame.sort_index()
frame.sort_index(axis=1)
frame.sort_values(by="a")  # frame.sort_values(by=['a','b'])

obj.rank()
obj.rank(method="first")  # 'average', 'min', 'max', 'first'

frame.rank(axis=1)

# descriptive statistics
df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]], index=list("abcd"), columns=["one", "two"])
df.describe()
# skipna=True, mean, std, var, sum,
# max, min, argmax, argmin, idxmax, idxmin,
# cumsum, cumprod, diff, pct_change

# Correlation and Covariance
df = DataFrame(np.random.randn(100, 3), columns=list("abc"))
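
# The snippet above stops right after creating `df`; a plausible continuation for the
# "Correlation and Covariance" heading, using the standard pandas API, would be:
df.corr()                # pairwise correlation matrix of columns a, b, c
df.cov()                 # pairwise covariance matrix
df["a"].corr(df["b"])    # correlation between two specific columns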
Example #21
obj = Series(range(4), index=["d", "a", "b", "c"])
print(obj.sort_index())

frame = DataFrame(np.arange(8).reshape((2, 4)), index=["three", "one"], columns=["d", "a", "b", "c"])
print(frame.sort_index())
print(frame.sort_index(axis=1))
print(frame.sort_index(axis=1, ascending=False))

obj = Series([4, 7, -3, -2])
print(obj.sort_values())
obj = Series([4, np.nan, 7, np.nan, -3, -2])
print(obj.sort_values())

frame = DataFrame({"b": [4, 7, -3, 2], "a": [0, 1, 0, 1]})
print(frame)
print(frame.sort_values(by="b"))
print(frame.sort_values(by=["a", "b"]))

# rank
obj = Series([7, -5, 7, 4, 2, 0, 4])
print(obj.rank())
print(obj.rank(method="first"))
print(obj.rank(method="max", ascending=False))

frame = DataFrame({"b": [4.3, 7, -3, 2], "a": [0, 1, 0, 1], "c": [-2, 5, 8, -2.5]})
print(frame)
print(frame.rank(axis=1))

"""
duplicate index
"""
obj = Series(range(5), index=["a", "a", "b", "b", "c"])
Example #22
    def test_sort_index(self):
        frame = DataFrame(np.arange(16).reshape(4, 4), index=[1, 2, 3, 4], columns=["A", "B", "C", "D"])

        # axis=0
        unordered = frame.ix[[3, 2, 4, 1]]
        sorted_df = unordered.sort_index(axis=0)
        expected = frame
        assert_frame_equal(sorted_df, expected)

        sorted_df = unordered.sort_index(ascending=False)
        expected = frame[::-1]
        assert_frame_equal(sorted_df, expected)

        # axis=1
        unordered = frame.ix[:, ["D", "B", "C", "A"]]
        sorted_df = unordered.sort_index(axis=1)
        expected = frame
        assert_frame_equal(sorted_df, expected)

        sorted_df = unordered.sort_index(axis=1, ascending=False)
        expected = frame.ix[:, ::-1]
        assert_frame_equal(sorted_df, expected)

        # by column
        sorted_df = frame.sort_values(by="A")
        indexer = frame["A"].argsort().values
        expected = frame.ix[frame.index[indexer]]
        assert_frame_equal(sorted_df, expected)

        sorted_df = frame.sort_values(by="A", ascending=False)
        indexer = indexer[::-1]
        expected = frame.ix[frame.index[indexer]]
        assert_frame_equal(sorted_df, expected)

        sorted_df = frame.sort_values(by="A", ascending=False)
        assert_frame_equal(sorted_df, expected)

        # GH4839
        sorted_df = frame.sort_values(by=["A"], ascending=[False])
        assert_frame_equal(sorted_df, expected)

        # check for now
        sorted_df = frame.sort_values(by="A")
        assert_frame_equal(sorted_df, expected[::-1])
        expected = frame.sort_values(by="A")
        assert_frame_equal(sorted_df, expected)

        expected = frame.sort_values(by=["A", "B"], ascending=False)
        sorted_df = frame.sort_values(by=["A", "B"])
        assert_frame_equal(sorted_df, expected[::-1])

        self.assertRaises(ValueError, lambda: frame.sort_values(by=["A", "B"], axis=2, inplace=True))

        msg = "When sorting by column, axis must be 0"
        with assertRaisesRegexp(ValueError, msg):
            frame.sort_values(by="A", axis=1)

        msg = r"Length of ascending \(5\) != length of by \(2\)"
        with assertRaisesRegexp(ValueError, msg):
            frame.sort_values(by=["A", "B"], axis=0, ascending=[True] * 5)
print "根据索引排序,对于DataFrame可以指定轴。"
obj = Series(range(4), index=["d", "a", "b", "c"])
print obj.sort_index()
frame = DataFrame(np.arange(8).reshape((2, 4)), index=["three", "one"], columns=list("dabc"))
print frame.sort_index()
print frame.sort_index(axis=1)  # axis=1 表示对列进行操作
print frame.sort_index(axis=1, ascending=False)  # 降序
print

print "根据值排序"
obj = Series([4, 7, -3, 2])
print obj.sort_values()  # order已淘汰
print

print "DataFrame指定列排序"
frame = DataFrame({"b": [4, 7, -3, 2], "a": [0, 1, 0, 1]})
print frame
print frame.sort_values(by="b")  # sort_index(by = ...)已淘汰
print frame.sort_values(by=["a", "b"])
print

print "rank,求排名的平均位置(从1开始)"
obj = Series([7, -5, 7, 4, 2, 0, 4])
# 对应排名:-5(1), 0(2), 2(3), 4(4), 4(5), 7(6), 7(7)
print obj.rank()
print obj.rank(method="first")  # 去第一次出现,不求平均值。
print obj.rank(ascending=False, method="max")  # 逆序,并取最大值。所以-5的rank是7.
frame = DataFrame({"b": [4.3, 7, -3, 2], "a": [0, 1, 0, 1], "c": [-2, 5, 8, -2.5]})
print frame
print frame.rank(axis=1)
Example #24
import os
from pandas import Series, DataFrame
import pandas as pd
import concat
import statsmodels.formula.api as sm

os.chdir(r"C:\fantasy\data")
qb = pd.read_csv("RB Data.csv")
df_qb = DataFrame(qb)

# sort by player and week, also let's only keep 2015 for now
df_qb = df_qb[df_qb.year == 2015]
df_qb_sort = df_qb.sort_values(["Player", "week"])

df_qb_sort.head()

points_rolling = pd.concat(
    [
        df_qb_sort.Player,
        df_qb_sort["h/a"],
        df_qb_sort.week,
        df_qb_sort.dk_salary,
        df_qb_sort.total,
        df_qb_sort.dk_points,
        df_qb_sort.groupby("Player").dk_points.shift(),
        df_qb_sort.groupby("Player").dk_points.shift(2),
        df_qb_sort.groupby("Player").dk_points.shift(3),
    ],
    axis=1,
)
list(df_qb)
Example #25
 def test_stable_descending_sort(self):
     # GH #6399
     df = DataFrame([[2, "first"], [2, "second"], [1, "a"], [1, "b"]], columns=["sort_col", "order"])
     sorted_df = df.sort_values(by="sort_col", kind="mergesort", ascending=False)
     assert_frame_equal(df, sorted_df)
Example #26
obj = Series([4, 7, -5, 3])
myprint(obj.values)
myprint(obj[obj > 0])
myprint(obj.isnull())

data = {
    "state": ["Ohino", "Ohino", "Ohino", "Nevada", "Nevada"],
    "year": [2000, 2001, 2002, 2001, 2002],
    "pop": [1.5, 1.7, 3.6, 2.4, 2.9],
}

frame = DataFrame(data)
myprint(frame)
myprint(frame["state"])
myprint(frame.dtypes)
myprint(frame.head(1))

myprint(frame.index)

# transpose rows and columns
myprint(frame.T)

# sort
myprint(frame.sort_values(by="pop"))


# select a single column
myprint(frame["year"])

# slicing
myprint(frame[1:3])
Example #27
                    percent_cover_table,
                    onclause=and_(taxa_tbl_subq_stmt.c.taxa_table_key == percent_cover_table.taxa_percent_cover_fkey),
                )
            )
        if data_type == "individual_table":
            count_tbl_subq_stmt = select([taxa_tbl_subq_stmt, individual_table]).select_from(
                taxa_tbl_subq_stmt.join(
                    individual_table,
                    onclause=and_(taxa_tbl_subq_stmt.c.taxa_table_key == count_table.taxa_individual_fkey),
                )
            )

        tbl_subq_result = conn.execute(count_tbl_subq_stmt)
        tbl_subq_df = DataFrame(tbl_subq_result.fetchall())
        tbl_subq_df.columns = tbl_subq_result.keys()
        tbl_subq_df.sort_values(data_type + "_key", inplace=True)
        print("done")

        tbl_subq_df.replace({"NaN": "NA"}, inplace=True)
        tbl_subq_df.replace({"-99999": "NA"}, inplace=True)
        tbl_subq_df.replace({"-9999": "NA"}, inplace=True)
        tbl_subq_df.replace({-99999: "NA"}, inplace=True)
        tbl_subq_df.replace({-9999: "NA"}, inplace=True)
        tbl_subq_df.fillna("NA", inplace=True)

        # def test_repop():
        try:
            assert (len(tbl_subq_df) == len(original_data)) is True
            metadata_dict["df_length"] = "pass"
            print("dataframe lengths matched: ", z)
        except Exception as e:
Example #28
# get the link to the next page
next = soup.find("a", class_="page-move__target page-move__target--next")
while next is not None:
    next_page = next.get("href")
    r2 = s.get(next_page)
    soup = BeautifulSoup(r2.text, "html.parser")
    omises = soup.find_all("li", class_="js-cassette js-rstlst-cassete list-rst is-blocklink js-bookmark")

    get_omise_info(omises)

    # get the link to the next page
    next = soup.find("a", class_="page-move__target page-move__target--next")


df = DataFrame(
    data,
    columns=[
        "name",
        "point_gokei",
        "point_hiru",
        "point_yoru",
        "hiru_yosan",
        "yoru_yosan",
        "seki",
        "kemuri",
        "koshitsu",
        "comment",
    ],
)
df.sort_values(by="point_gokei", ascending=False)