Example #1
0
def predictionRatio(df, metric="Levenshtein"):
    """Return the fraction of rows whose best fuzzy match is the correct account.

    Every lower-cased value of the second column is scored against every
    lower-cased value of the first column. For each row, the row holding the
    best-scoring first-column value is taken as its match, and the match is
    counted as correct when the second-column value found there equals the
    row's own second-column value.

    Parameters
    ----------
    df : DataFrame
        Frame with exactly two string columns (it is unpacked into two
        column names). The frame itself is not modified.
    metric : str, optional
        "Levenshtein" scores with ``Levenshtein.distance`` and keeps the
        smallest score; any other value scores with
        ``Levenshtein.jaro_winkler`` and keeps the largest score.

    Returns
    -------
    ratio : float
        Number of correct matches divided by the number of rows.
    """
    soc_media_1, soc_media_2 = df.columns

    # Compare case-insensitively on lowered copies, so the caller's frame is
    # left untouched.
    known = df[soc_media_1].str.lower()
    search = df[soc_media_2].str.lower()

    # Distance is "smaller is better", Jaro-Winkler is "larger is better".
    if metric == "Levenshtein":
        score = Levenshtein.distance
    else:
        score = Levenshtein.jaro_winkler

    # Row i / column j holds the score between known[j] and search[i].
    search_res = DataFrame(
        [[score(known_name, search_name) for known_name in known] for search_name in search],
        index=df.index,
        columns=df.index,
    )
    if metric == "Levenshtein":
        indices = search_res.idxmin(axis=1)
    else:
        indices = search_res.idxmax(axis=1)

    # idxmin/idxmax return index labels, hence the label-based lookup.
    df_t = DataFrame(
        {
            "actual": search.reset_index(drop=True),
            "match": search.loc[indices].reset_index(drop=True),
        }
    )

    # Count directly instead of indexing value_counts() by True/False, which
    # fails when one of the two outcomes never occurs.
    is_correct = df_t["actual"] == df_t["match"]
    return float(is_correct.sum()) / len(is_correct)
Example #2
0
    def test_applymap(self):
        """Check ``DataFrame.applymap`` on numeric, tuple-returning, object,
        duplicate-column and datetime-like inputs."""
        doubled = self.frame.applymap(lambda value: value * 2)
        assert_frame_equal(doubled, self.frame * 2)
        # Only checks that mapping ``type`` over the frame does not raise.
        self.frame.applymap(type)

        # GH #465, function returning tuples
        paired = self.frame.applymap(lambda value: (value, value))
        tm.assertIsInstance(paired["A"][0], tuple)

        # GH 2909, object conversion to float in constructor?
        for mixed in ([1, "a"], [1.0, "a"]):
            identity = DataFrame(data=mixed).applymap(lambda value: value)
            self.assertEqual(identity.dtypes[0], object)

        # GH2786
        dup_names = ["a", "a", "a", "a"]
        unique_cols = DataFrame(np.random.random((3, 4)))
        df = unique_cols.copy()
        df.columns = dup_names

        expected = unique_cols.applymap(str)
        expected.columns = dup_names
        assert_frame_equal(df.applymap(str), expected)

        # datetime/timedelta
        df["datetime"] = Timestamp("20130101")
        df["timedelta"] = pd.Timedelta("1 min")
        as_text = df.applymap(str)
        for column in ["datetime", "timedelta"]:
            self.assertEqual(as_text.loc[0, column], str(df.loc[0, column]))
Example #3
0
    def test_to_csv_date_format(self):
        """Write datetime frames with ``to_csv(date_format=...)`` for each
        engine and compare what ``read_csv`` reads back."""
        from pandas import to_datetime

        def compact_int(stamp):
            return int(stamp.strftime("%Y%m%d"))

        def compact_text(stamp):
            return stamp.strftime("%Y%m%d")

        def dashed_text(stamp):
            return stamp.strftime("%Y-%m-%d")

        with ensure_clean("__tmp_to_csv_date_format__") as path:

            def dump(frame, date_format, engine):
                # The "python" engine is expected to emit a FutureWarning.
                expected_warning = FutureWarning if engine == "python" else None
                with tm.assert_produces_warning(expected_warning, check_stacklevel=False):
                    frame.to_csv(path, date_format=date_format, engine=engine)

            for engine in [None, "python"]:
                dt_index = self.tsframe.index
                datetime_frame = DataFrame({"A": dt_index, "B": dt_index.shift(1)}, index=dt_index)

                # Check that the data was put in the specified format
                dump(datetime_frame, "%Y%m%d", engine)
                expected_int = datetime_frame.applymap(compact_int)
                expected_int.index = expected_int.index.map(compact_int)
                assert_frame_equal(read_csv(path, index_col=0), expected_int)

                # Check that the data was put in the specified format
                dump(datetime_frame, "%Y-%m-%d", engine)
                expected_str = datetime_frame.applymap(dashed_text)
                expected_str.index = expected_str.index.map(dashed_text)
                assert_frame_equal(read_csv(path, index_col=0), expected_str)

                # Check that columns get converted
                transposed = datetime_frame.T
                dump(transposed, "%Y%m%d", engine)
                expected_cols = transposed.applymap(compact_int)
                # Columns don't get converted to ints by read_csv
                expected_cols.columns = expected_cols.columns.map(compact_text)
                assert_frame_equal(read_csv(path, index_col=0), expected_cols)

                # test NaTs
                nat_index = to_datetime(["NaT"] * 10 + ["2000-01-01", "1/1/2000", "1-1-2000"])
                nat_frame = DataFrame({"A": nat_index}, index=nat_index)
                dump(nat_frame, "%Y-%m-%d", engine)
                assert_frame_equal(read_csv(path, parse_dates=[0, 1], index_col=0), nat_frame)
def gs(str, list):
    """Show quick-look plots for selected columns of a CSV file.

    Parameters
    ----------
    str : path or buffer
        Passed to ``pd.read_csv`` as the file to read. (The name shadows the
        builtin; it is kept because it is part of the signature.)
    list : list
        Column selection passed to ``pd.read_csv`` as ``usecols``. (Shadows
        the builtin for the same reason.)

    The function draws, in order: a scatter plot of the first two requested
    columns, histograms and a box plot of the frame, and a bar chart of value
    counts for each column that contains a value ``np.isreal`` rejects. It
    also prints the frame of ``np.isreal`` results. Nothing is returned.
    """
    columns = list
    frame = pd.read_csv(str, usecols=columns)

    # The plots are best-effort: a failure (for instance fewer than two
    # requested columns) must not stop the remaining output. Only ordinary
    # errors are ignored, so Ctrl-C and interpreter exit still propagate.
    try:
        plt.scatter(frame[columns[0]], frame[columns[1]], color="red")
        plt.show()
    except Exception:
        pass

    try:
        frame.hist()
        plt.show()

        frame.plot(kind="box", by=columns)
        plt.show()
    except Exception:
        pass

    is_real = frame.applymap(np.isreal)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(is_real)

    # Bar-chart every column holding at least one non-real value. Looking
    # columns up one at a time avoids the KeyError the joined column names
    # caused whenever more than one column was requested.
    for column in is_real.columns:
        if not is_real[column].all():
            frame[column].value_counts().plot(kind="bar")
            plt.show()
Example #5
0
    def test_to_csv_date_format(self):
        """Write datetime frames with ``to_csv(date_format=...)`` and compare
        what ``read_csv`` reads back."""

        def ymd_int(stamp):
            return int(stamp.strftime("%Y%m%d"))

        def ymd_text(stamp):
            return stamp.strftime("%Y%m%d")

        def iso_text(stamp):
            return stamp.strftime("%Y-%m-%d")

        with ensure_clean("__tmp_to_csv_date_format__") as path:
            stamps = self.tsframe.index
            datetime_frame = DataFrame({"A": stamps, "B": stamps.shift(1)}, index=stamps)

            # Check that the data was put in the specified format
            datetime_frame.to_csv(path, date_format="%Y%m%d")
            expected_int = datetime_frame.applymap(ymd_int)
            expected_int.index = expected_int.index.map(ymd_int)
            assert_frame_equal(read_csv(path, index_col=0), expected_int)

            # Check that the data was put in the specified format
            datetime_frame.to_csv(path, date_format="%Y-%m-%d")
            expected_str = datetime_frame.applymap(iso_text)
            expected_str.index = expected_str.index.map(iso_text)
            assert_frame_equal(read_csv(path, index_col=0), expected_str)

            # Check that columns get converted
            transposed = datetime_frame.T
            transposed.to_csv(path, date_format="%Y%m%d")
            expected_cols = transposed.applymap(ymd_int)
            # Columns don't get converted to ints by read_csv
            expected_cols.columns = expected_cols.columns.map(ymd_text)
            assert_frame_equal(read_csv(path, index_col=0), expected_cols)

            # test NaTs
            nat_index = to_datetime(["NaT"] * 10 + ["2000-01-01", "1/1/2000", "1-1-2000"])
            nat_frame = DataFrame({"A": nat_index}, index=nat_index)
            nat_frame.to_csv(path, date_format="%Y-%m-%d")
            assert_frame_equal(read_csv(path, parse_dates=[0, 1], index_col=0), nat_frame)
Example #6
0
def test():
    """Print apply / applymap / map demonstrations on a random 4x3 frame.

    Returns
    -------
    frame : DataFrame
        The randomly filled frame the demonstrations were run on.
    """

    def two_decimals(value):
        return "%.2f" % value

    def spread(values):
        return values.max() - values.min()

    frame = DataFrame(
        numpy.random.randn(4, 3),
        index=["Utah", "Ohio", "Texas", "Oregon"],
        columns=list("bde"),
    )

    # http://stackoverflow.com/questions/19798153/difference-between-map-applymap-and-apply-methods-in-pandas/19798528#19798528
    # apply: once per column; applymap: once per cell; map: once per Series element.
    per_column_spread = frame.apply(spread)
    print(per_column_spread)
    print("")
    print(frame.applymap(two_decimals))
    print("")
    print(frame.apply(spread).map(two_decimals))

    return frame
Example #7
0
    def compare_panel_lengths(self, panels, reference_label="GAL_Completo"):
        """Tabulate how many AIMs each panel has, relative to a reference panel.

        Parameters
        ----------
        panels : iterable
            Objects exposing ``rs_ids`` (a sized collection) and ``label``.
        reference_label : str, optional
            Label handed to ``Panel`` to build the reference panel whose
            ``rs_ids`` count is the denominator of the ratio.

        Returns
        -------
        comparison : DataFrame
            One row per panel, indexed by panel label (index named "Panel",
            sorted in descending order), with columns "AIMs count" (passed
            through ``thousands_separator``) and "Ratio" (count divided by
            the reference count, rounded to two decimals).
        """
        ref_count = len(Panel(reference_label).rs_ids)

        # Build the frame in one step: appending a row per panel copies the
        # whole frame on every iteration, relies on DataFrame.append (removed
        # in pandas 2.0), and leaves no "AIMs count" column for empty input.
        panels = list(panels)
        comparison = DataFrame(
            {"AIMs count": [len(panel.rs_ids) for panel in panels]},
            index=[panel.label for panel in panels],
        )

        comparison.sort_index(ascending=False, inplace=True)
        # The ratio has to be computed before the counts are turned into
        # formatted text below.
        comparison["Ratio"] = comparison["AIMs count"] / ref_count
        comparison["Ratio"] = comparison["Ratio"].map(lambda x: round(x, 2))
        comparison["AIMs count"] = comparison["AIMs count"].map(thousands_separator)
        comparison.index.name = "Panel"

        return comparison
# Show df3, then a separator line and the heading for the function defined next.
print df3
print "*" * 15
print "Definimos la función F"


def f(x):
    """Transform a single cell value.

    Parameters
    ----------
    x : object
        Cell value.

    Returns
    -------
    "applymap_" + x for strings (including str subclasses), 100 * x for any
    other truthy value, and None for falsy values such as 0 or None.
    """
    # isinstance also accepts str subclasses, which an exact type comparison
    # would send to the numeric branch and repeat 100 times.
    if isinstance(x, str):
        return "applymap_" + x
    if x:
        return 100 * x
    return None


print "Aplicamos F al dataframe"
df.applymap(f)
print df
print "*" * 15

print "Definimos de nuevo el dataframe"
df = pd.DataFrame(data={"A": [1, 2], "B": [2.6, 1.3]})
print df
print "añadimos columnas combinando las actuales"
df["C"] = df["A"] + df["B"]
df["D"] = df["A"] * 3
df["E"] = np.sqrt(df["A"])
print df
print "*" * 15
print "Datos disponibles de un dataframe"
print " descripcion del dataframe"
print df.describe()
Example #9
0
 def test_default_handler(self):
     """``to_json(default_handler=str)`` should serialise an arbitrary object
     the same way ``applymap(str)`` renders it."""
     frame = DataFrame({"a": ["a", object()]})
     round_tripped = pd.read_json(frame.to_json(default_handler=str))
     stringified = frame.applymap(str)
     assert_frame_equal(stringified, round_tripped)
Example #10
0
print obj5[:2]
obj5[obj5 < 5] = 3
print obj5
print obj5.ix["Ohio", ["one", "two"]]

s1 = Series([7.3, -2.5, 3.4, 1.5], index=["a", "c", "d", "e"])
s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index=["a", "c", "e", "f", "g"])

print s1 + s2

df1 = DataFrame(np.arange(9).reshape((3, 3)), columns=list("bcd"), index=["Ohin", "Texa", "Colorado"])
df2 = DataFrame(np.arange(12).reshape((4, 3)), columns=list("bcd"), index=["Utah", "Ohin", "Texa", "Colorado"])

print df1 + df2
print df1.add(df2, fill_value=0)

series2 = df2.ix[0]

print df2 - series2

ff = lambda x: x.max() - x.min()

print df2.apply(ff)
print df2.apply(ff, axis=1)

df3 = DataFrame(np.random.randn(3, 3), columns=list("bcd"), index=["Ohin", "Texa", "Colorado"])
ff2 = lambda x: "%.2f" % x
print df3
print df3.applymap(ff2)
print df3
print df3.sort_index(by="b")
Example #11
0
frame = DataFrame(np.random.randn(4, 3), columns=list("bde"), index=["Utah", "Ohio", "Texas", "Oregon"])

# The bare expressions below discard their results, as in an interactive
# session where each value would be echoed.
np.abs(frame)
f = lambda x: x.max() - x.min()
frame.apply(f)
frame.apply(f, axis=1)


def f(x):
    """Return the minimum and maximum of x as a Series labelled "min"/"max"."""
    return Series([x.min(), x.max()], index=["min", "max"])


frame.apply(f)

format = lambda x: "%.2f" % x
frame.applymap(format)
frame["e"].map(format)

obj = Series(range(4), index=["d", "a", "b", "c"])
obj.sort_index()

frame = DataFrame(np.arange(8).reshape((2, 4)), index=["three", "one"], columns=["d", "a", "b", "c"])
frame.sort_index()
frame.sort_index(axis=1)

frame = DataFrame({"b": [4, 7, -3, 2], "a": [0, 1, 0, 1]})
frame
# Sorting by a column's values is sort_values; sort_index(by=...) is deprecated.
frame.sort_values(by="b")
obj = Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()
obj.rank(method="first")
Example #12
0
# Sorting by a column's values is sort_values; sort_index(by=...) is deprecated.
data2.sort_values(by=["clarity"])
data2.sort_values(by=["clarity"], ascending=False)

# groupby
data4 = DataFrame(data[:100], columns=["cut", "color", "clarity", "carat", "price"])
# NOTE(review): newer pandas releases may raise on the non-numeric columns
# here unless numeric_only=True is passed; confirm against the installed version.
data4.groupby("clarity").mean()  # only numeric columns are aggregated
data4.groupby(["cut", "clarity"]).mean()  # grouping by two or more keys
data4.groupby(["cut", "clarity"]).mean()["price"]  # accessing a column of the result

# apply
data5 = DataFrame(data[:6], columns=["carat", "price", "depth"])
f = lambda x: x.max() - x.min()
data5.apply(f)
data5.apply(f, axis=1)  # row-wise (the default is column-wise)
f2 = lambda x: "%.2f" % x  # format numbers with two decimal places
data5.applymap(f2)  # apply to every element of the DataFrame

# vlookup
clarity_to_class = {"SI1": "A", "SI2": "B", "VS1": "C", "VS2": "D", "VVS2": "E"}
data2["class"] = data2["clarity"].map(clarity_to_class)

# DB: SELECT statement
# import pandas.io.sql as sql
# con = sqlite3.connect(':memory:')
# sql.read_frame('select * from test', con)

# Write the data out - CSV
data.to_csv("output.csv")

# Write the data out - Excel
writer = pd.ExcelWriter("output.xlsx")
Example #13
0
class Scores(object):
    """Per-label scores attached to (segment, track) pairs.

    Values live in a pandas DataFrame (``dataframe_``) with one row per
    (segment, track) pair and one column per label. An `Annotation`
    (``annotation_``) holds the same (segment, track) pairs. ``hasChanged_``
    marks that the rows may no longer follow the annotation's track order;
    `_reindexIfNeeded` restores it.

    Parameters
    ----------
    uri : str, optional

    modality : str, optional

    Returns
    -------
    scores : `Scores`

    Examples
    --------

        >>> s = Scores(uri='video', modality='speaker')
        >>> s[Segment(0,1), 's1', 'A'] = 0.1
        >>> s[Segment(0,1), 's1', 'B'] = 0.2
        >>> s[Segment(0,1), 's1', 'C'] = 0.3
        >>> s[Segment(0,1), 's2', 'A'] = 0.4
        >>> s[Segment(0,1), 's2', 'B'] = 0.3
        >>> s[Segment(0,1), 's2', 'C'] = 0.2
        >>> s[Segment(2,3), 's1', 'A'] = 0.2
        >>> s[Segment(2,3), 's1', 'B'] = 0.1
        >>> s[Segment(2,3), 's1', 'C'] = 0.3

    """

    @staticmethod
    def _make_index(tracks):
        """Build the row index from an iterable of (segment, track) pairs.

        The index has one level per `Segment` field plus a final track
        level. An empty iterable gives an empty index with the same level
        names.
        """
        names = [PYANNOTE_SEGMENT + "_" + field for field in Segment._fields] + [PYANNOTE_TRACK]
        rows = [segment + (track,) for segment, track in tracks]
        if rows:
            return MultiIndex.from_tuples(rows, names=names)
        # from_tuples cannot infer the levels from an empty list, and the
        # levels/labels constructor keywords are not accepted by current pandas.
        return MultiIndex.from_arrays([[] for _ in names], names=names)

    @classmethod
    def from_df(cls, df, uri=None, modality=None, aggfunc=np.mean):
        """Build scores from a long-format DataFrame.

        Parameters
        ----------
        df : DataFrame
            Must contain the following columns:
            'segment', 'track', 'label' and 'value'
        uri : str, optional
            Resource identifier
        modality : str, optional
            Modality
        aggfunc : func
            Value aggregation function in case of duplicate (segment, track,
            label) tuples

        Returns
        -------
        scores : `Scores`
            One row per (segment, track) pair and one column per label.

        """
        dataframe = pivot_table(
            df, values=PYANNOTE_SCORE, index=[PYANNOTE_SEGMENT, PYANNOTE_TRACK], columns=PYANNOTE_LABEL, aggfunc=aggfunc
        )

        # Register every (segment, track) row of the pivot in a fresh
        # annotation; the label stored there is an empty placeholder.
        annotation = Annotation(uri=uri, modality=modality)
        for segment, track in dataframe.index:
            annotation[Segment(*segment), track] = ""

        labels = dataframe.columns

        return cls(uri=uri, modality=modality, annotation=annotation, labels=labels, values=dataframe.values)

    def __init__(self, uri=None, modality=None, annotation=None, labels=None, values=None, dtype=None):
        """Create scores, optionally pre-filled.

        Parameters
        ----------
        uri : str, optional
            Resource identifier.
        modality : str, optional
            Modality.
        annotation : `Annotation`, optional
            Source of the (segment, track) rows; it is copied. When missing
            or empty, an empty annotation is created instead.
        labels : iterable, optional
            Column labels.
        values : array-like, optional
            Score values, one row per track and one column per label.
        dtype : optional
            Accepted but not used: the dtype is float when `values` is None
            and the dtype of the converted `values` otherwise.
        """

        super(Scores, self).__init__()

        if annotation:
            annotation = annotation.copy()
            index = self._make_index(annotation.itertracks())

        else:
            annotation = Annotation(uri=uri, modality=modality)
            index = self._make_index([])

        self.annotation_ = annotation
        columns = None if labels is None else list(labels)
        data = None if values is None else np.array(values)
        # Take the dtype from the converted array so that plain sequences
        # (which have no .dtype attribute) are accepted as well.
        dtype = float if values is None else data.dtype

        self.dataframe_ = DataFrame(data=data, dtype=dtype, index=index, columns=columns)

        self.hasChanged_ = True

        self.modality = modality
        self.uri = uri

    def copy(self):
        """Return a copy with its own dataframe and annotation."""
        self._reindexIfNeeded()
        copied = self.__class__(uri=self.uri, modality=self.modality)
        copied.dataframe_ = self.dataframe_.copy()
        copied.annotation_ = self.annotation_.copy()
        copied.hasChanged_ = self.hasChanged_
        return copied

    # del scores[segment]
    # del scores[segment, :]
    # del scores[segment, track]
    def __delitem__(self, key):
        """Remove a whole segment or a single (segment, track) pair.

        Raises
        ------
        KeyError
            If `key` is neither a `Segment` nor a 2-tuple.
        """

        if isinstance(key, Segment):
            segment = key
            self.dataframe_.drop(tuple(segment), axis=0, inplace=True)
            del self.annotation_[segment]
            self.hasChanged_ = True

        elif isinstance(key, tuple) and len(key) == 2:
            segment, track = key
            self.dataframe_.drop(tuple(segment) + (track,), axis=0, inplace=True)
            del self.annotation_[segment, track]
            self.hasChanged_ = True

        else:
            raise KeyError("")

    # value = scores[segment, track, label]
    def __getitem__(self, key):
        """Return the score for (segment, track, label).

        A 2-element key (segment, label) is read as (segment, '_', label).
        """

        if len(key) == 2:
            key = (key[0], "_", key[1])

        segment, track, label = key
        return self.dataframe_.at[tuple(segment) + (track,), label]

    # scores[segment, track, label] = value
    # scores[segment, label] ==== scores[segment, '_', label]
    def __setitem__(self, key, value):
        """Set the score for (segment, track, label).

        A 2-element key (segment, label) is read as (segment, '_', label).
        Nothing is stored when `segment` is falsy.
        """

        if len(key) == 2:
            key = (key[0], "_", key[1])

        segment, track, label = key

        # do not add empty track
        if not segment:
            return

        self.dataframe_.at[tuple(segment) + (track,), label] = value
        self.annotation_[segment, track] = label
        self.hasChanged_ = True

    def __len__(self):
        """Number of annotated segments"""
        return len(self.annotation_)

    def __nonzero__(self):
        """Python 2 truth protocol; defers to `__bool__`."""
        return self.__bool__()

    def __bool__(self):
        """False if annotation is empty"""
        return True if self.annotation_ else False

    def __contains__(self, included):
        """Check if segments are annotated

        Parameters
        ----------
        included : `Segment` or `Timeline`

        Returns
        -------
        contains : bool
            True if every segment in `included` is annotated, False otherwise.
        """
        return included in self.annotation_

    def __iter__(self):
        """Iterate over sorted segments"""
        return iter(self.annotation_.get_timeline(copy=False))

    def __reversed__(self):
        """Reverse iterate over sorted segments"""
        return reversed(self.annotation_.get_timeline(copy=False))

    def itersegments(self):
        """Iterate over segments; same as iterating the object itself."""
        return iter(self)

    def tracks(self, segment):
        """Set of tracks for query segment

        Parameters
        ----------
        segment : `Segment`
            Query segment

        Returns
        -------
        tracks : set
            Set of tracks for query segment
        """
        return self.annotation_.get_tracks(segment)

    def has_track(self, segment, track):
        """Check whether a given track exists

        Parameters
        ----------
        segment : `Segment`
            Query segment
        track :
            Query track

        Returns
        -------
        exists : bool
            True if track exists for segment
        """
        return self.annotation_.has_track(segment, track)

    def get_track_by_name(self, track):
        """Get all tracks with given name

        Parameters
        ----------
        track : any valid track name
            Requested name track

        Returns
        -------
        tracks : list
            List of (segment, track) tuples
        """
        return self.annotation_.get_track_by_name(track)

    def new_track(self, segment, candidate=None, prefix=None):
        """Track name generator

        Parameters
        ----------
        segment : Segment
        prefix : str, optional
        candidate : any valid track name


        Returns
        -------
        track : str
            New track name
        """

        # Forward the caller's arguments; they used to be replaced by None.
        return self.annotation_.new_track(segment, candidate=candidate, prefix=prefix)

    def itertracks(self):
        """Iterate over annotation as (segment, track) tuple"""
        return self.annotation_.itertracks()

    def itervalues(self):
        """Iterate over scores as (segment, track, label, value) tuple"""

        # make sure segment/track pairs are sorted
        self._reindexIfNeeded()

        labels = self.labels()

        # yield one (segment, track, label, value) tuple per non-NaN cell
        for index, columns in self.dataframe_.iterrows():
            segment = Segment(*index[:-1])
            track = index[-1]
            for label in labels:
                value = columns[label]
                if not np.isnan(value):
                    yield segment, track, label, value

    def get_track_scores(self, segment, track):
        """Get all scores for a given track.

        Parameters
        ----------
        segment : Segment
        track : hashable
            segment, track must be a valid track

        Returns
        -------
        scores : dict
            {label: score} dictionary
        """
        return dict(self.dataframe_.xs(tuple(segment) + (track,)))

    def labels(self):
        """List of labels

        Returns
        -------
        labels : list
            Sorted list of existing labels

        Remarks
        -------
            Labels are sorted based on their string representation.
        """
        return sorted(self.dataframe_.columns, key=str)

    def _reindexIfNeeded(self):
        """Re-order the dataframe rows to follow the annotation's tracks.

        Does nothing unless `hasChanged_` is set; clears the flag afterwards.
        """

        if not self.hasChanged_:
            return

        new_index = self._make_index(self.annotation_.itertracks())

        self.dataframe_ = self.dataframe_.reindex(new_index)

        self.hasChanged_ = False

        return

    def retrack(self):
        """Return a copy whose tracks come from the annotation's `retrack()`.

        The copied rows keep their values and order; only the row index is
        replaced by the (segment, track) pairs of the retracked annotation.
        """

        self._reindexIfNeeded()
        retracked = self.copy()

        annotation = self.annotation_.retrack()
        retracked.annotation_ = annotation

        retracked.dataframe_.index = self._make_index(annotation.itertracks())

        return retracked

    def apply(self, func, axis=0):
        """Return a copy whose dataframe is ``dataframe_.apply(func, axis=axis)``."""

        applied = self.copy()
        applied.dataframe_ = self.dataframe_.apply(func, axis=axis)
        applied.hasChanged_ = True

        return applied

    def rank(self, ascending=False):
        """Rank the labels of each track.

        Parameters
        ----------
        ascending : boolean, default False
            False for ranks by high (0) to low (N-1)

        Returns
        -------
        rank : `Scores`

        """

        ranked = self.copy()
        # pandas ranks start at 1; shift them so the best rank is 0.
        ranked.dataframe_ = -1 + self.dataframe_.rank(axis=1, ascending=ascending)
        ranked.hasChanged_ = True
        return ranked

    def nbest(self, n, ascending=False):
        """Keep only the n best scores of each track.

        Parameters
        ----------
        n : int
            Size of n-best list
        ascending : boolean, default False
            False for ranks by high (0) to low (N-1)

        Returns
        -------
        nbest : `Scores`
            New scores where only n-best are kept.

        """

        filtered = self.copy()
        ranked_ = -1 + self.dataframe_.rank(axis=1, ascending=ascending)
        # Cells ranked n or worse are blanked out with NaN.
        filtered.dataframe_ = filtered.dataframe_.where(ranked_ < n, other=np.nan)
        filtered.hasChanged_ = True
        return filtered

    def subset(self, labels, invert=False):
        """Scores subset

        Extract scores subset based on labels

        Parameters
        ----------
        labels : set
            Set of labels
        invert : bool, optional
            If invert is True, extract all but requested `labels`

        Returns
        -------
        subset : `Scores`
            Scores subset. Its annotation is the same object as the
            caller's, not a copy.

        Raises
        ------
        TypeError
            If `labels` is not a set.
        """

        self._reindexIfNeeded()

        if not isinstance(labels, set):
            raise TypeError("labels must be provided as a set of labels.")

        if invert:
            labels = set(self.labels()) - labels
        else:
            labels = labels & set(self.labels())

        subset = Scores(uri=self.uri, modality=self.modality)
        subset.annotation_ = self.annotation_
        subset.dataframe_ = self.dataframe_[list(labels)]

        return subset

    def to_annotation(self, threshold=-np.inf, posterior=False):
        """Turn scores into an annotation by keeping the best label per track.

        Parameters
        ----------
        threshold : float, optional
            Each track is annotated with the label with the highest score.
            Yet, if the latter is smaller than `threshold`, label is replaced
            with an `Unknown` instance.
        posterior : bool, optional
            If True, scores are posterior probabilities in open-set
            identification. If top model posterior is higher than unknown
            posterior, it is selected. Otherwise, label is replaced with an
            `Unknown` instance.

        Returns
        -------
        annotation : `Annotation`
            Empty when the scores are empty.
        """

        if not self:
            return Annotation(uri=self.uri, modality=self.modality)

        best = self.nbest(1, ascending=False)
        large_enough = best.copy()

        if posterior:
            unknown_posterior = 1.0 - self.dataframe_.sum(axis=1)

            large_enough.dataframe_ = ((best.dataframe_.T > unknown_posterior) & (best.dataframe_.T > threshold)).T

        else:

            large_enough.dataframe_ = (best.dataframe_.T > threshold).T

        # Keep a verdict only where the 1-best table has a value, so that
        # itervalues() below skips every other cell.
        large_enough.dataframe_.where(best.dataframe_.notnull(), inplace=True, other=np.nan)

        annotation = Annotation(uri=self.uri, modality=self.modality)
        for segment, track, label, value in large_enough.itervalues():
            label = label if value else Unknown()
            annotation[segment, track] = label

        return annotation

    def map(self, func):
        """Apply function to all values"""

        mapped = self.copy()
        mapped.dataframe_ = self.dataframe_.applymap(func)
        mapped.hasChanged_ = True
        return mapped

    def crop(self, focus, mode="strict"):
        """Crop on focus

        Parameters
        ----------
        focus : `Segment` or `Timeline`

        mode : {'strict', 'loose', 'intersection'}
            In 'strict' mode, only segments fully included in focus coverage
            are kept. In 'loose' mode, any intersecting segment is kept
            unchanged. In 'intersection' mode, only intersecting segments are
            kept and replaced by their actual intersection with the focus.

        Returns
        -------
        cropped : same type as caller
            Cropped version of the caller containing only tracks matching
            the provided focus and mode. Any mode other than the three
            listed falls through and returns None.

        Raises
        ------
        NotImplementedError
            In 'intersection' mode, which is not implemented yet.

        Remarks
        -------
        In 'intersection' mode, the best is done to keep the track names
        unchanged. However, in some cases where two original segments are
        cropped into the same resulting segments, conflicting track names are
        modified to make sure no track is lost.

        """

        if isinstance(focus, Segment):
            return self.crop(Timeline([focus], uri=self.uri), mode=mode)

        self._reindexIfNeeded()
        cropped = self.copy()

        if mode in ["strict", "loose"]:

            new_annotation = self.annotation_.crop(focus, mode=mode)
            # Boolean row mask, in the same track order as the reindexed rows.
            keep = [new_annotation.has_track(segment, track) for segment, track in self.itertracks()]
            cropped.dataframe_ = self.dataframe_[keep]
            cropped.annotation_ = new_annotation
            cropped.hasChanged_ = True

            return cropped

        elif mode in ["intersection"]:

            raise NotImplementedError("")

            # # two original segments might be cropped into the same resulting
            # # segment -- therefore, we keep track of the mapping
            # intersection, mapping = timeline.crop(coverage,
            #                                       mode=mode, mapping=True)
            #
            # # create new empty annotation
            # A = self.__class__(uri=self.uri, modality=self.modality)
            #
            # for cropped in intersection:
            #     for original in mapping[cropped]:
            #         for track in self.tracks(original):
            #             # try to use original track name (candidate)
            #             # if it already exists, create a brand new one
            #             new_track = A.new_track(cropped, candidate=track)
            #             # copy each value, column by column
            #             for label in self.dataframe_.columns:
            #                 value = self.dataframe_.get_value((original, track),
            #                                            label)
            #                 A.dataframe_ = A.dataframe_.set_value((cropped, new_track),
            #                                         label, value)
            #
            # return A

    def __str__(self):
        """Human-friendly representation"""
        if self:
            self._reindexIfNeeded()
            return str(self.dataframe_)
        else:
            return ""

    def _repr_png_(self):
        """PNG rendering, delegated to `repr_scores` from the notebook module."""
        from .notebook import repr_scores

        return repr_scores(self)
Example #14
0
def f(x):
    """Spread (max minus min) of a row or column."""
    return x.max() - x.min()


column_spread = frame.apply(f)
print(column_spread)
row_spread = frame.apply(f, axis=1)
print(row_spread)


def f(x):
    """Minimum and maximum of x as a Series labelled "min"/"max"."""
    low, high = x.min(), x.max()
    return Series([low, high], index=["min", "max"])


print(frame.apply(f))
print(frame.apply(f, axis=1))


# how to format
def format(x):
    """Render x with two decimal places."""
    return "%.2f" % x


print(frame.applymap(format))
print(frame["e"].map(format))

# sort and order
obj = Series(range(4), index=["d", "a", "b", "c"])
print(obj.sort_index())

frame = DataFrame(
    np.arange(8).reshape((2, 4)),
    index=["three", "one"],
    columns=["d", "a", "b", "c"],
)
by_row_label = frame.sort_index()
print(by_row_label)
by_column_label = frame.sort_index(axis=1)
print(by_column_label)
print(frame.sort_index(axis=1, ascending=False))

obj = Series([4, 7, -3, -2])
print(obj.sort_values())
Example #15
0
def plotter(
    title,
    df,
    kind="line",
    x_label=None,
    y_label=None,
    style="ggplot",
    figsize=(8, 4),
    save=False,
    legend_pos="best",
    reverse_legend="guess",
    num_to_plot=7,
    tex="try",
    colours="Accent",
    cumulative=False,
    pie_legend=True,
    partial_pie=False,
    show_totals=False,
    transparent=False,
    output_format="png",
    interactive=False,
    black_and_white=False,
    show_p_val=False,
    indices=False,
    **kwargs
):
    """Visualise corpus interrogations.

    :param title: A title for the plot
    :type title: str
    :param df: Data to be plotted
    :type df: pandas.core.frame.DataFrame
    :param x_label: A label for the x axis
    :type x_label: str
    :param y_label: A label for the y axis
    :type y_label: str
    :param kind: The kind of chart to make
    :type kind: str ('line'/'bar'/'barh'/'pie'/'area')
    :param style: Visual theme of plot
    :type style: str ('ggplot'/'bmh'/'fivethirtyeight'/'seaborn-talk'/etc)
    :param figsize: Size of plot
    :type figsize: tuple (int, int)
    :param save: If bool, save with *title* as name; if str, use str as name
    :type save: bool/str
    :param legend_pos: Where to place legend
    :type legend_pos: str ('upper right'/'outside right'/etc)
    :param reverse_legend: Reverse the order of the legend
    :type reverse_legend: bool
    :param num_to_plot: How many columns to plot
    :type num_to_plot: int/'all'
    :param tex: Use TeX to draw plot text
    :type tex: bool
    :param colours: Colourmap for lines/bars/slices
    :type colours: str
    :param cumulative: Plot values cumulatively
    :type cumulative: bool
    :param pie_legend: Show a legend for pie chart
    :type pie_legend: bool
    :param partial_pie: Allow plotting of pie slices only
    :type partial_pie: bool
    :param show_totals: Print sums in plot where possible
    :type show_totals: str -- 'legend'/'plot'/'both'
    :param transparent: Transparent .png background
    :type transparent: bool
    :param output_format: File format for saved image
    :type output_format: str -- 'png'/'pdf'
    :param black_and_white: Create black and white line styles
    :type black_and_white: bool
    :param show_p_val: Attempt to print p values in legend if contained in df
    :type show_p_val: bool
    :param indices: To use when plotting "distance from root"
    :type indices: bool
    :param stacked: When making bar chart, stack bars on top of one another
    :type stacked: str
    :param filled: For area and bar charts, make every column sum to 100
    :type filled: str
    :param legend: Show a legend
    :type legend: bool
    :param rot: Rotate x axis ticks by *rot* degrees
    :type rot: int
    :param subplots: Plot each column separately
    :type subplots: bool
    :param layout: Grid shape to use when *subplots* is True
    :type layout: tuple -- (int, int)
    :param interactive: Experimental interactive options
    :type interactive: list -- [1, 2, 3]
    :returns: matplotlib figure
    """
    import corpkit
    import os

    try:
        from IPython.utils.shimmodule import ShimWarning
        import warnings

        warnings.simplefilter("ignore", ShimWarning)
    except:
        pass

    import matplotlib as mpl
    from matplotlib import rc

    # prefer seaborn plotting
    try:
        import seaborn as sns
    except:
        pass

    if interactive:
        import matplotlib.pyplot as plt, mpld3
    else:
        import matplotlib.pyplot as plt

    import pandas
    from pandas import DataFrame

    import numpy
    from time import localtime, strftime
    from tests import check_pytex, check_spider, check_t_kinter

    if interactive:
        import mpld3
        import collections
        from mpld3 import plugins, utils
        from plugins import InteractiveLegendPlugin, HighlightLines

    # check what environment we're in
    tk = check_t_kinter()
    running_python_tex = check_pytex()
    running_spider = check_spider()

    def truncate_colormap(cmap, minval=0.0, maxval=1.0, n=100):
        """Remove extreme values from a colourmap --- no pure white.

        Samples *cmap* at *n* evenly spaced points between *minval* and
        *maxval* and builds a new ``LinearSegmentedColormap`` from those
        colours. The new map is named after the original plus the bounds.
        """
        import matplotlib.colors as colors
        import numpy as np

        label = "trunc({n},{a:.2f},{b:.2f})".format(n=cmap.name, a=minval, b=maxval)
        sampled_colours = cmap(np.linspace(minval, maxval, n))
        return colors.LinearSegmentedColormap.from_list(label, sampled_colours)

    def get_savename(imagefolder, save=False, title=False, ext="png"):
        """Come up with the savename for the image.

        The name is taken from *save* when it is a string, otherwise from
        *title*. It is lower-cased and slugified, then *ext* is appended.
        An extension the caller already typed at the end of the name is
        removed first, so ``save="plot.png"`` gives ``plot.png`` rather
        than a mangled or doubled extension.

        :param imagefolder: Directory the file name is joined onto
        :type imagefolder: str
        :param save: Explicit file name; any non-str value falls back to *title*
        :type save: bool/str
        :param title: Plot title, used as the name when *save* is not a str
        :type title: bool/str
        :param ext: File extension, with or without its leading dot
        :type ext: str
        :returns: path made of *imagefolder* and the slugified name plus *ext*
        :raises ValueError: if neither *save* nor *title* supplies a name
        """
        import os
        import re

        def urlify(s):
            "Turn title into filename"
            s = s.lower()
            # keep only word characters, whitespace and hyphens
            s = re.sub(r"[^\w\s-]", "", s)
            s = re.sub(r"\s+", "-", s)
            # presumably removes TeX command names left behind once the
            # substitution above has stripped their backslash
            s = re.sub(r"-(textbf|emph|textsc|textit)", "-", s)
            return s

        if not ext.startswith("."):
            ext = "." + ext

        if isinstance(save, str):
            name = save
        elif title and isinstance(title, str):
            name = title
        else:
            # previously this path fell through and failed on an unbound local
            raise ValueError("Cannot build a file name: neither save nor title is a string.")

        # Remove an extension the caller already supplied. This has to happen
        # before urlify(), which deletes the dot and would hide the duplicate.
        while len(name) > len(ext) and name.lower().endswith(ext.lower()):
            name = name[: -len(ext)]

        return os.path.join(imagefolder, urlify(name) + ext)

    def rename_data_with_total(dataframe, was_series=False, using_tex=False, absolutes=True):
        """Add totals (absolute counts or percentages) to entry name strings.

        Entries are the index labels when *was_series* is true, otherwise
        the column labels.

        * ``absolutes=True``: every entry becomes ``"name (n=<sum>)"``.
        * ``absolutes=False`` and *was_series*: every entry becomes
          ``"name (<value> %)"``, using the first value stored for that
          entry. With *using_tex* the percent sign is TeX-escaped.
        * ``absolutes=False`` and not *was_series*: labels are unchanged.

        :param dataframe: Data whose labels should be annotated
        :type dataframe: pandas.core.frame.DataFrame
        :param was_series: Entries live on the index rather than the columns
        :type was_series: bool
        :param using_tex: Escape the percent sign for TeX rendering
        :type using_tex: bool
        :param absolutes: Treat values as absolute counts, not percentages
        :type absolutes: bool
        :returns: the relabelled data. When *was_series* is false the passed
                  frame itself is relabelled in place and returned; otherwise
                  a new single-column frame named "Total" is returned
        """
        where_the_words_are = dataframe.index if was_series else dataframe.columns
        # transpose once instead of once per entry
        transposed = dataframe.T if was_series else None

        # raw string: TeX needs a literal backslash before the percent sign,
        # and "\%" in a normal string literal is an invalid escape sequence
        percent_template = r"%s (%.2f\%%)" if using_tex else "%s (%.2f %%)"

        the_labs = []
        for w in list(where_the_words_are):
            if absolutes:
                score = transposed[w].sum() if was_series else dataframe[w].sum()
                the_labs.append("%s (n=%d)" % (w, score))
            elif was_series:
                # explicit positional lookup of the entry's first value
                perc = transposed[w].iloc[0]
                the_labs.append(percent_template % (w, perc))
            else:
                the_labs.append(w)

        if not was_series:
            dataframe.columns = the_labs
        else:
            vals = list(dataframe[list(dataframe.columns)[0]].values)
            dataframe = pandas.DataFrame(vals, index=the_labs)
            dataframe.columns = ["Total"]
        return dataframe

    def auto_explode(dataframe, input, was_series=False, num_to_plot=7):
        """Turn entry names and/or positions into a pie-chart explode list.

        Returns a list of *num_to_plot* offsets, all 0 except the requested
        entries, which are set to 0.1. *input* may be one str or int, or a
        list of them. Strings are looked up among the index labels when
        *was_series* is true and among the column labels otherwise; ints
        are used directly as positions. Any other *input* type leaves every
        offset at 0.
        """
        names = list(dataframe.index) if was_series else list(dataframe.columns)
        offsets = [0] * num_to_plot

        # exact type checks on purpose: bools must not be treated as ints
        wanted = [input] if type(input) in (str, int) else input
        if type(wanted) != list:
            return offsets

        for entry in wanted:
            position = names.index(entry) if type(entry) == str else entry
            offsets[position] = 0.1
        return offsets

    # check if we're doing subplots
    sbplt = False
    if "subplots" in kwargs:
        if kwargs["subplots"] is True:
            sbplt = True
    kwargs["subplots"] = sbplt

    if colours is True:
        colours = "Paired"

    # todo: get this dynamically instead.
    styles = ["dark_background", "bmh", "grayscale", "ggplot", "fivethirtyeight", "matplotlib", False, "mpl-white"]
    # if style not in styles:
    # raise ValueError('Style %s not found. Use %s' % (str(style), ', '.join(styles)))

    if style == "mpl-white":
        try:
            sns.set_style("whitegrid")
        except:
            pass
        style = "matplotlib"

    if style is not False and style.startswith("seaborn"):
        colours = False

    # use 'draggable = True' to make a draggable legend
    dragmode = kwargs.get("draggable", False)
    kwargs.pop("draggable", None)

    if kwargs.get("savepath"):
        mpl.rcParams["savefig.directory"] = kwargs.get("savepath")
        kwargs.pop("savepath", None)

    mpl.rcParams["savefig.bbox"] = "tight"
    mpl.rcParams.update({"figure.autolayout": True})

    # try to use tex
    # TO DO:
    # make some font kwargs here
    using_tex = False
    mpl.rcParams["font.family"] = "sans-serif"
    mpl.rcParams["text.latex.unicode"] = True

    if tex == "try" or tex is True:
        try:
            rc("text", usetex=True)
            rc("font", **{"family": "serif", "serif": ["Computer Modern"]})
            using_tex = True
        except:
            matplotlib.rc("font", family="sans-serif")
            matplotlib.rc("font", serif="Helvetica Neue")
            matplotlib.rc("text", usetex="false")
            rc("text", usetex=False)
    else:
        rc("text", usetex=False)

    if interactive:
        using_tex = False

    if show_totals is False:
        show_totals = "none"

    # find out what kind of plot we're making, and enable
    # or disable interactive values if need be
    kwargs["kind"] = kind.lower()

    if interactive:
        if kwargs["kind"].startswith("bar"):
            interactive_types = [3]
        elif kwargs["kind"] == "area":
            interactive_types = [2, 3]
        elif kwargs["kind"] == "line":
            interactive_types = [2, 3]
        elif kwargs["kind"] == "pie":
            interactive_types = None
            warnings.warn("Interactive plotting not yet available for pie plots.")
        else:
            interactive_types = [None]
    if interactive is False:
        interactive_types = [None]

    # find out if pie mode, add autopct format
    piemode = False
    if kind == "pie":
        piemode = True
        # always the best spot for pie
        # if legend_pos == 'best':
        # legend_pos = 'lower left'
        if show_totals.endswith("plot") or show_totals.endswith("both"):
            kwargs["pctdistance"] = 0.6
            if using_tex:
                kwargs["autopct"] = r"%1.1f\%%"
            else:
                kwargs["autopct"] = "%1.1f%%"

    # copy data, make series into df
    dataframe = df.copy()
    was_series = False
    if type(dataframe) == pandas.core.series.Series:
        was_series = True
        if not cumulative:
            dataframe = DataFrame(dataframe)
        else:
            dataframe = DataFrame(dataframe.cumsum())
    else:
        # don't know if this is much good.
        if cumulative:
            dataframe = DataFrame(dataframe.cumsum())
        if len(list(dataframe.columns)) == 1:
            was_series = True

    # attempt to convert x axis to ints:
    try:
        dataframe.index = [int(i) for i in list(dataframe.index)]
    except:
        pass

    # remove totals and tkinter order
    if not was_series and not all(x.lower() == "total" for x in list(dataframe.columns)):
        for name, ax in zip(["Total"] * 2 + ["tkintertable-order"] * 2, [0, 1, 0, 1]):
            try:
                dataframe = dataframe.drop(name, axis=ax, errors="ignore")
            except:
                pass
    else:
        dataframe = dataframe.drop("tkintertable-order", errors="ignore")
        dataframe = dataframe.drop("tkintertable-order", axis=1, errors="ignore")

    # look at columns to see if all can be ints, in which case, set up figure
    # for depnumming
    if not was_series:
        if indices == "guess":

            def isint(x):
                """Return True when *x* converts to a float with no fractional part.

                Values that cannot be converted -- non-numeric text, NaN,
                infinities, unsupported types such as None -- give False
                instead of raising.
                """
                try:
                    a = float(x)
                    b = int(a)
                # must be a tuple: ``except ValueError or OverflowError``
                # evaluates to ``except ValueError`` and misses the rest
                except (ValueError, OverflowError, TypeError):
                    return False
                else:
                    return a == b

            if all([isint(x) is True for x in list(dataframe.columns)]):
                indices = True
            else:
                indices = False

        # if depnumming, plot all, transpose, and rename axes
        if indices is True:
            num_to_plot = "all"
            dataframe = dataframe.T
            if y_label is None:
                y_label = "Percentage of all matches"
            if x_label is None:
                x_label = ""

    # set backend?
    output_formats = ["svgz", "ps", "emf", "rgba", "raw", "pdf", "svg", "eps", "png", "pgf"]
    if output_format not in output_formats:
        raise ValueError("%s output format not recognised. Must be: %s" % (output_format, ", ".join(output_formats)))

    # don't know if these are necessary
    if "pdf" in output_format:
        plt.switch_backend(output_format)
    if "pgf" in output_format:
        plt.switch_backend(output_format)

    if num_to_plot == "all":
        if was_series:
            if not piemode:
                num_to_plot = len(dataframe)
            else:
                num_to_plot = len(dataframe)
        else:
            if not piemode:
                num_to_plot = len(list(dataframe.columns))
            else:
                num_to_plot = len(dataframe.index)

    # explode pie, or remove if not piemode
    if piemode and not sbplt and kwargs.get("explode"):
        kwargs["explode"] = auto_explode(dataframe, kwargs["explode"], was_series=was_series, num_to_plot=num_to_plot)
    else:
        kwargs.pop("explode", None)

    legend = kwargs.get("legend", False)

    # cut data short
    plotting_a_totals_column = False
    if was_series:
        if list(dataframe.columns)[0] != "Total":
            try:
                can_be_ints = [int(x) for x in list(dataframe.index)]
                num_to_plot = len(dataframe)
            except:
                dataframe = dataframe[:num_to_plot]
        elif list(dataframe.columns)[0] == "Total":
            plotting_a_totals_column = True
            if not "legend" in kwargs:
                legend = False
            num_to_plot = len(dataframe)
    else:
        dataframe = dataframe.T.head(num_to_plot).T

    # remove stats fields, put p in entry text, etc.
    statfields = ["slope", "intercept", "r", "p", "stderr"]
    try:
        dataframe = dataframe.drop(statfields, axis=1, errors="ignore")
    except:
        pass
    try:
        dataframe.ix["p"]
        there_are_p_vals = True
    except:
        there_are_p_vals = False
    if show_p_val:
        if there_are_p_vals:
            newnames = []
            for col in list(dataframe.columns):
                pval = dataframe[col]["p"]

                def p_string_formatter(val):
                    """Format a p value for display in an entry name.

                    Values below 0.001 are shown as "p < 0.001", with the
                    less-than sign wrapped in ``$...$`` when ``using_tex``
                    is set. Everything else is shown to three decimals.
                    """
                    # ``not val < ...`` rather than ``val >= ...`` so that
                    # NaN still takes the three-decimal branch
                    if not val < 0.001:
                        return "p = %s" % format(val, ".3f")
                    if using_tex:
                        return r"p $<$ 0.001"
                    return "p < 0.001"

                pstr = p_string_formatter(pval)
                newname = "%s (%s)" % (col, pstr)
                newnames.append(newname)
            dataframe.columns = newnames
            dataframe.drop(statfields, axis=0, inplace=True, errors="ignore")
        else:
            warnings.warn(
                "No p-values calculated to show.\n\nUse sort_by and keep_stats in editor() to generate these values."
            )
    else:
        if there_are_p_vals:
            dataframe.drop(statfields, axis=0, inplace=True, errors="ignore")

    # make and set y label
    absolutes = True
    if type(dataframe) == pandas.core.frame.DataFrame:
        try:
            if not all([s.is_integer() for s in dataframe.iloc[0, :].values]):
                absolutes = False
        except:
            pass
    else:
        if not all([s.is_integer() for s in dataframe.values]):
            absolutes = False

    #  use colormap if need be:
    if num_to_plot > 0:
        if not was_series:
            if kind in ["pie", "line", "area"]:
                if colours:
                    if not plotting_a_totals_column:
                        if colours == "Default":
                            colours = "Paired"
                        kwargs["colormap"] = colours
            # else:

            if colours:
                if colours == "Default":
                    colours = "Paired"
                kwargs["colormap"] = colours

    if piemode:
        if num_to_plot > 0:
            if colours == "Default":
                colours = "Paired"
            kwargs["colormap"] = colours
        else:
            if num_to_plot > 0:
                if colours == "Default":
                    colours = "Paired"
                kwargs["colormap"] = colours

    # multicoloured bar charts
    if colours:
        if kind.startswith("bar"):
            if len(list(dataframe.columns)) == 1:
                if not black_and_white:
                    import numpy as np

                    the_range = np.linspace(0, 1, num_to_plot)
                    cmap = plt.get_cmap(colours)
                    kwargs["colors"] = [cmap(n) for n in the_range]
                # make a bar width ... ? ...
                # kwargs['width'] = (figsize[0] / float(num_to_plot)) / 1.5

    # reversing legend option
    if reverse_legend is True:
        rev_leg = True
    elif reverse_legend is False:
        rev_leg = False

    # show legend or don't, guess whether to reverse based on kind
    if kind in ["bar", "barh", "area", "line", "pie"]:
        if was_series:
            legend = False
        if kind == "pie":
            if pie_legend:
                legend = True
            else:
                legend = False
    if kind in ["barh", "area"]:
        if reverse_legend == "guess":
            rev_leg = True
    if not "rev_leg" in locals():
        rev_leg = False

    # the default legend placement
    if legend_pos is True:
        legend_pos = "best"

    # cut dataframe if just_totals
    try:
        tst = dataframe["Combined total"]
        dataframe = dataframe.head(num_to_plot)
    except:
        pass

    # rotate automatically
    if "rot" not in kwargs:
        if not was_series:
            xvals = [str(i) for i in list(dataframe.index)[:num_to_plot]]
            # if 'kind' in kwargs:
            # if kwargs['kind'] in ['barh', 'area']:
            # xvals = [str(i) for i in list(dataframe.columns)[:num_to_plot]]
        else:
            xvals = [str(i) for i in list(dataframe.columns)[:num_to_plot]]
        if len(max(xvals, key=len)) > 6:
            if not piemode:
                kwargs["rot"] = 45

    # no title for subplots because ugly,
    if title and not sbplt:
        kwargs["title"] = title

    # no interactive subplots yet:
    if sbplt and interactive:
        import warnings

        interactive = False
        warnings.warn("No interactive subplots yet, sorry.")
        return

    # not using pandas for labels or legend anymore.
    # kwargs['labels'] = None
    # kwargs['legend'] = False

    if legend:
        if num_to_plot > 6:
            if not kwargs.get("ncol"):
                kwargs["ncol"] = num_to_plot / 7
        # kwarg options go in leg_options
        leg_options = {"framealpha": 0.8, "shadow": kwargs.get("shadow", False), "ncol": kwargs.pop("ncol", 1)}

        # determine legend position based on this dict
        if legend_pos:
            possible = {
                "best": 0,
                "upper right": 1,
                "upper left": 2,
                "lower left": 3,
                "lower right": 4,
                "right": 5,
                "center left": 6,
                "center right": 7,
                "lower center": 8,
                "upper center": 9,
                "center": 10,
                "o r": 2,
                "outside right": 2,
                "outside upper right": 2,
                "outside center right": "center left",
                "outside lower right": "lower left",
            }

            if type(legend_pos) == int:
                the_loc = legend_pos
            elif type(legend_pos) == str:
                try:
                    the_loc = possible[legend_pos]
                except KeyError:
                    raise KeyError(
                        "legend_pos value must be one of:\n%s\n or an int between 0-10."
                        % ", ".join(list(possible.keys()))
                    )
            leg_options["loc"] = the_loc
            # weirdness needed for outside plot
            if legend_pos in ["o r", "outside right", "outside upper right"]:
                leg_options["bbox_to_anchor"] = (1.02, 1)
            if legend_pos == "outside center right":
                leg_options["bbox_to_anchor"] = (1.02, 0.5)
            if legend_pos == "outside lower right":
                leg_options["loc"] == "upper right"
                leg_options["bbox_to_anchor"] = (0.5, 0.5)

        # a bit of distance between legend and plot for outside legends
        if type(legend_pos) == str:
            if legend_pos.startswith("o"):
                leg_options["borderaxespad"] = 1

    if not piemode:
        if show_totals.endswith("both") or show_totals.endswith("legend"):
            dataframe = rename_data_with_total(
                dataframe, was_series=was_series, using_tex=using_tex, absolutes=absolutes
            )
    else:
        if pie_legend:
            if show_totals.endswith("both") or show_totals.endswith("legend"):
                dataframe = rename_data_with_total(
                    dataframe, was_series=was_series, using_tex=using_tex, absolutes=absolutes
                )

    if piemode:
        if partial_pie:
            dataframe = dataframe / 100.0

    # some pie things
    if piemode:
        if not sbplt:
            kwargs["y"] = list(dataframe.columns)[0]
            if pie_legend:
                kwargs["legend"] = False
                if was_series:
                    leg_options["labels"] = list(dataframe.index)
                else:
                    leg_options["labels"] = list(dataframe.columns)
        else:
            if pie_legend:
                kwargs["legend"] = False
                if was_series:
                    leg_options["labels"] = list(dataframe.index)
                else:
                    leg_options["labels"] = list(dataframe.index)

    def filler(df):
        """Rescale every row of *df* so that its values sum to 100.

        Works on a transposed copy, so the frame passed in is not modified.
        """
        flipped = df.T.copy()
        # each column of the transposed copy is one row of the original
        for label in list(flipped.columns):
            row_values = flipped[label]
            flipped[label] = row_values * 100.0 / row_values.sum()
        return flipped.T

    areamode = False
    if kind == "area":
        areamode = True

    if legend is False:
        kwargs["legend"] = False

    # line highlighting option for interactive!
    if interactive:
        if 2 in interactive_types:
            if kind == "line":
                kwargs["marker"] = ","
        if not piemode:
            kwargs["alpha"] = 0.1

    # convert dates --- works only in my current case!
    if plotting_a_totals_column or not was_series:
        try:
            can_it_be_int = int(list(dataframe.index)[0])
            can_be_int = True
        except:
            can_be_int = False
        if can_be_int:
            if 1500 < int(list(dataframe.index)[0]):
                if 2050 > int(list(dataframe.index)[0]):
                    n = pandas.PeriodIndex([d for d in list(dataframe.index)], freq="A")
                    dataframe = dataframe.set_index(n)

        if kwargs.get("filled"):
            if areamode or kind.startswith("bar"):
                dataframe = filler(dataframe)
            kwargs.pop("filled", None)

    MARKERSIZE = 4
    COLORMAP = {
        0: {"marker": None, "dash": (None, None)},
        1: {"marker": None, "dash": [5, 5]},
        2: {"marker": "o", "dash": (None, None)},
        3: {"marker": None, "dash": [1, 3]},
        4: {"marker": "s", "dash": [5, 2, 5, 2, 5, 10]},
        5: {"marker": None, "dash": [5, 3, 1, 2, 1, 10]},
        6: {"marker": "o", "dash": (None, None)},
        7: {"marker": None, "dash": [5, 3, 1, 3]},
        8: {"marker": "1", "dash": [1, 3]},
        9: {"marker": "*", "dash": [5, 5]},
        10: {"marker": "2", "dash": [5, 2, 5, 2, 5, 10]},
        11: {"marker": "s", "dash": (None, None)},
    }

    HATCHES = {
        0: {"color": "#dfdfdf", "hatch": "/"},
        1: {"color": "#6f6f6f", "hatch": "\\"},
        2: {"color": "b", "hatch": "|"},
        3: {"color": "#dfdfdf", "hatch": "-"},
        4: {"color": "#6f6f6f", "hatch": "+"},
        5: {"color": "b", "hatch": "x"},
    }

    if black_and_white:
        if kind == "line":
            kwargs["linewidth"] = 1

        cmap = plt.get_cmap("Greys")
        new_cmap = truncate_colormap(cmap, 0.25, 0.95)
        if kind == "bar":
            # darker if just one entry
            if len(dataframe.columns) == 1:
                new_cmap = truncate_colormap(cmap, 0.70, 0.90)
        kwargs["colormap"] = new_cmap

    class dummy_context_mgr:
        """A do-nothing context manager for plotting without a style.

        Perhaps made obsolete by the 'classic' style in newer matplotlib.
        """

        def __enter__(self):
            # nothing to set up, and nothing is bound by ``as``
            return None

        def __exit__(self, exc_type, exc_value, traceback):
            # False means an exception raised inside the block propagates
            return False

    with plt.style.context((style)) if style != "matplotlib" else dummy_context_mgr():

        if not sbplt:
            # check if negative values, no stacked if so
            if areamode:
                kwargs["legend"] = False
                if dataframe.applymap(lambda x: x < 0.0).any().any():
                    kwargs["stacked"] = False
                    rev_leg = False
            ax = dataframe.plot(figsize=figsize, **kwargs)
            if areamode:
                handles, labels = plt.gca().get_legend_handles_labels()
                del handles
                del labels
        else:
            plt.gcf().set_tight_layout(False)
            if not piemode:
                ax = dataframe.plot(figsize=figsize, **kwargs)
            else:
                ax = dataframe.plot(figsize=figsize, **kwargs)
                handles, labels = plt.gca().get_legend_handles_labels()
                plt.legend(
                    handles,
                    labels,
                    loc=leg_options["loc"],
                    bbox_to_anchor=(0, -0.1, 1, 1),
                    bbox_transform=plt.gcf().transFigure,
                )

                # this line allows layouts with missing plots
                # i.e. layout = (5, 2) with only nine plots
                plt.gcf().set_tight_layout(False)

        if "rot" in kwargs:
            if kwargs["rot"] != 0 and kwargs["rot"] != 90:
                labels = [item.get_text() for item in ax.get_xticklabels()]
                ax.set_xticklabels(labels, rotation=kwargs["rot"], ha="right")

        if transparent:
            plt.gcf().patch.set_facecolor("white")
            plt.gcf().patch.set_alpha(0)

        if black_and_white:
            if kind == "line":
                # white background
                # change everything to black and white with interesting dashes and markers
                c = 0
                for line in ax.get_lines():
                    line.set_color("black")
                    # line.set_width(1)
                    line.set_dashes(COLORMAP[c]["dash"])
                    line.set_marker(COLORMAP[c]["marker"])
                    line.set_markersize(MARKERSIZE)
                    c += 1
                    if c == len(list(COLORMAP.keys())):
                        c = 0

        # draw legend with proper placement etc
        if legend:
            if not piemode and not sbplt:
                if 3 not in interactive_types:
                    handles, labels = plt.gca().get_legend_handles_labels()
                    # area doubles the handles and labels. this removes half:
                    if areamode:
                        handles = handles[-len(handles) / 2 :]
                        labels = labels[-len(labels) / 2 :]
                    if rev_leg:
                        handles = handles[::-1]
                        labels = labels[::-1]
                    lgd = plt.legend(handles, labels, **leg_options)

    if interactive:
        # 1 = highlight lines
        # 2 = line labels
        # 3 = legend switches
        ax = plt.gca()
        # fails for piemode
        lines = ax.lines
        handles, labels = plt.gca().get_legend_handles_labels()
        if 1 in interactive_types:
            plugins.connect(plt.gcf(), HighlightLines(lines))

        if 3 in interactive_types:
            plugins.connect(plt.gcf(), InteractiveLegendPlugin(lines, labels, alpha_unsel=0.0))

        for i, l in enumerate(lines):
            y_vals = l.get_ydata()
            x_vals = l.get_xdata()
            x_vals = [str(x) for x in x_vals]
            if absolutes:
                ls = ["%s (%s: %d)" % (labels[i], x_val, y_val) for x_val, y_val in zip(x_vals, y_vals)]
            else:
                ls = ["%s (%s: %.2f%%)" % (labels[i], x_val, y_val) for x_val, y_val in zip(x_vals, y_vals)]
            if 2 in interactive_types:
                # if 'kind' in kwargs and kwargs['kind'] == 'area':
                tooltip_line = mpld3.plugins.LineLabelTooltip(lines[i], labels[i])
                mpld3.plugins.connect(plt.gcf(), tooltip_line)
                # else:
                if kind == "line":
                    tooltip_point = mpld3.plugins.PointLabelTooltip(l, labels=ls)
                    mpld3.plugins.connect(plt.gcf(), tooltip_point)

    if piemode:
        if not sbplt:
            plt.axis("equal")
            ax.get_xaxis().set_visible(False)
            ax.get_yaxis().set_visible(False)

    # add x label
    # this could be revised now!
    # if time series period, it's year for now
    if type(dataframe.index) == pandas.tseries.period.PeriodIndex:
        x_label = "Year"

    if x_label is not False:
        if type(x_label) == str:
            plt.xlabel(x_label)
        else:
            check_x_axis = list(dataframe.index)[0]  # get first entry# get second entry of first entry (year, count)
            try:
                if type(dataframe.index) == pandas.tseries.period.PeriodIndex:
                    x_label = "Year"
                check_x_axis = int(check_x_axis)
                if 1500 < check_x_axis < 2050:
                    x_label = "Year"
                else:
                    x_label = "Group"
            except:
                x_label = "Group"

        if not sbplt:
            if not piemode:
                plt.xlabel(x_label)

    def is_number(s):
        """Check whether *s* can be made into a float or a complex number.

        :param s: Value to test, typically a str or a number
        :returns: True if ``float(s)`` or ``complex(s)`` succeeds, else False.
                  Values of an unsupported type (None, tuples, ...) give
                  False instead of raising TypeError.
        """
        # float covers int and float input; complex is the fallback
        for convert in (float, complex):
            try:
                convert(s)
            except (TypeError, ValueError):
                continue
            return True
        return False

    # for now, always turn off sci notation
    from matplotlib.ticker import ScalarFormatter

    if type(dataframe.index) != pandas.tseries.period.PeriodIndex:
        try:
            if all(is_number(s) for s in list(dataframe.index)):
                plt.gca().xaxis.set_major_formatter(ScalarFormatter())
        except:
            pass
    try:
        if all(is_number(s) for s in list(dataframe.columns)):
            plt.gca().yaxis.set_major_formatter(ScalarFormatter())
    except:
        pass

    # y labelling
    y_l = False
    if not absolutes:
        y_l = "Percentage"
    else:
        y_l = "Absolute frequency"

    def suplabel(axis, label, label_prop=None, labelpad=5, ha="center", va="center"):
        """Add a super ylabel or xlabel to the current figure.

        Similar to matplotlib.suptitle.

        axis       - string: "x" or "y" (case-insensitive)
        label      - string
        label_prop - keyword dictionary for Text
        labelpad   - padding from the axis (default: 5)
        ha         - horizontal alignment (default: "center")
        va         - vertical alignment (default: "center")

        Raises Exception for any other *axis* value.
        """
        fig = plt.gcf()
        # smallest left and bottom edge over all axes in the figure
        boxes = [subplot.get_position() for subplot in fig.axes]
        xmin = min(box.xmin for box in boxes)
        ymin = min(box.ymin for box in boxes)
        dpi = fig.dpi

        which = axis.lower()
        if which == "y":
            rotation, x, y = 90.0, xmin - float(labelpad) / dpi, 0.5
        elif which == "x":
            rotation, x, y = 0.0, 0.5, ymin - float(labelpad) / dpi
        else:
            raise Exception("Unexpected axis: x or y")

        text_kwargs = dict() if label_prop is None else label_prop
        fig.text(x, y, label, rotation=rotation, transform=fig.transFigure, ha=ha, va=va, **text_kwargs)

    if y_label is not False:
        if not sbplt:
            if not piemode:
                if type(y_label) == str:
                    plt.ylabel(y_label)
                else:
                    plt.ylabel(y_l)
        else:
            if type(y_label) == str:
                the_y = y_label
            else:
                the_y = y_l
            # suplabel('y', the_y, labelpad = 1.5)
            plt.gcf().text(0.04, 0.5, the_y, va="center", rotation="vertical")
            # plt.subplots_adjust(left=0.5)

        #    if not piemode:
        #        if type(y_label) == str:
        #            plt.ylabel(y_label)
        #        else:
        #            plt.ylabel(y_l)

    # hacky: turn legend into subplot titles :)
    if sbplt:
        # title the big plot
        # plt.gca().suptitle(title, fontsize = 16)
        # plt.subplots_adjust(top=0.9)
        # get all axes
        if "layout" not in kwargs:
            axes = [l for index, l in enumerate(ax)]
        else:
            axes = []
            cols = [l for index, l in enumerate(ax)]
            for col in cols:
                for bit in col:
                    axes.append(bit)

        # set subplot titles
        for index, a in enumerate(axes):
            try:
                titletext = list(dataframe.columns)[index]
            except:
                pass
            a.set_title(titletext)
            try:
                a.legend_.remove()
            except:
                pass
            # remove axis labels for pie plots
            if piemode:
                a.axes.get_xaxis().set_visible(False)
                a.axes.get_yaxis().set_visible(False)
                a.axis("equal")

            # show grid
            a.grid(b=kwargs.get("grid", False))
            kwargs.pop("grid", None)

    # add sums to bar graphs and pie graphs
    # doubled right now, no matter

    if not sbplt:
        if kind.startswith("bar"):
            width = ax.containers[0][0].get_width()

        # show grid
        ax.grid(b=kwargs.get("grid", False))
        kwargs.pop("grid", None)

    if was_series:
        the_y_limit = plt.ylim()[1]
        if show_totals.endswith("plot") or show_totals.endswith("both"):
            # make plot a bit higher if putting these totals on it
            plt.ylim([0, the_y_limit * 1.05])
            for i, label in enumerate(list(dataframe.index)):
                if len(dataframe.ix[label]) == 1:
                    score = dataframe.ix[label][0]
                else:
                    if absolutes:
                        score = dataframe.ix[label].sum()
                    else:
                        # import warnings
                        # warnings.warn("It's not possible to determine total percentage from individual percentages.")
                        continue
                if not absolutes:
                    plt.annotate("%.2f" % score, (i, score), ha="center", va="bottom")
                else:
                    plt.annotate(score, (i, score), ha="center", va="bottom")
    else:
        the_y_limit = plt.ylim()[1]
        if show_totals.endswith("plot") or show_totals.endswith("both"):
            for i, label in enumerate(list(dataframe.columns)):
                if len(dataframe[label]) == 1:
                    score = dataframe[label][0]
                else:
                    if absolutes:
                        score = dataframe[label].sum()
                    else:
                        # import warnings
                        # warnings.warn("It's not possible to determine total percentage from individual percentages.")
                        continue
                if not absolutes:
                    plt.annotate("%.2f" % score, (i, score), ha="center", va="bottom")
                else:
                    plt.annotate(score, (i, score), ha="center", va="bottom")

    plt.subplots_adjust(left=0.1)
    plt.subplots_adjust(bottom=0.18)

    if "layout" not in kwargs:
        if not sbplt:
            plt.tight_layout()

    if save:
        import os

        if running_python_tex:
            imagefolder = "../images"
        else:
            imagefolder = "images"

        savename = get_savename(imagefolder, save=save, title=title, ext=output_format)

        if not os.path.isdir(imagefolder):
            os.makedirs(imagefolder)

        # save image and get on with our lives
        if legend_pos.startswith("o"):
            plt.gcf().savefig(savename, dpi=150, bbox_extra_artists=(lgd,), bbox_inches="tight", format=output_format)
        else:
            plt.gcf().savefig(savename, dpi=150, format=output_format)
        time = strftime("%H:%M:%S", localtime())
        if os.path.isfile(savename):
            print("\n" + time + ": " + savename + " created.")
        else:
            raise ValueError("Error making %s." % savename)

    if dragmode:
        plt.legend().draggable()

    if sbplt:
        plt.subplots_adjust(right=0.8)
        plt.subplots_adjust(left=0.1)

    if not interactive and not running_python_tex and not running_spider and not tk:
        plt.gcf().show()
        return
    elif running_spider or tk:
        return plt

    if interactive:
        plt.subplots_adjust(right=0.8)
        plt.subplots_adjust(left=0.1)
        try:
            ax.legend_.remove()
        except:
            pass
        return mpld3.display()
Example #16
0
# subtract `series` from `frame`, matching on the row index (axis=0)
frame.sub(series, axis=0)

# function mapping
f = lambda x: x.max() - x.min()
frame.apply(f)  # column-wise operation
frame.apply(f, axis=1)  # row-wise operation


def f(x):
    """Return the min and max of ``x`` as a two-element labelled Series."""
    return Series([x.min(), x.max()], index=["min", "max"])


# a function returning a Series gives one output row per returned label
frame.apply(f)

f = lambda x: "%.2f" % x
# element-wise operation; DataFrame.applymap was renamed DataFrame.map in
# pandas 2.1, so use whichever this pandas version provides
(frame.map if hasattr(frame, "map") else frame.applymap)(f)


# sort index and values
obj = Series([4, 7, -3, 2])
# Series.order() was removed from pandas; sort_values() is its replacement
obj.sort_values()
frame = DataFrame(np.random.randn(5, 4), index=["three", "one", "two", "five", "four"], columns=["d", "a", "b", "c"])
frame.sort_index()
frame.sort_index(axis=1)
frame.sort_values(by="a")  # frame.sort_values(by=['a','b'])

obj.rank()
obj.rank(method="first")  # 'average', 'min', 'max', 'first'

frame.rank(axis=1)
# -*- coding: utf-8 -*-

import numpy as np
from pandas import Series, DataFrame

# NOTE: every print call below takes exactly one argument, so the output is the
# same whether print is the Python 2 statement or the Python 3 function.
print("函数")  # heading: "functions"
frame = DataFrame(np.random.randn(4, 3), columns=list("bde"), index=["Utah", "Ohio", "Texas", "Oregon"])
print(frame)
print(np.abs(frame))
print("")

print("lambda以及应用")  # heading: "lambda and its use"
f = lambda x: x.max() - x.min()
print(frame.apply(f))  # column-wise
print(frame.apply(f, axis=1))  # row-wise


def f(x):
    """Return the min and max of ``x`` as a two-element labelled Series."""
    return Series([x.min(), x.max()], index=["min", "max"])


print(frame.apply(f))
print("")

print("applymap和map")  # heading: "applymap and map"
_format = lambda x: "%.2f" % x
# DataFrame.applymap was renamed DataFrame.map in pandas 2.1; use whichever
# this pandas version provides
elementwise = frame.map if hasattr(frame, "map") else frame.applymap
print(elementwise(_format))
print(frame["e"].map(_format))
Example #18
0
def plotter(
    title,
    df,
    x_label=None,
    y_label=None,
    style="ggplot",
    figsize=(8, 4),
    save=False,
    legend_pos="best",
    reverse_legend="guess",
    num_to_plot=7,
    tex="try",
    colours="Paired",
    cumulative=False,
    pie_legend=True,
    partial_pie=False,
    show_totals=False,
    transparent=False,
    output_format="png",
    interactive=False,
    black_and_white=False,
    show_p_val=False,
    indices="guess",
    **kwargs
):
    """plot interrogator() or editor() output.

    **kwargs are for pandas first, which can then send them through to matplotlib.plot():

    http://pandas.pydata.org/pandas-docs/dev/generated/pandas.DataFrame.plot.html
    http://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.plot

    pie_legend: False to label slices rather than give legend
    show_totals: where to show percent/abs frequencies: False, 'plot', 'legend', or 'both'

    """

    import corpkit
    import os
    import matplotlib as mpl

    if interactive:
        import matplotlib.pyplot as plt, mpld3
    else:
        import matplotlib.pyplot as plt
    from matplotlib import rc
    import pandas
    import pandas as pd
    from pandas import DataFrame

    import numpy
    from time import localtime, strftime
    from corpkit.tests import check_pytex, check_spider, check_t_kinter

    if interactive:
        import mpld3
        import collections
        from mpld3 import plugins, utils
        from plugins import InteractiveLegendPlugin, HighlightLines

    tk = check_t_kinter()

    running_python_tex = check_pytex()
    # incorrect spelling of spider on purpose
    running_spider = check_spider()

    def truncate_colormap(cmap, minval=0.0, maxval=1.0, n=100):
        """build a new colourmap from the minval..maxval slice of cmap

        used to cut the extremes off a colourmap --- no pure white
        """
        import matplotlib.colors as colors
        import numpy as np

        # name records the source map and the slice that was taken
        label = "trunc({n},{a:.2f},{b:.2f})".format(n=cmap.name, a=minval, b=maxval)
        # n evenly spaced colours sampled from the chosen part of the source map
        samples = cmap(np.linspace(minval, maxval, n))
        return colors.LinearSegmentedColormap.from_list(label, samples)

    def get_savename(imagefolder, save=False, title=False, ext="png"):
        """Come up with the savename for the image.

        The file name is a slug of ``save`` when ``save`` is a string, and of
        ``title`` otherwise.

        imagefolder - directory the image will be written to
        save        - wanted file name (str); any other value means "use title"
        title       - plot title, used when ``save`` is not a string
        ext         - file extension, with or without the leading dot

        Returns ``imagefolder`` joined with ``<slug><ext>``.
        Raises ValueError when neither ``save`` nor ``title`` provides a name.
        """
        import os
        import re

        def urlify(s):
            "Turn title into filename"
            s = s.lower()
            # drop everything but word characters, whitespace and hyphens
            # (this also deletes dots)
            s = re.sub(r"[^\w\s-]", "", s)
            s = re.sub(r"\s+", "-", s)
            # TeX commands lose their backslash and braces above; remove the
            # leftover command name where it starts a word
            s = re.sub(r"-(textbf|emph|textsc|textit)", "-", s)
            return s

        if not ext.startswith("."):
            ext = "." + ext

        if isinstance(save, str):
            name = save
        elif title:
            name = title
        else:
            raise ValueError("Cannot build a savename: save is not a string and no title was given.")

        # Remove an extension the caller already typed. This has to happen
        # before urlify(), which deletes dots and would otherwise turn
        # "plot.png" into "plotpng.png".
        if name.lower().endswith(ext.lower()):
            name = name[: -len(ext)]

        return os.path.join(imagefolder, urlify(name) + ext)

    def rename_data_with_total(dataframe, was_series=False, using_tex=False, absolutes=True):
        """adds totals (abs, rel) to entry name strings

        dataframe  - data being plotted; entries are the index when was_series,
                     otherwise the columns
        was_series - True when the data is a single column
        using_tex  - True to escape the percent sign for TeX
        absolutes  - True for absolute frequencies ("name (n=12)"), False for
                     percentages ("name (12.50 %)")

        Returns the relabelled frame. When not was_series the columns of the
        frame passed in are renamed in place and that frame is returned;
        otherwise a new single-column frame named "Total" is built.
        Percentages of multi-column data cannot be totalled, so those names
        are left unchanged.
        """
        if was_series:
            names = list(dataframe.index)
            # transpose once, so each entry can be looked up as a column
            by_entry = dataframe.T
        else:
            names = list(dataframe.columns)
            by_entry = dataframe

        # "\\%%" renders as an escaped percent sign for TeX
        percent_format = "%s (%.2f\\%%)" if using_tex else "%s (%.2f %%)"

        the_labs = []
        for w in names:
            if absolutes:
                the_labs.append("%s (n=%d)" % (w, by_entry[w].sum()))
            elif was_series:
                # first value by position; the entry's Series is labelled by column name
                the_labs.append(percent_format % (w, by_entry[w].iloc[0]))
            else:
                the_labs.append(w)

        if not was_series:
            dataframe.columns = the_labs
            return dataframe

        vals = list(dataframe[list(dataframe.columns)[0]].values)
        renamed = pd.DataFrame(vals, index=the_labs)
        renamed.columns = ["Total"]
        return renamed

    def auto_explode(dataframe, input, was_series=False, num_to_plot=7):
        """turn entry name(s)/position(s) into a list for the explode option

        input may be a str (entry name), an int (position) or a list mixing
        both. the result has num_to_plot items: 0.1 for each selected entry,
        0 for the rest. any other kind of input selects nothing.
        """
        offsets = [0] * num_to_plot
        names = list(dataframe.index) if was_series else list(dataframe.columns)

        # a lone name or position is handled as a one-item list
        wanted = [input] if type(input) in (str, int) else input
        if type(wanted) != list:
            return offsets

        for item in wanted:
            # names are looked up among the entries; anything else is a position
            slot = names.index(item) if type(item) == str else item
            offsets[slot] = 0.1
        return offsets

    # are we doing subplots?
    sbplt = False
    if "subplots" in kwargs:
        if kwargs["subplots"] is True:
            sbplt = True

    if colours is True:
        colours = "Paired"

    styles = ["dark_background", "bmh", "grayscale", "ggplot", "fivethirtyeight"]
    if style not in styles:
        raise ValueError("Style %s not found. Use %s" % (style, ", ".join(styles)))

    if "savepath" in kwargs.keys():
        mpl.rcParams["savefig.directory"] = kwargs["savepath"]
        del kwargs["savepath"]

    mpl.rcParams["savefig.bbox"] = "tight"

    # try to use tex
    # TO DO:
    # make some font kwargs here
    using_tex = False
    mpl.rcParams["font.family"] = "sans-serif"
    mpl.rcParams["text.latex.unicode"] = True

    if tex == "try" or tex is True:
        try:
            rc("text", usetex=True)
            rc("font", **{"family": "serif", "serif": ["Computer Modern"]})
            using_tex = True
        except:
            matplotlib.rc("font", family="sans-serif")
            matplotlib.rc("font", serif="Helvetica Neue")
            matplotlib.rc("text", usetex="false")
            rc("text", usetex=False)
    else:
        rc("text", usetex=False)

    if interactive:
        using_tex = False

    if show_totals is False:
        show_totals = "none"

    # find out what kind of plot we're making, and enable
    # or disable interactive values if need be
    if "kind" not in kwargs:
        kwargs["kind"] = "line"

    if interactive:
        if kwargs["kind"].startswith("bar"):
            interactive_types = [3]
        elif kwargs["kind"] == "area":
            interactive_types = [2, 3]
        elif kwargs["kind"] == "line":
            interactive_types = [2, 3]
        elif kwargs["kind"] == "pie":
            interactive_types = None
            warnings.warn("Interactive plotting not yet available for pie plots.")
        else:
            interactive_types = [None]
    if interactive is False:
        interactive_types = [None]

    # find out if pie mode, add autopct format
    piemode = False
    if "kind" in kwargs:
        if kwargs["kind"] == "pie":
            piemode = True
            # always the best spot for pie
            # if legend_pos == 'best':
            # legend_pos = 'lower left'
            if show_totals.endswith("plot") or show_totals.endswith("both"):
                kwargs["pctdistance"] = 0.6
                if using_tex:
                    kwargs["autopct"] = r"%1.1f\%%"
                else:
                    kwargs["autopct"] = "%1.1f%%"

    # if piemode:
    # if partial_pie:
    # kwargs['startangle'] = 180

    kwargs["subplots"] = sbplt

    # copy data, make series into df
    dataframe = df.copy()
    was_series = False
    if type(dataframe) == pandas.core.series.Series:
        was_series = True
        if not cumulative:
            dataframe = DataFrame(dataframe)
        else:
            dataframe = DataFrame(dataframe.cumsum())
    else:
        # don't know if this is much good.
        if cumulative:
            dataframe = DataFrame(dataframe.cumsum())
        if len(list(dataframe.columns)) == 1:
            was_series = True

    # attempt to convert x axis to ints:
    try:
        dataframe.index = [int(i) for i in list(dataframe.index)]
    except:
        pass

    # remove totals and tkinter order
    if not was_series:
        for name, ax in zip(["Total"] * 2 + ["tkintertable-order"] * 2, [0, 1, 0, 1]):
            dataframe = dataframe.drop(name, axis=ax, errors="ignore")
    else:
        dataframe = dataframe.drop("tkintertable-order", errors="ignore")
        dataframe = dataframe.drop("tkintertable-order", axis=1, errors="ignore")

    # look at columns to see if all can be ints, in which case, set up figure
    # for depnumming
    if not was_series:
        if indices == "guess":

            def isint(x):
                """Return True if ``x`` converts to a float with no fractional part."""
                try:
                    a = float(x)
                    b = int(a)
                except (ValueError, OverflowError, TypeError):
                    # ValueError: not numeric, or NaN; OverflowError: infinity;
                    # TypeError: a type float() cannot take. The exceptions must
                    # be listed as a tuple -- `ValueError or OverflowError`
                    # evaluates to ValueError alone.
                    return False
                else:
                    return a == b

            if all([isint(x) is True for x in list(dataframe.columns)]):
                indices = True
            else:
                indices = False

        # if depnumming, plot all, transpose, and rename axes
        if indices is True:
            num_to_plot = "all"
            dataframe = dataframe.T
            if y_label is None:
                y_label = "Percentage of all matches"
            if x_label is None:
                x_label = ""

    # set backend?
    output_formats = ["svgz", "ps", "emf", "rgba", "raw", "pdf", "svg", "eps", "png", "pgf"]
    if output_format not in output_formats:
        raise ValueError("%s output format not recognised. Must be: %s" % (output_format, ", ".join(output_formats)))

    # don't know if these are necessary
    if "pdf" in output_format:
        plt.switch_backend(output_format)
    if "pgf" in output_format:
        plt.switch_backend(output_format)

    if num_to_plot == "all":
        if was_series:
            if not piemode:
                num_to_plot = len(dataframe)
            else:
                num_to_plot = len(dataframe)
        else:
            if not piemode:
                num_to_plot = len(list(dataframe.columns))
            else:
                num_to_plot = len(dataframe.index)

    # explode pie, or remove if not piemode
    if "explode" in kwargs:
        if not piemode:
            del kwargs["explode"]
    if piemode:
        if "explode" in kwargs:
            if not sbplt:
                kwargs["explode"] = auto_explode(
                    dataframe, kwargs["explode"], was_series=was_series, num_to_plot=num_to_plot
                )

    if "legend" in kwargs:
        legend = kwargs["legend"]
    else:
        legend = True

    # cut data short
    plotting_a_totals_column = False
    if was_series:
        if list(dataframe.columns)[0] != "Total":
            try:
                can_be_ints = [int(x) for x in list(dataframe.index)]
                num_to_plot = len(dataframe)
            except:
                dataframe = dataframe[:num_to_plot]
        elif list(dataframe.columns)[0] == "Total":
            plotting_a_totals_column = True
            if not "legend" in kwargs:
                legend = False
            num_to_plot = len(dataframe)
    else:
        dataframe = dataframe.T.head(num_to_plot).T

    # remove stats fields, put p in entry text, etc.
    statfields = ["slope", "intercept", "r", "p", "stderr"]
    try:
        dataframe = dataframe.drop(statfields, axis=1)
    except:
        pass
    try:
        dataframe.ix["p"]
        there_are_p_vals = True
    except:
        there_are_p_vals = False
    if show_p_val:
        if there_are_p_vals:
            newnames = []
            for col in list(dataframe.columns):
                pval = dataframe[col]["p"]
                newname = "%s (p=%s)" % (col, format(pval, ".5f"))
                newnames.append(newname)
            dataframe.columns = newnames
            dataframe.drop(statfields, axis=0, inplace=True)
        else:
            warnings.warn(
                "No p-values calculated to show.\n\nUse sort_by and keep_stats in editor() to generate these values."
            )
    else:
        if there_are_p_vals:
            dataframe.drop(statfields, axis=0, inplace=True)

    # make and set y label
    absolutes = True
    if type(dataframe) == pandas.core.frame.DataFrame:
        try:
            if not all([s.is_integer() for s in dataframe.iloc[0, :].values]):
                absolutes = False
        except:
            pass
    else:
        if not all([s.is_integer() for s in dataframe.values]):
            absolutes = False

    #  use colormap if need be:
    if num_to_plot > 0:
        if not was_series:
            if "kind" in kwargs:
                if kwargs["kind"] in ["pie", "line", "area"]:
                    if colours:
                        if not plotting_a_totals_column:
                            if colours == "Default":
                                colours = "Paired"
                            kwargs["colormap"] = colours
            # else:
            if colours:
                if colours == "Default":
                    colours = "Paired"
                kwargs["colormap"] = colours

    if piemode:
        if num_to_plot > 0:
            if colours == "Default":
                colours = "Paired"
            kwargs["colormap"] = colours
        else:
            if num_to_plot > 0:
                if colours == "Default":
                    colours = "Paired"
                kwargs["colormap"] = colours
        # else:
        # if len(dataframe.T.columns) < 8:
        # try:
        # del kwargs['colormap']
        # except:
        # pass

    # multicoloured bar charts
    if "kind" in kwargs:
        if colours:
            if kwargs["kind"].startswith("bar"):
                if len(list(dataframe.columns)) == 1:
                    if not black_and_white:
                        import numpy as np

                        the_range = np.linspace(0, 1, num_to_plot)
                        cmap = plt.get_cmap(colours)
                        kwargs["colors"] = [cmap(n) for n in the_range]
                    # make a bar width ... ?
                    # kwargs['width'] = (figsize[0] / float(num_to_plot)) / 1.5

    # reversing legend option
    if reverse_legend is True:
        rev_leg = True
    elif reverse_legend is False:
        rev_leg = False

    # show legend or don't, guess whether to reverse based on kind
    if "kind" in kwargs:
        if kwargs["kind"] in ["bar", "barh", "area", "line", "pie"]:
            if was_series:
                legend = False
            if kwargs["kind"] == "pie":
                if pie_legend:
                    legend = True
                else:
                    legend = False
        if kwargs["kind"] in ["barh", "area"]:
            if reverse_legend == "guess":
                rev_leg = True
    if not "rev_leg" in locals():
        rev_leg = False

    # the default legend placement
    if legend_pos is True:
        legend_pos = "best"

    # cut dataframe if just_totals
    try:
        tst = dataframe["Combined total"]
        dataframe = dataframe.head(num_to_plot)
    except:
        pass

    # rotate automatically
    if "rot" not in kwargs:
        if not was_series:
            xvals = [str(i) for i in list(dataframe.index)[:num_to_plot]]
            # if 'kind' in kwargs:
            # if kwargs['kind'] in ['barh', 'area']:
            # xvals = [str(i) for i in list(dataframe.columns)[:num_to_plot]]
        else:
            xvals = [str(i) for i in list(dataframe.columns)[:num_to_plot]]
        if len(max(xvals, key=len)) > 6:
            if not piemode:
                kwargs["rot"] = 45

    # no title for subplots because ugly,
    if sbplt:
        if "title" in kwargs:
            del kwargs["title"]
    else:
        kwargs["title"] = title

    # no interactive subplots yet:

    if sbplt and interactive:
        import warnings

        interactive = False
        warnings.warn("No interactive subplots yet, sorry.")
        return

    # not using pandas for labels or legend anymore.
    # kwargs['labels'] = None
    # kwargs['legend'] = False

    if legend:
        # kwarg options go in leg_options
        leg_options = {"framealpha": 0.8}
        if "shadow" in kwargs:
            leg_options["shadow"] = True
        if "ncol" in kwargs:
            leg_options["ncol"] = kwargs["ncol"]
            del kwargs["ncol"]
        else:
            if num_to_plot > 6:
                leg_options["ncol"] = num_to_plot / 7

        # determine legend position based on this dict
        if legend_pos:
            possible = {
                "best": 0,
                "upper right": 1,
                "upper left": 2,
                "lower left": 3,
                "lower right": 4,
                "right": 5,
                "center left": 6,
                "center right": 7,
                "lower center": 8,
                "upper center": 9,
                "center": 10,
                "o r": 2,
                "outside right": 2,
                "outside upper right": 2,
                "outside center right": "center left",
                "outside lower right": "lower left",
            }

            if type(legend_pos) == int:
                the_loc = legend_pos
            elif type(legend_pos) == str:
                try:
                    the_loc = possible[legend_pos]
                except KeyError:
                    raise KeyError(
                        "legend_pos value must be one of:\n%s\n or an int between 0-10." % ", ".join(possible.keys())
                    )
            leg_options["loc"] = the_loc
            # weirdness needed for outside plot
            if legend_pos in ["o r", "outside right", "outside upper right"]:
                leg_options["bbox_to_anchor"] = (1.02, 1)
            if legend_pos == "outside center right":
                leg_options["bbox_to_anchor"] = (1.02, 0.5)
            if legend_pos == "outside lower right":
                leg_options["loc"] == "upper right"
                leg_options["bbox_to_anchor"] = (0.5, 0.5)

        # a bit of distance between legend and plot for outside legends
        if type(legend_pos) == str:
            if legend_pos.startswith("o"):
                leg_options["borderaxespad"] = 1

    if not piemode:
        if show_totals.endswith("both") or show_totals.endswith("legend"):
            dataframe = rename_data_with_total(
                dataframe, was_series=was_series, using_tex=using_tex, absolutes=absolutes
            )
    else:
        if pie_legend:
            if show_totals.endswith("both") or show_totals.endswith("legend"):
                dataframe = rename_data_with_total(
                    dataframe, was_series=was_series, using_tex=using_tex, absolutes=absolutes
                )

    if piemode:
        if partial_pie:
            dataframe = dataframe / 100.0

    # some pie things
    if piemode:
        if not sbplt:
            kwargs["y"] = list(dataframe.columns)[0]
            if pie_legend:
                kwargs["legend"] = False
                if was_series:
                    leg_options["labels"] = list(dataframe.index)
                else:
                    leg_options["labels"] = list(dataframe.columns)
        else:
            if pie_legend:
                kwargs["legend"] = False
                if was_series:
                    leg_options["labels"] = list(dataframe.index)
                else:
                    leg_options["labels"] = list(dataframe.index)

    areamode = False
    if "kind" in kwargs:
        if kwargs["kind"] == "area":
            areamode = True

    if legend is False:
        kwargs["legend"] = False

    # cumulative grab first col
    if cumulative:
        kwargs["y"] = list(dataframe.columns)[0]

    # line highlighting option for interactive!
    if interactive:
        if 2 in interactive_types:
            if kwargs["kind"] == "line":
                kwargs["marker"] = ","
        if not piemode:
            kwargs["alpha"] = 0.1

    # convert dates --- works only in my current case!
    if plotting_a_totals_column or not was_series:
        try:
            can_it_be_int = int(list(dataframe.index)[0])
            can_be_int = True
        except:
            can_be_int = False
        if can_be_int:
            if 1500 < int(list(dataframe.index)[0]):
                if 2050 > int(list(dataframe.index)[0]):
                    n = pd.PeriodIndex([d for d in list(dataframe.index)], freq="A")
                    dataframe = dataframe.set_index(n)

    MARKERSIZE = 4
    COLORMAP = {
        0: {"marker": None, "dash": (None, None)},
        1: {"marker": None, "dash": [5, 5]},
        2: {"marker": "o", "dash": (None, None)},
        3: {"marker": None, "dash": [1, 3]},
        4: {"marker": "s", "dash": [5, 2, 5, 2, 5, 10]},
        5: {"marker": None, "dash": [5, 3, 1, 2, 1, 10]},
        6: {"marker": "o", "dash": (None, None)},
        7: {"marker": None, "dash": [5, 3, 1, 3]},
        8: {"marker": "1", "dash": [1, 3]},
        9: {"marker": "*", "dash": [5, 5]},
        10: {"marker": "2", "dash": [5, 2, 5, 2, 5, 10]},
        11: {"marker": "s", "dash": (None, None)},
    }

    HATCHES = {
        0: {"color": "#dfdfdf", "hatch": "/"},
        1: {"color": "#6f6f6f", "hatch": "\\"},
        2: {"color": "b", "hatch": "|"},
        3: {"color": "#dfdfdf", "hatch": "-"},
        4: {"color": "#6f6f6f", "hatch": "+"},
        5: {"color": "b", "hatch": "x"},
    }

    if black_and_white:
        if kwargs["kind"] == "line":
            kwargs["linewidth"] = 1

        cmap = plt.get_cmap("Greys")
        new_cmap = truncate_colormap(cmap, 0.25, 0.95)
        if kwargs["kind"] == "bar":
            # darker if just one entry
            if len(dataframe.columns) == 1:
                new_cmap = truncate_colormap(cmap, 0.70, 0.90)
        kwargs["colormap"] = new_cmap

    # use styles and plot

    with plt.style.context((style)):

        if not sbplt:
            # check if negative values, no stacked if so
            if areamode:
                if dataframe.applymap(lambda x: x < 0.0).any().any():
                    kwargs["stacked"] = False
                    rev_leg = False
            ax = dataframe.plot(figsize=figsize, **kwargs)
        else:
            if not piemode and not sbplt:
                ax = dataframe.plot(figsize=figsize, **kwargs)
            else:
                ax = dataframe.plot(figsize=figsize, **kwargs)
                handles, labels = plt.gca().get_legend_handles_labels()
                plt.legend(
                    handles,
                    labels,
                    loc=leg_options["loc"],
                    bbox_to_anchor=(0, -0.1, 1, 1),
                    bbox_transform=plt.gcf().transFigure,
                )
                if not tk:
                    plt.show()
                    return
        if "rot" in kwargs:
            if kwargs["rot"] != 0 and kwargs["rot"] != 90:
                labels = [item.get_text() for item in ax.get_xticklabels()]
                ax.set_xticklabels(labels, rotation=kwargs["rot"], ha="right")

        if transparent:
            plt.gcf().patch.set_facecolor("white")
            plt.gcf().patch.set_alpha(0)

        if black_and_white:
            # plt.grid()
            plt.gca().set_axis_bgcolor("w")
            if kwargs["kind"] == "line":
                # white background

                # change everything to black and white with interesting dashes and markers
                c = 0
                for line in ax.get_lines():
                    line.set_color("black")
                    # line.set_width(1)
                    line.set_dashes(COLORMAP[c]["dash"])
                    line.set_marker(COLORMAP[c]["marker"])
                    line.set_markersize(MARKERSIZE)
                    c += 1
                    if c == len(COLORMAP.keys()):
                        c = 0

        if legend:
            if not piemode and not sbplt:
                if 3 not in interactive_types:
                    if not rev_leg:
                        lgd = plt.legend(**leg_options)
                    else:
                        handles, labels = plt.gca().get_legend_handles_labels()
                        lgd = plt.legend(handles[::-1], labels[::-1], **leg_options)

            # if black_and_white:
            # lgd.set_facecolor('w')

        # if interactive:
        # if legend:
        # lgd.set_title("")
        # if not sbplt:
        # if 'layout' not in kwargs:
        # plt.tight_layout()

    if interactive:
        # 1 = highlight lines
        # 2 = line labels
        # 3 = legend switches
        ax = plt.gca()
        # fails for piemode
        lines = ax.lines
        handles, labels = plt.gca().get_legend_handles_labels()
        if 1 in interactive_types:
            plugins.connect(plt.gcf(), HighlightLines(lines))

        if 3 in interactive_types:
            plugins.connect(plt.gcf(), InteractiveLegendPlugin(lines, labels, alpha_unsel=0.0))

        for i, l in enumerate(lines):
            y_vals = l.get_ydata()
            x_vals = l.get_xdata()
            x_vals = [str(x) for x in x_vals]
            if absolutes:
                ls = ["%s (%s: %d)" % (labels[i], x_val, y_val) for x_val, y_val in zip(x_vals, y_vals)]
            else:
                ls = ["%s (%s: %.2f%%)" % (labels[i], x_val, y_val) for x_val, y_val in zip(x_vals, y_vals)]
            if 2 in interactive_types:
                # if 'kind' in kwargs and kwargs['kind'] == 'area':
                tooltip_line = mpld3.plugins.LineLabelTooltip(lines[i], labels[i])
                mpld3.plugins.connect(plt.gcf(), tooltip_line)
                # else:
                if kwargs["kind"] == "line":
                    tooltip_point = mpld3.plugins.PointLabelTooltip(l, labels=ls)
                    mpld3.plugins.connect(plt.gcf(), tooltip_point)

            # works:
            # plugins.connect(plt.gcf(), plugins.LineLabelTooltip(l, labels[i]))

        # labels = ["Point {0}".format(i) for i in range(num_to_plot)]
        # tooltip = plugins.LineLabelTooltip(lines)
        # mpld3.plugins.connect(plt.gcf(), mpld3.plugins.PointLabelTooltip(lines))

    if piemode:
        if not sbplt:
            plt.axis("equal")
            ax.get_xaxis().set_visible(False)
            ax.get_yaxis().set_visible(False)

    # add x label
    # this could be revised now!
    # if time series period, it's year for now
    if type(dataframe.index) == pandas.tseries.period.PeriodIndex:
        x_label = "Year"

    if x_label is not False:
        if type(x_label) == str:
            plt.xlabel(x_label)
        else:
            check_x_axis = list(dataframe.index)[0]  # get first entry# get second entry of first entry (year, count)
            try:
                if type(dataframe.index) == pandas.tseries.period.PeriodIndex:
                    x_label = "Year"
                check_x_axis = int(check_x_axis)
                if 1500 < check_x_axis < 2050:
                    x_label = "Year"
                else:
                    x_label = "Group"
            except:
                x_label = "Group"

        if not sbplt:
            if not piemode:
                plt.xlabel(x_label)

    # no offsets for numerical x and y values
    if type(dataframe.index) != pandas.tseries.period.PeriodIndex:
        try:
            # check if x axis can be an int
            check_x_axis = list(dataframe.index)[0]
            can_it_be_int = int(check_x_axis)
            # if so, set these things
            from matplotlib.ticker import ScalarFormatter

            plt.gca().xaxis.set_major_formatter(ScalarFormatter())
        except:
            pass

    # same for y axis
    try:
        # check if x axis can be an int
        check_y_axis = list(dataframe.columns)[0]
        can_it_be_int = int(check_y_axis)
        # if so, set these things
        from matplotlib.ticker import ScalarFormatter

        plt.gca().yaxis.set_major_formatter(ScalarFormatter())
    except:
        pass

    # y labelling
    y_l = False
    if not absolutes:
        y_l = "Percentage"
    else:
        y_l = "Absolute frequency"

    if y_label is not False:
        if not sbplt:
            if not piemode:
                if type(y_label) == str:
                    plt.ylabel(y_label)
                else:
                    plt.ylabel(y_l)

    # hacky: turn legend into subplot titles :)
    if sbplt:
        # title the big plot
        # plt.suptitle(title, fontsize = 16)
        # get all axes
        if "layout" not in kwargs:
            axes = [l for index, l in enumerate(ax)]
        else:
            axes = []
            cols = [l for index, l in enumerate(ax)]
            for col in cols:
                for bit in col:
                    axes.append(bit)

        # set subplot titles

        for index, a in enumerate(axes):
            try:
                titletext = list(dataframe.columns)[index]
            except:
                pass
            a.set_title(titletext)
            try:
                a.legend_.remove()
            except:
                pass
            # remove axis labels for pie plots
            if piemode:
                a.axes.get_xaxis().set_visible(False)
                a.axes.get_yaxis().set_visible(False)
                a.axis("equal")

    # add sums to bar graphs and pie graphs
    # doubled right now, no matter

    if not sbplt:
        if "kind" in kwargs:
            if kwargs["kind"].startswith("bar"):
                width = ax.containers[0][0].get_width()

    if was_series:
        the_y_limit = plt.ylim()[1]
        if show_totals.endswith("plot") or show_totals.endswith("both"):
            # make plot a bit higher if putting these totals on it
            plt.ylim([0, the_y_limit * 1.05])
            for i, label in enumerate(list(dataframe.index)):
                if len(dataframe.ix[label]) == 1:
                    score = dataframe.ix[label][0]
                else:
                    if absolutes:
                        score = dataframe.ix[label].sum()
                    else:
                        # import warnings
                        # warnings.warn("It's not possible to determine total percentage from individual percentages.")
                        continue
                if not absolutes:
                    plt.annotate("%.2f" % score, (i, score), ha="center", va="bottom")
                else:
                    plt.annotate(score, (i, score), ha="center", va="bottom")
    else:
        the_y_limit = plt.ylim()[1]
        if show_totals.endswith("plot") or show_totals.endswith("both"):
            for i, label in enumerate(list(dataframe.columns)):
                if len(dataframe[label]) == 1:
                    score = dataframe[label][0]
                else:
                    if absolutes:
                        score = dataframe[label].sum()
                    else:
                        # import warnings
                        # warnings.warn("It's not possible to determine total percentage from individual percentages.")
                        continue
                if not absolutes:
                    plt.annotate("%.2f" % score, (i, score), ha="center", va="bottom")
                else:
                    plt.annotate(score, (i, score), ha="center", va="bottom")

    # if not running_python_tex:
    # plt.gcf().show()

    plt.subplots_adjust(left=0.1)
    plt.subplots_adjust(bottom=0.18)
    # if 'layout' not in kwargs:
    # plt.tight_layout()

    if save:
        import os

        if running_python_tex:
            imagefolder = "../images"
        else:
            imagefolder = "images"

        savename = get_savename(imagefolder, save=save, title=title, ext=output_format)

        if not os.path.isdir(imagefolder):
            os.makedirs(imagefolder)

        # save image and get on with our lives
        if legend_pos.startswith("o"):
            plt.gcf().savefig(savename, dpi=150, bbox_extra_artists=(lgd,), bbox_inches="tight", format=output_format)
        else:
            plt.gcf().savefig(savename, dpi=150, format=output_format)
        time = strftime("%H:%M:%S", localtime())
        if os.path.isfile(savename):
            print "\n" + time + ": " + savename + " created."
        else:
            raise ValueError("Error making %s." % savename)

    if not interactive and not running_python_tex and not running_spider and not tk:
        plt.show()
        return
    if running_spider or tk or sbplt:
        return plt

    if interactive:
        plt.subplots_adjust(right=0.8)
        plt.subplots_adjust(left=0.1)
        try:
            ax.legend_.remove()
        except:
            pass
        return mpld3.display()
Example #19
0
# Work on a copy of `df` (defined earlier in the script, outside this excerpt).
df3 = df.copy()
mean = df3["float_col"].mean()
print(df3)
# fillna is called without inplace, so the filled Series is only printed;
# df3 itself is not modified by this line.
print(df3["float_col"].fillna(mean))

### 4. Map, Apply ###
print("---")
###

# Series.map: prefix every non-missing string; dropna() removes missing
# entries first so the string concatenation is only applied to real values.
print(df["str_col"].dropna().map(lambda x: "map_" + x))

# DataFrame.apply with np.sqrt: square root of the two numeric columns.
print(df.ix[:, ["int_col", "float_col"]].apply(np.sqrt))

# DataFrame.apply with np.sum: called with the default axis, so one sum per column.
print(df.ix[:, ["int_col", "float_col"]].apply(np.sum))

# DataFrame.applymap: `some_fn` (defined outside this excerpt) is applied to every cell.
print(df.applymap(some_fn))

### Vectorized mathematical and string operations ###

# NOTE: `df` is rebound here to a fresh two-column frame.
df = pd.DataFrame(data={"A": [1, 2], "B": [1.2, 1.3]})

# Column-wise arithmetic creates new columns in place on `df`.
df["C"] = df["A"] + df["B"]

print(df)

df["D"] = df["A"] * 3

print(df)

df["E"] = np.sqrt(df["A"])
Example #20
0
class StatFileClass:
    """Load one Excel workbook and classify every row it contains.

    All the work happens in ``__init__``: the file name is inspected for
    currency / account-type markers, per-bank column settings are looked up in
    the module-level ``StatRule1`` table, the workbook is read with
    ``read_excel``, and each row is classified against the module-level
    ``ClassifyRuleDF`` table.

    Main result attributes set by ``__init__``:
      * ``ResultDF``    - per-row result using the classified names/keywords.
      * ``ResultDF2``   - per-row result using the raw name/key text, with the
                          exchange rate, reindexed to a fixed column order.
      * ``BalanceData`` - last balance per date, plus local-currency balance.

    NOTE(review): ``StatRule1``, ``ClassifyRuleDF``, ``JoinStr``,
    ``RemoveComma`` and ``ReturnDate`` are defined outside this class; their
    exact behaviour is not visible here.
    """

    # Class-body ``global`` declaration. Nothing visible in this class assigns
    # FinalSummary. NOTE(review): confirm whether this is still needed.
    global FinalSummary

    def __init__(self, temp=0):
        """Read and classify the workbook whose path/name is ``temp``.

        ``temp`` is used as a string (``find``, slicing, ``in`` tests) and is
        passed to ``read_excel`` as the file to open.
        NOTE(review): the default value ``0`` cannot work, because the body
        immediately calls ``.find`` on it.
        """
        self.FileName = temp
        # Name up to the first "."; if there is no ".", find() returns -1 and
        # the slice drops the last character instead.
        self.FileNameHead = self.FileName[0 : self.FileName.find(".")]
        # Prefix up to and including the two-character bank marker; if the
        # marker is absent, find() returns -1 and only the first character is kept.
        self.BankName = self.FileName[0 : self.FileName.find("银行") + 2]
        # Currency is inferred from markers in the file name; CNY is the fallback.
        if "美元" in temp:
            self.Currency = "USD"
        elif "日币" in temp:
            self.Currency = "JPY"
        else:
            self.Currency = "CNY"
        # Account type is likewise inferred from markers in the file name.
        if "待核查" in temp:
            self.CountType = "待核查"
        elif "专户" in temp:
            self.CountType = "专户"
        else:
            self.CountType = "一般户"
        # Per-bank settings, looked up by BankName in the StatRule1 table.
        self.DateLable = StatRule1.ix[self.BankName, "交易日期字段"]
        self.TimeLable = StatRule1.ix[self.BankName, "交易时间字段"]
        self.TimeFormat = StatRule1.ix[self.BankName, "时间格式"].split(",")
        self.IncomeLable = StatRule1.ix[self.BankName, "收入字段"]
        self.PayLable = StatRule1.ix[self.BankName, "支出字段"]
        self.BalanceLable = StatRule1.ix[self.BankName, "当日余额字段"]
        self.KeyLable1 = StatRule1.ix[self.BankName, "大类字段"]
        self.KeyLable2 = StatRule1.ix[self.BankName, "子类字段"].split("+")  # list of strings
        self.CountLable = StatRule1.ix[self.BankName, "户名字段"]
        # The setting is a 1-based first data row; subtract one to get the number of rows to skip.
        self.SkipRows = StatRule1.ix[self.BankName, "数据开始行"] - 1
        self.ERateUSD = StatRule1.ix[self.BankName, "美元汇率"]
        self.ERateJPY = StatRule1.ix[self.BankName, "日元汇率"]

        # Pick the exchange rate that matches the file's currency (1 for CNY).
        if self.Currency == "USD":
            self.ERate = self.ERateUSD
        elif self.Currency == "JPY":
            self.ERate = self.ERateJPY
        else:
            self.ERate = 1

        # Work out the signed amount per row.
        # Original note: banks other than Bank of China keep income and expense
        # in separate columns, which are merged here as income - expense.
        if self.IncomeLable != self.PayLable:
            self.RawData = read_excel(
                self.FileName,
                skiprows=self.SkipRows,
                converters={self.IncomeLable: str, self.PayLable: str, self.BalanceLable: str},
            )
            self.IncomeData = self.RawData[self.IncomeLable].astype(float).fillna(0)
            self.PayData = self.RawData[self.PayLable].astype(float).fillna(0)
            self.IncomeData = self.IncomeData - self.PayData
        else:
            # Single amount column: used as-is (no fillna in this branch).
            self.RawData = read_excel(
                self.FileName, skiprows=self.SkipRows, converters={self.IncomeLable: str, self.BalanceLable: str}
            )
            self.IncomeData = self.RawData[self.IncomeLable].astype(float)
        self.IncomeType = Series(zeros(self.IncomeData.shape[0]))  # initialise
        # Strictly positive amounts are labelled income; everything else
        # (zero, negative, NaN) falls into the expense branch.
        for i in range(self.IncomeData.shape[0]):
            if self.IncomeData[i] > 0:
                self.IncomeType[i] = "收入"
            else:
                self.IncomeType[i] = "支出"

        # Convert the amounts to local currency (the balance is converted further below).
        self.IncomeDataLocal = self.IncomeData * self.ERate

        # Date / time columns.
        self.Date = self.RawData[self.DateLable].astype(str)  # dates are kept as strings
        self.Time = self.RawData[self.TimeLable].astype(str)
        # If start/end positions are configured, cut the time out of the cell
        # text first (1-based start, inclusive end).
        if len(self.TimeFormat) == 3:
            for i in self.Time.index:
                self.Time[i] = self.Time[i][int(self.TimeFormat[1]) - 1 : int(self.TimeFormat[2])]
        for i in self.Time.index:
            self.Time[i] = datetime.strptime(self.Time[i], self.TimeFormat[0]).strftime(
                "%H:%M:%S"
            )  # parse with the configured format, then re-format as an HH:MM:SS string
        # Main-category keyword per row.
        self.KeyWord1 = Series(zeros(self.IncomeData.shape[0]))  # initialise
        if self.KeyLable1 == "无":
            # No main-category column configured: use the placeholder for every row.
            self.KeyWord1[:] = "无"
        else:
            self.KeyWord1 = self.RawData[self.KeyLable1]

        # Counterparty account name per row.
        self.CountName = Series(zeros(self.IncomeData.shape[0]))  # initialise
        self.CountData = Series(zeros(self.IncomeData.shape[0]))  # initialise
        if self.CountLable == "无":
            self.CountName[:] = "无"
            self.CountData[:] = "无"
        else:
            self.CountData = self.RawData[self.CountLable]
            # NOTE(review): in-place fill on a column taken from RawData; this
            # may also modify RawData depending on the pandas version.
            self.CountData.fillna("无", inplace=True)
            for i in range(self.IncomeData.shape[0]):
                # Candidate names: the unique first-level entries of the rule
                # index under (bank, income type, main category).
                TempData = list(
                    set(
                        list(
                            zip(
                                *list(
                                    ClassifyRuleDF.ix[self.BankName].ix[self.IncomeType[i]].ix[self.KeyWord1[i]].index
                                )
                            )
                        )[0]
                    )
                )  # list of unique account names
                if len(TempData) == 1:  # only one candidate: take it unconditionally
                    self.CountName[i] = TempData[0]
                else:
                    # Otherwise take the first candidate contained in the raw
                    # account text; fall back to the placeholder if none is.
                    bFindResult = False
                    for j in TempData:
                        if j in self.CountData[i]:
                            self.CountName[i] = j
                            bFindResult = True
                            break
                    if not bFindResult:
                        self.CountName[i] = "无"

        # Sub-category text per row, then the classification itself.
        self.KeyWord2 = Series(zeros(self.IncomeData.shape[0]))  # initialise
        self.KeyData2 = self.RawData[self.KeyLable2]
        # Several key columns are combined, so missing values are filled with a
        # space rather than with the placeholder string used for other fields.
        self.KeyData2.fillna(" ", inplace=True)
        # Collapse the key columns of each row into one value via JoinStr
        # (defined elsewhere; presumably string concatenation - TODO confirm).
        self.KeyData2 = self.KeyData2.apply(JoinStr, axis=1)
        self.ClassifyResult = Series(zeros(self.IncomeData.shape[0]))  # initialise
        for i in range(self.KeyData2.shape[0]):
            try:
                # Candidate sub-category keywords for this row's
                # (bank, income type, main category, account name).
                TempData = list(
                    set(
                        list(
                            ClassifyRuleDF.ix[self.BankName]
                            .ix[self.IncomeType[i]]
                            .ix[self.KeyWord1[i]]
                            .ix[self.CountName[i]]
                            .index
                        )
                    )
                )
                if len(TempData) == 1:  # only one candidate: take it unconditionally
                    self.KeyWord2[i] = TempData[0]
                else:
                    TempKeyWord = [
                        m.split("+") for m in TempData
                    ]  # split each rule keyword on '+', e.g. [['A'], ['B'], ['BEPS'], ['BEPS', 'C']]
                    TempKeyWord.sort(
                        key=lambda x: len(x), reverse=True
                    )  # sort by keyword count, most keywords first, e.g. [['BEPS', 'C'], ['A'], ['B'], ['BEPS']]
                    bFindResult = False
                    for j in TempKeyWord:  # e.g. j = ['BEPS', 'C']
                        bFindResult2 = True
                        for k in j:  # e.g. k = 'BEPS'
                            # Disabled variant: if k not in str(list(self.KeyData2.ix[i])):
                            # As soon as one keyword is missing, give up on this keyword group.
                            if k not in self.KeyData2.ix[i]:
                                bFindResult2 = False
                                break
                        if bFindResult2:  # every keyword in the group matched: success
                            self.KeyWord2[i] = "+".join(j)  # re-join with '+' to recover the rule-table keyword
                            bFindResult = True
                            break
                    if not bFindResult:
                        self.KeyWord2[i] = "无"
                if self.KeyWord2[i] in TempData:
                    self.ClassifyResult[i] = (
                        ClassifyRuleDF.ix[self.BankName]
                        .ix[self.IncomeType[i]]
                        .ix[self.KeyWord1[i]]
                        .ix[self.CountName[i]]
                        .ix[self.KeyWord2[i]]
                    )
                    if type(self.ClassifyResult[i]) != str:  # several results came back: take the first
                        self.ClassifyResult[i] = self.ClassifyResult[i].ix[0]
                else:
                    self.ClassifyResult[i] = "分类错误"
            except (Exception) as e:
                # Any failure for this row is printed and the row is marked
                # with the classification-error label; processing continues.
                print(e, ", 分类错误")
                self.ClassifyResult[i] = "分类错误"

        # ==============================================================================
        # Disabled code below (string literals kept verbatim, they are column names).
        #         # Produce the daily classified summary
        #         self.Summary = self.IncomeData.copy()
        #         self.Summary.index = [self.Date,self.IncomeType,self.ClassifyResult]
        #         TempIndex = set(list(self.Summary.index)) # merge items whose date, income type and classification result all match
        #         TempIncome = array([self.Summary.ix[i].sum() for i in TempIndex])
        #         self.Summary = DataFrame(TempIncome, columns  = ['原币收入'])
        #         TempIncomeLocal = TempIncome
        #         if self.Currency == 'USD':
        #             TempIncomeLocal = TempIncome * self.ERateUSD
        #         elif self.Currency == 'JPY':
        #             TempIncomeLocal *= TempIncome * self.ERateJPY
        #         self.Summary['本币收入'] = TempIncomeLocal
        #         self.Summary['银行名称'] = self.BankName
        #         self.Summary['账户类型'] = self.CountType
        #         self.Summary['币种'] = self.Currency
        #         TempIndex = list(zip(*list(TempIndex)))
        #         self.Summary['交易日期'] = TempIndex[0]
        #         self.Summary['收支类型'] = TempIndex[1]
        #         self.Summary['分类结果'] = TempIndex[2]
        #
        # ==============================================================================
        # Result output: one row per input row, using the classified values.
        self.ResultDF = concat(
            [
                self.Date,
                self.Time,
                self.IncomeData,
                self.IncomeDataLocal,
                self.IncomeType,
                self.KeyWord1,
                self.CountName,
                self.KeyWord2,
                self.ClassifyResult,
            ],
            axis=1,
        )
        self.ResultDF.columns = ["交易日期", "交易时间", "收入", "本币收入", "收支类型", "大类", "对方户名", "子类", "分类结果"]
        self.ResultDF["银行名称"] = self.BankName
        self.ResultDF["账户类型"] = self.CountType
        self.ResultDF["币种"] = self.Currency

        # Second result table: same rows, but with the raw account text
        # (CountData) and raw key text (KeyData2) instead of the matched values.
        self.ResultDF2 = concat(
            [
                self.Date,
                self.Time,
                self.IncomeType,
                self.IncomeData,
                self.IncomeDataLocal,
                self.KeyWord1,
                self.CountData,
                self.KeyData2,
                self.ClassifyResult,
            ],
            axis=1,
        )
        self.ResultDF2.columns = ["交易日期", "交易时间", "收支类型", "交易原币金额", "交易本币金额", "大类", "对方户名", "子类", "分类结果"]
        self.ResultDF2["银行名称"] = self.FileNameHead
        self.ResultDF2["汇率"] = self.ERate
        self.ResultDF2["币种"] = self.Currency
        # Fix the output column order.
        self.ResultDF2 = self.ResultDF2.reindex(
            columns=["银行名称", "交易日期", "交易时间", "收支类型", "币种", "交易原币金额", "汇率", "交易本币金额", "大类", "子类", "分类结果", "对方户名"]
        )

        # Daily balance summary.
        # Load the balance column.
        self.BalanceData = self.RawData[self.BalanceLable]
        # Build a two-row frame (date, balance) and transpose it into two columns.
        self.BalanceData = DataFrame([self.Date, self.BalanceData], index=["交易日期", "余额"]).T
        self.BalanceData = self.BalanceData.groupby(["交易日期"]).last()  # keep the last row's balance for each date
        # RemoveComma is defined elsewhere; per the original note it strips commas from each element.
        self.BalanceData = self.BalanceData.applymap(RemoveComma)
        self.BalanceData = self.BalanceData.astype(float)
        # Convert the balance to local currency.
        self.BalanceDataLocal = self.BalanceData * self.ERate
        self.BalanceData["本币余额"] = self.BalanceDataLocal
        self.BalanceData["银行名称"] = self.BankName
        self.BalanceData["账户类型"] = self.CountType
        self.BalanceData["币种"] = self.Currency
        # After the groupby the date is the index; copy it back into a regular column.
        self.BalanceData["交易日期"] = self.BalanceData.index
        self.BalanceData = self.BalanceData.reindex(columns=["交易日期", "余额", "本币余额", "银行名称", "账户类型", "币种"])
        # Re-index by ReturnDate(date string); ReturnDate is defined elsewhere
        # (presumably parses the string into a date - TODO confirm).
        self.BalanceData.index = self.BalanceData["交易日期"].map(ReturnDate)
def main():
    """Walk through basic pandas Series/DataFrame operations, printing each result.

    Sections: reindexing, dropping entries, indexing/slicing, arithmetic with
    index alignment, broadcasting, apply/applymap/map, sorting, ranking, and
    indexes with duplicate labels. Written for Python 2 (print statements) and
    an old pandas API (``.ix``, ``order``, ``icol``, ``irow``).
    """
    # reindex: build a Series with labels d, c, b, a and re-order / extend them
    obj = Series(range(4), index="a b c d".split(" ")[::-1])
    print obj

    # label "e" does not exist in obj, so it appears as NaN
    obj2 = obj.reindex("a b c d e".split(" "))
    print obj2

    # Use fill_value to put 0 in place of NaN for missing labels
    print obj.reindex("a b c d e".split(" "), fill_value=0)
    colors = ["blue", "purple", "yellow"]
    index = [0, 2, 4]
    obj3 = Series(colors, index=index)
    print obj3.reindex(range(6))
    print obj3.reindex(range(6), method="ffill")  # forward fill for labels that are not found
    print obj3.reindex(range(6), method="backfill")  # backward fill (bfill)

    # DataFrame
    states = ["Ohio", "Texas", "California"]
    frame = DataFrame(np.arange(9).reshape((3, 3)), index="a b c".split(" "), columns=["Ohio", "Texas", "California"])
    print frame
    frame2 = frame.reindex("a b c d".split(" "))
    print frame2
    states[0] = "Utah"
    # swap the first two entries of states
    states[1], states[0] = states[:2]
    print frame.reindex(columns=states)
    # reindex rows (with forward fill) and columns in one call
    print frame.reindex("a b c d".split(" "), method="ffill", columns=states)
    print frame.ix["a b c d".split(" ")]
    print frame.ix["a b c d".split(" "), states]

    # Drop an entry: drop returns a new object, obj is printed afterwards for comparison
    print "", ""
    obj = Series(range(5), index="a b c d e".split(" "))
    new_obj = obj.drop("c")
    print new_obj
    print obj

    # Index reference
    print "", ""
    obj = Series(np.arange(4.0), index="a b c d".split(" "))
    print obj["b"]
    print obj[1]  # same element as obj["b"], selected by position
    print obj[2:4]
    print obj[["b", "a", "c"]]
    print obj[[1, 3]]
    print obj[obj < 2]
    # Slice with label
    print obj["b":"c"]  # label slices include the end label 'c'
    obj["b":"c"] = 5
    print obj

    data = DataFrame(
        np.arange(16).reshape((4, 4)),
        index=["Ohio", "Colorado", "Utah", "New York"],
        columns=["one", "two", "three", "four"],
    )
    print data
    # column
    print data["two"]
    print data[["three", "one"]]
    # row
    print data[:2]
    print data[data["three"] > 5]
    # all values: element-wise comparison, then assignment through the boolean mask
    print data < 5
    data[data < 5] = 0
    print data
    # row and column
    print data.ix[["Colorado"], ["two", "three"]]
    print data.ix[["Colorado", "Utah"], [3, 0, 1]]
    # row
    print data.ix[2]
    # label row and column, return column
    print data.ix[:"Utah", "two"]
    # xs
    # row
    print data.xs("Utah")
    print data.xs("Utah", axis=0)
    # column (axis=1)
    print data.xs("two", axis=1)
    # icol/irow select a column/row by integer position
    print data.icol(1)
    print data.irow(1)

    # Arithmetic aligns on the union of the indexes
    print "", ""
    s1 = Series([7.3, -2.5, 3.4, 1.5], index=["a", "c", "d", "e"])
    s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index=["a", "c", "e", "f", "g"])
    print s1
    print s2
    # index is union, but d, f, g are NaN
    print s1 + s2
    df1 = DataFrame(np.arange(9.0).reshape((3, 3)), columns=list("bcd"), index=["Ohio", "Texas", "Colorado"])
    df2 = DataFrame(np.arange(12.0).reshape((4, 3)), columns=list("bde"), index=["Utah", "Ohio", "Texas", "Oregon"])
    print df1
    print df2
    print df1 + df2

    # arithmetic method
    print "", ""
    df1 = DataFrame(np.arange(12.0).reshape((3, 4)), columns=list("abcd"))
    df2 = DataFrame(np.arange(20.0).reshape((4, 5)), columns=list("abcde"))
    print df1
    print df2
    print df1.add(df2, fill_value=0)
    # reindex has fill_value argument
    # other arithmetic methods are sub/div/mul

    # Calculation between a DataFrame and a Series
    print "", ""
    # subtract the first row from each row (broadcasting)
    arr = np.arange(12.0).reshape((3, 4))
    print arr
    print arr[0]
    print arr - arr[0]
    frame = DataFrame(np.arange(12.0).reshape((4, 3)), columns=list("bde"), index=["Utah", "Ohio", "Texas", "Oregon"])
    series = frame.ix[0]
    print frame
    print series
    print frame - series

    # series2 has labels b, e, f while frame has columns b, d, e
    series2 = Series(range(3), index=list("bef"))
    print frame + series2

    series3 = frame["d"]
    series4 = frame.ix[0]
    print frame
    print series3
    print series4
    print frame.sub(series3, axis=0)
    print frame.sub(series4, axis=1)

    # apply function and mapping
    print "", ""
    frame = DataFrame(np.arange(12.0).reshape((4, 3)), columns=list("bde"), index=["Utah", "Ohio", "Texas", "Oregon"])
    print frame
    f = lambda x: x.max() - x.min()
    print frame.apply(f)
    print frame.apply(f, axis=1)

    # the applied function may also return a Series (here: min and max)
    f = lambda x: Series([x.min(), x.max()], index=["min", "max"])
    print frame.apply(f)

    # NOTE: this local name shadows the builtin `format` inside main
    format = lambda x: "{0:.2f}".format(x)
    print frame.applymap(format)  # element-wise on a DataFrame
    print frame["e"].map(format)  # element-wise on a Series

    # sort and rank
    print "", ""
    obj = Series(range(4), index=list("dabc"))
    print obj
    print obj.sort_index()

    frame = DataFrame(np.arange(8).reshape((2, 4)), index=["three", "one"], columns=list("dabc"))
    print frame
    print frame.sort_index()
    print frame.sort_index(axis=1)
    print frame.sort_index(axis=1, ascending=False)

    # Sorting a Series by value
    print "", ""
    obj = Series([4, 7, -3, 2])
    print obj.order()
    obj = Series([4, np.nan, 7, np.nan, -3, 2])
    print obj.order()
    print obj.order(ascending=False)

    # order by multiple columns
    print "", ""
    frame = DataFrame({"b": [4, 7, -3, 2], "a": [0, 1, 0, 1]})
    print frame.sort_index(by=["a", "b"])

    # rank
    print "", ""
    obj = Series([7, -5, 7, 4, 2, 0, 4])
    print obj.rank()  # default method is average
    print obj.rank(method="first")  # ties get distinct ranks
    print obj.rank(ascending=False, method="min")
    print obj.rank(ascending=False, method="max")
    f1 = DataFrame(obj, columns=["data"])
    f2 = DataFrame(obj.rank(), columns=["rank"])
    # merge on the index of both frames
    print pd.merge(f1, f2, left_index=True, right_index=True)

    # Index of the axis with duplicate values
    print "", ""
    obj = Series(range(5), index=list("aaabc"))
    print obj
    print obj.index.is_unique
    print obj["a"]
    print obj["c"]

    # duplicate labels on both the rows ("a", "b") and the columns ("c")
    df = DataFrame(np.arange(12.0).reshape((4, 3)), index=list("aabb"), columns=list("ccd"))
    print df
    print df.ix["b"]
    print df["c"]
Example #22
0
# Broadcasting examples: subtract the first row of a 3x4 array from every row.
arr = np.arange(12.0).reshape((3, 4))
test_arr = arr - arr[0]

# Same idea with pandas: a DataFrame combined with one of its own rows.
frame = DataFrame(np.arange(12.0).reshape((4, 3)), columns=list("bde"), index=["Utah", "Ohio", "Texas", "Oregon"])
series = frame.ix[0]
bcast_test1 = frame - series
bcast_test2 = series - frame

# apply: kind of like R.
f = lambda x: x.max() - x.min()
# NOTE: `frame` is rebound here to a new frame of random values.
frame = DataFrame(np.random.randn(4, 3), columns=list("bde"), index=["Utah", "Ohio", "Texas", "Oregon"])
# try frame.apply(f), frame.apply(f, axis=1)
formatter = lambda x: "%.2f" % x
# The result is not assigned, so this line only matters when run interactively.
frame.applymap(formatter)

# Check out assignment: `frametest` is another name for the same object as
# `frame` (no copy is made), and the next line assigns a column through it.
frametest = frame
frametest["e"] = frametest["e"].map(formatter)
# now look at frame.

# Basic statistics over frames

df = DataFrame(
    [[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]], index=["a", "b", "c", "d"], columns=["one", "two"]
)
# df.sum(), df.sum(axis=1)
# Row-wise means, first with the default NaN handling, then with skipna=False.
df.mean(axis=1)
df.mean(axis=1, skipna=False)
df.describe()  # summary statistics