Example #1
    def test_join_multiindex(self):
        index1 = MultiIndex.from_arrays([["a", "a", "a", "b", "b", "b"], [1, 2, 3, 1, 2, 3]], names=["first", "second"])

        index2 = MultiIndex.from_arrays([["b", "b", "b", "c", "c", "c"], [1, 2, 3, 1, 2, 3]], names=["first", "second"])

        df1 = DataFrame(data=np.random.randn(6), index=index1, columns=["var X"])
        df2 = DataFrame(data=np.random.randn(6), index=index2, columns=["var Y"])

        df1 = df1.sortlevel(0)
        df2 = df2.sortlevel(0)

        joined = df1.join(df2, how="outer")
        ex_index = index1._tuple_index.union(index2._tuple_index)
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names
        assert_frame_equal(joined, expected)
        self.assertEqual(joined.index.names, index1.names)

        df1 = df1.sortlevel(1)
        df2 = df2.sortlevel(1)

        joined = df1.join(df2, how="outer").sortlevel(0)
        ex_index = index1._tuple_index.union(index2._tuple_index)
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names

        assert_frame_equal(joined, expected)
        self.assertEqual(joined.index.names, index1.names)
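A note on API drift: sortlevel() and the private _tuple_index attribute used above were removed in later pandas releases. A minimal modern sketch of the same outer join on overlapping MultiIndexes, assuming pandas >= 1.0:

import numpy as np
import pandas as pd

index1 = pd.MultiIndex.from_arrays(
    [["a", "a", "b", "b"], [1, 2, 1, 2]], names=["first", "second"])
index2 = pd.MultiIndex.from_arrays(
    [["b", "b", "c", "c"], [1, 2, 1, 2]], names=["first", "second"])
df1 = pd.DataFrame(np.random.randn(4), index=index1, columns=["var X"]).sort_index(level=0)
df2 = pd.DataFrame(np.random.randn(4), index=index2, columns=["var Y"]).sort_index(level=0)

# join aligns on the full MultiIndex; the outer union replaces the
# _tuple_index round-trip used in the test above.
joined = df1.join(df2, how="outer")
assert list(joined.index.names) == ["first", "second"]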
Example #2
    def test_empty_string(self):
        data = """\
One,Two,Three
a,1,one
b,2,two
,3,three
d,4,nan
e,5,five
nan,6,
g,7,seven
"""
        df = read_csv(StringIO(data))
        xp = DataFrame(
            {
                "One": ["a", "b", np.nan, "d", "e", np.nan, "g"],
                "Two": [1, 2, 3, 4, 5, 6, 7],
                "Three": ["one", "two", "three", np.nan, "five", np.nan, "seven"],
            }
        )
        assert_frame_equal(xp.reindex(columns=df.columns), df)

        df = read_csv(StringIO(data), na_values={"One": [], "Three": []})
        xp = DataFrame(
            {
                "One": ["a", "b", "", "d", "e", "nan", "g"],
                "Two": [1, 2, 3, 4, 5, 6, 7],
                "Three": ["one", "two", "three", "nan", "five", "", "seven"],
            }
        )
        assert_frame_equal(xp.reindex(columns=df.columns), df)
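In current pandas, the documented way to keep empty strings and literal "nan" as text, which the per-column na_values={} trick above accomplishes, is keep_default_na. A short sketch, assuming a recent pandas:

import pandas as pd
from io import StringIO

data = "One,Two\na,1\nnan,2\n,3\n"
# With the default NA sentinels disabled, nothing is converted to NaN.
df = pd.read_csv(StringIO(data), keep_default_na=False)
print(df["One"].tolist())  # ['a', 'nan', '']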
Example #3
def strategy_statistics(strategy_name, strategy_count):
    all_qr = QR.objects(strategy_name=strategy_name)
    if not all_qr:
        print "Wrong Strategy Name!"
        return

    trading_date = QR.objects().distinct("date")
    trading_date.sort()
    trading_date = trading_date[-strategy_count:]
    bt_result = {}
    for d in trading_date:
        bt_result[str(d.date())] = back_test_success(strategy_name, d)

    frame = DataFrame(bt_result)
    pd.set_option("display.width", 200)
    pd.set_option("display.max_rows", 2000)
    print(frame.reindex(
        [
            "count",
            "one_back_test",
            "one_yield_expectation",
            "three_back_test",
            "three_yield_expectation",
            "five_back_test",
            "five_yield_expectation",
        ]
    ).T)
    pd.set_option("display.width", None)
    pd.set_option("display.max_rows", None)
Example #4
    def test_sort_values(self):
        frame = DataFrame([[1, 1, 2], [3, 1, 0], [4, 5, 6]], index=[1, 2, 3], columns=list("ABC"))

        # by column (axis=0)
        sorted_df = frame.sort_values(by="A")
        indexer = frame["A"].argsort().values
        expected = frame.ix[frame.index[indexer]]
        assert_frame_equal(sorted_df, expected)

        sorted_df = frame.sort_values(by="A", ascending=False)
        indexer = indexer[::-1]
        expected = frame.ix[frame.index[indexer]]
        assert_frame_equal(sorted_df, expected)

        sorted_df = frame.sort_values(by="A", ascending=False)
        assert_frame_equal(sorted_df, expected)

        # GH4839
        sorted_df = frame.sort_values(by=["A"], ascending=[False])
        assert_frame_equal(sorted_df, expected)

        # multiple bys
        sorted_df = frame.sort_values(by=["B", "C"])
        expected = frame.loc[[2, 1, 3]]
        assert_frame_equal(sorted_df, expected)

        sorted_df = frame.sort_values(by=["B", "C"], ascending=False)
        assert_frame_equal(sorted_df, expected[::-1])

        sorted_df = frame.sort_values(by=["B", "A"], ascending=[True, False])
        assert_frame_equal(sorted_df, expected)

        self.assertRaises(ValueError, lambda: frame.sort_values(by=["A", "B"], axis=2, inplace=True))

        # by row (axis=1): GH 10806
        sorted_df = frame.sort_values(by=3, axis=1)
        expected = frame
        assert_frame_equal(sorted_df, expected)

        sorted_df = frame.sort_values(by=3, axis=1, ascending=False)
        expected = frame.reindex(columns=["C", "B", "A"])
        assert_frame_equal(sorted_df, expected)

        sorted_df = frame.sort_values(by=[1, 2], axis="columns")
        expected = frame.reindex(columns=["B", "A", "C"])
        assert_frame_equal(sorted_df, expected)

        sorted_df = frame.sort_values(by=[1, 3], axis=1, ascending=[True, False])
        assert_frame_equal(sorted_df, expected)

        sorted_df = frame.sort_values(by=[1, 3], axis=1, ascending=False)
        expected = frame.reindex(columns=["C", "B", "A"])
        assert_frame_equal(sorted_df, expected)

        msg = r"Length of ascending \(5\) != length of by \(2\)"
        with assertRaisesRegexp(ValueError, msg):
            frame.sort_values(by=["A", "B"], axis=0, ascending=[True] * 5)
Example #5
    def test_reindex_boolean(self):
        frame = DataFrame(np.ones((10, 2), dtype=bool), index=np.arange(0, 20, 2), columns=[0, 2])

        reindexed = frame.reindex(np.arange(10))
        self.assertEqual(reindexed.values.dtype, np.object_)
        self.assertTrue(isnull(reindexed[0][1]))

        reindexed = frame.reindex(columns=lrange(3))
        self.assertEqual(reindexed.values.dtype, np.object_)
        self.assertTrue(isnull(reindexed[1]).all())
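What the test checks is the dtype upcast: reindexing introduces missing labels, NaN cannot live in a boolean block, so the values become object. A minimal sketch:

import numpy as np
import pandas as pd

frame = pd.DataFrame(np.ones((3, 1), dtype=bool), index=[0, 2, 4])
reindexed = frame.reindex(range(5))  # rows 1 and 3 are NaN now
print(reindexed.dtypes)              # object, no longer bool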
Example #6
    def test_reindex_multi(self):
        df = DataFrame(np.random.randn(3, 3))

        result = df.reindex(lrange(4), lrange(4))
        expected = df.reindex(lrange(4)).reindex(columns=lrange(4))

        assert_frame_equal(result, expected)

        df = DataFrame(np.random.randint(0, 10, (3, 3)))

        result = df.reindex(lrange(4), lrange(4))
        expected = df.reindex(lrange(4)).reindex(columns=lrange(4))

        assert_frame_equal(result, expected)

        df = DataFrame(np.random.randint(0, 10, (3, 3)))

        result = df.reindex(lrange(2), lrange(2))
        expected = df.reindex(lrange(2)).reindex(columns=lrange(2))

        assert_frame_equal(result, expected)

        df = DataFrame(np.random.randn(5, 3) + 1j, columns=["a", "b", "c"])

        result = df.reindex(index=[0, 1], columns=["a", "b"])
        expected = df.reindex([0, 1]).reindex(columns=["a", "b"])

        assert_frame_equal(result, expected)
Example #7
    def test_reindex_axes(self):
        # GH 3317, reindexing by both axes loses freq of the index
        df = DataFrame(
            np.ones((3, 3)),
            index=[datetime(2012, 1, 1), datetime(2012, 1, 2), datetime(2012, 1, 3)],
            columns=["a", "b", "c"],
        )
        time_freq = date_range("2012-01-01", "2012-01-03", freq="d")
        some_cols = ["a", "b"]

        index_freq = df.reindex(index=time_freq).index.freq
        both_freq = df.reindex(index=time_freq, columns=some_cols).index.freq
        seq_freq = df.reindex(index=time_freq).reindex(columns=some_cols).index.freq
        self.assertEqual(index_freq, both_freq)
        self.assertEqual(index_freq, seq_freq)
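A standalone version of the GH 3317 check, assuming a recent pandas: reindexing both axes in one call should keep the DatetimeIndex freq, just as reindexing the rows alone does.

import numpy as np
import pandas as pd

idx = pd.date_range("2012-01-01", periods=3, freq="D")
df = pd.DataFrame(np.ones((3, 2)), index=idx, columns=["a", "b"])
# freq survives a combined index+columns reindex
assert df.reindex(index=idx, columns=["a"]).index.freq == idx.freq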
Example #8
def import_training_data(fname=None, verbose=False):
    count = 0
    result = DataFrame({"text": [], "class": [], "rumor": [], "event": [], "features": []})
    for event in rumor_terms.event_rumor_map:
        for rumor in rumor_terms.event_rumor_map[event]:
            if verbose:
                print "processing data from %s, %s" % (event, rumor)
            pos_examples = [
                x
                for x in client["code_comparison"][rumor].find(
                    {"first_final": {"$in": ["Affirm", "Deny", "Neutral"]}, "second_final": "Uncertainty"}
                )
            ]
            neg_examples = [
                x
                for x in client["code_comparison"][rumor].find(
                    {"first_final": {"$in": ["Affirm", "Deny", "Neutral"]}, "second_final": {"$ne": "Uncertainty"}}
                )
            ]
            examples = pos_examples
            examples += random.sample(neg_examples, len(pos_examples))
            for tweet in examples:
                if tweet["text"]:
                    # full_tweet = get_tweet_meta_data(tweet,event,rumor)
                    features = {}
                    # if full_tweet:
                    #    features['has_mention'] = find_mention(full_tweet['text'])
                    # else:
                    #    features['has_mention'] = False
                    if "?" in tweet["text"]:
                        features["is_question"] = True
                    else:
                        features["is_question"] = False
                    text = process_tweet(tweet, event, rumor)
                    if "Uncertainty" in tweet["second_final"]:
                        classification = 1
                    else:
                        classification = 0
                    result = result.append(
                        DataFrame(
                            {
                                "text": text,
                                "class": classification,
                                "rumor": rumor,
                                "event": event,
                                "features": json.dumps(features),
                            },
                            index=[count],
                        )
                    )
                    count += 1
    result = result.reindex(numpy.random.permutation(result.index))

    if fname:
        fpath = os.path.join(os.path.dirname(__file__), os.pardir, "dicts/") + fname
        f = open(fpath, "w")
        pickle.dump(result, f)
    if verbose:
        print(result)
    return result
Example #9
def readDatasetIntoDataFrame():

    # Open file
    f = open("SpamHamDataset.txt", "r")

    # New DataFrame with two columns
    df = DataFrame(columns=("label", "text"))

    count = 0
    for line in f:
        tokens = line.split()
        flag = tokens[0]  # The first word of each row is the label.
        text = ""

        # Concatenate all tokens, except the label, to get the content of the message itself.
        for x in range(1, len(tokens)):
            text = text + tokens[x] + " "

        # Encode the label: 1 for spam, 0 for ham. (Computing this outside
        # the loop also avoids a NameError on label-only lines.)
        sig = 1 if flag == "spam" else 0
        # print(flag, "---", text)
        df.loc[count] = [sig, text]
        count = count + 1

    # Housekeeping: close the file and shuffle the rows.
    f.close()
    df = df.reindex(random.permutation(df.index))

    return df
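The reindex(permutation(index)) idiom shuffles the rows; since DataFrame.sample was added, the same shuffle is usually written directly. A sketch with a hypothetical toy frame:

import pandas as pd

df = pd.DataFrame({"label": [1, 0, 0],
                   "text": ["win cash now", "hi mom", "see you"]})
# frac=1 samples every row, i.e. a full shuffle
df = df.sample(frac=1, random_state=0).reset_index(drop=True)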
Example #10
def plot_scores(scores, title, x_label, classifier_names):
    """ Make a barplot of the scores of some performance measure.

        Parameters
        ----------
        scores : dict
            Where the keys are the classifier names and the values are the scores.

        title : str
            Title of the plot.

        x_label : str
            Label for the x-axis.

        classifier_names : array
            List of the names of the classifiers, the order of which will be used
            to order the bars.
    """

    scores = DataFrame(scores, index=[x_label])
    scores = scores.reindex(columns=classifier_names)

    format_as_percent_plot = lambda x, pos: "{:.0f}%".format(x * 100)
    fig, ax = plt.subplots(figsize=(9, 5))
    scores.plot(ax=ax, kind="bar", title=title, fontsize=12)
    ax.legend(bbox_to_anchor=(1.5, 0.6))
    ax.set_xticklabels([], rotation=0)
    ax.get_yaxis().set_major_formatter(FuncFormatter(format_as_percent_plot))

    plt.show()
Example #11
def pickle_from_db(event_list, fname, verbose=False):
    for event in event_list:
        result = DataFrame({"text": [], "event": [], "features": [], "unique_id": [], "raw_text": []})
        count = 0
        if verbose:
            print "processing data from %s" % (event)
        examples = client[insert_db][event].find()
        for tweet in examples:
            if verbose and count % 1000 == 0 and count != 0:
                print "processed %s tweets" % count
            if tweet["text"]:
                result = result.append(
                    DataFrame(
                        {
                            "text": tweet["text"],
                            "event": event,
                            "features": json.dumps(tweet["features"]),
                            "unique_id": tweet["unique_id"],
                            "raw_text": tweet["raw_text"],
                        },
                        index=[count],
                    )
                )
                count += 1
                if count == 50:
                    break
        result = result.reindex(numpy.random.permutation(result.index))

        fpath = os.path.join(os.path.dirname(__file__), os.pardir, "dicts/") + event + "_" + fname
        f = open(fpath, "w")
        pickle.dump(result, f)
        f.close()
        if verbose:
            print(result)
            print("dumped %s tweets" % len(result))
Example #12
    def test_reindex_name_remains(self):
        s = Series(random.rand(10))
        df = DataFrame(s, index=np.arange(len(s)))
        i = Series(np.arange(10), name="iname")

        df = df.reindex(i)
        self.assertEqual(df.index.name, "iname")

        df = df.reindex(Index(np.arange(10), name="tmpname"))
        self.assertEqual(df.index.name, "tmpname")

        s = Series(random.rand(10))
        df = DataFrame(s.T, index=np.arange(len(s)))
        i = Series(np.arange(10), name="iname")
        df = df.reindex(columns=i)
        self.assertEqual(df.columns.name, "iname")
Example #13
    def test_reindex_with_nans(self):
        df = DataFrame(
            [[1, 2], [3, 4], [np.nan, np.nan], [7, 8], [9, 10]],
            columns=["a", "b"],
            index=[100.0, 101.0, np.nan, 102.0, 103.0],
        )

        result = df.reindex(index=[101.0, 102.0, 103.0])
        expected = df.iloc[[1, 3, 4]]
        assert_frame_equal(result, expected)

        result = df.reindex(index=[103.0])
        expected = df.iloc[[4]]
        assert_frame_equal(result, expected)

        result = df.reindex(index=[101.0])
        expected = df.iloc[[1]]
        assert_frame_equal(result, expected)
Example #14
    def test_reindex_frame_add_nat(self):
        rng = date_range("1/1/2000 00:00:00", periods=10, freq="10s")
        df = DataFrame({"A": np.random.randn(len(rng)), "B": rng})

        result = df.reindex(range(15))
        self.assert_(np.issubdtype(result["B"].dtype, np.datetime64))

        mask = com.isnull(result)["B"]
        self.assert_(mask[-5:].all())
        self.assert_(not mask[:-5].any())
Example #15
    def testWithXEffects(self):
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = ols(y=self.panel_y2, x=self.panel_x2, x_effects=['x1'])

        assert_almost_equal(result._y.values.flat, [1, 4, 5])

        res = result._x
        exp_x = DataFrame([[0., 0., 14., 1.], [0, 1, 17, 1], [1, 0, 48, 1]],
                          columns=['x1_30', 'x1_9', 'x2', 'intercept'],
                          index=res.index, dtype=float)
        assert_frame_equal(res, exp_x.reindex(columns=res.columns))
Example #16
    def test_reindex_fill_value(self):
        df = DataFrame(np.random.randn(10, 4))

        # axis=0
        result = df.reindex(lrange(15))
        self.assertTrue(np.isnan(result.values[-5:]).all())

        result = df.reindex(lrange(15), fill_value=0)
        expected = df.reindex(lrange(15)).fillna(0)
        assert_frame_equal(result, expected)

        # axis=1
        result = df.reindex(columns=lrange(5), fill_value=0.0)
        expected = df.copy()
        expected[4] = 0.0
        assert_frame_equal(result, expected)

        result = df.reindex(columns=lrange(5), fill_value=0)
        expected = df.copy()
        expected[4] = 0
        assert_frame_equal(result, expected)

        result = df.reindex(columns=lrange(5), fill_value="foo")
        expected = df.copy()
        expected[4] = "foo"
        assert_frame_equal(result, expected)

        # reindex_axis
        result = df.reindex_axis(lrange(15), fill_value=0.0, axis=0)
        expected = df.reindex(lrange(15)).fillna(0)
        assert_frame_equal(result, expected)

        result = df.reindex_axis(lrange(5), fill_value=0.0, axis=1)
        expected = df.reindex(columns=lrange(5)).fillna(0)
        assert_frame_equal(result, expected)

        # other dtypes
        df["foo"] = "foo"
        result = df.reindex(lrange(15), fill_value=0)
        expected = df.reindex(lrange(15)).fillna(0)
        assert_frame_equal(result, expected)
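reindex_axis, exercised at the end of this test, was deprecated and later removed; assuming a current pandas, the axis keyword on reindex covers both calls:

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(3, 2))
rows = df.reindex(range(5), axis=0, fill_value=0.0)   # ex reindex_axis(..., axis=0)
cols = df.reindex(range(4), axis=1, fill_value=0.0)   # ex reindex_axis(..., axis=1)
assert rows.equals(df.reindex(index=range(5), fill_value=0.0))
assert cols.equals(df.reindex(columns=range(4), fill_value=0.0))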
Example #17
    def train(self, input_type, query_classes):
        log("********************** " + input_type + " **********************")
        current_dir = os.path.abspath(os.path.dirname(__file__))
        # If there are zero or one possible outcomes for the input type,
        # there is no need to train any classifier.
        if len(query_classes) <= 1:
            return DummyClassifier(list(query_classes.keys())[0])

        # Build DataFrame by going through all data files.
        data = DataFrame({"text": [], "class": []})
        for query_class_name in query_classes:
            path = current_dir + "/../data/" + query_class_name + ".txt"
            log("Opening " + path)
            lines = [line.rstrip("\n") for line in open(path)]
            rows = []
            index = []
            for text in lines:
                if text in index:
                    log("duplicate in " + path + ": " + text)
                    exit(1)
                rows.append({"text": text, "class": query_class_name})
                index.append(text)
            data = data.append(DataFrame(rows, index))

        # Build the pipeline.
        pipeline = Pipeline(
            [
                ("count_vectorizer", CountVectorizer(ngram_range=(1, 2))),
                # ("classifier", PassiveAggressiveClassifier()),
                ("classifier", LinearSVC()),
            ]
        )
        # Train and k-fold cross-validate. Introduce randomness.
        data = data.reindex(numpy.random.permutation(data.index))
        k_fold = KFold(n=len(data), n_folds=6)
        scores = []
        for train_indices, test_indices in k_fold:
            train_text = data.iloc[train_indices]["text"].values
            train_y = data.iloc[train_indices]["class"].values.astype(str)
            test_text = data.iloc[test_indices]["text"].values
            test_y = data.iloc[test_indices]["class"].values.astype(str)
            pipeline.fit(train_text, train_y)
            predictions = pipeline.predict(test_text)
            score = f1_score(test_y, predictions, pos_label=None if len(query_classes) == 2 else 1, average="weighted")
            scores.append(score)
        log("Total documents classified: " + str(len(data)))
        log("Score: " + str(sum(scores) / len(scores)))
        # Save the classifier.
        if not os.path.exists(current_dir + "/../models"):
            os.makedirs(current_dir + "/../models")
        with open(current_dir + "/../models/dumped_classifier_" + input_type + ".pkl", "wb") as fid:
            log("Saving model for " + input_type)
            cPickle.dump(pipeline, fid)
        return pipeline
Example #18
    def testWithXEffectsAndDroppedDummies(self):
        result = ols(y=self.panel_y2, x=self.panel_x2, x_effects=["x1"], dropped_dummies={"x1": 30})

        res = result._x
        assert_almost_equal(result._y.values.flat, [1, 4, 5])
        exp_x = DataFrame(
            [[1.0, 0.0, 14.0, 1.0], [0, 1, 17, 1], [0, 0, 48, 1]],
            columns=["x1_6", "x1_9", "x2", "intercept"],
            index=res.index,
            dtype=float,
        )

        assert_frame_equal(res, exp_x.reindex(columns=res.columns))
Example #19
    def testWithXEffects(self):
        result = ols(y=self.panel_y2, x=self.panel_x2, x_effects=["x1"])

        assert_almost_equal(result._y.values.flat, [1, 4, 5])

        res = result._x
        exp_x = DataFrame(
            [[0, 0, 14, 1], [0, 1, 17, 1], [1, 0, 48, 1]],
            columns=["x1_30", "x1_9", "x2", "intercept"],
            index=res.index,
            dtype=float,
        )
        assert_frame_equal(res, exp_x.reindex(columns=res.columns))
Example #20
    def test_reindex_columns_method(self):

        # GH 14992, reindexing over columns ignored method
        df = DataFrame(data=[[11, 12, 13], [21, 22, 23], [31, 32, 33]], index=[1, 2, 4], columns=[1, 2, 4], dtype=float)

        # default method
        result = df.reindex(columns=range(6))
        expected = DataFrame(
            data=[
                [np.nan, 11, 12, np.nan, 13, np.nan],
                [np.nan, 21, 22, np.nan, 23, np.nan],
                [np.nan, 31, 32, np.nan, 33, np.nan],
            ],
            index=[1, 2, 4],
            columns=range(6),
            dtype=float,
        )
        assert_frame_equal(result, expected)

        # method='ffill'
        result = df.reindex(columns=range(6), method="ffill")
        expected = DataFrame(
            data=[[np.nan, 11, 12, 12, 13, 13], [np.nan, 21, 22, 22, 23, 23], [np.nan, 31, 32, 32, 33, 33]],
            index=[1, 2, 4],
            columns=range(6),
            dtype=float,
        )
        assert_frame_equal(result, expected)

        # method='bfill'
        result = df.reindex(columns=range(6), method="bfill")
        expected = DataFrame(
            data=[[11, 11, 12, 13, 13, np.nan], [21, 21, 22, 23, 23, np.nan], [31, 31, 32, 33, 33, np.nan]],
            index=[1, 2, 4],
            columns=range(6),
            dtype=float,
        )
        assert_frame_equal(result, expected)
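The same columns-aware fill machinery also accepts method="nearest" on a monotonic numeric column index; a quick hedged sketch, assuming pandas >= 0.21:

import pandas as pd

df = pd.DataFrame([[11, 12, 13]], columns=[1, 2, 4], dtype=float)
print(df.reindex(columns=range(6), method="nearest"))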
Example #21
    def create_dataframe(self):
        """
        Formats data into a dataframe
        """
        data_dict = dict()
        index = []
        for row in self.data:
            if row.desc != "root":
                index.append(row.desc)
                data_dict[row.desc] = row.vals

        df = DataFrame(data_dict).T
        df = df.reindex(index)
        return df
Example #22
    def testWithXEffectsAndDroppedDummies(self):
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = ols(y=self.panel_y2, x=self.panel_x2, x_effects=["x1"], dropped_dummies={"x1": 30})

        res = result._x
        # .flat is flatiter instance
        assert_almost_equal(result._y.values.flat, [1, 4, 5], check_dtype=False)
        exp_x = DataFrame(
            [[1.0, 0.0, 14.0, 1.0], [0, 1, 17, 1], [0, 0, 48, 1]],
            columns=["x1_6", "x1_9", "x2", "intercept"],
            index=res.index,
            dtype=float,
        )

        assert_frame_equal(res, exp_x.reindex(columns=res.columns))
Example #23
    def _pandas_interp(self, data, indices):
        """The actual transformation, based on the following Stack Overflow
        answer: http://stackoverflow.com/a/10465162
        """
        new_index = np.arange(indices[-1] + 1)

        data_frame = DataFrame(data, index=indices)
        data_frame_reindexed = data_frame.reindex(new_index)
        data_interpol = data_frame_reindexed.apply(Series.interpolate)

        del new_index
        del data_frame
        del data_frame_reindexed

        return data_interpol
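The reindex-then-apply(Series.interpolate) recipe predates interpolate being exposed on DataFrame itself; under the same assumptions (integer sample positions, linear fill) a modern equivalent is:

import numpy as np
import pandas as pd

data, indices = [1.0, 4.0], [0, 3]
frame = pd.DataFrame(data, index=indices).reindex(np.arange(indices[-1] + 1))
print(frame.interpolate()[0].tolist())  # [1.0, 2.0, 3.0, 4.0]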
Example #24
    def get_results_dataframe(self, default=False, index_by_code=False):
        """
        Formats data into a dataframe
        """
        datas = self._compute()
        self._compute_uc()
        uc = self.uc
        dfs = dict()

        for scenario, dico in datas.items():
            data = dico["data"]
            data_default = dico["default"]

            data_dict = dict()
            index = []

            if default is True:
                data = data_default

            for row in data:
                if row.desc != "root":
                    if row.code == "revdisp":
                        revdisp = row.vals
                    if index_by_code is True:
                        index.append(row.code)
                        data_dict[row.code] = row.vals
                    else:
                        index.append(row.desc)
                        data_dict[row.desc] = row.vals

            df = DataFrame(data_dict).T

            df = df.reindex(index)
            df = df.rename(columns={0: scenario})
            nivvie = revdisp / uc[scenario]  # TODO: include savings !!
            df = concat([df, DataFrame({scenario: nivvie}, index=["nivvie"])])
            dfs[scenario] = df

        first = True

        for df in dfs.values():
            if first:
                df_final = df
                first = False
            else:
                df_final = concat([df_final, df], axis=1, join="inner")

        return df_final
Example #25
    def testWithXEffects(self):
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = ols(y=self.panel_y2, x=self.panel_x2, x_effects=["x1"])

        # .flat is flatiter instance
        assert_almost_equal(result._y.values.flat, [1, 4, 5], check_dtype=False)

        res = result._x
        exp_x = DataFrame(
            [[0.0, 0.0, 14.0, 1.0], [0, 1, 17, 1], [1, 0, 48, 1]],
            columns=["x1_30", "x1_9", "x2", "intercept"],
            index=res.index,
            dtype=float,
        )
        exp_x[["x1_30", "x1_9"]] = exp_x[["x1_30", "x1_9"]].astype(np.uint8)
        assert_frame_equal(res, exp_x.reindex(columns=res.columns))
Example #26
    def update(self, today=None):
        # Avoid the evaluated-once default argument: datetime.today() in the
        # signature would be frozen at import time.
        if today is None:
            today = datetime.today()
        df = self.data()
        final = False
        while not final:
            if "date" in df.columns:
                maxdate = max(df["date"])
            else:
                maxdate = max(df.index)
            final = self.isfinal(maxdate)
            if not final:
                print "%s is not final, stripping" % maxdate
                if "date" in df.columns:
                    df = df[df["date"] != maxdate]
                else:
                    df = df.reindex(df.index - [maxdate])
        print "maxdate = %s, today = %s" % (maxdate, today)
        newdf = DataFrame()
        if self.chunktype == "YEAR":
            for y in range(maxdate.year, today.year + 1):
                print "performing update for %d" % (y)
                updf = self._updateyear(y)
                print updf[-3:]
                newdf = newdf.append(updf, ignore_index="date" in df.columns)
        elif self.chunktype == "DAY":
            start = maxdate + timedelta(days=1)
            start = datetime(*(start.timetuple()[:6]))
            dr = DateRange(start, today)
            for d in dr:
                if d == datetime(2011, 12, 26) or d == datetime(2012, 1, 2):
                    continue
                print "performing update for %s" % (d)
                updf = self._updateday(d.date())
                print updf[-3:]
                newdf = newdf.append(updf, ignore_index="date" in df.columns)
        else:
            raise NameError("unknown chunktype " + self.chunktype)

        if "date" in df.columns:
            newdf = newdf[newdf["date"] > maxdate]
        else:
            print(newdf.index[-3:])
            newindex = [d for d in newdf.index if d > maxdate]
            print("fetched %d rows, %d rows more recent than maxdate = %s" % (len(newdf), len(newindex), maxdate))
            newdf = newdf.reindex(newindex)
        print("end of new data: %s" % newdf[-3:])
        self._cache = df.append(newdf, ignore_index="date" in df.columns)
Example #27
    def test_getitem_setitem_slice_integers(self):
        index = MultiIndex(levels=[[0, 1, 2], [0, 2]], labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]])

        frame = DataFrame(np.random.randn(len(index), 4), index=index, columns=["a", "b", "c", "d"])
        res = frame.ix[1:2]
        exp = frame.reindex(frame.index[2:])
        assert_frame_equal(res, exp)

        frame.ix[1:2] = 7
        self.assert_((frame.ix[1:2] == 7).values.all())

        series = Series(np.random.randn(len(index)), index=index)

        res = series.ix[1:2]
        exp = series.reindex(series.index[2:])
        assert_series_equal(res, exp)

        series.ix[1:2] = 7
        self.assert_((series.ix[1:2] == 7).values.all())
Example #28
    def get_results_dataframe(self, default=False, difference=False, index_by_code=False):
        """
        Formats data into a dataframe

        Parameters
        ----------
        default : boolean, default False
                  If True compute the default results
        difference : boolean, default False
                     If True compute the difference between actual and default results
        index_by_code : boolean, default False
                     Index the rows by code instead of by name for the different
                     elements of decomp_file

        Returns
        -------
        df : A DataFrame with computed data according to decomp_file
        """
        if self.data is None:
            self.compute(difference=difference)

        data = self.data
        data_default = self.data_default

        data_dict = dict()
        index = []

        if default is True:
            data = data_default

        for row in data:
            if row.desc != "root":
                if index_by_code is True:
                    index.append(row.code)
                    data_dict[row.code] = row.vals
                else:
                    index.append(row.desc)
                    data_dict[row.desc] = row.vals

        df = DataFrame(data_dict).T
        df = df.reindex(index)
        return df
Example #29
    def transform(self, X):
        """
        This will run the pyculiarity anomaly detection routine on all columns of a dataset. First it is coerced into a
        pandas DataFrame if it isn't already one, then if there is a specified timestamp index or index col, that is set
        as the index. Otherwise a naive integer is used.


        :param X:
        :return:
        """

        if not isinstance(X, DataFrame):
            X = DataFrame(X)

        if self.datetimestr_col is not None:
            X[self.datetimestr_col] = to_datetime(X[self.datetimestr_col])
            X.rename(columns={self.datetimestr_col: "_index"}, inplace=True)
        elif self.index_col is not None:
            X.rename(columns={self.index_col: "_index"}, inplace=True)
        else:
            X["_index"] = X.index.values

        for col in X.columns.values:
            if col is not "_index":
                df_col = X.reindex(columns=["_index", col])
                out = detect_ts(
                    df_col, max_anoms=self.max_anoms, alpha=self.alpha, direction=self.direction, only_last=None
                )
                X[col] = 0
                X.loc[X["_index"].isin(out["anoms"]["timestamp"].values), col] = 1

        if self.datetimestr_col is not None:
            X.rename(columns={"_index": self.datetimestr_col}, inplace=True)
        elif self.index_col is not None:
            X.rename(columns={"_index": self.index_col}, inplace=True)
        else:
            X.drop(labels=["_index"], inplace=True)

        return X
Example #30
def viz_dist_mat(df, new_index, show_img=True):
    """
    Re-order a triangular data frame.
    """
    from pandas import DataFrame

    sym_dist = df.values.T + df.values

    sym_df = DataFrame(sym_dist, index=df.index, columns=df.columns)

    reorder_df = sym_df.reindex(index=new_index, columns=new_index)

    # Now restore only the upper triangle

    upptri_df = DataFrame(reorder_df.values * (df.values != 0.0), index=new_index, columns=new_index)

    if show_img:
        import matplotlib.pyplot as p

        p.imshow(upptri_df.values, interpolation="nearest", cmap="binary")
        cbar = p.colorbar()
        cbar.set_label("Distance", fontsize=20)
        p.show()
    return upptri_df
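A tiny usage sketch for viz_dist_mat with a hypothetical 3x3 upper-triangular distance frame (show_img=False keeps matplotlib out of it):

from pandas import DataFrame

dist = DataFrame([[0.0, 1.0, 2.0],
                  [0.0, 0.0, 3.0],
                  [0.0, 0.0, 0.0]],
                 index=list("abc"), columns=list("abc"))
reordered = viz_dist_mat(dist, ["c", "a", "b"], show_img=False)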