Example #1
 def _setTopHit(self):
     # Only one hit
     if len(self.structuralHit) == 1:
         [(self.topHit, v)] = self.structuralHit.items()
         return
     # Check pairwise overlap between hits
     df = DataFrame(index=self.structuralHit, columns=self.structuralHit)
     for hitName1, hit1 in self.structuralHit.items():
         for hitName2, hit2 in self.structuralHit.items():
             if hitName1 == hitName2: continue
             if hit1.location.overlaps(hit2.location):
                 dif = hit1.location - hit2.location
                 if df.at[hitName1, hitName2] != dif:
                     df.at[hitName2, hitName1] = dif
     maxID1, maxID2 = df.max(axis=1).idxmax(), df.max(axis=0).idxmax()
     # If nothing overlaps, idxmax yields NaN (and NaN != NaN)
     if maxID1 != maxID1:
         print(df)
         return  # Interesting case: a blast hit overlaps a tracr, but the tracrs do not overlap each other
     # Otherwise keep the hit whose location is longer
     if len(self.structuralHit[maxID1].location) > len(
             self.structuralHit[maxID2].location):
         self.topHit = maxID1
     else:
         self.topHit = maxID2
Example #2
    def test_fillna_dict_series(self):
        df = DataFrame({
            'a': [nan, 1, 2, nan, nan],
            'b': [1, 2, 3, nan, nan],
            'c': [nan, 1, 2, 3, 4]
        })

        result = df.fillna({'a': 0, 'b': 5})

        expected = df.copy()
        expected['a'] = expected['a'].fillna(0)
        expected['b'] = expected['b'].fillna(5)
        assert_frame_equal(result, expected)

        # it works
        result = df.fillna({'a': 0, 'b': 5, 'd': 7})

        # Series treated same as dict
        result = df.fillna(df.max())
        expected = df.fillna(df.max().to_dict())
        assert_frame_equal(result, expected)

        # disable this for now
        with assertRaisesRegexp(NotImplementedError, 'column by column'):
            df.fillna(df.max(1), axis=1)
Example #3
    def test_fillna_dict_series(self):
        df = DataFrame({
            "a": [np.nan, 1, 2, np.nan, np.nan],
            "b": [1, 2, 3, np.nan, np.nan],
            "c": [np.nan, 1, 2, 3, 4],
        })

        result = df.fillna({"a": 0, "b": 5})

        expected = df.copy()
        expected["a"] = expected["a"].fillna(0)
        expected["b"] = expected["b"].fillna(5)
        tm.assert_frame_equal(result, expected)

        # it works
        result = df.fillna({"a": 0, "b": 5, "d": 7})

        # Series treated same as dict
        result = df.fillna(df.max())
        expected = df.fillna(df.max().to_dict())
        tm.assert_frame_equal(result, expected)

        # disable this for now
        with pytest.raises(NotImplementedError, match="column by column"):
            df.fillna(df.max(1), axis=1)
Example #4
    def ylim(zombies: DataFrame, humans: DataFrame) -> int:
        """
        Finds the limit for the y-axis of the plot, i.e. the max population
        of either zombies or humans throughout the whole simulation.

        :param zombies: zombie population counts over the simulation
        :param humans: human population counts over the simulation
        :return: the largest population value in either frame
        """
        max_zombies = zombies.max().max()
        max_humans = humans.max().max()
        return max(max_zombies, max_humans)
Example #5
    def test_min_max_dt64_api_consistency_with_NaT(self):
        # Calling the following sum functions returned an error for dataframes but
        # returned NaT for series. These tests check that the API is consistent in
        # min/max calls on empty Series/DataFrames. See GH:33704 for more
        # information
        df = DataFrame(dict(x=pd.to_datetime([])))
        expected_dt_series = Series(pd.to_datetime([]))
        # check axis 0
        assert (df.min(axis=0).x is pd.NaT) == (expected_dt_series.min() is pd.NaT)
        assert (df.max(axis=0).x is pd.NaT) == (expected_dt_series.max() is pd.NaT)

        # check axis 1
        tm.assert_series_equal(df.min(axis=1), expected_dt_series)
        tm.assert_series_equal(df.max(axis=1), expected_dt_series)
    def __init__(self, data: pd.DataFrame, features):
        self.data = data
        self.features = features

        mmin = data.min()
        mmax = data.max()
        feature_size = mmax - mmin

        margin = 0.4
        # margin = 1.4
        # margin = 0.0
        self.mins = data.min() - feature_size * margin
        self.maxs = data.max() + feature_size * margin
        self.limits = np.c_[self.mins, self.maxs]
        self.feature_size = self.maxs - self.mins
def calc_distance_matrix(G, max_distance=None):
    """Returns a matrix containing the shortest distance
    between all nodes in a network

    Parameters
    ----------
    G : graph
       A NetworkX graph

    max_distance : float or None, optional (default=None)
       The maximum possible distance value in the network.
       If None, max_distance is the longest shortest path between
       two nodes of the network (the graph diameter)

    Returns
    -------
    dist_matrix : pandas DataFrame
      An NxN distance matrix as a DataFrame.

    Notes
    -----
    Along the diagonal, the values are all 0.
    Unconnected nodes have a distance of max_distance to other nodes.
    """

    # Network (collaborator) distance
    # networkx >= 2 returns an iterator here, so materialize it as a dict first
    dist_matrix = dict(nx.all_pairs_shortest_path_length(G))
    dist_matrix = DataFrame(dist_matrix, index=G.nodes(), columns=G.nodes())
    if max_distance is None:
        max_distance = float(dist_matrix.max().max())
    # The unconnected ones are as far from the rest as the maximum distance allows
    dist_matrix = dist_matrix.fillna(max_distance)
    diag_idx = np.diag_indices(len(dist_matrix), ndim=2)
    dist_matrix.values[diag_idx] = 0
    return dist_matrix
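A minimal usage sketch for the function above (the graph here is illustrative and assumes networkx is imported as nx, as in the function body):

import networkx as nx

G = nx.path_graph(4)            # nodes 0-1-2-3 connected in a line
G.add_node(9)                   # an isolated node
dm = calc_distance_matrix(G)    # 5x5 DataFrame of shortest-path lengths
# the diagonal is 0, dm.loc[0, 3] == 3, and distances to the isolated
# node 9 are filled with the largest finite distance (3.0 here)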
Example #8
def normalize(df: DataFrame) -> DataFrame:
    """Normalizes the data"""

    ptid_col: DataFrame = get_del_ptid_col(df)
    df: DataFrame = (df - df.min(axis=0)) / (df.max(axis=0) - df.min(axis=0))
    df: DataFrame = concat([ptid_col, df], axis=1)
    return df
class LogAggregate:
    def __init__(self, dataset):
        self.dataset = DataFrame(dataset)

    def get_median(self, *arg, **kwarg):
        if 'group_by' in kwarg:
            return self.dataset.groupby(kwarg['group_by']).median()[kwarg['key']]
        else:
            return self.dataset.median()[kwarg['key']]

    def get_average(self, *arg, **kwarg):
        if 'group_by' in kwarg:
            return self.dataset.groupby(kwarg['group_by']).mean()[kwarg['key']]
        else:
            return self.dataset.mean()[kwarg['key']]

    def get_min(self, *arg, **kwarg):
        if 'group_by' in kwarg:
            return self.dataset.groupby(kwarg['group_by']).min()[kwarg['key']]
        else:
            return self.dataset.min()[kwarg['key']]

    def get_max(self, *arg, **kwarg):
        if 'group_by' in kwarg:
            return self.dataset.groupby(kwarg['group_by']).max()[kwarg['key']]
        else:
            return self.dataset.max()[kwarg['key']]

    def get_count(self, *arg, **kwarg):
        if 'group_by' in kwarg:
            return self.dataset.groupby(kwarg['group_by']).count()[kwarg['key']]
        else:
            return self.dataset.count()[kwarg['key']]
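A brief usage sketch for the class above (the log records and field names are invented for illustration):

logs = LogAggregate([
    {"service": "api", "latency": 120},
    {"service": "api", "latency": 80},
    {"service": "web", "latency": 200},
])
logs.get_median(group_by="service", key="latency")  # per-service median latency
logs.get_count(key="latency")                       # number of records with a latency value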
Example #10
    def _set_yaxis_limits(self, series: pandas.DataFrame):
        """ Sets the plotter's y-axis limits using the min/max of the series values """
        y_min = series.min()
        y_max = series.max()

        y_limits = (y_min, y_max)
        self.plotter.y_value_limits = y_limits
Example #11
def compute_confusion_matrix(target, predicted, normalize=True, sort=True):
    """ Returns a confusion matrix as a data frame with labels.
    Parameters:
        target (array): the true values.
        predicted (array): the predicted values.
        normalize (bool): if True, normalize each row to sum to 1.
        sort (bool): if True, sort rows and columns by their maximum value.
    Returns (DataFrame): df with the confusion matrix.
    """

    # Determine the unique values in the target list, sort them and use them as labels.
    labels = np.unique(list(target))
    labels.sort()

    # Compute the confusion matrix, place it into a data frame and normalize if desired.
    confusion = metrics.confusion_matrix(target, predicted, labels=labels)
    confusion = DataFrame(confusion, index=labels, columns=labels)
    if normalize:
        confusion = confusion.apply(lambda x: x / np.sum(x), axis=1)

    # If sort is true: find the max value of each row, then reorder the confusion matrix
    if sort:
        # get the max values, order them, and apply that order to both axes
        max_values = confusion.max(axis=1)
        max_values = max_values.sort_values(ascending=False)
        order = max_values.index
        confusion = confusion.loc[order, order]
    return confusion
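A small usage sketch (toy labels; assumes numpy, sklearn.metrics and DataFrame are imported as in the function above):

y_true = ["cat", "dog", "cat", "dog", "dog"]
y_pred = ["cat", "dog", "dog", "dog", "dog"]
cm = compute_confusion_matrix(y_true, y_pred, normalize=True, sort=False)
# rows are true labels and sum to 1.0; cm.loc["cat", "dog"] is the
# fraction of true cats that were predicted as dogs (0.5 here)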
Example #12
def scale(df: pd.DataFrame, method: str) -> pd.DataFrame:
    """
    scales features using different methods.

    Parameters
    ----------
    df: pandas.DataFrame
    method: {"autoscaling", "rescaling", "pareto"}
        Scaling method. `autoscaling` performs mean centering scaling of
        features to unitary variance. `rescaling` scales data to a 0-1 range.
        `pareto` performs mean centering and scaling using the square root of
        the standard deviation

    Returns
    -------
    scaled: pandas.DataFrame
    """
    if method == "autoscaling":
        scaled = (df - df.mean()) / df.std()
    elif method == "rescaling":
        scaled = (df - df.min()) / (df.max() - df.min())
    elif method == "pareto":
        scaled = (df - df.mean()) / df.std().apply(np.sqrt)
    else:
        msg = "Available methods are `autoscaling`, `rescaling` and `pareto`."
        raise ValueError(msg)
    # replace nans generated when dividing by zero
    scaled[scaled.isna()] = 0
    return scaled
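A quick sketch of the three scaling modes on toy data (assuming pandas as pd and the function above):

data = pd.DataFrame({"x": [1.0, 2.0, 3.0], "y": [10.0, 10.0, 10.0]})
scale(data, "rescaling")    # each column mapped to the 0-1 range
scale(data, "autoscaling")  # mean-centered, unit variance per column
scale(data, "pareto")       # mean-centered, scaled by sqrt of the standard deviation
# the constant column "y" divides by zero in every mode and is set to 0 by the NaN clean-up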
    def clean(numpy_array):  #load your csv data here in numpy_array
        data=ut.preprocessData(numpy_array)

        #print dataarray
        #print data

        ###### numpy into pandas dataframe
        df = pd.DataFrame(data)
        #print df
        #print df.dtypes

        df=df.astype('float16')
        #print df.dtypes


        ###### generate preprocessed csv file 
        #df.to_csv('preprocessed_data.csv', sep=',',index=False)

        ###### normalize data between [0,1] using X_norm= (X - Xmin)/ (Xmax - Xmin)
        df_norm= (df - df.min()) / (df.max()-df.min())
        df_norm=df_norm.fillna(-1)

        ##### generate normalized csv 
        #df_norm.to_csv('normalized_data.csv',sep=',', index=False)
        
        return df_norm.to_numpy()  # .as_matrix() was removed in recent pandas
Example #14
def min_max_scale_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Scales each column of the data frame by its column maximum, allowing for easier
    comparison of line shape on plots (values only land in the 0-1 range when the
    data are non-negative).
    :param df: data frame to be scaled
    :return: scaled dataframe
    """
    return df.div(df.max(), axis=1)
Example #15
def analyze(df: pd.DataFrame):
    """Compute some quick summary statistics for each column and return them as a DataFrame."""
    if isinstance(df, pd.DataFrame):
        df_result = pd.DataFrame(index=df.columns)
        df_result["dtype"] = df.dtypes
        df_result["null"] = df.isnull().sum()
        df_result["nunique"] = df.nunique()
        df_result["min"] = df.min()
        df_result["median"] = df.median()
        df_result["max"] = df.max()
        df_result["mode"] = df.mode().transpose()[0]
        df_result["mean"] = df.mean()
        df_result["std"] = df.std()
        # # To gauge how extreme the outliers are, look at the absolute values after RobustScaler.
        # numeric_columns = df.select_dtypes(include=np.number).columns
        # df_result["outlier_size"] = np.nan
        # df_result.loc[numeric_columns, "outlier_size"] = (
        #     tk.preprocessing.SafeRobustScaler(clip_range=None)
        #     .fit_transform(df.loc[:, numeric_columns])
        #     .fillna(0)
        #     .abs()
        #     .max()
        #     .round(decimals=1)
        # )
        return df_result
    else:
        raise NotImplementedError()
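For example, on a small numeric frame (the helper above expects mostly numeric columns for the median/mean/std statistics):

df_toy = pd.DataFrame({"a": [1.0, 2.0, 2.0, np.nan], "b": [0.5, 0.5, 1.0, 2.0]})
analyze(df_toy)  # one row per column: dtype, null, nunique, min, median, max, mode, mean, std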
Example #16
    def test_min_max_dt64_with_NaT_skipna_false(self, tz_naive_fixture):
        # GH#36907
        tz = tz_naive_fixture
        if isinstance(tz, tzlocal) and is_platform_windows():
            pytest.xfail(
                reason="GH#37659 OSError raised within tzlocal bc Windows "
                "chokes in times before 1970-01-01")

        df = DataFrame({
            "a": [
                Timestamp("2020-01-01 08:00:00", tz=tz),
                Timestamp("1920-02-01 09:00:00", tz=tz),
            ],
            "b": [Timestamp("2020-02-01 08:00:00", tz=tz), pd.NaT],
        })

        res = df.min(axis=1, skipna=False)
        expected = Series([df.loc[0, "a"], pd.NaT])
        assert expected.dtype == df["a"].dtype

        tm.assert_series_equal(res, expected)

        res = df.max(axis=1, skipna=False)
        expected = Series([df.loc[0, "b"], pd.NaT])
        assert expected.dtype == df["a"].dtype

        tm.assert_series_equal(res, expected)
    def draw_bar3D(cls, title: str, data: pd.DataFrame) -> Bar3D:
        """
        Draw a 3D bar chart from the contents of a DataFrame.
        :param title:           chart title
        :param data:            DataFrame holding all three axes:
                                index is the x-axis, columns the y-axis, values the z-axis
        :return:
        """
        data_list = []
        index_list = data.index.tolist()
        column_list = data.columns.tolist()

        # Get the DataFrame's minimum and maximum values
        min_data = data.min().min()
        max_data = data.max().max()
        # Iterate over the DataFrame and build the list of [x, y, z] points
        for i in range(len(index_list)):
            for j in range(len(column_list)):
                # record X, Y, Z
                temp_list = [index_list[i], column_list[j], data.iloc[i, j]]
                # print(i,j,index_list[i],column_list[j])
                data_list.append(temp_list)

        c = (
            Bar3D(init_opts=opts.InitOpts(
                width=DEFAULT_WIDTH,
                animation_opts=opts.AnimationOpts(
                    animation_delay=200,
                    animation_easing="bounceOut"),  # add a startup animation effect
            )).add(
                series_name=title,
                data=data_list,
                xaxis3d_opts=opts.Axis3DOpts(type_="category",
                                             data=index_list),
                yaxis3d_opts=opts.Axis3DOpts(type_="category",
                                             data=column_list),
                zaxis3d_opts=opts.Axis3DOpts(type_="value"),
            ).set_series_opts(label_opts=opts.LabelOpts(is_show=True)).
            set_global_opts(
                title_opts=opts.TitleOpts(title=title, pos_left="0%"),
                toolbox_opts=opts.ToolboxOpts(),  # show the toolbox
                tooltip_opts=opts.TooltipOpts(is_show=True),
                axispointer_opts=opts.AxisPointerOpts(
                    is_show=True, type_="none"),  # show all values as the pointer moves
                legend_opts=opts.LegendOpts(
                    is_show=True,
                    selected_mode="multiple",
                    # pos_bottom="0%",
                    # pos_right="0%",
                    # orient="vertical",
                ),  # show the legend
                # datazoom_opts=[
                #     opts.DataZoomOpts(
                #         range_start=0, range_end=100, orient="vertical", pos_left="2%"
                #     ),
                #     opts.DataZoomOpts(range_start=0, range_end=100, orient="horizontal"),
                # ],  # zoom config: enable zooming on both axes
                visualmap_opts=opts.VisualMapOpts(max_=max_data, min_=min_data)
                # visualmap_opts=opts.VisualMapOpts(type_="color", max_=1, min_=-1),
            ))
        return c
Example #18
def decay(close, kind=None, length=None, mode=None, offset=None, **kwargs):
    """Indicator: Decay"""
    # Validate Arguments
    close = verify_series(close)
    length = int(length) if length and length > 0 else 5
    mode = mode.lower() if isinstance(mode, str) else "linear"
    offset = get_offset(offset)

    # Calculate Result
    _mode = "L"
    if mode == "exp" or kind == "exponential":
        _mode = "EXP"
        diff = close.shift(1) - exp(-length)
    else: # "linear"
        diff = close.shift(1) - (1 / length)
    diff[0] = close[0]
    tdf = DataFrame({"close": close, "diff": diff, "0": 0})
    ld = tdf.max(axis=1)

    # Offset
    if offset != 0:
        ld = ld.shift(offset)

    # Handle fills
    if "fillna" in kwargs:
        ld.fillna(kwargs["fillna"], inplace=True)
    if "fill_method" in kwargs:
        ld.fillna(method=kwargs["fill_method"], inplace=True)

    # Name and Categorize it 
    ld.name = f"{_mode}DECAY_{length}"
    ld.category = "trend"

    return ld
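An illustrative call (assuming the helpers used above, verify_series, get_offset and exp, are importable from the surrounding library):

close = pd.Series([5.0, 4.0, 3.0, 2.0, 10.0, 1.0])
decay(close, length=5)              # linear: max(close, previous close - 1/length, 0)
decay(close, mode="exp", length=5)  # exponential variant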
Example #19
    def testSingle(self, test, fold):
        #
        # devents = xgb.DMatrix( test[ self.variables ].values )
        # prediction = DataFrame( self.models[fold].predict( devents ) )
        #
        # return DataFrame(dtype = float, data = {"predicted_class":prediction.idxmax(axis=1).values,
        #                          "predicted_prob": prediction.max(axis=1).values } )

        devents = xgb.DMatrix(test[self.variables].values)
        prediction = DataFrame(self.models[fold].predict(devents))

        # note: this uses idxmax (the column header of the max value) and tries to convert it to a float
        # therefore renaming of the header should be done AFTER extracting the predicted_class
        df = DataFrame(dtype=float,
                       data={
                           "predicted_frac_class":
                           prediction.idxmax(axis=1).values,
                           "predicted_frac_prob": prediction.max(axis=1).values
                       })

        # header renaming
        headers = []
        for i in range(0, len(prediction.columns)):
            headers.append("predicted_frac_prob_" + str(i))
        prediction.columns = headers

        # horizontal concat (adding columns)
        result = concat([prediction, df], axis=1)

        return result
Example #20
def get_answer1(df: pd.DataFrame) -> float:
    Nstep = 100
    df = df2grid(df)
    df = df.astype(int)
    Nflashes = 0
    for itt in range(0, Nstep):
        #step 1
        df += 1

        #step 2
        df_has_flashed = pd.DataFrame(False,
                                      index=df.index,
                                      columns=df.columns)
        max_val_not_flashed = df.max().max()
        while max_val_not_flashed > 9:
            df_flash = df > 9
            for x in itertools.product(list(df.index), list(df.columns)):
                if df_flash.iloc[x] and not df_has_flashed.iloc[x]:
                    #add to all neighbors
                    xn_list = get_neighbor_idx(x, df)
                    for xn in xn_list:
                        df.iloc[xn] += 1
            # update
            df_has_flashed = df_has_flashed | df_flash
            max_val_not_flashed = df[~df_has_flashed].max().max()

        #step 3
        df[df_has_flashed] = 0
        Nflashes += df_has_flashed.sum().sum()

    return Nflashes
Example #21
def linear_decay(close, length=None, offset=None, **kwargs):
    """Indicator: Linear Decay"""
    # Validate Arguments
    close = verify_series(close)
    length = int(length) if length and length > 0 else 5
    offset = get_offset(offset)

    # Calculate Result
    diff = close.shift(1) - (1 / length)
    diff[0] = close[0]
    tdf = DataFrame({"close": close, "diff": diff, "0": 0})
    ld = tdf.max(axis=1)

    # Offset
    if offset != 0:
        ld = ld.shift(offset)

    # Handle fills
    if "fillna" in kwargs:
        ld.fillna(kwargs["fillna"], inplace=True)
    if "fill_method" in kwargs:
        ld.fillna(method=kwargs["fill_method"], inplace=True)

    # Name and Categorize it
    ld.name = f"LDECAY_{length}"
    ld.category = "trend"

    return ld
Example #22
 def force(G: nx.Graph):
     df = DataFrame(index=G.nodes(), columns=G.nodes())
     for row, data in nx.shortest_path_length(G):
         for col, dist in data.items():
             df.loc[row, col] = dist
     df = df.fillna(df.max().max())
     return df.to_dict()
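The nested dict returned above has the {source: {target: distance}} shape that networkx layout routines accept, so a plausible use (inferred from the function name, not stated in the source) is:

G = nx.karate_club_graph()
pos = nx.kamada_kawai_layout(G, dist=force(G))  # padded shortest-path lengths as target distances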
def get_preds_probas(est: ClassifierMixin, X_test: DataFrame, y_test: Series,
                     mapper_dict: Dict) -> DataFrame:
    """
    Get prediction probabilities (if available) or return true and predicted
    labels
    """
    df_preds = DataFrame(est.predict(X_test), index=X_test.index)
    if hasattr(est.named_steps["clf"], "predict_proba"):
        # Get prediction probabilities (if available)
        df_probas = DataFrame(est.predict_proba(X_test), index=X_test.index)

        # Append prediction and prediction probabilities
        df_summ = concat([df_preds, df_probas], axis=1)
        df_summ.columns = ["predicted_label"] + [
            f"probability_of_{i}" for i in range(0, len(np.unique(y_test)))
        ]

        # Get label (class) with maximum prediction probability for each row
        df_summ["max_class_number_manually"] = df_probas.idxmax(axis=1)
        df_summ["probability_of_max_class"] = df_probas.max(axis=1)

        # Compare .predict_proba() and manually extracted prediction
        # probability
        lhs = df_summ["max_class_number_manually"]
        rhs = df_summ["predicted_label"].replace(mapper_dict)
        assert (lhs == rhs).eq(True).all()
    else:
        df_summ = df_preds.copy()
    # Get true label
    df_summ.insert(0, "true_label", y_test)
    return df_summ
Example #24
def generate_animation(df: pd.DataFrame):
    global y_max

    dates = df.date.unique()
    ls = [type(item) for item in dates]
    plot_title, ax_title, marker_col, value_col = get_titles()
    # data_filtered = data[data['date'] == dates[0]]
    marker_col = 'Kanton'
    progress_bar = st.progress(0)
    progress_timestep_inc = 1 / len(dates) * 100
    progress_timestep = 0
    anim = st.empty()
    y_max = df.max(axis = 0)['value']
    value_col = 'value'
    df = df.fillna(0)
    i=1
    for dt in dates:
        data_filtered = df[df['date'] == dt]
        ts = pd.to_datetime(str(dt))
        plot_title = cn.variable_dic[variables[0]] + ', Datum: ' + ts.strftime('%d.%m.%Y')
        chart = get_bar_chart(data_filtered, plot_title, ax_title, marker_col, value_col, 'Kanton')
        anim.altair_chart(chart)
        # chart.bar_chart(data_filtered)
        if dt == cn.DATE_LIST[-1] or progress_timestep > 100:
            progress_timestep = 100
        progress_bar.progress(progress_timestep)
        progress_timestep = int(i * progress_timestep_inc)
        time.sleep(0.8)
        i += 1
Example #25
def create_metric_bar_chart_comparison(df: DataFrame,
                                       output_filename: str,
                                       max_y_limit: int = None):
    metric = df.columns[1]
    if max_y_limit is None:
        # fall back to the data maximum when no explicit limit is provided
        max_y_limit = df.max()[metric]
    sns.set(style="whitegrid")
    sns.set_context("paper",
                    rc={
                        "font.size": 14,
                        "axes.titlesize": 32,
                        "axes.labelsize": 18
                    })
    g = sns.catplot(
        x="",
        y=metric,
        hue="Approach",
        data=df,
        kind="bar",
        height=5,
        aspect=1,
        palette=["skyblue", "sandybrown", "green"],
    )
    g.set(ylim=(0, max_y_limit))
    g.savefig(output_filename)
Example #26
def learning ():
	# data input
	dados = read_csv("lookout_histories.csv")
	data = DataFrame(dados['history']) # Input to system
	output = DataFrame(dados['output']) # Comparison output to system
	output = (output - output.mean()) / (output.max() - output.min()) # Normalization


	model = Sequential()
	model.add(Dense(20, input_dim = data.shape[1], activation = 'relu'))
	model.add(Dense(1, activation = 'sigmoid'))

	print("Modelo pronto")

	model.compile(optimizer='rmsprop',
				loss = 'sparse_categorical_crossentropy',
				metrics=['accuracy'])
	# Train the model, iterating on the data in batches of 32 samples
	model.fit(data, output, epochs=3, batch_size=8)

	print("Modelo terminado")
	# evaluate the model
	scores = model.evaluate(data, output)
	print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
	return()
Example #27
def createAALstats(df_aal: pd.DataFrame) -> pd.DataFrame:
    """Group together some basic statistics from each AAL Group"""
    stat_array = np.array([df_aal.mean(),df_aal.median(),df_aal.min(),df_aal.max()]).T
    stat_cols = ['Average','Median','Minimum','Maximum']
    stat_index = df_aal.mean().index
    df_stats = pd.DataFrame(stat_array,columns=stat_cols,index=stat_index)
    return df_stats
Example #28
    def test_ndarray_compat(self):

        # test numpy compat with Series as sub-class of NDFrame
        tsdf = DataFrame(
            np.random.randn(1000, 3),
            columns=["A", "B", "C"],
            index=date_range("1/1/2000", periods=1000),
        )

        def f(x):
            return x[x.idxmax()]

        result = tsdf.apply(f)
        expected = tsdf.max()
        tm.assert_series_equal(result, expected)

        # using an ndarray like function
        s = Series(np.random.randn(10))
        result = Series(np.ones_like(s))
        expected = Series(1, index=range(10), dtype="float64")
        tm.assert_series_equal(result, expected)

        # ravel
        s = Series(np.random.randn(10))
        tm.assert_almost_equal(s.ravel(order="F"), s.values.ravel(order="F"))
Example #29
def guiyi(a, h):
    b = a.T  # transpose the original matrix first
    c = DataFrame(b)  # turn it into a DataFrame
    d = (c - c.min(axis=0)) / (c.max(axis=0) - c.min(axis=0))  # min-max normalization
    e = np.array(d)  # back to a (normalized) array
    f = e.T  # transpose back
    g = np.hstack((h, f))  # prepend the index columns and return the normalized matrix
    return g
Example #30
def describe(df: pd.DataFrame) -> pd.DataFrame:
    return pd.concat([
        df.mean().rename('mean'),
        df.median().rename('median'),
        df.max().rename('max'),
        df.min().rename('min')
    ],
                     axis=1).T
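For instance (toy frame; output layout is approximate):

describe(pd.DataFrame({"a": [1, 2, 3], "b": [4.0, 4.5, 5.0]}))
#           a    b
# mean    2.0  4.5
# median  2.0  4.5
# max     3.0  5.0
# min     1.0  4.0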
Example #31
 def __init__(self, data: pd.DataFrame, cmap: Colormap):
     super().__init__()
     self._data = data
     self.max = data.max().max()
     self.min = data.min().min()
     self.normalize = Normalize(self.min, self.max)
     self.cmap = cmap
     self.generate_colors()
def _highlight_max(data: pd.DataFrame, color="yellow") -> pd.DataFrame:
    attr = "background-color: {}".format(color)
    data = data.astype(float)
    max_data = data.max(axis=1, level=0)
    is_max = data.eq(max_data, axis=1)
    return pd.DataFrame(
        np.where(is_max, attr, ""), index=data.index, columns=data.columns
    )
Example #33
def select_signatures(W: pd.DataFrame, H: pd.DataFrame):
    """
    Scales NMF output by sample and feature totals to select Signatures.
    ------------------------
    Args:
        * W: input W matrix (n_features x K)
        * H: input H matrix (K x n_samples)

    Returns:
        * W: output W matrix with max_id, max, and max_norm columns
        * H: output H matrix with max_id, max, and max_norm columns
    """
    Wnorm = W.copy()
    Hnorm = H.copy()

    # Scale Matrix
    for j in range(W.shape[1]):
        Wnorm.iloc[:,j] *= H.sum(1).values[j]
        Hnorm.iloc[j,:] *= W.sum(0).values[j]

    # Normalize
    Wnorm = Wnorm.div(Wnorm.sum(1),axis=0)
    Hnorm = Hnorm.div(Hnorm.sum(0),axis=1)

    H = H.T
    Hnorm = Hnorm.T

    # Get Max Values
    H_max_id = H.idxmax(axis=1, skipna=True).astype('int')
    H['max'] = H.max(axis=1, skipna=True)
    H['max_id'] = H_max_id
    Hnorm['max_norm']=Hnorm.max(axis=1, skipna=True)

    W_max_id = W.idxmax(axis=1, skipna=True).astype('int')
    W['max'] = W.max(axis=1, skipna=True)
    W['max_id'] = W_max_id
    Wnorm['max_norm']=Wnorm.max(axis=1, skipna=True)

    H['max_norm'] = Hnorm['max_norm']
    W['max_norm'] = Wnorm['max_norm']

    _rename = {x:'S'+x for x in list(H)[:-3]}
    H = H.rename(columns=_rename)
    W = W.rename(columns=_rename)

    return W,H
 def kmeanCunt(data: DataFrame, k):
     from sklearn.cluster import KMeans
     kmodel = KMeans(n_clusters=k)  # build the model
     kmodel.fit(data.values.reshape(len(data), 1))  # fit the model
     c = pd.DataFrame(kmodel.cluster_centers_).sort_values(0)  # sort the cluster centers
     w = c.rolling(2).mean().iloc[1:]  # midpoints of adjacent centers become the bin edges
     w = [0] + list(w[0]) + [data.max()]  # add the first and last edges
     data = pd.cut(data, w)
Example #35
    def __generate_trace(self, objectives: DataFrame, metadata: list = None, legend: str = '', normalize: bool = False,
                         **kwargs):
        number_of_objectives = objectives.shape[1]

        if normalize:
            objectives = (objectives - objectives.min()) / (objectives.max() - objectives.min())

        marker = dict(
            color='rgb(127, 127, 127)',
            size=3,
            symbol='x',
            line=dict(
                color='rgb(204, 204, 204)',
                width=1
            ),
            opacity=0.8
        )
        marker.update(**kwargs)

        if number_of_objectives == 2:
            trace = go.Scattergl(
                x=objectives[0],
                y=objectives[1],
                mode='markers',
                marker=marker,
                name=legend,
                customdata=metadata
            )
        elif number_of_objectives == 3:
            trace = go.Scatter3d(
                x=objectives[0],
                y=objectives[1],
                z=objectives[2],
                mode='markers',
                marker=marker,
                name=legend,
                customdata=metadata
            )
        else:
            dimensions = list()
            for column in objectives:
                dimensions.append(
                    dict(range=[0, 1],
                         label=self.axis_labels[column:column+1][0] if self.axis_labels[column:column+1] else None,
                         values=objectives[column])
                )

            trace = go.Parcoords(
                line=dict(color='blue'),
                dimensions=dimensions,
                name=legend,
            )

        return trace
Example #36
    def test_ndarray_compat(self):

        # test numpy compat with Series as sub-class of NDFrame
        tsdf = DataFrame(np.random.randn(1000, 3), columns=['A', 'B', 'C'],
                         index=date_range('1/1/2000', periods=1000))

        def f(x):
            return x[x.idxmax()]

        result = tsdf.apply(f)
        expected = tsdf.max()
        tm.assert_series_equal(result, expected)

        # .item()
        s = Series([1])
        result = s.item()
        assert result == 1
        assert s.item() == s.iloc[0]

        # using an ndarray like function
        s = Series(np.random.randn(10))
        result = Series(np.ones_like(s))
        expected = Series(1, index=range(10), dtype='float64')
        tm.assert_series_equal(result, expected)

        # ravel
        s = Series(np.random.randn(10))
        tm.assert_almost_equal(s.ravel(order='F'), s.values.ravel(order='F'))

        # compress
        # GH 6658
        s = Series([0, 1., -1], index=list('abc'))
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = np.compress(s > 0, s)
        tm.assert_series_equal(result, Series([1.], index=['b']))

        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = np.compress(s < -1, s)
        # result empty Index(dtype=object) as the same as original
        exp = Series([], dtype='float64', index=Index([], dtype='object'))
        tm.assert_series_equal(result, exp)

        s = Series([0, 1., -1], index=[.1, .2, .3])
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = np.compress(s > 0, s)
        tm.assert_series_equal(result, Series([1.], index=[.2]))

        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = np.compress(s < -1, s)
        # result empty Float64Index as the same as original
        exp = Series([], dtype='float64', index=Index([], dtype='float64'))
        tm.assert_series_equal(result, exp)
def _to_labels(probabilities: pd.DataFrame) -> pd.Series:
    labels = probabilities.idxmax(axis='columns')

    # Find places where there are multiple maximum values
    max_probabilities = probabilities.max(axis='columns')
    is_max: pd.DataFrame = probabilities.eq(max_probabilities, axis='rows')
    number_of_max: pd.Series = is_max.sum(axis='columns')
    multiple_max: pd.Series = number_of_max.gt(1)
    # Set those locations as an 'undecided' label
    labels[multiple_max] = 'undecided'
    # TODO: emit a warning if any are set to 'undecided'

    return labels
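A small sketch of the tie-handling behaviour (column names invented for illustration):

probs = pd.DataFrame({"cat": [0.7, 0.5], "dog": [0.3, 0.5]})
_to_labels(probs)
# 0          cat
# 1    undecided
# dtype: object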
Example #38
    def test_fillna_dict_series(self):
        df = DataFrame({'a': [np.nan, 1, 2, np.nan, np.nan],
                        'b': [1, 2, 3, np.nan, np.nan],
                        'c': [np.nan, 1, 2, 3, 4]})

        result = df.fillna({'a': 0, 'b': 5})

        expected = df.copy()
        expected['a'] = expected['a'].fillna(0)
        expected['b'] = expected['b'].fillna(5)
        assert_frame_equal(result, expected)

        # it works
        result = df.fillna({'a': 0, 'b': 5, 'd': 7})

        # Series treated same as dict
        result = df.fillna(df.max())
        expected = df.fillna(df.max().to_dict())
        assert_frame_equal(result, expected)

        # disable this for now
        with pytest.raises(NotImplementedError, match='column by column'):
            df.fillna(df.max(1), axis=1)
 def _extract_wpa(self, document):
     
     verbs = set(self._get_verbs(document, with_tags=False))
     
     count_verbs = len(verbs)
     count_pa_words = len(PA_WORDS)
     
     wpa_similarity_frame = DataFrame(
         np.empty((count_verbs, count_pa_words)), index=verbs, columns=PA_WORDS
     )
     
     for verb in verbs:
         for pa_word in PA_WORDS:                
             synset_1 = Synset('{}.v.0'.format(Word(pa_word).lemmatize('v')))                                
             synset_2 = Synset('{}.v.0'.format(Word(verb).lemmatize('v')))                                
             wpa_similarity_frame.loc[verb, pa_word] = synset_2.wup_similarity(synset_1)
     
     wpa_max_columns = wpa_similarity_frame.max()
             
     return max(wpa_max_columns)
def cross_validate_trades(trades, N = 20, subset_fraction = 0.7):
    
    tickers = trades.tickers
    sample_size = round(len(tickers) * subset_fraction)
    summary = DataFrame(dtype = float)

    for n in range(N):
        sample_tickers = list(random.choice(tickers, sample_size, replace = False))
        trade_subset = trades.find(lambda T: T.ticker in sample_tickers)
        summary[n] = summary_report(trade_subset)

    result = DataFrame(dtype = float)
    result['Base'] = summary_report(trades)
    result['Mean'] = summary.mean(axis = 1)
    result['Std'] = summary.std(axis = 1)
    result['Median'] = summary.median(axis = 1)
    result['Max'] = summary.max(axis = 1)
    result['Min'] = summary.min(axis = 1)

    return (result, summary)
Example #41
 def predict(self, prediction_data):
   preds = DataFrame(prediction_data)
   col_names = prediction_data.keys()
   tally_dict = {}
   for col_name in unique(preds):
     tally_dict[col_name] = [0 for x in range(preds.shape[0])]
   for row in preds.iterrows():
     index, data = row
     for col_name, elem in zip(col_names, data):
       tally_dict[elem][index] += self.weights[col_name]
   tally_df = DataFrame(tally_dict)
   max_val = [int(round(x)) for x in tally_df.max(1).tolist()]
   max_level = []
   for row in tally_df.index:
      int_vals = [int(round(x)) for x in tally_df.loc[row].tolist()]
     is_max = [x == max_val[row] for x in int_vals]
     if sum(is_max) > 1:
       max_level.append(None)
     else:
       max_level.append(tally_df.columns[ is_max ][0])
   return(max_level)
dframe1 = DataFrame(arr, index=["A", "B"], columns=["One", "Two", "Three"])
dframe1

# Sum method
dframe1.sum()  # ignores null values (treats them as 0s)
dframe1.sum(axis=1)  # sum across rows

# Min method
dframe1.min()  # finds the minimum value in each column
dframe1.min(axis=1)  # minimum value of each row

dframe1.idxmin()  # index of the minimum value in each column

# Max method
dframe1.max()
dframe1.idxmax()

# Cumulative sum
dframe1.cumsum()  # accumulates values down each column

# Describe method
dframe1.describe()  # summary statistics of dataframe (by columns)

# correlation and covariance
# pandas.io.data was removed from pandas; pandas_datareader provides the same interface
import pandas_datareader.data as pdweb
import datetime

prices = pdweb.get_data_yahoo(
Example #43
def pca(x, y=None, ylev=None,
        nlab=0, lsize=10, lalpha=1,
        center="both", scale="none",
        legend=True, cname="variable",
        color=None):
    if not isinstance(color, dict):
        color = None
    xForSvd = x.loc[:, x.std(axis=0) > 0]
    xsvd = svdForPca(xForSvd, center, scale)
    svdRowPlot = DataFrame(
        xsvd[0][:, 0:2],
        index = xForSvd.index,
        columns = ["PC1", "PC2"]
    )
    svdRowPlot = svdRowPlot.divide(svdRowPlot.max(axis=0) -
                                   svdRowPlot.min(axis=0), axis=1)
    svdColPlot = DataFrame(
        numpy.transpose(xsvd[2][0:2, :]),
        index = xForSvd.columns,
        columns = ["PC1", "PC2"]
    )
    svdColPlot = svdColPlot.divide(svdColPlot.max(axis=0) -
                                   svdColPlot.min(axis=0), axis=1)
    if nlab > 0:
        svdColPlotMag = (svdColPlot**2).sum(axis=1)
        svdColPlotMag.sort_values(ascending=False, inplace=True)
        svdColPlot = svdColPlot.loc[svdColPlotMag.index]
        svdColPlot["label"] = ""
        svdColPlot.iloc[0:nlab, svdColPlot.columns.get_loc("label")] = \
                svdColPlot.index[0:nlab]
    if legend:
        ax = plt.subplot(111)
    plt.plot(svdColPlot["PC1"], svdColPlot["PC2"],
             "o", color=(0, 0, 0, 0.1), markersize=5,
             label=cname)
    if nlab > 0:
        for i in range(nlab):
            plt.text(svdColPlot["PC1"].iloc[i],
                     svdColPlot["PC2"].iloc[i],
                     svdColPlot["label"].iloc[i],
                     fontsize = lsize,
                     color = (0, 0, 0, lalpha),
                     label = None)
    if y is not None:
        if ylev is None:
            ylev = y.unique()
        for level in ylev:
            if color is not None and level in color.keys():
                plt.plot(svdRowPlot.loc[y == level, "PC1"],
                         svdRowPlot.loc[y == level, "PC2"],
                         "o",
                         markersize = 8,
                         label = level,
                         color = color[level])
            else:
                plt.plot(svdRowPlot.loc[y == level, "PC1"],
                         svdRowPlot.loc[y == level, "PC2"],
                         "o",
                         markersize = 8,
                         label = level)
    else:
        plt.plot(svdRowPlot["PC1"], svdRowPlot["PC2"],
                 "o", markersize=8)
    if legend:
        box = ax.get_position()
        ax.set_position([box.x0, box.y0, box.width*0.8, box.height])
        ax.legend(loc="center left", bbox_to_anchor=(1, 0.5), numpoints=1)
    plt.show()
Example #44
  'pastrami': 'cow',
  'corned beef': 'cow',
  'honey ham': 'pig',
  'nova lox': 'salmon'
}

data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
data

data['food'].map(lambda x: meat_to_animal[x.lower()])

# Data normalization
datafile = 'd:/data/normalization_data.xls' # parameter initialization
data = pd.read_excel(datafile, header = None) # read the data

(data - data.min())/(data.max() - data.min()) # min-max normalization
(data - data.mean())/data.std() # zero-mean (z-score) normalization
data/10**np.ceil(np.log10(data.abs().max())) # decimal scaling normalization


### Replace values
data = Series([1., -999., 2., -999., -1000., 3.])
data

data.replace(-999, np.nan)

data.replace([-999, -1000], np.nan)

data.replace([-999, -1000], [np.nan, 0])

data.replace({-999: np.nan, -1000: 0})
c    36
'''

print('lambda (anonymous functions) and how to apply them')
print(frame)
'''
   A  B  C
a  0  1  2
b  3  4  5
c  6  7  8
'''
print(frame.max())
'''
A    6
B    7
C    8
'''
f = lambda x: x.max() - x.min()
print(frame.apply(f))  # applied to each column
'''
A    6
B    6
C    6
'''
print(frame.apply(f, axis=1))  # applied to each row
    def _extract_ado_loc_org(self, document):
        """
            PA word has dependent object of PO category (ADO)
            
            In a PI post, a purchase action is targeted towards a consumable object. 
            This is reflected in the dependency structure of the text.
            
            In a PI post, the consumable object is usually the directly 
            dependent object of the purchase action verb.
            
            If there is a PA word in the text and it has a dependent object belonging to a PO category, ADO = 1, otherwise ADO = 0
             
        """
        
        # 1. Identify if there is a PA word. (or a very similar one)
        # 2. Identify if this PA word has an object.
        # 3. Identify if this object belongs to the PO category.
        # 4. If the 3 statements above are true, return ADO = 1 else ADO = 0
        
        s = pattern.en.parsetree(document, relations=True, lemmata=True)
        
        ADO = 0
        LOC = 0
        ORG = 0
        
        # Extract VERBs
        # Find out if they are ACTION VERBs or not
        # For each one found, find if it has an object which is its direct dependent
        # For each object found, find if it belongs to a PO category
        # For each object found, find its NER (LOC, ORG)
        
        for sentence in s:
            
            for chunk in sentence.chunks:
                
                if chunk.type == 'VP':

                    print('Chunk    : ', chunk)
                    print('Subject  : ', chunk.subject)
                    print('Object   : ', chunk.object)
                    print('String   : ', chunk.string)
                    print('Tagged   : ', chunk.tagged)
                    print('Role     : ', chunk.role)
                    print('Relation : ', chunk.relation)
                    print('Related  : ', chunk.related)

                    
                    # Does it have an object?                    
                    if chunk.object is not None:
                        
                        # Get the verbs!                        
                        verbs_and_tags = [t for t in chunk.tagged if t[1] in VERB_TAGS]
                        print('Verbs    : ', verbs_and_tags)
                        verbs = [verb[0] for verb in verbs_and_tags]
                    
                        # Are they PA words?                            
                        count_verbs = len(verbs)
                        count_pa_words = len(PA_WORDS)
        
                        wpa_similarity_frame = DataFrame(
                            np.empty((count_verbs, count_pa_words)), index=verbs, columns=PA_WORDS
                        )
        
                        for verb in verbs:
                            for pa_word in PA_WORDS:                
                                synset_1 = Synset('{}.v.0'.format(Word(pa_word).lemmatize('v')))                                
                                synset_2 = Synset('{}.v.0'.format(Word(verb).lemmatize('v')))                                
                                wpa_similarity_frame[pa_word][verb] = synset_2.wup_similarity(synset_1) 
        
                        wpa_max_columns = wpa_similarity_frame.max()
                
                        wpa = max(wpa_max_columns)
                        
                        if wpa >= 0.7:
                            # Get the nouns from the object
                            
                            head_noun = chunk.object.head                            
                            
                            # do they belong to PO category? 

                            if head_noun:  # check if head belongs to PO Category
                                print('Head    : ', head_noun)

                                ADO = 1  # Fix this: actually determine whether head_noun belongs to a PO category
                                
                                # IMPORTANT: 
                                # Try and compile your own list of Consumable and Non-Consumable Categories
                                # as well as the words that belong to them.
                                # Freebase isn't available and Google Knowledge base seems not applicable.
                                
                                print('Next PP   : ', chunk.object.next('PP'))
                                if chunk.object.next('PP') is not None:
                                    print('Next NP   : ', chunk.object.next('PP').next('NP'))
                                
                                    word = chunk.object.next('PP').next('NP').head
                                
                                    ner_tagged = stanford_tagger.tag([word.string.title()])
                                    print('NER      : ', ner_tagged)
                                    print()

                                    loc_tags = [w for w in ner_tagged if w[1] in NER_LOC_TAGS]
                                    print('NER_LOC_TAGS:   ', loc_tags)
                                    if loc_tags:
                                        LOC = 1
                                    else:
                                        LOC = 0

                                    org_tags = [w for w in ner_tagged if w[1] in NER_ORG_TAGS]
                                    print('NER_ORG_TAGS:   ', org_tags)
                                    if org_tags:
                                        ORG = 1
                                    else:
                                        ORG = 0

                                
                                return {'ADO': ADO, 'ORG': ORG, 'LOC': LOC}    
                        
                        

                     
                    print()
                    
            
        print()
            

        return {'ADO': ADO, 'ORG': ORG, 'LOC': LOC}
Example #47
# after preparing the data, time to plot it:

for new_counter in range(file_counter+1):
    #print new_counter
    Qbers = final_data[(final_data["Dataset"]==new_counter) & (final_data["Qber"] > 0) ]
    x1 = Qbers.index.tolist()
    y1 = Qbers["Qber"].tolist()
    x1_average = DataFrame.mean(Qbers)["Qber"]
    x1_std_dev = DataFrame.std(Qbers)["Qber"]
    # preparing proper time:
    x1[:] = [x - quelle_initialTimestamps[new_counter] for x in x1]
    
    Raws = final_data[(final_data["Dataset"]==new_counter) & (final_data["Raw key"] > 0) ]
    x2_average = DataFrame.mean(Raws)["Raw key"]
    x2_median = DataFrame.median(Raws)["Raw key"]
    x2_max = DataFrame.max(Raws)["Raw key"]
    
    Raws = Raws[Raws["Raw key"]<(x2_max - (x2_max/100)*20)]
    
    x2 = Raws.index.tolist()
    y2 = Raws["Raw key"].tolist()

    print(x2_average)
    #x2_std_dev = 3
    #once again correcting counter:
    x2[:] = [x - quelle_initialTimestamps[new_counter] for x in x2]
    #print x1[0], x2[0], quelle_initialTimestamps[new_counter]
    # Two subplots, the axes array is 1-d http://matplotlib.org/examples/pylab_examples/subplots_demo.html
    f, axarr = plt.subplots(2, sharex=True)
    axarr[0].grid()
    axarr[0].plot(x1, y1)
df_app_cat = df_app_cat.sort_values("avg")  # DataFrame.sort() was removed; use sort_values


# In[286]:

plt.plot(df_app_cat["avg"])


# In[287]:

plt.plot(df_app_cat["avg"], "bo", df_app_cat["avg"], "k")


# In[288]:

df_app_cat.max()


# In[289]:

t1["app_cat_high"] = 0
t2["app_cat_high"] = 0
test["app_cat_high"] = 0
t1.loc[t1["app_category"] == "fc6fa53d", "app_cat_high"] = 1
t2.loc[t2["app_category"] == "fc6fa53d", "app_cat_high"] = 1
test.loc[test["app_category"] == "fc6fa53d", "app_cat_high"] = 1


# In[292]:

validation_check2(feature_cols, ["app_cat_high"])