Example #1
    def prepare_template_vars(self, sq_m_avg: Series, flats_ttl: DataFrame,
                              flat_disp: DataFrame) -> Dict:
        # final report variables
        pocet_bytu = len(self.data.index)
        nejdrazsi_region = sq_m_avg.idxmax()
        nejvyssi_cena = sq_m_avg.loc[nejdrazsi_region]
        nejlevnejsi_region = sq_m_avg.idxmin()
        nejnizsi_cena = sq_m_avg.loc[nejlevnejsi_region]
        nejvice_bytu = flats_ttl.idxmax()['Počet bytů']
        nejvice_bytu_pct = int(flats_ttl.loc[nejvice_bytu]['Počet bytů'] /
                               pocet_bytu * 100)
        nejcastejsi_dispozice = flat_disp.idxmax()['Počet bytů']
        total = self.get_sum(flat_disp)
        nejcastejsi_dispozice_pct = int(
            flat_disp.loc[nejcastejsi_dispozice]['Počet bytů'] / total * 100)

        return {
            "pocet_bytu": pocet_bytu,
            "nejdrazsi_region": nejdrazsi_region.upper(),
            "nejvyssi_cena": "{:,}".format(nejvyssi_cena).replace(',', ' '),
            "nejlevnejsi_region": nejlevnejsi_region.upper(),
            "nejnizsi_cena": "{:,}".format(nejnizsi_cena).replace(',', ' '),
            "nejvice_bytu": nejvice_bytu,
            "nejvice_bytu_pct": nejvice_bytu_pct,
            "nejcastejsi_dispozice": nejcastejsi_dispozice,
            "nejcastejsi_dispozice_pct": nejcastejsi_dispozice_pct,
            "vygenerovano": datetime.now().strftime("%d.%m.%Y %H:%M")
        }
Example #2
def predict(model, result_dir):
    test_gen = data_generator(test_data_dir, shuffle=False)

    proba = model.predict_generator(test_gen, nb_test_samples)
    proba_df = DataFrame(proba, index=test_gen.filenames)

    proba_df.to_csv(os.path.join(result_dir, 'proba.csv'))
    proba_df.idxmax(axis=1).to_csv(os.path.join(result_dir, 'pred.csv'))
Example #3
def practice_three():
    df = DataFrame(
        [[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
        index=['a', 'b', 'c', 'd'],
        columns=['one', 'two'])
    df.sum()  # sum of each column
    df.sum(axis=1)  # sum of each row
    df.mean(axis=1, skipna=False)
    '''
    axis        0 = down the rows (per column), 1 = across the columns (per row)
    skipna      exclude missing values (True by default)
    level       reduce grouped by a level of a hierarchical index
    '''
    df.idxmax()  # .idxmax()/.idxmin() are indirect statistics: the index label where the max/min is reached
    df.cumsum()  # cumulative
    df.describe()  # produces multiple summary statistics in one shot
    '''
    Descriptive and summary statistics
        count           number of non-NA values
        describe        summary statistics for a Series or for each DataFrame column
        min, max        minimum and maximum values
        argmin, argmax  integer positions at which the min or max is reached
        idxmin, idxmax  index labels at which the min or max is reached
        quantile        sample quantile (from 0 to 1)
        sum             sum of values
        mean            mean of values
        median          arithmetic median (50% quantile) of values
        mad             mean absolute deviation from the mean
        var             sample variance of values
        std             sample standard deviation of values
        skew            sample skewness (third moment) of values
        kurt            sample kurtosis (fourth moment) of values
        cumsum          cumulative sum of values
        cummin, cummax  cumulative minimum and maximum of values
        cumprod         cumulative product of values
        diff            first arithmetic difference (useful for time series)
        pct_change      percent change
    '''
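    # A quick illustrative sketch of a few of the methods listed above, reusing the df defined earlier
    df.quantile(0.5)    # 50% quantile of each column (same as df.median())
    df.diff()           # first difference down each column
    df.pct_change()     # percent change down each column
    df.idxmin()         # index label of the minimum in each column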

    # Correlation and covariance
    '''
    .tail()
    .corr()
    .cov()
    .corrwith()
    '''
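    # Illustrative sketch of the methods listed above on a small random frame (rnd is made up here)
    rnd = DataFrame(np.random.randn(100, 3), columns=['x', 'y', 'z'])
    rnd.tail()                # last five rows
    rnd['x'].corr(rnd['y'])   # correlation between two columns
    rnd.cov()                 # pairwise covariance matrix
    rnd.corrwith(rnd['x'])    # correlation of every column with 'x'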

    # Unique values, value counts, and membership
    obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
    obj.unique()  # unique values
    obj.value_counts()  # frequency of each value
    pd.value_counts(obj.values, sort=False)  # same as above; also works on plain arrays and sequences
    obj.isin(['b', 'c'])  # vectorized set-membership check

    pass
Example #4
def classify_spectro_df(spectro_df: pd.DataFrame):
    """Take a spectrogram df and classify it as inputs."""

    # This checks if there's a peak characteristic of a whistle.
    def get_prominence(series):
        return scipy.signal.peak_prominences(series, [series.argmax()])[0][0]

    # Make a DataFrame to hold information we're going to use to classify what
    #  this audio means.
    df = pd.DataFrame(
        data=spectro_df.idxmax(),
        columns=['peak_freq'],
    )
    df['log_peak_freq'] = np.log(df.peak_freq)
    df['prominence'] = spectro_df.apply(get_prominence)
    df['whistle'] = df.prominence > cfg['prominence_threshold']

    # Get the regions where there's whistling to parse them
    active_regions = _get_active_regions(df)
    for start, end in active_regions:
        print(f'whistle detected: ({start}, {end})')
        keyboard = pynput.keyboard.Controller()
        keyboard.press(pynput.keyboard.Key.caps_lock)
        time.sleep(0.2)
        keyboard.release(pynput.keyboard.Key.caps_lock)

    # Drop anything which has been classified
    if active_regions:
        spectro_df = spectro_df.loc[:, active_regions[-1][-1]:]

    return spectro_df, df, active_regions
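
# Illustrative sketch (not part of the original example) of the prominence check used in
# get_prominence above, on a made-up spectrogram column.
import numpy as np
import scipy.signal

toy_column = np.array([0.1, 0.2, 3.0, 0.2, 0.1])
prominence = scipy.signal.peak_prominences(toy_column, [toy_column.argmax()])[0][0]
print(prominence)  # a large prominence suggests a whistle-like peak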
Example #5
    def bischoff(self, df: pd.DataFrame):
        """
        :param party_vote_dict: Mapping between party names and total received
        votes
        :return:
        """
        preallocate_seats = df["seats"].sum()
        for _ in tqdm(range(self.MAX_SEATS - preallocate_seats)):
            for index, row in df.iterrows():
                # print(index)
                df.at[index, self.Keywords.QUOTA.value] = math.floor(
                    float(row[self.Keywords.TOTAL_VOTES.value]) /
                    float(row[self.Keywords.SEATS.value] + 1))

            party_with_highest_quota = df.idxmax()[self.Keywords.QUOTA.value]
            df.at[party_with_highest_quota, self.Keywords.SEATS.value] += 1

            seats_distributed = df[self.Keywords.SEATS.value].sum()
            if seats_distributed == 200:
                print("Done")
                break
            elif seats_distributed > 200:
                print("Error distributing")
                print(df)
                break
            else:
                pass
                # print(seats_distributed)

        print(df)
Example #6
def get_preds_probas(est: ClassifierMixin, X_test: DataFrame, y_test: Series,
                     mapper_dict: Dict) -> DataFrame:
    """
    Get prediction probabilities (if available) or return true and predicted
    labels
    """
    df_preds = DataFrame(est.predict(X_test), index=X_test.index)
    if hasattr(est.named_steps["clf"], "predict_proba"):
        # Get prediction probabilities (if available)
        df_probas = DataFrame(est.predict_proba(X_test), index=X_test.index)

        # Append prediction and prediction probabilities
        df_summ = concat([df_preds, df_probas], axis=1)
        df_summ.columns = ["predicted_label"] + [
            f"probability_of_{i}" for i in range(0, len(np.unique(y_test)))
        ]

        # Get label (class) with maximum prediction probability for each row
        df_summ["max_class_number_manually"] = df_probas.idxmax(axis=1)
        df_summ["probability_of_max_class"] = df_probas.max(axis=1)

        # Compare .predict_proba() and manually extracted prediction
        # probability
        lhs = df_summ["max_class_number_manually"]
        rhs = df_summ["predicted_label"].replace(mapper_dict)
        assert (lhs == rhs).eq(True).all()
    else:
        df_summ = df_preds.copy()
    # Get true label
    df_summ.insert(0, "true_label", y_test)
    return df_summ
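
# Illustrative usage sketch (not part of the original example). It assumes the estimator is a
# sklearn Pipeline whose classifier step is literally named "clf", as the code above implies,
# and that mapper_dict maps each true label to its predict_proba column number.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

iris = load_iris(as_frame=True)
X, y = iris.data, iris.target
pipe = Pipeline([("clf", LogisticRegression(max_iter=1000))]).fit(X, y)
mapper = {label: i for i, label in enumerate(np.unique(y))}
print(get_preds_probas(pipe, X, y, mapper).head())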
Example #7
    def testSingle(self, test, fold):
        #
        # devents = xgb.DMatrix( test[ self.variables ].values )
        # prediction = DataFrame( self.models[fold].predict( devents ) )
        #
        # return DataFrame(dtype = float, data = {"predicted_class":prediction.idxmax(axis=1).values,
        #                          "predicted_prob": prediction.max(axis=1).values } )

        devents = xgb.DMatrix(test[self.variables].values)
        prediction = DataFrame(self.models[fold].predict(devents))

        # note: this uses idxmax (the column header of the max value) and tries to convert it to a float
        # therefore renaming of the header should be done AFTER extracting the predicted_class
        df = DataFrame(dtype=float,
                       data={
                           "predicted_frac_class":
                           prediction.idxmax(axis=1).values,
                           "predicted_frac_prob": prediction.max(axis=1).values
                       })

        # header renaming
        headers = []
        for i in range(0, len(prediction.columns)):
            headers.append("predicted_frac_prob_" + str(i))
        prediction.columns = headers

        # horizontal concat (adding columns)
        result = concat([prediction, df], axis=1)

        return result
Example #8
def transform_majority_label(rated_annotations: pd.DataFrame) -> pd.Series:
    """
    Reduce a Pandas.DataFrame showing consistency per class per data point to a
    data point - label association.
    :param rated_annotations:
    :return: ndarray.
    """
    return rated_annotations.idxmax(axis=1)
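
# Illustrative usage sketch (the annotation counts below are made up)
import pandas as pd

rated = pd.DataFrame({'cat': [3, 0, 1], 'dog': [0, 2, 1], 'bird': [0, 1, 1]},
                     index=['item_1', 'item_2', 'item_3'])
print(transform_majority_label(rated))
# item_1 -> 'cat', item_2 -> 'dog'; a tie (item_3) resolves to the first column holding the max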
Example #9
def select_signatures(W: pd.DataFrame, H: pd.DataFrame):
    """
    Scales NMF output by sample and feature totals to select Signatures.
    ------------------------
    Args:
        * W: input W matrix (K x n_features)
        * H: input H matrix (n_samples x K)

    Returns:
        * W: output W matrix with max_id, max, and max_norm columns
        * H: output H matrix with max_id, max, and max_norm columns
    """
    Wnorm = W.copy()
    Hnorm = H.copy()

    # Scale Matrix
    for j in range(W.shape[1]):
        Wnorm.iloc[:,j] *= H.sum(1).values[j]
        Hnorm.iloc[j,:] *= W.sum(0).values[j]

    # Normalize
    Wnorm = Wnorm.div(Wnorm.sum(1),axis=0)
    Hnorm = Hnorm.div(Hnorm.sum(0),axis=1)

    H = H.T
    Hnorm = Hnorm.T

    # Get Max Values
    H_max_id = H.idxmax(axis=1, skipna=True).astype('int')
    H['max'] = H.max(axis=1, skipna=True)
    H['max_id'] = H_max_id
    Hnorm['max_norm']=Hnorm.max(axis=1, skipna=True)

    W_max_id = W.idxmax(axis=1, skipna=True).astype('int')
    W['max'] = W.max(axis=1, skipna=True)
    W['max_id'] = W_max_id
    Wnorm['max_norm']=Wnorm.max(axis=1, skipna=True)

    H['max_norm'] = Hnorm['max_norm']
    W['max_norm'] = Wnorm['max_norm']

    _rename = {x:'S'+x for x in list(H)[:-3]}
    H = H.rename(columns=_rename)
    W = W.rename(columns=_rename)

    return W,H
Example #10
def pred_prob_to_pred_label(y_pred_proba: pd.DataFrame) -> List[str]:
    """
    Convert a dataframe of predicted probabilities (shape (n_samples, n_classes)) to
    a list of predicted classes.
    """
    if len(y_pred_proba) == 0:
        return []

    return y_pred_proba.idxmax(axis=1).tolist()
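
# Illustrative usage sketch (the probabilities below are made up)
import pandas as pd

proba = pd.DataFrame([[0.7, 0.2, 0.1],
                      [0.1, 0.3, 0.6]],
                     columns=['setosa', 'versicolor', 'virginica'])
print(pred_prob_to_pred_label(proba))  # ['setosa', 'virginica']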
Example #11
def calConsensus(processedList):
    baseList = ["A", "C", "G", "T"]
    #profileList = []
    df = DataFrame()
    for n in baseList:
        List = [i.count(n) for i in processedList]
        df[n] = Series(List)
    for value in df.idxmax(axis=1):
        print(value, end="")
Example #12
    def testSingle(self, test, fold):

        devents = xgb.DMatrix(test[self.variables].values)
        prediction = DataFrame(self.models[fold].predict(devents))

        return DataFrame(dtype=float,
                         data={
                             "predicted_class":
                             prediction.idxmax(axis=1).values,
                             "predicted_prob": prediction.max(axis=1).values
                         })
Example #13
def _from_categorical(data: DataFrame, mapping: dict) -> DataFrame:
    """
    Based on the mapping computed with the _categorical_mapping function on a
    similar dataset, converts the encoded data back into its initial form.
    :param data: dataset to be converted back to the initial form
    :param mapping: the mapping computed with the _categorical_mapping function
    :return: reverted dataset
    """
    categories = data.idxmax(axis=1)  # get the categories
    return DataFrame([mapping[c] for c in categories])  # build the DataFrame from the list of mapped values
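
# Illustrative usage sketch (the one-hot frame and the mapping below are made up)
from pandas import DataFrame

one_hot = DataFrame({'col_0': [1, 0], 'col_1': [0, 1]})
mapping = {'col_0': 'red', 'col_1': 'blue'}
print(_from_categorical(one_hot, mapping))  # one row per sample: 'red', then 'blue'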
Example #14
def predict_ovr(newX=None, n_jobs=1):
    global _X
    global _Y
    global _newX
    global _coefs_ovr
    _newX = _X if newX is None else newX
    classes = np_sort(unique(_Y))
    with Pool(n_jobs) as pool:
        preds = pool.map(_predict_class_ovr, classes)
    preds = DataFrame(dict(zip(classes, preds)))
    return array(preds.idxmax(axis="columns"))
Example #15
def undo_one_hot(
        df: pd.DataFrame,
        new_column_name: Optional[str] = None
) -> Union[pd.Series, pd.DataFrame]:
    """Undo one-hot encoding."""
    # we have to overwrite the column names because `idxmax` uses the column names
    df.columns = pd.Index(range(df.shape[1]))
    result = df.idxmax(axis="columns")
    if new_column_name is not None:
        return result.to_frame(name=new_column_name)
    else:
        return result
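
# Illustrative usage sketch (the encoded frame below is made up)
import pandas as pd

encoded = pd.DataFrame([[0, 1, 0],
                        [1, 0, 0],
                        [0, 0, 1]])
print(undo_one_hot(encoded, new_column_name='label'))  # column positions of the hot entries: 1, 0, 2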
Example #16
def classify_sentences(sentences, model):
    if model == 'infersent':
        model = load_model('data/infersent_model.pkl',
                           glove_path='data/glove.840B.300d.txt',
                           infersent_path='data/infersent.allnli.pickle')
    else:
        model = load_model('data/bow_model.pkl',
                           glove_path=None,
                           infersent_path=None)
    df = DataFrame(model.predict_proba(sentences), columns=model.classes_)
    df['max'] = df.idxmax(axis=1)
    return df
Example #17
def _to_labels(probabilities: pd.DataFrame) -> pd.Series:
    labels = probabilities.idxmax(axis='columns')

    # Find places where there are multiple maximum values
    max_probabilities = probabilities.max(axis='columns')
    is_max: pd.DataFrame = probabilities.eq(max_probabilities, axis='rows')
    number_of_max: pd.Series = is_max.sum(axis='columns')
    multiple_max: pd.Series = number_of_max.gt(1)
    # Set those locations as an 'undecided' label
    labels[multiple_max] = 'undecided'
    # TODO: emit a warning if any are set to 'undecided'

    return labels
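
# Illustrative usage sketch (the probabilities below are made up to include a tie)
import pandas as pd

proba = pd.DataFrame({'yes': [0.9, 0.5], 'no': [0.1, 0.5]}, index=['row_1', 'row_2'])
print(_to_labels(proba))  # row_1 -> 'yes'; row_2 -> 'undecided' because of the tie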
Example #19
    def experimentDropoutRate(self, args):
        # Load data
        X_train, X_dev, X_test, Y_train, Y_dev, classes = Data().load_data()
        nb_features = X_train.shape[1]
        print(nb_features, 'features')
        nb_classes = Y_train.shape[1]
        print(nb_classes, 'classes')

        args.epochs = 100

        model = None

        folds = Data().cross_validation_split(X_train, Y_train)
        metrics_per_set = DataFrame()
        for rate in arange(0.4, 0.9, 0.1):
            print("\nEvaluating dropout rate ", str(rate))
            count = 0
            mean_of_folds = DataFrame()
            for fold in folds:
                print("Training holding fold", str(count), "out..")
                if count == len(folds) - 1:
                    early_stopping_fold = folds[0]
                else:
                    early_stopping_fold = folds[count + 1]

                union_set = Data().construct_union_set(
                    fold.copy(), early_stopping_fold.copy(), folds.copy())

                model = Classifier(type='DropoutAdam',
                                   nb_features=nb_features,
                                   nb_classes=nb_classes,
                                   epochs=args.epochs,
                                   batch_size=32,
                                   classes=classes,
                                   run_number=args.run,
                                   rate=rate)

                model.fit(X_train=union_set[0],
                          Y_train=union_set[1],
                          X_dev=early_stopping_fold[0],
                          Y_dev=early_stopping_fold[1])
                mean_of_folds["Fold " + str(count + 1)] = model.predict(
                    X_test=fold[0], X_dev=fold[0], Y_dev=fold[1])
                count += 1

            if model is not None:
                metrics_per_set["Rate: " +
                                str(rate)] = mean_of_folds.mean(axis=1)
        # metrics_per_set['cols'] = ['acc', 'prec', 'rec', 'f1']
        print(metrics_per_set)
        print(metrics_per_set.idxmax(axis=1))
Example #20
def bayesclass_predict(Class, model, data):
    x = data
    k = model.predict(x)

    df = DataFrame(index=Class.index.values, columns=x.index.values)
    for i in Class.index.values:
        df.loc[i] = norm.logpdf(x=np.ravel(k),
                                loc=Class.Mean.loc[i],
                                scale=Class.Variance.loc[i])

    condition = np.ravel([df.max() > -50])
    j = np.round(np.ravel(k))
    j = j * (~condition)
    j = j + np.ravel(df.idxmax()) * condition

    return j
Example #21
def highest_density_interval(posteriors: pd.DataFrame, p=0.9) -> pd.DataFrame:
    """
    Get HDI

    posteriors: pandas DataFrame of posteriors
    p: confidence interval
    """

    # If we pass a DataFrame, just call this recursively on the columns
    if isinstance(posteriors, pd.DataFrame):
        return pd.DataFrame(
            [
                highest_density_interval(posteriors[col], p=p)
                for col in posteriors
            ],
            index=posteriors.columns,
        )

    cumsum = np.cumsum(posteriors.values)

    # N x N matrix of total probability mass for each low, high
    total_p = cumsum - cumsum[:, None]

    # Return all indices with total_p > p
    lows, highs = (total_p > p).nonzero()

    # Find the smallest range (highest density)
    best = (highs - lows).argmin()

    low = posteriors.index[lows[best]]
    most_likely = posteriors.idxmax(axis=0)
    high = posteriors.index[highs[best]]

    return pd.Series(
        [most_likely, low, high],
        index=["most_likely", f"low_{p*100:.0f}", f"high_{p*100:.0f}"],
    ).round(2)
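
# Illustrative usage sketch (the posterior below is a made-up distribution over candidate values)
import numpy as np
import pandas as pd

grid = np.linspace(0.5, 1.5, 101)
pmf = np.exp(-((grid - 1.1) ** 2) / 0.005)
posteriors = pd.DataFrame({'day_1': pmf / pmf.sum()}, index=grid)
print(highest_density_interval(posteriors, p=0.9))  # most_likely, low_90, high_90 per column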
Example #22
import numpy as np
from numpy.random import randn
from pandas import DataFrame, Series
import matplotlib.pyplot as plt

array1 = np.array([[10, np.nan, 20], [30, 40, np.nan]])
print array1
df1 = DataFrame(array1, index=[1, 2], columns=list('ABC'))
print df1

#sum()
print "Sum of cols", df1.sum()  #sums along each column
print df1.sum(axis=1)  #sum along indexes

print "Min", df1.min()
print "Max", df1.max()

print df1.idxmax()
print df1.cumsum()
print df1.describe()

df2 = DataFrame(randn(9).reshape(3, 3), index=[1, 2, 3], columns=list('ABC'))
print df2

plt.plot(df2)
plt.legend(df2.columns, loc="lower right")
plt.savefig('samplepic.png')
plt.show()

series1 = Series(list('abcccaabd'))
print series1.unique()

print series1.value_counts()
Example #23
df = DataFrame([[1.4, np.nan], [7.1,-4.5], [np.nan, np.nan], [0.75, -1.3]],
               index = ['a','b','c','d'],
               columns=['one','two']
               )

print(df)
print('\n')
print(df.sum())
print('\n')
print(df.sum(axis=1))
print('\n')
print(df.mean())
print('\n')
print(df.mean(axis=1,skipna=False))
print('\n')
print(df.idxmax())
print('\n')
print(df.cumsum())
print('\n')
print(df.cumsum(axis=1))
print('\n')
print(df.describe())
print('\n')

###############################################################

obj = Series(['a','a','b','c']*4)
print(obj)
print(obj.describe())
print('\n')
Example #24
df = DataFrame([[1.4, np.nan], [7.1, -4.5],
[np.nan, np.nan], [0.75, -1.3]],
index=['a', 'b', 'c', 'd'],
columns=['one', 'two'])
df
df.sum() # columns sum
df.sum(axis=1) # sum row by row
df
(7.10 - 4.5)/2
df.mean(axis=1, skipna=False)
df
df.idxmax()
df
df.cumsum() # accumulation
df.describe() # multiple summary statistics in one shot.
obj = Series(['a', 'a', 'b', 'c'] * 4)
obj
obj.describe()
## Correlation and Covariance
import pandas.io.data as web
all_data = {}
for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:
    all_data[ticker] = web.get_data_yahoo(ticker, '1/1/2000', '1/1/2010')
    
price = DataFrame({tic: data['Adj Close']
                   for tic, data in all_data.iteritems()})
Example #25
frame.loc['a':'d', 'STL':]
frame.iloc[0:3, 1:2]
frame['UMST'] = 4
frame.reindex(index=['c', 'e', 'a'], columns=['UM', 'Washu'])
frame[frame < 0] = np.nan
frame.isnull()
frame.dropna()
frame.dropna(axis=1)
um = frame['UM']
um[um.notnull()]
frame.fillna(method='ffill', axis=0, limit=1, inplace=False)
frame.fillna(method='ffill', axis=1, limit=1)
frame.mean()
frame.mean(axis=1, skipna=False)
frame.idxmin()
frame.idxmax(axis=1)
frame2 = DataFrame(
    {
        'Washu': np.random.randn(5),
        'UM': np.random.randn(5),
        'UMST': np.random.randn(5)
    },
    index=list('abcde'))
frame3 = DataFrame({
    'a': {
        'Washu': 1,
        'UM': 3
    },
    'b': {
        'Washu': 2,
        'UM': 4
Example #26
def centered_plot(best_values: pd.DataFrame, best_names: pd.DataFrame):
    # Things are normalized so classifiers are centered at normscore 0
    # Hence we only have to plot best_selector vs best_baseline
    best_values = best_values.drop(columns='classifiers')
    best_names = best_names.drop(columns='classifiers')

    selector_normscores = best_values['selectors'].values
    baseline_normscores = best_values['baselines'].values

    # We choose the label name of whichever performed better
    labels = [
        best_names.loc[task, category]
        for task, category in best_values.idxmax(axis=1).items()
    ]

    # Assign colors so selectors and baselines are visually distinct
    selector_names = set(filter(lambda name: 'selector' in name, labels))
    baseline_names = set(filter(lambda name: 'baseline' in name, labels))

    selector_colors = sns.color_palette('viridis_r', len(selector_names) * 2)
    baseline_colors = sns.color_palette('rocket', len(baseline_names))

    cmap = {
        **{name: selector_colors[i]
           for i, name in enumerate(selector_names)},
        **{name: baseline_colors[i]
           for i, name in enumerate(baseline_names)},
    }
    colors = [cmap[label] for label in labels]

    figsize = (8, 10)
    fig = plt.figure(figsize=figsize)
    ax = fig.add_subplot(1, 1, 1)
    xlims = (-1.3, 1.05)
    ylims = (-1.3, 1.8)
    ax.set_xlim(xlims)
    ax.set_ylim(ylims)

    # Axis lines within box of radius 1
    ax.plot((xlims[0], 1), (0, 0), c='black', linestyle=':', linewidth=0.5)
    ax.plot((0, 0), (ylims[0], 1), c='black', linestyle=':', linewidth=0.5)

    # Horizontal Oracle line
    ax.plot((xlims[0], 1), (1, 1), c='black', linestyle=':', linewidth=1.0)

    # Vertical line for outside the box
    ax.plot((1, 1), (ylims[0], ylims[1]),
            c='black',
            linestyle=':',
            linewidth=0.5)

    # Diagonal line for marking which side is better
    ax.plot((xlims[0], 1), (ylims[0], 1),
            c='grey',
            linestyle='--',
            linewidth=0.2)

    # Text indicating the regions
    offsets = (0.1, 0.05)
    ax.text(0 + 0.3 + offsets[0],
            ylims[0] + offsets[1],
            "best baseline < single best\nbest selector > single best",
            fontsize=8)
    ax.text(xlims[0] + offsets[0],
            ylims[0] + offsets[1],
            "best baseline < single best\nbest selector < single best",
            fontsize=8)
    ax.text(xlims[0] + offsets[0],
            0 + offsets[1],
            "best baseline > single best\nbest selector < single best",
            fontsize=8)
    ax.text(0 + 0.3 + offsets[0],
            0 + offsets[1],
            "best baseline > single best\nbest selector > single best",
            fontsize=8)
    ax.text(0 + offsets[0], 1 + offsets[1], "baseline > oracle", fontsize=8)

    legend_lines = [
        Line2D([0], [0],
               color='w',
               marker='o',
               markerfacecolor=col,
               label=name.replace('_', ' ')) for name, col in cmap.items()
    ]
    ax.legend(handles=legend_lines)

    ax.scatter(x=selector_normscores, y=baseline_normscores, c=colors)
    ax.set_xlabel('Selector normalized score')
    ax.set_ylabel('Baseline normalized score')
    #ax.axes.set_aspect('equal')
    ax.set_title('Selector/Baseline performances for 62 Datasets')
    return fig
Example #27
def umap_dataset_properties_selectors_baselines(best_values: pd.DataFrame,
                                                best_names: pd.DataFrame,
                                                cached_metaprops: str,
                                                random_state=5):
    if len(best_values.index) < 4:
        print("Can't create meaningful UMAP of less than 4 points")
        return

    if os.path.exists(cached_metaprops):
        df_metaprops = pd.read_csv(cached_metaprops, index_col=0)
    else:
        tasks = list(map(int, best_values.index))
        dataset_ids = [
            openml.tasks.get_task(task).dataset_id for task in tasks
        ]

        # This will take a while to get
        # Hence the caching
        dataset_metaprops = [
            openml.datasets.get_dataset(dataset_id).qualities
            for dataset_id in dataset_ids
        ]

        available_keys = reduce(
            lambda acc, metaprops: acc.intersection(metaprops.keys()),
            dataset_metaprops, set(dataset_metaprops[0].keys()))
        dict_metaprops = {
            k: [metaprop[k] for metaprop in dataset_metaprops]
            for k in available_keys
        }
        df_metaprops = pd.DataFrame.from_dict(dict_metaprops,
                                              orient='index',
                                              columns=tasks)
        df_metaprops.to_csv(cached_metaprops)

    # Drop features that have more than cut_percentage missing
    cut_percentage = 0.00  # Most features have 0%, 12% or 67% missing
    for row in df_metaprops.index:
        missing = sum(df_metaprops.loc[row].isnull()) / len(
            df_metaprops.loc[row])
        if missing > cut_percentage:
            df_metaprops.drop(index=row, inplace=True)

    # Convert the rest of the nans to the mean (8/62 had 24/48 missing features)
    df_metaprops = df_metaprops.apply(lambda row: row.fillna(row.mean()),
                                      axis=1)

    df_metaprops = df_metaprops.T  # Make the tasks be on the index

    # Scale Data according to UMAPS recommendation
    df_scaled_metaprops = StandardScaler().fit_transform(df_metaprops)

    # Use UMAP to produce embedding
    # Doesn't really make sense with low number of datasets
    n_datasets = len(df_metaprops)
    K = math.ceil(n_datasets / 2) if n_datasets < 20 else 10
    umapper = UMAP(n_neighbors=K, random_state=random_state)
    embeddings = umapper.fit_transform(df_scaled_metaprops)

    figsize = (10, 12)
    fig = plt.figure(figsize=figsize)
    ax = fig.add_subplot(1, 1, 1)

    # Take out classifiers
    best_values = best_values.drop(columns='classifiers')
    best_names = best_names.drop(columns='classifiers')

    # We choose the label name of whichever performed better
    labels = [
        best_names.loc[task, category]
        for task, category in best_values.idxmax(axis=1).items()
    ]

    # Assign colors so selectors and baselines are visually distinct
    selector_names = set(filter(lambda name: 'selector' in name, labels))
    baseline_names = set(filter(lambda name: 'baseline' in name, labels))

    selector_colors = sns.color_palette('viridis_r', len(selector_names) * 2)
    baseline_colors = sns.color_palette('rocket', len(baseline_names))

    cmap = {
        **{name: selector_colors[i]
           for i, name in enumerate(selector_names)},
        **{name: baseline_colors[i]
           for i, name in enumerate(baseline_names)},
    }
    colors = [cmap[label] for label in labels]

    ax.scatter(embeddings[:, 0], embeddings[:, 1], c=colors)

    legend_lines = [
        Line2D([0], [0],
               color='w',
               marker='o',
               markerfacecolor=col,
               label=name.replace('_', ' ')) for name, col in cmap.items()
    ]
    ax.legend(handles=legend_lines)

    ax.set_xlabel('UMAP axis 1')
    ax.set_ylabel('UMAP axis 2')
    ax.set_title(
        'UMAP projection of dataset meta-features - Selectors / AutoML')

    return fig
Example #28

print 'Other functions'
print df
'''
   one  two
a  1.0  NaN
b  7.0  4.0
c  NaN  NaN
d  0.0  1.0
'''
print df.idxmax()  # index label of the max value in each column
'''
one    b
two    b
'''
print df.cumsum()  # cumulative sum of each column
'''
   one  two
a  1.0  NaN
b  8.0  4.0
c  NaN  NaN
d  8.0  5.0
'''
print df.describe()  # summary statistics for each DataFrame column
Example #29
from pandas import DataFrame

data = {
    'Speed': [101, 109, 106],
    'Temp': [34, 32, 45],
    'Humidity': [4500, 2300, 5800]
}
frame = DataFrame(data)
print(frame)

print(frame.sum())  #to calculate sum of all columns

print(frame.sum(axis=1))  # to calculate sum of rows

print(frame.idxmax())  # index label of the max value in each column

print(frame.idxmin())
Example #30
dframe1 = DataFrame(arr, index=["A", "B"], columns=["One", "Two", "Three"])
dframe1

# Sum method
dframe1.sum()  # ignores null values (treats them as 0s)
dframe1.sum(axis=1)  # sum across rows

# Min method
dframe1.min()  # finds the minimum value in each column
dframe1.min(axis=1)  # minimum value of each row

dframe1.idxmin()  # index label of the minimum value in each column

# Max method
dframe1.max()
dframe1.idxmax()

# Cumulative sum
dframe1.cumsum()  # accumulates down each column's values

# Describe method
dframe1.describe()  # summary statistics of dataframe (by columns)

# correlation and covariance
import pandas.io.data as pdweb

# import pandas_datareader.data as pdweb
import datetime

prices = pdweb.get_data_yahoo(
    ["CVX", "XOM", "BP"], start=datetime.datetime(2010, 1, 1), end=datetime.datetime(2013, 1, 1)
Example #31
df1 = DataFrame(dic1)

#calculating the sum of individual columns

df1.sum()

#calculating the sum of individual rows

df1.sum(axis=1)
#here axis=1 means the operation runs along the horizontal axis (across each row)

#finding the index label of the maximum value in each column
#the result is the index label at which each column's maximum occurs

df1.idxmax()

#similarly, the index label of the minimum value in each column

df1.idxmin()

#fundamental operations on DataFrames like addition, subtraction, etc.

dic2 = {
    "cse": [10, 13, 11],
    "maths": [11, 14, 17],
    "english": [5, 7, 9],
    "ece": [11, 13, 15]
}
df2 = DataFrame(dic2)
Example #32
dframe1

#Let's see the sum() method in action
dframe1.sum()
#Notice how it ignores NaN values

#Notice how it ignores NaN values
dframe1.sum(axis=1)

#Can also grab min and max values of dataframe
dframe1.min()

#As well as their index
dframe1.idxmin()

dframe1.idxmax()

dframe1.max()

dframe1
#Can also do an accumulation sum
dframe1.cumsum()

#A very useful feature is describe, which provides summary statistics
describe=dframe1.describe()

# We can also get information on correlation and covariance

#For more info on correlation and covariance, check out the videos below!
from IPython.display import YouTubeVideo
YouTubeVideo('xGbpuFNR1ME')
Example #33
def main():
    """
    Calculation and aggregation of summary statistics
    """

    # Summary of statistics
    # return is not ndarray
    df = DataFrame([[1.4, np.nan],
                    [7.1, -4.5],
                    [np.nan, np.nan],
                    [0.75, -1.3]],
                   index=list('abcd'),
                   columns=['one', 'two'])
    print df
    print df.sum()
    print df.sum(axis=1)
    print df.mean(axis=1) # exclude nan
    print df.mean(axis=1, skipna=False)
    print df.idxmin()
    print df.idxmax()
    print df.cumsum()
    print df.describe()
    # values are not numeric
    obj = Series(list('aabc') * 4)
    print obj.describe()


    methods = ['count', 'min', 'max', # 'argmin', 'argmax',
               'quantile', 'median', 'mad', 'var', 'std',
               'skew', 'kurt', 'cummin', 'cummax', 'cumprod',
               'diff', 'pct_change']

    for method in methods:
        print u'「{0}」'.format(method)
        print getattr(df, method)()
        print ''

    # Correlation and Covariance
    all_data = {}
    lst = [] # ['AAPL', 'IBM', 'MSFT'] #, 'GOOG']:
    for ticket in lst: #, 'GOOG']:
        # IOError: after 3 tries, Yahoo! did not return a 200
        # for url 'http://ichart.finance.yahoo.com/table.csv?s=GOOG&a=0&b=1&c=2000&d=0&e=1&f=2010&g=d&ignore=.csv'
        all_data[ticket] = pd.io.data.get_data_yahoo(ticket, '1/1/2000', '1/1/2010')
    price = DataFrame({tic: data['Adj Close'] for tic, data in all_data.iteritems()})
    volume = DataFrame({tic: data['Volume'] for tic, data in all_data.iteritems()})
    if all_data:
        returns = price.pct_change()
        print returns.tail()
        print ''
        print returns.MSFT.corr(returns.IBM)
        print returns.MSFT.cov(returns.IBM)
        print ''
        print returns.corr()
        print returns.cov()
        print ''
        print returns.corrwith(returns.IBM)
        print returns.corrwith(volume)

    # unique values, value counts, membership
    print '',''
    obj = Series(list('cadaabbcc'))
    uniques = obj.unique()
    print uniques
    print obj.value_counts()
    print pd.value_counts(obj.values, sort=False)
    mask = obj.isin(['b', 'c'])
    print mask
    print obj[mask]

    data = DataFrame({
        'Qu1' : [1,3,4,3,4],
        'Qu2' : [2,3,1,2,3],
        'Qu3' : [1,5,2,4,4],
    })
    print data
    print data.apply(pd.value_counts).fillna(0)
Example #34
# check whether the index values are unique
obj.index.is_unique
# if an index label maps to multiple values, selecting that label returns all of them
obj['a']

## Summarizing and computing descriptive statistics
df = DataFrame([[1.4,np.nan],[7.1,-4.5],[np.nan,np.nan],[0.75,-1.3]],
	index=['a','b','c','d'], columns=['one','two'])
# over columns
df.sum()
# over rows
df.sum(axis=1)
# NA values are excluded by default; disable that with skipna=False
df.mean(axis=1,skipna=False)
# index label of the max value in each column
df.idxmax()
# cumulative sum
df.cumsum()
df.describe()
# correlation and covariance
returns.MSFT.corr(returns.IBM)
returns.corr()
returns.cov()
returns.corrwith(returns.IBM)

## Unique values, value counts, and membership
obj = Series(['c','a','d','a','a','b','b','c','c'])
uniques = obj.unique()
# count the occurrences of each value
obj.value_counts()
# the counts are sorted by default, but sorting can be turned off
Example #35
### Descriptive statistics
df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'],
               columns=['one', 'two'])
df

df.sum()
df.sum(axis=1)  # NB for this one NaNs are treated as 0

df.cumsum()

df.mean(axis=1, skipna=False)

df.describe()  # also works on other objects

df.idxmax()  # returns the index label of the max in each column

### Handling Missing Data
string_data = Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data
string_data.isnull()
string_data[0] = None
string_data.isnull()

data = Series([1, NA, 3.5, NA, 7])
data.dropna()
data[data.notnull()]  # another way to do it

data = DataFrame([[1., 6.5, 3.], [1., NA, NA], [NA, NA, NA], [NA, 6.5, 3.]])
cleaned = data.dropna()  #  row wise
data
Example #36
# -*- coding:utf-8 -*-
import numpy as np
from pandas import Series, DataFrame

print('Sum')
df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'],
               columns=['one', 'two'])
print(df)
print(df.sum())  # sum by column
print(df.sum(axis=1))  # sum by row

print('Mean')
print(df.mean(axis=1, skipna=False))
print(df.mean(axis=1))

print('Other')
print(df.idxmax())
print(df.cumsum())
print(df.describe())
obj = Series(['a', 'a', 'b', 'c'] * 4)
print(obj.describe())
Example #37
# -*- coding: utf-8 -*- 

import numpy as np
from pandas import Series, DataFrame

print 'Sum'
df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
              index = ['a', 'b', 'c', 'd'],
              columns = ['one', 'two'])
print df
print df.sum()  # sum by column
print df.sum(axis = 1)  # sum by row
print

print 'Mean'
print df.mean(axis = 1, skipna = False)
print df.mean(axis = 1)
print

print 'Other'
print df.idxmax()
print df.cumsum()
print df.describe()
obj = Series(['a', 'a', 'b', 'c'] * 4)
print obj.describe()