def create_grouped_index_df(bin_num):
    ## load the labels and start_time column for train and test data
    start_time = time.time()
    train_labels = pd.read_csv(data_path + train_num_file, index_col='Id',
                               usecols=['Id', dep_var_name])
    train_date_start_column = pd.read_csv(data_path + train_date_file, index_col='Id',
                                          usecols=['Id', start_time_column_name])
    test_date_start_column = pd.read_csv(data_path + test_date_file, index_col='Id',
                                         usecols=['Id', start_time_column_name])
    end_time = time.time()
    print('data loading takes', round(end_time - start_time, 1), 'seconds.')

    ## join the start_time with the labels, then drop rows whose start_time is NaN
    ## so labeled_start_time can be used directly for calculating the mquantiles
    labeled_start_time = pd.merge(train_labels, train_date_start_column, how='left',
                                  left_index=True, right_index=True)
    labeled_start_time = labeled_start_time[~labeled_start_time[start_time_column_name].isnull()]

    ## subset the data by start_time
    prob_list = [1. * i / bin_num for i in range(1, bin_num)]
    quantile_values = mquantiles(labeled_start_time[start_time_column_name], prob=prob_list)
    bins = [labeled_start_time[start_time_column_name].min()]
    bins.extend(quantile_values)
    bins.append(labeled_start_time[start_time_column_name].max())
    bin_names = [str(i) for i in range(len(bins) - 1)]

    ## cut the entire dataframe into different time windows by start_time
    tmp_train = train_date_start_column.copy()
    tmp_test = test_date_start_column.copy()
    tmp_train['time_window_num'] = pd.cut(tmp_train[start_time_column_name], bins, labels=bin_names)
    tmp_test['time_window_num'] = pd.cut(tmp_test[start_time_column_name], bins, labels=bin_names)

    ## create a row number column, start index is 1
    tmp_train['row_num'] = range(1, tmp_train.shape[0] + 1)
    tmp_test['row_num'] = range(1, tmp_test.shape[0] + 1)
    return tmp_train, tmp_test, bins, bin_names
def makediscrete2(df, dataType):
    dataLabels = list(df.columns.values)
    for eachLabel in dataLabels:
        if dataType[eachLabel] == 1:
            choiceC = True
            bins = int(input('Enter bin size for ' + eachLabel + ": "))
            while choiceC:
                choice = int(input("Enter \n 1 - for equal size bins \n 2 - for custom range \n Your choice: "))
                if choice == 1:
                    df[eachLabel] = pd.cut(df[eachLabel], bins)
                    choiceC = False
                elif choice == 2:
                    print("Enter " + str(bins + 1) + " values for bin edges:")
                    binedges = []
                    for x in range(bins + 1):
                        value = float(input("" + str(x) + ": "))
                        binedges.append(value)
                    df[eachLabel] = pd.cut(df[eachLabel], binedges)
                    choiceC = False
                else:
                    print("Wrong choice. Try again!")
    df.to_csv(Globals.DISCRETIZED_FILE)
    print("continuous data converted to discrete data, stored at: " + Globals.DISCRETIZED_FILE)
    return df, Globals.DISCRETIZED_FILE
def cleaneddf(no_bins=0):
    # you'll want to tweak this to conform with your computer's file system
    trainpath = '../../data/train.csv'
    testpath = '../../data/test.csv'
    traindf = pd.read_csv(trainpath)
    testdf = pd.read_csv(testpath)
    if no_bins == 0:
        return [cleandf(traindf), cleandf(testdf)]
    traindf = cleandf(traindf)
    testdf = cleandf(testdf)

    # discretise fare
    binned_fare, bins = pd.qcut(traindf.Fare, no_bins, retbins=True)
    traindf.Fare = binned_fare
    testdf.Fare = pd.cut(testdf.Fare, bins)

    # discretise age (Age is jittered before qcut; jitter is defined elsewhere)
    binned_age, bins = pd.qcut(traindf.Age + jitter(traindf.Age), no_bins, retbins=True)
    traindf.Age = binned_age
    testdf.Age = pd.cut(testdf.Age, bins)

    # create a submission file for kaggle
    predictiondf = pd.DataFrame(testdf['PassengerId'])
    predictiondf['Survived'] = [0 for x in range(len(testdf))]
    predictiondf.to_csv('./prediction.csv', index=False)
    return [traindf, testdf]
def period_by_hours(x, separation):
    '''
    Aggregate x into hour intervals.
    The computation would be simple if crossing over midnight were not allowed.
    '''
    print(separation)
    assert isinstance(separation, list)
    assert all([sep < 24 for sep in separation])
    separation.sort()

    if 0 in separation:
        separation.append(24)
        hour_categ = pd.cut(x.dt.hour, separation, right=False)
        date_categ = x.dt.date
        return date_categ.astype(str) + ' ' + hour_categ.astype(str)
    else:
        hour = x.dt.hour
        hour_categ = pd.cut(hour, separation, right=False).astype(str)
        night_categ = '[' + str(separation[-1]) + ', ' + str(separation[0]) + ')'
        hour_categ[(hour < separation[0]) | (hour >= separation[-1])] = night_categ
        assert hour_categ.nunique(dropna=False) == len(separation)
        date_categ = x.dt.date.astype(str)
        # shift the first hours of the day back by one day
        decale = x.dt.date[x.dt.hour < separation[1]] + pd.DateOffset(days=-1)
        date_categ[x.dt.hour < separation[1]] = decale.astype(str)
        assert all(date_categ.str.len() == 10)
        return date_categ + ' ' + hour_categ
def binData2D(myXYZ, xstart, xend, ystart, yend, nx, ny):
    '''
    Function to bin a scatter point cloud (xyz) into a 2D array

    :param myXYZ: xyz array containing the point cloud coordinates
    :param xstart:
    :param xend:
    :param ystart:
    :param yend:
    :param nx: number of cells along the x-axis
    :param ny: number of cells along the y-axis
    :return: a group object (pandas library) with all points classified into bins
    '''
    # note: on Python 2 the divisions below require `from __future__ import division`
    x = myXYZ[:, 0].ravel()
    y = myXYZ[:, 1].ravel()
    z = myXYZ[:, 2].ravel()
    df = pd.DataFrame({'X': x, 'Y': y, 'Z': z})
    bins_x = np.linspace(xstart, xend, nx + 1)
    x_cuts = pd.cut(df.X, bins_x, labels=False)
    bins_y = np.linspace(ystart, yend, ny + 1)
    y_cuts = pd.cut(df.Y, bins_y, labels=False)
    bin_xmin, bin_ymin = x_cuts.min(), y_cuts.min()
    print('Data cut in a ' + str(len(bins_x)) + ' by ' + str(len(bins_y)) + ' matrix')
    dx = (xend - xstart) / nx
    dy = (yend - ystart) / ny
    print('dx = ' + str(dx) + ' ; dy = ' + str(dy))
    grouped = df.groupby([x_cuts, y_cuts])
    print('Data grouped, \nReady to go!!')
    return grouped, bins_x, bins_y, int(bin_xmin), int(bin_ymin)
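# A minimal usage sketch for binData2D (not from the original source): synthetic points,
# illustrative grid bounds, and a mean-elevation reduction per cell.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
xyz = np.column_stack([rng.uniform(0, 10, 1000),   # x
                       rng.uniform(0, 10, 1000),   # y
                       rng.normal(5, 1, 1000)])    # z (elevation)

# Bin into a 5 x 5 grid and reduce each cell to its mean elevation.
grouped, bins_x, bins_y, xmin, ymin = binData2D(xyz, 0, 10, 0, 10, 5, 5)
cell_means = grouped['Z'].mean()   # Series indexed by (x bin, y bin)
print(cell_means.unstack())        # 2D table of per-cell mean elevation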
def get_log_odds_chris(target, feature, bins, f_range=None, M=10, display_head=False):
    """Return log( P(feature=x | target=1) / P(feature=x | target=0) ).

    tn : target name, target values are 0 or 1
    fn : x name
    f_range : restricts the range of x
    M : smoothing factor
    """
    tn = target.name
    fn = feature.name
    X = pd.concat([target, feature], axis=1)
    if f_range is not None:
        X = X[(X[fn] > f_range[0]) & (X[fn] < f_range[1])]
    if display_head:
        X["_cut"] = pd.cut(X[fn], bins=bins).astype(str)
        X["_cut"] = X._cut.map(lambda x: float(x.split(",")[0][1:]))
    else:
        X["_cut"] = pd.cut(X[fn], bins=bins)
    Y = X.groupby("_cut").apply(
        lambda x: np.log((x[tn].sum() + 1.0 * M / bins) /
                         ((1.0 - x[tn]).sum() + 1.0 * M / bins))
    )
    # display(X.groupby('_cut').apply(lambda x: (x[tn].sum(), (1-x[tn]).sum())))
    # display(Y)
    Y = Y - np.log((1.0 * X[tn].sum() + M) / ((1.0 - X[tn]).sum() + M))
    Y = pd.DataFrame(Y, columns=["%s_log_odds" % fn])
    return Y
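# A hedged usage sketch for get_log_odds_chris (not from the original source), assuming a
# binary target Series and a numeric feature Series that share an index.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
feature = pd.Series(rng.normal(size=5000), name='x')
target = pd.Series((feature + rng.normal(size=5000) > 0).astype(int), name='y')

log_odds = get_log_odds_chris(target, feature, bins=20, M=10)
print(log_odds.head())  # one smoothed log-odds value per feature bin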
def make_object_map(data,field,**kwargs): linear = False for key,value in kwargs.iteritems(): if key == 'linear': linear = value print linear if linear == False: colors,rangelist = make_distributed_range(data,field) else: colors = get_heatmap51() colors2 = colors maxvalue = data[field].max() if maxvalue < 51: totallist = range(maxvalue) colors = reduce_color_list_size(totallist,colors) colors,rangelist = make_gradient_range(data[field].min(),maxvalue,colors) else: colors = reduce_color_list_size(range(len(data)),colors) colors,rangelist = make_gradient_range(data[field].min(),maxvalue,colors) if not rangelist[0] == 0: rangelist = [0] + rangelist[1:] data['COLORKEY'] = pd.cut(data[field],bins=rangelist+[1000000000],labels=colors) return data colors2 = get_heatmap51() if not rangelist[0] == 0: rangelist = [0] + rangelist[1:] data['COLORKEY'] = pd.cut(data[field],bins=rangelist,labels=colors[1:]) return data
def get_indicators(start_date, end_date, symbols):
    """Simulate and assess the performance of a stock portfolio."""
    # Read in adjusted closing prices for given symbols, date range
    dates = pd.date_range(start_date, end_date)
    prices_all = get_data(symbols, dates)  # automatically adds SPY
    prices = prices_all[symbols]           # only portfolio symbols
    # prices_SPY = prices_all['SPY']       # only SPY, for comparison later

    sym = symbols[1]
    x1 = (prices[sym] - prices[sym].rolling(20).mean()) / (2 * prices[sym].rolling(20).std())
    x1_dis = pd.cut(x1, 10, labels=False)
    x2 = prices[sym].pct_change(20)
    x2_dis = pd.cut(x2, 10, labels=False)
    x3 = prices[sym].pct_change(1).rolling(20).std()
    x3_dis = pd.cut(x3, 10, labels=False)

    # return pd.concat([x1_,x2_0,x3_0], axis=1).dropna(), prices
    tempdf = pd.concat([x1_dis, x2_dis, x3_dis], axis=1).dropna()
    tempdf.columns = ["x1", "x2", "x3"]
    print(tempdf.dtypes)
    # 0 = no position, 1 = negative position, 2 = holding long
    tempdf["holding"] = np.random.randint(0, 3, size=len(tempdf))
    tempdf["s"] = 1000 * tempdf["holding"] + 100 * tempdf["x3"] + 10 * tempdf["x2"] + 1 * tempdf["x1"]
    print(tempdf.head(50))
    return tempdf, prices
def test_bins_not_overlapping_from_interval_index():
    # see gh-23980
    msg = "Overlapping IntervalIndex is not accepted"
    ii = IntervalIndex.from_tuples([(0, 10), (2, 12), (4, 14)])

    with pytest.raises(ValueError, match=msg):
        cut([5, 6], bins=ii)
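# For contrast (illustrative, not part of the original test file): a non-overlapping
# IntervalIndex is accepted as bins, and each value lands in the interval that contains it.
def example_cut_with_non_overlapping_interval_index():
    ii_ok = IntervalIndex.from_tuples([(0, 5), (5, 10), (10, 15)])
    result = cut([3, 7, 12], bins=ii_ok)
    # result: [(0, 5], (5, 10], (10, 15]]
    return result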
def test_cython_agg_empty_buckets_nanops(observed):
    # GH-18869 can't call nanops on empty groups, so hardcode expected
    # for these
    df = pd.DataFrame([11, 12, 13], columns=['a'])
    grps = range(0, 25, 5)

    # add / sum
    result = df.groupby(pd.cut(df['a'], grps),
                        observed=observed)._cython_agg_general('add')
    intervals = pd.interval_range(0, 20, freq=5)
    expected = pd.DataFrame(
        {"a": [0, 0, 36, 0]},
        index=pd.CategoricalIndex(intervals, name='a', ordered=True))
    if observed:
        expected = expected[expected.a != 0]

    tm.assert_frame_equal(result, expected)

    # prod
    result = df.groupby(pd.cut(df['a'], grps),
                        observed=observed)._cython_agg_general('prod')
    expected = pd.DataFrame(
        {"a": [1, 1, 1716, 1]},
        index=pd.CategoricalIndex(intervals, name='a', ordered=True))
    if observed:
        expected = expected[expected.a != 1]

    tm.assert_frame_equal(result, expected)
def get_data_frame_with_dummies(users): users_ref = users.copy() base_dummies = None categories = {'gender': ['male', 'female'], 'education': ['overGraduate', 'university', 'underHigh'], 'income': ['100', '200', '300', '400', '500', '1200more'], 'job': ['officer', 'student', 'etc'], 'marriage': ['married', 'single'], 'religion': ['buddhist', 'none', 'christian', 'romanCatholicism']} age_bins = [10, 20, 30, 40, 50, 60, 70] numChild_bins = [0, 1, 10] for label_type in users_ref.columns: temp_dummies = None if label_type == 'age': temp_dummies = pd.get_dummies(pd.cut(users_ref[label_type], age_bins, right=False), prefix=label_type) elif label_type == 'numberOfChildren': temp_dummies = pd.get_dummies(pd.cut(users_ref[label_type], numChild_bins, right=False), prefix=label_type) elif label_type == 'residence': continue else: users_ref[label_type + "_cat"] = pd.Categorical(users_ref[label_type], categories=categories.get(label_type)) temp_dummies = pd.get_dummies(users_ref[label_type + "_cat"], prefix=label_type) if base_dummies is None: base_dummies = temp_dummies else: base_dummies = pd.concat([base_dummies, temp_dummies], axis=1) label_nums = base_dummies.sum() label_rates = label_nums / float(len(users_ref)) return base_dummies, label_nums, label_rates
def test_datetime_cut(self): # GH 14714 # testing for time data to be present as series data = to_datetime(Series(['2013-01-01', '2013-01-02', '2013-01-03'])) result, bins = cut(data, 3, retbins=True) expected = ( Series(IntervalIndex([ Interval(Timestamp('2012-12-31 23:57:07.200000'), Timestamp('2013-01-01 16:00:00')), Interval(Timestamp('2013-01-01 16:00:00'), Timestamp('2013-01-02 08:00:00')), Interval(Timestamp('2013-01-02 08:00:00'), Timestamp('2013-01-03 00:00:00'))])) .astype(CDT(ordered=True))) tm.assert_series_equal(result, expected) # testing for time data to be present as list data = [np.datetime64('2013-01-01'), np.datetime64('2013-01-02'), np.datetime64('2013-01-03')] result, bins = cut(data, 3, retbins=True) tm.assert_series_equal(Series(result), expected) # testing for time data to be present as ndarray data = np.array([np.datetime64('2013-01-01'), np.datetime64('2013-01-02'), np.datetime64('2013-01-03')]) result, bins = cut(data, 3, retbins=True) tm.assert_series_equal(Series(result), expected) # testing for time data to be present as datetime index data = DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03']) result, bins = cut(data, 3, retbins=True) tm.assert_series_equal(Series(result), expected)
def bootstrapped_utility(df_testing=None,varname=None, bins = None): N = 1500 MKT = 500000 S_bin = 6000 L_bin = 3000 IT_bin = 0.20 IP_bin = 0.40 df_bs = df_testing.sample(frac=1, replace=True) df_bs_good = df_bs[df_bs['is_good']==1] df_bs_bad = df_bs[df_bs['is_good']!=1] hbs_good = cut(df_bs_good[varname], bins=bins) hbs_bad = cut(df_bs_bad[varname], bins=bins) bs_purity_by_bin = [] bs_efficiency_by_bin = [] for g in range(0,len(hbs_good.value_counts())): sum_g_b = hbs_good.value_counts()[g] + hbs_bad.value_counts()[g] if sum_g_b !=0: bs_purity_by_bin.append(1.0*hbs_good.value_counts()[g]/sum_g_b) else: bs_purity_by_bin.append(1.0*hbs_good.value_counts()[g]) bs_efficiency_by_bin.append(1.0*sum_g_b/(hbs_good.size+hbs_bad.size)) bs_purity_by_bin = array(bs_purity_by_bin) bs_default_by_bin = -1*(bs_purity_by_bin - 1.0) bs_efficiency_by_bin = array(bs_efficiency_by_bin) bs_DC_bin = N * bs_efficiency_by_bin * bs_default_by_bin * L_bin bs_RT_bin = N * bs_efficiency_by_bin * bs_purity_by_bin * S_bin * IT_bin bs_RP_bin = N * bs_efficiency_by_bin * bs_default_by_bin * (S_bin - L_bin) * IP_bin bs_f_bin = bs_RT_bin + bs_RP_bin - bs_DC_bin bs_f = cumsum(bs_f_bin[::-1])[::-1] - MKT return bs_f
def cleaneddf(no_bins=0): #you'll want to tweak this to conform with your computer's file system testpath = r'C:\Pradeep\Working Set\Consulting\Kaggle\Titanic\Data Sets Oirig\test.csv' trainpath = r'C:\Pradeep\Working Set\Consulting\Kaggle\Titanic\Data Sets Oirig\train.csv' print trainpath traindf = pd.read_csv(trainpath) testdf = pd.read_csv(testpath) #discretise fare if no_bins==0: return [cleandf(traindf), cleandf(testdf)] traindf=cleandf(traindf) testdf=cleandf(testdf) bins_and_binned_fare = pd.qcut(traindf.Fare, no_bins, retbins=True) bins=bins_and_binned_fare[1] traindf.Fare = bins_and_binned_fare[0] testdf.Fare = pd.cut(testdf.Fare, bins) #discretise age bins_and_binned_age = pd.qcut(traindf.Age, no_bins, retbins=True) bins=bins_and_binned_age[1] traindf.Age = bins_and_binned_age[0] testdf.Age = pd.cut(testdf.Age, bins) #create a submission file for kaggle predictiondf = pd.DataFrame(testdf['PassengerId']) predictiondf['Survived']=[0 for x in range(len(testdf))] predictiondf.to_csv(r'C:\Pradeep\Working Set\Consulting\Kaggle\Titanic\Data Sets Oirig\prediction.csv', index=False) return [traindf, testdf]
def cleaneddf(no_bins=0): #you'll want to tweak this to conform with your computer's file system trainpath = 'C:/Documents and Settings/DIGIT/My Documents/Google Drive/Blogs/triangleinequality/Titanic/rawtrain.csv' testpath = 'C:/Documents and Settings/DIGIT/My Documents/Google Drive/Blogs/triangleinequality/Titanic/rawtest.csv' traindf = pd.read_csv(trainpath) testdf = pd.read_csv(testpath) #discretise fare if no_bins==0: return [cleandf(traindf), cleandf(testdf)] traindf=cleandf(traindf) testdf=cleandf(testdf) bins_and_binned_fare = pd.qcut(traindf.Fare, no_bins, retbins=True) bins=bins_and_binned_fare[1] traindf.Fare = bins_and_binned_fare[0] testdf.Fare = pd.cut(testdf.Fare, bins) #discretise age bins_and_binned_age = pd.qcut(traindf.Age, no_bins, retbins=True) bins=bins_and_binned_age[1] traindf.Age = bins_and_binned_age[0] testdf.Age = pd.cut(testdf.Age, bins) #create a submission file for kaggle predictiondf = pd.DataFrame(testdf['PassengerId']) predictiondf['Survived']=[0 for x in range(len(testdf))] predictiondf.to_csv('C:/Documents and Settings/DIGIT/My Documents/Google Drive/Blogs/triangleinequality/Titanic/prediction.csv', index=False) return [traindf, testdf]
def dummies_xy(cls, order_has_ret):
    """
    The bin edges below are not arbitrary; they were chosen from visualising the binned data.
    :param order_has_ret:
    :return:
    """
    bins = [-np.inf, 0.0, 0.1, 0.2, 0.4, 0.50, 0.85, 1.0, 1.2, np.inf]
    cats = pd.cut(order_has_ret.wave_score1, bins)
    wave_score1_dummies = pd.get_dummies(cats, prefix='ws1_dummies')
    order_has_ret = pd.concat([order_has_ret, wave_score1_dummies], axis=1)

    bins = [-np.inf, 0.0, 0.2, 0.4, 0.6, 0.8, 1.0, np.inf]
    cats = pd.cut(order_has_ret.atr_std, bins)
    atr_dummies = pd.get_dummies(cats, prefix='atr_dummies')
    order_has_ret = pd.concat([order_has_ret, atr_dummies], axis=1)

    bins = [-np.inf, -20, -12, -7, -3, 0, 3, 7, 12, 20, np.inf]
    cats = pd.cut(order_has_ret.deg_hisWindowPd, bins)
    deg_his_window_dummies = pd.get_dummies(cats, prefix='dh_dummies')
    order_has_ret = pd.concat([order_has_ret, deg_his_window_dummies], axis=1)

    cats = pd.cut(order_has_ret.deg_windowPd, bins)
    deg_window_dummies = pd.get_dummies(cats, prefix='dw_dummies')
    order_has_ret = pd.concat([order_has_ret, deg_window_dummies], axis=1)

    cats = pd.cut(order_has_ret.deg_60WindowPd, bins)
    deg_60window_dummies = pd.get_dummies(cats, prefix='d60_dummies')
    order_has_ret = pd.concat([order_has_ret, deg_60window_dummies], axis=1)

    return order_has_ret
def psi(bench, target, group, print_df=True):
    """
    Return the Population Stability Index (PSI), which quantifies how stable a
    distribution is between two states. The statistic is only meaningful when
    bench and target are numeric variables.

    Params:
    - bench: numpy array with the reference variable.
    - target: numpy array with the new variable.
    - group: number of groups (quantile bins) to consider.
    """
    labels_q = np.percentile(bench,
                             [(100.0 / group) * i for i in range(group + 1)],
                             interpolation="nearest")

    # Using the unique quantile edges is the right approach when there are few unique values
    ben_pct = (pd.cut(bench, bins=np.unique(labels_q), include_lowest=True).value_counts()) / len(bench)
    target_pct = (pd.cut(target, bins=np.unique(labels_q), include_lowest=True).value_counts()) / len(target)
    target_pct = target_pct.sort_index()  # sort the index
    ben_pct = ben_pct.sort_index()        # sort the index
    psi = sum((target_pct - ben_pct) * np.log(target_pct / ben_pct))

    # Also return a per-bin breakdown for easier inspection
    if print_df:
        results = pd.DataFrame({'ben_pct': ben_pct.values,
                                'target_pct': target_pct.values},
                               index=ben_pct.index)
        return {'data': results, 'statistic': psi}
    return psi
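# A short usage example for psi (illustrative, not from the original source): two synthetic
# samples split into 10 quantile groups; a common rule of thumb treats PSI > 0.25 as a large shift.
import numpy as np

rng = np.random.default_rng(42)
bench = rng.normal(0.0, 1.0, 10000)    # reference distribution
target = rng.normal(0.2, 1.1, 10000)   # new distribution, slightly shifted

out = psi(bench, target, 10)
print(out['statistic'])  # the PSI value
print(out['data'])       # per-bin benchmark vs. target shares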
def print_wt_dist(df):
    # age_breaks is defined here; weight_breaks is assumed to come from the surrounding module
    age_breaks = [0, 19, 29, 39, 49, 59, 69, 79, 100]
    df['age_band'] = pd.cut(df['age'], age_breaks)
    df['weight_band'] = pd.cut(df['weight'], weight_breaks)
    pt = pd.pivot_table(df, index=['weight_band'], columns=['sex', 'age_band'],
                        values=['age'], aggfunc=[len])
    print(1. * pt.cumsum(0) / pt.sum())
def Categoricalize(df, feats):
    """
    Turns the continuous raw data into categorical data.
    Specific to the current setup of features for the NS2 data.

    Arguments:
    df    - pandas.DataFrame containing the data to categoricalize
    feats - List of features in the DataFrame
    """
    # I chose this, no idea how good it is
    time_cuts = np.array([0, 1, 2, 3, 4, 5, 7.5, 10, 12.5, 15, 20, 25, 30, 40, 50, 60, 90, 120])
    # time_cuts = np.array([0,1,2,5,10,15,30,60,120])
    time_cuts *= 60

    # THIS IS SPECIFIC TO THE CURRENT DATA SETUP
    time_feats = feats[:1] + feats[5:-1]
    for feat in time_feats:
        df[feat] = pd.cut(df[feat], time_cuts)

    tvt_feats = feats[1:5]
    tvt_cuts = np.array([0, 0.1, 0.2, 0.25, 0.333, 0.5, 0.667, 0.75, 1,
                         1.333, 1.5, 2, 3, 4, 5, 10, 10000])
    # tvt_cuts = np.array([0, 0.1, 0.2, 0.5, 1, 2, 5, 10, 10000])
    for feat in tvt_feats:
        df[feat] = pd.cut(df[feat], tvt_cuts)
def compress_matrix(matrix, nrows=None, ncols=None):
    '''Compress a matrix to a new number of rows/columns.

    Cells in the matrix are collapsed by averaging. Assumes matrix is sorted.

    Parameters
    ----------
    matrix : pandas.DataFrame
        matrix to be compressed. Index(s) must be numeric.
    nrows, ncols : int
        Number of rows/columns in the output matrix. If ``None`` then will be
        same as input matrix.

    Returns
    -------
    pandas.DataFrame
        If `nrows`/`ncols` is ``None`` then the corresponding index will be
        unchanged. Otherwise, index will be integer 0 to `nrows`/`ncols`.
    '''
    if ncols:
        groups, bins = pd.cut(matrix.columns.values, ncols,
                              retbins=True, labels=False)
        groups = bins[groups]
        matrix = matrix.groupby(groups, axis=1).mean()

    if nrows:
        groups = pd.cut(range(matrix.shape[0]), nrows, labels=False)
        matrix = matrix.groupby(groups).mean()

    return matrix
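# A brief usage sketch for compress_matrix (not from the original source), assuming a
# numeric-indexed DataFrame; the sizes are illustrative.
import numpy as np
import pandas as pd

mat = pd.DataFrame(np.random.rand(100, 40),
                   index=np.arange(100),
                   columns=np.linspace(0.0, 1.0, 40))

small = compress_matrix(mat, nrows=10, ncols=8)
print(small.shape)  # rows collapsed into 10 bins and columns into 8, cells averaged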
def add_digitized_columns_given_input_filter(df_orig, columns_list, cut_point_list, based_on_filter, quantile_cut_point_format= True, digitized_columns_names=[]): df = df_orig.copy() filter_name = '*'.join(['_Digital'] +based_on_filter) if not digitized_columns_names: digitized_columns_names = map(lambda x: x + filter_name, columns_list) print digitized_columns_names if not based_on_filter: if quantile_cut_point_format: for k,col in enumerate(columns_list): df[digitized_columns_names[k]] = cut_modified(df[col], cut_point_list ) else: for k,col in enumerate(columns_list): df[digitized_columns_names[k]] = pd.cut(df[col], cut_point_list, labels =range(len(cut_point_list)-1)) else: df_groups = df.groupby(based_on_filter) if quantile_cut_point_format: for k,col in enumerate(columns_list): #df[digitized_columns_names[k]] = df_groups[col].transform(lambda x: pd.qcut(x,cut_point_list, #labels =digital_vals)) df[digitized_columns_names[k]] = df_groups[col].transform(lambda x: cut_modified(x,cut_point_list)) else: for k,col in enumerate(columns_list): df[digitized_columns_names[k]] = df_groups[col].transform(lambda x: pd.cut(x,cut_point_list, labels =range(len(cut_point_list)-1))) return df, digitized_columns_names
def one2two(file_in=PATH_FILE_OUT, file_out=PATH_FILE_FINAL): data = pd.read_pickle(file_in)['close'] data = data.reshape(-1, 24) data = np.array([data[i:i + 24] for i in range(data.shape[0] - 24 + 1)]) data_s = { 'open_price': np.array([data[i][0][0] for i in range(data.shape[0] - 1)]), 'close_price': np.array([data[i][int(NUM_PIX / 24) - 1][23] for i in range(data.shape[0] - 1)]), 'max_price': np.array([data[i].max() for i in range(data.shape[0] - 1)]), 'min_price': np.array([data[i].min() for i in range(data.shape[0] - 1)]), 'mean_price': np.array([data[i].mean() for i in range(data.shape[0] - 1)]), 'median_price': np.array([np.median(data[i]) for i in range(data.shape[0] - 1)]), 'buy_or_sell': np.array( [int(data[i + 1][int(NUM_PIX / 24) - 1][23] > data[i + 1][0][0]) for i in range(data.shape[0] - 1)]), 'change': np.array( [(data[i + 1][int(NUM_PIX / 24) - 1][23] - data[i + 1][0][0]) / data[i + 1][int(NUM_PIX / 24) - 1][23] * 100 for i in range(data.shape[0] - 1)])} data_s = pd.DataFrame(data_s) bins = [-100, -5, -4, -3, -2, -1.5, -1, - 0.5, 0, 0.5, 1, 1.5, 2, 3, 4, 5, 100] labels = [-8, -7, -6, -5, -4, -3, -2, -1, 1, 2, 3, 4, 5, 6, 7, 8] data_s['change_D_16'] = pd.cut(data_s['change'], bins, labels=labels) bins = [-100, -5, -2, 0, 2, 5, 100] labels = [-3, -2, -1, 1, 2, 3] data_s['change_D'] = pd.cut(data_s['change'], bins, labels=labels) data = data.reshape(len(data), NUM_PIX) np.save(file_out[0], data[:len(data) - 1]) data_s.to_pickle(file_out[1])
def cleaneddf(no_bins=0): trainpath = 'Titanic/train.csv' testpath = 'Titanic/test.csv' traindf = pd.read_csv(trainpath) testdf = pd.read_csv(testpath) #discretise fare if no_bins == 0: return [cleandf(traindf), cleandf(testdf)] traindf = cleandf(traindf) testdf = cleandf(testdf) bins_and_binned_fare = pd.qcut(traindf.Fare, no_bins, retbins = True) bins = bins_and_binned_fare[1] traindf.Fare = bins_and_binned_fare[0] testdf.Fare = pd.cut(testdf.Fare, bins) #discrete age bins_and_binned_age = pd.qcut(traindf.Age, no_bins, retbins = True) bins = bins_and_binned_age[1] traindf.Age = bins_and_binned_age[0] testdf.Age = pd.cut(testdf.Age, bins) #create a file for kaggle predictiondf = pd.DataFrame(testdf['PassengerId']) predictiondf['Survived']=[0 for x in range(len(testdf))] predictiondf.to_csv('Titanic/prediction.csv', index = False) return [traindf, testdf]
def make_continuous_bar_source(df, x_field, y_field='None', df_orig=None, agg='count'): """Makes discrete, then creates representation of the bars to be plotted. Args: df (DataFrame): contains the data to be converted to a discrete form x_field (str): the column in df that maps to the x dim of the plot y_field (str, optional): the column in df that maps to the y dim of the plot df_orig (DataFrame, optional): original dataframe that the subset ``df`` was generated from agg (str, optional): the type of aggregation to be used Returns: ColumnDataSource: aggregated, discrete form of x,y values """ # Generate dataframe required to use the categorical bar source function idx, edges = pd.cut(x=df[x_field], bins=8, retbins=True, labels=False) labels, edges = pd.cut(x=df[x_field], bins=8, retbins=True) centers = pd.rolling_mean(edges, 2)[1:] # store new value of x as the bin it fell into df['centers'] = centers[idx] df['labels'] = labels # After making it discrete, create the categorical bar source return make_categorical_bar_source(df, 'labels', y_field, df_orig, agg)
def calc_p_values(data, gt1_name, gt2_name, stat_colname=None, num_bins=50, bin_how='mean', ): if stat_colname is None: raise ValueError("you must explicitly set stat_colname (try 'maxWingAngle')") data.index = data.index.astype(np.int64) #LAZY DANNO. DROP TIMESTAMPS FOR BINNING. data['synced_ns'] = data.index df_ctrl = data[data.group == gt1_name][['FlyID', stat_colname, 'synced_ns']] df_exp = data[data.group == gt2_name][['FlyID', stat_colname, 'synced_ns']] align_start = df_ctrl.index.min() dalign = df_ctrl.index.max() - align_start p_values = DataFrame() if bin_how=='mean': bin_func = np.mean elif bin_how=='median': bin_func = np.median bins = np.linspace(0,dalign,num_bins+1) + align_start binned_ctrl = pd.cut(df_ctrl.index, bins, labels= bins[:-1]) binned_exp = pd.cut(df_exp.index, bins, labels= bins[:-1]) for x in binned_ctrl.levels: test1_full_dataset = df_ctrl[binned_ctrl == x] test2_full_dataset = df_exp[binned_exp == x] bin_start_time = test1_full_dataset['synced_ns'].min() bin_stop_time = test1_full_dataset['synced_ns'].max() test1 = [] for obj_id, fly_group in test1_full_dataset.groupby('FlyID'): test1.append( bin_func(fly_group[stat_colname].values) ) test1 = np.array(test1) test2 = [] for obj_id, fly_group in test2_full_dataset.groupby('FlyID'): test2.append( bin_func(fly_group[stat_colname].values) ) test2 = np.array(test2) try: hval, pval = kruskal(test1, test2) except ValueError as err: pval = 1.0 dftemp = DataFrame({'Bin_number': x, 'P': pval, 'bin_start_time':bin_start_time, 'bin_stop_time':bin_stop_time, 'name1':gt1_name, 'name2':gt2_name, 'test1_n':len(test1), 'test2_n':len(test2), }, index=[x]) p_values = pd.concat([p_values, dftemp]) return p_values
def make_color_grouped_scatter_plot(data_frame, x_name, y_name, color_by, filename, colormap, x_function = 'dummy', y_function = 'dummy', color_function = 'dummy', legend = False, colorbar = True): ### Originally created for issue_21 def dummy(a): return a data_frame = data_frame.copy() p = Ppl(colormap, alpha=1) fig, ax = plt.subplots(1) #ax.set_autoscale_on(False) ax.set_xlim([eval(x_function)(min(data_frame[x_name])), eval(x_function)(max(data_frame[x_name]))]) ax.set_ylim([eval(y_function)(min(data_frame[y_name])), eval(y_function)(max(data_frame[y_name]))]) x_label = x_name.capitalize().replace('_', ' ') if x_function == 'log': x_label += ' (log)' y_label = y_name.capitalize().replace('_', ' ') if y_function == 'log': y_label += ' (log)' ax.set_xlabel(x_label) ax.set_ylabel(y_label) ax.xaxis.get_major_formatter().set_powerlimits((0, 1)) ax.yaxis.get_major_formatter().set_powerlimits((0, 1)) # Show the whole color range n_intervals = len(colormap.colors) if color_function == 'log': bins = np.logspace(np.log10( data_frame[color_by].min()), np.log10(data_frame[color_by].max()), n_intervals + 1, base = 10) else: bins = np.linspace(eval(color_function)(data_frame[color_by].min()), eval(color_function)(data_frame[color_by].max()), n_intervals + 1) data_frame['groups'] = pandas.cut(data_frame[color_by], bins=bins, labels = False) groups = pandas.cut(data_frame[color_by], bins=bins) bounds = [] for g in range(n_intervals): x = eval(x_function)(data_frame[data_frame.groups == g][x_name]) y = eval(y_function)(data_frame[data_frame.groups == g][y_name]) p.scatter(ax, x, y, label=str(groups.levels[g]), s = 5, linewidth=0) if legend: p.legend(ax, loc=0) #ax.set_title('prettyplotlib `scatter` example\nshowing default color cycle and scatter params') bounds = bins if colorbar: cmap = p.get_colormap().mpl_colormap norm = mpl.colors.BoundaryNorm(bounds, cmap.N) #ax2.set_ylabel(color_by.capitalize().replace('_', ' '), rotation='horizontal') #ax2.xaxis.get_major_formatter().set_powerlimits((0, 1)) #ax2.yaxis.get_major_formatter().set_powerlimits((0, 1)) ax2 = fig.add_axes([0.9, 0.1 , 0.03, 0.8]) cbar = mpl.colorbar.ColorbarBase(ax2, cmap=cmap, spacing='proportional', ticks=bounds, norm=norm, alpha=1, orientation='vertical') #cbar.ax.set_xticklabels(map(lambda x: '%.3g'%x, bounds))# vertically oriented colorbar cbar.ax.set_yticklabels([])# vertically oriented colorbar #for j, lab in enumerate(map(lambda lower, upper: '%.3g~%.3g'%(lower, upper), bounds[:-1], bounds[1::])): cbar.ax.text(0,1.02, '%.3g'%max(map(eval(color_function), bounds))) #for j, lab in enumerate(map(lambda upper: '< %.3g'%upper, bounds[1::])): # cbar.ax.text(.5, (2 * j + 1) / 8.0, lab, ha='center', va='center', rotation='vertical') #cbar.ax.set_xticklabels([str(int(t)) for t in bounds])# vertically oriented colorbar if color_function == 'log': label = color_by.capitalize().replace('_', ' ') + ' (log)' else: label = color_by.capitalize().replace('_', ' ') cbar.ax.set_ylabel(label, rotation='vertical') fig.savefig(filename) return ax, fig
def twoway(df=df):
    df['num_years_bucket'] = pd.cut(df['num_years'], bins=range(0, 31, 5),
                                    right=False, retbins=False)
    df['num_moves_bucket'] = pd.cut(df['num_moves'], bins=range(0, 31, 5),
                                    right=False, retbins=False)
    output = pd.DataFrame(df.groupby(['num_moves_bucket', 'num_years_bucket']).size()).reset_index()
    output = output.pivot(index='num_moves_bucket', columns='num_years_bucket', values=0)
    return output
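# A quick usage sketch for twoway (not from the original source), assuming a DataFrame with
# integer num_years and num_moves columns in the 0-29 range.
import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
toy = pd.DataFrame({'num_years': rng.integers(0, 30, 500),
                    'num_moves': rng.integers(0, 30, 500)})
print(twoway(toy))  # counts of rows in each (moves bucket, years bucket) cell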
def binned_color_code_return_probability(train, test, deviations=1.0) -> ( pd.DataFrame, pd.DataFrame): """Bin the colorCode column in training and test using return probabilities of training data. Notes ----- This is to deal with unknown colorCodes (CC's) in the target set by binning the CC range. The binning considers outlier CC's by keeping them separate, 1-sized bins. Outliers are CC's whose return probability is over one standard deviation away from the mean. For our training data (mean: 0.548, std: 0.114) colorCode c is an is an outlier if retProb(c) < 0.434 || 0.662 < retProb(c), given deviations = 1. Parameters ---------- train : pd.DataFrame test : pd.DataFrame See apply_return_probs deviations : float Number of standard deviations a return probability has to differ from the mean to be considered an outlier. """ color_code_min = 0 color_code_max = 9999 # Calculate return probability for each colorCode color_code_ret_probs = (train .groupby('colorCode')['returnQuantity'] .apply(group_return_probability)) # Reindex those values to resemble to distribution in the training set row_ret_probs = color_code_ret_probs.reindex(train['colorCode']) # Calculate mean and minimum mean distance mean = row_ret_probs.mean() diff = row_ret_probs.std() * deviations mean_distances = color_code_ret_probs.sub(mean).abs() # iterate over colorCodes and respective mean distances to collect bins bins = [color_code_min] for cc, mean_distance in mean_distances.items(): if mean_distance > diff: # add the colorCode as 1-sized bin (current cc and cc + 1) # if the last colorCode was added, don't add the current cc, only cc + 1. if bins[-1] != cc: bins.append(cc) bins.append(cc + 1) bins.append(color_code_max + 1) # Assign bins to each row in test and training data train['binnedColorCode'] = pd.cut(train['colorCode'], bins, right=False, labels=False) test['binnedColorCode'] = pd.cut(test['colorCode'], bins, right=False, labels=False) train, test = apply_return_probs(train, test, 'binnedColorCode', 'colorReturnProb') # Test set colorCodes that are bigger than any colorCode in the training set fall into a # category that has no returnProbability. Impute that bin with the mean retProb. test['colorReturnProb'] = test['colorReturnProb'].fillna(mean) return train, test
def reduce_data_to_field(self, data, xkey='x', ykey='y', reducer=np.median, num_bins=1, **kwargs): """Take data and bin by two of its coordinates (default focal plane coordinates x and y). Then, in each bin, apply the reducer function. Parameters ---------- data : dataframe Pandas dataframe that has xkey and ykey columns. xkey, ykey : strings, default 'x' and 'y' Keys by which the data are binned. These keys must be in the data, or else problems will arise! reducer : function Function that takes set of data and returns a number. After the data is binned, one applies this function to the data in a bin. So if one sets reducer=np.mean, then the resultant data is a two dimensional histogram. num_bins : int, default 1 Number of bins for the focal plane. If less than six, then the number of bins is a sort of proxy for the number of divisions of a chip. Default is 2 bins per chip. This implies that num_bins < 6 does not really make sense for xkey,ykey neq x,y Returns ------- field : dataframe Dataframe binned by x and y coordinates. bins_x, bins_y : arrays Arrays of the bins used. Notes ----- data needs to have 'x' and 'y' keys! """ x = data[xkey] y = data[ykey] if num_bins < 6: bins_x, bins_y = self.edges(num_bins) else: bins_x = np.linspace(np.min(x), np.max(x), num_bins) bins_y = np.linspace(np.min(y), np.max(y), num_bins) groups = data.groupby([pd.cut(x, bins_x), pd.cut(y, bins_y)]) field = groups.aggregate(reducer) # also get the count counts = groups[xkey].aggregate('count').values # filter out nanmins on x and y field = field[field[xkey].notnull() & field[xkey].notnull()] # counts already filtered out notnull so let's try! field['N'] = counts return field, bins_x, bins_y
def run_test2(): orig_animals = ['cat', 'dog', 'mouse'] animals = orig_animals * 3 raw_data = { 'animal' : animals, 'score' : get_rand_num_array(len(animals)) } # make DataFrame # df = pd.DataFrame(raw_data, columns = ['animal', 'score']) print '-' * 10 print df print '-' * 10 #return # Create array for bins # bins = get_bin_list(step=20, low_num=0, high_num=100) # For each score assign it to a bin # labels = pd.cut(df['score'], bins) # Same as above but adding the bin value as a column to the DataFrame # df['bin_label'] = pd.cut(df['score'], bins) print type(df) print df.describe print '-' * 10 from collections import Counter c = Counter(df['bin_label']) print '-' * 10 print c vcounts = pd.value_counts(df['bin_label']) print vcounts #print 'by_bin', by_bin print '-' * 10 vcounts = df['bin_label'].value_counts() d = vcounts.to_dict() keys = d.keys() keys.sort() for k in keys: print k, d[k], type(k) return # Show the count in each bin # vc_series = pd.value_counts(df['bin_label']) print '\n', 'vc_series', vc_series print '-' * 10 print vc_series.axes import ipdb; ipdb.set_trace()
payer_enc['START']).dt.days >= 21] #----- Calculate coverage payer_agg = payer_rqd[['PATIENT', 'PAYER_COVERAGE', 'TOTAL_CLAIM_COST']] payer_agg = payer_agg.groupby(['PATIENT'], as_index=False).agg({ "PAYER_COVERAGE": "sum", "TOTAL_CLAIM_COST": "sum" }) payer_agg['COVERAGE_PCT'] = (payer_agg['PAYER_COVERAGE']) / ( payer_agg['TOTAL_CLAIM_COST']) bins_p = [-10, 0.00001, 0.5, 0.80, 11] labels_p = ['NO COVERAGE', 'LOW', 'MED', 'HIGH'] payer_agg["PAYERBUCKET"] = pd.cut(payer_agg['COVERAGE_PCT'], bins=bins_p, labels=labels_p) payer_agg = pd.get_dummies(data=payer_agg, columns=['PAYERBUCKET']) payer_agg = payer_agg.drop(['PAYER_COVERAGE', 'TOTAL_CLAIM_COST'], axis=1) payer_agg = payer_agg.drop_duplicates() #--- Caculate latest payer payer_name = pd.merge(payer_t, payer_df1, left_on='PAYER', right_on='Id') payer_year = payer_name.groupby(['PATIENT'], as_index=False).agg({"END_YEAR": "max"}) payer_name2 = pd.merge(payer_name, payer_year, left_on=['PATIENT', 'END_YEAR'], right_on=['PATIENT', 'END_YEAR']) payer_name2 = payer_name2.drop(
def cal_ic(index, begt, endt, cycle, kind, **kargs): chosen_ = kargs.get('chosen', []) sector = kargs.get('sector', '') mode = kargs.get('mode', 'all') #date_list = [parse(str(x)).strftime('%Y-%m-%d') for x in get_ts_td(begt,endt,cycle)] date_list = [ parse(str(x)).strftime('%Y-%m-%d') for x in get_tc_td(begt, endt) ] #import pdb;pdb.set_trace() ic_dict = {} if len(sector): sector_stks_dict = load_zzhy_all(DATA_PATH) small_comb_num = 5 big_comb_num = 6 for i in range(small_comb_num, big_comb_num): if not len(chosen_): chosen = [';'.join(x) for x in list(combinations(kind, i))] else: chosen = chosen_ for factors in list(combinations(kind, i)): bbb = ';'.join(factors) if bbb.split(';') not in [x.split(';') for x in chosen]: continue for f in factors: if i > 1: ic_dict[';'.join(factors)] = [''] * (len(date_list) - 1) else: ic_dict[f] = [''] * (len(date_list) - 1) for x in range(0, len(date_list) - 1): INTdate = int(date_list[x].replace('-', '')) print(INTdate) ''' if not len(sector): factor = get_fin(INTdate) else: factor = get_fin(INTdate,sector=sector) ''' factor = get_fin(INTdate, sector=sector, sector_stks_dict=sector_stks_dict) factor = factor.set_index('代码') #import pdb;pdb.set_trace() #factor = score_factors(factor,VALUE+PROFIT+TRADE+GROWTH_,GROWTH) factor = score_factors(factor, VALUE + PROFIT + GROWTH_, GROWTH) #import pdb;pdb.set_trace() for i in range(small_comb_num, big_comb_num): if not len(chosen_): chosen = [';'.join(x) for x in list(combinations(kind, i))] else: chosen = chosen_ for factors in list(combinations(kind, i)): bbb = ';'.join(factors) if bbb.split(';') not in [x.split(';') for x in chosen]: continue score = [] for f in factors: f_score = factor[[f]] if not len(score): score = f_score else: #import pdb;pdb.set_trace() score = score.join(f_score) score = score.mean(axis=1) score = score.dropna() score.name = 'f' ret = factor[['下一个月涨幅']] ret = ret.join(score) ret = ret.dropna() #import pdb;pdb.set_trace() fut_ret_rank = pd.cut(ret['下一个月涨幅'], 10, labels=False) + 1 factor_rank = pd.cut(ret['f'], 10, labels=False) + 1 ic = stats.spearmanr(fut_ret_rank, factor_rank)[0] #import pdb;pdb.set_trace() if i > 1: ic_dict[';'.join(factors)][x] = ic else: #import pdb;pdb.set_trace() ic_dict[f][x] = ic #import pdb;pdb.set_trace() ic_df = pd.DataFrame(ic_dict) t_stat, p_value = stats.ttest_1samp(ic_df, 0) #import pdb;pdb.set_trace() sorted_columns = [ '1/PFCF', 'EBITDA/EV', 'S/EV', '1/PE', '1/PB', '1/PS', '1/POP', 'CROIC', 'ROIC', 'ROIC1', 'EP', 'EBITDA-资本支出/IC', 'ROE', 'ROE(扣除)', 'ROA', 'FCF/营业收入', '毛利/净资产', '毛利率', '净利率', '销售现金比', '现金营业收入比', 'delta毛利率', 'deltaROE', '负债/投入资本', '外部融资额/总资产', '资产负债率', '有形净值债务率', '经营现金净流入/资本支出', '折旧/投入资本', '长期负债变化率', '近两年平均资本支出增长率', 'delta存货周转率', 'delta应收账款周转率', 'TTMFCF增长率', 'TTM净利润增长率', 'TTM营业利润增长率', 'TTM营业收入增长率', '最新一季净利润增长率', '最新一季营业利润增长率', '最新一季营业收入增长率', '近一个月跌幅', 'SKEW', '5/60', '换手率', 'ILLIQ', '波动率', '目标收益率', '预期增长率', '预期PE', '预期PEG', 'Delta预期增长率', 'Delta预期增长率2', 'Delta预期当年净利润增长率2', '总市值' ] #import pdb;pdb.set_trace() IC = pd.DataFrame(ic_df.mean(), columns=['IC']) IC['STD'] = ic_df.std() IC['IR'] = IC['IC'] / ic_df.std() #import pdb;pdb.set_trace() IC['T'] = t_stat IC['P'] = p_value if not len(mode): alpha_factor = IC.sort_values(by='T', ascending=False) selected = alpha_factor selected.to_csv('IC_{0}_{1}_{2}_{3}_{4}_{5}_{6}.csv'.format( begt, endt, index, 'AllbyT', str(small_comb_num), str(big_comb_num), sector)) else: IC = IC.reindex_axis(kind) selected = IC selected.to_csv('IC_{0}_{1}_{2}_{3}_{4}_{5}_{6}.csv'.format( begt, endt, index, 'all', 
str(small_comb_num), str(big_comb_num), sector)) return selected
Fraudcheck.head()
Fraudcheck.columns

Le = preprocessing.LabelEncoder()
Fraudcheck['undergrad'] = Le.fit_transform(Fraudcheck['Undergrad'])
Fraudcheck['marital_Status'] = Le.fit_transform(Fraudcheck['Marital_Status'])
Fraudcheck['urban'] = Le.fit_transform(Fraudcheck['Urban'])

# Drop the original (un-encoded) columns
Fraudcheck.drop(["Undergrad"], inplace=True, axis=1)
Fraudcheck.drop(["Marital_Status"], inplace=True, axis=1)
Fraudcheck.drop(["Urban"], inplace=True, axis=1)

# Bin the continuous Taxable_Income into two categories: <= 30000 is "Risky", above is "Good"
bins = [-1, 30000, 100000]
Fraudcheck["Taxable_Income"] = pd.cut(Fraudcheck["Taxable_Income"], bins,
                                      labels=["Risky", "Good"])

colnames = list(Fraudcheck.columns)
predictors = colnames[1:6]  # inputs
target = colnames[0]        # output
X = Fraudcheck[predictors]
Y = Fraudcheck[target]

####### GridSearch
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_jobs=2, oob_score=True, n_estimators=1000,
                            criterion="entropy")
print('-' * 60)

# 4. RFM analysis
# 4.1 Data transformation
recency_value = sales_data['ORDERDATE'].groupby(
    sales_data.index).max()    # date of each customer's most recent order
frequency_value = sales_data['ORDERDATE'].groupby(
    sales_data.index).count()  # raw order frequency
monetary_value = sales_data['AMOUNTINFO'].groupby(
    sales_data.index).sum()    # raw total order amount

# 4.2 Compute the RFM scores
# compute the R, F and M scores separately
deadline_date = pd.Timestamp(2017, 1, 1)  # reference date used to measure how far back each order lies
r_interval = (deadline_date - recency_value).dt.days          # R interval (days since last order)
r_score = pd.cut(r_interval, 5, labels=[5, 4, 3, 2, 1])       # R score
f_score = pd.cut(frequency_value, 5, labels=[1, 2, 3, 4, 5])  # F score
m_score = pd.cut(monetary_value, 5, labels=[1, 2, 3, 4, 5])   # M score

# merge the R, F and M scores
rfm_list = [r_score, f_score, m_score]        # the three dimensions as a list
rfm_cols = ['r_score', 'f_score', 'm_score']  # column names for the three dimensions
rfm_pd = pd.DataFrame(np.array(rfm_list).transpose(), dtype=np.int32,
                      columns=rfm_cols, index=frequency_value.index)  # build the r/f/m score frame
print('RFM Score Overview:')
print(rfm_pd.head(4))
print('-' * 60)

# compute the total RFM score as a weighted sum
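# The snippet breaks off just before the weighted total is computed; a minimal sketch of one
# possible weighting (the weights below are assumptions, not taken from the source):
rfm_weights = {'r_score': 0.6, 'f_score': 0.3, 'm_score': 0.1}  # assumed weights
rfm_pd['rfm_total_score'] = (rfm_pd['r_score'] * rfm_weights['r_score']
                             + rfm_pd['f_score'] * rfm_weights['f_score']
                             + rfm_pd['m_score'] * rfm_weights['m_score'])
print(rfm_pd.sort_values('rfm_total_score', ascending=False).head())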
def calc_age_bins(self):
    self.Xy['age_bin'] = pd.cut(self.Xy.age,
                                bins=[0, 10, 20, 30, 40, 50, 60, np.inf])
1: 'Married', 2: 'Widowed', 3: 'Divorced', 4: 'Separated', 5: 'Never married', 6: 'Living with partner', 77: 'Refused', 99: 'dont know' }) da['DMDMARTL2'].value_counts() da["RIAGENDRx"] = da.RIAGENDR.replace({1: "Male", 2: "Female"}) a = da.groupby(da["RIAGENDRx"])['DMDMARTL2'].value_counts() da["agegrp"] = pd.cut(da.RIDAGEYR, [30, 40]) b = da.groupby([da["RIAGENDRx"], da['agegrp']])['DMDMARTL2'].value_counts() b.loc['Male', :].unstack() # __Q1a.__ Briefly comment on some of the differences that you observe between the distribution of marital status between women and men, for people of all ages. # __Q1b.__ Briefly comment on the differences that you observe between the distribution of marital status states for women between the overall population, and for women between the ages of 30 and 40. # __Q1c.__ Repeat part b for the men. # ## Question 2 # # Restricting to the female population, stratify the subjects into age bands no wider than ten years, and construct the distribution of marital status within each age band. Within each age band, present the distribution in terms of proportions that must sum to 1.
def Ks2(data, flag): # Bin que vai permitir agrupar os valores por score bin = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100] #Contar o total de ocorrências de bons e mals pagadores dfTotal = data.ALVO.value_counts().to_frame() dfTotal = dfTotal.rename(index={0: "Bom", 1: "Mal"}) totais = dfTotal.T #totais.at['ALVO','Bom'] e totais.at['ALVO','Mal'] #Contando por percentual de decil df = data.groupby(pd.cut(data.SCORE, bins=bin))['ALVO'].value_counts() # Desempilhando o Dataframe df = df.unstack() # Calculando o percentual do decil comparado ao valor total df["PercentualBom"] = (df[0] / totais.at['ALVO', 'Bom']) df["PercentualMal"] = (df[1] / totais.at['ALVO', 'Mal']) # Calculando o Cumulativo df["PercentualBomAcc"] = df['PercentualBom'].rolling(min_periods=1, window=10).sum() df["PercentualMalAcc"] = df['PercentualMal'].rolling(min_periods=1, window=10).sum() # Calculando o ks2 por decil df['KS2'] = abs(df["PercentualBomAcc"] - df["PercentualMalAcc"]) # Label label = [ "0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100" ] flag = flag.strip().lower() if flag == "no": # Texto da Tabela descrevendo gráfico desc = """O KS2 é uma métrica utilizada para sabermos quanto o modelo discrimina os bons dos maus clientes. Seu valor é a maior diferença das distribuições acumuladas dos dois públicos analisados. Quanto maior o KS2, melhor será a discriminação dos dois públicos pelo modelo em análise.""" #fig = go.Figure() # Fazendo os subplots para colocar a descrição e o gráfico na mesma imagem fig = make_subplots(rows=1, cols=3, specs=[[{ "type": "table" }, { "colspan": 2 }, None]]) # Add Table fig.add_trace(go.Table(header=dict(values=["Descrição"], font=dict(size=10), align="left"), cells=dict(values=[desc], align="left")), row=1, col=1) # Add linha de Bom fig.add_trace( go.Scatter(x=label, y=df.PercentualBomAcc, hovertemplate="%{x},%{y}", name="Bom")) # Add linha de Mal fig.add_trace(go.Scatter(x=label, y=df.PercentualMalAcc, hovertemplate="%{x},%{y}", name="Mal"), row=1, col=2) # Add linha de KS2 fig.add_trace(go.Scatter(x=label, y=df.KS2, name="KS2"), row=1, col=2) fig.update_xaxes(title_text="Faixa de Score", row=1, col=2) fig.update_yaxes(title_text="% População (AC)", row=1, col=2) fig.update_yaxes(tickformat=".3%", row=1, col=2) #Add Formatação do Gráfico fig.update_layout(title={ 'text': "KS2", 'y': 0.9, 'x': 0.5, 'xanchor': 'center', 'yanchor': 'top' }, ) fig.show() return fig elif flag == "yes": pre_fig = make_subplots(rows=1, cols=2, specs=[[{"colspan": 2}, None]]) # Add linha de Bom pre_fig.add_trace( go.Scatter(x=label, y=df.PercentualBomAcc, hovertemplate="%{x},%{y}", name="Bom")) # Add linha de Mal pre_fig.add_trace(go.Scatter(x=label, y=df.PercentualMalAcc, hovertemplate="%{x},%{y}", name="Mal"), row=1, col=1) # Add linha de KS2 pre_fig.add_trace(go.Scatter(x=label, y=df.KS2, name="KS2"), row=1, col=1) pre_fig.update_xaxes(title_text="Faixa de Score", row=1, col=1) pre_fig.update_yaxes(title_text="% População (AC)", row=1, col=1) pre_fig.update_yaxes(tickformat=".3%", row=1, col=1) #Add Formatação do Gráfico pre_fig.update_layout(title={ 'text': "KS2", 'y': 0.9, 'x': 0.5, 'xanchor': 'center', 'yanchor': 'top' }, ) pre_fig.show() desc = str(input("digite a descrição desejada: ")) # Fazendo os subplots para colocar a descrição e o gráfico na mesma imagem fig = make_subplots(rows=1, cols=3, specs=[[{ "type": "table" }, { "colspan": 2 }, None]]) # Add Table fig.add_trace(go.Table(header=dict(values=["Descrição"], font=dict(size=10), align="left"), 
cells=dict(values=[desc], align="left")), row=1, col=1) # Add linha de Bom fig.add_trace( go.Scatter(x=label, y=df.PercentualBomAcc, hovertemplate="%{x},%{y}", name="Bom")) # Add linha de Mal fig.add_trace(go.Scatter(x=label, y=df.PercentualMalAcc, hovertemplate="%{x},%{y}", name="Mal"), row=1, col=2) # Add linha de KS2 fig.add_trace(go.Scatter(x=label, y=df.KS2, name="KS2"), row=1, col=2) fig.update_xaxes(title_text="Faixa de Score", row=1, col=2) fig.update_yaxes(title_text="% População (AC)", row=1, col=2) fig.update_yaxes(tickformat=".3%", row=1, col=2) #Add Formatação do Gráfico fig.update_layout(title={ 'text': "KS2", 'y': 0.9, 'x': 0.5, 'xanchor': 'center', 'yanchor': 'top' }, ) fig.show() return fig else: raise " Flag invalida, por favor digite 'YES' ou 'NO' "
print(TopFin_df) #Bottom Five Schools BotFin_df = Place_df.tail(5) print(BotFin_df) #Math Scores by Grade MBG_df = Stud_df.groupby(["school_name", "grade"])[["math_score"]].mean() print(round(MBG_df, 2)) RBG_df = Stud_df.groupby(["school_name", "grade"])[["reading_score"]].mean() print(round(RBG_df, 2)) #Cutting the dataframe into intervals: 575-599, 600 to 629, 630 to 649, 650+ Fin_df["Spending Ranges"] = pd.cut( Fin_df["Per Student Budget"], [575, 599, 629, 649, 674], labels=["Below $600", "$600-$629", "$630-$649", "At Least $650"]) #scores by school spending BySpend_df = Fin_df.groupby(["Spending Ranges"])[[ "Avg Math Score", "Avg Reading Score", "% Passing Math", "% Passing Reading", "Overall Passing Rate" ]] print(round(BySpend_df.mean(), 2)) #Cutting the data into 3 groups by School Size Fin_df["Size Ranges"] = pd.cut(Fin_df["Total Students"], [0, 1499, 2999, 5000], labels=["Small", "Medium", "Large"]) BySize_df = Fin_df.groupby(["Size Ranges"])[[ "Avg Math Score", "Avg Reading Score", "% Passing Math", "% Passing Reading", "Overall Passing Rate"
def create_ml_100k_dataset(embed_dim=16): """加载数据""" base_path = '/home/mesie/python/recommend/recommend-learning/data/' rating_df = pd.read_csv( base_path + 'ml-100k/u.data', sep='\t', names=['user_id', 'movie_id', 'rating', 'timestamp']) user_df = pd.read_csv( base_path + 'ml-100k/u.user', sep='|', names=['user_id', 'age', 'gender', 'occupation', 'zip']) item_df = pd.read_csv(base_path + 'ml-100k/u.item', sep='|', encoding="ISO-8859-1", names=[ 'movie_id', 'title', 'release_date', 'video_release_date', 'url', 'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western' ]) data_df = pd.merge(rating_df, user_df, how="left") data_df = pd.merge(data_df, item_df, how="left") data_df['label'] = data_df['rating'].apply(get_label) # 处理age data_df['age'] = pd.cut( data_df['age'], bins=[0, 15, 25, 35, 45, 60, 100], labels=['0-15', '15-25', '25-35', '35-45', '45-60', '60-100']) # 处理电影类型 user_features = ['user_id', 'age', 'gender'] movie_features = [ 'movie_id', 'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western' ] # features = user_features + movie_features feature_max_idx = {} for feature in user_features + movie_features: lbe = LabelEncoder() data_df[feature] = lbe.fit_transform(data_df[feature]) feature_max_idx[feature] = data_df[feature].max() + 1 user_feat_cols = [] user_feat_cols.append( sparseFeature(feat='user_id', feat_num=feature_max_idx['user_id'], embed_dim=embed_dim)) user_feat_cols = user_feat_cols + [ sparseFeature(feat=uf, feat_num=feature_max_idx[uf]) for uf in ['age', 'gender'] ] item_feat_cols = [] item_feat_cols.append( sparseFeature(feat='movie_id', feat_num=feature_max_idx['movie_id'], embed_dim=embed_dim)) movie_type = [ 'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western' ] item_feat_cols = item_feat_cols + [ sparseFeature(feat=mt, feat_num=feature_max_idx[mt]) for mt in movie_type ] train, test = train_test_split(data_df, test_size=0.2) train_X = [{feat: train[feat].values for feat in user_features}, {feat: train[feat].values for feat in movie_features}] # train_X = [train[user_features].values.astype('int32'), train[movie_features].values.astype('int32'), # train['label'].values.astype('int32')] train_y = train['label'].values.astype('int32') test_X = [{feat: test[feat].values for feat in user_features}, {feat: test[feat].values for feat in movie_features}] # test_X = [test[user_features].values.astype('int32'), test[movie_features].values.astype('int32'), # test['label'].values.astype('int32')] test_y = test['label'].values.astype('int32') print(train_X) return user_feat_cols, item_feat_cols, train_X, train_y, test_X, test_y
def calc_age_bins(self):
    """Calculates age bins."""
    self.Xy["age_bin"] = pd.cut(self.Xy.age, bins=self.age_bins)
#from sklearn.svm import SVC #from sklearn.svm import LinearSVC from sklearn.decomposition import PCA from sklearn.naive_bayes import GaussianNB from sklearn.model_selection import train_test_split from sklearn.metrics import classification_report, confusion_matrix print('Loading dataframe...') t0 = time() with open('jokes.df.pickle', 'rb') as pickle_file: df = pickle.load(pickle_file) print("done in %0.3fs" % (time() - t0)) print('Flattening {} d2v lists ...'.format(df.d2v.values.shape[0])) X = np.reshape([r for r in df.d2v.values], (df.d2v.values.shape[0], 300)) y = pd.cut(df.score, [-1, 15, 50000], labels=['MEH', 'GOOD']) print(df.score.describe()) print("done in %0.3fs" % (time() - t0)) print('Splitting into a training and testing set ...') X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42) print("done in %0.3fs" % (time() - t0)) print("Projecting the input data on the d2v orthonormal basis (PCA'ing) ...") pca = PCA(n_components=32, svd_solver='randomized', whiten=True).fit(X_train) X_train_pca = pca.transform(X_train) X_test_pca = pca.transform(X_test) print("done in %0.3fs" % (time() - t0))
data['title'] = data['title'].fillna(0) data['title'] = data['title'].astype(int) data['title'] pd.value_counts(data['title']) # age - standardscaler data['age'].value_counts() data['age'].describe() data['age'].fillna(method='pad', inplace=True) np.bincount(data['age'].isnull()) # age_range - encoding bins = [0, 5, 10, 15, 20, 30, 40, 50, 60, 80, 100] labels = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'] data['age_range'] = pd.cut(data['age'], bins, labels=labels) data['age_range'] np.bincount(data['age_range'].isnull()) data.age_range.unique() sns.countplot(data['age_range'], data=data, hue=data['survived']) plt.close('all') # del data['age'] data['age_range'] # cabin-> cabin list -> encoding data['cabin'].unique() np.bincount(data['cabin'].isnull()) data['cabin'] data.loc[data.cabin.str[0] == 'A', 'cabinlist'] = 1 data.loc[data.cabin.str[0] == 'B', 'cabinlist'] = 2
parser = argparse.ArgumentParser() parser.add_argument("input", help="model TSV") parser.add_argument("faceted_output", help="faceted model fold change scatterplot (PDF, PNG, etc.)") parser.add_argument("combined_output", help="single-panel model fold change scatterplot (PDF, PNG, etc.)") parser.add_argument("year_range") parser.add_argument("viruses") parser.add_argument("predictors") parser.add_argument("sample") args = parser.parse_args() frequency_bins = np.linspace(0.0, 1.0, 5) df = pd.read_table(args.input) df["observed_ratio"] = df["observed_freq"] / df["initial_freq"] df["predicted_ratio"] = df["predicted_freq"] / df["initial_freq"] df["binned_initial_freq"] = pd.cut(df["initial_freq"], bins=frequency_bins) correlation_by_bin = [] mcc_by_bin = [] n_clades_by_bin = [] for freq, freq_df in df.groupby("binned_initial_freq"): correlation_by_bin.append(pearsonr(freq_df["observed_ratio"], freq_df["predicted_ratio"])[0]) # Calculate Matthew's correlation coefficient mcc_by_bin.append(get_matthews_correlation_coefficient_for_data_frame(freq_df)) n_clades_by_bin.append(freq_df.shape[0]) g = sns.FacetGrid(df, col="binned_initial_freq", col_wrap=2, height=4) g.map(sns.regplot, "observed_ratio", "predicted_ratio", fit_reg=False) g.add_legend()
[ train_data.Age[train_data.Pclass == i].plot.kde(bw_method=0.3, color=colors[i - 1]) for i in [1, 2, 3] ] plt.xlim((-20, 100)) plt.legend(["First class", "Second class", "Third class"]) plt.title("Density plot Age wrt Class of Travel") # Consider Age categories instead of specific ages train_data.Age = train_data.Age.fillna(-0.5) labels = ['Unknown', 'Babies', 'Children', 'Youth', 'Adults', 'Seniors'] bins = [-1, 0, 5, 15, 24, 65, np.inf] train_data['AgeCategories'] = pd.cut(train_data["Age"], bins, labels=labels) survival_rate_per_category = ( train_data.AgeCategories[train_data.Survived == 1].value_counts( normalize=True).sort_index() / train_data.AgeCategories[train_data.Survived == 0].value_counts( normalize=True).sort_index()) plt.subplot2grid((1, 3), (0, 2)) survival_rate_per_category.plot( kind="bar", alpha=0.6, color=["lightblue", "teal", "wheat", "beige", "grey", "lavender"]) plt.ylabel("survival rate") plt.title("Survival Rate per Age Category")
# In[4]: df = pd.read_csv ('spotify_songs.csv') # In[6]: lyrics = df['lyrics'].values.astype(str) # In[41]: y = pd.cut(df.valence,bins=[0,0.5,1],labels=[0,1]) # In[42]: lyrics_train, lyrics_test, y_train, y_test = train_test_split(lyrics, y, random_state=1) # In[44]: vectorizer = CountVectorizer() vectorizer.fit(lyrics_train)
import pandas as pd
import numpy as np

x = np.array([1, 7, 5, 4, 6, 3])
print(pd.cut(x, 3))
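# Two common variations on the same call (standard pandas behaviour), shown for context:
print(pd.cut(x, 3, labels=['low', 'mid', 'high']))  # human-readable labels instead of intervals
cats, edges = pd.cut(x, 3, retbins=True)            # retbins=True also returns the bin edges
print(edges)                                        # handy for reusing the same edges on new data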
for d in data:
    for i in range(0, 2):
        for j in range(0, 3):
            guess = d[(d['Sex'] == i) & (d['Pclass'] == j + 1)]['Age'].dropna()
            age_guess = guess.mean()
            ages[i, j] = int(age_guess / 0.5 + 0.5) * 0.5
    for i in range(0, 2):
        for j in range(0, 3):
            d.loc[(d.Age.isnull()) & (d.Sex == i) & (d.Pclass == j + 1), 'Age'] = ages[i, j]
    d['Age'] = d['Age'].astype(int)
print(train_data.head())

# Examine the survival rate among Age bands
train_data['AgeBand'] = pd.cut(train_data['Age'], 5)
print(train_data[['AgeBand', 'Survived']].groupby(
    ['AgeBand'], as_index=False).mean().sort_values(by='AgeBand', ascending=True))

# Replace Age with AgeBand ordinals
for d in data:
    d.loc[d['Age'] <= 16, 'Age'] = 0
    d.loc[(d['Age'] > 16) & (d['Age'] <= 32), 'Age'] = 1
    d.loc[(d['Age'] > 32) & (d['Age'] <= 48), 'Age'] = 2
    d.loc[(d['Age'] > 48) & (d['Age'] <= 64), 'Age'] = 3
    d.loc[d['Age'] > 64, 'Age'] = 4
print(train_data.head())

# Remove AgeBand feature
train_data = train_data.drop(['AgeBand'], axis=1)
fig4 = px.box(gss_clean,
              x='job_prestige',
              y='sex',
              color='sex',
              labels={
                  'job_prestige': 'Occupational Prestige',
                  'sex': ''
              })
fig4.update_layout(showlegend=False)
fig4.update(layout=dict(title=dict(x=0.5)))
fig4.show()

new_df = gss_clean[['income', 'sex', 'job_prestige']]
new_df['job_prestige_cat'] = pd.cut(new_df['job_prestige'],
                                    bins=6,
                                    labels=['1', '2', '3', '4', '5', '6'])
new_df = new_df.sort_values(by='job_prestige_cat')
clean_df = new_df.dropna(axis=0)

fig_bar = px.box(clean_df,
                 x='sex',
                 y='income',
                 color='sex',
                 facet_col='job_prestige_cat',
                 facet_col_wrap=2,
                 hover_data=['income'],
                 labels={
                     'sex': 'Sex',
                     'income': 'Income'
                 })
# load data
data_temp = data_total[data_total['channel'] == str(c + 1)]
#print(data_temp)

# prepare binning (bin_list)
photon_max = max(data_temp['intensity [photon]'])
print('photon_max: {}'.format(photon_max))
binsize = 1000
photon_bin_max = photon_max // binsize
print('photon_bin_max: {}'.format(photon_bin_max))
bin_list = list(range(0, (int(photon_bin_max) + 2) * binsize, binsize))
print(bin_list)

# prepare binned data
data_total_tmp = data_total
data_total_tmp['bins'] = pd.cut(data_total['intensity [photon]'], bins=bin_list)

# 1st group by bins
data_total_tmp = data_total_tmp.groupby(
    by=['channel', 'group', 'filename', 'bins']).size()

# reset index
data_total_tmp = data_total_tmp.reset_index()
data_total_tmp = data_total_tmp.rename(index=int, columns={0: 'counts'})

# 2nd group by
data_total_tmp_mean = data_total_tmp.groupby(
    by=['channel', 'group', 'bins']).mean()['counts']
data_total_tmp_sem = data_total_tmp.groupby(
    by=['channel', 'group', 'bins']).sem()['counts']

print('binned data, mean')
display(data_total_tmp_mean)
print('binned data, sem')
#    an abnormal value such as an age of 200 will not cause much interference to the model
# 3. LR is a generalized linear model with limited expressive power. After
#    discretization, each bin gets its own weight, which is equivalent to introducing
#    non-linearity and improves the fitting capacity of the model.
# 4. After discretization, feature crosses can be formed, which further improves the
#    expressive power: M + N variables become M x N variables, introducing yet more
#    non-linearity.
# 5. The model is more stable after the features are discretized. For example, a
#    user's age bucket does not change just because the user becomes one year older.
# LightGBM improves on XGBoost and adds data binning (histograms) at the same time,
# which improves the generalization of the model.
bin = [i * 10 for i in range(31)]
data['power_bin'] = pd.cut(data['power'], bin, labels=False)
print(data[['power_bin', 'power']].head())

# see the original data
print(data.shape)
print(data.columns)

data = data.drop(['creatDate', 'regDate', 'regionCode'], axis=1)

# check whether they have been deleted
print(data.shape)
print(data.columns)

from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
data['power'] = np.log(data['power'] + 1)
data['power'] = ((data['power'] - np.min(data['power'])) /
                 (np.max(data['power']) - np.min(data['power'])))
data['kilometer'] = ((data['kilometer'] - np.min(data['kilometer'])) /
                     (np.max(data['kilometer']) - np.min(data['kilometer'])))
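# To make point 3 above concrete, a minimal illustrative sketch (not part of the
# original pipeline): one-hot encoding the binned 'power' feature gives a linear
# model one weight per power range. The toy frame and target below are assumptions.
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
toy = pd.DataFrame({'power': rng.integers(1, 300, size=200),
                    'label': rng.integers(0, 2, size=200)})
toy['power_bin'] = pd.cut(toy['power'], [i * 10 for i in range(31)], labels=False)

X = pd.get_dummies(toy['power_bin'], prefix='power_bin')  # one indicator column per bin
clf = LogisticRegression(max_iter=1000).fit(X, toy['label'])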
# In[93]:

df.plot(kind='line', x=df.Genres, figsize=[9, 3])
plt.title('What movies do writers watch?')
plt.ylabel('count')
plt.xlabel('genres')

##### 10. Which age group rates which genre the most?

# In[101]:

# we are dividing people into 8 groups by age
labels = ['0-9', '10-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79']

# add a new column with the age group
data['Age_group'] = pd.cut(data['Age'], range(0, 81, 10), right=False, labels=labels)
print(data[['Age', 'Age_group']].drop_duplicates()[:20])

# In[95]:

# let's look at how age is distributed amongst our users
# call pandas' "hist" on the column to produce a histogram
data.Age.hist(bins=10)
plt.title("Distribution of users' ages")
plt.ylabel('count of users')
plt.xlabel('age')

# In[96]:

# now we are comparing ratings across age groups
# alternative: here is the numpy equivalent to R's ifelse
# targets_long['value'] = np.where(condition, targets_long['value'] * 1000, targets_long['value'])

# %% get advanced puf file
%time puf_2018 = pd.read_hdf(PUF_HDF)  # 1 sec
puf_2018.tail()
puf_2018.columns.sort_values().tolist()  # show all column names

pufsub = puf_2018.copy()  # new data frame
pufsub = pufsub.loc[pufsub["data_source"] == 1]  # ~7k records dropped

# create an IRS stub categorical variable
pufsub['IRS_STUB'] = pd.cut(
    pufsub['c00100'],
    IRS_AGI_STUBS,
    labels=list(range(1, len(IRS_AGI_STUBS))),
    right=False)
pufsub.columns.sort_values().tolist()  # show all column names

# %% get just the variables we want and create new needed variables
# get the list of target variables we need to have
plist = list(PUFTARG_XWALK.keys())
plist

# add names of variables we need for calculations or targeting
plist.append('pid')
plist.append('IRS_STUB')
plist.append('s006')
plist.append('MARS')

# remove names of variables we are going to create
def main():
    inpt_folder = "/mnt/scratch/Data/METData/"
    v_frac = 0.1

    ## Create a dummy model as it has all of the loader capabilities and ensures
    ## this is exactly what our networks see during training!
    model = Model.METNET_Agent('Tight', '/mnt/scratch/Saved_Networks/Presentation/')
    model.setup_network(True, 'XXX', None, 1, 5, False, 0, dev='cpu')
    model.inpt_list = ['Tight_Final_ET']
    model.setup_dataset(inpt_folder, v_frac, 32, 1024, 8096, 8, 'mag', 0, 0, 0, no_trn=True)

    ## The bin setup to use for the profiles
    n_bins = 40
    mag_bins = np.linspace(0, 400, n_bins + 1)
    exy_bins = [
        np.linspace(-50, 250, n_bins + 1),
        np.linspace(-150, 150, n_bins + 1)
    ]

    ## All the network outputs and targets for the batches will be combined into one list
    all_outputs = []
    all_targets = []

    ## The information to be saved in our dataframe: the truth et (for binning)
    ## and the performance metrics per bin
    met_names = ['Tru', 'Res', 'Lin', 'Ang']

    ## Configure pytorch, the network and the loader appropriately
    T.set_grad_enabled(False)
    model.net.eval()
    model.valid_loader.dataset.weight_off()

    ## Iterate through the validation set
    for batch in tqdm(model.valid_loader, desc='perfm', ncols=80, ascii=True):

        ## Get the network outputs and targets
        tight, targets = myUT.move_dev(batch[:-1], model.device)

        ## Undo the processing on Tight
        tight = (tight * model.net.inp_stats[1, :1] + model.net.inp_stats[0, :1]) / 1000
        outputs = T.cat([tight, T.zeros_like(tight)], dim=1)

        all_outputs.append(deepcopy(outputs))
        all_targets.append(deepcopy(targets))

    ## Combine the lists into single tensors
    all_outputs = T.cat(all_outputs)
    all_targets = T.cat(all_targets)

    ## Undo the normalisation on the outputs and the targets
    net_xy = all_outputs
    tru_xy = (all_targets * model.net.trg_stats[1] + model.net.trg_stats[0]) / 1000
    net_et = T.norm(net_xy, dim=1)
    tru_et = T.norm(tru_xy, dim=1)

    ## Calculate the performance metrics
    res = ((net_xy - tru_xy)**2).mean(dim=1)
    lin = (net_et - tru_et) / (tru_et + 1e-8)
    ang = T.acos(
        T.sum(net_xy * tru_xy, dim=1) /
        (net_et * tru_et + 1e-8))**2  ## Calculated using the dot product

    ## We save the overall resolution
    model.avg_res = T.sqrt(res.mean()).item()

    ## Combine the performance metrics into a single pandas dataframe
    combined = T.vstack([tru_et, res, lin, ang]).T
    df = pd.DataFrame(myUT.to_np(combined), columns=met_names)

    ## Make the profiles in bins of True ET using pandas cut and groupby methods
    df['TruM'] = pd.cut(df['Tru'], mag_bins, labels=(mag_bins[1:] + mag_bins[:-1]) / 2)
    profs = df.drop('Tru', axis=1).groupby('TruM', as_index=False).mean()
    profs['Res'] = np.sqrt(profs['Res'])  ## Res and Ang are RMSE measurements
    profs['Ang'] = np.sqrt(profs['Ang'])

    ## Save the performance profiles
    profs.to_csv(Path(model.save_dir, model.name, 'perf.csv'), index=False)

    ## Save the magnitude histograms
    h_tru_et = np.histogram(myUT.to_np(tru_et), mag_bins, density=True)[0]
    h_net_et = np.histogram(myUT.to_np(net_et), mag_bins, density=True)[0]
    myPL.plot_and_save_hists(Path(model.save_dir, model.name, 'MagDist'),
                             [h_tru_et, h_net_et],
                             ['Truth', 'Outputs'],
                             ['MET Magnitude [GeV]', 'Normalised'],
                             mag_bins,
                             do_csv=True)

    ## Save the ex and ey contour plots
    h_tru_xy = np.histogram2d(*myUT.to_np(tru_xy).T, exy_bins, density=True)[0]
    h_net_xy = np.histogram2d(*myUT.to_np(net_xy).T, exy_bins, density=True)[0]
    myPL.plot_and_save_contours(Path(model.save_dir, model.name, 'ExyDist'),
                                [h_tru_xy, h_net_xy],
                                ['Truth', 'Outputs'],
                                ['METx [GeV]', 'METy [GeV]'],
                                exy_bins,
                                do_csv=True)

    ## Get a dataframe from the class dict and write it out
    dict_df = pd.DataFrame.from_dict([model.get_dict()]).set_index('name')
    dict_df.to_csv(Path(model.save_dir, model.name, 'dict.csv'))
def loadAndTransDatasetsReal(self):
    """
    Load the sample data (randomly generated to simulate real data),
    bin the continuous features, and return the modified sample data.
    Also writes out a feature grouping file so that the group weights
    can be adjusted manually.
    """
    df = self.df_train.copy()

    # 1 Feature transformation: group and transform the feature values
    # 1.1 For categorical features, replace each original value with
    #     [feature name + '__' + value], e.g. '建筑业' becomes 'industry__建筑业'
    for f1 in [
            'industry', 'city', 'district', 'is_change_premises',
            'cacellation_reason', 'is_real_premise', 'is_executee',
            'is_financial_black_list', 'alert_level'
    ]:
        df[f1] = df[f1].map(lambda x: f1 + '__' + str(x))
        #print(df[f1].value_counts())

    # 1.2 For continuous features, bin the values first, then replace each binned
    #     value with [feature name + '__' + bin]
    for f2 in [
            'registered_capital', 'employees', 'total_assets', 'total_tax',
            'industry_index', 'fraud_score', 'period_abnormal',
            'rate_frozon_holdings', 'total_liabilities', 'total_debt',
            'judical_auction_amount', 'complaints_number_monthly',
            'rate_conciliation', 'sub_enterprises_number',
    ]:
        df[f2] = pd.cut(
            df[f2], 4, duplicates='drop').map(lambda x: f2 + '__' + str(x))
        #print(df[f2].value_counts())

    # 1.3 Manually rename the bins of individual features if needed
    #print(df['period_abnormal'].value_counts())
    #print(pd.cut(df['period_abnormal'], bins=5))

    del df['enterprise_id']

    # 2 Export the transformed dataset
    # 2.1 Export every group value of every feature to features_group_name.xlsx
    df_features_group = pd.DataFrame()
    for c in df.columns:
        #print(c)
        df1 = df[c].value_counts().reset_index()
        df1.columns = ['feature_group_name', 'counts']
        df1['interval'] = df1['feature_group_name'].map(
            lambda x: x.split('__')[1])
        #print(df1[['features', 'interval', 'counts']])
        df_features_group = pd.concat([
            df_features_group,
            df1[['feature_group_name', 'interval', 'counts']]
        ])
    #print(df_features_group)
    df_features_group.to_excel('datasets/features_group_name.xlsx', index=False)

    # 2.2 Store the binned and transformed feature values
    #print(df.shape, '\n', df.head(), '\n', df.columns)
    df.to_csv('datasets/features_grouped.csv', sep=';', index=False)

    # 3 Convert the transformed dataset into array format; the model uses these
    #   datasets for the actual training
    self.datasets_init = df.to_numpy().tolist()
    return self.datasets_init
def mono_bin(Y, X, n=max_bin):
    df1 = pd.DataFrame({"X": X, "Y": Y})
    justmiss = df1[['X', 'Y']][df1.X.isnull()]
    notmiss = df1[['X', 'Y']][df1.X.notnull()]
    r = 0

    # Reduce the number of quantile buckets until the bucket means are
    # monotonically related to the target (|Spearman r| == 1)
    while np.abs(r) < 1:
        try:
            d1 = pd.DataFrame({
                "X": notmiss.X,
                "Y": notmiss.Y,
                "Bucket": pd.qcut(notmiss.X, n)
            })
            d2 = d1.groupby('Bucket', as_index=True)
            r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)
            n = n - 1
        except Exception as e:
            n = n - 1

    # Fall back to a forced number of quantile-based bins if everything collapsed
    # into a single bucket
    if len(d2) == 1:
        n = force_bin
        bins = algos.quantile(notmiss.X, np.linspace(0, 1, n))
        if len(np.unique(bins)) == 2:
            bins = np.insert(bins, 0, 1)
            bins[1] = bins[1] - (bins[1] / 2)
        d1 = pd.DataFrame({
            "X": notmiss.X,
            "Y": notmiss.Y,
            "Bucket": pd.cut(notmiss.X, np.unique(bins), include_lowest=True)
        })
        d2 = d1.groupby('Bucket', as_index=True)

    d3 = pd.DataFrame({}, index=[])
    d3["MIN_VALUE"] = d2.min().X
    d3["MAX_VALUE"] = d2.max().X
    d3["COUNT"] = d2.count().Y
    d3["EVENT"] = d2.sum().Y
    d3["NONEVENT"] = d2.count().Y - d2.sum().Y
    d3 = d3.reset_index(drop=True)

    # Add a separate row for missing values, if any
    if len(justmiss.index) > 0:
        d4 = pd.DataFrame({'MIN_VALUE': np.nan}, index=[0])
        d4["MAX_VALUE"] = np.nan
        d4["COUNT"] = justmiss.count().Y
        d4["EVENT"] = justmiss.sum().Y
        d4["NONEVENT"] = justmiss.count().Y - justmiss.sum().Y
        d3 = d3.append(d4, ignore_index=True)

    d3["EVENT_RATE"] = d3.EVENT / d3.COUNT
    d3["NON_EVENT_RATE"] = d3.NONEVENT / d3.COUNT
    d3["DIST_EVENT"] = d3.EVENT / d3.sum().EVENT
    d3["DIST_NON_EVENT"] = d3.NONEVENT / d3.sum().NONEVENT
    d3["WOE"] = np.log(d3.DIST_EVENT / d3.DIST_NON_EVENT)
    d3["IV"] = (d3.DIST_EVENT - d3.DIST_NON_EVENT) * np.log(
        d3.DIST_EVENT / d3.DIST_NON_EVENT)
    d3["VAR_NAME"] = "VAR"
    d3 = d3[[
        'VAR_NAME', 'MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE',
        'NONEVENT', 'NON_EVENT_RATE', 'DIST_EVENT', 'DIST_NON_EVENT', 'WOE', 'IV'
    ]]
    d3 = d3.replace([np.inf, -np.inf], 0)
    d3.IV = d3.IV.sum()

    return d3
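# A hedged usage sketch (not from the original source). mono_bin expects a binary
# target and a numeric predictor; max_bin, force_bin, stats (scipy.stats) and algos
# are assumed to be defined elsewhere in the same file, as the function requires.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
x = pd.Series(rng.normal(size=1000))
x[rng.random(1000) < 0.05] = np.nan                         # some missing values
y = pd.Series((x.fillna(0) + rng.normal(size=1000) > 0.5).astype(int))

woe_table = mono_bin(y, x)                                  # one row per bucket (+ one for missings)
print(woe_table[['MIN_VALUE', 'MAX_VALUE', 'WOE', 'IV']])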
fig, ax = plt.subplots()
ax = sns.distplot(df['boneage'], bins=10)
ax.set(xlabel='Boneage (months)', ylabel='Density', title='Boneage distribution');

boneage_mean = df['boneage'].mean()  # mean age
boneage_div = 2 * df['boneage'].std()
print(boneage_mean, " ", boneage_div)
df['boneage_zscore'] = df['boneage'].map(lambda x: (x - boneage_mean) / boneage_div)
df.dropna(inplace=True)
df['gender'] = df['male'].map(lambda x: 1 if x else 0)
df['boneage_category'] = pd.cut(df['boneage'], 10)
df['gender']
df['boneage_category']
print(df)

raw_train_df, raw_valid_df = train_test_split(df,
                                              test_size=VALIDATION_FRACTION,
                                              random_state=2018)
raw_train_df.shape[0]
raw_valid_df.shape[0]
raw_valid_df.head()
print(df.loc[df['exists'] == False, :])
def main():
    logging.basicConfig(format="%(funcName)s: %(message)s", level=logging.INFO)
    output_dir = './german_credit_showcase/'
    sensitive_microdata_path = output_dir + 'german_credit_data.tsv'
    if not path.exists(output_dir):
        mkdir(output_dir)
    if not path.exists(sensitive_microdata_path):
        attributes = []
        codes = {}
        logging.info('Retrieving data documentation...')
        codes_file = request.urlopen(
            'https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.doc'
        ).readlines()
        logging.info('Retrieved')
        state = 'skip'
        attribute = ''
        code = ''
        value = ''
        for line in [x.decode('UTF-8').strip() for x in codes_file]:
            if line == '':
                state = 'skip'
            elif state == 'skip':
                if str.startswith(line, 'Att'):
                    state = 'attribute'
            elif state == 'attribute':
                attribute = line.strip()
                attributes.append(attribute)
                codes[attribute] = {}
                state = 'values'
            elif state == 'values':
                split = line.index(':') if ':' in line else -1
                if split != -1:
                    code = line[:split].strip()
                    value = line[split + 1:].strip().replace(':', '-')
                    codes[attribute][code] = value
                else:
                    codes[attribute][code] += ' ' + line.replace(':', '-')
        attributes.append('Credit rating')
        codes['Credit rating'] = {'1': 'Good', '2': 'Bad'}

        logging.info('Retrieving data file...')
        df = pd.read_csv(
            'https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data',
            sep=' ',
            index_col=False,
            header=None,
            names=attributes)
        logging.info('Retrieved')

        logging.info('Processing dataset...')
        values, labels = binValuesAndLabels(df['Duration in month'].max(), 12)
        df['Duration in month'] = pd.cut(df['Duration in month'], bins=values, labels=labels)
        values, labels = binValuesAndLabels(df['Credit amount'].max(), 2500)
        df['Credit amount'] = pd.cut(df['Credit amount'], bins=values, labels=labels)
        values, labels = binValuesAndLabels(df['Age in years'].max(), 20)
        df['Age in years'] = pd.cut(df['Age in years'], bins=values, labels=labels)
        df = df.astype(str).replace(to_replace=r'^nan$', value='', regex=True)
        for att in attributes:
            df[att] = df[att].replace(codes[att])
        del df['foreign worker']
        del df['Property']
        del df['Telephone']
        del df['Other debtors / guarantors']
        del df['Number of people being liable to provide maintenance for']
        del df['Other installment plans']
        del df['Savings account/bonds']
        del df['Present employment since']
        del df['Status of existing checking account']
        df.to_csv(sensitive_microdata_path, sep='\t', index=False)
        logging.info('Processed')

    config = {
        'parallel_jobs': 1,
        'memory_limit_pct': 90,
        'use_columns': [],
        'record_limit': -1,
        'reporting_length': 5,
        'reporting_precision': 2,
        'reporting_threshold': 2,
        'seeded': True,
        'sensitive_zeros': [],
        'prefix': 'credit',
        'output_dir': output_dir,
        'sensitive_microdata_path': sensitive_microdata_path,
        'sensitive_microdata_delimiter': '\t',
        'report_title': 'German Credit Data Showcase',
    }
    json.dump(config,
              open(path.join('.', config['prefix'] + '_config.json'), 'w'),
              indent=1)
    config['aggregate'] = True
    config['generate'] = True
    config['navigate'] = True
    config['evaluate'] = True
    config['reportable_aggregates_path'] = path.join(
        config['output_dir'], config['prefix'] + '_reportable_aggregates.tsv')
    config['synthetic_microdata_path'] = path.join(
        config['output_dir'], config['prefix'] + '_synthetic_microdata.tsv')
    config['sensitive_aggregates_path'] = path.join(
        config['output_dir'], config['prefix'] + '_sensitive_aggregates.tsv')
    runPipeline(config)
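# binValuesAndLabels is defined elsewhere in the original script. Judging only from
# how it is called above (a column maximum and a bin width in, bin edges plus labels
# for pd.cut out), a plausible stand-in could look like the sketch below. This is an
# assumption, not the showcase's actual helper.
def binValuesAndLabels(max_value, bin_size):
    # equal-width edges that cover [0, max_value], plus readable range labels
    values = list(range(0, int(max_value) + bin_size, bin_size))
    labels = ['{}-{}'.format(lo, hi) for lo, hi in zip(values[:-1], values[1:])]
    return values, labels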
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.metrics import roc_curve, auc
import imblearn
import xgboost as xgb  # conda install py-xgboost
import lime
import lime.lime_tabular
import warnings
from lime import submodular_pick
import matplotlib.pyplot as plt

data = pd.read_csv(os.path.join('..', 'result', 'all_right_wrong_i.csv'))
data.drop(['change_1m', 'change_3m'], axis=1, inplace=True)

# Binarise MRS_TX_3: values 0-2 become 0, values 3-7 become 1
data = data.assign(
    MRS_TX_3=pd.cut(data['MRS_TX_3'], [-1, 2, 7], labels=[0, 1]))

wrong_data = data[data.ctype == 0]
wrong_data.drop(['ctype'], axis=1, inplace=True)
id_wrong_data = wrong_data[['ICASE_ID', 'IDCASE_ID']]
y_wrong_data = wrong_data[['MRS_TX_3']]
x_wrong_data = wrong_data.drop(['ICASE_ID', 'IDCASE_ID', 'MRS_TX_3'], axis=1)

aucs = []
for i in range(1):
    X_train, X_test, y_train, y_test = train_test_split(x_wrong_data,
                                                        y_wrong_data,
                                                        test_size=0.3,
                                                        random_state=i,
                                                        stratify=y_wrong_data)
    # for train_index, test_index in KFold(n_splits=10).split(x_wrong_data):
# Create a new dataframe with unique columns for station names and station ids
df_unique = df.drop_duplicates(
    subset=['from_station_id', 'from_station_name'])
df_unique = df_unique[['from_station_id', 'from_station_name']]
df_unique = df_unique.reset_index(drop=True)

# Create a json file if needed
# with open('./frontend/file.json', 'w') as f:
#     json.dump(df_unique.to_json(r'./frontend/file.json',
#                                 orient="records"), f)

# Dropping rows with empty values
df.dropna(inplace=True)

# Convert the tripduration column into float type
df['tripduration'] = df['tripduration'].replace({
    ',': ''
}, regex=True).astype(float)

# Get the current year
now = datetime.now()

# Add a new column, age, to the dataframe
df['age'] = now.year - df['birthyear']

# Separate the ages into groups
df['age_group'] = pd.cut(df['age'], [0, 15, 30, 45, 200],
                         labels=['0-15', '16-30', '31-45', '46+'])

app.run(debug=True)
    'RU': 11162,
    'SA': 22865,
    'SE': 54608,
    'TH': 7274,
    'TR': 9370,
    'TW': 24827,
    'UA': 3592,
    'US': 65111,
    'VN': 2740,
    'ZA': 11300,
    'CO': 6500
})
df.gdpCountry = pd.to_numeric(df.gdpCountry, errors='coerce')
df['gdpCountry'] = df['gdpCountry'].fillna(11335)
df['gdpCountry'] = pd.cut(df.gdpCountry,
                          bins=[0, 29960, 50000, 150000],
                          labels=[0, 1, 2])
# End Yev Gdp

device_map = {
    'IPhone7': 0,
    'IPhone7Plus': 0,
    'IPhone8Plus': 0,
    'IPhone6S': 0,
    'IPhoneSE': 0,
    'IPhone8': 0,
    'IPhone6SPlus': 0,
    'IPadAir1G': 0,
    'IPhone5S': 0,
    'IPadMini3G': 0,
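# A side note in the same vein (not from the original app): pd.cut with integer
# labels returns a pandas Categorical, so a cast is usually needed when downstream
# code expects plain integers. The values below are just a few of the GDP figures above.
import pandas as pd

gdp = pd.Series([3592, 65111, 24827, 11300])
tier = pd.cut(gdp, bins=[0, 29960, 50000, 150000], labels=[0, 1, 2])
print(tier.dtype)         # category
print(tier.astype(int))   # plain integer tiers for downstream models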