def create_grouped_index_df(bin_num):
    ## load the labels and start_time column for train and test data
    start_time = time.time()
    train_labels = pd.read_csv(data_path + train_num_file, index_col='Id',
                               usecols=['Id', dep_var_name])
    train_date_start_column = pd.read_csv(data_path + train_date_file, index_col='Id',
                                          usecols=['Id', start_time_column_name])
    test_date_start_column = pd.read_csv(data_path + test_date_file, index_col='Id',
                                         usecols=['Id', start_time_column_name])
    end_time = time.time()
    print('data loading takes', round(end_time - start_time, 1), 'seconds.')

    ## join the start_time with the labels, then drop rows whose start_time is NaN
    ## so labeled_start_time can be used directly for calculating the mquantiles
    labeled_start_time = pd.merge(train_labels, train_date_start_column, how='left',
                                  left_index=True, right_index=True)
    labeled_start_time = labeled_start_time[~labeled_start_time[start_time_column_name].isnull()]

    ## subset the data by start_time
    prob_list = [1. * i / bin_num for i in range(1, bin_num)]
    quantile_values = mquantiles(labeled_start_time[start_time_column_name], prob=prob_list)
    bins = [labeled_start_time[start_time_column_name].min()]
    bins.extend(quantile_values)
    bins.append(labeled_start_time[start_time_column_name].max())
    bin_names = [str(i) for i in range(len(bins) - 1)]

    ## cut the entire dataframe into different time windows by start_time
    tmp_train = train_date_start_column.copy()
    tmp_test = test_date_start_column.copy()
    tmp_train['time_window_num'] = pd.cut(tmp_train[start_time_column_name], bins, labels=bin_names)
    tmp_test['time_window_num'] = pd.cut(tmp_test[start_time_column_name], bins, labels=bin_names)

    ## create a row number column, start index is 1
    tmp_train['row_num'] = range(1, tmp_train.shape[0] + 1)
    tmp_test['row_num'] = range(1, tmp_test.shape[0] + 1)
    return tmp_train, tmp_test, bins, bin_names
def makediscrete2(df, dataType):
    dataLabels = list(df.columns.values)
    for eachLabel in dataLabels:
        if dataType[eachLabel] == 1:
            choiceC = True
            bins = int(input('Enter bin size for ' + eachLabel + ": "))
            while choiceC:
                choice = int(input("Enter \n 1 - for equal size bins \n 2 - for custom range \n Your choice: "))
                if choice == 1:
                    df[eachLabel] = pd.cut(df[eachLabel], bins)
                    choiceC = False
                elif choice == 2:
                    print("Enter " + str(bins + 1) + " values for bin edges:")
                    binedges = []
                    for x in range(bins + 1):
                        value = float(input("" + str(x) + ": "))
                        binedges.append(value)
                    df[eachLabel] = pd.cut(df[eachLabel], binedges)
                    choiceC = False
                else:
                    print("Wrong choice. Try again!")
    df.to_csv(Globals.DISCRETIZED_FILE)
    print("continuous data converted to discrete data, stored at: " + Globals.DISCRETIZED_FILE)
    return df, Globals.DISCRETIZED_FILE
def cleaneddf(no_bins=0):
    # you'll want to tweak this to conform with your computer's file system
    trainpath = '../../data/train.csv'
    testpath = '../../data/test.csv'
    traindf = pd.read_csv(trainpath)
    testdf = pd.read_csv(testpath)
    if no_bins == 0:
        return [cleandf(traindf), cleandf(testdf)]
    traindf = cleandf(traindf)
    testdf = cleandf(testdf)

    # discretise fare
    binned_fare, bins = pd.qcut(traindf.Fare, no_bins, retbins=True)
    traindf.Fare = binned_fare
    testdf.Fare = pd.cut(testdf.Fare, bins)

    # discretise age (Age is jittered before qcut; jitter is defined elsewhere)
    binned_age, bins = pd.qcut(traindf.Age + jitter(traindf.Age), no_bins, retbins=True)
    traindf.Age = binned_age
    testdf.Age = pd.cut(testdf.Age, bins)

    # create a submission file for kaggle
    predictiondf = pd.DataFrame(testdf['PassengerId'])
    predictiondf['Survived'] = [0 for x in range(len(testdf))]
    predictiondf.to_csv('./prediction.csv', index=False)
    return [traindf, testdf]
def period_by_hours(x, separation):
    '''
    Aggregate x into hour intervals.
    The computation would be simple if crossing over midnight were not allowed.
    '''
    print(separation)
    assert isinstance(separation, list)
    assert all([sep < 24 for sep in separation])
    separation.sort()

    if 0 in separation:
        separation.append(24)
        hour_categ = pd.cut(x.dt.hour, separation, right=False)
        date_categ = x.dt.date
        return date_categ.astype(str) + ' ' + hour_categ.astype(str)
    else:
        hour = x.dt.hour
        hour_categ = pd.cut(hour, separation, right=False).astype(str)
        night_categ = '[' + str(separation[-1]) + ', ' + str(separation[0]) + ')'
        hour_categ[(hour < separation[0]) | (hour >= separation[-1])] = night_categ
        assert hour_categ.nunique(dropna=False) == len(separation)
        date_categ = x.dt.date.astype(str)
        # shift the first hours of the day back by one day
        decale = x.dt.date[x.dt.hour < separation[1]] + pd.DateOffset(days=-1)
        date_categ[x.dt.hour < separation[1]] = decale.astype(str)
        assert all(date_categ.str.len() == 10)
        return date_categ + ' ' + hour_categ
def binData2D(myXYZ, xstart, xend, ystart, yend, nx, ny):
    '''
    Function to bin a scatter point cloud (xyz) into a 2D array

    :param myXYZ: xyz array containing the point cloud coordinates
    :param xstart:
    :param xend:
    :param ystart:
    :param yend:
    :param nx: number of cells along the x-axis
    :param ny: number of cells along the y-axis
    :return: a group object (pandas library) with all points classified into bins
    '''
    # note: on Python 2 the divisions below require `from __future__ import division`
    x = myXYZ[:, 0].ravel()
    y = myXYZ[:, 1].ravel()
    z = myXYZ[:, 2].ravel()
    df = pd.DataFrame({'X': x, 'Y': y, 'Z': z})
    bins_x = np.linspace(xstart, xend, nx + 1)
    x_cuts = pd.cut(df.X, bins_x, labels=False)
    bins_y = np.linspace(ystart, yend, ny + 1)
    y_cuts = pd.cut(df.Y, bins_y, labels=False)
    bin_xmin, bin_ymin = x_cuts.min(), y_cuts.min()
    print('Data cut in a ' + str(len(bins_x)) + ' by ' + str(len(bins_y)) + ' matrix')
    dx = (xend - xstart) / nx
    dy = (yend - ystart) / ny
    print('dx = ' + str(dx) + ' ; dy = ' + str(dy))
    grouped = df.groupby([x_cuts, y_cuts])
    print('Data grouped, \nReady to go!!')
    return grouped, bins_x, bins_y, int(bin_xmin), int(bin_ymin)
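# A minimal usage sketch for binData2D (not from the original source): synthetic points,
# illustrative grid bounds, and a mean-elevation reduction per cell.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
xyz = np.column_stack([rng.uniform(0, 10, 1000),   # x
                       rng.uniform(0, 10, 1000),   # y
                       rng.normal(5, 1, 1000)])    # z (elevation)

# Bin into a 5 x 5 grid and reduce each cell to its mean elevation.
grouped, bins_x, bins_y, xmin, ymin = binData2D(xyz, 0, 10, 0, 10, 5, 5)
cell_means = grouped['Z'].mean()   # Series indexed by (x bin, y bin)
print(cell_means.unstack())        # 2D table of per-cell mean elevation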
def get_log_odds_chris(target, feature, bins, f_range=None, M=10, display_head=False):
    """Return log( P(feature=x | target=1) / P(feature=x | target=0) ).

    tn : target name, target values are 0 or 1
    fn : x name
    f_range : restricts the range of x
    M : smoothing factor
    """
    tn = target.name
    fn = feature.name
    X = pd.concat([target, feature], axis=1)
    if f_range is not None:
        X = X[(X[fn] > f_range[0]) & (X[fn] < f_range[1])]
    if display_head:
        X["_cut"] = pd.cut(X[fn], bins=bins).astype(str)
        X["_cut"] = X._cut.map(lambda x: float(x.split(",")[0][1:]))
    else:
        X["_cut"] = pd.cut(X[fn], bins=bins)
    Y = X.groupby("_cut").apply(
        lambda x: np.log((x[tn].sum() + 1.0 * M / bins) /
                         ((1.0 - x[tn]).sum() + 1.0 * M / bins))
    )
    # display(X.groupby('_cut').apply(lambda x: (x[tn].sum(), (1-x[tn]).sum())))
    # display(Y)
    Y = Y - np.log((1.0 * X[tn].sum() + M) / ((1.0 - X[tn]).sum() + M))
    Y = pd.DataFrame(Y, columns=["%s_log_odds" % fn])
    return Y
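# A hedged usage sketch for get_log_odds_chris (not from the original source), assuming a
# binary target Series and a numeric feature Series that share an index.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
feature = pd.Series(rng.normal(size=5000), name='x')
target = pd.Series((feature + rng.normal(size=5000) > 0).astype(int), name='y')

log_odds = get_log_odds_chris(target, feature, bins=20, M=10)
print(log_odds.head())  # one smoothed log-odds value per feature bin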
def make_object_map(data,field,**kwargs): linear = False for key,value in kwargs.iteritems(): if key == 'linear': linear = value print linear if linear == False: colors,rangelist = make_distributed_range(data,field) else: colors = get_heatmap51() colors2 = colors maxvalue = data[field].max() if maxvalue < 51: totallist = range(maxvalue) colors = reduce_color_list_size(totallist,colors) colors,rangelist = make_gradient_range(data[field].min(),maxvalue,colors) else: colors = reduce_color_list_size(range(len(data)),colors) colors,rangelist = make_gradient_range(data[field].min(),maxvalue,colors) if not rangelist[0] == 0: rangelist = [0] + rangelist[1:] data['COLORKEY'] = pd.cut(data[field],bins=rangelist+[1000000000],labels=colors) return data colors2 = get_heatmap51() if not rangelist[0] == 0: rangelist = [0] + rangelist[1:] data['COLORKEY'] = pd.cut(data[field],bins=rangelist,labels=colors[1:]) return data
def get_indicators(start_date, end_date, symbols):
    """Simulate and assess the performance of a stock portfolio."""
    # Read in adjusted closing prices for given symbols, date range
    dates = pd.date_range(start_date, end_date)
    prices_all = get_data(symbols, dates)  # automatically adds SPY
    prices = prices_all[symbols]           # only portfolio symbols
    # prices_SPY = prices_all['SPY']       # only SPY, for comparison later

    sym = symbols[1]
    x1 = (prices[sym] - prices[sym].rolling(20).mean()) / (2 * prices[sym].rolling(20).std())
    x1_dis = pd.cut(x1, 10, labels=False)
    x2 = prices[sym].pct_change(20)
    x2_dis = pd.cut(x2, 10, labels=False)
    x3 = prices[sym].pct_change(1).rolling(20).std()
    x3_dis = pd.cut(x3, 10, labels=False)

    # return pd.concat([x1_,x2_0,x3_0], axis=1).dropna(), prices
    tempdf = pd.concat([x1_dis, x2_dis, x3_dis], axis=1).dropna()
    tempdf.columns = ["x1", "x2", "x3"]
    print(tempdf.dtypes)
    # 0 = no position, 1 = negative position, 2 = holding long
    tempdf["holding"] = np.random.randint(0, 3, size=len(tempdf))
    tempdf["s"] = 1000 * tempdf["holding"] + 100 * tempdf["x3"] + 10 * tempdf["x2"] + 1 * tempdf["x1"]
    print(tempdf.head(50))
    return tempdf, prices
def test_bins_not_overlapping_from_interval_index():
    # see gh-23980
    msg = "Overlapping IntervalIndex is not accepted"
    ii = IntervalIndex.from_tuples([(0, 10), (2, 12), (4, 14)])

    with pytest.raises(ValueError, match=msg):
        cut([5, 6], bins=ii)
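# For contrast (illustrative, not part of the original test file): a non-overlapping
# IntervalIndex is accepted as bins, and each value lands in the interval that contains it.
def example_cut_with_non_overlapping_interval_index():
    ii_ok = IntervalIndex.from_tuples([(0, 5), (5, 10), (10, 15)])
    result = cut([3, 7, 12], bins=ii_ok)
    # result: [(0, 5], (5, 10], (10, 15]]
    return result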
def test_cython_agg_empty_buckets_nanops(observed):
    # GH-18869 can't call nanops on empty groups, so hardcode expected
    # for these
    df = pd.DataFrame([11, 12, 13], columns=['a'])
    grps = range(0, 25, 5)

    # add / sum
    result = df.groupby(pd.cut(df['a'], grps),
                        observed=observed)._cython_agg_general('add')
    intervals = pd.interval_range(0, 20, freq=5)
    expected = pd.DataFrame(
        {"a": [0, 0, 36, 0]},
        index=pd.CategoricalIndex(intervals, name='a', ordered=True))
    if observed:
        expected = expected[expected.a != 0]

    tm.assert_frame_equal(result, expected)

    # prod
    result = df.groupby(pd.cut(df['a'], grps),
                        observed=observed)._cython_agg_general('prod')
    expected = pd.DataFrame(
        {"a": [1, 1, 1716, 1]},
        index=pd.CategoricalIndex(intervals, name='a', ordered=True))
    if observed:
        expected = expected[expected.a != 1]

    tm.assert_frame_equal(result, expected)
def get_data_frame_with_dummies(users): users_ref = users.copy() base_dummies = None categories = {'gender': ['male', 'female'], 'education': ['overGraduate', 'university', 'underHigh'], 'income': ['100', '200', '300', '400', '500', '1200more'], 'job': ['officer', 'student', 'etc'], 'marriage': ['married', 'single'], 'religion': ['buddhist', 'none', 'christian', 'romanCatholicism']} age_bins = [10, 20, 30, 40, 50, 60, 70] numChild_bins = [0, 1, 10] for label_type in users_ref.columns: temp_dummies = None if label_type == 'age': temp_dummies = pd.get_dummies(pd.cut(users_ref[label_type], age_bins, right=False), prefix=label_type) elif label_type == 'numberOfChildren': temp_dummies = pd.get_dummies(pd.cut(users_ref[label_type], numChild_bins, right=False), prefix=label_type) elif label_type == 'residence': continue else: users_ref[label_type + "_cat"] = pd.Categorical(users_ref[label_type], categories=categories.get(label_type)) temp_dummies = pd.get_dummies(users_ref[label_type + "_cat"], prefix=label_type) if base_dummies is None: base_dummies = temp_dummies else: base_dummies = pd.concat([base_dummies, temp_dummies], axis=1) label_nums = base_dummies.sum() label_rates = label_nums / float(len(users_ref)) return base_dummies, label_nums, label_rates
def test_datetime_cut(self): # GH 14714 # testing for time data to be present as series data = to_datetime(Series(['2013-01-01', '2013-01-02', '2013-01-03'])) result, bins = cut(data, 3, retbins=True) expected = ( Series(IntervalIndex([ Interval(Timestamp('2012-12-31 23:57:07.200000'), Timestamp('2013-01-01 16:00:00')), Interval(Timestamp('2013-01-01 16:00:00'), Timestamp('2013-01-02 08:00:00')), Interval(Timestamp('2013-01-02 08:00:00'), Timestamp('2013-01-03 00:00:00'))])) .astype(CDT(ordered=True))) tm.assert_series_equal(result, expected) # testing for time data to be present as list data = [np.datetime64('2013-01-01'), np.datetime64('2013-01-02'), np.datetime64('2013-01-03')] result, bins = cut(data, 3, retbins=True) tm.assert_series_equal(Series(result), expected) # testing for time data to be present as ndarray data = np.array([np.datetime64('2013-01-01'), np.datetime64('2013-01-02'), np.datetime64('2013-01-03')]) result, bins = cut(data, 3, retbins=True) tm.assert_series_equal(Series(result), expected) # testing for time data to be present as datetime index data = DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03']) result, bins = cut(data, 3, retbins=True) tm.assert_series_equal(Series(result), expected)
def bootstrapped_utility(df_testing=None,varname=None, bins = None): N = 1500 MKT = 500000 S_bin = 6000 L_bin = 3000 IT_bin = 0.20 IP_bin = 0.40 df_bs = df_testing.sample(frac=1, replace=True) df_bs_good = df_bs[df_bs['is_good']==1] df_bs_bad = df_bs[df_bs['is_good']!=1] hbs_good = cut(df_bs_good[varname], bins=bins) hbs_bad = cut(df_bs_bad[varname], bins=bins) bs_purity_by_bin = [] bs_efficiency_by_bin = [] for g in range(0,len(hbs_good.value_counts())): sum_g_b = hbs_good.value_counts()[g] + hbs_bad.value_counts()[g] if sum_g_b !=0: bs_purity_by_bin.append(1.0*hbs_good.value_counts()[g]/sum_g_b) else: bs_purity_by_bin.append(1.0*hbs_good.value_counts()[g]) bs_efficiency_by_bin.append(1.0*sum_g_b/(hbs_good.size+hbs_bad.size)) bs_purity_by_bin = array(bs_purity_by_bin) bs_default_by_bin = -1*(bs_purity_by_bin - 1.0) bs_efficiency_by_bin = array(bs_efficiency_by_bin) bs_DC_bin = N * bs_efficiency_by_bin * bs_default_by_bin * L_bin bs_RT_bin = N * bs_efficiency_by_bin * bs_purity_by_bin * S_bin * IT_bin bs_RP_bin = N * bs_efficiency_by_bin * bs_default_by_bin * (S_bin - L_bin) * IP_bin bs_f_bin = bs_RT_bin + bs_RP_bin - bs_DC_bin bs_f = cumsum(bs_f_bin[::-1])[::-1] - MKT return bs_f
def cleaneddf(no_bins=0): #you'll want to tweak this to conform with your computer's file system testpath = r'C:\Pradeep\Working Set\Consulting\Kaggle\Titanic\Data Sets Oirig\test.csv' trainpath = r'C:\Pradeep\Working Set\Consulting\Kaggle\Titanic\Data Sets Oirig\train.csv' print trainpath traindf = pd.read_csv(trainpath) testdf = pd.read_csv(testpath) #discretise fare if no_bins==0: return [cleandf(traindf), cleandf(testdf)] traindf=cleandf(traindf) testdf=cleandf(testdf) bins_and_binned_fare = pd.qcut(traindf.Fare, no_bins, retbins=True) bins=bins_and_binned_fare[1] traindf.Fare = bins_and_binned_fare[0] testdf.Fare = pd.cut(testdf.Fare, bins) #discretise age bins_and_binned_age = pd.qcut(traindf.Age, no_bins, retbins=True) bins=bins_and_binned_age[1] traindf.Age = bins_and_binned_age[0] testdf.Age = pd.cut(testdf.Age, bins) #create a submission file for kaggle predictiondf = pd.DataFrame(testdf['PassengerId']) predictiondf['Survived']=[0 for x in range(len(testdf))] predictiondf.to_csv(r'C:\Pradeep\Working Set\Consulting\Kaggle\Titanic\Data Sets Oirig\prediction.csv', index=False) return [traindf, testdf]
def cleaneddf(no_bins=0): #you'll want to tweak this to conform with your computer's file system trainpath = 'C:/Documents and Settings/DIGIT/My Documents/Google Drive/Blogs/triangleinequality/Titanic/rawtrain.csv' testpath = 'C:/Documents and Settings/DIGIT/My Documents/Google Drive/Blogs/triangleinequality/Titanic/rawtest.csv' traindf = pd.read_csv(trainpath) testdf = pd.read_csv(testpath) #discretise fare if no_bins==0: return [cleandf(traindf), cleandf(testdf)] traindf=cleandf(traindf) testdf=cleandf(testdf) bins_and_binned_fare = pd.qcut(traindf.Fare, no_bins, retbins=True) bins=bins_and_binned_fare[1] traindf.Fare = bins_and_binned_fare[0] testdf.Fare = pd.cut(testdf.Fare, bins) #discretise age bins_and_binned_age = pd.qcut(traindf.Age, no_bins, retbins=True) bins=bins_and_binned_age[1] traindf.Age = bins_and_binned_age[0] testdf.Age = pd.cut(testdf.Age, bins) #create a submission file for kaggle predictiondf = pd.DataFrame(testdf['PassengerId']) predictiondf['Survived']=[0 for x in range(len(testdf))] predictiondf.to_csv('C:/Documents and Settings/DIGIT/My Documents/Google Drive/Blogs/triangleinequality/Titanic/prediction.csv', index=False) return [traindf, testdf]
def dummies_xy(cls, order_has_ret):
    """
    The bin edges below are not arbitrary; they were chosen from visualising the binned data.
    :param order_has_ret:
    :return:
    """
    bins = [-np.inf, 0.0, 0.1, 0.2, 0.4, 0.50, 0.85, 1.0, 1.2, np.inf]
    cats = pd.cut(order_has_ret.wave_score1, bins)
    wave_score1_dummies = pd.get_dummies(cats, prefix='ws1_dummies')
    order_has_ret = pd.concat([order_has_ret, wave_score1_dummies], axis=1)

    bins = [-np.inf, 0.0, 0.2, 0.4, 0.6, 0.8, 1.0, np.inf]
    cats = pd.cut(order_has_ret.atr_std, bins)
    atr_dummies = pd.get_dummies(cats, prefix='atr_dummies')
    order_has_ret = pd.concat([order_has_ret, atr_dummies], axis=1)

    bins = [-np.inf, -20, -12, -7, -3, 0, 3, 7, 12, 20, np.inf]
    cats = pd.cut(order_has_ret.deg_hisWindowPd, bins)
    deg_his_window_dummies = pd.get_dummies(cats, prefix='dh_dummies')
    order_has_ret = pd.concat([order_has_ret, deg_his_window_dummies], axis=1)

    cats = pd.cut(order_has_ret.deg_windowPd, bins)
    deg_window_dummies = pd.get_dummies(cats, prefix='dw_dummies')
    order_has_ret = pd.concat([order_has_ret, deg_window_dummies], axis=1)

    cats = pd.cut(order_has_ret.deg_60WindowPd, bins)
    deg_60window_dummies = pd.get_dummies(cats, prefix='d60_dummies')
    order_has_ret = pd.concat([order_has_ret, deg_60window_dummies], axis=1)

    return order_has_ret
def psi(bench, target, group, print_df=True):
    """
    Return the Population Stability Index (PSI), which quantifies how stable a
    distribution is between two states. The statistic is only meaningful when
    bench and target are numeric variables.

    Params:
    - bench: numpy array with the reference variable.
    - target: numpy array with the new variable.
    - group: number of groups (quantile bins) to consider.
    """
    labels_q = np.percentile(bench,
                             [(100.0 / group) * i for i in range(group + 1)],
                             interpolation="nearest")

    # Using the unique quantile edges is the right approach when there are few unique values
    ben_pct = (pd.cut(bench, bins=np.unique(labels_q), include_lowest=True).value_counts()) / len(bench)
    target_pct = (pd.cut(target, bins=np.unique(labels_q), include_lowest=True).value_counts()) / len(target)
    target_pct = target_pct.sort_index()  # sort the index
    ben_pct = ben_pct.sort_index()        # sort the index
    psi = sum((target_pct - ben_pct) * np.log(target_pct / ben_pct))

    # Also return a per-bin breakdown for easier inspection
    if print_df:
        results = pd.DataFrame({'ben_pct': ben_pct.values,
                                'target_pct': target_pct.values},
                               index=ben_pct.index)
        return {'data': results, 'statistic': psi}
    return psi
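# A short usage example for psi (illustrative, not from the original source): two synthetic
# samples split into 10 quantile groups; a common rule of thumb treats PSI > 0.25 as a large shift.
import numpy as np

rng = np.random.default_rng(42)
bench = rng.normal(0.0, 1.0, 10000)    # reference distribution
target = rng.normal(0.2, 1.1, 10000)   # new distribution, slightly shifted

out = psi(bench, target, 10)
print(out['statistic'])  # the PSI value
print(out['data'])       # per-bin benchmark vs. target shares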
def print_wt_dist(df):
    # age_breaks is defined here; weight_breaks is assumed to come from the surrounding module
    age_breaks = [0, 19, 29, 39, 49, 59, 69, 79, 100]
    df['age_band'] = pd.cut(df['age'], age_breaks)
    df['weight_band'] = pd.cut(df['weight'], weight_breaks)
    pt = pd.pivot_table(df, index=['weight_band'], columns=['sex', 'age_band'],
                        values=['age'], aggfunc=[len])
    print(1. * pt.cumsum(0) / pt.sum())
def Categoricalize(df, feats):
    """
    Turns the continuous raw data into categorical data.
    Specific to the current setup of features for the NS2 data.

    Arguments:
    df    - pandas.DataFrame containing the data to categoricalize
    feats - List of features in the DataFrame
    """
    # I chose this, no idea how good it is
    time_cuts = np.array([0, 1, 2, 3, 4, 5, 7.5, 10, 12.5, 15, 20, 25, 30, 40, 50, 60, 90, 120])
    # time_cuts = np.array([0,1,2,5,10,15,30,60,120])
    time_cuts *= 60

    # THIS IS SPECIFIC TO THE CURRENT DATA SETUP
    time_feats = feats[:1] + feats[5:-1]
    for feat in time_feats:
        df[feat] = pd.cut(df[feat], time_cuts)

    tvt_feats = feats[1:5]
    tvt_cuts = np.array([0, 0.1, 0.2, 0.25, 0.333, 0.5, 0.667, 0.75, 1,
                         1.333, 1.5, 2, 3, 4, 5, 10, 10000])
    # tvt_cuts = np.array([0, 0.1, 0.2, 0.5, 1, 2, 5, 10, 10000])
    for feat in tvt_feats:
        df[feat] = pd.cut(df[feat], tvt_cuts)
def compress_matrix(matrix, nrows=None, ncols=None):
    '''Compress a matrix to a new number of rows/columns.

    Cells in the matrix are collapsed by averaging. Assumes matrix is sorted.

    Parameters
    ----------
    matrix : pandas.DataFrame
        matrix to be compressed. Index(s) must be numeric.
    nrows, ncols : int
        Number of rows/columns in the output matrix. If ``None`` then will be
        same as input matrix.

    Returns
    -------
    pandas.DataFrame
        If `nrows`/`ncols` is ``None`` then the corresponding index will be
        unchanged. Otherwise, index will be integer 0 to `nrows`/`ncols`.
    '''
    if ncols:
        groups, bins = pd.cut(matrix.columns.values, ncols,
                              retbins=True, labels=False)
        groups = bins[groups]
        matrix = matrix.groupby(groups, axis=1).mean()

    if nrows:
        groups = pd.cut(range(matrix.shape[0]), nrows, labels=False)
        matrix = matrix.groupby(groups).mean()

    return matrix
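# A brief usage sketch for compress_matrix (not from the original source), assuming a
# numeric-indexed DataFrame; the sizes are illustrative.
import numpy as np
import pandas as pd

mat = pd.DataFrame(np.random.rand(100, 40),
                   index=np.arange(100),
                   columns=np.linspace(0.0, 1.0, 40))

small = compress_matrix(mat, nrows=10, ncols=8)
print(small.shape)  # rows collapsed into 10 bins and columns into 8, cells averaged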
def add_digitized_columns_given_input_filter(df_orig, columns_list, cut_point_list, based_on_filter, quantile_cut_point_format= True, digitized_columns_names=[]): df = df_orig.copy() filter_name = '*'.join(['_Digital'] +based_on_filter) if not digitized_columns_names: digitized_columns_names = map(lambda x: x + filter_name, columns_list) print digitized_columns_names if not based_on_filter: if quantile_cut_point_format: for k,col in enumerate(columns_list): df[digitized_columns_names[k]] = cut_modified(df[col], cut_point_list ) else: for k,col in enumerate(columns_list): df[digitized_columns_names[k]] = pd.cut(df[col], cut_point_list, labels =range(len(cut_point_list)-1)) else: df_groups = df.groupby(based_on_filter) if quantile_cut_point_format: for k,col in enumerate(columns_list): #df[digitized_columns_names[k]] = df_groups[col].transform(lambda x: pd.qcut(x,cut_point_list, #labels =digital_vals)) df[digitized_columns_names[k]] = df_groups[col].transform(lambda x: cut_modified(x,cut_point_list)) else: for k,col in enumerate(columns_list): df[digitized_columns_names[k]] = df_groups[col].transform(lambda x: pd.cut(x,cut_point_list, labels =range(len(cut_point_list)-1))) return df, digitized_columns_names
def one2two(file_in=PATH_FILE_OUT, file_out=PATH_FILE_FINAL): data = pd.read_pickle(file_in)['close'] data = data.reshape(-1, 24) data = np.array([data[i:i + 24] for i in range(data.shape[0] - 24 + 1)]) data_s = { 'open_price': np.array([data[i][0][0] for i in range(data.shape[0] - 1)]), 'close_price': np.array([data[i][int(NUM_PIX / 24) - 1][23] for i in range(data.shape[0] - 1)]), 'max_price': np.array([data[i].max() for i in range(data.shape[0] - 1)]), 'min_price': np.array([data[i].min() for i in range(data.shape[0] - 1)]), 'mean_price': np.array([data[i].mean() for i in range(data.shape[0] - 1)]), 'median_price': np.array([np.median(data[i]) for i in range(data.shape[0] - 1)]), 'buy_or_sell': np.array( [int(data[i + 1][int(NUM_PIX / 24) - 1][23] > data[i + 1][0][0]) for i in range(data.shape[0] - 1)]), 'change': np.array( [(data[i + 1][int(NUM_PIX / 24) - 1][23] - data[i + 1][0][0]) / data[i + 1][int(NUM_PIX / 24) - 1][23] * 100 for i in range(data.shape[0] - 1)])} data_s = pd.DataFrame(data_s) bins = [-100, -5, -4, -3, -2, -1.5, -1, - 0.5, 0, 0.5, 1, 1.5, 2, 3, 4, 5, 100] labels = [-8, -7, -6, -5, -4, -3, -2, -1, 1, 2, 3, 4, 5, 6, 7, 8] data_s['change_D_16'] = pd.cut(data_s['change'], bins, labels=labels) bins = [-100, -5, -2, 0, 2, 5, 100] labels = [-3, -2, -1, 1, 2, 3] data_s['change_D'] = pd.cut(data_s['change'], bins, labels=labels) data = data.reshape(len(data), NUM_PIX) np.save(file_out[0], data[:len(data) - 1]) data_s.to_pickle(file_out[1])
def cleaneddf(no_bins=0): trainpath = 'Titanic/train.csv' testpath = 'Titanic/test.csv' traindf = pd.read_csv(trainpath) testdf = pd.read_csv(testpath) #discretise fare if no_bins == 0: return [cleandf(traindf), cleandf(testdf)] traindf = cleandf(traindf) testdf = cleandf(testdf) bins_and_binned_fare = pd.qcut(traindf.Fare, no_bins, retbins = True) bins = bins_and_binned_fare[1] traindf.Fare = bins_and_binned_fare[0] testdf.Fare = pd.cut(testdf.Fare, bins) #discrete age bins_and_binned_age = pd.qcut(traindf.Age, no_bins, retbins = True) bins = bins_and_binned_age[1] traindf.Age = bins_and_binned_age[0] testdf.Age = pd.cut(testdf.Age, bins) #create a file for kaggle predictiondf = pd.DataFrame(testdf['PassengerId']) predictiondf['Survived']=[0 for x in range(len(testdf))] predictiondf.to_csv('Titanic/prediction.csv', index = False) return [traindf, testdf]
def make_continuous_bar_source(df, x_field, y_field='None', df_orig=None, agg='count'): """Makes discrete, then creates representation of the bars to be plotted. Args: df (DataFrame): contains the data to be converted to a discrete form x_field (str): the column in df that maps to the x dim of the plot y_field (str, optional): the column in df that maps to the y dim of the plot df_orig (DataFrame, optional): original dataframe that the subset ``df`` was generated from agg (str, optional): the type of aggregation to be used Returns: ColumnDataSource: aggregated, discrete form of x,y values """ # Generate dataframe required to use the categorical bar source function idx, edges = pd.cut(x=df[x_field], bins=8, retbins=True, labels=False) labels, edges = pd.cut(x=df[x_field], bins=8, retbins=True) centers = pd.rolling_mean(edges, 2)[1:] # store new value of x as the bin it fell into df['centers'] = centers[idx] df['labels'] = labels # After making it discrete, create the categorical bar source return make_categorical_bar_source(df, 'labels', y_field, df_orig, agg)
def calc_p_values(data, gt1_name, gt2_name, stat_colname=None, num_bins=50, bin_how='mean', ): if stat_colname is None: raise ValueError("you must explicitly set stat_colname (try 'maxWingAngle')") data.index = data.index.astype(np.int64) #LAZY DANNO. DROP TIMESTAMPS FOR BINNING. data['synced_ns'] = data.index df_ctrl = data[data.group == gt1_name][['FlyID', stat_colname, 'synced_ns']] df_exp = data[data.group == gt2_name][['FlyID', stat_colname, 'synced_ns']] align_start = df_ctrl.index.min() dalign = df_ctrl.index.max() - align_start p_values = DataFrame() if bin_how=='mean': bin_func = np.mean elif bin_how=='median': bin_func = np.median bins = np.linspace(0,dalign,num_bins+1) + align_start binned_ctrl = pd.cut(df_ctrl.index, bins, labels= bins[:-1]) binned_exp = pd.cut(df_exp.index, bins, labels= bins[:-1]) for x in binned_ctrl.levels: test1_full_dataset = df_ctrl[binned_ctrl == x] test2_full_dataset = df_exp[binned_exp == x] bin_start_time = test1_full_dataset['synced_ns'].min() bin_stop_time = test1_full_dataset['synced_ns'].max() test1 = [] for obj_id, fly_group in test1_full_dataset.groupby('FlyID'): test1.append( bin_func(fly_group[stat_colname].values) ) test1 = np.array(test1) test2 = [] for obj_id, fly_group in test2_full_dataset.groupby('FlyID'): test2.append( bin_func(fly_group[stat_colname].values) ) test2 = np.array(test2) try: hval, pval = kruskal(test1, test2) except ValueError as err: pval = 1.0 dftemp = DataFrame({'Bin_number': x, 'P': pval, 'bin_start_time':bin_start_time, 'bin_stop_time':bin_stop_time, 'name1':gt1_name, 'name2':gt2_name, 'test1_n':len(test1), 'test2_n':len(test2), }, index=[x]) p_values = pd.concat([p_values, dftemp]) return p_values
def make_color_grouped_scatter_plot(data_frame, x_name, y_name, color_by, filename, colormap, x_function = 'dummy', y_function = 'dummy', color_function = 'dummy', legend = False, colorbar = True): ### Originally created for issue_21 def dummy(a): return a data_frame = data_frame.copy() p = Ppl(colormap, alpha=1) fig, ax = plt.subplots(1) #ax.set_autoscale_on(False) ax.set_xlim([eval(x_function)(min(data_frame[x_name])), eval(x_function)(max(data_frame[x_name]))]) ax.set_ylim([eval(y_function)(min(data_frame[y_name])), eval(y_function)(max(data_frame[y_name]))]) x_label = x_name.capitalize().replace('_', ' ') if x_function == 'log': x_label += ' (log)' y_label = y_name.capitalize().replace('_', ' ') if y_function == 'log': y_label += ' (log)' ax.set_xlabel(x_label) ax.set_ylabel(y_label) ax.xaxis.get_major_formatter().set_powerlimits((0, 1)) ax.yaxis.get_major_formatter().set_powerlimits((0, 1)) # Show the whole color range n_intervals = len(colormap.colors) if color_function == 'log': bins = np.logspace(np.log10( data_frame[color_by].min()), np.log10(data_frame[color_by].max()), n_intervals + 1, base = 10) else: bins = np.linspace(eval(color_function)(data_frame[color_by].min()), eval(color_function)(data_frame[color_by].max()), n_intervals + 1) data_frame['groups'] = pandas.cut(data_frame[color_by], bins=bins, labels = False) groups = pandas.cut(data_frame[color_by], bins=bins) bounds = [] for g in range(n_intervals): x = eval(x_function)(data_frame[data_frame.groups == g][x_name]) y = eval(y_function)(data_frame[data_frame.groups == g][y_name]) p.scatter(ax, x, y, label=str(groups.levels[g]), s = 5, linewidth=0) if legend: p.legend(ax, loc=0) #ax.set_title('prettyplotlib `scatter` example\nshowing default color cycle and scatter params') bounds = bins if colorbar: cmap = p.get_colormap().mpl_colormap norm = mpl.colors.BoundaryNorm(bounds, cmap.N) #ax2.set_ylabel(color_by.capitalize().replace('_', ' '), rotation='horizontal') #ax2.xaxis.get_major_formatter().set_powerlimits((0, 1)) #ax2.yaxis.get_major_formatter().set_powerlimits((0, 1)) ax2 = fig.add_axes([0.9, 0.1 , 0.03, 0.8]) cbar = mpl.colorbar.ColorbarBase(ax2, cmap=cmap, spacing='proportional', ticks=bounds, norm=norm, alpha=1, orientation='vertical') #cbar.ax.set_xticklabels(map(lambda x: '%.3g'%x, bounds))# vertically oriented colorbar cbar.ax.set_yticklabels([])# vertically oriented colorbar #for j, lab in enumerate(map(lambda lower, upper: '%.3g~%.3g'%(lower, upper), bounds[:-1], bounds[1::])): cbar.ax.text(0,1.02, '%.3g'%max(map(eval(color_function), bounds))) #for j, lab in enumerate(map(lambda upper: '< %.3g'%upper, bounds[1::])): # cbar.ax.text(.5, (2 * j + 1) / 8.0, lab, ha='center', va='center', rotation='vertical') #cbar.ax.set_xticklabels([str(int(t)) for t in bounds])# vertically oriented colorbar if color_function == 'log': label = color_by.capitalize().replace('_', ' ') + ' (log)' else: label = color_by.capitalize().replace('_', ' ') cbar.ax.set_ylabel(label, rotation='vertical') fig.savefig(filename) return ax, fig
def twoway(df=df):
    df['num_years_bucket'] = pd.cut(df['num_years'], bins=range(0, 31, 5),
                                    right=False, retbins=False)
    df['num_moves_bucket'] = pd.cut(df['num_moves'], bins=range(0, 31, 5),
                                    right=False, retbins=False)
    output = pd.DataFrame(df.groupby(['num_moves_bucket', 'num_years_bucket']).size()).reset_index()
    output = output.pivot(index='num_moves_bucket', columns='num_years_bucket', values=0)
    return output
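# A quick usage sketch for twoway (not from the original source), assuming a DataFrame with
# integer num_years and num_moves columns in the 0-29 range.
import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
toy = pd.DataFrame({'num_years': rng.integers(0, 30, 500),
                    'num_moves': rng.integers(0, 30, 500)})
print(twoway(toy))  # counts of rows in each (moves bucket, years bucket) cell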
def binned_color_code_return_probability(train, test, deviations=1.0) -> ( pd.DataFrame, pd.DataFrame): """Bin the colorCode column in training and test using return probabilities of training data. Notes ----- This is to deal with unknown colorCodes (CC's) in the target set by binning the CC range. The binning considers outlier CC's by keeping them separate, 1-sized bins. Outliers are CC's whose return probability is over one standard deviation away from the mean. For our training data (mean: 0.548, std: 0.114) colorCode c is an is an outlier if retProb(c) < 0.434 || 0.662 < retProb(c), given deviations = 1. Parameters ---------- train : pd.DataFrame test : pd.DataFrame See apply_return_probs deviations : float Number of standard deviations a return probability has to differ from the mean to be considered an outlier. """ color_code_min = 0 color_code_max = 9999 # Calculate return probability for each colorCode color_code_ret_probs = (train .groupby('colorCode')['returnQuantity'] .apply(group_return_probability)) # Reindex those values to resemble to distribution in the training set row_ret_probs = color_code_ret_probs.reindex(train['colorCode']) # Calculate mean and minimum mean distance mean = row_ret_probs.mean() diff = row_ret_probs.std() * deviations mean_distances = color_code_ret_probs.sub(mean).abs() # iterate over colorCodes and respective mean distances to collect bins bins = [color_code_min] for cc, mean_distance in mean_distances.items(): if mean_distance > diff: # add the colorCode as 1-sized bin (current cc and cc + 1) # if the last colorCode was added, don't add the current cc, only cc + 1. if bins[-1] != cc: bins.append(cc) bins.append(cc + 1) bins.append(color_code_max + 1) # Assign bins to each row in test and training data train['binnedColorCode'] = pd.cut(train['colorCode'], bins, right=False, labels=False) test['binnedColorCode'] = pd.cut(test['colorCode'], bins, right=False, labels=False) train, test = apply_return_probs(train, test, 'binnedColorCode', 'colorReturnProb') # Test set colorCodes that are bigger than any colorCode in the training set fall into a # category that has no returnProbability. Impute that bin with the mean retProb. test['colorReturnProb'] = test['colorReturnProb'].fillna(mean) return train, test
def reduce_data_to_field(self, data, xkey='x', ykey='y', reducer=np.median, num_bins=1, **kwargs): """Take data and bin by two of its coordinates (default focal plane coordinates x and y). Then, in each bin, apply the reducer function. Parameters ---------- data : dataframe Pandas dataframe that has xkey and ykey columns. xkey, ykey : strings, default 'x' and 'y' Keys by which the data are binned. These keys must be in the data, or else problems will arise! reducer : function Function that takes set of data and returns a number. After the data is binned, one applies this function to the data in a bin. So if one sets reducer=np.mean, then the resultant data is a two dimensional histogram. num_bins : int, default 1 Number of bins for the focal plane. If less than six, then the number of bins is a sort of proxy for the number of divisions of a chip. Default is 2 bins per chip. This implies that num_bins < 6 does not really make sense for xkey,ykey neq x,y Returns ------- field : dataframe Dataframe binned by x and y coordinates. bins_x, bins_y : arrays Arrays of the bins used. Notes ----- data needs to have 'x' and 'y' keys! """ x = data[xkey] y = data[ykey] if num_bins < 6: bins_x, bins_y = self.edges(num_bins) else: bins_x = np.linspace(np.min(x), np.max(x), num_bins) bins_y = np.linspace(np.min(y), np.max(y), num_bins) groups = data.groupby([pd.cut(x, bins_x), pd.cut(y, bins_y)]) field = groups.aggregate(reducer) # also get the count counts = groups[xkey].aggregate('count').values # filter out nanmins on x and y field = field[field[xkey].notnull() & field[xkey].notnull()] # counts already filtered out notnull so let's try! field['N'] = counts return field, bins_x, bins_y
def run_test2(): orig_animals = ['cat', 'dog', 'mouse'] animals = orig_animals * 3 raw_data = { 'animal' : animals, 'score' : get_rand_num_array(len(animals)) } # make DataFrame # df = pd.DataFrame(raw_data, columns = ['animal', 'score']) print '-' * 10 print df print '-' * 10 #return # Create array for bins # bins = get_bin_list(step=20, low_num=0, high_num=100) # For each score assign it to a bin # labels = pd.cut(df['score'], bins) # Same as above but adding the bin value as a column to the DataFrame # df['bin_label'] = pd.cut(df['score'], bins) print type(df) print df.describe print '-' * 10 from collections import Counter c = Counter(df['bin_label']) print '-' * 10 print c vcounts = pd.value_counts(df['bin_label']) print vcounts #print 'by_bin', by_bin print '-' * 10 vcounts = df['bin_label'].value_counts() d = vcounts.to_dict() keys = d.keys() keys.sort() for k in keys: print k, d[k], type(k) return # Show the count in each bin # vc_series = pd.value_counts(df['bin_label']) print '\n', 'vc_series', vc_series print '-' * 10 print vc_series.axes import ipdb; ipdb.set_trace()
payer_enc['START']).dt.days >= 21] #----- Calculate coverage payer_agg = payer_rqd[['PATIENT', 'PAYER_COVERAGE', 'TOTAL_CLAIM_COST']] payer_agg = payer_agg.groupby(['PATIENT'], as_index=False).agg({ "PAYER_COVERAGE": "sum", "TOTAL_CLAIM_COST": "sum" }) payer_agg['COVERAGE_PCT'] = (payer_agg['PAYER_COVERAGE']) / ( payer_agg['TOTAL_CLAIM_COST']) bins_p = [-10, 0.00001, 0.5, 0.80, 11] labels_p = ['NO COVERAGE', 'LOW', 'MED', 'HIGH'] payer_agg["PAYERBUCKET"] = pd.cut(payer_agg['COVERAGE_PCT'], bins=bins_p, labels=labels_p) payer_agg = pd.get_dummies(data=payer_agg, columns=['PAYERBUCKET']) payer_agg = payer_agg.drop(['PAYER_COVERAGE', 'TOTAL_CLAIM_COST'], axis=1) payer_agg = payer_agg.drop_duplicates() #--- Caculate latest payer payer_name = pd.merge(payer_t, payer_df1, left_on='PAYER', right_on='Id') payer_year = payer_name.groupby(['PATIENT'], as_index=False).agg({"END_YEAR": "max"}) payer_name2 = pd.merge(payer_name, payer_year, left_on=['PATIENT', 'END_YEAR'], right_on=['PATIENT', 'END_YEAR']) payer_name2 = payer_name2.drop(
def cal_ic(index, begt, endt, cycle, kind, **kargs): chosen_ = kargs.get('chosen', []) sector = kargs.get('sector', '') mode = kargs.get('mode', 'all') #date_list = [parse(str(x)).strftime('%Y-%m-%d') for x in get_ts_td(begt,endt,cycle)] date_list = [ parse(str(x)).strftime('%Y-%m-%d') for x in get_tc_td(begt, endt) ] #import pdb;pdb.set_trace() ic_dict = {} if len(sector): sector_stks_dict = load_zzhy_all(DATA_PATH) small_comb_num = 5 big_comb_num = 6 for i in range(small_comb_num, big_comb_num): if not len(chosen_): chosen = [';'.join(x) for x in list(combinations(kind, i))] else: chosen = chosen_ for factors in list(combinations(kind, i)): bbb = ';'.join(factors) if bbb.split(';') not in [x.split(';') for x in chosen]: continue for f in factors: if i > 1: ic_dict[';'.join(factors)] = [''] * (len(date_list) - 1) else: ic_dict[f] = [''] * (len(date_list) - 1) for x in range(0, len(date_list) - 1): INTdate = int(date_list[x].replace('-', '')) print(INTdate) ''' if not len(sector): factor = get_fin(INTdate) else: factor = get_fin(INTdate,sector=sector) ''' factor = get_fin(INTdate, sector=sector, sector_stks_dict=sector_stks_dict) factor = factor.set_index('代码') #import pdb;pdb.set_trace() #factor = score_factors(factor,VALUE+PROFIT+TRADE+GROWTH_,GROWTH) factor = score_factors(factor, VALUE + PROFIT + GROWTH_, GROWTH) #import pdb;pdb.set_trace() for i in range(small_comb_num, big_comb_num): if not len(chosen_): chosen = [';'.join(x) for x in list(combinations(kind, i))] else: chosen = chosen_ for factors in list(combinations(kind, i)): bbb = ';'.join(factors) if bbb.split(';') not in [x.split(';') for x in chosen]: continue score = [] for f in factors: f_score = factor[[f]] if not len(score): score = f_score else: #import pdb;pdb.set_trace() score = score.join(f_score) score = score.mean(axis=1) score = score.dropna() score.name = 'f' ret = factor[['下一个月涨幅']] ret = ret.join(score) ret = ret.dropna() #import pdb;pdb.set_trace() fut_ret_rank = pd.cut(ret['下一个月涨幅'], 10, labels=False) + 1 factor_rank = pd.cut(ret['f'], 10, labels=False) + 1 ic = stats.spearmanr(fut_ret_rank, factor_rank)[0] #import pdb;pdb.set_trace() if i > 1: ic_dict[';'.join(factors)][x] = ic else: #import pdb;pdb.set_trace() ic_dict[f][x] = ic #import pdb;pdb.set_trace() ic_df = pd.DataFrame(ic_dict) t_stat, p_value = stats.ttest_1samp(ic_df, 0) #import pdb;pdb.set_trace() sorted_columns = [ '1/PFCF', 'EBITDA/EV', 'S/EV', '1/PE', '1/PB', '1/PS', '1/POP', 'CROIC', 'ROIC', 'ROIC1', 'EP', 'EBITDA-资本支出/IC', 'ROE', 'ROE(扣除)', 'ROA', 'FCF/营业收入', '毛利/净资产', '毛利率', '净利率', '销售现金比', '现金营业收入比', 'delta毛利率', 'deltaROE', '负债/投入资本', '外部融资额/总资产', '资产负债率', '有形净值债务率', '经营现金净流入/资本支出', '折旧/投入资本', '长期负债变化率', '近两年平均资本支出增长率', 'delta存货周转率', 'delta应收账款周转率', 'TTMFCF增长率', 'TTM净利润增长率', 'TTM营业利润增长率', 'TTM营业收入增长率', '最新一季净利润增长率', '最新一季营业利润增长率', '最新一季营业收入增长率', '近一个月跌幅', 'SKEW', '5/60', '换手率', 'ILLIQ', '波动率', '目标收益率', '预期增长率', '预期PE', '预期PEG', 'Delta预期增长率', 'Delta预期增长率2', 'Delta预期当年净利润增长率2', '总市值' ] #import pdb;pdb.set_trace() IC = pd.DataFrame(ic_df.mean(), columns=['IC']) IC['STD'] = ic_df.std() IC['IR'] = IC['IC'] / ic_df.std() #import pdb;pdb.set_trace() IC['T'] = t_stat IC['P'] = p_value if not len(mode): alpha_factor = IC.sort_values(by='T', ascending=False) selected = alpha_factor selected.to_csv('IC_{0}_{1}_{2}_{3}_{4}_{5}_{6}.csv'.format( begt, endt, index, 'AllbyT', str(small_comb_num), str(big_comb_num), sector)) else: IC = IC.reindex_axis(kind) selected = IC selected.to_csv('IC_{0}_{1}_{2}_{3}_{4}_{5}_{6}.csv'.format( begt, endt, index, 'all', 
str(small_comb_num), str(big_comb_num), sector)) return selected
Fraudcheck.head()
Fraudcheck.columns

Le = preprocessing.LabelEncoder()
Fraudcheck['undergrad'] = Le.fit_transform(Fraudcheck['Undergrad'])
Fraudcheck['marital_Status'] = Le.fit_transform(Fraudcheck['Marital_Status'])
Fraudcheck['urban'] = Le.fit_transform(Fraudcheck['Urban'])

# Drop the original (un-encoded) columns
Fraudcheck.drop(["Undergrad"], inplace=True, axis=1)
Fraudcheck.drop(["Marital_Status"], inplace=True, axis=1)
Fraudcheck.drop(["Urban"], inplace=True, axis=1)

# Bin the continuous Taxable_Income into two categories: <= 30000 is "Risky", above is "Good"
bins = [-1, 30000, 100000]
Fraudcheck["Taxable_Income"] = pd.cut(Fraudcheck["Taxable_Income"], bins,
                                      labels=["Risky", "Good"])

colnames = list(Fraudcheck.columns)
predictors = colnames[1:6]  # inputs
target = colnames[0]        # output
X = Fraudcheck[predictors]
Y = Fraudcheck[target]

####### GridSearch
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_jobs=2, oob_score=True, n_estimators=1000,
                            criterion="entropy")
print('-' * 60)

# 4. RFM analysis
# 4.1 Data transformation
recency_value = sales_data['ORDERDATE'].groupby(
    sales_data.index).max()    # date of each customer's most recent order
frequency_value = sales_data['ORDERDATE'].groupby(
    sales_data.index).count()  # raw order frequency
monetary_value = sales_data['AMOUNTINFO'].groupby(
    sales_data.index).sum()    # raw total order amount

# 4.2 Compute the RFM scores
# compute the R, F and M scores separately
deadline_date = pd.Timestamp(2017, 1, 1)  # reference date used to measure how far back each order lies
r_interval = (deadline_date - recency_value).dt.days          # R interval (days since last order)
r_score = pd.cut(r_interval, 5, labels=[5, 4, 3, 2, 1])       # R score
f_score = pd.cut(frequency_value, 5, labels=[1, 2, 3, 4, 5])  # F score
m_score = pd.cut(monetary_value, 5, labels=[1, 2, 3, 4, 5])   # M score

# merge the R, F and M scores
rfm_list = [r_score, f_score, m_score]        # the three dimensions as a list
rfm_cols = ['r_score', 'f_score', 'm_score']  # column names for the three dimensions
rfm_pd = pd.DataFrame(np.array(rfm_list).transpose(), dtype=np.int32,
                      columns=rfm_cols, index=frequency_value.index)  # build the r/f/m score frame
print('RFM Score Overview:')
print(rfm_pd.head(4))
print('-' * 60)

# compute the total RFM score as a weighted sum
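# The snippet breaks off just before the weighted total is computed; a minimal sketch of one
# possible weighting (the weights below are assumptions, not taken from the source):
rfm_weights = {'r_score': 0.6, 'f_score': 0.3, 'm_score': 0.1}  # assumed weights
rfm_pd['rfm_total_score'] = (rfm_pd['r_score'] * rfm_weights['r_score']
                             + rfm_pd['f_score'] * rfm_weights['f_score']
                             + rfm_pd['m_score'] * rfm_weights['m_score'])
print(rfm_pd.sort_values('rfm_total_score', ascending=False).head())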
def calc_age_bins(self):
    self.Xy['age_bin'] = pd.cut(self.Xy.age,
                                bins=[0, 10, 20, 30, 40, 50, 60, np.inf])
1: 'Married', 2: 'Widowed', 3: 'Divorced', 4: 'Separated', 5: 'Never married', 6: 'Living with partner', 77: 'Refused', 99: 'dont know' }) da['DMDMARTL2'].value_counts() da["RIAGENDRx"] = da.RIAGENDR.replace({1: "Male", 2: "Female"}) a = da.groupby(da["RIAGENDRx"])['DMDMARTL2'].value_counts() da["agegrp"] = pd.cut(da.RIDAGEYR, [30, 40]) b = da.groupby([da["RIAGENDRx"], da['agegrp']])['DMDMARTL2'].value_counts() b.loc['Male', :].unstack() # __Q1a.__ Briefly comment on some of the differences that you observe between the distribution of marital status between women and men, for people of all ages. # __Q1b.__ Briefly comment on the differences that you observe between the distribution of marital status states for women between the overall population, and for women between the ages of 30 and 40. # __Q1c.__ Repeat part b for the men. # ## Question 2 # # Restricting to the female population, stratify the subjects into age bands no wider than ten years, and construct the distribution of marital status within each age band. Within each age band, present the distribution in terms of proportions that must sum to 1.
def Ks2(data, flag): # Bin que vai permitir agrupar os valores por score bin = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100] #Contar o total de ocorrências de bons e mals pagadores dfTotal = data.ALVO.value_counts().to_frame() dfTotal = dfTotal.rename(index={0: "Bom", 1: "Mal"}) totais = dfTotal.T #totais.at['ALVO','Bom'] e totais.at['ALVO','Mal'] #Contando por percentual de decil df = data.groupby(pd.cut(data.SCORE, bins=bin))['ALVO'].value_counts() # Desempilhando o Dataframe df = df.unstack() # Calculando o percentual do decil comparado ao valor total df["PercentualBom"] = (df[0] / totais.at['ALVO', 'Bom']) df["PercentualMal"] = (df[1] / totais.at['ALVO', 'Mal']) # Calculando o Cumulativo df["PercentualBomAcc"] = df['PercentualBom'].rolling(min_periods=1, window=10).sum() df["PercentualMalAcc"] = df['PercentualMal'].rolling(min_periods=1, window=10).sum() # Calculando o ks2 por decil df['KS2'] = abs(df["PercentualBomAcc"] - df["PercentualMalAcc"]) # Label label = [ "0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100" ] flag = flag.strip().lower() if flag == "no": # Texto da Tabela descrevendo gráfico desc = """O KS2 é uma métrica utilizada para sabermos quanto o modelo discrimina os bons dos maus clientes. Seu valor é a maior diferença das distribuições acumuladas dos dois públicos analisados. Quanto maior o KS2, melhor será a discriminação dos dois públicos pelo modelo em análise.""" #fig = go.Figure() # Fazendo os subplots para colocar a descrição e o gráfico na mesma imagem fig = make_subplots(rows=1, cols=3, specs=[[{ "type": "table" }, { "colspan": 2 }, None]]) # Add Table fig.add_trace(go.Table(header=dict(values=["Descrição"], font=dict(size=10), align="left"), cells=dict(values=[desc], align="left")), row=1, col=1) # Add linha de Bom fig.add_trace( go.Scatter(x=label, y=df.PercentualBomAcc, hovertemplate="%{x},%{y}", name="Bom")) # Add linha de Mal fig.add_trace(go.Scatter(x=label, y=df.PercentualMalAcc, hovertemplate="%{x},%{y}", name="Mal"), row=1, col=2) # Add linha de KS2 fig.add_trace(go.Scatter(x=label, y=df.KS2, name="KS2"), row=1, col=2) fig.update_xaxes(title_text="Faixa de Score", row=1, col=2) fig.update_yaxes(title_text="% População (AC)", row=1, col=2) fig.update_yaxes(tickformat=".3%", row=1, col=2) #Add Formatação do Gráfico fig.update_layout(title={ 'text': "KS2", 'y': 0.9, 'x': 0.5, 'xanchor': 'center', 'yanchor': 'top' }, ) fig.show() return fig elif flag == "yes": pre_fig = make_subplots(rows=1, cols=2, specs=[[{"colspan": 2}, None]]) # Add linha de Bom pre_fig.add_trace( go.Scatter(x=label, y=df.PercentualBomAcc, hovertemplate="%{x},%{y}", name="Bom")) # Add linha de Mal pre_fig.add_trace(go.Scatter(x=label, y=df.PercentualMalAcc, hovertemplate="%{x},%{y}", name="Mal"), row=1, col=1) # Add linha de KS2 pre_fig.add_trace(go.Scatter(x=label, y=df.KS2, name="KS2"), row=1, col=1) pre_fig.update_xaxes(title_text="Faixa de Score", row=1, col=1) pre_fig.update_yaxes(title_text="% População (AC)", row=1, col=1) pre_fig.update_yaxes(tickformat=".3%", row=1, col=1) #Add Formatação do Gráfico pre_fig.update_layout(title={ 'text': "KS2", 'y': 0.9, 'x': 0.5, 'xanchor': 'center', 'yanchor': 'top' }, ) pre_fig.show() desc = str(input("digite a descrição desejada: ")) # Fazendo os subplots para colocar a descrição e o gráfico na mesma imagem fig = make_subplots(rows=1, cols=3, specs=[[{ "type": "table" }, { "colspan": 2 }, None]]) # Add Table fig.add_trace(go.Table(header=dict(values=["Descrição"], font=dict(size=10), align="left"), 
cells=dict(values=[desc], align="left")), row=1, col=1) # Add linha de Bom fig.add_trace( go.Scatter(x=label, y=df.PercentualBomAcc, hovertemplate="%{x},%{y}", name="Bom")) # Add linha de Mal fig.add_trace(go.Scatter(x=label, y=df.PercentualMalAcc, hovertemplate="%{x},%{y}", name="Mal"), row=1, col=2) # Add linha de KS2 fig.add_trace(go.Scatter(x=label, y=df.KS2, name="KS2"), row=1, col=2) fig.update_xaxes(title_text="Faixa de Score", row=1, col=2) fig.update_yaxes(title_text="% População (AC)", row=1, col=2) fig.update_yaxes(tickformat=".3%", row=1, col=2) #Add Formatação do Gráfico fig.update_layout(title={ 'text': "KS2", 'y': 0.9, 'x': 0.5, 'xanchor': 'center', 'yanchor': 'top' }, ) fig.show() return fig else: raise " Flag invalida, por favor digite 'YES' ou 'NO' "
print(TopFin_df) #Bottom Five Schools BotFin_df = Place_df.tail(5) print(BotFin_df) #Math Scores by Grade MBG_df = Stud_df.groupby(["school_name", "grade"])[["math_score"]].mean() print(round(MBG_df, 2)) RBG_df = Stud_df.groupby(["school_name", "grade"])[["reading_score"]].mean() print(round(RBG_df, 2)) #Cutting the dataframe into intervals: 575-599, 600 to 629, 630 to 649, 650+ Fin_df["Spending Ranges"] = pd.cut( Fin_df["Per Student Budget"], [575, 599, 629, 649, 674], labels=["Below $600", "$600-$629", "$630-$649", "At Least $650"]) #scores by school spending BySpend_df = Fin_df.groupby(["Spending Ranges"])[[ "Avg Math Score", "Avg Reading Score", "% Passing Math", "% Passing Reading", "Overall Passing Rate" ]] print(round(BySpend_df.mean(), 2)) #Cutting the data into 3 groups by School Size Fin_df["Size Ranges"] = pd.cut(Fin_df["Total Students"], [0, 1499, 2999, 5000], labels=["Small", "Medium", "Large"]) BySize_df = Fin_df.groupby(["Size Ranges"])[[ "Avg Math Score", "Avg Reading Score", "% Passing Math", "% Passing Reading", "Overall Passing Rate"
def create_ml_100k_dataset(embed_dim=16): """加载数据""" base_path = '/home/mesie/python/recommend/recommend-learning/data/' rating_df = pd.read_csv( base_path + 'ml-100k/u.data', sep='\t', names=['user_id', 'movie_id', 'rating', 'timestamp']) user_df = pd.read_csv( base_path + 'ml-100k/u.user', sep='|', names=['user_id', 'age', 'gender', 'occupation', 'zip']) item_df = pd.read_csv(base_path + 'ml-100k/u.item', sep='|', encoding="ISO-8859-1", names=[ 'movie_id', 'title', 'release_date', 'video_release_date', 'url', 'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western' ]) data_df = pd.merge(rating_df, user_df, how="left") data_df = pd.merge(data_df, item_df, how="left") data_df['label'] = data_df['rating'].apply(get_label) # 处理age data_df['age'] = pd.cut( data_df['age'], bins=[0, 15, 25, 35, 45, 60, 100], labels=['0-15', '15-25', '25-35', '35-45', '45-60', '60-100']) # 处理电影类型 user_features = ['user_id', 'age', 'gender'] movie_features = [ 'movie_id', 'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western' ] # features = user_features + movie_features feature_max_idx = {} for feature in user_features + movie_features: lbe = LabelEncoder() data_df[feature] = lbe.fit_transform(data_df[feature]) feature_max_idx[feature] = data_df[feature].max() + 1 user_feat_cols = [] user_feat_cols.append( sparseFeature(feat='user_id', feat_num=feature_max_idx['user_id'], embed_dim=embed_dim)) user_feat_cols = user_feat_cols + [ sparseFeature(feat=uf, feat_num=feature_max_idx[uf]) for uf in ['age', 'gender'] ] item_feat_cols = [] item_feat_cols.append( sparseFeature(feat='movie_id', feat_num=feature_max_idx['movie_id'], embed_dim=embed_dim)) movie_type = [ 'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western' ] item_feat_cols = item_feat_cols + [ sparseFeature(feat=mt, feat_num=feature_max_idx[mt]) for mt in movie_type ] train, test = train_test_split(data_df, test_size=0.2) train_X = [{feat: train[feat].values for feat in user_features}, {feat: train[feat].values for feat in movie_features}] # train_X = [train[user_features].values.astype('int32'), train[movie_features].values.astype('int32'), # train['label'].values.astype('int32')] train_y = train['label'].values.astype('int32') test_X = [{feat: test[feat].values for feat in user_features}, {feat: test[feat].values for feat in movie_features}] # test_X = [test[user_features].values.astype('int32'), test[movie_features].values.astype('int32'), # test['label'].values.astype('int32')] test_y = test['label'].values.astype('int32') print(train_X) return user_feat_cols, item_feat_cols, train_X, train_y, test_X, test_y
def calc_age_bins(self):
    """Calculates age bins."""
    self.Xy["age_bin"] = pd.cut(self.Xy.age, bins=self.age_bins)
#from sklearn.svm import SVC #from sklearn.svm import LinearSVC from sklearn.decomposition import PCA from sklearn.naive_bayes import GaussianNB from sklearn.model_selection import train_test_split from sklearn.metrics import classification_report, confusion_matrix print('Loading dataframe...') t0 = time() with open('jokes.df.pickle', 'rb') as pickle_file: df = pickle.load(pickle_file) print("done in %0.3fs" % (time() - t0)) print('Flattening {} d2v lists ...'.format(df.d2v.values.shape[0])) X = np.reshape([r for r in df.d2v.values], (df.d2v.values.shape[0], 300)) y = pd.cut(df.score, [-1, 15, 50000], labels=['MEH', 'GOOD']) print(df.score.describe()) print("done in %0.3fs" % (time() - t0)) print('Splitting into a training and testing set ...') X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42) print("done in %0.3fs" % (time() - t0)) print("Projecting the input data on the d2v orthonormal basis (PCA'ing) ...") pca = PCA(n_components=32, svd_solver='randomized', whiten=True).fit(X_train) X_train_pca = pca.transform(X_train) X_test_pca = pca.transform(X_test) print("done in %0.3fs" % (time() - t0))
data['title'] = data['title'].fillna(0) data['title'] = data['title'].astype(int) data['title'] pd.value_counts(data['title']) # age - standardscaler data['age'].value_counts() data['age'].describe() data['age'].fillna(method='pad', inplace=True) np.bincount(data['age'].isnull()) # age_range - encoding bins = [0, 5, 10, 15, 20, 30, 40, 50, 60, 80, 100] labels = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'] data['age_range'] = pd.cut(data['age'], bins, labels=labels) data['age_range'] np.bincount(data['age_range'].isnull()) data.age_range.unique() sns.countplot(data['age_range'], data=data, hue=data['survived']) plt.close('all') # del data['age'] data['age_range'] # cabin-> cabin list -> encoding data['cabin'].unique() np.bincount(data['cabin'].isnull()) data['cabin'] data.loc[data.cabin.str[0] == 'A', 'cabinlist'] = 1 data.loc[data.cabin.str[0] == 'B', 'cabinlist'] = 2
parser = argparse.ArgumentParser() parser.add_argument("input", help="model TSV") parser.add_argument("faceted_output", help="faceted model fold change scatterplot (PDF, PNG, etc.)") parser.add_argument("combined_output", help="single-panel model fold change scatterplot (PDF, PNG, etc.)") parser.add_argument("year_range") parser.add_argument("viruses") parser.add_argument("predictors") parser.add_argument("sample") args = parser.parse_args() frequency_bins = np.linspace(0.0, 1.0, 5) df = pd.read_table(args.input) df["observed_ratio"] = df["observed_freq"] / df["initial_freq"] df["predicted_ratio"] = df["predicted_freq"] / df["initial_freq"] df["binned_initial_freq"] = pd.cut(df["initial_freq"], bins=frequency_bins) correlation_by_bin = [] mcc_by_bin = [] n_clades_by_bin = [] for freq, freq_df in df.groupby("binned_initial_freq"): correlation_by_bin.append(pearsonr(freq_df["observed_ratio"], freq_df["predicted_ratio"])[0]) # Calculate Matthew's correlation coefficient mcc_by_bin.append(get_matthews_correlation_coefficient_for_data_frame(freq_df)) n_clades_by_bin.append(freq_df.shape[0]) g = sns.FacetGrid(df, col="binned_initial_freq", col_wrap=2, height=4) g.map(sns.regplot, "observed_ratio", "predicted_ratio", fit_reg=False) g.add_legend()
[ train_data.Age[train_data.Pclass == i].plot.kde(bw_method=0.3, color=colors[i - 1]) for i in [1, 2, 3] ] plt.xlim((-20, 100)) plt.legend(["First class", "Second class", "Third class"]) plt.title("Density plot Age wrt Class of Travel") # Consider Age categories instead of specific ages train_data.Age = train_data.Age.fillna(-0.5) labels = ['Unknown', 'Babies', 'Children', 'Youth', 'Adults', 'Seniors'] bins = [-1, 0, 5, 15, 24, 65, np.inf] train_data['AgeCategories'] = pd.cut(train_data["Age"], bins, labels=labels) survival_rate_per_category = ( train_data.AgeCategories[train_data.Survived == 1].value_counts( normalize=True).sort_index() / train_data.AgeCategories[train_data.Survived == 0].value_counts( normalize=True).sort_index()) plt.subplot2grid((1, 3), (0, 2)) survival_rate_per_category.plot( kind="bar", alpha=0.6, color=["lightblue", "teal", "wheat", "beige", "grey", "lavender"]) plt.ylabel("survival rate") plt.title("Survival Rate per Age Category")
# In[4]: df = pd.read_csv ('spotify_songs.csv') # In[6]: lyrics = df['lyrics'].values.astype(str) # In[41]: y = pd.cut(df.valence,bins=[0,0.5,1],labels=[0,1]) # In[42]: lyrics_train, lyrics_test, y_train, y_test = train_test_split(lyrics, y, random_state=1) # In[44]: vectorizer = CountVectorizer() vectorizer.fit(lyrics_train)
import pandas as pd
import numpy as np

x = np.array([1, 7, 5, 4, 6, 3])
print(pd.cut(x, 3))
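# Two common variations on the same call (standard pandas behaviour), shown for context:
print(pd.cut(x, 3, labels=['low', 'mid', 'high']))  # human-readable labels instead of intervals
cats, edges = pd.cut(x, 3, retbins=True)            # retbins=True also returns the bin edges
print(edges)                                        # handy for reusing the same edges on new data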
for d in data:
    for i in range(0, 2):
        for j in range(0, 3):
            guess = d[(d['Sex'] == i) & (d['Pclass'] == j + 1)]['Age'].dropna()
            age_guess = guess.mean()
            ages[i, j] = int(age_guess / 0.5 + 0.5) * 0.5
    for i in range(0, 2):
        for j in range(0, 3):
            d.loc[(d.Age.isnull()) & (d.Sex == i) & (d.Pclass == j + 1), 'Age'] = ages[i, j]
    d['Age'] = d['Age'].astype(int)
print(train_data.head())

# Examine the survival rate among Age bands
train_data['AgeBand'] = pd.cut(train_data['Age'], 5)
print(train_data[['AgeBand', 'Survived']].groupby(
    ['AgeBand'], as_index=False).mean().sort_values(by='AgeBand', ascending=True))

# Replace Age with AgeBand ordinals
for d in data:
    d.loc[d['Age'] <= 16, 'Age'] = 0
    d.loc[(d['Age'] > 16) & (d['Age'] <= 32), 'Age'] = 1
    d.loc[(d['Age'] > 32) & (d['Age'] <= 48), 'Age'] = 2
    d.loc[(d['Age'] > 48) & (d['Age'] <= 64), 'Age'] = 3
    d.loc[d['Age'] > 64, 'Age'] = 4
print(train_data.head())

# Remove AgeBand feature
train_data = train_data.drop(['AgeBand'], axis=1)
fig4 = px.box(gss_clean,
              x='job_prestige',
              y='sex',
              color='sex',
              labels={
                  'job_prestige': 'Occupational Prestige',
                  'sex': ''
              })
fig4.update_layout(showlegend=False)
fig4.update(layout=dict(title=dict(x=0.5)))
fig4.show()

new_df = gss_clean[['income', 'sex', 'job_prestige']]
new_df['job_prestige_cat'] = pd.cut(new_df['job_prestige'],
                                    bins=6,
                                    labels=['1', '2', '3', '4', '5', '6'])
new_df = new_df.sort_values(by='job_prestige_cat')
clean_df = new_df.dropna(axis=0)

fig_bar = px.box(clean_df,
                 x='sex',
                 y='income',
                 color='sex',
                 facet_col='job_prestige_cat',
                 facet_col_wrap=2,
                 hover_data=['income'],
                 labels={
                     'sex': 'Sex',
                     'income': 'Income'
                 })
# load data
data_temp = data_total[data_total['channel'] == str(c + 1)]
#print(data_temp)

# prepare binning (bin_list)
photon_max = max(data_temp['intensity [photon]'])
print('photon_max: {}'.format(photon_max))
binsize = 1000
photon_bin_max = photon_max // binsize
print('photon_bin_max: {}'.format(photon_bin_max))
bin_list = list(range(0, (int(photon_bin_max) + 2) * binsize, binsize))
print(bin_list)

# prepare binned data
data_total_tmp = data_total
data_total_tmp['bins'] = pd.cut(data_total['intensity [photon]'], bins=bin_list)

# 1st group by bins
data_total_tmp = data_total_tmp.groupby(
    by=['channel', 'group', 'filename', 'bins']).size()

# reset index
data_total_tmp = data_total_tmp.reset_index()
data_total_tmp = data_total_tmp.rename(index=int, columns={0: 'counts'})

# 2nd group by
data_total_tmp_mean = data_total_tmp.groupby(
    by=['channel', 'group', 'bins']).mean()['counts']
data_total_tmp_sem = data_total_tmp.groupby(
    by=['channel', 'group', 'bins']).sem()['counts']

print('binned data, mean')
display(data_total_tmp_mean)
print('binned data, sem')
#    an abnormal value such as an age of 200 will not cause much interference to the model
# 3. LR is a generalized linear model with limited expressive power. After
#    discretization, each bin gets its own weight, which is equivalent to introducing
#    non-linearity and improves the fitting capacity of the model.
# 4. After discretization, feature crosses can be formed, which further improves the
#    expressive power: M + N variables become M x N variables, introducing yet more
#    non-linearity.
# 5. The model is more stable after the features are discretized. For example, a
#    user's age bucket does not change just because the user becomes one year older.
# LightGBM improves on XGBoost and adds data binning (histograms) at the same time,
# which improves the generalization of the model.
bin = [i * 10 for i in range(31)]
data['power_bin'] = pd.cut(data['power'], bin, labels=False)
print(data[['power_bin', 'power']].head())

# see the original data
print(data.shape)
print(data.columns)

data = data.drop(['creatDate', 'regDate', 'regionCode'], axis=1)

# check whether they have been deleted
print(data.shape)
print(data.columns)

from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
data['power'] = np.log(data['power'] + 1)
data['power'] = ((data['power'] - np.min(data['power'])) /
                 (np.max(data['power']) - np.min(data['power'])))
data['kilometer'] = ((data['kilometer'] - np.min(data['kilometer'])) /
                     (np.max(data['kilometer']) - np.min(data['kilometer'])))
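# To make point 3 above concrete, a minimal illustrative sketch (not part of the
# original pipeline): one-hot encoding the binned 'power' feature gives a linear
# model one weight per power range. The toy frame and target below are assumptions.
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
toy = pd.DataFrame({'power': rng.integers(1, 300, size=200),
                    'label': rng.integers(0, 2, size=200)})
toy['power_bin'] = pd.cut(toy['power'], [i * 10 for i in range(31)], labels=False)

X = pd.get_dummies(toy['power_bin'], prefix='power_bin')  # one indicator column per bin
clf = LogisticRegression(max_iter=1000).fit(X, toy['label'])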
# In[93]:

df.plot(kind='line', x=df.Genres, figsize=[9, 3])
plt.title('What movies do writers watch?')
plt.ylabel('count')
plt.xlabel('genres')

##### 10. Which age group rates which genre the most?

# In[101]:

# we are dividing people into 8 groups by age
labels = ['0-9', '10-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79']

# add a new column with the age group
data['Age_group'] = pd.cut(data['Age'], range(0, 81, 10), right=False, labels=labels)
print(data[['Age', 'Age_group']].drop_duplicates()[:20])

# In[95]:

# let's look at how age is distributed amongst our users
# call pandas' "hist" on the column to produce a histogram
data.Age.hist(bins=10)
plt.title("Distribution of users' ages")
plt.ylabel('count of users')
plt.xlabel('age')

# In[96]:

# now we are comparing ratings across age groups
# alternative: here is the numpy equivalent to R's ifelse
# targets_long['value'] = np.where(condition, targets_long['value'] * 1000, targets_long['value'])

# %% get advanced puf file
%time puf_2018 = pd.read_hdf(PUF_HDF)  # 1 sec
puf_2018.tail()
puf_2018.columns.sort_values().tolist()  # show all column names

pufsub = puf_2018.copy()  # new data frame
pufsub = pufsub.loc[pufsub["data_source"] == 1]  # ~7k records dropped

# create an IRS stub categorical variable
pufsub['IRS_STUB'] = pd.cut(
    pufsub['c00100'],
    IRS_AGI_STUBS,
    labels=list(range(1, len(IRS_AGI_STUBS))),
    right=False)
pufsub.columns.sort_values().tolist()  # show all column names

# %% get just the variables we want and create new needed variables
# get the list of target variables we need to have
plist = list(PUFTARG_XWALK.keys())
plist

# add names of variables we need for calculations or targeting
plist.append('pid')
plist.append('IRS_STUB')
plist.append('s006')
plist.append('MARS')

# remove names of variables we are going to create
def main():
    inpt_folder = "/mnt/scratch/Data/METData/"
    v_frac = 0.1

    ## Create a dummy model as it has all of the loader capabilities and ensures
    ## this is exactly what our networks see during training!
    model = Model.METNET_Agent('Tight', '/mnt/scratch/Saved_Networks/Presentation/')
    model.setup_network(True, 'XXX', None, 1, 5, False, 0, dev='cpu')
    model.inpt_list = ['Tight_Final_ET']
    model.setup_dataset(inpt_folder, v_frac, 32, 1024, 8096, 8, 'mag', 0, 0, 0, no_trn=True)

    ## The bin setup to use for the profiles
    n_bins = 40
    mag_bins = np.linspace(0, 400, n_bins + 1)
    exy_bins = [
        np.linspace(-50, 250, n_bins + 1),
        np.linspace(-150, 150, n_bins + 1)
    ]

    ## All the network outputs and targets for the batches will be combined into one list
    all_outputs = []
    all_targets = []

    ## The information to be saved in our dataframe: the truth et (for binning)
    ## and the performance metrics per bin
    met_names = ['Tru', 'Res', 'Lin', 'Ang']

    ## Configure pytorch, the network and the loader appropriately
    T.set_grad_enabled(False)
    model.net.eval()
    model.valid_loader.dataset.weight_off()

    ## Iterate through the validation set
    for batch in tqdm(model.valid_loader, desc='perfm', ncols=80, ascii=True):

        ## Get the network outputs and targets
        tight, targets = myUT.move_dev(batch[:-1], model.device)

        ## Undo the processing on Tight
        tight = (tight * model.net.inp_stats[1, :1] + model.net.inp_stats[0, :1]) / 1000
        outputs = T.cat([tight, T.zeros_like(tight)], dim=1)

        all_outputs.append(deepcopy(outputs))
        all_targets.append(deepcopy(targets))

    ## Combine the lists into single tensors
    all_outputs = T.cat(all_outputs)
    all_targets = T.cat(all_targets)

    ## Undo the normalisation on the outputs and the targets
    net_xy = all_outputs
    tru_xy = (all_targets * model.net.trg_stats[1] + model.net.trg_stats[0]) / 1000
    net_et = T.norm(net_xy, dim=1)
    tru_et = T.norm(tru_xy, dim=1)

    ## Calculate the performance metrics
    res = ((net_xy - tru_xy)**2).mean(dim=1)
    lin = (net_et - tru_et) / (tru_et + 1e-8)
    ang = T.acos(
        T.sum(net_xy * tru_xy, dim=1) /
        (net_et * tru_et + 1e-8))**2  ## Calculated using the dot product

    ## We save the overall resolution
    model.avg_res = T.sqrt(res.mean()).item()

    ## Combine the performance metrics into a single pandas dataframe
    combined = T.vstack([tru_et, res, lin, ang]).T
    df = pd.DataFrame(myUT.to_np(combined), columns=met_names)

    ## Make the profiles in bins of True ET using pandas cut and groupby methods
    df['TruM'] = pd.cut(df['Tru'], mag_bins, labels=(mag_bins[1:] + mag_bins[:-1]) / 2)
    profs = df.drop('Tru', axis=1).groupby('TruM', as_index=False).mean()
    profs['Res'] = np.sqrt(profs['Res'])  ## Res and Ang are RMSE measurements
    profs['Ang'] = np.sqrt(profs['Ang'])

    ## Save the performance profiles
    profs.to_csv(Path(model.save_dir, model.name, 'perf.csv'), index=False)

    ## Save the magnitude histograms
    h_tru_et = np.histogram(myUT.to_np(tru_et), mag_bins, density=True)[0]
    h_net_et = np.histogram(myUT.to_np(net_et), mag_bins, density=True)[0]
    myPL.plot_and_save_hists(Path(model.save_dir, model.name, 'MagDist'),
                             [h_tru_et, h_net_et],
                             ['Truth', 'Outputs'],
                             ['MET Magnitude [GeV]', 'Normalised'],
                             mag_bins,
                             do_csv=True)

    ## Save the ex and ey contour plots
    h_tru_xy = np.histogram2d(*myUT.to_np(tru_xy).T, exy_bins, density=True)[0]
    h_net_xy = np.histogram2d(*myUT.to_np(net_xy).T, exy_bins, density=True)[0]
    myPL.plot_and_save_contours(Path(model.save_dir, model.name, 'ExyDist'),
                                [h_tru_xy, h_net_xy],
                                ['Truth', 'Outputs'],
                                ['METx [GeV]', 'METy [GeV]'],
                                exy_bins,
                                do_csv=True)

    ## Get a dataframe from the class dict and write it out
    dict_df = pd.DataFrame.from_dict([model.get_dict()]).set_index('name')
    dict_df.to_csv(Path(model.save_dir, model.name, 'dict.csv'))
def loadAndTransDatasetsReal(self):
    """
    Load the sample data (randomly generated to simulate real data),
    bin the continuous features, and return the modified sample data.
    Also writes out a feature grouping file so that the group weights
    can be adjusted manually.
    """
    df = self.df_train.copy()

    # 1 Feature transformation: group and transform the feature values
    # 1.1 For categorical features, replace each original value with
    #     [feature name + '__' + value], e.g. '建筑业' becomes 'industry__建筑业'
    for f1 in [
            'industry', 'city', 'district', 'is_change_premises',
            'cacellation_reason', 'is_real_premise', 'is_executee',
            'is_financial_black_list', 'alert_level'
    ]:
        df[f1] = df[f1].map(lambda x: f1 + '__' + str(x))
        #print(df[f1].value_counts())

    # 1.2 For continuous features, bin the values first, then replace each binned
    #     value with [feature name + '__' + bin]
    for f2 in [
            'registered_capital', 'employees', 'total_assets', 'total_tax',
            'industry_index', 'fraud_score', 'period_abnormal',
            'rate_frozon_holdings', 'total_liabilities', 'total_debt',
            'judical_auction_amount', 'complaints_number_monthly',
            'rate_conciliation', 'sub_enterprises_number',
    ]:
        df[f2] = pd.cut(
            df[f2], 4, duplicates='drop').map(lambda x: f2 + '__' + str(x))
        #print(df[f2].value_counts())

    # 1.3 Manually rename the bins of individual features if needed
    #print(df['period_abnormal'].value_counts())
    #print(pd.cut(df['period_abnormal'], bins=5))

    del df['enterprise_id']

    # 2 Export the transformed dataset
    # 2.1 Export every group value of every feature to features_group_name.xlsx
    df_features_group = pd.DataFrame()
    for c in df.columns:
        #print(c)
        df1 = df[c].value_counts().reset_index()
        df1.columns = ['feature_group_name', 'counts']
        df1['interval'] = df1['feature_group_name'].map(
            lambda x: x.split('__')[1])
        #print(df1[['features', 'interval', 'counts']])
        df_features_group = pd.concat([
            df_features_group,
            df1[['feature_group_name', 'interval', 'counts']]
        ])
    #print(df_features_group)
    df_features_group.to_excel('datasets/features_group_name.xlsx', index=False)

    # 2.2 Store the binned and transformed feature values
    #print(df.shape, '\n', df.head(), '\n', df.columns)
    df.to_csv('datasets/features_grouped.csv', sep=';', index=False)

    # 3 Convert the transformed dataset into array format; the model uses these
    #   datasets for the actual training
    self.datasets_init = df.to_numpy().tolist()
    return self.datasets_init
def mono_bin(Y, X, n=max_bin):
    df1 = pd.DataFrame({"X": X, "Y": Y})
    justmiss = df1[['X', 'Y']][df1.X.isnull()]
    notmiss = df1[['X', 'Y']][df1.X.notnull()]
    r = 0

    # Reduce the number of quantile buckets until the bucket means are
    # monotonically related to the target (|Spearman r| == 1)
    while np.abs(r) < 1:
        try:
            d1 = pd.DataFrame({
                "X": notmiss.X,
                "Y": notmiss.Y,
                "Bucket": pd.qcut(notmiss.X, n)
            })
            d2 = d1.groupby('Bucket', as_index=True)
            r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)
            n = n - 1
        except Exception as e:
            n = n - 1

    # Fall back to a forced number of quantile-based bins if everything collapsed
    # into a single bucket
    if len(d2) == 1:
        n = force_bin
        bins = algos.quantile(notmiss.X, np.linspace(0, 1, n))
        if len(np.unique(bins)) == 2:
            bins = np.insert(bins, 0, 1)
            bins[1] = bins[1] - (bins[1] / 2)
        d1 = pd.DataFrame({
            "X": notmiss.X,
            "Y": notmiss.Y,
            "Bucket": pd.cut(notmiss.X, np.unique(bins), include_lowest=True)
        })
        d2 = d1.groupby('Bucket', as_index=True)

    d3 = pd.DataFrame({}, index=[])
    d3["MIN_VALUE"] = d2.min().X
    d3["MAX_VALUE"] = d2.max().X
    d3["COUNT"] = d2.count().Y
    d3["EVENT"] = d2.sum().Y
    d3["NONEVENT"] = d2.count().Y - d2.sum().Y
    d3 = d3.reset_index(drop=True)

    # Add a separate row for missing values, if any
    if len(justmiss.index) > 0:
        d4 = pd.DataFrame({'MIN_VALUE': np.nan}, index=[0])
        d4["MAX_VALUE"] = np.nan
        d4["COUNT"] = justmiss.count().Y
        d4["EVENT"] = justmiss.sum().Y
        d4["NONEVENT"] = justmiss.count().Y - justmiss.sum().Y
        d3 = d3.append(d4, ignore_index=True)

    d3["EVENT_RATE"] = d3.EVENT / d3.COUNT
    d3["NON_EVENT_RATE"] = d3.NONEVENT / d3.COUNT
    d3["DIST_EVENT"] = d3.EVENT / d3.sum().EVENT
    d3["DIST_NON_EVENT"] = d3.NONEVENT / d3.sum().NONEVENT
    d3["WOE"] = np.log(d3.DIST_EVENT / d3.DIST_NON_EVENT)
    d3["IV"] = (d3.DIST_EVENT - d3.DIST_NON_EVENT) * np.log(
        d3.DIST_EVENT / d3.DIST_NON_EVENT)
    d3["VAR_NAME"] = "VAR"
    d3 = d3[[
        'VAR_NAME', 'MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE',
        'NONEVENT', 'NON_EVENT_RATE', 'DIST_EVENT', 'DIST_NON_EVENT', 'WOE', 'IV'
    ]]
    d3 = d3.replace([np.inf, -np.inf], 0)
    d3.IV = d3.IV.sum()

    return d3
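# A hedged usage sketch (not from the original source). mono_bin expects a binary
# target and a numeric predictor; max_bin, force_bin, stats (scipy.stats) and algos
# are assumed to be defined elsewhere in the same file, as the function requires.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
x = pd.Series(rng.normal(size=1000))
x[rng.random(1000) < 0.05] = np.nan                         # some missing values
y = pd.Series((x.fillna(0) + rng.normal(size=1000) > 0.5).astype(int))

woe_table = mono_bin(y, x)                                  # one row per bucket (+ one for missings)
print(woe_table[['MIN_VALUE', 'MAX_VALUE', 'WOE', 'IV']])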
fig, ax = plt.subplots()
ax = sns.distplot(df['boneage'], bins=10)
ax.set(xlabel='Boneage (months)', ylabel='Density', title='Boneage distribution');

boneage_mean = df['boneage'].mean()  # mean age
boneage_div = 2 * df['boneage'].std()
print(boneage_mean, " ", boneage_div)
df['boneage_zscore'] = df['boneage'].map(lambda x: (x - boneage_mean) / boneage_div)
df.dropna(inplace=True)
df['gender'] = df['male'].map(lambda x: 1 if x else 0)
df['boneage_category'] = pd.cut(df['boneage'], 10)
df['gender']
df['boneage_category']
print(df)

raw_train_df, raw_valid_df = train_test_split(df,
                                              test_size=VALIDATION_FRACTION,
                                              random_state=2018)
raw_train_df.shape[0]
raw_valid_df.shape[0]
raw_valid_df.head()
print(df.loc[df['exists'] == False, :])
def main():
    logging.basicConfig(format="%(funcName)s: %(message)s", level=logging.INFO)
    output_dir = './german_credit_showcase/'
    sensitive_microdata_path = output_dir + 'german_credit_data.tsv'
    if not path.exists(output_dir):
        mkdir(output_dir)
    if not path.exists(sensitive_microdata_path):
        attributes = []
        codes = {}
        logging.info('Retrieving data documentation...')
        codes_file = request.urlopen(
            'https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.doc'
        ).readlines()
        logging.info('Retrieved')
        state = 'skip'
        attribute = ''
        code = ''
        value = ''
        for line in [x.decode('UTF-8').strip() for x in codes_file]:
            if line == '':
                state = 'skip'
            elif state == 'skip':
                if str.startswith(line, 'Att'):
                    state = 'attribute'
            elif state == 'attribute':
                attribute = line.strip()
                attributes.append(attribute)
                codes[attribute] = {}
                state = 'values'
            elif state == 'values':
                split = line.index(':') if ':' in line else -1
                if split != -1:
                    code = line[:split].strip()
                    value = line[split + 1:].strip().replace(':', '-')
                    codes[attribute][code] = value
                else:
                    codes[attribute][code] += ' ' + line.replace(':', '-')
        attributes.append('Credit rating')
        codes['Credit rating'] = {'1': 'Good', '2': 'Bad'}

        logging.info('Retrieving data file...')
        df = pd.read_csv(
            'https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data',
            sep=' ',
            index_col=False,
            header=None,
            names=attributes)
        logging.info('Retrieved')

        logging.info('Processing dataset...')
        values, labels = binValuesAndLabels(df['Duration in month'].max(), 12)
        df['Duration in month'] = pd.cut(df['Duration in month'], bins=values, labels=labels)
        values, labels = binValuesAndLabels(df['Credit amount'].max(), 2500)
        df['Credit amount'] = pd.cut(df['Credit amount'], bins=values, labels=labels)
        values, labels = binValuesAndLabels(df['Age in years'].max(), 20)
        df['Age in years'] = pd.cut(df['Age in years'], bins=values, labels=labels)
        df = df.astype(str).replace(to_replace=r'^nan$', value='', regex=True)
        for att in attributes:
            df[att] = df[att].replace(codes[att])
        del df['foreign worker']
        del df['Property']
        del df['Telephone']
        del df['Other debtors / guarantors']
        del df['Number of people being liable to provide maintenance for']
        del df['Other installment plans']
        del df['Savings account/bonds']
        del df['Present employment since']
        del df['Status of existing checking account']
        df.to_csv(sensitive_microdata_path, sep='\t', index=False)
        logging.info('Processed')

    config = {
        'parallel_jobs': 1,
        'memory_limit_pct': 90,
        'use_columns': [],
        'record_limit': -1,
        'reporting_length': 5,
        'reporting_precision': 2,
        'reporting_threshold': 2,
        'seeded': True,
        'sensitive_zeros': [],
        'prefix': 'credit',
        'output_dir': output_dir,
        'sensitive_microdata_path': sensitive_microdata_path,
        'sensitive_microdata_delimiter': '\t',
        'report_title': 'German Credit Data Showcase',
    }
    json.dump(config,
              open(path.join('.', config['prefix'] + '_config.json'), 'w'),
              indent=1)
    config['aggregate'] = True
    config['generate'] = True
    config['navigate'] = True
    config['evaluate'] = True
    config['reportable_aggregates_path'] = path.join(
        config['output_dir'], config['prefix'] + '_reportable_aggregates.tsv')
    config['synthetic_microdata_path'] = path.join(
        config['output_dir'], config['prefix'] + '_synthetic_microdata.tsv')
    config['sensitive_aggregates_path'] = path.join(
        config['output_dir'], config['prefix'] + '_sensitive_aggregates.tsv')
    runPipeline(config)
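# binValuesAndLabels is defined elsewhere in the original script. Judging only from
# how it is called above (a column maximum and a bin width in, bin edges plus labels
# for pd.cut out), a plausible stand-in could look like the sketch below. This is an
# assumption, not the showcase's actual helper.
def binValuesAndLabels(max_value, bin_size):
    # equal-width edges that cover [0, max_value], plus readable range labels
    values = list(range(0, int(max_value) + bin_size, bin_size))
    labels = ['{}-{}'.format(lo, hi) for lo, hi in zip(values[:-1], values[1:])]
    return values, labels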
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.metrics import roc_curve, auc
import imblearn
import xgboost as xgb  # conda install py-xgboost
import lime
import lime.lime_tabular
import warnings
from lime import submodular_pick
import matplotlib.pyplot as plt

data = pd.read_csv(os.path.join('..', 'result', 'all_right_wrong_i.csv'))
data.drop(['change_1m', 'change_3m'], axis=1, inplace=True)

# Binarise MRS_TX_3: values 0-2 become 0, values 3-7 become 1
data = data.assign(
    MRS_TX_3=pd.cut(data['MRS_TX_3'], [-1, 2, 7], labels=[0, 1]))

wrong_data = data[data.ctype == 0]
wrong_data.drop(['ctype'], axis=1, inplace=True)
id_wrong_data = wrong_data[['ICASE_ID', 'IDCASE_ID']]
y_wrong_data = wrong_data[['MRS_TX_3']]
x_wrong_data = wrong_data.drop(['ICASE_ID', 'IDCASE_ID', 'MRS_TX_3'], axis=1)

aucs = []
for i in range(1):
    X_train, X_test, y_train, y_test = train_test_split(x_wrong_data,
                                                        y_wrong_data,
                                                        test_size=0.3,
                                                        random_state=i,
                                                        stratify=y_wrong_data)
    # for train_index, test_index in KFold(n_splits=10).split(x_wrong_data):
# Create a new dataframe with unique columns for station names and station ids
df_unique = df.drop_duplicates(
    subset=['from_station_id', 'from_station_name'])
df_unique = df_unique[['from_station_id', 'from_station_name']]
df_unique = df_unique.reset_index(drop=True)

# Create a json file if needed
# with open('./frontend/file.json', 'w') as f:
#     json.dump(df_unique.to_json(r'./frontend/file.json',
#                                 orient="records"), f)

# Dropping rows with empty values
df.dropna(inplace=True)

# Convert the tripduration column into float type
df['tripduration'] = df['tripduration'].replace({
    ',': ''
}, regex=True).astype(float)

# Get the current year
now = datetime.now()

# Add a new column, age, to the dataframe
df['age'] = now.year - df['birthyear']

# Separate the ages into groups
df['age_group'] = pd.cut(df['age'], [0, 15, 30, 45, 200],
                         labels=['0-15', '16-30', '31-45', '46+'])

app.run(debug=True)
    'RU': 11162,
    'SA': 22865,
    'SE': 54608,
    'TH': 7274,
    'TR': 9370,
    'TW': 24827,
    'UA': 3592,
    'US': 65111,
    'VN': 2740,
    'ZA': 11300,
    'CO': 6500
})
df.gdpCountry = pd.to_numeric(df.gdpCountry, errors='coerce')
df['gdpCountry'] = df['gdpCountry'].fillna(11335)
df['gdpCountry'] = pd.cut(df.gdpCountry,
                          bins=[0, 29960, 50000, 150000],
                          labels=[0, 1, 2])
# End Yev Gdp

device_map = {
    'IPhone7': 0,
    'IPhone7Plus': 0,
    'IPhone8Plus': 0,
    'IPhone6S': 0,
    'IPhoneSE': 0,
    'IPhone8': 0,
    'IPhone6SPlus': 0,
    'IPadAir1G': 0,
    'IPhone5S': 0,
    'IPadMini3G': 0,
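# A side note in the same vein (not from the original app): pd.cut with integer
# labels returns a pandas Categorical, so a cast is usually needed when downstream
# code expects plain integers. The values below are just a few of the GDP figures above.
import pandas as pd

gdp = pd.Series([3592, 65111, 24827, 11300])
tier = pd.cut(gdp, bins=[0, 29960, 50000, 150000], labels=[0, 1, 2])
print(tier.dtype)         # category
print(tier.astype(int))   # plain integer tiers for downstream models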