def __init__(self, trainframe, classifier): '''abstract class for prediction. Parameters: `trainframe`: pandas.DataFrame Labeled data. Note that to conserve space this frame will be altered IN PLACE and should not be reused! `classifier`: scikit-learn classifier must support predict_proba ''' print 'Data example: ' print trainframe[0:10] print 'Outcomes overall:' print pd.value_counts(trainframe['OutcomeType'].values, sort=False) self.trainframe = trainframe self.classifier = classifier self.y = trainframe['OutcomeType'].copy() self.length = trainframe.shape[0] for label in 'AnimalID', 'Name', 'DateTime', 'OutcomeType', 'OutcomeSubtype': try: trainframe.drop(label, 1, inplace=True) except KeyError: pass self.clean_train_data()
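# A minimal instantiation sketch, assuming a hypothetical concrete subclass
# (here called ShelterPredictor) that implements clean_train_data(), and a toy
# frame with made-up columns; any scikit-learn classifier that exposes
# predict_proba, e.g. RandomForestClassifier, satisfies the stated contract.
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

frame = pd.DataFrame({'AnimalID': ['A1', 'A2'],
                      'OutcomeType': ['Adoption', 'Transfer'],
                      'AgeuponOutcome': ['1 year', '2 months']})
model = ShelterPredictor(frame, RandomForestClassifier())  # frame is altered in place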
def draw(domain_length):
    """
    Plot a bar chart of the top-level-domain counts.
    :param domain_length:
    """
    x_label = []
    x = pd.value_counts(pd.Series(domain_length)).index[:25]
    y = pd.value_counts(pd.Series(domain_length)).values[:25] / 1000.0
    for label in x:
        x_label.append(str(label))
    x = np.arange(len(y))
    fig = plt.figure()
    fig.add_subplot(111)
    plt.bar(x, y, align='center')
    x_min, x_max = x.min(), x.max()
    y_min, y_max = y.min(), y.max()
    plt.xlabel(u'顶级域名')
    plt.ylabel(u'域名个数(K)')
    plt.xlim(x_min - 1, x_max + 1)
    plt.ylim(y_min, y_max + 10)
    plt.xticks(x, x_label, rotation=50)
    # plt.grid(axis='y')
    plt.subplots_adjust(top=0.95, bottom=0.15, left=0.08, right=0.97)
    plt.savefig(u"各个顶级域名含有的域名数量", dpi=140)
    plt.show()
def slide_14():
    ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
    bins = [18, 25, 35, 60, 100]
    cats = pd.cut(ages, bins)
    print cats

    # use codes, not labels
    # print cats.labels
    print cats.codes

    # print cats.levels
    # use categories, not levels
    print cats.categories

    print pd.value_counts(cats)
    print pd.cut(ages, [18, 26, 36, 61, 100], right=False)

    group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
    print pd.cut(ages, bins, labels=group_names)

    data = np.random.rand(20)
    print data
    print pd.cut(data, 3, precision=2)

    data = np.random.randn(1000)
    cats = pd.qcut(data, 3)
    print cats
    print pd.value_counts(cats)
    print pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])
def statsFromRun(feat,DL,RW): left=pd.Series() light=pd.Series() next_waypoint=pd.Series() oncoming=pd.Series() right=pd.Series() for f in feat: left= left.add(pd.value_counts(f.left.ravel()), fill_value=0) light= light.add(pd.value_counts(f.light.ravel()), fill_value=0) next_waypoint= next_waypoint.add(pd.value_counts(f.next_waypoint.ravel()), fill_value=0) oncoming= oncoming.add(pd.value_counts(f.oncoming.ravel()), fill_value=0) right= right.add(pd.value_counts(f.right.ravel()), fill_value=0) fig, axes = plt.subplots(nrows=2, ncols=3,figsize=(14,6)) fig.suptitle( "Runs:{}".format(len(feat))) left.plot(kind='bar', title="Left",ax=axes[0,0]) light.plot(kind='bar', title="light",ax=axes[0,1]) next_waypoint.plot(kind='bar', title="next_waypoint",ax=axes[0,2]) oncoming.plot(kind='bar', title="oncoming",ax=axes[1,0]) right.plot(kind='bar', title="right",ax=axes[1,2]) axes[1,1].plot(DL,label="Deadlines") axes[1,1].plot(RW,label="Rewards") avgDist=3 axes[1,1].plot( #add a line to the graph representing the avg of all point within avgDist of the current run. [(np.mean(DL[i-avgDist:i+avgDist])+np.mean(RW[i-avgDist:i+avgDist]))/2 for i in range(len(DL))], label="Avg {:2.2f}".format( # use the last half avg in the label (np.mean(DL[len(DL)/2:len(DL)])+np.mean(RW[len(DL)/2:len(DL)]))/2)) #axes[1,1].xlabel('Run') axes[1,1].legend(loc=2) #axes[1,1].title("Deadline and Rewards per Run") plt.show() plt.close()
def read_file(filename): for df in pd.read_csv(filename, index_col='id',keep_default_na=False, na_values=[""], chunksize=50000): for ds in df: factor = pd.cut(df[ds], 10) print pd.value_counts(factor) break break
def feature_userBased(filename):
    df = pd.read_csv(filename, sep=',', header=0)
    csv_file = open(filename[0:-4] + '_user.csv', 'w')
    a = csv.writer(csv_file, delimiter=',')
    for user_id, group in df.groupby('user_id'):
        counts = pd.value_counts(group.behavior_type, sort=False)  # renamed from dict: don't shadow the builtin
        num_skim = 0
        num_collect = 0
        num_cart = 0
        num_buy = 0
        if 1 in counts:
            num_skim = int(counts[1])  # number of views
        if 2 in counts:
            num_collect = int(counts[2])  # number of favorites
        if 3 in counts:
            num_cart = int(counts[3])  # number of add-to-carts
        if 4 in counts:
            num_buy = int(counts[4])  # number of purchases
        conversion_rate = (num_buy * 1.0) / counts.sum()
        conversion_rate = float('%.4f' % conversion_rate)
        buy_group = group[group['behavior_type'] == 4]
        num_item_buy = (pd.value_counts(buy_group.item_id)).shape[0]  # number of distinct items this user bought
        item = [user_id, num_skim, num_collect, num_cart, num_buy,
                conversion_rate, num_item_buy]
        a.writerow(item)
def kftest(df, column, label, tag):
    df_tmp = df.loc[:, [column, label]]
    df_tmp = df_tmp.dropna()
    col = dict(pd.value_counts(df_tmp[column]))
    lab = dict(pd.value_counts(df_tmp[label]))
    f_obs = []
    f_exp = []
    obs_d = {}
    for i in col:
        for j in lab:
            obs = sum([1
                       if df_tmp.iloc[k][column] == i and df_tmp.iloc[k][label] == j
                       else 0 for k in range(len(df_tmp))])
            obs_d.setdefault(j, {})
            obs_d[j][i] = obs
            f_obs.append(obs)
            f_exp.append(1. * lab[j] / (sum(lab.values())) * col[i])
    statistic, p_value = chisquare(f_obs, f_exp, ddof=len(f_obs) - 2)
    str1 = "%d(%f),%d(%f),%d(%f),%f,%f" % (
        col[tag], col[tag] * 1. / sum(col.values()),
        obs_d[0][tag], 1. * obs_d[0][tag] / sum(obs_d[0].values()),
        obs_d[1][tag], 1. * obs_d[1][tag] / sum(obs_d[1].values()),
        statistic, p_value)
    return str1
def dist_by_group(grp, predictor): df = pd.concat([grp, predictor], axis=1) colnames = df.columns.values grouped = df.groupby(colnames[0]) agg_df = grouped.apply(lambda x: pd.value_counts(x.iloc[:,1])/sum(pd.value_counts(x.iloc[:,1]))) agg_df = agg_df.unstack() return agg_df
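# A minimal usage sketch for dist_by_group on hypothetical toy data; note that
# x.iloc[:, 1].value_counts(normalize=True) would give the same per-group
# proportions as the explicit value_counts ratio above.
import pandas as pd

grp = pd.Series(['a', 'a', 'b', 'b'], name='g')
pred = pd.Series(['x', 'y', 'y', 'y'], name='p')
print(dist_by_group(grp, pred))  # proportions of 'x'/'y' within each group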
def discretize_bins_quantiles(df, col_name, number_of_bins, verbose=False):
    new_col = 'bins_' + str(col_name)
    df[new_col] = pd.qcut(df[col_name], number_of_bins, labels=False)
    if verbose:
        print pd.value_counts(df[new_col])  # was data[new_col]: undefined name
    return new_col
def discretize_bins_values(df, col_name, bins, verbose=False):
    new_col = 'bins_' + str(col_name)
    df[new_col] = pd.cut(df[col_name], bins=bins, include_lowest=True, labels=False)
    if verbose:
        print pd.value_counts(df[new_col])  # was data[new_col]: undefined name
    return new_col
def run_test2():
    orig_animals = ['cat', 'dog', 'mouse']
    animals = orig_animals * 3
    raw_data = {'animal': animals,
                'score': get_rand_num_array(len(animals))}

    # make DataFrame
    #
    df = pd.DataFrame(raw_data, columns=['animal', 'score'])
    print '-' * 10
    print df
    print '-' * 10
    #return

    # Create array for bins
    #
    bins = get_bin_list(step=20, low_num=0, high_num=100)

    # For each score assign it to a bin
    #
    labels = pd.cut(df['score'], bins)

    # Same as above but adding the bin value as a column to the DataFrame
    #
    df['bin_label'] = pd.cut(df['score'], bins)
    print type(df)
    print df.describe()  # call the method; bare df.describe only prints the bound method
    print '-' * 10

    from collections import Counter
    c = Counter(df['bin_label'])
    print '-' * 10
    print c

    vcounts = pd.value_counts(df['bin_label'])
    print vcounts
    #print 'by_bin', by_bin
    print '-' * 10

    vcounts = df['bin_label'].value_counts()
    d = vcounts.to_dict()
    keys = d.keys()
    keys.sort()
    for k in keys:
        print k, d[k], type(k)
    return

    # Show the count in each bin
    #
    vc_series = pd.value_counts(df['bin_label'])
    print '\n', 'vc_series', vc_series
    print '-' * 10
    print vc_series.axes
    import ipdb; ipdb.set_trace()
def cont_var_to_disc(df, column_name, max_value, number_of_bins): '''function that can discretize a continuous variable''' df[column_name] = df[column_name].apply(lambda x: cap_values(x, max_value)) variable_name = column_name + "_bins" df[variable_name] = pd.cut(df[column_name], bins=number_of_bins, labels=False) print pd.value_counts(df[variable_name]) #This is useful if you take all of the features to do the model but not if specify features #df.drop(column_name, axis=1, inplace=True) return df
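# cap_values is referenced above but not shown; a minimal sketch of what such
# a helper might look like (hypothetical -- the original definition is not in
# this excerpt): clip a single value at max_value so pd.cut's bins are not
# stretched by outliers.
def cap_values(x, max_value):
    # return the value unchanged unless it exceeds the cap
    return min(x, max_value)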
def pd_01():
    obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'c'])
    uniques = obj.unique()
    print uniques
    uniques.sort()  # ndarray.sort() sorts in place and returns None, so sort first, then print
    print uniques
    print pd.value_counts(obj, sort=False)
    mask = obj.isin(['b', 'c'])
    print mask
    print obj[mask]
def queryVolatile(sym,startdate,dbconn): df=stockeod.getAllDataFrame(sym,startdate,dbconn) #df['chg']=pd.Series(np.random.randn(sLength), index=df.index) df['chg']=1 df['lschg']=1 p1=0.0 p0=0.0 for index, row in df.iterrows(): if index==0: df.ix[0,['chg']] = 0 else: p1 = df.ix[index,'sadjclose'] p0 = df.ix[index-1,'sadjclose'] pclose = df.ix[index,'sclose'] plow = df.ix[index,'slow'] chg = 100*(p1 / p0 - 1) #prev_close = pclose / (chg/100+1) lschg = abs((plow/pclose-1)*100) #print index,plow,prev_close,chg,lschg #lschg = lschg - chg #if chg>=0: # chg+=0.4 #else: # chg-=0.4 df.ix[index,'chg'] = chg #int(round(chg)) df.ix[index,'lschg'] = lschg #print df[['symbol','sdate','sopen','sadjclose','chg','lschg']] print df #bins = [-1000,-5,-3,-1,1,3,5,1000] #cats = pd.cut(df['chg'],bins) #cats.plot(kind='kde') fig = plt.figure() ax1 = fig.add_subplot(2,1,1) ax2 = fig.add_subplot(2,1,2) ax1.set_xlim([-20,20]) mybins=[-15,-5,-3,-1,1,3,5,15] cats = pd.cut(df['chg'],mybins) print "change percent\n", pd.value_counts(cats) shadowbins=[0,1,3,5,15] shadowcats = pd.cut(df['lschg'],shadowbins) print "shadow line change percent\n", pd.value_counts(shadowcats) df['chg'].hist(ax=ax1,bins=mybins) df['chg'].plot(ax=ax1,kind='kde') df['lschg'].hist(ax=ax2,bins=shadowbins) df['lschg'].plot(ax=ax2,kind='kde') plt.show()
def slide_21():
    import json
    db = json.load(open(FOODJSONPATH))
    print len(db)
    print db[0].keys()
    print db[0]['nutrients'][0]

    nutrients = DataFrame(db[0]['nutrients'])
    print nutrients[:7]

    info_keys = ['description', 'group', 'id', 'manufacturer']
    info = DataFrame(db, columns=info_keys)
    print info[:5]
    print pd.value_counts(info.group)[:10]

    print "Now assembling all of the nutrients"
    nutrients = []
    for rec in db:
        fnuts = DataFrame(rec['nutrients'])
        fnuts['id'] = rec['id']
        nutrients.append(fnuts)
    nutrients = pd.concat(nutrients, ignore_index=True)

    print "There are quite a few duplicates"
    print nutrients.duplicated().sum()
    nutrients = nutrients.drop_duplicates()

    print "Both info and nutrients have description and group columns, so rename them"
    col_mapping = {'description': 'food', 'group': 'fgroup'}
    info = info.rename(columns=col_mapping, copy=False)

    col_mapping = {'description': 'nutrient', 'group': 'nutgroup'}
    nutrients = nutrients.rename(columns=col_mapping, copy=False)

    ndata = pd.merge(nutrients, info, on='id', how='outer')
    print ndata.ix[30000]

    result = ndata.groupby(['nutrient', 'fgroup'])['value'].quantile(0.5)
    result['Zinc, Zn'].order().plot(kind='barh')
    plt.show()

    by_nutrient = ndata.groupby(['nutgroup', 'nutrient'])
    get_maximum = lambda x: x.xs(x.value.idxmax())
    get_minimum = lambda x: x.xs(x.value.idxmin())
    max_foods = by_nutrient.apply(get_maximum)[['value', 'food']]
    max_foods.food = max_foods.food.str[:50]
    print max_foods.ix['Amino Acids']['food']
def check_frequency(ax, true_data, sampling_data): true_seq = list(chain.from_iterable(true_data)) sampling_seq = list(chain.from_iterable(sampling_data)) f1 = pd.value_counts(true_seq) / len(true_seq) f2 = pd.value_counts(sampling_seq) / len(sampling_seq) freq = merge_and_sort(f1, f2) jsd = JSD(freq.true_data, freq.sampling_data) freq.index = [str(x)[:20] for x in freq.index] freq.plot(ax=ax, kind='bar') ax.set_title("Frequency(JSD: %.5f)" % jsd)
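# JSD and merge_and_sort are used above but defined elsewhere; a sketch of
# plausible implementations (hypothetical -- names and behavior inferred from
# the call sites): align the two frequency Series on a shared index, then
# compute the Jensen-Shannon divergence between them.
import numpy as np
import pandas as pd

def merge_and_sort(f1, f2):
    # outer-align the two frequency vectors; absent values count as 0
    freq = pd.concat([f1, f2], axis=1, keys=['true_data', 'sampling_data'])
    freq = freq.fillna(0.0)
    return freq.sort_values('true_data', ascending=False)

def JSD(p, q):
    # Jensen-Shannon divergence between two discrete distributions
    p, q = np.asarray(p, dtype=float), np.asarray(q, dtype=float)
    m = 0.5 * (p + q)
    def kl(a, b):
        mask = a > 0  # 0 * log(0) is taken as 0
        return np.sum(a[mask] * np.log2(a[mask] / b[mask]))
    return 0.5 * kl(p, m) + 0.5 * kl(q, m)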
def cut_data():
    ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
    bins = [18, 25, 35, 60, 100]
    cats = pd.cut(ages, bins)
    print cats
    print cats.categories  # formerly cats.levels
    print cats.codes       # formerly cats.labels
    print pd.value_counts(cats)
    print pd.cut(ages, [18, 25, 35, 60, 100], right=False)
    group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
    print pd.cut(ages, bins, labels=group_names)
    data = np.random.randn(20)
    print pd.cut(data, 4, precision=2)
def draw(entire_tlds, sub_tlds, first_tlds):
    """
    Plot bar charts of the TLD counts.
    :param entire_tlds:
    """
    fig = plt.figure()
    x_label = []
    x = pd.value_counts(pd.Series(entire_tlds)).index[:25]
    y = pd.value_counts(pd.Series(entire_tlds)).values[:25] / 1000.0
    for label in x:
        x_label.append(str(label))
    x = np.arange(len(y))
    ax = fig.add_subplot(121)
    ax.bar(x, y)
    ax.set_xticks(x)
    ax.set_xticklabels(x_label, rotation=50)
    plt.grid(axis='y')
    plt.ylabel(u'域名数量(K)')
    plt.xlabel(u'二级顶级域名')

    x_label = []
    x = pd.value_counts(pd.Series(sub_tlds)).index[:20]  # deduplicated
    y = pd.value_counts(pd.Series(sub_tlds)).values[:20] / 1000.0  # deduplicated
    for label in x:
        x_label.append(str(label))
    x = np.arange(len(y))
    ax2 = fig.add_subplot(122)
    ax2.bar(x, y)
    ax2.set_xticks(x)
    ax2.set_xticklabels(x_label, rotation=50)
    plt.grid(axis='y')
    plt.xlabel(u'第二级顶级域名')
    plt.subplots_adjust(top=0.96, bottom=0.15, left=0.06, right=0.98, wspace=0.10)
    plt.savefig(u"二级顶级域名", dpi=140)
    # x_label = []
    # x = pd.value_counts(pd.Series(first_tlds)).index[:20]
    # y = pd.value_counts(pd.Series(first_tlds)).values[:20]
    # for label in x:
    #     x_label.append(str(label))
    # x = np.arange(len(y))
    # ax3 = fig.add_subplot(223)
    # ax3.bar(x, y)
    # ax3.set_xticks(x)
    # ax3.set_xticklabels(x_label, rotation=50)
    # plt.grid()
    plt.show()
def get_binned_data(df, bin_count=10):
    v_max, v_min = df.max(), df.min()
    bins = [(v_max - v_min) / (bin_count + 1) * i + v_min for i in range(bin_count + 1)]
    labels = ["{0} {1:.1f}".format(i, (v_max - v_min) / (bin_count + 1) * (i + 0.5) + v_min)
              for i in range(bin_count)]
    categories = pd.cut(df, bins, labels=labels)
    #print(categories)
    print(df)
    print(pd.value_counts(categories))
    ret_df = pd.DataFrame(index=labels)  # pd.dataFrame does not exist; build the frame with its index directly
    ret_df['count'] = pd.value_counts(categories)
    return ret_df
def density_(df, n=100):
    x = pd.cut(df.t1, n)
    y = pd.cut(df.t2, n)
    x_counts = pd.value_counts(x)
    y_counts = pd.value_counts(y)
    # list comprehensions instead of map(): a Python 3 map object would be
    # exhausted by min()/max() before being assigned as an index
    x_mid = [get_mid(i) for i in x_counts.index]
    y_mid = [get_mid(i) for i in y_counts.index]
    lower = min(min(x_mid), min(y_mid))
    upper = max(max(x_mid), max(y_mid))
    arr = np.linspace(lower, upper, 100)
    grid = np.meshgrid(arr, arr)
    x_counts.index = x_mid
    y_counts.index = y_mid
    x_counts = x_counts.sort_index()
    y_counts = y_counts.sort_index()
def barz(self): "--barplot" "frame" bz = pd.DataFrame({'he': self.he, 'inc': self.inc, 'alrmV':self.alrmV } ) print '**alrmV', self.alrmV[:10], bz.alrmV.value_counts() "parse data" #confs/dmhi-current/reports/.txt "group/count unique/ sort -> value_counts" #self.df['Label'].idx(1).count() "counts" at = pd.value_counts(bz.alrmV); #print "**alert-types-10\n", at.shape, at[:10] inc = bz.inc.value_counts(); #print "incorrect\n", inc #1=incorrect he = bz.he.value_counts(); #print "hardeasy\n", he #inc.plot(kind='bar') "group by" #gender, geography, timeofday grouped = bz.groupby(['he','inc'])#.sum().plot(kind='bar', stacked=True) #key = [k for (k,v) in grouped.groups] #print 'key', key #print grouped.size() #print 'PPP', grouped.value_counts() pew=grouped['alrmV'].value_counts().unstack().fillna(0.) print 'heic vals(\n' pprint.pprint(pew) pew.plot(kind='bar',stacked=True)
def player_performance_plots(database,table,player_name): conn = MySQLdb.connect(user="******",passwd="xxxx",db=database, cursorclass=MySQLdb.cursors.DictCursor) cmd_target = 'SELECT * FROM '+ table + ' WHERE player IN (\''+ player_name +'\');' player_frame = pd.read_sql(cmd_target, con=conn) conn.close() player_values = player_frame['pos_streak_list'].values streaks = [ast.literal_eval(x) for x in player_values] streak_data = np.concatenate(streaks) x=range(len(streak_data)) y=streak_data df_streaks = pd.DataFrame(dict(streaks=x, streak_length=y)) streak_counts = pd.value_counts(df_streaks.values.ravel()) xData = streak_counts.index[:15] xData_1 = [x-1 for x in xData] yData = streak_counts.values[:15] # yData_1 = yData*(1000)/yData[0] popt, pcov = optimize.curve_fit(exp_func, xData, yData) yEXP = exp_func(xData, *popt) plt.figure() sns.factorplot("streak_length", data=df_streaks,kind="bar",palette="Blues",size=6,aspect=2,legend_out=False); plt.plot(xData_1, yData, label='Data', marker='o') plt.plot(xData_1, yEXP, 'r-',ls='--', label="Exp Fit") plt.legend() plt.show() a,b,c = popt return streak_counts
def test_series_groupby(self): """Test boxplot groupby using a series of data labels.""" vals = dist._box_reshape(self.df.y, self.df.g, None, None)[0] nt.assert_equal(len(vals), 3) want_lengths = pd.value_counts(self.df.g)[["a", "b", "c"]] got_lengths = [len(a) for a in vals] npt.assert_array_equal(want_lengths, got_lengths)
def test_series_groupby_order(self): """Test a series-based groupby with a forced ordering.""" order = ["c", "a", "b"] vals = dist._box_reshape(self.df.y, self.df.g, None, order)[0] want_lengths = pd.value_counts(self.df.g)[order] got_lengths = [len(a) for a in vals] npt.assert_array_equal(want_lengths, got_lengths)
def fix_barcode_columns(df, patients=None, tissue_code='All', get_batch=False): """ Takes TCGA barcode and reformats it into a MultiIndex if all tissue_codes are desired, or just pulls the correct tissue codes and filteres the DataFrame. df: pandas DataFrame patients: patient list to filter on tissue_code: ['01','11','All'] #if all returns MultiIndex """ if get_batch is False: df.columns = pd.MultiIndex.from_tuples([(i[:12], i[13:15]) for i in df.columns]) else: df.columns = pd.MultiIndex.from_tuples([(i[:12], i[13:15], i[21:24]) for i in df.columns]) if patients is not None: df = df.ix[:, patients] if tissue_code != 'All': try: df = df.T.xs(tissue_code, level=1).T # pandas bug df = df.groupby(axis=1, level=0).first() except KeyError: # try different cross-seciton new_code = pd.value_counts(df.columns.get_level_values(1)).idxmax() df = df.T.xs(new_code, level=1).T # pandas bug df = df.groupby(axis=1, level=0).first() else: df = df.groupby(axis=1, level=[0, 1]).first() return df
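# A minimal usage sketch, assuming a hypothetical expression matrix whose
# columns are full TCGA barcodes and the older pandas API this function
# targets (axis=1 groupby); tissue code '01' selects primary tumors.
import pandas as pd

cols = ['TCGA-AB-1234-01A-11R-A089-07', 'TCGA-AB-1234-11A-11R-A089-07']
df = pd.DataFrame([[1.0, 2.0]], index=['GENE1'], columns=cols)
tumor_only = fix_barcode_columns(df.copy(), tissue_code='01')
print(tumor_only.columns)  # patient IDs only, e.g. ['TCGA-AB-1234']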
def is_noninformative_feature(self, feature_name): value_counts = pd.value_counts(self.data[feature_name], dropna = False) if len(value_counts) == 1: return True elif value_counts.max()*1./self.data.shape[0] >= self.params["FRAC_OF_FEAT_TO_BE_NONINFORMATIVE"]: return True return False
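# Standalone sketch of the same heuristic on hypothetical toy data: a feature
# is non-informative if it has a single unique value or if one value dominates
# beyond the configured fraction (frac_threshold here stands in for
# params["FRAC_OF_FEAT_TO_BE_NONINFORMATIVE"]).
import pandas as pd

s = pd.Series(['a'] * 98 + ['b'] * 2)
counts = pd.value_counts(s, dropna=False)
frac_threshold = 0.95  # assumed value
print(len(counts) == 1 or counts.max() * 1. / len(s) >= frac_threshold)  # True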
def plot_layer(self, layer): layer = {k: v for k, v in layer.iteritems() if k in self.VALID_AES} layer.update(self.manual_aes) x = layer.pop('x') if 'weight' not in layer: counts = pd.value_counts(x) labels = counts.index.tolist() weights = counts.tolist() else: weights = layer.pop('weight') if not isinstance(x[0], Timestamp): labels = x else: df = pd.DataFrame({'weights':weights, 'timepoint': pd.to_datetime(x)}) df = df.set_index('timepoint') ts = pd.TimeSeries(df.weights, index=df.index) ts = ts.resample('W', how='sum') ts = ts.fillna(0) weights = ts.values.tolist() labels = ts.index.to_pydatetime().tolist() indentation = np.arange(len(labels)) + 0.2 width = 0.35 idx = np.argsort(labels) labels, weights = np.array(labels)[idx], np.array(weights)[idx] labels = sorted(labels) plt.bar(indentation, weights, width, **layer) plt.autoscale() return [ {"function": "set_xticks", "args": [indentation+width/2]}, {"function": "set_xticklabels", "args": [labels]} ]
def featureExtraction(filename):
    """
    Extract features.
    :param filename:
    :return:
    """
    df = pd.read_csv(filename, sep=',', header=0)
    df = df.drop_duplicates()
    csv_file = open(filename[0:-4] + '_num.csv', 'w')
    a = csv.writer(csv_file, delimiter=',')
    for (u, i), group in df.groupby(['user_id', 'item_id']):
        num_skim = 0
        num_collect = 0
        num_cart = 0
        num_buy = 0
        counts = pd.value_counts(group.behavior_type, sort=False)  # renamed from dict: don't shadow the builtin
        if 1 in counts:
            num_skim = int(counts[1])
        if 2 in counts:
            num_collect = int(counts[2])
        if 3 in counts:
            num_cart = int(counts[3])
        if 4 in counts:
            num_buy = int(counts[4])
        item = [u, i, num_skim, num_collect, num_cart, num_buy]
        a.writerow(item)
def several_tools_per_phase(supertupel):
    """What can we gather from people who use a tool more than once?"""
    title = "Mehrfach genutzte Tools pro Forschungszyklus"
    filename = "19_tools_in_mehreren_phasen." + EXTENSION
    tisp_user_how_many = []
    einer = []
    alle = []
    for st in supertupel:
        # this user uses len(st) tools in more than one phase
        tisp_user_how_many.append(len(st))
        for tool in st:
            alle.append(category_lookup(tool[0]))
    tisp_series2 = pd.Series(alle)
    pvc2 = pd.value_counts(tisp_series2.values)
    print("Which categories are used more than once?")
    # print(pvc2)
    fig, axes = plt.subplots(nrows=1, ncols=1)  #, figsize=(20,10))
    axes.set_xlabel("BenutzerInnen", alpha=ALPHA_VALUE, ha='left')
    axes.set_ylabel("Kategorien", alpha=ALPHA_VALUE, ha='left')
    # Here we go!
    pvc2.plot(kind="barh", ax=axes, color=COLOURS[0], width=WIDTH)
    fig.savefig(filename, bbox_inches='tight')
    plt.close(fig)
    print("The answer is in %s" % filename)
    titles.write("%s: Abb. 4.:%s\n" % (filename, title))
def process_dataset(ds): # Deal with missing data: (1) kick (2) filled with median ds["Age"] = ds["Age"].fillna(ds["Age"].median()) ds["Fare"] = ds["Fare"].fillna(ds["Fare"].median()) ds["Embarked"] = ds["Embarked"].fillna('S') # Categorized ds.loc[ds["Sex"] == "male", "Sex"] = 0 ds.loc[ds["Sex"] == "female", "Sex"] = 1 ds.loc[ds["Embarked"] == 'S', "Embarked"] = 0 ds.loc[ds["Embarked"] == 'C', "Embarked"] = 1 ds.loc[ds["Embarked"] == 'Q', "Embarked"] = 2 # Binning binning(ds, "Fare") binning(ds, "Age") # Create new feature ds["FamilySize"] = ds["SibSp"] + ds["Parch"] ds["NameLength"] = ds["Name"].apply(lambda x: len(x)) titles = ds["Name"].apply(get_title) title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6, "Major": 7, "Col": 7, "Mlle": 8, "Mme": 8, "Don": 9, "Lady": 10, "Countess": 10, "Jonkheer": 10, "Sir": 9, "Capt": 7, "Ms": 2, "Dona": 10} for k, v in title_mapping.items(): titles[titles == k] = v ds["Title"] = titles family_ids = ds.apply(get_family_id, axis=1) family_ids[ds["FamilySize"] < 3] = -1 print(pd.value_counts(family_ids)) ds["FamilyId"] = family_ids return ds
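# binning() is called above but not defined in this excerpt; a plausible
# sketch (hypothetical -- the author's helper may differ): overwrite the
# column with integer quantile-bin codes via pd.qcut.
def binning(ds, column, q=4):
    # duplicates='drop' guards against repeated quantile edges (e.g. many zero fares)
    ds[column] = pd.qcut(ds[column], q, labels=False, duplicates='drop')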
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

prime_nos = [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47]
number_bins = [0, 10, 20, 30, 40, 50]
category = pd.cut(prime_nos, number_bins)
print category
print category.categories

# value_counts() to get the count of prime numbers within each range
print pd.value_counts(category)

# limit the number of bins
print pd.cut(prime_nos, 3, precision=1)
net = Graph(DATADIR + DATASET + '\\links.txt', typ='dir', order=ORDER, withdiag=WITHDIAG) print('READ TIME: %.2f' % (time.time() - pt)) f.write('%d %d %d\n' % (net.nVertices, net.nEdges, DIMENSION)) pt = time.time() grouping_model = Louvain(net, rand=RANDOM_GROUPING) groups = grouping_model.execute(merge=MERGE) print('GROUP TIME: %.2f' % (time.time() - pt)) group_sizes = [len(t) for t in groups] print('Grouping Results:') print(pd.value_counts(group_sizes)) inv_index_original = groups2inv_index(groups, net.nVertices) # sizes_index = [group_sizes[t - 1] for t in inv_index_original] pt = time.time() # k_set = sample(net, k=K_SIZE, method='deg_deter') k_set = sample(net, k=K_SIZE, method=SAMPLE_METHOD) #, vertex_group_sizes=sizes_index) print('SAMPLE TIME: %.2f' % (time.time() - pt)) inv_index = groups2inv_index(groups, net.nVertices, k_set) pure_override_nodes(groups, inv_index) groups = [k_set] + groups pt = time.time() model = Optimizer(net,
from pyecharts import Bar import pandas as pd import re d = pd.read_csv('BGM_week_v1.csv', nrows=250) star = d["week"] result = star.values.tolist() result1 = pd.value_counts(result) sum_ = 0 week1 = result1["星期一"] week2 = result1["星期二"] + result1["火曜日"] week3 = result1["星期三"] + result1["水曜日"] week4 = result1["星期四"] + result1["周四"] + result1["木曜日"] week5 = result1["星期五"] week6 = result1["星期六"] week7 = result1["星期天"] + result1["周日"] value = [ week1, week2, week3, week4, week5, week6, week7 ] for i in value: sum_ += i other = 250-sum_ value.append(other) attr = ["星期一", "星期二", "星期三", "星期四", "星期五", "星期六", "星期日", "剧场版"] bar = Bar('Top250动画 TV动画放送日期', "count") bar.add("count", attr, value) bar.render('Top250动画 TV动画放送日期.html')
import numpy as np import pandas as pd #import tensorflow as tf import os os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32] bins = [18, 25, 35, 60, 100] # Also, we can pass a unique name to each label. bin_names = ['Youth', 'YoungAdult', 'MiddleAge', 'Senior'] new_cats = pd.cut(ages, bins, labels=bin_names) print(pd.value_counts(new_cats)) #we can also calculate their cumulative sum # pd.value_counts(new_cats).cumsum() print(pd.value_counts(new_cats).cumsum())
def test_index(self): # test that the various tests get properly aggregated, no duplicate indices self.assertEqual(max(pd.value_counts(self.summary.index)), 1)
def handle_b(b, n):
    global df7
    global d_test
    # process every CSV file under the directory
    for each_src in data_dirs:
        df5 = DataFrame(
            pd.read_csv(
                each_src,
                names=['code', 'from', 'to', 'date', 'time', 'ci', 'lac']))
        # drop some malformed entries
        df7 = df5[df5['date'] > 20000000]
        # per day
        if modes == 'd':
            # list the unique dates
            d_test = sorted(df7['date'].unique())
            for i in d_test:
                # select that day's records from the source
                temp_df = df7[df7['date'] == int(i)]
                # rank by number of outgoing calls
                temp_count0 = pd.value_counts(temp_df['from'])
                # keep the top n
                temp_count = DataFrame(temp_count0[0:int(n)], columns=['degree'])
                # add a third column (the date) and export to CSV
                temp_count['when'] = int(i)
                temp_dir = 'temp_count/d/' + str(i) + '_topn.csv'
                temp_count.to_csv(temp_dir, index=True, header=False)
            print 'done'
        # per week
        if modes == 'w':
            d_test2 = sorted(df7['date'].unique())
            date_to_week = {}
            for i in d_test2:
                # get the week-of-year for each date
                temp_date = datetime.datetime.strptime(str(i), "%Y%m%d")
                week_no = temp_date.strftime('%W')  # do not reuse n: that is the top-n parameter
                # add the mapping
                date_to_week[i] = week_no
            # map dates to week numbers
            df7['week'] = df7['date'].map(date_to_week)
            # list the unique weeks
            d_test = sorted(df7['week'].unique())
            for i in d_test:
                temp_df = df7[df7['week'] == i]
                temp_count0 = pd.value_counts(temp_df['from'])
                temp_count = DataFrame(temp_count0[0:int(n)], columns=['degree'])
                temp_count['when'] = 'week ' + str(i)
                temp_dir = 'temp_count/w/' + str(i) + '_topn.csv'
                temp_count.to_csv(temp_dir, index=True, header=False)
            print 'done'
        # per month
        if modes == 'm':
            d_test2 = sorted(df7['date'].unique())
            date_to_month = {}
            for i in d_test2:
                temp_date = datetime.datetime.strptime(str(i), "%Y%m%d")
                month_no = temp_date.strftime('%m')  # do not reuse n: that is the top-n parameter
                date_to_month[i] = month_no
            df7['month'] = df7['date'].map(date_to_month)
            d_test = sorted(df7['month'].unique())
            for i in d_test:
                temp_df = df7[df7['month'] == i]
                temp_count0 = pd.value_counts(temp_df['from'])
                temp_count = DataFrame(temp_count0[0:int(n)], columns=['degree'])
                temp_count['when'] = 'month ' + str(i)
                temp_dir = 'temp_count/m/' + str(i) + '_topn.csv'
                temp_count.to_csv(temp_dir, index=True, header=False)
            print 'done'
import numpy as np

data.replace('n/a', np.nan, inplace=True)
data.emp_length.fillna(value=0, inplace=True)
data['emp_length'].replace(to_replace='[^0-9]+', value='', inplace=True, regex=True)
data['emp_length'] = data['emp_length'].astype(int)
data['term'] = data['term'].apply(lambda x: x.lstrip())

import seaborn as sns
import matplotlib

s = pd.value_counts(data['emp_length']).to_frame().reset_index()
s.columns = ['type', 'count']

def emp_dur_graph(graph_title):
    sns.set_style('whitegrid')
    ax = sns.barplot(x='type', y='count', data=s)
    ax.set(xlabel='', ylabel='', title=graph_title)
    # attach the thousands-separator formatter to the y axis;
    # constructing the FuncFormatter alone has no effect
    ax.yaxis.set_major_formatter(
        matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
    _ = ax.set_xticklabels(ax.get_xticklabels(), rotation=0)

emp_dur_graph('Distribution of employment length for issued loans')

from matplotlib import pyplot as plt
print(plt.style.available)
#!/usr/bin/env python # coding: utf-8 # In[1]: import pandas as pd Location = r"C:\Users\rfsas\Documents\MBA\Spring 2020 Class docs\ISM 6419 - Data Visualization\week 7\datasets\gradedata.csv" df = pd.read_csv(Location) df.head() # In[2]: bins = [0, 60, 70, 80, 90, 100] group_names = ['F', 'D', 'C', 'B', 'A'] df['lettergrade'] = pd.cut(df['grade'], bins, labels=group_names) df # In[3]: pd.value_counts(df['lettergrade']) # In[4]: bins = [0, 80, 100] group_names = ['fail', 'pass'] df['Pass/Fail'] = pd.cut(df['grade'], bins, labels=group_names) df # In[ ]:
x = len(pac_internado) / all_data
y = len(pac_não_internado) / all_data

print('Pacientes Internados :', round(x * 100, 2), '%')
print('Pacientes Não Internados :', round(y * 100, 2), '%')

#Pacientes Internados : 11.72 %
#Pacientes Não Internados : 88.28 %

# Checking the % distribution of the target variable - chart - option 1
import matplotlib.pyplot as plt

labels = ['Pacientes Não Internados', 'Pacientes Internados']
classes = pd.value_counts(df['CONT_STATUS_INT'], sort=True)
plt.figure(figsize=(14, 7))
classes.plot(kind='bar', rot=0)
plt.title("Target Class Distribution")
plt.xticks(range(2), labels)
plt.xlabel("Class")
plt.ylabel("Frequency")

# Checking the % distribution of the target variable - chart - option 2
import seaborn as sns
import matplotlib.pyplot as plt

targets = df['DESC_STATUS_INT'].values
sns.set(style="darkgrid")
#Delete the duplicate way-2 (only 1 column) import pandas as pd data = pd.read_csv('c:/Users/DCUK/.PyCharmCE2018.1/PycharmProjects/gender.csv') id = data["id"] val = data["Claim Value"] non_du_id = [] for x in sorted(set(id)): non_du_id.append(x) #Delete the duplicate way-3 (whole dataframe) import pandas as pd data = pd.read_csv('c:/Users/DCUK/.PyCharmCE2018.1/PycharmProjects/gender.csv') df = pd.DataFrame(data) mask = df.duplicated(keep=False) print(pd.value_counts(mask)) # False 239712 # True 79 # dtype: int64 mask1 = df.drop_duplicates(keep=False) print(len(mask1)) # directly return 239712 non_du_id = df[~mask] non_du_id.to_csv('non_du_id-2.csv') import pandas as pd data = pd.read_csv( 'c:/Users/DCUK/.PyCharmCE2018.1/PycharmProjects/Aspnet_user/final_column_3.csv' ) df = data[data["Claim Type"].str.match("diabetes_type")]
Let's divide these into bins of 18 to 25, 26 to 35, 36 to 60, and finally
61 and older. To do so, you have to use cut, a function in pandas:'''
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
cats  # categorical object
'''
The object pandas returns is a special Categorical object. You can treat it
like an array of strings indicating the bin name; initially it contains a
levels array indicating the distinct category names along with a labeling
for the ages data in the labels attribute:'''
cats.labels
cats.value_counts()
pd.value_counts(cats)
'''
Which side is closed or open can be changed by passing right=False:'''
pd.cut(ages, bins, right=False)
'''
You can also pass your own bin names by passing a list or array to the
labels option:'''
group_names = ['youth', 'youngadult', 'middleage', 'senior']
cats2 = pd.cut(ages, bins, labels=group_names)
pd.value_counts(cats2)
'''
If you pass cut an integer number of bins instead of explicit bin edges, it
will compute equal-length bins based on the minimum and maximum values in
the data:'''
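'''
A short sketch of that integer-bin form, assuming numpy is imported as np;
precision=2 limits the displayed precision of the computed bin edges:'''
data = np.random.rand(20)
pd.cut(data, 4, precision=2)  # four equal-length bins spanning min..max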
descrip_speed = accidentes['Speed_limit'].describe()
quick_report1 = accidentes.describe().transpose()
#%%
quick_report2 = accidentes.describe(include=['object']).transpose()  # includes object-type variables
#%%
quick_report3 = accidentes.mode().transpose()  # for the mode
#%%
accidentes_por_dia = pd.value_counts(accidentes['Date'])
#%%
# statistics for the number of vehicles
print "mean value:{}".format(accidentes['Number_of_Vehicles'].mean())
print "min value:{}".format(accidentes['Number_of_Vehicles'].min())
print "max value:{}".format(accidentes['Number_of_Vehicles'].max())
print "mode value:{}".format(accidentes['Number_of_Vehicles'].mode())
print "std value:{}".format(accidentes['Number_of_Vehicles'].std())
#%%
vehicle_counts = accidentes.groupby('Date').agg({'Number_of_Vehicles': np.sum})
casualty_counts = accidentes.groupby('Date').agg({'Number_of_Casualties': np.sum})
#%%
timetoswtich.extend(swarmans[9]) switch_facs.extend(swarmans[10]) interpolation_list.extend(swarmans[2]) #lists by groups usernames_list.extend(swarmans[3]) initialpull_list.extend(swarmans[4]) interpolation_final.extend(swarmans[5]) iterate_sheets(10) swarm_repeatability=[] swarm_instance_repeatability=[] crowd_instance_repeatability=[] for q in range(len(All_swarmanswers)): swarmcounts=pd.value_counts(All_swarmanswers[q]) dictionary=dict(swarmcounts) swarm_repeatability.append(list(swarmcounts)[0]/sum(swarmcounts)) for s in range(len(All_swarmanswers[q])): ans=All_swarmanswers[q][s] count= dictionary[ans] swarm_instance_repeatability.append((count-1)/(sum(swarmcounts)-1)) ### repeatability by question graph ### x_labels=np.arange(1,26,1) fig, ax = plt.subplots() rects = ax.bar(np.arange(1,26,1), swarm_repeatability,alpha=0.4) ax.set_title('Repeatability by Question',size=16) ax.set_xlabel('Question',size=14) ax.set_ylabel('Repeatability',size=14) plt.xticks(x_labels,x_labels)
    'Quantidade por sexo': quantidade_sexo,
    'Percentual': percentual_sexo
})
tabela.rename(index={0: 'Masculino', 1: 'Feminino'}, inplace=True)
tabela.rename_axis('Sexo', axis='columns', inplace=True)
tabela

"""Income distribution"""

labels = ['E', 'D', 'C', 'B', 'A']
classes = [0, 1576, 3152, 7880, 15760, 200000]

# Count the frequency of each class
frequencia_renda = pd.value_counts(
    pd.cut(dados.Renda, bins=classes, labels=labels, include_lowest=True))

# Compute the percentage of each class
percentual_renda = pd.value_counts(pd.cut(
    dados.Renda, bins=classes, labels=labels, include_lowest=True),
    normalize=True) * 100

# Build the DataFrame that serves as the analysis table
tabela_renda = pd.DataFrame({
    'Frequência por classe': frequencia_renda,
    'Percentual por classe': percentual_renda
})
tabela_renda.sort_index(ascending=False, inplace=True)
tabela_renda = tabela_renda.rename_axis('Classe', axis='columns')  # assign the result; rename_axis is not in place here

tabela_renda['Frequência por classe'].plot.bar(width=1, color='red',
# signature dishes
import jieba

delicious = []
for i in range(750):
    try:
        recommend = jieba.lcut(data['recommend'][i])
        while ',' in recommend:
            recommend.remove(',')
        while '(' in recommend:
            recommend.remove('(')
        while ')' in recommend:
            recommend.remove(')')
        delicious.extend(recommend)
    except:
        continue
delicious = pd.value_counts(delicious)

from pyecharts import WordCloud

wordcloud = WordCloud(width=1000, height=600)
wordcloud.add("", delicious.index, delicious.values,
              word_size_range=[12, 150], is_more_utils=True)
wordcloud.render("delicious.html")

# from sklearn.cluster import KMeans
# for better clustering, convert the star ratings to numbers
for i in range(750):
    try:
        if data.loc[i, 'star'] == '五星商户':
# # EDA and Data Cleaning
# The variables are broken into 4 categories: Client Data, Last Contact Info, Other, and Social and Economic Variables.
# I have performed EDA on each category separately to get a better picture

# #### EDA-Part 1

# In[116]:

bank_client = data.iloc[:, 0:7]
bank_client.head()

# In[117]:

# Checking for unique job titles and their counts in data
bank_client['job'].value_counts()
pd.value_counts(bank_client['job']).plot.bar()

# In[118]:

# Checking for counts of different marital statuses in data
bank_client['marital'].value_counts()
pd.value_counts(bank_client['marital']).plot.bar()

# In[119]:

# Checking for educational background unique counts
bank_client['education'].value_counts()
pd.value_counts(bank_client['education']).plot.bar()

# In[120]:
prime_nos = [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47] number_bins = [0, 10, 20, 30, 40, 50] # In[127]: category = pd.cut(prime_nos, number_bins) category # In[128]: category.categories # In[130]: pd.value_counts(category) # In[131]: # Limits pd.cut(prime_nos, 3, precision=1) # ### Observation # In[132]: df = DataFrame(np.random.randn(1000, 5)) #basic observation df.head() # In[133]:
data = scale(df_select) # Define number of clusters noOfClusters = 4 # Train a model model = KMeans(init='k-means++', n_clusters=noOfClusters, n_init=20).fit(data) # In[33]: print(90 * '_') print("\nCount of players in each cluster") print(90 * '_') pd.value_counts(model.labels_, sort=False) # In[34]: # Create a composite dataframe for plotting # ... Use custom function declared in customplot.py (which we imported at the beginning of this notebook) P = pd_centers(featuresUsed=select5features, centers=model.cluster_centers_) P # <h1 style="font-size:2em;color:#2467C0">Visualization of Clusters</h1> # We now have 4 clusters based on the features we selected, we can treat them as profiles for similar groups of players. We can visualize these profiles by plotting the centers for each cluster, i.e., the average values for each featuere within the cluster. We will use matplotlib for this visualization. We will learn more about matplotlib in Week 5. # In[35]: # For plotting the graph inside the notebook itself, we use the following command
        clf.fit(x_train, y_train)  # fit the classifier on the training data
        predictions.append(bool(clf.predict([x_predict])))  # predict on the test sample and record the result
        # 3 - trade on the prediction
        if predictions[-1] == True:  # predicted "up": go long
            print(quote.datetime, "next trading day predicted to rise")
            target_pos.set_target_volume(10)
        else:  # predicted "down": go short
            print(quote.datetime, "next trading day predicted to fall")
            target_pos.set_target_volume(-10)
        break
except BacktestFinished:  # backtest over: collect the predictions and compute the accuracy
    klines["pre_close"] = klines["close"].shift(1)  # add pre_close (previous trading day's close)
    klines = klines[-len(predictions) + 1:]  # keep only the K-lines inside the backtest window
    # add the predicted up/down column, shifted back one step so that each
    # day's row holds the prediction made for that same day rather than the next
    klines["prediction"] = predictions[:-1]
    results = (klines["close"] - klines["pre_close"] >= 0) == klines["prediction"]

    print(klines)
    print("---- backtest finished ----")
    print("prediction correctness:\n", results)
    print("number of predictions: total", len(results))
    print(pd.value_counts(results))
    print("prediction accuracy:")
    print((pd.value_counts(results)[True]) / len(results))
df = pd.read_csv('csv/raw_26_April_Sensors.csv')

## Analyze and Visualize Dataset

# General Information
df.info()
print('The dataset contains ' + str(df.shape[0]) + ' data samples and ' +
      str(df.shape[1]) + ' data columns')

# Identifying NaN Values
print(df.isnull().sum())

# Overview of numerical data
print(df.describe())
print('Dataset contains ' + str(pd.value_counts(df['RecordID'].values)[0]) +
      ' "safe" data samples as well as ' +
      str(pd.value_counts(df['RecordID'].values)[1]) +
      ' "relatively safe" data samples and ' +
      str(pd.value_counts(df['RecordID'].values)[2]) +  # was .values)[2] misplaced: index the counts, not the array
      ' "unsafe" data samples')

# Overview of dataset rows
print(df.head(20))

# Numerical Data Distribution
SENSOR_DATA_COLUMNS = [
    'GyroX1', 'GyroY1', 'GyroZ1', 'AccX1', 'AccY1', 'AccZ1', 'MagX1',
    'MagY1', 'MagZ1', 'GyroX2', 'GyroY2', 'GyroZ2', 'AccX2', 'AccY2',
    'AccZ2', 'MagX2', 'MagY2', 'MagZ2', 'GyroX3', 'GyroY3', 'GyroZ3',
    'AccX3', 'AccY3', 'AccZ3', 'MagX3', 'MagY3', 'MagZ3'
for line in tweets_file:
    try:
        tweet = json.loads(line)
        tweets_data.append(tweet)
    except:
        continue

# build the DataFrame once, after the loop, rather than on every iteration
tweets = pd.DataFrame()
tweets['text'] = list(map(lambda tweet: tweet['text'], tweets_data))
tweets['language'] = list(map(lambda tweet: tweet['lang'], tweets_data))
tweets['country'] = list(map(
    lambda tweet: tweet['place']['country']
    if tweet['place'] != None else None, tweets_data))

# count tweets per language; value_counts on the full column, not on
# tweets['language'][0], which would split a single string into characters
tweets_by_lang_count = pd.value_counts(tweets['language'])[:5]

#plot the top 5 languages on the received tweets with filtered tags
fig, ax = plt.subplots()
ax.tick_params(axis='x', labelsize=7)
ax.tick_params(axis='y', labelsize=7)
ax.set_xlabel('Languages', fontsize=7)
ax.set_ylabel('Number of tweets', fontsize=7)
ax.set_title('Top 5 languages', fontsize=7, fontweight='bold')
tweets_by_lang_count.plot(ax=ax, kind='bar', color='green')
plt.show()

# count tweets per country for plotting
tweets_by_Country_count = pd.value_counts(tweets['country'])[:5]
from gensim.models import doc2vec
from collections import namedtuple

np.random.seed(0)

if __name__ == "__main__":
    SPLIT_SIZE = 0.3
    VECTOR_SIZE = 100

    # load data
    train_df = pd.read_csv('./kaggledata/records.tsv', sep='\t', header=0)
    raw_docs_train = train_df['Review'].values
    sentiment_train = train_df['Score'].values
    num_labels = len(np.unique(sentiment_train))
    print pd.value_counts(sentiment_train)
    print sentiment_train
    print "Number of label categories: " + str(num_labels)

    # text pre-processing
    stop_words = set(stopwords.words('english'))
    stop_words.update(
        ['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])
    stemmer = SnowballStemmer("english")

    print "pre-processing train docs..."
    processed_docs_train = []
    for doc in raw_docs_train:
        tokens = word_tokenize(doc.lower())
        filtered = [word for word in tokens if word not in stop_words]
        stemmed = [stemmer.stem(word) for word in filtered]
def value_counts(self, dropna: bool = True): from pandas import value_counts return value_counts(self._ndarray, dropna=dropna).astype("Int64")
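# An equivalent top-level usage sketch: for an ordinary Series the same
# dropna-then-cast-to-Int64 behavior can be reproduced directly.
import pandas as pd

s = pd.Series(["a", "b", "a", None])
print(s.value_counts(dropna=True).astype("Int64"))  # a: 2, b: 1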
def select_unimportant(self, delt): delete = pd.value_counts(delt)[pd.value_counts(delt) > 1].index return delete
print scores.mean()

# ---------- feature extraction ----------
titanic['Familysize'] = titanic['SibSp'] + titanic['Parch']  # total family size
titanic['NameLength'] = titanic['Name'].apply(lambda x: len(x))  # length of the name

import re

def get_title(name):
    title_search = re.search('([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ""

titles = titanic['Name'].apply(get_title)
print pandas.value_counts(titles)

# map the titles to numeric codes
title_mapping = {
    "Mr": 1,
    "Miss": 2,
    "Mrs": 3,
    "Master": 4,
    "Dr": 5,
    "Rev": 6,
    "Col": 7,
    "Major": 8,
    "Mlle": 9,
    "Countess": 10,
    "Ms": 11,
    "Lady": 12,
    "Jonkheer": 13,
def plot_company_ranking(self):
    # df_sel contains the data of the subset of arjen
    df_sel = self.read_input_file()

    table_name = 'company'
    data_df = read_sql_table(table_name, connection=self.connection, reset=self.reset)
    # df[df["datetime"].isnull]
    data_df.dropna(axis=0, subset=["datetime"], inplace=True)
    data_df.set_index(KVK_KEY, inplace=True, drop=True)
    if self.dump_to_file:
        data_df.to_csv(table_name + ".csv")

    df_sel = pd.concat([data_df, df_sel], axis=1, join="inner")

    count_sel = pd.value_counts(df_sel["ranking"]).sort_index()
    count_sel.index = count_sel.index.astype(int)
    tot_sel = count_sel.sum()
    count_sel = 100 * (count_sel / tot_sel)
    print("counted sel {}".format(tot_sel))

    count_all = pd.value_counts(data_df["ranking"]).sort_index()
    count_all.index = count_all.index.astype(int)
    tot_all = count_all.sum()
    count_all = 100 * (count_all / tot_all)
    print("counted all {}".format(tot_all))

    count_all = pd.concat([count_all, count_sel], axis=1)
    count_all.columns = [f"All (N={tot_all})", f"Sel (N={tot_sel})"]  # closing parenthesis was missing

    fig, axis = plt.subplots(figsize=(6.5, 5))
    plt.subplots_adjust(left=0.1, right=0.9, top=0.85)
    axis.set_xlabel("Ranking [-]")
    axis.set_ylim([0, 40])
    axis.set_ylabel("% kvks")

    count_all.plot(kind="bar", ax=axis, label="# kvks", rot=0)
    axis.set_xlim([-1, 10])

    ax2 = axis.twinx()
    ax2.set_ylabel("cumulative %")
    cum_sum_all = count_all.cumsum()
    cum_sum_sel = pd.DataFrame(index=count_sel.index,
                               data=count_sel.cumsum().values,
                               columns=[count_all.columns[1]])
    # cum_sum_all.plot(y=[cum_sum_all.columns[0]], ax=ax2, style="--o", color="tab:red", legend=False)
    cum_sum_sel.plot(y=[cum_sum_sel.columns[0]], ax=ax2, style="--x",
                     color="tab:green", legend=False)
    ax2.set_ylim([0, 110])
    ax2.set_xlim([-1, 10])
    # ax2.tick_params(axis="y", labelcolor="black")

    axis.legend(bbox_to_anchor=(0.65, 1.22), title="% KVK")
    ax2.legend(bbox_to_anchor=(1.05, 1.22), title="Cumulative %")

    logger.info("plot fig")
    plt.savefig("url_score_NL.jpg")
    logger.info("save to csv sel")
    cum_sum_sel.to_csv("url_score_DH.csv")
    logger.info("save to csv all")
    cum_sum_all.to_csv("url_score_NL.csv")
    'column9', 'column14'
]
disease_data = disease_data[features]  # putting features with the same input data type side by side

# Analyzing Output Distribution
data_size = disease_data.shape[0]
sick = disease_data[disease_data['column14'] == 1]
not_sick = disease_data[disease_data['column14'] == 0]
x = len(sick) / data_size
y = len(not_sick) / data_size
print('Sick :', x * 100, '%')
print('Not sick :', y * 100, '%')

plt.figure(14)
# Plotting output feature for distribution analysis
labels = ['Sick', 'Not Sick']
graph = pd.value_counts(disease_data['column14'], sort=True)
graph.plot(kind='bar', rot=0)
plt.title("Target Class Distribution")
plt.xticks(range(2), labels)
plt.xlabel("Class")
plt.ylabel("Frequency")

plt.figure(15)
sns.heatmap(disease_data.corr(), annot=True)  # correlation matrix of the data

# Checking and removing outliers using the Z-score function
z = np.abs(stats.zscore(disease_data))
threshold = 3
disease_data = disease_data[(z < 3).all(axis=1)]

# Scatter Plot - Uncomment only if needed. High computational time required.
meanval = dataemp['overall-ratings'].mean()

# # Classifying labels: 1 = satisfied employee, 0 = employee not satisfied with the job.
# Ratings greater than the mean overall rating count as satisfied; ratings below the mean count as unsatisfied.

# In[20]:

dataemp['label'] = dataemp['overall-ratings'].apply(lambda x: 1 if x > meanval else 0)

# In[21]:

dataemp.head()

# In[22]:

pd.value_counts(dataemp['label']).plot.bar()
plt.show()

# In[23]:

def datatext_preprocess(total_text):
    removepunc = [
        char for char in total_text if char not in string.punctuation
    ]
    removepunc = ''.join(removepunc)
    # assign the result; the original call discarded re.sub's return value
    removepunc = re.sub('[^A-Za-z]+', ' ', removepunc)
    return ' '.join([
        word for word in removepunc.split()
        if word.lower() not in stopwords.words('english')
    ])
from datetime import datetime

ffp = data['FFP_DATE'].apply(lambda x: datetime.strptime(x, '%Y/%m/%d'))
ffp_year = ffp.map(lambda x: x.year)
# histogram of member enrollments per year
fig = plt.figure(figsize=(8, 5))  # set the figure size
plt.rcParams['font.sans-serif'] = 'SimHei'  # enable Chinese text rendering
plt.rcParams['axes.unicode_minus'] = False
plt.hist(ffp_year, bins='auto', color='#0504aa')
plt.xlabel('年份')
plt.ylabel('入会人数')
plt.title('各年份会员入会人数')
plt.show()
plt.close()  # call the method; a bare plt.close does nothing

# count members of each gender
male = pd.value_counts(data['GENDER'])['男']
female = pd.value_counts(data['GENDER'])['女']
# pie chart of the member gender ratio
fig = plt.figure(figsize=(7, 4))  # set the figure size
plt.pie([male, female], labels=['男', '女'],
        colors=['lightskyblue', 'lightcoral'], autopct='%1.1f%%')
plt.title('会员性别比例')
plt.show()
plt.close()

# # count the members at each tier
# lv_four = pd.value_counts(data['FFP_TIER'])[4]
# lv_five = pd.value_counts(data['FFP_TIER'])[5]
# lv_six = pd.value_counts(data['FFP_TIER'])[6]
# # bar chart of member counts per tier
# fig = plt.figure(figsize=(8, 5))  # set the figure size
def case_study_example_question(case_question): ''' param case_question: question index to use as example generates graphs of group interpolation over time prints p-values of individuals and groups ''' ########## Case Study on GROUPS; Interpolation through time graph ############# print('Case Question: Initial vs Survey by Swarms pvalue = %.6f' %(stats.ttest_rel(swarm_initial_mean[case_question],crowd_avg_byquestion[case_question])[1])) print('Case Question: Initial vs Final by Swarms pvalue = %.6f' %(stats.ttest_rel(swarm_initial_mean[case_question],All_interpolations[case_question])[1])) print('Case Question: Survey vs Final by Swarms pvalue = %.6f' %(stats.ttest_rel(All_interpolations[case_question],crowd_avg_byquestion[case_question])[1])) ### Graph of Interpolation over time for each of 10 groups on this question ### initial_interpoltion=[] for i in range(10): group=i impulse_array=np.array((imp_throughtime[case_question][group])) interpolation_through_time=[] time=np.arange(4,len(impulse_array)+4,4) ## starting from 1 second; 4 timesteps = 1 second time_list=time/4 #timesteps >> seconds percenttime_bins=np.arange(.1,1.1,.1) initial_interpoltion.append(impulse_linear_interpolation(sum(impulse_array[4:12]))) ##initial interpolation from 1-3 seconds for t in range(len(percenttime_bins)-1): timestep1=int(percenttime_bins[t]*(len(impulse_array)) -1) #lower bin timestep2=int(percenttime_bins[t+1]*(len(impulse_array)) -1) #upper bin time_impulse=sum(impulse_array[timestep1:timestep2]) #start at one second #change lower index to 4 and upper index timestep2 if wanting cumulative interp over time time_interpolation=impulse_linear_interpolation(time_impulse) interpolation_through_time.append(time_interpolation) plt.plot(percenttime_bins[1:],interpolation_through_time,marker='o',label='Group %s'%(group+1),color='C%s'%i) plt.xlabel('Percent Time',size=14) plt.ylabel('Interpolation',size=14) plt.title('Case Study: Interpolation Through Time',size=16) plt.show() ######## CASE STUDY on INDIVIDUALS: ####### surveyavg=np.mean(crowd_avg_byquestion[case_question]) # survey average on this question (10 groups) swarmavg=np.mean(All_interpolations[case_question]) #swarm interpolation on this question (10 groups) swarm_init=np.array(swarm_initial_question[case_question]) #initial individuals' answers on this question survey_init=np.array(survey_initial_question[case_question]) #survey individuals' answers on this question final_interp=np.array(swarm_final_interp_question[case_question]) #final individuals' interpolations on this question print('Case Question: Initial vs Survey by Individuals pvalue = %.6f' %(stats.ttest_rel(swarm_init,survey_init)[1])) print('Case Question: Initial vs Final by Individuals pvalue = %.6f' %(stats.ttest_rel(swarm_init,final_interp)[1])) print('Case Question: Survey vs Final by Individuals pvalue = %.6f' %(stats.ttest_rel(survey_init,final_interp)[1])) ### making dictionary of faction support frequency ### countsvals=(dict(pd.value_counts(survey_init))) countsvals2=dict(pd.value_counts(swarm_init)) values1=[] for i in list(countsvals.keys()): values1.append(i+.15) #offsetting bins to graph side by side values2=[] for i in list(countsvals2.keys()): values2.append(i-.15) #offsetting bins to graph side by side frac_vals=[] for i in range(len(countsvals.values())): frac_vals.append( list(countsvals.values())[i]/(sum(countsvals.values()))) frac_vals_2=[] for i in range(len(countsvals2.values())): frac_vals_2.append(list(countsvals2.values())[i]/(sum(countsvals2.values()))) #### 
### Graph of Individuals' Survey, Swarm Initial, and Swarm Final Interpolation ###
plt.title('Case Question: Individuals Answers')
plt.bar(values1, frac_vals, width=.3, color='C0', alpha=.4,
        label='Survey: Mean = %.1f, std = %.2f' % (np.mean(survey_init), np.std(survey_init)))
plt.bar(values2, frac_vals_2, width=.3, color='C1', alpha=.4,
        label='Swarm Initial: Mean = %.1f, std = %.2f' % (np.mean(swarm_init), np.std(swarm_init)))
weights = np.ones_like(final_interp) / float(len(final_interp))
plt.hist(final_interp,
         label='Swarm Final: Mean = %.1f, std = %.2f' % (np.mean(final_interp), np.std(final_interp)),
         color='C2', bins=np.arange(1, 5, .3), alpha=.4, weights=weights)
plt.xlabel('Answer')
plt.ylabel('Frequency')
plt.legend()
plt.show()

### individual mean (mean diff, CI) of swarm initial vs swarm final ###
diff = np.array(final_interp) - np.array(swarm_init)
print('Case Question: Mean Final - Initial Mean Difference = %.3f , p = %.3f'
      % (np.mean(diff), stats.ttest_rel(final_interp, swarm_init)[1]))

#### bootstrapped individual standard deviation (mean diff, CI) of swarm initial vs swarm final ###
bootstrapped = bootstrap(range(len(swarm_init)), 1000, len(swarm_init))  # bootstrapping individuals
init_std = []
final_std = []
for i in range(len(bootstrapped)):
    init_std.append(np.std(list(swarm_init[bootstrapped[i]])))     # std of individuals initially
    final_std.append(np.std(list(final_interp[bootstrapped[i]])))  # std of individuals at the end
p_val = stats.ttest_rel(init_std, final_std)[1]
print('Case Question: Mean Final - Initial Standard Deviation Difference = %.3f, p= %.3f'
      % (np.mean(np.array(final_std) - np.array(init_std)), p_val))