def run_demographics(f):
    """Print and plot gender/age demographics for the 'Day 3' participants.

    The CSV at ``f`` is read with its first row skipped, restricted to the
    rows whose ``'Day 3'`` column equals ``'y'``, and summarised twice: once
    for all remaining rows and once for the rows with ``Age <= 40``.  For
    each group the total / female / male counts are printed, followed by a
    gender count plot and one age distribution plot per gender.

    Parameters
    ----------
    f : str or file-like
        Passed straight to ``pandas.read_csv``; must provide the columns
        ``DB_ID``, ``Age``, ``Gender`` and ``Day 3``.
    """
    df = pd.read_csv(f, skiprows=1, usecols=['DB_ID', 'Age', 'Gender', 'Day 3'])
    # Work on an independent copy so the normalisation below is a plain
    # column assignment instead of a chained write into a filtered view.
    df = df[df['Day 3'] == 'y'].copy()
    # Only upper-case 'M' / 'F' are folded to lower case; anything else
    # (including missing values) is left untouched.
    df['Gender'] = df['Gender'].replace({'M': 'm', 'F': 'f'})

    cohorts = (
        ('\n#### overall cohort ####\n', df),
        ('\n\n#### young cohort ####\n', df[df['Age'] <= 40]),
    )
    for header, cohort in cohorts:
        females = cohort[cohort['Gender'] == 'f']
        males = cohort[cohort['Gender'] == 'm']

        print(header)
        print('N = ' + str(len(cohort)))
        print('N female = ' + str(len(females)))
        print('N male = ' + str(len(males)))
        # Keyword form: newer seaborn treats a positional argument as `data`.
        sns.countplot(x=cohort['Gender'])
        plt.show()
        print('age distribution - female:')
        sns.distplot(females['Age'])
        plt.show()
        print('age distribution - male:')
        sns.distplot(males['Age'])
        plt.show()
def inspect_dataset(train_data, test_data):
    """
        Inspect the data sets.

        Prints the number of records in each set and shows, side by side,
        a count plot of the 'price_range' column for the training and the
        test data. Both panels share one y axis.
    """
    print('\n===================== 数据查看 =====================')
    print('训练集有{}条记录。'.format(len(train_data)))
    print('测试集有{}条记录。'.format(len(test_data)))

    # Visualise how many records fall into each category.
    plt.figure(figsize=(10, 5))

    # (data, panel title): training set on the left, test set on the right.
    panels = ((train_data, '训练集'), (test_data, '测试集'))
    first_axis = None
    for position, (frame, heading) in enumerate(panels, start=1):
        if first_axis is None:
            first_axis = plt.subplot(1, 2, position)
        else:
            # Later panels reuse the y axis of the first one.
            plt.subplot(1, 2, position, sharey=first_axis)
        sns.countplot(x='price_range', data=frame)
        plt.title(heading)
        plt.xticks(rotation='vertical')
        plt.xlabel('价格等级')
        plt.ylabel('数量')

    plt.tight_layout()
    plt.show()
def run_demographics(df):
    """Print and plot gender/age demographics overall and per survey day.

    The overall section uses every row of ``df`` and the ``'age day 3'``
    column for the age distributions.  A "young cohort" section
    (age <= 40) is then produced for each age column listed below.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``'gender'`` column (the printed labels treat 1 as female
        and 2 as male) and the age columns iterated over below.
    """
    is_female = df['gender'] == 1
    is_male = df['gender'] == 2

    print('\n#### overall cohort, at time of scanning ####\n')
    print('N = ' + str(len(df)))
    print('N female = ' + str(len(df[is_female])))
    print('N male = ' + str(len(df[is_male])))
    # Keyword form: newer seaborn treats a positional argument as `data`.
    sns.countplot(x=df['gender'])
    plt.show()
    print('age distribution - female:')
    sns.distplot(df['age day 3'][is_female].dropna())
    plt.show()
    print('age distribution - male:')
    sns.distplot(df['age day 3'][is_male].dropna())
    plt.show()
    print('\n\n')

    age_columns = ['age day 1', 'age day 2', 'age day 3', 'age day 4',
                   'age day 5a', 'age day 5b', 'age day 6', 'age LEMON']
    for survey in age_columns:
        # Rows with a missing age compare False and so drop out here.
        is_young = df[survey] <= 40

        print('\n\n\n#### young cohort, %s ####\n' % survey)
        print('N = ' + str(len(df[is_young])))
        print('N female = ' + str(len(df[is_young & is_female])))
        print('N male = ' + str(len(df[is_young & is_male])))
        sns.countplot(x=df[is_young]['gender'])
        plt.show()
        print('age distribution - female:')
        sns.distplot(df[survey][is_young & is_female].dropna(), bins=15)
        plt.show()
        print('age distribution - male:')
        sns.distplot(df[survey][is_young & is_male].dropna(), bins=15)
        plt.show()
def run_demographics(df):
    """Print and plot gender/age demographics overall and per survey.

    The overall section uses every row of ``df`` and the ``'age_C'`` column
    for the age distributions.  A "young cohort" section (age <= 40) is
    then produced for each of the surveys A, B, C, F and G, using the
    matching ``'age_<survey>'`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``'gender'`` column (the printed labels treat 1 as female
        and 2 as male) and the ``age_A`` ... ``age_G`` columns used below.
    """
    is_female = df['gender'] == 1
    is_male = df['gender'] == 2

    print('\n#### overall cohort, at time of scanning ####\n')
    print('N = ' + str(len(df)))
    print('N female = ' + str(len(df[is_female])))
    print('N male = ' + str(len(df[is_male])))
    # Keyword form: newer seaborn treats a positional argument as `data`.
    sns.countplot(x=df['gender'])
    plt.show()
    print('age distribution - female:')
    sns.distplot(df['age_C'][is_female].dropna())
    plt.show()
    print('age distribution - male:')
    sns.distplot(df['age_C'][is_male].dropna())
    plt.show()

    for survey in ['A', 'B', 'C', 'F', 'G']:
        age_column = 'age_%s' % survey
        # Rows with a missing age compare False and so drop out here.
        is_young = df[age_column] <= 40

        print('\n\n#### young cohort, Survey %s ####\n' % survey)
        print('N = ' + str(len(df[is_young])))
        print('N female = ' + str(len(df[is_young & is_female])))
        print('N male = ' + str(len(df[is_young & is_male])))
        sns.countplot(x=df[is_young]['gender'])
        plt.show()
        print('age distribution - female:')
        sns.distplot(df[age_column][is_young & is_female].dropna(), bins=15)
        plt.show()
        print('age distribution - male:')
        sns.distplot(df[age_column][is_young & is_male].dropna(), bins=15)
        plt.show()
def inspect_dataset(train_data, test_data):
    """
        Inspect the data sets.

        Prints the number of records in each set and shows, side by side,
        a count plot of the 'text_type' column for both sets. The two
        panels share one y axis.

        Parameters:
            - train_data    training data
            - test_data     test data
    """
    print('\n===================== 数据查看 =====================')
    print('训练集有{}条记录。'.format(len(train_data)))
    print('测试集有{}条记录。'.format(len(test_data)))

    # Visualise how many records fall into each category.
    plt.figure(figsize=(10, 5))

    # (data, panel title): training set on the left, test set on the right.
    panels = ((train_data, 'Training Data'), (test_data, 'Test Data'))
    first_axis = None
    for position, (frame, heading) in enumerate(panels, start=1):
        if first_axis is None:
            first_axis = plt.subplot(1, 2, position)
        else:
            # Later panels reuse the y axis of the first one.
            plt.subplot(1, 2, position, sharey=first_axis)
        sns.countplot(x='text_type', data=frame)
        plt.title(heading)
        plt.xlabel('Type')
        plt.ylabel('Count')

    plt.tight_layout()
    plt.show()
def bar_plot(data, col, hue=None, file_name=None):
    """Draw a count plot of ``col`` and pass its rectangles to ``autolabel``.

    Parameters
    ----------
    data : pandas.DataFrame
        Source data; it is sorted by ``col`` before plotting.
    col : str
        Column whose values are counted (plotted on the x axis).
    hue : str, optional
        Column used to split every bar into coloured sub-bars.
    file_name : optional
        Accepted for interface compatibility; this function does not use it.
    """
    # DataFrame.sort() no longer exists in pandas; sort_values() replaces it.
    # `x=` is explicit because newer seaborn reads a positional argument as
    # `data`, which would collide with the `data=` keyword.
    sns.countplot(x=col, hue=hue, data=data.sort_values(col))
    sns.despine(left=True)

    subplots = [child for child in plt.gcf().get_children()
                if isinstance(child, matplotlib.axes.Subplot)]
    for subplot in subplots:
        rectangles = [child for child in subplot.get_children()
                      if isinstance(child, matplotlib.patches.Rectangle)]
        # NOTE(review): this also collects non-bar rectangles such as the
        # axes background patch -- confirm autolabel copes with those.
        autolabel(rectangles)
def make_bar():
    """Plot yes/no response counts per age group for 1000 random samples.

    Ages are binned into 5-year groups between 10 and 50, and the count
    plot shows one horizontal bar per response and age group.
    """
    # presumably numpy.random.randint (three-argument form) and
    # random.choice -- confirm against the file's imports.
    ages = randint(10, 50, 1000)
    responses = [choice(['Yes', 'No']) for _ in range(1000)]
    frame = pd.DataFrame(data={"age": ages, "response": responses})

    bin_edges = list(range(10, 51, 5))
    frame['age_group'] = pd.cut(frame.age, bins=bin_edges, include_lowest=True)

    sns.countplot(y='response', hue='age_group', data=frame, palette="Greens_d")
def specificAminoAcidPDACanalysis(mostCommonPDACgeneOnly, variant='U2AF1'):
    """Count and plot the amino-acid changes of one gene across PDAC patients.

    Parameters
    ----------
    mostCommonPDACgeneOnly
        Not used by this function; kept so existing callers keep working.
    variant : str, optional
        Gene of interest (matched against the ``gene`` column of each
        patient table).  Defaults to ``'U2AF1'``.

    Returns
    -------
    tuple
        ``(variant, patientIDwithVariant, patientsWithMuts)`` where
        ``patientIDwithVariant`` lists the patient IDs with at least one
        entry for ``variant`` and ``patientsWithMuts`` holds, in the same
        order, each of those patients' ``amino_acid_change`` values.

    Notes
    -----
    Every patient listed in the module-level ``noPDAC`` is deleted from the
    module-level ``varsInAA`` mapping, so ``varsInAA`` holds PDAC patients
    only once this function has run.
    """
    # Local import: ``sns.plt`` was removed from seaborn.
    import matplotlib.pyplot as plt

    # Drop the patients that are not PDAC (mutates varsInAA in place).
    for patient in noPDAC:
        if patient in varsInAA:
            del varsInAA[patient]

    # One list of changes per patient that carries the gene; the patient ID
    # is recorded alongside so both lists stay index-aligned.
    patientIDwithVariant = []
    patientsWithMuts = []
    for patient_id in varsInAA:
        calls = varsInAA[patient_id]
        changes = list(calls.loc[calls.gene == variant]['amino_acid_change'])
        if changes:
            patientIDwithVariant.append(patient_id)
            patientsWithMuts.append(changes)

    print(patientsWithMuts)
    print('The number of PDAC patients with at least one variant/mutation in '
          + str(variant) + ' is ' + str(len(patientIDwithVariant)))

    # Flatten to a single list so it can become a one-column DataFrame.
    amino_acid_changes = [change
                          for changes in patientsWithMuts
                          for change in changes]

    # Most frequent change first; this is the bar order used in the plot.
    order_in_graph = [change for change, _count
                      in Counter(amino_acid_changes).most_common()]

    # amino_acid_changes itself is unordered; countplot gets the order
    # explicitly through `order`.
    headers = ['non-synonymous amino acid changes']
    variant_dataframe = pandas.DataFrame(amino_acid_changes, columns=headers)
    sns.countplot(x='non-synonymous amino acid changes', data=variant_dataframe,
                  order=order_in_graph)
    plt.title('Variants in ' + str(variant) + ' across '
              + str(len(patientsWithMuts)) + ' PDAC patients')
    plt.show()
    return variant, patientIDwithVariant, patientsWithMuts
def make_figures(df, outdir):
    """Save a count plot of the 'OS' column as ``graph_sub_by_os.png``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``'OS'`` column; one bar is drawn per distinct value.
    outdir : str
        Directory the PNG is written to.
    """
    import matplotlib
    matplotlib.use('Agg')  # avoid using the X backend for saving figures
    import pylab as pl
    import seaborn as sns

    fig = pl.figure()
    try:
        sns.countplot(y='OS', data=df)
        pl.xlabel('Number of submissions')
        pl.ylabel('Primary operating system')
        # presumably `opj` is an alias of os.path.join -- confirm.
        pl.savefig(opj(outdir, 'graph_sub_by_os.png'))
    finally:
        # Release the figure so repeated calls do not pile up open figures.
        pl.close(fig)
def weather_distribution(self):
    """Load the weather data, print a summary and draw a count plot.

    The frame returned by ``self.load_weatherdf`` for the training
    directory is stored on ``self.gapdf``; the ``describe()`` summary of
    its ``'weather'`` column is printed and a count plot of that column is
    drawn on the current axes.  The plot is not shown or saved here.
    """
    data_dir = g_singletonDataFilePath.getTrainDir()
    self.gapdf = self.load_weatherdf(data_dir)
    print(self.gapdf['weather'].describe())
    sns.countplot(x="weather", data=self.gapdf, palette="Greens_d")
    plt.title('Countplot of Weather')
def plot(self):
    """Export one count plot per 'ID', 'CAT' and 'CONT' column of self.data.

    ``self.data`` is expected to have two column levels; columns are
    selected by the label 'ID', 'CAT' or 'CONT' on level 1.  ID and CAT
    values are reduced to their first character before counting, CONT
    values are counted as they are.  Missing values are skipped.  Each plot
    is written to ``exports/<column name>.png``.
    """
    print('\n','Take a moment to review the plots in the export folder. These show the diversity of information found within each data column.')
    print('ID and Category Columns have been reduced to their first character.')
    print('Sensitive information has been converted to numbers.')
    print('At this time, this program does not support plots of Time categories', '\n')

    def is_missing(value):
        # NaN is the only float that differs from itself.  An identity test
        # against np.nan misses NaNs that are not that exact object, e.g.
        # the ones produced when iterating a float column.
        return isinstance(value, float) and value != value

    def export_countplots(kind, first_char_only):
        # Draw and save one count plot for every column of the given kind.
        columns = self.data.xs(kind, axis=1, level=1)
        for name in columns.columns:
            values = [cell[0] if first_char_only else cell
                      for cell in columns[name]
                      if not is_missing(cell)]
            axes = sns.countplot(x=np.array(values))
            figure = axes.get_figure()
            figure.savefig("exports/" + name + ".png")
            # Start the next column on empty axes; otherwise every export
            # would also contain the bars of all columns drawn before it.
            figure.clf()

    export_countplots('ID', True)
    export_countplots('CAT', True)
    export_countplots('CONT', False)
def plot(data):
    """Save the summary panels and the pair grid for the interspot data.

    Nothing happens for empty input.  Otherwise a 2x2 figure (three mean
    bar plots plus the per-dataset sample count) is written to
    ``parB_interspot/parB_interspot.pdf`` and a pair grid of v / growth /
    elongation, coloured by dataset, to
    ``parB_interspot/parB_interspot_data.pdf``.
    """
    if not len(data):
        return

    plot_order = [
        "delParA",
        "delParAB",
        "WT ParAB int",
        "WT ParB int",
        "WT episomal ParB",
    ]

    # (subplot position, column, axis label) for the three bar plots, in
    # the order they are drawn.
    bar_panels = (
        (221, "v", r"Mean separation velocity (\si{\micro\metre\per\hour})"),
        (222, "elongation", r"Mean elongation rate (\si{\micro\metre\per\hour})"),
        (224, "growth", r"Mean growth rate (\si{\per\hour})"),
    )

    plt.figure(figsize=(8, 8))
    for position, column, label in bar_panels:
        axis = plt.subplot(position)
        sns.barplot(x="dataset", y=column, data=data, order=plot_order)
        _fmt_barplot(axis, label)

    # The remaining panel shows how many samples each dataset contributes.
    axis = plt.subplot(223)
    sns.countplot(x="dataset", data=data, order=plot_order)
    _fmt_barplot(axis, "n")

    plt.tight_layout()
    plt.savefig("parB_interspot/parB_interspot.pdf")

    grid = sns.PairGrid(data, vars=["v", "growth", "elongation"], hue="dataset")
    grid = grid.map_diag(plt.hist)
    grid = grid.map_offdiag(plt.scatter)
    grid = grid.add_legend(bbox_to_anchor=(1.2, 0.55))
    grid.savefig("parB_interspot/parB_interspot_data.pdf")
def composite_qc(df_orig, size=(16, 12)):
    """ Plot composite QC figures.

    Six panels are drawn on figure 1: age and mean-coverage histograms and
    count plots for gender, ethnicity, sequencing chemistry and cohort.
    Cohort names on the last panel are anonymised and every panel uses its
    x label as its title.  The figure is neither shown nor saved here.
    """
    display_names = {
        "hli_calc_age_sample_taken": "Age",
        "hli_calc_gender": "Gender",
        "eth7_max": "Ethnicity",
        "MeanCoverage": "Mean coverage",
        "Chemistry": "Sequencing chemistry",
        "Release Client": "Cohort",
    }
    df = df_orig.rename(columns=display_names)

    fig = plt.figure(1, size)
    # 2 x 7 grid; each entry is (row, first column, column span).
    layout = ((0, 0, 2), (0, 2, 2), (0, 4, 3),
              (1, 0, 2), (1, 2, 2), (1, 4, 3))
    ax1, ax2, ax3, ax4, ax5, ax6 = [
        plt.subplot2grid((2, 7), (row, col), rowspan=1, colspan=span)
        for row, col, span in layout
    ]

    sns.distplot(df["Age"].dropna(), kde=False, ax=ax1)
    sns.countplot(x="Gender", data=df, ax=ax2)
    sns.countplot(x="Ethnicity", data=df, ax=ax3,
                  order=df['Ethnicity'].value_counts().index)
    sns.distplot(df["Mean coverage"].dropna(), kde=False, ax=ax4)
    ax4.set_xlim(0, 100)
    sns.countplot(x="Sequencing chemistry", data=df, ax=ax5)
    sns.countplot(x="Cohort", data=df, ax=ax6,
                  order=df['Cohort'].value_counts().index)

    # Anonymize the cohorts: "Spector" gets a fixed alias, "Health Nucleus"
    # keeps its label, every other cohort becomes C<position>.
    anonymized = []
    for position, label in enumerate(ax6.get_xticklabels(), start=1):
        text = label.get_text()
        if text == "Spector":
            anonymized.append("TwinsUK")
        elif text != "Health Nucleus":
            anonymized.append("C{}".format(position))
        else:
            anonymized.append(label)
    ax6.set_xticklabels(anonymized)
    ax6.set_xticklabels(ax6.get_xticklabels(), ha="right", rotation=30)

    # Promote each x label to the panel title.
    for panel in (ax1, ax2, ax3, ax4, ax5, ax6):
        panel.set_title(panel.get_xlabel())
        panel.set_xlabel("")

    plt.tight_layout()

    # Invisible full-figure axes that carry the panel letters.
    root = fig.add_axes((0, 0, 1, 1))
    labels = ((.02, .96, "A"),
              (.3, .96, "B"),
              (.6, .96, "C"),
              (.02, .52, "D"),
              (.3, .52, "E"),
              (.6, .52, "F"))
    panel_labels(root, labels)
    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()
def plot_and_show_crime_type_histogram(dataframe):
    """
    Displays one horizontal bar per crime type whose length is the number
    of rows of that type.

    :param dataframe: The data to plot. It needs a column named "Category".
    :result: the plot is shown via ``plt.show()``.
    """
    axis_label_size = 26

    sns.countplot(y="Category", data=dataframe, palette="Greens_d")
    plt.suptitle("Crime Type Instances", fontsize=30)
    plt.ylabel("Type of Crime", fontsize=axis_label_size)
    plt.xlabel("Number of Crimes Committed", fontsize=axis_label_size)
    plt.show()
def plothist(df, classes='target', ax=None, col=None, order=None, title=""):
    """Draw a count plot and annotate each bar with its share of all rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot.
    classes : str, optional
        Column plotted on the x axis.
    ax : matplotlib axes, optional
        Axes to draw on; a new figure with one subplot is created when
        omitted.
    col : str, optional
        Column used as ``hue``.
    order : list, optional
        Order of the categories on the x axis.
    title : str, optional
        Axes title.

    Returns
    -------
    The axes that were drawn on.
    """
    if ax is None:
        fig = plt.figure()
        ax = fig.add_subplot(1, 1, 1)

    # hue=None and order=None are seaborn's own defaults, so one call covers
    # every combination of `col` and `order`.
    sns.countplot(x=classes, hue=col, data=df, ax=ax, order=order)

    total = df.shape[0]
    tallest = 0
    for patch in ax.patches:
        corners = patch.get_bbox().get_points()
        height = corners[1, 1]
        if height != height:
            # NaN height (e.g. a placeholder bar for an empty hue/category
            # combination): there is nothing to annotate.
            continue
        if tallest < height:
            tallest = height
        ax.annotate('{:.1f} %'.format(100. * height / total),
                    (corners[:, 0].mean(), height),
                    ha='center', va='bottom', fontsize=12)

    # Leave head-room for the annotations; without any bar the limits would
    # collapse to (0, 0), so keep matplotlib's defaults in that case.
    if tallest > 0:
        ax.set_ylim((0, 1.15 * tallest))
    ax.set_title(title)
    return ax
def show_variable_info(col, quiet = False):
    """Report missing values of a Series and plot its distribution.

    float64 columns get a distribution plot, columns with fewer than 16
    distinct values a count plot, and everything else a bar plot of the
    first 16 entries of ``value_counts()``.

    Parameters
    ----------
    col : pandas.Series
        Variable to describe.
    quiet : bool, optional
        When true, the "Missing values" line is not printed.

    Notes
    -----
    NOTE(review): the source's indentation was not recoverable; this
    version assumes ``quiet`` only silences the missing-value line and that
    the plot is always drawn -- confirm against the intended behaviour.
    """
    # Needed by the plots below, so it is computed whatever `quiet` is.
    notnull = col.notnull()

    if not quiet:
        missing_cnt = len(col) - notnull.sum()
        # Scale to a real percentage (the label says "%"); an empty column
        # is reported as 0 % instead of dividing by zero.
        missing_pct = 100.0 * missing_cnt / len(col) if len(col) else 0.0
        print(col.name, "| Missing values: {0} ({1:0.2f} %)".format(missing_cnt, missing_pct))

    if col.dtype == 'float64':
        sns.distplot(col[notnull])
    elif len(col.unique()) < 16:
        # Keyword form: newer seaborn treats a positional argument as `data`.
        sns.countplot(x=col[notnull])
    else:
        print('Showing only first 16 levels from', len(col.unique()))
        col_cut = col.value_counts().iloc[:16]
        ax = plt.axes()
        sns.barplot(x = col_cut.index, y = col_cut, ax = ax)
        ax.set_ylabel('Count')
def sex_analyze(is_plot=True):
    '''
    Gender analysis.

    Derives a 'Person' category from 'Age' and 'Sex' via ``get_persion``,
    replaces 'Sex' with the dummy columns 'Child' and 'Female' in both
    global frames, optionally plots the 'Person' counts and mean survival
    per 'Person', and finally drops 'Person' again.

    :param is_plot: draw the two plots when true; no figure is created
                    otherwise.
    :return: None; the globals ``titanic_df`` and ``test_df`` are modified.
    '''
    global titanic_df
    global test_df

    titanic_df['Person'] = titanic_df[['Age', 'Sex']].apply(get_persion, axis=1)
    test_df['Person'] = test_df[['Age', 'Sex']].apply(get_persion, axis=1)

    print('-' * 40)
    print(titanic_df.head(n=20))
    print(test_df.head(n=20))

    # Drop 'Sex'.
    titanic_df.drop(['Sex'], axis=1, inplace=True)
    test_df.drop(['Sex'], axis=1, inplace=True)

    # Build dummy vectors from 'Person'.
    # 'Male' is dropped because males have a low survival rate, so that
    # factor contributes little.
    # NOTE(review): the column renaming assumes get_dummies yields exactly
    # three columns in child / female / male order -- confirm.
    person_dummies_titanic = pd.get_dummies(titanic_df['Person'])
    person_dummies_titanic.columns = ['Child', 'Female', 'Male']
    person_dummies_titanic.drop(['Male'], axis=1, inplace=True)

    person_dummies_test = pd.get_dummies(test_df['Person'])
    person_dummies_test.columns = ['Child', 'Female', 'Male']
    person_dummies_test.drop(['Male'], axis=1, inplace=True)

    titanic_df = titanic_df.join(person_dummies_titanic)
    test_df = test_df.join(person_dummies_test)

    if is_plot:
        # The figure is only created when something is drawn on it.
        fig, (axis1, axis2) = plt.subplots(1, 2, figsize=(10, 5))

        # Number of passengers per 'Person' category.
        sns.countplot(x='Person', data=titanic_df, ax=axis1)

        # Survival rate per 'Person' category.
        person_perc = titanic_df[['Person', 'Survived']].groupby('Person', as_index=False).mean()
        sns.barplot(x='Person', y='Survived', data=person_perc, ax=axis2,
                    order=['male', 'female', 'child'])

    # 'Person' is no longer needed.
    titanic_df.drop(['Person'], axis=1, inplace=True)
    test_df.drop(['Person'], axis=1, inplace=True)
def plot_chemical_trajectory(self, environment, filename):
    """
    Plot the trajectory through chemical space.

    The left panel shows the visited chemical state per iteration, the
    right panel how often each state was visited.  Both are written to a
    single PDF page.

    Parameters
    ----------
    environment : str
        the name of the environment for which the chemical space trajectory is desired
    filename : str
        path of the PDF file to write
    """
    chemical_state_trajectory = self.extract_state_trajectory(environment)
    visited_states = list(set(chemical_state_trajectory))

    # Position of every state in visited_states, looked up once instead of
    # calling list.index() for every trajectory entry.
    state_position = {state: idx for idx, state in enumerate(visited_states)}
    state_trajectory = np.array(
        [state_position[state] for state in chemical_state_trajectory],
        dtype=float)

    with PdfPages(filename) as pdf:
        sns.set(font_scale=2)
        fig = plt.figure(figsize=(28, 12))
        plt.subplot2grid((1, 2), (0, 0))
        # x/y are passed by keyword: newer seaborn no longer accepts them
        # positionally.
        sns.scatterplot(x=np.arange(len(state_trajectory)), y=state_trajectory)
        plt.yticks(np.arange(len(visited_states)), visited_states)
        plt.title("Trajectory through chemical space in {}".format(environment))
        plt.xlabel("iteration")
        plt.ylabel("chemical state")
        plt.tight_layout()
        plt.subplot2grid((1, 2), (0, 1))
        sns.countplot(y=state_trajectory)
        pdf.savefig(fig)
        plt.close(fig)
def explor(file, target):
    """Render the exploration images 'expl.png' and 'pairplot.png'.

    Existing files in ``settings.MEDIA_ROOT`` whose names start with "expl"
    or "pairplot" are removed first.  'expl.png' shows the target column: a
    count plot when its dtype is neither int64 nor float64, a histogram
    otherwise.  'pairplot.png' shows a pair plot coloured by the target,
    or an explanatory message when there are more than 10 columns or no
    int64/float64 column at all.

    Parameters
    ----------
    file
        Object exposing ``dataset`` (a DataFrame) and ``colNames``.
    target : str
        Name of the target column.
    """
    import matplotlib
    matplotlib.use('Agg')  # headless backend, selected once for both images
    import matplotlib.pyplot as plt
    import os
    import seaborn as sns

    # Delete the old png
    outPutPath = settings.MEDIA_ROOT
    for item in os.listdir(outPutPath):
        tempPath = os.path.join(outPutPath, item)
        if not os.path.isdir(tempPath) and item.startswith(("expl", "pairplot")):
            os.remove(tempPath)

    explPathOne = os.path.join(outPutPath, 'expl.png')
    explPathTwo = os.path.join(outPutPath, 'pairplot.png')

    sns.set(style="whitegrid", color_codes=True)
    target_dtype = file.dataset[target].dtype
    if target_dtype != "int64" and target_dtype != "float64":
        g = sns.countplot(x=target, data=file.dataset, palette="Greens_d")
        g.figure.subplots_adjust(bottom=0.4)
        for tick in g.xaxis.get_major_ticks():
            # Tick.label is gone from current matplotlib; label1 is the same
            # tick label and exists in old and new versions.
            tick.label1.set_fontsize(8)
            tick.label1.set_rotation(90)
        plt.tight_layout()
    else:
        file.dataset.hist(column=target)
    plt.savefig(explPathOne)
    plt.close('all')

    datatypes = {str(file.dataset[col].dtype) for col in file.colNames}
    too_many_columns = len(file.colNames) > 10
    no_numeric_column = 'int64' not in datatypes and 'float64' not in datatypes

    if too_many_columns or no_numeric_column:
        # No pair plot possible: render an explanatory message instead.
        fig = plt.figure()
        ax = fig.add_subplot(111)
        fig.subplots_adjust(top=0.85)
        ax.text(0.2, 0.8, 'Sometimes, you just cannot get what you want.',
                style='italic', bbox={'facecolor':'red', 'alpha':0.5, 'pad':10})
        ax.text(0.2, 0.6, 'There are two possible reasons:',
                style='italic', bbox={'facecolor':'blue', 'alpha':0.5, 'pad':5})
        ax.text(0.2, 0.5, '1. You have too many columns; ',
                style='italic', bbox={'facecolor':'blue', 'alpha':0.5, 'pad':5})
        ax.text(0.2, 0.4, '2. All your columns are categorial variables',
                style='italic', bbox={'facecolor':'blue', 'alpha':0.5, 'pad':5})
        ax.text(0.2, 0.3, 'Unhappy? Go to use other website, please!',
                style='italic', bbox={'facecolor':'yellow','alpha':0.5, 'pad':5})
    else:
        sns.set()
        g = sns.pairplot(file.dataset, hue=target)
        g.fig.subplots_adjust(top=0.8, right=0.8)
        g.fig.suptitle('Pair Plots of All Numberic Variables',
                       fontsize=20, color="r", alpha=0.5)
    plt.savefig(explPathTwo)
    plt.close('all')
def univariate(self):
    """Save one univariate plot per column of ``self.train``.

    Numerical columns (``self.numericals``) get a histogram named
    ``<column>_histogram.png``; categorical columns (``self.categoricals``)
    with fewer than 10 distinct values get a count plot named
    ``<column>_bar.png``.  Files go to
    ``<self.directory>/<self.univariate_dir>``, created when missing.
    """
    path_to_uni_plots = os.path.join(self.directory, self.univariate_dir)
    if not os.path.exists(path_to_uni_plots):
        os.makedirs(path_to_uni_plots)

    for num_col in self.numericals:
        values = self.train[num_col].dropna()
        plt.figure(figsize=(10, 8))
        sns.distplot(values, kde=False, rug=False)
        plt.savefig(os.path.join(path_to_uni_plots, num_col + '_histogram.png'),
                    bbox_inches='tight')
        # One figure per column: close it, or open figures pile up.
        plt.close()

    for cat_col in self.categoricals:
        if len(self.train[cat_col].unique()) < 10:
            plt.figure(figsize=(10, 8))
            sns.countplot(x=cat_col, data=self.train)
            plt.savefig(os.path.join(path_to_uni_plots, cat_col + '_bar.png'),
                        bbox_inches='tight')
            plt.close()
def plotExp(self, exp, myData):
    """Save a count plot of one column and advance the global plot counter.

    The column is ``myData[exp[1]['xaxis']]``; the image is written to
    ``plots/static/<settings.count>.png`` and ``settings.count`` is then
    incremented by one.
    """
    plt.figure()
    # Keyword form: newer seaborn treats a positional argument as `data`.
    sns.countplot(x=myData[exp[1]['xaxis']])
    plt.savefig("plots/static/%d.png" % settings.count)
    plt.clf()
    plt.close()
    settings.count = settings.count + 1
def basicplots(df_copy):
    """Draw three overview plots and return ``df_copy`` without outliers.

    The plots (counts per 'DayOfWeek', counts per 'PdDistrict', and Y
    against X) are drawn from the names ``df`` and ``week``, which are not
    parameters of this function.
    NOTE(review): presumably module-level globals -- confirm that plotting
    ``df`` rather than ``df_copy`` is intended.

    Parameters
    ----------
    df_copy : pandas.DataFrame
        Frame with numeric ``X`` and ``Y`` columns to filter.

    Returns
    -------
    pandas.DataFrame or None
        The rows of ``df_copy`` whose X and Y both lie within three
        standard deviations of their own column mean; ``None`` when an
        error occurred (the error is printed).
    """
    try:
        # make a vertical barplot of crime counts per day of week
        plt.figure(1)
        sns.countplot(x="DayOfWeek", data=df, palette="Greens_d", order=week)

        # make a horizontal barplot of crime counts per PdDistrict
        plt.figure(2)
        sns.countplot(y="PdDistrict", data=df, palette="Greens_d")

        # plot X and Y coordinates
        plt.figure(3)
        plt.plot(df['X'], df['Y'], 'o')

        # remove outliers: each coordinate is standardised with its OWN
        # mean and standard deviation, and the selection is kept.
        x_inlier = ((df_copy.X - df_copy.X.mean()) / df_copy.X.std()).abs() < 3
        y_inlier = ((df_copy.Y - df_copy.Y.mean()) / df_copy.Y.std()).abs() < 3
        filtered = df_copy[x_inlier & y_inlier]

        plt.show()
        return filtered
    except Exception as e:
        # Best-effort helper: report the problem instead of raising.
        print(e)
def family_analyze(is_plot=True):
    """Merge 'Parch' and 'SibSp' into a binary 'Family' feature.

    In both ``titanic_df`` and ``test_df`` the new 'Family' column is 1
    for passengers with at least one relative aboard and 0 otherwise; the
    'Parch' and 'SibSp' columns are then dropped.  When ``is_plot`` is true
    the passenger count and the mean survival per 'Family' value are
    plotted side by side.

    :param is_plot: draw the two plots when true; no figure is created
                    otherwise.
    """
    # Combine parents/children and siblings/spouses into a single feature.
    titanic_df['Family'] = titanic_df['Parch'] + titanic_df['SibSp']
    print(titanic_df['Family'].head(n=20))

    family_no = titanic_df['Family'].loc[titanic_df['Family'] > 0]
    print(type(family_no))
    print(family_no.head())

    family_no_1 = titanic_df['Family'][titanic_df['Family'] > 0]
    print(type(family_no_1))
    print(family_no_1.head())

    # With family -> 1; passengers without relatives already hold 0.
    # A single .loc[rows, column] write is used because chained assignment
    # is not guaranteed to reach the original frame.
    titanic_df.loc[titanic_df['Family'] > 0, 'Family'] = 1

    # Same treatment for the test set.
    test_df['Family'] = test_df['Parch'] + test_df['SibSp']
    test_df.loc[test_df['Family'] > 0, 'Family'] = 1

    # Drop 'Parch' and 'SibSp'.
    titanic_df.drop(['Parch', 'SibSp'], axis=1, inplace=True)
    test_df.drop(['Parch', 'SibSp'], axis=1, inplace=True)

    print('-' * 40)
    print(titanic_df.head())
    print(test_df.head())

    if is_plot:
        # Counts and survival rate for Family == 0 and Family == 1.
        fig, (axis1, axis2) = plt.subplots(1, 2, sharex=True, figsize=(10, 5))
        sns.countplot(x='Family', data=titanic_df, order=[0, 1], ax=axis1)

        family_perc = titanic_df[['Family', 'Survived']].groupby('Family', as_index=False).mean()
        sns.barplot(x='Family', y='Survived', data=family_perc, order=[0, 1], ax=axis2)

        # Tick order follows order=[0, 1]: 0 is "alone", 1 is "with family".
        axis1.set_xticklabels([u'孤身一人', u'有家庭'], rotation=0)
def draw_histograms(data, headings, data_set):
    """ Chart relationships between Variables

    One image per entry of ``chart_categories`` is written to
    ``<cwd>/Results/Data Counts/<data_set>/<category>``: a 10-bin histogram
    when the column converts to float, otherwise a count plot of its
    values.  ``chart_categories`` and ``chart_titles`` are not defined here.
    NOTE(review): presumably module-level sequences of equal length --
    confirm.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array with one column per heading.
    headings : numpy.ndarray
        Column names, compared element-wise against each category.
    data_set : str
        Name of the output sub-directory.
    """
    # create a folder for the dataset
    directory = os.path.dirname(os.path.join(os.getcwd(), "Results", "Data Counts", data_set, ""))
    if not os.path.exists(directory):
        os.makedirs(directory)

    # convert to a pandas dataset
    pandas_data = pd.DataFrame(data=data, columns=headings)

    for chart_category, chart_title in zip(chart_categories, chart_titles):
        # get the slice
        index = np.argwhere(headings == chart_category)
        chart_column = data[:, index[0][0]]
        target = os.path.join(os.getcwd(), "Results", "Data Counts", data_set, chart_category)

        plt.figure()
        plt.xlabel(chart_title)
        plt.ylabel("Count")
        plt.title("%s" % chart_title)
        try:
            # Numeric columns become a histogram.  The builtin float is used
            # because the np.float alias no longer exists in NumPy.
            numeric_column = chart_column.astype(float)
            plt.hist(numeric_column, 10, histtype='bar', rwidth=0.8)
        except (ValueError, TypeError):
            # Not convertible (or not binnable, e.g. only NaN): count the
            # distinct values instead.
            sns_plot = sns.countplot(x=chart_category,
                                     data=pandas_data.sort_values(chart_category),
                                     palette="Greens_d")
            sns_plot.figure.autofmt_xdate()
            sns_plot.figure.savefig(target)
        else:
            plt.savefig(target)
        plt.close()
def make_count(series, title):
    '''
    Save a count plot of a pandas Series.

    The title is title-cased; underscores become spaces in the plot
    heading, while the file name 'Count_<Title>.png' keeps them.
    '''
    cased_title = title.title()
    heading = 'Count of ' + cased_title.replace("_", " ")
    out_name = 'Count_' + cased_title + '.png'

    sns.countplot(x=series, palette = "RdBu")
    plt.title(heading)
    plt.savefig(out_name, format='png')
    plt.close()
def isoformCheck(path):
    """Collect TP53 records from tab-separated files and plot them.

    Every file in ``path`` is read after skipping its first (header) line.
    A line is kept when field 11 equals 'TP53' and field 36 is at least
    15; fields 14, 17 and 18 of kept lines are recorded as mutation type,
    mutation and isoform.  The patient ID is the part of the file name
    before the first '-'.  The collected data are printed and a count plot
    of mutation type per isoform is shown.

    Parameters
    ----------
    path : str
        Directory holding the input files.  The process working directory
        is changed to it, as before.
    """
    # Local import: ``sns.plt`` was removed from seaborn.
    import matplotlib.pyplot as plt

    gene = 'TP53'
    patientIso = {}
    noPatIDonlyTups = []

    # List the directory BEFORE changing into it, so a relative `path` is
    # still resolved against the caller's working directory.
    file_names = os.listdir(path)
    os.chdir(path)

    for file_name in file_names:
        # extracts patient ID from file name
        patientID = file_name.split('-', 1)[0]
        with open(file_name) as handle:
            # comment out header line if no header is included in input
            next(handle, None)
            for line in handle:
                fields = line.split('\t')
                # Field 36 is read below, so at least 37 fields are needed.
                if len(fields) < 37:
                    print(fields)
                    break
                if fields[11] == gene and float(fields[36].rstrip('\n')) >= 15:
                    record = (str(fields[14]), str(fields[17]), str(fields[18]))
                    noPatIDonlyTups.append(list(record))
                    patientIso.setdefault(patientID, []).append(record)

    print(patientIso)
    print(len(patientIso))
    print(noPatIDonlyTups)

    # Shorten the long mutation-type label for the plot.
    for row in noPatIDonlyTups:
        if row[0] == 'NON_SYNONYMOUS_CODING':
            row[0] = 'NON_SYN'

    dataframe = pandas.DataFrame(noPatIDonlyTups,
                                 columns=['mutation type', 'mutation', 'isoform'])
    print(dataframe)
    with sns.plotting_context("notebook", font_scale=1.5):
        sns.countplot(y="mutation type", hue="isoform", data=dataframe, palette="Set2")
        plt.show()
def plot_classification_frequency(df, category, file_name, convert_labels = False):
    '''
    Plots the frequency at which labels occur

    INPUT
        df: Pandas DataFrame of the image name and labels
        category: name of the column of df that is counted on the x axis
        file_name: file name the plot is saved under
        convert_labels: when true, the 'level' column of df is first
            replaced by the result of change_labels(df, 'level'); this
            modifies df in place

    OUTPUT
        Image of plot, showing label frequency
    '''
    # The frame passed in is used; the function previously read a
    # module-level `labels` and ignored its own `df` argument.
    if convert_labels:
        df['level'] = change_labels(df, 'level')

    sns.set(style="whitegrid", color_codes=True)
    sns.countplot(x=category, data=df)
    plt.title('Retinopathy vs Frequency')
    plt.savefig(file_name)
def draw_histograms(data, headings, data_set):
    """ Chart relationships between Variables

    One image per chart category is written to
    ``<cwd>/Results/Data Counts/<data_set>/<category>``: a 10-bin histogram
    when the column converts to float, otherwise a count plot of its
    values.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array with one column per heading.
    headings : numpy.ndarray
        Column names, compared element-wise against each category.
    data_set : str
        Name of the output sub-directory.
    """
    chart_categories = [
        'course_grade', 'Assig_1_full_40', 'Assig_2_full_40', 'Assig_3_full_40',
        'proj_1_100', 'proj_2_100', 'proj_3_100', 'final_exam_100',
        'peer_feedback_100', 'birth_country', 'residence_country', 'gender',
        'age', 'primary_language', 'english_fluency', 'time_zone', 'occupation',
        'highest_education', 'expected_hours_spent', 'formal_class_prog_taken',
        'C', 'C#', 'C++', 'Java', 'JavaScript', 'Lisp', 'Objective C', 'Perl',
        'PHP', 'Python', 'Ruby', 'Shell', 'Swift', 'Visual Basic',
        'Other (specify below)', 'years_programming',
        'prior_omscs_classes_completed', 'besides_KBAI_how_many_classes',
        'moocs_completed_outside_OMSCS', 'qtr_proj1_confidence',
        'qtr_proj2_confidence', 'qtr_piazza_opinion',
        'qtr_peerfeedback_opinion', 'qtr_on_piazza', 'qtr_email',
        'qtr_hipchat', 'qrt_gplus', 'qtr_other_chat', 'qtr_phone',
        'qtr_facebook', 'qtr_in_person', 'CS6210_Completed',
        'CS8803_Completed', 'CS6250_Completed', 'CS7641_Completed',
        'CS6300_Completed', 'CS6310_Completed', 'CS4495_Completed',
        'CS6475_Completed', 'CS6505_Completed', 'CS6290_Completed',
        'CS8803_Completed', 'CS6440_Completed', 'mid_proj2_confidence',
        'mid_proj3_confidence', 'mid_piazza_opinion',
        'mid_peerfeedback_opinion', 'mid_on_piazza', 'mid_email',
        'mid_hipchat', 'qrt_gplus', 'mid_other_chat', 'mid_phone',
        'mid_facebook', 'mid_in_person', 'final_proj3_confidence',
        'hours_spent', 'lessons_watched', 'exercises_completed',
        'forum_visit_frequency', 'final_on_piazza', 'final_email',
        'final_hipchat', 'qrt_gplus', 'final_other_chat', 'final_phone',
        'final_facebook', 'final_in_person', 'watch_out_order', 'fall_behind',
        'get_ahead', 'rewatch_full_lesson', 'rewatch_partial_lesson',
        'view_answer_after_1incorrect', 'repeat_exercise_until_correct',
        'skip_exercise', 'correct_first_attempt', 'access_from_mobile',
        'download_videos', 'piazza_answers', 'piazza_days', 'piazza_asks',
        'piazza_posts', 'piazza_views', 'total_lecture_time',
        'overal_lecture_views',
    ]
    # lecture_1_views ... lecture_26_views, then lecture_1_pace ...
    # lecture_26_pace, in that order.
    chart_categories += ['lecture_%d_views' % n for n in range(1, 27)]
    chart_categories += ['lecture_%d_pace' % n for n in range(1, 27)]
    chart_categories.append('overall_pace')

    # create a folder for the dataset
    directory = os.path.dirname(os.path.join(os.getcwd(), "Results", "Data Counts", data_set, ""))
    if not os.path.exists(directory):
        os.makedirs(directory)

    # convert to a pandas dataset
    pandas_data = pd.DataFrame(data=data, columns=headings)

    for chart_category in chart_categories:
        # get the slice
        index = np.argwhere(headings == chart_category)
        chart_column = data[:, index[0][0]]
        target = os.path.join(os.getcwd(), "Results", "Data Counts", data_set, chart_category)

        plt.figure()
        plt.xlabel(chart_category)
        plt.ylabel("Count")
        plt.title("%s Count" % chart_category)
        try:
            # Numeric columns become a histogram.  The builtin float is used
            # because the np.float alias no longer exists in NumPy.
            numeric_column = chart_column.astype(float)
            plt.hist(numeric_column, 10, histtype='bar', rwidth=0.8)
        except (ValueError, TypeError):
            # Not convertible (or not binnable, e.g. only NaN): count the
            # distinct values instead.
            sns_plot = sns.countplot(x=chart_category, data=pandas_data, palette="Greens_d")
            sns_plot.figure.savefig(target)
        else:
            plt.savefig(target)
        plt.close()
def segments_countplot(data_sf, x=None, y=None, hue=None, order=None, hue_order=None,
                       figsize_tuple= None, title=None, seaborn_style='whitegrid',
                       seaborn_palette='deep', color='b', **kwargs):
    '''Show a styled seaborn count plot of an SFrame.

    Parameters
    ----------
    data_sf: SFrame
        Data to plot; converted with ``to_dataframe()`` before plotting.
    x, y, hue: optional
        Forwarded to ``seaborn.countplot`` as the variables to plot.
    order, hue_order: optional
        Forwarded to ``seaborn.countplot`` as the order of the categorical
        levels.
    figsize_tuple: tuple, optional, default: None
        ``figsize`` of the new matplotlib figure.
    title: string
        Plot title, drawn in bold.
    seaborn_style: optional
        Passed to ``seaborn.set(style=...)``.
    seaborn_palette: optional
        Passed to ``seaborn.countplot`` as ``palette``.
    color: optional
        Passed to ``seaborn.countplot`` as ``color``.
    kwargs : key, value mappings
        Further keyword arguments forwarded to ``seaborn.countplot``.
    '''
    # plotting style first, then a fresh figure of the requested size
    sns.set(style=seaborn_style)
    plt.figure(figsize=figsize_tuple)

    # seaborn works on pandas objects, so convert the SFrame
    frame = data_sf.to_dataframe()

    # draw the segment counts
    sns.countplot(
        x=x,
        y=y,
        hue=hue,
        data=frame,
        order=order,
        hue_order=hue_order,
        orient='v',
        palette=seaborn_palette,
        color=color,
        **kwargs
    )

    # title and final cosmetic adjustments
    title_font = {'fontweight': 'bold'}
    plt.title(title, title_font)
    sns.despine(left=True, bottom=True)
    plt.show()
def main(argv):
    """Train on the given data and, when a test set is supplied, plot predictions.

    Parameters
    ----------
    argv : list of str
        ``[program, training_csv]`` or
        ``[program, training_csv, testing_csv, output_results]``.

    Raises
    ------
    SystemExit
        With status 2 when ``argv`` has neither 2 nor 4 entries.
    """
    if len(argv) not in (2, 4):
        print("Usage: {} <training_data.csv> [<testing_data.csv> <output_results>]".format(argv[0]))
        # Same effect as the site-provided ``exit(2)`` but always available.
        raise SystemExit(2)

    animals, outcomes = import_training(argv[1])
    forest = first_pass(animals, outcomes)
    naive(animals, outcomes)

    if len(argv) != 4:
        return

    test_data = import_testing(argv[2])
    result = output(forest, test_data, argv[3])
    # test_data['OutcomeType'] = pd.Series(result, index=test_data.index)
    result = pd.DataFrame({"result": result})
    test_data = test_data.join(result)
    test_data["SexuponOutcome"] = test_data.SexuponOutcome.apply(cats.num_to_value)
    print(test_data)

    # Pass the plotted variable by keyword: newer seaborn releases bind the
    # first positional argument to ``data``.
    sb.countplot(x=test_data.Hour,
                 hue=test_data.result,
                 hue_order=["Return_to_owner", "Euthanasia", "Adoption",
                            "Transfer", "Died"])
    plt.show()
iris = pd.read_csv('../Datasets/Iris.csv') # Get Column List from Iris print(iris.columns) # Pairplot of Iris sns.pairplot(iris) # Don't Forget to add this line when we want to plot any graph plt.show() # Get Unique Class print(iris['Species'].unique()) # Show Distribution of each class sns.countplot(iris['Species']) plt.show() # Explore Sepal Length of Each Class iris_versicolor = iris.loc[ lambda data: iris['Species'] == 'Iris-versicolor', :] iris_setosa = iris.loc[lambda data: iris['Species'] == 'Iris-setosa', :] iris_virginica = iris.loc[lambda data: iris['Species'] == 'Iris-virginica', :] sns.distplot(iris_versicolor['SepalLengthCm'], hist=False, color='blue') sns.distplot(iris_setosa['SepalLengthCm'], hist=False, color='green') sns.distplot(iris_virginica['SepalLengthCm'], hist=False, color='red') plt.show() sns.lmplot(x='SepalLengthCm', y='SepalWidthCm',
import seaborn as sns
import matplotlib.pyplot as plt

tip_data = sns.load_dataset('tips')

# Earlier experiments with other plot types, kept for reference:
#sns.scatterplot(x="tip", y="total_bill", data=tip_data, hue='size', size='size', sizes=(70, 280), style='size', palette='Set3', color=".5", marker="*")
#sns.lineplot(x="tip", y="total_bill", data=tip_data, hue="size", markers=True, dashes=False)
#sns.catplot(x='tip', y='total_bill', data=tip_data, row='size', kind='smoker')
#sns.barplot(x='tip', y='total_bill', data=tip_data, hue='size', ci=68)

# Count how often each tip amount occurs; bar outlines cycle through three
# colours of the "dark" palette.
bar_edge_colors = sns.color_palette("dark", 3)
sns.countplot(
    x='tip',
    data=tip_data,
    saturation=1.4,
    palette='Set3',
    linewidth=5,
    edgecolor=bar_edge_colors,
)
plt.show()
# -*- coding: utf-8 -*-
"""first_lesson_home_work_3.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/17z2eYAMXU5svbEpFP3vaT2WkXSleRlXl
"""

import pandas as pd
import seaborn as sns

pokemon = pd.read_csv("Pokemon.csv", index_col=0)

pokemon.head()

# Number of Pokemon per generation. The series is passed as ``x=`` because
# newer seaborn releases bind the first positional argument to ``data``.
sns.countplot(x=pokemon['Generation'])

# Distribution of hit points.
sns.distplot(pokemon['HP'])

# Attack versus defense, as a scatter plot and as a hexbin plot.
sns.jointplot(x='Attack', y='Defense', data=pokemon)

sns.jointplot(x='Attack', y='Defense', data=pokemon, kind='hex', gridsize=20)

# NOTE(review): two positional vectors are only accepted by older seaborn
# releases; left untouched because the keyword names differ between versions.
sns.kdeplot(pokemon['HP'], pokemon['Attack'])

# Attack by legendary status.
sns.boxplot(x='Legendary', y='Attack', data=pokemon)

sns.violinplot(x='Legendary', y='Attack', data=pokemon)
# print(books.loc[books.publisher.isnull(), :])
# Fill in a placeholder publisher for two specific ISBNs.
books.loc[(books.ISBN == '193169656X'), 'publisher'] = 'other'
books.loc[(books.ISBN == '1931696993'), 'publisher'] = 'other'

# User data set
# print(users.shape)
# print(users.head())
# print(users.dtypes)
# print(users.userID.values)
# Ages above 90 or below 5 are treated as missing and then replaced by the
# rounded mean age. ``np.nan`` is used because the ``np.NAN`` alias no longer
# exists in NumPy 2.0.
users.loc[(users.Age > 90) | (users.Age < 5), 'Age'] = np.nan
users.Age = users.Age.fillna(round(users.Age.mean()))
users.Age = users.Age.astype(np.int32)
# print(sorted(users.Age.unique()))

# print(ratings.shape)
n_users = users.shape[0]
n_books = books.shape[0]
# print(n_users*n_books)
# print(ratings.head())

# Keep only ratings whose book and user both exist in the cleaned tables.
ratings_new = ratings[ratings.ISBN.isin(books.ISBN)]
ratings_new = ratings_new[ratings_new.userID.isin(users.userID)]
# print("number of users: " + str(n_users))
# print("number of books: " + str(n_books))

# Fraction of the user x book matrix that has no rating.
sparsity = 1.0 - len(ratings_new) / float(n_users * n_books)
print('图书交叉数据集的稀疏级别是 ' + str(sparsity * 100) + ' %')

# A rating of 0 is separated from the non-zero ratings.
ratings_explicit = ratings_new[ratings_new.bookRating != 0]
ratings_implicit = ratings_new[ratings_new.bookRating == 0]
# print(ratings_new.shape)
#print(ratings_explicit.shape)
# print(ratings_implicit.shape)

sns.countplot(data=ratings_explicit, x='bookRating')
plt.show()
columns="Embarked", values="Ticket", aggfunc="count") ax = tbl.T.plot(kind='bar') # Survivor split Deck tbl = pd.pivot_table(train_df, index="Survived", columns="Deck", values="Ticket", aggfunc="count") ax = tbl.T.plot(kind='bar') # unknown deck for vast majority of people? # Survival countplots all in one fig, axs = plt.subplots(2, 3, figsize=(10, 8)) sns.countplot(data=train_df, y="Survived", hue="Pclass", ax=axs.flatten()[0]) axs.flatten()[0].legend(title="Pclass", loc=4) sns.countplot(data=train_df, y="Survived", hue="Sex", ax=axs.flatten()[1]) axs.flatten()[1].legend(title="Sex", loc=4) axs.flatten()[1].set_ylabel('') sns.countplot(data=train_df, y="Survived", hue="Deck", ax=axs.flatten()[2]) axs.flatten()[2].set_ylabel('') axs.flatten()[2].legend(title="Deck", loc=4) sns.countplot(data=train_df, y="Survived", hue="Parch", ax=axs.flatten()[3]) axs.flatten()[3].legend(title="Parch", loc=4) sns.countplot(data=train_df, y="Survived", hue="SibSp", ax=axs.flatten()[4]) axs.flatten()[4].set_ylabel('') axs.flatten()[4].legend(title="SibSp", loc=4) sns.countplot(data=train_df, y="Survived", hue="Embarked", ax=axs.flatten()[5]) axs.flatten()[5].set_ylabel('') axs.flatten()[5].legend(title="Embarked", loc=4)
data.append((category, os.path.join(data_dir, category, file))) df = pd.DataFrame(data, columns=['class', 'file_path']) len_df = len(df) print(f"There are {len_df} images") print(df['class'].value_counts()) # Figure 1 plt.figure() df['class'].value_counts().plot(kind='bar') plt.title('Class counts') # Figure 2 plt.figure() _ = sns.countplot(y=df['class']) plt.title('Class counts') data_dir = pathlib.Path(data_dir) image_count = len(list(data_dir.glob('*/*.jpg'))) CLASS_NAMES = np.array([item.name for item in data_dir.glob('*') if item.name != ".DS_Store"]) image_generator = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1. / 255) BATCH_SIZE = 32 IMG_HEIGHT = 224 IMG_WIDTH = 224 STEPS_PER_EPOCH = np.ceil(image_count / BATCH_SIZE) train_data_gen = image_generator.flow_from_directory(directory=str(data_dir),
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

train = pd.read_csv('titanic_train.csv')
print(train.head())

# Heat map of the null mask: one row per record, one column per field.
null_mask = train.isnull()
sns.heatmap(null_mask, yticklabels=False, cbar=False, cmap='viridis')
plt.show()
print(train.isnull())

# Survival counts: overall, split by sex, and split by passenger class.
survival_views = (
    (None, 'RdBu_r'),
    ('Sex', 'RdBu_r'),
    ('Pclass', 'rainbow'),
)
for hue_column, palette_name in survival_views:
    sns.set_style('whitegrid')
    sns.countplot(x='Survived', hue=hue_column, data=train,
                  palette=palette_name)
    plt.show()

# Age distribution, once through seaborn and once through pandas.
known_ages = train['Age'].dropna()
sns.distplot(known_ages, kde=False, color='darkred', bins=30)
plt.show()

train['Age'].hist(bins=30, color='darkred', alpha=0.7)
plt.show()
tweet_lengths = [
    len(tokens) for tokens in df_topic_modeling['topic_modeling_text']
]
vocab = sorted(set(all_words))

# Printing the answer -
print('{} words total, with a vocabulary size of {}'.format(
    len(all_words), len(vocab)))
print('Max tweet length is {}'.format(max(tweet_lengths)))

# Taking only a part of it -
tweet_lengths = [num for num in tweet_lengths if num < 25]

# Plotting the distribution of the word count of tweets -
fig1 = plt.figure(figsize=(15, 8))
# Passed as ``x=`` because newer seaborn releases bind the first positional
# argument to ``data``.
sns.countplot(x=tweet_lengths)
plt.title('Tweet Length Distribution', fontsize=18)
plt.xlabel('Words per Tweet', fontsize=14)
plt.ylabel('Number of Tweets', fontsize=14)
plt.savefig('tweets_distribution.png')
plt.show()

# Since we have the word count, we can break the tweets data into teams so that we only have to load a small part of data at a time.
# We can further use the cache function of streamlit to ensure that once the data for a particular team is loaded, we don't have to load it again.
teams = list(df_sentiment['team'].unique())

for selected_team in teams:
    temp = df_topic_modeling.loc[df_topic_modeling['team'] == selected_team]
    # The context manager closes the file handle once the dump is written.
    with open(f'{selected_team}.p', 'wb') as team_file:
        pickle.dump(temp, team_file)

del (temp, teams, selected_team)
# combine train and val and divide them again with 8:2 ratio
temp = np.concatenate([train, val], axis=0)
len(temp)

train, val = train_test_split(temp, test_size=0.2, random_state=0)
print('the number of images in training set:', len(train))
print('the number of images in validation set:', len(val))

# compare the number of cases and non-cases
# (label 0 is named "Pneumonia", every other label "Normal")
l = ["Pneumonia" if i[1] == 0 else "Normal" for i in train]
sns.set_style('darkgrid')
# Passed as ``x=`` because newer seaborn releases bind the first positional
# argument to ``data``.
sns.countplot(x=l)

# visualize images: first and last training sample with their label as title
plt.figure(figsize=(5, 5))
plt.imshow(train[0][0], cmap='gray')
plt.title(labels[train[0][1]])

plt.figure(figsize=(5, 5))
plt.imshow(train[-1][0], cmap='gray')
plt.title(labels[train[-1][1]])

# separate features and labels
x_train = []
y_train = []

x_val = []
'traffic_volume_corr.png'), format='png') plt.close() #%% plot histogram figure = plt.figure() rawData.hist() plt.tight_layout() plt.savefig(os.path.join(cfg.default.traffic_figures, 'traffic_volume_hist.png'), format='png') plt.close(figure) #%% count plots of categorical plt.figure() sns.countplot(y='weather_main', data=rawData) plt.tight_layout() plt.savefig(os.path.join(cfg.default.traffic_figures, 'weather_main_count.png'), format='png') plt.close(figure) plt.figure() sns.countplot(y='weather_description', data=rawData) plt.tight_layout() plt.savefig(os.path.join(cfg.default.traffic_figures, 'weather_description_count.png'), format='png') plt.close(figure) plt.figure()
# Drop 'label' column
X_train = train.drop(labels = ["label"],axis = 1)

# free some space
del train

# Class balance of the labels; passed as ``x=`` because newer seaborn
# releases bind the first positional argument to ``data``.
g = sns.countplot(x=Y_train)

Y_train.value_counts()

# Check the data
X_train.isnull().any().describe()

test.isnull().any().describe()

# Normalize the data
X_train = X_train / 255.0
test = test / 255.0

# Reshape image in 3 dimensions (height = 28px, width = 28px , canal = 1)
def general():
    """Flask view that scrapes Amazon reviews for a query and renders charts.

    GET renders the empty search page. POST reads ``search_query`` from the
    submitted form, scrapes the amazon.in search results, product pages and
    the first two review pages per product, writes several PNG charts into
    ``./static`` and renders ``general_search.html`` with the first rows of
    the review table plus the 30 most negative / most positive words
    according to a TF-IDF + logistic-regression model.

    Returns
    -------
    The rendered template, or ``None`` for any other HTTP method.

    Notes
    -----
    The three fetch helpers return the string ``"Error"`` on a non-200
    status; callers read ``.content`` from the result, so a failed request
    still surfaces as an ``AttributeError``.
    """
    if flask.request.method == 'GET':
        return (flask.render_template('general_search.html'))

    if flask.request.method != 'POST':
        return None

    # A missing form field becomes an empty query instead of crashing on
    # ``None.replace``.
    search_query = request.form.get("search_query", '')
    search_query = search_query.replace(' ', '+')

    header = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
        'referer':
        'https://www.amazon.in/s?k=nike+shoes+men&crid=28WRS5SFLWWZ6&sprefix=nike%2Caps%2C357&ref=nb_sb_ss_organic-diversity_2_4'
    }
    cookie = {}  # insert request cookies within{}

    # Fetch the search result page that lists all products for the query.
    def getAmazonSearch(search_query):
        url = "https://www.amazon.in/s?k=" + search_query
        #print(url)
        page = requests.get(url, headers=header)
        if page.status_code == 200:
            return page
        else:
            return "Error"

    # Fetch an individual product page by its 'data-asin' number.
    def Searchasin(asin):
        url = "https://www.amazon.in/dp/" + asin
        #print(url)
        page = requests.get(url, cookies=cookie, headers=header)
        if page.status_code == 200:
            return page
        else:
            return "Error"

    # Fetch a review page from the relative 'see all reviews' link.
    def Searchreviews(review_link):
        url = "https://www.amazon.in" + review_link
        #print(url)
        page = requests.get(url, cookies=cookie, headers=header)
        if page.status_code == 200:
            return page
        else:
            return "Error"

    # Save the current figure and close it, so figures do not accumulate in
    # the server process across requests.
    def save_figure(path):
        plt.savefig(path)
        plt.close()

    # The search page is fetched once and reused for ASINs and product names,
    # so both lists come from the same response.
    search_soup = BeautifulSoup(getAmazonSearch(search_query).content)

    #EXTRACT ASIN
    data_asin = []
    for i in search_soup.findAll(
            "div", {
                'class': [
                    "sg-col-4-of-12 s-result-item s-asin sg-col-4-of-16 sg-col sg-col-4-of-20",
                    "s-result-item s-asin sg-col-0-of-12 sg-col-16-of-20 sg-col sg-col-12-of-16"
                ]
            }):
        data_asin.append(i['data-asin'])

    #extract price & avg rating (0 is recorded when a value is not found)
    price = []
    average_rating = []
    for asin in data_asin:
        soup = BeautifulSoup(Searchasin(asin).content)
        h1 = 0
        for l in soup.findAll("span", {'data-hook': "rating-out-of-text"}):
            h1 = l.text
            h1 = h1[:h1.index(' ')]  # text before the first space
            h1 = float(h1)
        #average_rating.append(l.text)
        average_rating.append(h1)
        h2 = 0
        for l in soup.findAll(
                "span",
            {'id': ["priceblock_ourprice", "priceblock_dealprice"]}):
            h2 = l.text
            h2 = h2.replace(',', '')
            h2 = h2[1:h2.index('.')]  # drop first character and the decimals
            h2 = float(h2)
        #price.append("₹"+l.text[2:])
        price.append(h2)

    #EXTRACT PRODUCT NAME
    product_name = []
    for i in search_soup.findAll(
            "span", {
                'class': [
                    "a-size-medium a-color-base a-text-normal",
                    "a-size-base-plus a-color-base a-text-normal"
                ]
            }):
        product_name.append(i.text)

    # Keep only products for which every field was found. ``zip`` stops at
    # the shortest list, so a shorter name list cannot raise IndexError.
    data_asin1 = []
    product_name1 = []
    price1 = []
    average_rating1 = []
    for asin, name, cost, avg in zip(data_asin, product_name, price,
                                     average_rating):
        if asin == '0' or name == '0' or cost == 0 or avg == 0:
            continue
        data_asin1.append(asin)
        product_name1.append(name)
        price1.append(cost)
        average_rating1.append(avg)

    #EXTRACT SEE ALL REVIEW LINK
    link = []
    data_asin2 = []
    product_name2 = []
    price2 = []
    average_rating2 = []
    for i in range(len(data_asin1)):
        soup = BeautifulSoup(Searchasin(data_asin1[i]).content)
        for l in soup.findAll("a",
                              {'data-hook': "see-all-reviews-link-foot"}):
            #choose only those products whose see all reviews option is available
            if (l['href']):
                link.append(l['href'])
                data_asin2.append(data_asin1[i])
                product_name2.append(product_name1[i])
                price2.append(price1[i])
                average_rating2.append(average_rating1[i])

    # De-duplicate by ASIN, keeping the first occurrence of every field so
    # that link1[j] and the *3 lists always describe the same product.
    link1 = []
    data_asin3 = []
    product_name3 = []
    price3 = []
    average_rating3 = []
    seen_asins = set()
    for href, asin, name, cost, avg in zip(link, data_asin2, product_name2,
                                           price2, average_rating2):
        if asin in seen_asins:
            continue
        seen_asins.add(asin)
        link1.append(href)
        data_asin3.append(asin)
        product_name3.append(name)
        price3.append(cost)
        average_rating3.append(avg)

    reviews = []
    brand_name1 = []
    product_name4 = []
    rating = []
    price4 = []
    average_rating4 = []
    for j in range(len(link1)):
        # Review pages 1 and 2 of every product.
        for k in range(1, 3):
            response = Searchreviews(link1[j] + '&pageNumber=' + str(k))
            soup = BeautifulSoup(response.content)
            for i in soup.findAll("span", {'data-hook': "review-body"}):
                reviews.append(i.text)
                price4.append(price3[j])
                product_name4.append(product_name3[j])
                average_rating4.append(average_rating3[j])
                # The brand is the link text between the leading character
                # and the first '-'.
                pos = link1[j].index('-')
                brand_name1.append(link1[j][1:pos])
            for i in soup.findAll("i", {
                    'data-hook':
                ["review-star-rating", "cmps-review-star-rating"]
            }):
                if (i.text):
                    rating.append(i.text)

    # The first character of the star text is the integer rating.
    rating1 = []
    average_rating5 = []
    for i, stars in enumerate(rating):
        rating1.append(int(stars[0]))
        average_rating5.append(average_rating4[i])

    rev = {
        'Brand': brand_name1,
        'Product': product_name4,
        'Price': price4,
        'Average Rating': average_rating5,
        'Reviews': reviews,
        'Review Rating': rating1
    }
    review_data = pd.DataFrame.from_dict(rev)
    pd.set_option('max_colwidth', 800)
    review_data['Brand'] = review_data['Brand'].str.upper()
    review_data['Product'] = review_data['Product'].str.upper()

    # Ratings above 2 count as positive.
    def sentiment(n):
        if n > 2:
            return 'Positive'
        else:
            return 'Negative'

    review_data['Sentiment'] = review_data['Review Rating'].apply(sentiment)
    reviews = review_data.head()

    # With more than two distinct brands the charts group by brand, otherwise
    # by product. Labels reproduce the strings of each original branch.
    if (review_data.Brand.nunique() > 2):
        group_col = "Brand"
        dist_ylabel = 'Brands'
        avg_ylabel = 'Brand Name'
        avg_title = 'Average Ratings of Brands'
        sentiment_ylabel = 'Brands'
    else:
        group_col = "Product"
        dist_ylabel = 'Product Name'
        avg_ylabel = 'Product Name'
        avg_title = 'Average Ratings of Products'
        sentiment_ylabel = 'Product'
    top_groups = review_data[group_col].value_counts().iloc[:10].index

    #Most common rating in reviews.
    plt.figure(figsize=(11, 6))
    # Passed as ``x=`` because newer seaborn releases bind the first
    # positional argument to ``data``.
    sns.countplot(x=review_data['Review Rating'])
    #review_data['rating'].value_counts().sort_index().plot(kind='bar')
    plt.title('Distribution of Rating')
    plt.xlabel('Rating')
    plt.ylabel('Number of Reviews')
    #plt.savefig(/usr/src/app/\\templates\\rating_distribution.png')
    save_figure('./static/rating_distribution.png')

    #Most reviewed brand / product
    plt.figure(figsize=(11, 6))
    sns.countplot(y=group_col,
                  data=review_data,
                  order=top_groups,
                  palette="Wistia_r")
    plt.title('Distribution of Brands')
    plt.ylabel(dist_ylabel)
    plt.xlabel('Number of Reviews')
    save_figure('./static/brand_distribution.png')

    #Price Distribution
    plt.figure(figsize=(11, 6))
    plt.title('Price Distribution')
    x = review_data['Price']
    sns.distplot(x, bins='auto', kde=False, color='g')
    plt.ylabel('No of Products')
    plt.xlabel('Price (₹)')
    save_figure('./static/price_distribution.png')

    #Highest avg_rating
    plt.figure(figsize=(11, 6))
    x = review_data.nlargest(len(review_data), ['Average Rating'])
    plt.barh(x[group_col], x['Average Rating'], color='navajowhite')
    sns.barplot(y=group_col,
                x="Average Rating",
                data=review_data,
                palette="cool_r")
    #plt.xticks(rotation=90)
    plt.ylabel(avg_ylabel)
    plt.xlabel('Rating')
    plt.title(avg_title)
    save_figure('./static/avgrating_brands.png')

    #Sentiment
    plt.figure(figsize=(11, 6))
    sns.countplot(y=group_col,
                  data=review_data,
                  hue="Sentiment",
                  order=top_groups,
                  palette="Reds_r")
    plt.title('Sentiments of Brands')
    plt.ylabel(sentiment_ylabel)
    plt.xlabel('Number of Reviews')
    save_figure('./static/sentiment.png')

    # Word cloud over all review texts with punctuation stripped from the
    # ends of each space-separated token.
    review_data['Review'] = review_data['Reviews'].str.lower()
    all_reviews = review_data['Reviews'].str.split(' ')
    all_reviews_cleaned = []
    for text in all_reviews:
        text = [x.strip(string.punctuation) for x in text]
        all_reviews_cleaned.append(text)
    text_review = [" ".join(text) for text in all_reviews_cleaned]
    final_text_review = " ".join(text_review)
    wordcloud_spam = WordCloud(
        background_color="white").generate(final_text_review)
    plt.figure(figsize=(11, 6))
    plt.imshow(wordcloud_spam, interpolation='bilinear')
    plt.axis("off")
    plt.title('Most common words appearing in the reviews')
    save_figure('./static/word_cloud.png')

    def cleanText(raw_text,
                  remove_stopwords=True,
                  stemming=False,
                  split_text=False):
        """Lower-case letters-only text, optionally without stopwords/stemmed."""
        #text = BeautifulSoup(raw_text, 'lxml').get_text()  #remove html
        letters_only = re.sub("[^a-zA-Z]", " ", raw_text)  # remove non-character
        words = letters_only.lower().split()  # convert to lower case
        if remove_stopwords:  # remove stopword
            stops = set(stopwords.words("english"))
            words = [w for w in words if not w in stops]
        if stemming == True:  # stemming
            stemmer = PorterStemmer()
            #stemmer = SnowballStemmer('english')
            words = [stemmer.stem(w) for w in words]
        if split_text == True:  # split text
            return (words)
        return (" ".join(words))

    X_train, X_test, y_train, y_test = train_test_split(
        review_data['Reviews'],
        review_data['Sentiment'],
        test_size=0.2,
        random_state=0)

    # Preprocess text data in training set and validation set
    # NOTE(review): the cleaned texts are built but the vectorizer below is
    # fitted on the raw texts, as in the original code - confirm the intent.
    X_train_cleaned = []
    X_test_cleaned = []
    for d in X_train:
        X_train_cleaned.append(cleanText(d))
    for d in X_test:
        X_test_cleaned.append(cleanText(d))

    tfid = TfidfVectorizer()
    tf_xtr = tfid.fit_transform(X_train)
    tf_xte = tfid.transform(X_test)
    model_tf = LogisticRegression()
    model_tf.fit(tf_xtr, y_train)

    # ``get_feature_names`` was removed from newer scikit-learn releases;
    # prefer ``get_feature_names_out`` when the installed version has it.
    get_names = getattr(tfid, 'get_feature_names_out',
                        None) or tfid.get_feature_names
    feature_names = np.array(get_names())
    sorted_coef_index = model_tf.coef_[0].argsort()
    # Lowest 30 coefficients vs. highest 30 coefficients.
    negative = feature_names[sorted_coef_index[:30]].tolist()
    positive = feature_names[sorted_coef_index[:-31:-1]].tolist()

    #text_review = [" ".join(text) for text in negative]
    final_text_review = " ".join(positive)
    wordcloud_spam = WordCloud(
        max_font_size=50,
        background_color="white").generate(final_text_review)
    plt.figure(figsize=(11, 6))
    plt.imshow(wordcloud_spam, interpolation='bilinear')
    plt.axis("off")
    plt.title('Most common words appearing in positive reviews')
    save_figure('./static/positive.png')

    final_text_review = " ".join(negative)
    wordcloud_spam = WordCloud(
        max_font_size=50,
        background_color="white").generate(final_text_review)
    plt.figure(figsize=(11, 6))
    plt.imshow(wordcloud_spam, interpolation='bilinear')
    plt.axis("off")
    plt.title('Most common words appearing in negative reviews')
    save_figure('./static/negative.png')

    #reviews = data.to_dict()
    reviews = reviews.values.tolist()
    return flask.render_template('general_search.html',
                                 search_query=search_query,
                                 reviews=reviews,
                                 length=len(reviews),
                                 negative=negative,
                                 positive=positive)
wbcd.head()
wbcd.shape
wbcd.describe()

# The identifier column is dropped before any analysis.
wbcd = wbcd.drop('id', axis=1)
wbcd.head()
wbcd['diagnosis'].value_counts()

# Null mask overview.
sns.heatmap(wbcd.isnull(), cmap='viridis', cbar=False, yticklabels=False)
# Class balance; passed as ``x=`` because newer seaborn releases bind the
# first positional argument to ``data``.
sns.countplot(x=wbcd['diagnosis'])

from imblearn.over_sampling import SMOTE

# Columns 1..29 are used as features, column 0 as the target.
x = wbcd.iloc[:, 1:30]
x.head()
y = wbcd.iloc[:, 0]
y.head()
x.columns

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
def main():
    """Load the COVID image data, balance it by undersampling and train VGG16.

    The train and test sets are read with ``get_data``, flattened so that
    ``RandomUnderSampler`` can balance the classes, reshaped back to
    ``(n, IMG_SIZE, IMG_SIZE, 3)`` and passed to ``pretrainedNetwork``
    together with balanced class weights and an Adam optimizer.

    Returns
    -------
    int
        Always ``1``.
    """
    map_characters1 = {0: 'No COVID', 1: 'Yes COVID'}
    dict_characters = map_characters1
    df = pd.DataFrame()
    print(dict_characters)

    weight_path1 = './drive/My Drive/Colab Notebooks/weights/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5'
    train_dir = "./drive/My Drive/Colab Notebooks/covid_dataset/train/"
    test_dir = "./drive/My Drive/Colab Notebooks/covid_dataset/test/"

    # NOTE(review): the extent of the device scope is assumed to cover the
    # whole pipeline - confirm against the original notebook.
    with tf.device('/device:GPU:0'):
        X_train, y_train = get_data(train_dir)
        X_test, y_test = get_data(test_dir)

        plotHistogram(X_train[1])
        plt.show()

        pretrained_model_1 = VGG16(weights=weight_path1,
                                   include_top=False,
                                   input_shape=(299, 299, 3))
        #pretrained_model_2 = InceptionV3(weights = weight_path2, include_top=False, input_shape=(299, 299, 3))
        # ``learning_rate`` is the current keyword; ``lr`` is a deprecated
        # alias in Keras.
        optimizer2 = keras.optimizers.Adam(learning_rate=0.01,
                                           epsilon=0.0001)

        # Deal with imbalanced class sizes below
        # Make Data 1D for compatibility with the resampling methods
        X_trainShape = X_train.shape[1] * X_train.shape[2] * X_train.shape[3]
        X_testShape = X_test.shape[1] * X_test.shape[2] * X_test.shape[3]
        X_trainFlat = X_train.reshape(X_train.shape[0], X_trainShape)
        X_testFlat = X_test.reshape(X_test.shape[0], X_testShape)
        Y_train = y_train
        Y_test = y_test

        # ``fit_resample`` is the current name of the resampling method;
        # ``fit_sample`` has been removed from imbalanced-learn.
        ros = RandomUnderSampler(sampling_strategy='auto')
        X_trainRos, Y_trainRos = ros.fit_resample(X_trainFlat, Y_train)
        X_testRos, Y_testRos = ros.fit_resample(X_testFlat, Y_test)

        # Encode labels to hot vectors (ex : 1 -> [0, 1])
        Y_trainRosHot = to_categorical(Y_trainRos, num_classes=2)
        Y_testRosHot = to_categorical(Y_testRos, num_classes=2)

        # Label distribution before undersampling.
        df["labels"] = y_train
        lab = df['labels']
        sns.countplot(x=lab)

        # Make Data 2D again. A single reshape per array is enough; the
        # result does not depend on a per-sample loop.
        height, width, channels = IMG_SIZE, IMG_SIZE, 3
        X_trainRosReshaped = X_trainRos.reshape(len(X_trainRos), height,
                                                width, channels)
        X_testRosReshaped = X_testRos.reshape(len(X_testRos), height, width,
                                              channels)

        # Plot Label Distribution after undersampling
        dfRos = pd.DataFrame()
        dfRos["labels"] = Y_trainRos
        labRos = dfRos['labels']
        sns.countplot(x=labRos)

        # Keyword arguments are required by newer scikit-learn releases.
        class_weight2 = class_weight.compute_class_weight(
            'balanced', classes=np.unique(Y_trainRos), y=Y_trainRos)
        print("New Class Weights: ", class_weight2)

        pretrainedNetwork(X_trainRosReshaped, Y_trainRosHot,
                          X_testRosReshaped, Y_testRosHot, pretrained_model_1,
                          weight_path1, class_weight2, 2, 100, optimizer2,
                          map_characters1)
    return 1
# Axis labels for the agility panel (its points are drawn earlier).
ax2.set_xlabel("Sprint Speed")
ax2.set_ylabel("Agility")

# Remaining scatter panels: sprint speed against balance and dribbling.
scatter_panels = (
    (ax3, y_balance, "Balance"),
    (ax4, y_dribbling, "Dribbling"),
)
for panel, y_values, y_label in scatter_panels:
    panel.scatter(x_SprintSpeed, y_values, s=23)
    panel.set_xlabel("Sprint Speed")
    panel.set_ylabel(y_label)

plt.subplots_adjust(top=0.5, right=0.8)
plt.show()

# Category counts, one figure per column.
for count_column in ('Preferred Foot', 'Weak Foot'):
    p = sns.countplot(x=count_column, data=df)
    plt.show()

# Position names are long, so the tick labels are rotated.
p = sns.countplot(x='Position', data=df)
_ = plt.setp(p.get_xticklabels(), rotation=90)
plt.show()

# Finishing score of the first ten rows.
top_10 = df.head(10)
p = sns.barplot(x='Name', y='Finishing', data=top_10)
_ = plt.setp(p.get_xticklabels(), rotation=90)
plt.show()

plt.figure(1, figsize=(15, 7))
# Config os.chdir("/home/jovyan/work") %config InlineBackend.figure_format = 'retina' %matplotlib inline plt.rcParams["figure.figsize"] = (12, 3) # Preparation data = pd.read_csv("./data/times_magazine.csv") print(tabulate(data.head(), headers="keys", tablefmt="psql")) # Modeling N = len(data.Female) lam_ = data.Female.mean() with pm.Model() as model: lam_1 = pm.Exponential("lam_1", lam_) lam_2 = pm.Exponential("lam_2", lam_) tau = pm.DiscreteUniform("tau", lower=1923, upper=1923+N) idx = np.arange(1923, 1923+N) lam = pm.math.switch(tau > idx, lam_1, lam_2) female = pm.Poisson("female", lam, observed=data.Female) step = pm.Metropolis() trace = pm.sample(20000, tune=5000, step=step) # Plot fig, ax = plt.subplots(nrows=1, ncols=2) sns.distplot(trace["lam_1"], label="λ1", ax=ax[0]) sns.distplot(trace["lam_2"], label="λ2", ax=ax[0]) sns.countplot(trace["tau"], ax=ax[1]) plt.xticks(rotation=90) plt.tight_layout() plt.savefig("./results/3-14-times-magazine.png")
"Date", "Death (1 = Yes)", "Injury (1 = Yes)", "Time", "State", "WaterConditions", "Wind", "Visibility", "DayofWeek", "AccidentCause", "AccidentEvent", "OperatorGender", "VesselType", "Operation", "Activity", "DeceasedGender", "CauseofDeath", "DeceasedPFDWorn", "DeceasedRole", "InjuredGender", "InjuryType", "InjuredRole", "TotalDamage" ]] # creating as smaller dataframe to work catagorical features # The intent here is to uncover any intersting insights with repect to day of month a person was contacted, the amount of times a person was contacted, the duration of the contact, if the person previously participated in campaings, the balance of the clients account, and how these # interact with the target feature. dfcat.head() # inspect dfplot dfcat.shape # inspect dfplot dfcat.columns #Accident Event (Decending) sns.set(style='white') sns.countplot(x="AccidentEvent", data=dfcat, order=dfcat["AccidentEvent"].value_counts().index).set( xlabel="AccidentEvent", ylabel="Count") plt.xticks(rotation=90, size=7) #Accident Cause (Decending) sns.set(style='white') sns.countplot(x="AccidentCause", data=dfcat, order=dfcat["AccidentCause"].value_counts().index).set( xlabel="Accident Cause", ylabel="Count") plt.xticks(rotation=90, size=7) #Day of week sns.set(style='white') order = [ "Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday",
#Data manipulations to clean the dataset. Let's start with the column TotalCharges telecom_df["TotalCharges"] = telecom_df["TotalCharges"].replace(" ", np.nan) telecom_df["TotalCharges"] = telecom_df["TotalCharges"].astype(float) telecom_df = telecom_df[telecom_df["TotalCharges"].notnull()] telecom_df = telecom_df.reset_index()[telecom_df.columns] # Next we will clean the data for column 'MultipleLines' telecom_df["MultipleLines"] = telecom_df["MultipleLines"].replace( {"No phone service": "No"}) telecom_df["MultipleLines"].unique() cols_list = [ "OnlineSecurity", "OnlineBackup", "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies" ] for i in cols_list: sns.countplot(data=telecom_df, x=i) plt.show() #As we see that, the columns from cols_list could be cleaned to have only 2 values each which are 'Yes' & 'No' for i in cols_list: telecom_df[i] = telecom_df[i].replace({"No internet service": "No"}) #next we will clean tenure column - Currently tenure is integer column we can create seperate bins for the tenure and make a categorical column print(telecom_df["tenure"].unique()) plt.hist(telecom_df['tenure']) plt.show() # Function to create categorical column for tenure def tenure_cat(telecom_df): if telecom_df["tenure"] <= 12: return "tenure-0-12" elif (telecom_df["tenure"] > 12) & (telecom_df["tenure"] <= 24):
import pandas as pd
import numpy as np

#IMPORTING DATASET
data = pd.read_csv(
    '/content/gdrive/My Drive/Colab Notebooks/CreditCard_Fraud_Detection/creditcard.csv',
    index_col=False)
df = data

#ANALYSIS OF BIAS IN DATASET
import matplotlib.pyplot as plt
import seaborn as sns

print('Distribution of the Classes in the subsample dataset')
print(df['Class'].value_counts() / len(df))

# The column is named with ``x=``: a positional column name collides with
# ``data=`` on newer seaborn releases.
sns.countplot(x='Class', data=df)
plt.title('Unequally Distributed Classes', fontsize=14)
plt.show()

#SINCE THE DATASET IS HIGHLY UNBALANCED - APPLY RESAMPLING AND SHUFFLING TECHNIQUES AND THEN ANALYSE THE BIAS IN THE RESAMPLED AND RESHUFFLED DATA
#STEP 1 - SCALING THE TIME AND AMOUNT ATTRIBUTES USING ROBUSTSCALER
from sklearn import preprocessing

rob_scalar = preprocessing.RobustScaler()
df['Amt_Scaled'] = rob_scalar.fit_transform(df['Amount'].values.reshape(-1, 1))
df['Time_Scaled'] = rob_scalar.fit_transform(df['Time'].values.reshape(-1, 1))
# The unscaled columns are replaced by their scaled versions.
df.drop(['Time', 'Amount'], axis=1, inplace=True)

#STEP 2 - SMALL TWEAK IN DATASET - JUST PIVOTING THE POSITIONS OF TIME AND AMOUNT
amt_scl = df['Amt_Scaled']
time_scl = df['Time_Scaled']
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn import metrics
from scipy.stats import mode
from scipy import stats

insurance=pd.read_csv('C:/Users/Eswar Chowdary/Desktop/Projects/Health insurance-1/Insurance Dataset .csv')
insurance['result'].value_counts() #checking whether the data is imbalanced or not.
insurance.isnull().sum()#checking for the null values

# One count plot per column. Each plot gets its own figure; without that,
# consecutive calls draw on top of each other on the same axes.
count_columns = [
    'areaservice', 'certificatenum', 'hospitalcounty', 'hospitalid',
    'hospitalname', 'age', 'zipcode', 'gender', 'culturalgroup', 'ethnicity',
    'daysspendhospital', 'admissiontype', 'homeorselfcare', 'yeardischarge',
    'ccsdiagnosiscode', 'ccsdiagnosisdescription',
]
for column in count_columns:
    plt.figure()
    sb.countplot(x=column,data=insurance,palette='hls')
print(relation)
sns.pairplot(X_train)

# Code ends here

# --------------
import seaborn as sns
import matplotlib.pyplot as plt

# Code starts here
cols = ['children', 'sex', 'region', 'smoker']
fig, axes = plt.subplots(2, 2)

# Fill the 2 x 2 grid row by row: one count plot per categorical column,
# split by the target.
for position, col in enumerate(cols):
    grid_row, grid_col = divmod(position, 2)
    sns.countplot(x=X_train[col], hue=y_train, ax=axes[grid_row, grid_col])

# Code ends here

# --------------
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# parameters for grid search
parameters = {'C': [0.1, 0.5, 1, 5]}

# Code starts here
# Search the regularisation strength of a logistic regression.
lr = LogisticRegression(random_state=9)
grid = GridSearchCV(estimator=lr, param_grid=parameters)
grid.fit(X_train, y_train)
# CABIN
print('Percent of missing "Cabin" records is %.2f%%' %
      ((train_df['Cabin'].isnull().sum() / train_df.shape[0]) * 100))
# since most of the Cabin data is missing, this column is omitted from the model

# EMBARKED
print('Per cent of missing "Embarked" records is %.2f%%' %
      ((train_df['Embarked'].isnull().sum() / train_df.shape[0]) * 100))
# only a few values are missing, so the most frequent port is imputed
print(
    'Boarded passengers grouped by port of embarkation (C = Cherbourg, Q = Queenstown, S = Southampton):'
)
print(train_df['Embarked'].value_counts())
# palette='Set2' selects the colour palette
sns.countplot(x='Embarked', data=train_df, palette='Set2')
# plt.show()
# the most frequent value of Embarked is imputed into the missing ones below

# DATA wrangling
train_data = train_df.copy()
# Missing ages get the median age, missing ports the most frequent port.
# The result is assigned back to the column: an in-place ``fillna`` on a
# selected column does not update the frame under pandas copy-on-write.
train_data['Age'] = train_data['Age'].fillna(
    train_df['Age'].median(skipna=True))
train_data['Embarked'] = train_data['Embarked'].fillna(
    train_df['Embarked'].value_counts().idxmax())
train_data.drop('Cabin', axis='columns', inplace=True)

# check if all the NULL values are gone
print(train_data.isnull().sum())

# Merging two variables which are possibly multicollinear into one variable ("TravellingAlone")
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from imblearn.metrics import geometric_mean_score, specificity_score, sensitivity_score

# Glass Type Identification KEEL Dataset (~65%)
gls_df = pd.read_csv(
    'Glass2.csv',
    names=["RI", "Na", "Mg", "Al", "Si", "K", "Ca", "Ba", "Fe", "Class"],
    index_col=False)

# Check for dataset imbalance and plot the class distribution.
# NOTE(review): the [0] / [1] lookups assume the Class column holds the
# integer labels 0 and 1 - confirm against the CSV.
gls_total = gls_df["Class"]
class_counts = gls_total.value_counts()
gls_neg = class_counts[0] / len(gls_total)
gls_pos = class_counts[1] / len(gls_total)
# `x` is passed by keyword: recent seaborn releases take `data` as the only
# positional parameter, so a positional 'Class' would collide with `data=`.
sns.countplot(x='Class', data=gls_df)
plt.title('Glass Dataset Class Distribution')
plt.savefig('Glass_Imbalance_Dist.png', dpi=300)
# gls_neg / gls_pos are fractions in [0, 1]; scale them so the value printed
# next to the "%" sign really is a percentage.
print("Class '0': ", round(gls_neg * 100, 2), "%")
print("Class '1': ", round(gls_pos * 100, 2), "%")
print("Imbalance Ratio: ", round(gls_neg / gls_pos, 2))

# Split dataset into 70/30 train-test ratio
x = ["RI", "Na", "Mg", "Al", "Si", "K", "Ca", "Ba", "Fe"]
y = ["Class"]
x_gls = gls_df[x]
y_gls = gls_df[y]
x_train, x_test, y_train, y_test = train_test_split(x_gls, y_gls, test_size=0.3)
# Each table is loaded from its raw GitHub URL; the URL variable is then
# rebound to the resulting DataFrame.
maillists = pd.read_csv(maillists)
migrations = 'https://raw.githubusercontent.com/Nedu/RecommenderSystemTeamJava/master/data/migrations.csv'
migrations = pd.read_csv(migrations)
thoughts = 'https://raw.githubusercontent.com/Nedu/RecommenderSystemTeamJava/master/data/thoughts.csv'
thoughts = pd.read_csv(thoughts)
user_settings = 'https://raw.githubusercontent.com/Nedu/RecommenderSystemTeamJava/master/data/user_settings.csv'
user_settings = pd.read_csv(user_settings)

thoughts.head()  # check the first 5 rows
thoughts.nunique()  # number of unique values in each column
sns.countplot(x='user_id', data=thoughts)  # how many times each user_id appears

# Correlation between the numeric columns only: the frame also carries
# string columns (created_at / updated_at), on which DataFrame.corr() raises
# in pandas >= 2.0.
corr = thoughts.select_dtypes(include=['number', 'bool']).corr()
# Plot the correlation matrix itself; the original plotted corr.corr(), i.e.
# the correlation of the correlation matrix.
sns.heatmap(corr, cmap='coolwarm', annot=True)  # positive vs. negative correlation

plt.figure(figsize=(15, 10))
# Count of rows per created_at value, split into categories by user_id via hue.
sns.countplot(y='created_at', hue='user_id', data=thoughts)
plt.figure(figsize=(15, 10))
sns.countplot(y='updated_at', hue='user_id', data=thoughts)

contact_settings.head()
extfeeds.head()
extfeeds.shape
# In[5]:

test_data.describe()

# In[6]:

# Start by predicting from gender alone.
# In both datasets the Age column has missing values; we can either drop
# those rows or fill the gaps with the mean value.
import seaborn as sns

sns.countplot(data=train_data, x='Sex', hue='Survived')

# In[7]:

# Fill missing ages with the mean age. The result is assigned back to the
# column: fillna(..., inplace=True) on a column selection is a chained
# in-place update, which pandas 2.x warns about and which leaves train_data
# untouched when Copy-on-Write is enabled.
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].mean())

# In[8]:

train_data.describe(include='all')

# In[9]:
saveplot_highlow_balstatus(data, 'loan')
# => among people with a loan, the low-balance share is higher than the high-balance share
saveplot_highlow_balstatus(data, 'marital')
# => among divorced people, the low-balance share is higher than the high-balance share
saveplot_highlow_balstatus(data, 'job')
# => retirees, management, students and the self-employed show the largest
#    gap between the high-balance and low-balance shares
saveplot_highlow_balstatus(data, 'default')
# => among people in default, the low-balance share is higher than the high-balance share
saveplot_highlow_balstatus(data, 'housing')
# => among people with housing, the low-balance share is higher than the high-balance share

# create new column "duration_level"
# The mean is computed once; the original re-evaluated it inside the lambda
# for every row.
mean_duration = data['duration'].mean()
data['duration_level'] = data['duration'].apply(
    lambda x: 'above' if x >= mean_duration else 'below')

# Pin the category order explicitly so that bar position k on the y axis is
# known to be deposit_levels[k].
deposit_levels = list(data['deposit'].dropna().unique())
deposit_totals = data['deposit'].value_counts()
ax = sns.countplot(y='deposit', hue='duration_level', data=data,
                   order=deposit_levels)

# Show percentages on the count plot. Each bar is matched to its deposit
# category through its vertical position rather than its index in
# ax.patches: seaborn adds the bars one hue level at a time, so the first
# two patches are NOT the two bars of the first category.
for p in ax.patches:
    width = p.get_width()
    if not width > 0:
        # Skip empty / NaN-sized patches; they carry no count to annotate.
        continue
    y_center = p.get_y() + p.get_height() / 2
    # Dodged bars sit within +/-0.4 of their category's integer position.
    level = deposit_levels[int(round(y_center))]
    total = deposit_totals[level]
    percentage = '{:.1f}%'.format(100 * width / total)
    ax.annotate(percentage, (p.get_x() + width + 0.02, y_center))
plt.title('The distribubtion of deposit by duration status')
plt.savefig('deposit_count_bydurationstatus.png', dpi=300)
# => among those who opened a term deposit, 56.2% had an above-average call
#    duration (figure noted from an earlier run - re-check after regenerating
#    the plot);
# In[7]:

# If the season column is missing then the entry is most likely a movie, so
# add another column to the dataframe that records the show type.
# my_history[my_history['season'].isna()]
# Vectorised over the whole column instead of a row-wise DataFrame.apply;
# this also works when df has no rows.
df['show_type'] = df['season'].isna().map({True: 'Movie', False: 'TV Show'})
df

# In[ ]:

# In[8]:

plt.figure(figsize=(12, 8))
sns.countplot(x='show_type', data=df)

# ### Moro watches significantly more TV shows than movies.

# In[9]:

plt.figure(figsize=(18, 10))
# Restrict the plot to the 10 most frequent show names, most frequent first.
sns.countplot(y='show_name',
              data=df,
              order=df['show_name'].value_counts().iloc[:10].index)

# ### I thought 'Friends' would be taking the lead, but look at that... Moro must really love Fresh Princes ;)

# In[10]:

plt.figure(figsize=(12, 8))
ax=axes[0]).set_title('Employee Satisfaction Distribution') axes[0].set_ylabel('Employee Count') # Graph Employee Evaluation sns.distplot(df.evaluation, kde=False, color="r", ax=axes[1]).set_title('Employee Evaluation Distribution') axes[1].set_ylabel('Employee Count') # Graph Employee Average Monthly Hours sns.distplot( df.averageMonthlyHours, kde=False, color="b", ax=axes[2]).set_title('Employee Average Monthly Hours Distribution') axes[2].set_ylabel('Employee Count') f, ax = plt.subplots(figsize=(15, 4)) sns.countplot(y="salary", hue='turnover', data=df).set_title('Employee Salary Turnover Distribution') # Employee distribution # colors for different department color_types = [ '#78C850', '#F08030', '#6890F0', '#A8B820', '#A8A878', '#A040A0', '#F8D030', '#E0C068', '#EE99AC', '#C03028', '#F85888', '#B8A038', '#705898', '#98D8D8', '#7038F8' ] f, ax = plt.subplots(figsize=(15, 4)) # Count Plot (a.k.a. Bar Plot) sns.countplot( x='department', data=df, palette=color_types).set_title('Employee Department Distribution')
plt.xlabel("t")
plt.ylabel("Passengers")

# Scatter of passengers against the squared time index.
plt.plot(airline["t_square"], airline["Passengers"], "bo")
plt.xlabel("t_square")
plt.ylabel("Passengers")

# table
pd.crosstab(airline["log_Passengers"], airline["Passengers"])
pd.crosstab(airline["t"], airline["Passengers"])
pd.crosstab(airline["t_square"], airline["Passengers"])

## Barplot
pd.crosstab(airline["log_Passengers"], airline["Passengers"]).plot(kind="bar", width=1.85)
pd.crosstab(airline["t"], airline["Passengers"]).plot(kind="bar", width=1.85)
pd.crosstab(airline["t_square"], airline["Passengers"]).plot(kind="bar", width=1.85)

# Count plots of each column.
sns.countplot(x="Passengers", data=airline, palette="hls")
sns.countplot(x="log_Passengers", data=airline, palette="hls")
sns.countplot(x="t", data=airline, palette="hls")
sns.countplot(x="t_square", data=airline, palette="hls")

# Box plots of Passengers with respect to each value of log_Passengers, t
# and t_square.
sns.boxplot(x="log_Passengers", y="Passengers", data=airline, palette="hls")
sns.boxplot(x="t", y="Passengers", data=airline, palette="hls")
sns.boxplot(x="t_square", y="Passengers", data=airline, palette="hls")

# Histogram of each column and scatter plot of each variable with respect to
# the other columns.
sns.pairplot(airline.iloc[:, 0:17])
# `height` is the current name of the facet-size argument; the old `size`
# keyword was renamed in seaborn 0.9 and is rejected by current releases.
sns.pairplot(airline, hue="Passengers", height=2)

airline["Passengers"].value_counts()
airline["log_Passengers"].value_counts()
airline["t"].value_counts()
airline["t_square"].value_counts()
# Flag every row from 2020-09-22 onwards in the 'rac' column.
df_tmp.loc['2020-09-22':, 'rac'] = True  # type: ignore
df_tmp = df_tmp.reset_index()
df_tmp['hour'] = df_tmp['datetime'].dt.hour
plt.figure(figsize=(10, 10))
sns.set(style="whitegrid",
        palette=sns.color_palette("muted", n_colors=6, desat=1.0))
# Horizontal bars: count per hour of day, split by the 'rac' flag.
sns.barplot(y=df_tmp['hour'], x=df_tmp['count'], hue=df_tmp['rac'], orient='h')
plt.draw()

# %%
df_melt = pd.melt(df, value_vars=['rac'], value_name='ractopamine')
plt.figure(figsize=(10, 10))
sns.set(style="whitegrid",
        palette=sns.color_palette("muted", n_colors=6, desat=1.0))
ax = sns.countplot(data=df_melt, x='ractopamine', hue='ractopamine')
# Write each bar's count just below its top edge, centred on the bar.
for p in ax.patches:
    height = p.get_height()
    if not height > 0:
        # Empty or NaN-height patches have nothing to label.
        continue
    # Centre on the bar's real width instead of a hard-coded 0.2 offset,
    # which is only the centre when the bar happens to be 0.4 wide.
    ax.annotate(f'\n{height}',
                (p.get_x() + p.get_width() / 2, height),
                ha='center',
                va='top',
                color='white',
                size=18)
plt.draw()

# %%
# using sklearn's MinMaxScaler
scaler = sklearn.preprocessing.MinMaxScaler(feature_range=(0, 1))
# Work on a copy of the columns from position 3 up to (not including) the last.
df_train = df.iloc[:, 3:-1].copy()