def run_demographics(f):
    """Print and plot gender/age demographics of the Day 3 participants.

    Reads the CSV ``f`` (skipping its first row and loading only the
    ``DB_ID``, ``Age``, ``Gender`` and ``Day 3`` columns), keeps the rows
    whose ``Day 3`` value is ``'y'`` and lower-cases the ``'M'``/``'F'``
    gender codes.  Counts, a gender count plot and per-gender age
    distributions are then shown twice: for the whole cohort and for the
    participants aged 40 or younger.

    :param f: path or buffer understood by ``pandas.read_csv``.
    """
    df = pd.read_csv(f, skiprows=1, usecols=['DB_ID', 'Age', 'Gender', 'Day 3'])
    # Explicit copy: the normalisation below must write to this frame, not to
    # a temporary produced by the boolean filter (chained-assignment pitfall).
    df = df[df['Day 3'] == 'y'].copy()

    # Vectorised replacement of the upper-case codes; other values are kept.
    df['Gender'] = df['Gender'].replace({'M': 'm', 'F': 'f'})

    cohorts = (
        ('\n#### overall cohort ####\n', df),
        ('\n\n#### young cohort ####\n', df[df['Age'] <= 40]),
    )
    for heading, cohort in cohorts:
        females = cohort[cohort['Gender'] == 'f']
        males = cohort[cohort['Gender'] == 'm']

        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(heading)
        print('N = ' + str(len(cohort)))
        print('N female = ' + str(len(females)))
        print('N male = ' + str(len(males)))
        # Keyword form: newer seaborn releases no longer accept the data
        # vector as the first positional argument.
        sns.countplot(x=cohort['Gender'])
        plt.show()
        print('age distribution - female:')
        sns.distplot(females['Age'])
        plt.show()
        print('age distribution - male:')
        sns.distplot(males['Age'])
        plt.show()
Example #2
0
def inspect_dataset(train_data, test_data):
    """
        Inspect the dataset.

        Prints the number of records in each set, then shows the
        ``price_range`` count plots of the training set (left) and the
        test set (right) side by side with a shared y axis.
    """
    print('\n===================== 数据查看 =====================')
    for template, frame in (('训练集有{}条记录。', train_data),
                            ('测试集有{}条记录。', test_data)):
        print(template.format(len(frame)))

    # Visualise the number of records in each class
    plt.figure(figsize=(10, 5))

    def _decorate(panel_title):
        # Title, vertical tick labels and axis labels of the current panel.
        plt.title(panel_title)
        plt.xticks(rotation='vertical')
        plt.xlabel('价格等级')
        plt.ylabel('数量')

    # Training set
    left_axes = plt.subplot(1, 2, 1)
    sns.countplot(x='price_range', data=train_data)
    _decorate('训练集')

    # Test set (y axis shared with the training panel)
    plt.subplot(1, 2, 2, sharey=left_axes)
    sns.countplot(x='price_range', data=test_data)
    _decorate('测试集')

    plt.tight_layout()
    plt.show()
def run_demographics(df):
    """Print and plot gender/age demographics from an already loaded frame.

    ``df['gender']`` is read as 1 = female and 2 = male (as labelled in the
    printed output).  The overall section uses the ``'age day 3'`` column;
    afterwards, for every age column listed below, the same summary is
    produced for the rows whose age in that column is 40 or less.
    """
    is_female = df['gender'] == 1
    is_male = df['gender'] == 2
    by_gender = (('age distribution - female:', is_female),
                 ('age distribution - male:', is_male))

    print('\n#### overall cohort, at time of scanning ####\n')
    print('N = ' + str(len(df)))
    print('N female = ' + str(len(df[is_female])))
    print('N male = ' + str(len(df[is_male])))
    sns.countplot(df['gender'])
    plt.show()
    for caption, gender_mask in by_gender:
        print(caption)
        sns.distplot(df['age day 3'][gender_mask].dropna())
        plt.show()
    print('\n\n')

    age_columns = ['age day 1', 'age day 2', 'age day 3', 'age day 4',
                   'age day 5a', 'age day 5b', 'age day 6', 'age LEMON']
    for survey in age_columns:
        ages = df[survey]
        is_young = ages <= 40

        print('\n\n\n#### young cohort, %s ####\n' % survey)
        print('N = ' + str(len(df[is_young])))
        print('N female = ' + str(len(df[is_young & is_female])))
        print('N male = ' + str(len(df[is_young & is_male])))
        sns.countplot(df[is_young]['gender'])
        plt.show()
        for caption, gender_mask in by_gender:
            print(caption)
            sns.distplot(ages[is_young & gender_mask].dropna(), bins=15)
            plt.show()
def run_demographics(df):
    """Print and plot gender/age demographics from an already loaded frame.

    ``df['gender']`` is read as 1 = female and 2 = male (as labelled in the
    printed output).  The overall section uses the ``age_C`` column; then,
    for each of the surveys A, B, C, F and G, the same summary is produced
    for the rows whose ``age_<survey>`` value is 40 or less.
    """
    is_female = df['gender'] == 1
    is_male = df['gender'] == 2
    by_gender = (('age distribution - female:', is_female),
                 ('age distribution - male:', is_male))

    print('\n#### overall cohort, at time of scanning ####\n')
    print('N = ' + str(len(df)))
    print('N female = ' + str(len(df[is_female])))
    print('N male = ' + str(len(df[is_male])))
    sns.countplot(df['gender'])
    plt.show()
    for caption, gender_mask in by_gender:
        print(caption)
        sns.distplot(df['age_C'][gender_mask].dropna())
        plt.show()

    for survey in ['A', 'B', 'C', 'F', 'G']:
        ages = df['age_%s' % survey]
        is_young = ages <= 40

        print('\n\n#### young cohort, Survey %s ####\n' % survey)
        print('N = ' + str(len(df[is_young])))
        print('N female = ' + str(len(df[is_young & is_female])))
        print('N male = ' + str(len(df[is_young & is_male])))
        sns.countplot(df[is_young]['gender'])
        plt.show()
        for caption, gender_mask in by_gender:
            print(caption)
            sns.distplot(ages[is_young & gender_mask].dropna(), bins=15)
            plt.show()
Example #5
0
def inspect_dataset(train_data, test_data):
    """
        Inspect the dataset.
        Parameters:
            - train_data    training data
            - test_data     test data

        Prints the number of records in each set, then shows the
        ``text_type`` count plots of both sets side by side with a shared
        y axis.
    """
    print('\n===================== 数据查看 =====================')
    for template, frame in (('训练集有{}条记录。', train_data),
                            ('测试集有{}条记录。', test_data)):
        print(template.format(len(frame)))

    # Visualise the number of records in each class
    plt.figure(figsize=(10, 5))

    def _decorate(panel_title):
        # Title and axis labels of the current panel.
        plt.title(panel_title)
        plt.xlabel('Type')
        plt.ylabel('Count')

    # Training set
    left_axes = plt.subplot(1, 2, 1)
    sns.countplot(x='text_type', data=train_data)
    _decorate('Training Data')

    # Test set (y axis shared with the training panel)
    plt.subplot(1, 2, 2, sharey=left_axes)
    sns.countplot(x='text_type', data=test_data)
    _decorate('Test Data')

    plt.tight_layout()
    plt.show()
def bar_plot(data, col, hue=None, file_name=None):
    """Draw a count plot of ``col`` and label the bars of every subplot.

    :param data: DataFrame to plot; it is sorted by ``col`` first.
    :param col: name of the column counted on the x axis.
    :param hue: optional column name used to split the bars.
    :param file_name: accepted for interface compatibility; this function
        does not read it.
    """
    # DataFrame.sort() no longer exists in pandas; sort_values() replaces it.
    sns.countplot(x=col, hue=hue, data=data.sort_values(col))
    sns.despine(left=True)

    for plot in plt.gcf().get_children():
        if not isinstance(plot, matplotlib.axes.Subplot):
            continue
        rectangles = [x for x in plot.get_children()
                      if isinstance(x, matplotlib.patches.Rectangle)]
        # Label inside the loop: the original call sat after the loop, so it
        # only handled the last subplot and raised NameError with none.
        autolabel(rectangles)
Example #7
0
def make_bar():
    """Draw a horizontal count plot of random Yes/No responses per age group.

    Builds 1000 synthetic rows (``age`` from ``randint(10, 50, 1000)``,
    ``response`` picked from 'Yes'/'No'), bins the ages into 5-year groups
    and plots the response counts split by age group.
    """
    df = pd.DataFrame(data={"age": randint(10, 50, 1000),
                        "response": [choice(['Yes', 'No']) for i in range(1000)]})

    # Bin edges 10, 15, ..., 50; include_lowest keeps age 10 in the first bin.
    df['age_group'] = pd.cut(df.age, bins=list(range(10, 51, 5)), include_lowest=True)

    sns.countplot(y='response', hue='age_group', data=df, palette="Greens_d")
	def specificAminoAcidPDACanalysis(mostCommonPDACgeneOnly):
		"""Count and plot the non-synonymous amino acid changes of one gene.

		Reads ``noPDAC`` and ``varsInAA`` from the enclosing scope.  Every key
		of ``varsInAA`` that is listed in ``noPDAC`` is deleted in place, so
		only PDAC patients remain for the rest of the function.  The argument
		``mostCommonPDACgeneOnly`` is not used by the body.

		Returns the tuple ``(variant, patientIDwithVariant, patientsWithMuts)``:
		the gene name, the patient IDs with at least one change in that gene,
		and the non-empty per-patient lists of changes.
		"""
		#gene of interest; update this to the variant to analyse
		variant='U2AF1'

		#drop the patients that are not PDAC
		for patient in noPDAC:
			if patient in varsInAA:
				del varsInAA[patient]

		#codon_changes holds one list per patient with all changes of the gene;
		#patientIDwithVariant holds the IDs whose list is not empty
		codon_changes=[]
		patientIDwithVariant=[]
		for patient_id in varsInAA:
			table=varsInAA[patient_id]
			changes=list(table.loc[table.gene==variant]['amino_acid_change'])
			codon_changes.append(changes)
			if len(changes)>0:
				patientIDwithVariant.append(patient_id)

		#keep only the non-empty lists; as a check,
		#len(patientsWithMuts)==len(patientIDwithVariant)
		patientsWithMuts=[muts for muts in codon_changes if len(muts)!=0]
		print(patientsWithMuts)
		print('The number of PDAC patients with at least one variant/mutation in '+str(variant)+' is '+ str(len(patientIDwithVariant)))

		#flatten the per-patient lists into one list for the dataframe
		amino_acid_changes=[change for muts in patientsWithMuts for change in muts]

		#ordered maps each change to its number of occurrences, most frequent first
		ordered=OrderedDict(sorted(Counter(amino_acid_changes).items(), key=lambda pair: -pair[1]))

		#the keys alone give the plotting order (decreasing frequency)
		order_in_graph=list(ordered)

		#the dataframe itself is unordered; countplot receives the order explicitly
		headers=['non-synonymous amino acid changes']
		variant_dataframe=pandas.DataFrame(amino_acid_changes, columns=headers)
		sns.countplot(x='non-synonymous amino acid changes', data=variant_dataframe, order=order_in_graph)
		sns.plt.title('Variants in '+str(variant)+' across '+str(len(patientsWithMuts))+' PDAC patients')
		sns.plt.show()

		return variant, patientIDwithVariant, patientsWithMuts
Example #9
0
def make_figures(df, outdir):
    """Save a horizontal count plot of the ``OS`` column of ``df``.

    The image is written to ``graph_sub_by_os.png`` inside ``outdir``.
    """
    import matplotlib
    # The Agg backend is selected before pylab is imported so that saving
    # figures does not go through the X backend.
    matplotlib.use('Agg')

    import pylab as plotting
    import seaborn as sns

    plotting.figure()
    sns.countplot(y='OS', data=df)
    plotting.xlabel('Number of submissions')
    plotting.ylabel('Primary operating system')
    target = opj(outdir, 'graph_sub_by_os.png')
    plotting.savefig(target)
    def weather_distribution(self):
        """Print the summary of the ``weather`` column and draw its count plot.

        The weather frame is loaded from the training directory and stored
        on ``self.gapdf`` before it is summarised and plotted.
        """
        train_dir = g_singletonDataFilePath.getTrainDir()
        self.gapdf = self.load_weatherdf(train_dir)
        weather_summary = self.gapdf['weather'].describe()
        print(weather_summary)

        sns.countplot(x="weather", data=self.gapdf, palette="Greens_d")
        plt.title('Countplot of Weather')
Example #11
0
 def plot(self):
     """Export one count plot per ID, CAT and CONT column to ``exports/``.

     ``self.data`` is expected to carry a two-level column index whose
     second level holds the column kind ('ID', 'CAT', 'CONT').  For ID and
     CAT columns only the first character of every value is counted; CONT
     values are counted unchanged.  Missing (NaN) values are skipped.  Each
     plot is saved as ``exports/<column name>.png``.
     """
     print('\n','Take a moment to review the plots in the export folder. These show the diversity of information found within each data column.')
     print('ID and Category Columns have been reduced to their first character.')
     print('Sensitive information has been converted to numbers.')
     print('At this time, this program does not support plots of Time categories', '\n')
     frame = self.data

     def _is_missing(value):
         # An identity test against np.nan only matches that one object;
         # NaNs coming out of numeric columns are distinct float objects,
         # so they must be detected by value.
         return isinstance(value, (float, np.floating)) and np.isnan(value)

     def _export_counts(kind, first_char_only):
         # Save a count plot for every column of the given kind.
         columns = frame.xs(kind, axis=1, level=1)
         for name in columns.columns:
             values = [row[0] if first_char_only else row
                       for row in columns[name]
                       if not _is_missing(row)]
             axes = sns.countplot(x=np.array(values))
             figure = axes.get_figure()
             figure.savefig("exports/"+name+".png")
             # Clear the canvas so the next column does not get drawn on
             # top of this one (countplot reuses the current axes).
             figure.clf()

     _export_counts('ID', True)
     _export_counts('CAT', True)
     _export_counts('CONT', False)
def plot(data):
    """Save the per-dataset summary figure and the pairwise data figure.

    Does nothing for empty ``data``.  Otherwise writes a 2x2 panel figure
    (bar plots of ``v``, ``elongation`` and ``growth`` per dataset plus a
    count plot) to ``parB_interspot/parB_interspot.pdf`` and a pair grid of
    the three measurements to ``parB_interspot/parB_interspot_data.pdf``.
    """
    if len(data) == 0:
        return

    dataset_order = [
        "delParA", "delParAB",
        "WT ParAB int", "WT ParB int",
        "WT episomal ParB",
    ]
    plt.figure(figsize=(8, 8))

    # (subplot position, measured column, axis label) of the bar-plot panels,
    # in the order in which they are drawn.
    mean_panels = (
        (221, "v", r"Mean separation velocity (\si{\micro\metre\per\hour})"),
        (222, "elongation", r"Mean elongation rate (\si{\micro\metre\per\hour})"),
        (224, "growth", r"Mean growth rate (\si{\per\hour})"),
    )
    for position, column, axis_label in mean_panels:
        panel = plt.subplot(position)
        sns.barplot(x="dataset", y=column, data=data, order=dataset_order)
        _fmt_barplot(panel, axis_label)

    # Number of observations per dataset
    count_panel = plt.subplot(223)
    sns.countplot(x="dataset", data=data, order=dataset_order)
    _fmt_barplot(count_panel, "n")

    plt.tight_layout()
    plt.savefig("parB_interspot/parB_interspot.pdf")

    grid = (
        sns.PairGrid(data, vars=["v", "growth", "elongation"], hue="dataset")
        .map_diag(plt.hist)
        .map_offdiag(plt.scatter)
        .add_legend(bbox_to_anchor=(1.2, 0.55))
    )
    grid.savefig("parB_interspot/parB_interspot_data.pdf")
Example #13
0
def composite_qc(df_orig, size=(16, 12)):
    """ Plot composite QC figures

    Draws six panels on figure 1: age histogram, gender counts, ethnicity
    counts, mean-coverage histogram, sequencing-chemistry counts and cohort
    counts (with anonymised cohort names), then adds the panel letters A-F.
    """
    display_names = {
        "hli_calc_age_sample_taken": "Age",
        "hli_calc_gender": "Gender",
        "eth7_max": "Ethnicity",
        "MeanCoverage": "Mean coverage",
        "Chemistry": "Sequencing chemistry",
        "Release Client": "Cohort",
    }
    df = df_orig.rename(columns=display_names)

    fig = plt.figure(1, size)
    # (row, first column, column span) of each panel on the 2 x 7 grid
    layout = ((0, 0, 2), (0, 2, 2), (0, 4, 3),
              (1, 0, 2), (1, 2, 2), (1, 4, 3))
    panels = []
    for row, column, span in layout:
        panels.append(plt.subplot2grid((2, 7), (row, column),
                                       rowspan=1, colspan=span))
    ax1, ax2, ax3, ax4, ax5, ax6 = panels

    sns.distplot(df["Age"].dropna(), kde=False, ax=ax1)
    sns.countplot(x="Gender", data=df, ax=ax2)
    ethnicity_order = df['Ethnicity'].value_counts().index
    sns.countplot(x="Ethnicity", data=df, ax=ax3, order=ethnicity_order)
    sns.distplot(df["Mean coverage"].dropna(), kde=False, ax=ax4)
    ax4.set_xlim(0, 100)
    sns.countplot(x="Sequencing chemistry", data=df, ax=ax5)
    cohort_order = df['Cohort'].value_counts().index
    sns.countplot(x="Cohort", data=df, ax=ax6, order=cohort_order)

    # Anonymize the cohorts: "Spector" becomes "TwinsUK", "Health Nucleus"
    # keeps its tick label, every other cohort is numbered by position.
    relabelled = []
    for position, tick_label in enumerate(ax6.get_xticklabels()):
        cohort_name = tick_label.get_text()
        if cohort_name == "Spector":
            relabelled.append("TwinsUK")
        elif cohort_name != "Health Nucleus":
            relabelled.append("C{}".format(position + 1))
        else:
            relabelled.append(tick_label)
    ax6.set_xticklabels(relabelled)

    ax6.set_xticklabels(ax6.get_xticklabels(), ha="right", rotation=30)

    # Move every x label into the panel title
    for panel in panels:
        panel.set_title(panel.get_xlabel())
        panel.set_xlabel("")

    plt.tight_layout()

    root = fig.add_axes((0, 0, 1, 1))
    labels = ((.02, .96, "A"),
              (.3, .96, "B"),
              (.6, .96, "C"),
              (.02, .52, "D"),
              (.3, .52, "E"),
              (.6, .52, "F"))
    panel_labels(root, labels)
    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()
def plot_and_show_crime_type_histogram(dataframe):
    """
    Display how many times each type of crime was committed.

    One horizontal bar is drawn per value of the "Category" column.
    :param dataframe: The dataframe that holds the data.
        It must contain a column named "Category".
    :result: shows the plot through ``plt.show()``.
    """
    axis_font = 26
    sns.countplot(y="Category", data=dataframe, palette="Greens_d")
    plt.xlabel("Number of Crimes Committed", fontsize=axis_font)
    plt.ylabel("Type of Crime", fontsize=axis_font)
    plt.suptitle("Crime Type Instances", fontsize=30)
    plt.show()
Example #15
0
def plothist(df, classes='target', ax=None, col=None, order=None, title=""):
    """Draw a count plot of ``classes`` and annotate each bar with its share.

    :param df: DataFrame holding the data.
    :param classes: column counted on the x axis.
    :param ax: axes to draw on; a new figure with one subplot is created
        when omitted.
    :param col: optional column used as ``hue``.
    :param order: optional category order passed to seaborn.
    :param title: title of the axes.
    :return: the axes that were drawn on.
    """
    if ax is None:
        fig = plt.figure()
        ax = fig.add_subplot(1, 1, 1)

    # ``hue=None`` and ``order=None`` are seaborn's defaults, so this single
    # call covers all four combinations of ``col`` / ``order``.
    sns.countplot(x=classes, hue=col, data=df, ax=ax, order=order)

    total = df.shape[0]
    tallest = 0
    for bar in ax.patches:
        corners = bar.get_bbox().get_points()
        height = corners[1, 1]
        if height != height:
            # NaN height (a bar without data): nothing to annotate.
            continue
        tallest = max(tallest, height)
        ax.annotate('{:.1f} %'.format(100. * height / total),
                    (corners[:, 0].mean(), height),
                    ha='center', va='bottom', fontsize=12)  # centred above the bar

    # Leave head room for the annotations; skip when there are no bars to
    # avoid collapsing the y axis to (0, 0).
    if tallest > 0:
        ax.set_ylim((0, 1.15 * tallest))
    ax.set_title(title)

    return ax
def show_variable_info(col, quiet = False):
    """Print the missing-value share of ``col`` and plot its distribution.

    float64 columns get a distribution plot, columns with fewer than 16
    distinct values a count plot, and all other columns a bar plot of the
    first 16 entries of ``value_counts()``.

    :param col: pandas Series to describe.
    :param quiet: when true, nothing is printed or plotted.
    """
    if quiet:
        return

    notnull = col.notnull()
    total = len(col)
    missing_cnt = total - int(notnull.sum())
    # Real percentage: scaled by 100, float division on Python 2 as well,
    # and no ZeroDivisionError for an empty column.
    missing_pct = 100.0 * missing_cnt / total if total else 0.0
    print(col.name, "| Missing values: {0} ({1:0.2f} %)".format(missing_cnt, missing_pct))

    n_levels = len(col.unique())
    if col.dtype == 'float64':
        sns.distplot(col[notnull])
    elif n_levels < 16:
        sns.countplot(x=col[notnull])
    else:
        print('Showing only first 16 levels from', n_levels)
        col_cut = col.value_counts()[:16]
        ax = plt.axes()
        sns.barplot(x = col_cut.index, y =  col_cut, ax = ax)
        ax.set_ylabel('Count')
def sex_analyze(is_plot=True):
    '''
    Gender analysis: classify every passenger as male, female or child.

    Works on the module-level ``titanic_df`` and ``test_df``: a temporary
    ``Person`` column is derived from ``Age`` and ``Sex`` via
    ``get_persion``, ``Sex`` is dropped, and the ``Child`` / ``Female``
    dummy columns are joined onto both frames.

    :param is_plot: when true, plot the person counts and the survival
        rate per person type.
    :return: None
    '''
    global  titanic_df
    global test_df

    titanic_df['Person'] = titanic_df[['Age', 'Sex']].apply(get_persion, axis=1)
    test_df['Person'] = test_df[['Age', 'Sex']].apply(get_persion, axis=1)

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print('-' * 40)
    print(titanic_df.head(n=20))
    print(test_df.head(n=20))

    # Drop 'Sex'
    titanic_df.drop(['Sex'], axis=1, inplace=True)
    test_df.drop(['Sex'], axis=1, inplace=True)

    # Build dummy columns from 'Person'.
    # 'Male' is dropped because it has the lower survival rate, so this
    # factor contributes little.
    # NOTE(review): the renaming assumes get_dummies yields exactly three
    # columns in child/female/male order - confirm against get_persion.
    person_dummies_titanic = pd.get_dummies(titanic_df['Person'])
    person_dummies_titanic.columns = ['Child', 'Female', 'Male']
    person_dummies_titanic.drop(['Male'], axis=1, inplace=True)

    person_dummies_test = pd.get_dummies(test_df['Person'])
    person_dummies_test.columns = ['Child', 'Female', 'Male']
    person_dummies_test.drop(['Male'], axis=1, inplace=True)

    titanic_df = titanic_df.join(person_dummies_titanic)
    test_df = test_df.join(person_dummies_test)

    if is_plot:
        # The figure is only created when something is drawn on it.
        fig, (axis1, axis2) = plt.subplots(1, 2, figsize=(10, 5))

        # Number of passengers per person type
        sns.countplot(x='Person', data=titanic_df, ax=axis1)

        # Survival rate per person type
        person_perc = titanic_df[['Person', 'Survived']].groupby('Person', as_index=False).mean()
        sns.barplot(x='Person', y='Survived', data=person_perc,
                    ax=axis2,
                    order=['male', 'female', 'child'])

    # Drop the temporary 'Person' column
    titanic_df.drop(['Person'], axis=1, inplace=True)
    test_df.drop(['Person'], axis=1, inplace=True)
Example #18
0
    def plot_chemical_trajectory(self, environment, filename):
        """
        Plot the trajectory through chemical space.

        The left panel shows the visited state at every iteration, the right
        panel how often each state was visited.  The figure is written as a
        single page to ``filename``.

        Parameters
        ----------
        environment : str
            the name of the environment for which the chemical space trajectory is desired
        filename : str
            path of the PDF file to write
        """
        chemical_state_trajectory = self.extract_state_trajectory(environment)

        visited_states = list(set(chemical_state_trajectory))

        # One dictionary lookup per step instead of a list.index() scan.
        state_position = {state: position
                          for position, state in enumerate(visited_states)}
        state_trajectory = np.array(
            [state_position[state] for state in chemical_state_trajectory],
            dtype=float)

        with PdfPages(filename) as pdf:
            sns.set(font_scale=2)
            fig = plt.figure(figsize=(28, 12))
            plt.subplot2grid((1,2), (0,0))
            # x/y passed by keyword: newer seaborn releases reject them as
            # positional arguments.
            sns.scatterplot(x=np.arange(len(state_trajectory)), y=state_trajectory)
            plt.yticks(np.arange(len(visited_states)), visited_states)

            plt.title("Trajectory through chemical space in {}".format(environment))
            plt.xlabel("iteration")
            plt.ylabel("chemical state")
            plt.tight_layout()

            plt.subplot2grid((1,2), (0,1))
            sns.countplot(y=state_trajectory)

            pdf.savefig(fig)
            plt.close(fig)
Example #19
0
def explor(file, target):
    """Render the exploration images of ``target`` into ``settings.MEDIA_ROOT``.

    Files in the media directory whose name starts with ``expl`` or
    ``pairplot`` are removed first.  ``expl.png`` then shows the target
    column: a count plot when its dtype is neither ``int64`` nor
    ``float64``, a histogram otherwise.  ``pairplot.png`` shows a seaborn
    pair plot coloured by ``target``, or a text notice when the dataset has
    more than 10 columns or no ``int64``/``float64`` column.

    :param file: object exposing ``dataset`` and ``colNames``
        (presumably a wrapper around a pandas DataFrame - confirm).
    :param target: name of the column to explore.
    """
    import matplotlib
    # Select the non-interactive backend once, before pyplot is imported.
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    import os
    import seaborn as sns

    # Delete the old png
    outPutPath = settings.MEDIA_ROOT
    for item in os.listdir(outPutPath):
        tempPath = os.path.join(outPutPath, item)
        if not os.path.isdir(tempPath) and item.startswith(("expl", "pairplot")):
            os.remove(tempPath)
    explPathOne = os.path.join(outPutPath, 'expl.png')
    explPathTwo = os.path.join(outPutPath, 'pairplot.png')

    sns.set(style="whitegrid", color_codes=True)
    if (file.dataset[target].dtype != "int64"
        and file.dataset[target].dtype != "float64"):
        g = sns.countplot(x=target, data=file.dataset, palette="Greens_d")
        g.figure.subplots_adjust(bottom=0.4)
        for tick in g.xaxis.get_major_ticks():
            # label1 is the tick's primary label; the bare ``label`` alias
            # is deprecated in matplotlib.
            tick.label1.set_fontsize(8)
            tick.label1.set_rotation(90)
        plt.tight_layout()
    else:
        file.dataset.hist(column = target)
    plt.savefig(explPathOne)
    plt.close('all')

    datatypes = set()
    for col in file.colNames:
        datatypes.add(str(file.dataset[col].dtype))
    if ((len(file.colNames) > 10) or (('int64' not in datatypes) and
        ('float64' not in datatypes))):
        # No pair plot possible: render an explanatory notice instead.
        fig = plt.figure()
        ax = fig.add_subplot(111)
        fig.subplots_adjust(top=0.85)
        ax.text(0.2, 0.8, 'Sometimes, you just cannot get what you want.',
                style='italic',bbox={'facecolor':'red', 'alpha':0.5, 'pad':10})
        ax.text(0.2, 0.6, 'There are two possible reasons:',
                style='italic',bbox={'facecolor':'blue', 'alpha':0.5, 'pad':5})
        ax.text(0.2, 0.5, '1. You have too many columns; ',
                style='italic',bbox={'facecolor':'blue', 'alpha':0.5, 'pad':5})
        ax.text(0.2, 0.4, '2. All your columns are categorial variables',
                style='italic',bbox={'facecolor':'blue', 'alpha':0.5, 'pad':5})
        ax.text(0.2, 0.3, 'Unhappy? Go to use other website, please!',
                style='italic',bbox={'facecolor':'yellow','alpha':0.5, 'pad':5})
    else:
        sns.set()
        g = sns.pairplot(file.dataset, hue = target)
        g.fig.subplots_adjust(top=0.8, right = 0.8)
        g.fig.suptitle('Pair Plots of All Numberic Variables',
                        fontsize=20,color="r",alpha=0.5)
    plt.savefig(explPathTwo)
    plt.close('all')
Example #20
0
 def univariate(self):
     """Save one univariate plot per numerical and categorical column.

     Plots are written below ``<self.directory>/<self.univariate_dir>``
     (created when missing): a histogram for every column in
     ``self.numericals`` and a count plot for every column in
     ``self.categoricals`` that has fewer than 10 distinct values.
     """
     path_to_uni_plots = os.path.join(self.directory, self.univariate_dir)
     if not os.path.exists(path_to_uni_plots):
         os.makedirs(path_to_uni_plots)

     for num_col in self.numericals:
         x = self.train[num_col].dropna()
         plt.figure(figsize=(10, 8))
         sns.distplot(x, kde=False, rug=False)
         plt.savefig(os.path.join(path_to_uni_plots, num_col + '_histogram.png' ), bbox_inches='tight')
         # Release the figure; otherwise one stays open per column.
         plt.close()

     for cat_col in self.categoricals:
         if len( self.train[cat_col].unique() ) < 10:
             plt.figure(figsize=(10, 8))
             sns.countplot(x=cat_col, data=self.train)
             plt.savefig(os.path.join(path_to_uni_plots, cat_col + '_bar.png' ), bbox_inches='tight')
             plt.close()
Example #21
0
	def plotExp(self,exp,myData):
		"""Save a count plot of the column named by ``exp[1]['xaxis']``.

		The image goes to ``plots/static/<settings.count>.png`` and
		``settings.count`` is incremented afterwards.
		"""
		plt.figure()
		column=myData[exp[1]['xaxis']]
		sns.countplot(column)
		target="plots/static/%d.png" %settings.count
		plt.savefig(target)
		plt.clf()
		plt.close()
		settings.count+=1
def basicplots(df_copy):
    """Show the basic crime plots and return ``df_copy`` without outliers.

    Three figures are drawn from the module-level ``df``: crime counts per
    day of week (ordered by the module-level ``week``), crime counts per
    police district, and the X/Y coordinates.
    NOTE(review): the plots read the global ``df`` rather than ``df_copy``;
    confirm that this is intended.

    :param df_copy: frame with ``X`` and ``Y`` columns to filter.
    :return: the rows of ``df_copy`` whose X and Y both lie within three
        standard deviations of their mean, or ``None`` when an error
        occurred (the error is printed).  The function previously returned
        ``None`` in every case because the filtered frames were discarded.
    """
    try:
        #make a vertical barplot of crime counts per day of week
        plt.figure(1)
        sns.countplot(x="DayOfWeek", data=df, palette="Greens_d", order=week)
        #make a horizontal barplot of crime counts per PdDistrict
        plt.figure(2)
        sns.countplot(y="PdDistrict", data=df, palette="Greens_d")
        #plot X and Y coordinates
        plt.figure(3)
        plt.plot(df['X'], df['Y'], 'o')
        #remove outliers (|z-score| >= 3 on either coordinate)
        x_ok = ((df_copy.X - df_copy.X.mean()) / df_copy.X.std()).abs() < 3
        # The Y z-score uses the Y mean (the original subtracted the X mean).
        y_ok = ((df_copy.Y - df_copy.Y.mean()) / df_copy.Y.std()).abs() < 3
        filtered = df_copy[x_ok & y_ok]
        plt.show()
        return filtered
    except Exception as e:
        # Best-effort plotting: report the problem instead of raising.
        print(e)
        return None
def family_analyze(is_plot=True):
    """Merge ``Parch`` and ``SibSp`` into a binary ``Family`` feature.

    Works in place on the module-level ``titanic_df`` and ``test_df``:
    ``Family`` becomes 1 for passengers with at least one relative aboard
    and 0 otherwise, and the ``Parch`` / ``SibSp`` columns are dropped.

    :param is_plot: when true, plot the family counts and the survival
        rate per family value.
    """
    # Combine parents/children and siblings/spouses into one feature
    titanic_df['Family'] = titanic_df['Parch'] + titanic_df['SibSp']
    print(titanic_df['Family'].head(n=20))

    family_no = titanic_df['Family'].loc[titanic_df['Family'] > 0]
    print(type(family_no))
    print(family_no.head())
    family_no_1 = titanic_df['Family'][titanic_df['Family'] > 0]
    print(type(family_no_1))
    print(family_no_1.head())

    # With family -> 1, without family members -> 0 (zero is already 0).
    # Assign through the frame's .loc so the write cannot land on a
    # temporary copy of the column.
    titanic_df.loc[titanic_df['Family'] > 0, 'Family'] = 1

    # Same treatment for the test set
    test_df['Family'] = test_df['Parch'] + test_df['SibSp']
    test_df.loc[test_df['Family'] > 0, 'Family'] = 1

    # Drop 'Parch' and 'SibSp'
    titanic_df.drop(['Parch', 'SibSp'], axis=1, inplace=True)
    test_df.drop(['Parch', 'SibSp'], axis=1, inplace=True)

    print('-' * 40)
    print(titanic_df.head())
    print(test_df.head())

    if is_plot:
        # The figure is only created when something is drawn on it.
        fig, (axis1, axis2) = plt.subplots(1, 2, sharex=True, figsize=(10,5))

        # Number of passengers with Family == 0 and Family == 1
        sns.countplot(x='Family', data=titanic_df, order=[0,1],ax=axis1)

        # Survival rate for Family == 0 and Family == 1
        family_perc = titanic_df[['Family', 'Survived']].groupby('Family', as_index=False).mean()
        sns.barplot(x='Family', y='Survived', data=family_perc, order=[0,1], ax=axis2)

        # Tick order follows order=[0, 1]: 0 = travelling alone, 1 = with
        # family (the two labels were previously swapped).
        axis1.set_xticklabels([u'孤身一人', u'有家庭'], rotation=0)
Example #24
0
def draw_histograms(data, headings, data_set):
    """
        Save one count chart per entry of ``chart_categories``.

        Columns that convert to float are drawn as a 10-bin histogram, all
        other columns as a seaborn count plot.  Images are written to
        ``<cwd>/Results/Data Counts/<data_set>/<category>``.

        data: 2-D array of values, one column per heading
        headings: array of column names (compared element-wise, so it is
            expected to be a numpy array)
        data_set: name of the output sub-folder

        Reads ``chart_categories`` and ``chart_titles`` from module scope.
    """
    #create a folder for the dataset
    directory = os.path.join(os.getcwd(), "Results", "Data Counts", data_set)
    if not os.path.exists(directory):
        os.makedirs(directory)

    #convert to a pandas dataset
    pandas_data = pd.DataFrame(data=data, columns=headings)

    for i, chart_category in enumerate(chart_categories):
        chart_title = chart_titles[i]
        out_path = os.path.join(directory, chart_category)

        #get the slice
        index = np.argwhere(headings == chart_category)
        chart_column = data[:, index[0][0]]

        plt.figure()
        plt.xlabel(chart_title)
        plt.ylabel("Count")
        plt.title("%s" % chart_title)

        # Only the numeric conversion decides which chart is drawn.  The
        # former bare ``except`` also swallowed errors from removed APIs
        # (``np.float``, ``normed=``), silently turning every column into a
        # count plot.
        try:
            numeric_column = chart_column.astype(float)
        except (ValueError, TypeError):
            numeric_column = None

        if numeric_column is not None:
            plt.hist(numeric_column, 10, histtype='bar', rwidth=0.8)
            plt.savefig(out_path)
        else:
            # DataFrame.sort() no longer exists; sort_values() replaces it.
            sns_plot = sns.countplot(x=chart_category,
                                     data=pandas_data.sort_values(chart_category),
                                     palette="Greens_d")
            sns_plot.figure.autofmt_xdate()
            sns_plot.figure.savefig(out_path)

        plt.close()
Example #25
0
def make_count(series, title):
    '''
    Save a count plot of a pandas series.

    The title is title-cased; the plot heading shows it with underscores
    replaced by spaces and the image is written to ``Count_<Title>.png``.
    '''
    cased_title = title.title()
    heading = 'Count of {}'.format(cased_title.replace("_", " "))
    sns.countplot(x=series, palette = "RdBu")
    plt.title(heading)
    plt.savefig('Count_{}.png'.format(cased_title), format='png')
    plt.close()
def isoformCheck(path):
	"""Collect the TP53 records of every file in ``path`` and plot them.

	Changes the working directory to ``path`` and reads each file there as
	a tab-separated table with one header line.  Rows whose column 11 is
	the gene and whose column 36 is at least 15 contribute the triple
	(column 14, column 17, column 18), labelled 'mutation type',
	'mutation' and 'isoform' in the resulting dataframe.  The patient ID
	is the part of the file name before the first '-'.  A row that is too
	short is printed and ends the processing of its file.
	"""
	gene='TP53'
	patientIso={}
	noPatIDonlyTups=[]
	os.chdir(path)
	for files in os.listdir(path):
		#extracts patient ID from file name
		patientID=files.split('-', 1)[0]
		with open(files) as handle:
			#skip the header line (comment out if no header is included in
			#input); the default keeps an empty file from raising
			next(handle, None)
			for line in handle:
				line=line.split('\t')
				if len(line)<35:
					print(line)
					break
				if line[11]!=gene:
					continue
				if len(line)<37:
					#column 36 is read below; a gene row without it is
					#handled like any other malformed row instead of
					#raising IndexError
					print(line)
					break
				if float(line[36].rstrip('\n'))>=15:
					record=(str(line[14]), str(line[17]), str(line[18]))
					noPatIDonlyTups.append(list(record))
					patientIso.setdefault(patientID, []).append(record)

	print(patientIso)
	print(len(patientIso))
	print(noPatIDonlyTups)
	#shorten the label for plotting
	for row in noPatIDonlyTups:
		if row[0]=='NON_SYNONYMOUS_CODING':
			row[0]='NON_SYN'
	dataframe=pandas.DataFrame(noPatIDonlyTups, columns=['mutation type', 'mutation', 'isoform'])
	print(dataframe)

	with sns.plotting_context("notebook", font_scale=1.5):
		sns.countplot(y="mutation type", hue="isoform", data=dataframe, palette="Set2")
		#NOTE(review): the sns.plt alias is not available in newer seaborn
		#releases - confirm the installed version
		sns.plt.show()
def plot_classification_frequency(df, category, file_name, convert_labels = False):
    '''
    Plots the frequency at which labels occur

    INPUT
        df: Pandas DataFrame of the image name and labels
        category: name of the column whose values are counted
        file_name: file name of the image
        convert_labels: when true, the 'level' column is first replaced by
            the result of change_labels (binary classification)

    OUTPUT
        Image of plot, showing label frequency
    '''
    # Use the frame that was passed in; the body previously read a
    # module-level ``labels`` and ignored ``df`` entirely.
    if convert_labels:
        # Work on a copy so the caller's frame is left untouched.
        df = df.copy()
        df['level'] = change_labels(df, 'level')

    sns.set(style="whitegrid", color_codes=True)
    sns.countplot(x=category, data=df)
    plt.title('Retinopathy vs Frequency')
    plt.savefig(file_name)
Example #28
0
def draw_histograms(data, headings, data_set):
    """
        Save one count chart per known column of the data set.

        For every name in the built-in ``chart_categories`` list, the matching
        column of ``data`` is located through ``headings``. A column that
        converts to float is drawn as a 10-bin histogram; any other column is
        drawn as a seaborn count plot. Each chart is saved to
        ``<cwd>/Results/Data Counts/<data_set>/<category>``.

        data: 2-D NumPy array, indexed as ``data[:, column]``
        headings: sequence of column names, one per column of ``data``
        data_set: name of the sub-folder the charts are written to

        Raises IndexError when a category is not present in ``headings``.
    """
    chart_categories = ['course_grade', 'Assig_1_full_40', 'Assig_2_full_40', 'Assig_3_full_40', 'proj_1_100', 'proj_2_100', 'proj_3_100', 'final_exam_100', 'peer_feedback_100', 'birth_country', 'residence_country', 'gender', 'age', 'primary_language', 'english_fluency', 'time_zone', 'occupation', 'highest_education', 'expected_hours_spent', 'formal_class_prog_taken', 'C', 'C#', 'C++', 'Java', 'JavaScript', 'Lisp', 'Objective C', 'Perl', 'PHP', 'Python', 'Ruby', 'Shell', 'Swift', 'Visual Basic', 'Other (specify below)', 'years_programming', 'prior_omscs_classes_completed', 'besides_KBAI_how_many_classes', 'moocs_completed_outside_OMSCS', 'qtr_proj1_confidence', 'qtr_proj2_confidence', 'qtr_piazza_opinion', 'qtr_peerfeedback_opinion', 'qtr_on_piazza', 'qtr_email', 'qtr_hipchat', 'qrt_gplus', 'qtr_other_chat', 'qtr_phone', 'qtr_facebook', 'qtr_in_person', 'CS6210_Completed', 'CS8803_Completed', 'CS6250_Completed', 'CS7641_Completed', 'CS6300_Completed', 'CS6310_Completed', 'CS4495_Completed', 'CS6475_Completed', 'CS6505_Completed', 'CS6290_Completed', 'CS8803_Completed', 'CS6440_Completed', 'mid_proj2_confidence', 'mid_proj3_confidence', 'mid_piazza_opinion', 'mid_peerfeedback_opinion', 'mid_on_piazza', 'mid_email', 'mid_hipchat', 'qrt_gplus', 'mid_other_chat', 'mid_phone', 'mid_facebook', 'mid_in_person', 'final_proj3_confidence', 'hours_spent', 'lessons_watched', 'exercises_completed', 'forum_visit_frequency', 'final_on_piazza', 'final_email', 'final_hipchat', 'qrt_gplus', 'final_other_chat', 'final_phone', 'final_facebook', 'final_in_person', 'watch_out_order', 'fall_behind', 'get_ahead', 'rewatch_full_lesson', 'rewatch_partial_lesson', 'view_answer_after_1incorrect', 'repeat_exercise_until_correct', 'skip_exercise', 'correct_first_attempt', 'access_from_mobile', 'download_videos', 'piazza_answers', 'piazza_days', 'piazza_asks', 'piazza_posts', 'piazza_views', 'total_lecture_time', 'overal_lecture_views', 'lecture_1_views', 'lecture_2_views', 'lecture_3_views', 
'lecture_4_views', 'lecture_5_views', 'lecture_6_views', 'lecture_7_views', 'lecture_8_views', 'lecture_9_views', 'lecture_10_views', 'lecture_11_views', 'lecture_12_views', 'lecture_13_views', 'lecture_14_views', 'lecture_15_views', 'lecture_16_views', 'lecture_17_views', 'lecture_18_views', 'lecture_19_views', 'lecture_20_views', 'lecture_21_views', 'lecture_22_views', 'lecture_23_views', 'lecture_24_views', 'lecture_25_views', 'lecture_26_views', 'lecture_1_pace', 'lecture_2_pace', 'lecture_3_pace', 'lecture_4_pace', 'lecture_5_pace', 'lecture_6_pace', 'lecture_7_pace', 'lecture_8_pace', 'lecture_9_pace', 'lecture_10_pace', 'lecture_11_pace', 'lecture_12_pace', 'lecture_13_pace', 'lecture_14_pace', 'lecture_15_pace', 'lecture_16_pace', 'lecture_17_pace', 'lecture_18_pace', 'lecture_19_pace', 'lecture_20_pace', 'lecture_21_pace', 'lecture_22_pace', 'lecture_23_pace', 'lecture_24_pace', 'lecture_25_pace', 'lecture_26_pace', 'overall_pace']
    #chart_categories = ["Age"]

    # Create the output folder for this data set if it does not exist yet.
    output_dir = os.path.join(os.getcwd(), "Results", "Data Counts", data_set)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # An array is needed so that `== chart_category` compares element-wise;
    # with a plain list the comparison is a single False and nothing matches.
    heading_names = np.asarray(headings)

    # DataFrame view of the same data, used by the count-plot fallback.
    pandas_data = pd.DataFrame(data=data, columns=heading_names)

    for chart_category in chart_categories:

        # Slice out the column whose heading matches this category.
        index = np.argwhere(heading_names == chart_category)
        chart_column = data[:, index[0][0]]

        plt.figure()
        plt.xlabel(chart_category)
        plt.ylabel("Count")
        plt.title("%s Count" % chart_category)

        try:
            # Numeric columns become a 10-bin histogram. The builtin `float`
            # replaces the removed `np.float` alias, and the removed `normed`
            # keyword is dropped (False was the default anyway).
            numeric_column = chart_column.astype(float)
            plt.hist(numeric_column, 10, histtype='bar', rwidth=0.8)
        except (ValueError, TypeError):
            # Non-numeric (or non-finite) columns are shown as per-value
            # counts instead, drawn onto the figure opened above.
            sns.countplot(x=chart_category, data=pandas_data, palette="Greens_d")

        plt.savefig(os.path.join(output_dir, chart_category))
        plt.close()
def segments_countplot(data_sf, x=None, y=None, hue=None, 
                       order=None, hue_order=None, figsize_tuple= None, title=None,
                       seaborn_style='whitegrid', seaborn_palette='deep', color='b', 
                       **kwargs):
    '''Draw a seaborn count plot from an SFrame and show it.

    Parameters
    ----------
    data_sf: SFrame
        Source data; it is converted with ``to_dataframe()`` before plotting.
    x, y, hue: optional
        Forwarded unchanged to ``seaborn.countplot``.
    order, hue_order: optional
        Forwarded unchanged to ``seaborn.countplot``.
    figsize_tuple: optional, default: None
        Passed as ``figsize`` to ``plt.figure``.
    title: optional
        Text handed to ``plt.title``; it is rendered in bold.
    seaborn_style:
        Passed as ``style`` to ``sns.set``.
    seaborn_palette:
        Passed as ``palette`` to ``seaborn.countplot``.
    color:
        Passed as ``color`` to ``seaborn.countplot``.
    kwargs : key, value mappings
        Any further keyword arguments, forwarded to ``seaborn.countplot``.
    '''
    # global seaborn style first, then a fresh figure of the requested size
    sns.set(style=seaborn_style)
    plt.figure(figsize=figsize_tuple)

    # seaborn works on pandas objects, so convert the SFrame
    frame = data_sf.to_dataframe()

    # the orientation is always requested as vertical
    sns.countplot(data=frame,
                  x=x,
                  y=y,
                  hue=hue,
                  order=order,
                  hue_order=hue_order,
                  orient='v',
                  palette=seaborn_palette,
                  color=color,
                  **kwargs)

    # bold title, no left/bottom spines, then display
    title_font = {'fontweight': 'bold'}
    plt.title(title, title_font)
    sns.despine(left=True, bottom=True)
    plt.show()
Example #30
0
def main(argv):
    """Train on the given CSV and optionally predict and plot test outcomes.

    argv must hold either 2 entries (program name, training CSV) or 4 entries
    (program name, training CSV, testing CSV, output file). Any other length
    prints the usage line and exits with status 2.
    """
    import sys  # local import: the file-level import block is not visible here

    if len(argv) != 2 and len(argv) != 4:
        print("Usage: {} <training_data.csv> [<testing_data.csv> <output_results>]".format(argv[0]))
        # sys.exit does not depend on the `site` module providing `exit`.
        sys.exit(2)

    animals, outcomes = import_training(argv[1])
    forest = first_pass(animals, outcomes)

    naive(animals, outcomes)

    if len(argv) == 4:
        test_data = import_testing(argv[2])
        result = output(forest, test_data, argv[3])
        # test_data['OutcomeType'] = pd.Series(result, index=test_data.index)
        result = pd.DataFrame({"result": result})
        # join() aligns on the index.
        # NOTE(review): assumes test_data has a default 0..n-1 index -- confirm.
        test_data = test_data.join(result)
        test_data["SexuponOutcome"] = test_data.SexuponOutcome.apply(cats.num_to_value)

        print(test_data)

        # Pass the vector by keyword: current seaborn treats the first
        # positional argument as `data`, not as `x`.
        sb.countplot(x=test_data.Hour, hue=test_data.result,
            hue_order=["Return_to_owner", "Euthanasia", "Adoption", "Transfer", "Died"])
        plt.show()
iris = pd.read_csv('../Datasets/Iris.csv')

# List the columns available in the data set
print(iris.columns)

# Pairwise relationships between the columns
sns.pairplot(iris)

# Figures only appear once plt.show() is called
plt.show()

# Distinct class labels
print(iris['Species'].unique())

# Number of rows per class. The vector is passed by keyword because current
# seaborn treats the first positional argument as `data`, not as `x`.
sns.countplot(x=iris['Species'])
plt.show()

# Split the rows by class with plain boolean masks (the original lambdas
# ignored their argument and produced exactly these masks).
iris_versicolor = iris.loc[iris['Species'] == 'Iris-versicolor', :]
iris_setosa = iris.loc[iris['Species'] == 'Iris-setosa', :]
iris_virginica = iris.loc[iris['Species'] == 'Iris-virginica', :]

# Sepal-length density of each class, overlaid on one set of axes
sns.distplot(iris_versicolor['SepalLengthCm'], hist=False, color='blue')
sns.distplot(iris_setosa['SepalLengthCm'], hist=False, color='green')
sns.distplot(iris_virginica['SepalLengthCm'], hist=False, color='red')
plt.show()

sns.lmplot(x='SepalLengthCm',
           y='SepalWidthCm',
Example #32
0
import seaborn as sns
import matplotlib.pyplot as plt
# Load seaborn's bundled "tips" example data set.
tip_data = sns.load_dataset('tips')

# Alternative plots that were tried on the same data (kept disabled):
#sns.scatterplot(x="tip", y="total_bill", data=tip_data, hue='size', size='size', sizes=(70, 280), style='size', palette='Set3', color=".5", marker="*")
#sns.lineplot(x="tip", y="total_bill", data=tip_data, hue="size", markers=True, dashes=False)
#sns.catplot(x='tip', y='total_bill', data=tip_data, row='size', kind='smoker')
#sns.barplot(x='tip', y='total_bill', data=tip_data, hue='size', ci=68)

# Three colours from the "dark" palette, used as the bar outlines.
bar_edge_colors = sns.color_palette("dark", 3)

# One bar per distinct tip value.
sns.countplot(data=tip_data,
              x='tip',
              palette='Set3',
              saturation=1.4,
              edgecolor=bar_edge_colors,
              linewidth=5)
plt.show()
# -*- coding: utf-8 -*-
"""first_lesson_home_work_3.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/17z2eYAMXU5svbEpFP3vaT2WkXSleRlXl
"""

import pandas as pd
import seaborn as sns
# Load the data set, using the first CSV column as the index.
pokemon = pd.read_csv("Pokemon.csv", index_col=0)
pokemon.head()

# Rows per generation. The vector is passed by keyword because current
# seaborn treats the first positional argument as `data`, not as `x`.
sns.countplot(x=pokemon['Generation'])

# Distribution of hit points
sns.distplot(pokemon['HP'])

# Attack against defense, as a scatter plot and as hexagonal bins
sns.jointplot(x='Attack', y='Defense', data=pokemon)

sns.jointplot(x='Attack', y='Defense', data=pokemon, kind='hex', gridsize=20)

# Joint density of HP and Attack.
# NOTE(review): two positional vectors are only accepted by older seaborn
# releases -- confirm against the installed version before changing.
sns.kdeplot(pokemon['HP'], pokemon['Attack'])

# Attack split by the Legendary flag
sns.boxplot(x='Legendary', y='Attack', data=pokemon)

sns.violinplot(x='Legendary', y='Attack', data=pokemon)
Example #34
0
# print(books.loc[books.publisher.isnull(), :])
# Fill in a placeholder publisher for two specific ISBNs.
books.loc[(books.ISBN == '193169656X'), 'publisher'] = 'other'
books.loc[(books.ISBN == '1931696993'), 'publisher'] = 'other'
# User data set
# print(users.shape)
# print(users.head())
# print(users.dtypes)
# print(users.userID.values)
# Ages above 90 or below 5 are treated as missing. `np.nan` is the canonical
# spelling; the `np.NAN` alias no longer exists in NumPy 2.0.
users.loc[(users.Age > 90) | (users.Age < 5), 'Age'] = np.nan
# Missing ages are replaced by the rounded mean of the remaining ages.
users.Age = users.Age.fillna(round(users.Age.mean()))
users.Age = users.Age.astype(np.int32)
# print(sorted(users.Age.unique()))
# print(ratings.shape)
n_users = users.shape[0]
n_books = books.shape[0]
# print(n_users*n_books)
# print(ratings.head())
# Keep only ratings whose book and user both exist in the cleaned tables.
ratings_new = ratings[ratings.ISBN.isin(books.ISBN)]
ratings_new = ratings_new[ratings_new.userID.isin(users.userID)]
# print("number of users: " + str(n_users))
# print("number of books: " + str(n_books))
# Share of the user x book matrix that has no rating at all.
sparsity = 1.0 - len(ratings_new) / float(n_users * n_books)
print('图书交叉数据集的稀疏级别是 ' + str(sparsity * 100) + ' %')
# A rating of 0 is split off as "implicit"; every other value is "explicit".
ratings_explicit = ratings_new[ratings_new.bookRating != 0]
ratings_implicit = ratings_new[ratings_new.bookRating == 0]
# print(ratings_new.shape)
#print(ratings_explicit.shape)
# print(ratings_implicit.shape)
sns.countplot(data=ratings_explicit, x='bookRating')
plt.show()
                     columns="Embarked",
                     values="Ticket",
                     aggfunc="count")
ax = tbl.T.plot(kind='bar')

# Ticket counts per Survived value and Deck, drawn as grouped bars
tbl = pd.pivot_table(train_df,
                     values="Ticket",
                     index="Survived",
                     columns="Deck",
                     aggfunc="count")
ax = tbl.T.plot(kind='bar')  # unknown deck for vast majority of people?

# One 2x3 grid of survival count plots, one panel per hue column
fig, axs = plt.subplots(2, 3, figsize=(10, 8))
panels = axs.flatten()
hue_columns = ["Pclass", "Sex", "Deck", "Parch", "SibSp", "Embarked"]
for position, (panel, hue_column) in enumerate(zip(panels, hue_columns)):
    sns.countplot(data=train_df, y="Survived", hue=hue_column, ax=panel)
    panel.legend(title=hue_column, loc=4)
    # Only the first panel of each row (positions 0 and 3) keeps its y label.
    if position % 3 != 0:
        panel.set_ylabel('')
        data.append((category, os.path.join(data_dir, category, file)))

# One row per image: its class name and its file path
df = pd.DataFrame(data, columns=['class', 'file_path'])
len_df = len(df)
print(f"There are {len_df} images")

# Images per class, computed once and reused for the printout and Figure 1
class_counts = df['class'].value_counts()
print(class_counts)

# Figure 1: pandas bar chart of the class counts
plt.figure()
class_counts.plot(kind='bar')
plt.title('Class counts')

# Figure 2: the same counts as a horizontal seaborn count plot
plt.figure()
_ = sns.countplot(y=df['class'])
plt.title('Class counts')


# From here on data_dir is a pathlib.Path
data_dir = pathlib.Path(data_dir)
jpg_paths = list(data_dir.glob('*/*.jpg'))
image_count = len(jpg_paths)

# Names of the entries directly under data_dir, skipping ".DS_Store"
entry_names = [entry.name for entry in data_dir.glob('*')]
CLASS_NAMES = np.array([name for name in entry_names if name != ".DS_Store"])

# Generator that rescales pixel values by 1/255
image_generator = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1. / 255)
BATCH_SIZE = 32
IMG_HEIGHT = 224
IMG_WIDTH = 224
# Batches needed to cover every image once (rounded up)
STEPS_PER_EPOCH = np.ceil(image_count / BATCH_SIZE)

train_data_gen = image_generator.flow_from_directory(directory=str(data_dir),
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the training data and preview it
train = pd.read_csv('titanic_train.csv')
print(train.head())

# Heat map of the missing-value mask, one row per record
sns.heatmap(train.isnull(), cbar=False, cmap='viridis', yticklabels=False)
plt.show()

print(train.isnull())

# Survival counts: overall, then split by sex, then by passenger class
survival_views = (
    (None, 'RdBu_r'),
    ('Sex', 'RdBu_r'),
    ('Pclass', 'rainbow'),
)
for hue_column, palette_name in survival_views:
    sns.set_style('whitegrid')
    sns.countplot(x='Survived', hue=hue_column, data=train, palette=palette_name)
    plt.show()

# Age distribution with seaborn, ignoring missing ages
known_ages = train['Age'].dropna()
sns.distplot(known_ages, bins=30, color='darkred', kde=False)
plt.show()

# Age distribution again, drawn with pandas' own histogram
train['Age'].hist(alpha=0.7, bins=30, color='darkred')
plt.show()
Example #38
0
# Number of tokens in every tweet
tweet_lengths = [
    len(tokens) for tokens in df_topic_modeling['topic_modeling_text']
]
vocab = sorted(set(all_words))

# Print the corpus size, vocabulary size and longest tweet
print('{} words total, with a vocabulary size of {}'.format(
    len(all_words), len(vocab)))
print('Max tweet length is {}'.format(max(tweet_lengths)))

# Only tweets shorter than 25 tokens are plotted
tweet_lengths = [num for num in tweet_lengths if num < 25]

# Plot the distribution of the word count of tweets. The values are passed by
# keyword because current seaborn treats the first positional argument as
# `data`, not as `x`.
fig1 = plt.figure(figsize=(15, 8))
sns.countplot(x=tweet_lengths)
plt.title('Tweet Length Distribution', fontsize=18)
plt.xlabel('Words per Tweet', fontsize=14)
plt.ylabel('Number of Tweets', fontsize=14)
plt.savefig('tweets_distribution.png')
plt.show()

# Since we have the word count, we can break the tweets data into teams so that we only have to load a small part of data at a time.
# We can further use the cache function of streamlit to ensure that once the data for a particular team is loaded, we don't have to load it again.
teams = list(df_sentiment['team'].unique())

for selected_team in teams:
    temp = df_topic_modeling.loc[df_topic_modeling['team'] == selected_team]
    # The context manager closes (and flushes) each pickle file; the original
    # passed a bare open() handle that was never closed.
    with open(f'{selected_team}.p', 'wb') as team_file:
        pickle.dump(temp, team_file)

del (temp, teams, selected_team)
# combine train and val and divide them again with 8:2 ratio
temp = np.concatenate([train, val], axis=0)
len(temp)
train, val = train_test_split(temp, test_size=0.2, random_state=0)
print('the number of images in training set:', len(train))
print('the number of images in validation set:', len(val))

# compare the number of cases and non-cases
# Each sample is indexed as (image, class index); index 0 is labelled "Pneumonia".
# NOTE(review): assumes class index 0 means pneumonia -- confirm against `labels`.
l = ["Pneumonia" if sample[1] == 0 else "Normal" for sample in train]
sns.set_style('darkgrid')
# Passed by keyword: current seaborn treats the first positional argument as
# `data`, not as `x`.
sns.countplot(x=l)

# visualize the first and the last training image with their label names
plt.figure(figsize=(5, 5))
plt.imshow(train[0][0], cmap='gray')
plt.title(labels[train[0][1]])

plt.figure(figsize=(5, 5))
plt.imshow(train[-1][0], cmap='gray')
plt.title(labels[train[-1][1]])

# separate features and labels
x_train = []
y_train = []

x_val = []
Example #40
0
                         'traffic_volume_corr.png'),
            format='png')
plt.close()

#%% plot histogram
figure = plt.figure()
# NOTE(review): DataFrame.hist() without `ax` appears to draw on a figure of
# its own, in which case `figure` stays empty and the histogram figure is not
# the one closed below -- confirm before relying on this cleanup.
rawData.hist()
plt.tight_layout()
plt.savefig(os.path.join(cfg.default.traffic_figures,
                         'traffic_volume_hist.png'),
            format='png')
plt.close(figure)

#%% count plots of categorical
# Keep a handle on every new figure so that the figure closed below is the
# one that was just saved; the original closed the stale histogram handle and
# left both count-plot figures open.
figure = plt.figure()
sns.countplot(y='weather_main', data=rawData)
plt.tight_layout()
plt.savefig(os.path.join(cfg.default.traffic_figures,
                         'weather_main_count.png'),
            format='png')
plt.close(figure)

figure = plt.figure()
sns.countplot(y='weather_description', data=rawData)
plt.tight_layout()
plt.savefig(os.path.join(cfg.default.traffic_figures,
                         'weather_description_count.png'),
            format='png')
plt.close(figure)

plt.figure()


# Drop 'label' column
X_train = train.drop(labels = ["label"],axis = 1) 

# free some space
del train 

# Samples per label. Passed by keyword because current seaborn treats the
# first positional argument as `data`, not as `x`.
g = sns.countplot(x=Y_train)

Y_train.value_counts()

# Check the data: summarise which columns contain any missing value
X_train.isnull().any().describe()
test.isnull().any().describe()

# Normalize the data by dividing every value by 255
X_train = X_train / 255.0

test = test / 255.0
# Reshape image in 3 dimensions (height = 28px, width = 28px , canal = 1)
Example #42
0
def general():
    """Handle the general product-search page.

    GET renders 'general_search.html' as is. POST reads 'search_query' from
    the submitted form, scrapes amazon.in search, product and review pages for
    it, builds a review DataFrame, saves several charts and word clouds under
    './static/', fits a TF-IDF + logistic-regression model on the review
    text, and renders 'general_search.html' with the first review rows and
    the 30 most negative / most positive model features.
    """

    if flask.request.method == 'GET':
        return (flask.render_template('general_search.html'))

    if flask.request.method == 'POST':

        reviews = []
        search_query = ''

        # NOTE(review): uses the bare name `request` while the rest of the
        # function uses `flask.request` -- confirm `request` is imported.
        search_query = request.form.get("search_query")
        # spaces become '+' so the query can be appended to the search URL
        search_query = search_query.replace(' ', '+')

        base_url = "https://www.amazon.in/s?k="

        url = base_url + search_query

        # headers sent with every request made below
        header = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
            'referer':
            'https://www.amazon.in/s?k=nike+shoes+men&crid=28WRS5SFLWWZ6&sprefix=nike%2Caps%2C357&ref=nb_sb_ss_organic-diversity_2_4'
        }

        # this response is not used again below; the bare attribute access
        # on the next line has no effect
        search_response = requests.get(url, headers=header)
        search_response.status_code

        #function to get the content of the page of required query
        #orig search page with all products

        cookie = {}  # insert request cookies within{}

        # Each of the three helpers below returns the response object on
        # HTTP 200 and the string "Error" otherwise.
        # NOTE(review): callers read `.content` on the result without checking
        # for "Error", which a str does not have.
        def getAmazonSearch(search_query):
            url = "https://www.amazon.in/s?k=" + search_query
            #print(url)
            page = requests.get(url, headers=header)
            if page.status_code == 200:
                return page
            else:
                return "Error"

        #function to get the contents of individual product pages using 'data-asin' number (unique identification number)
        #individual product page

        def Searchasin(asin):
            url = "https://www.amazon.in/dp/" + asin
            #print(url)
            page = requests.get(url, cookies=cookie, headers=header)
            if page.status_code == 200:
                return page
            else:
                return "Error"

        #function to pass on the link of 'see all reviews' and extract the content
        #review page

        def Searchreviews(review_link):
            url = "https://www.amazon.in" + review_link
            #print(url)
            page = requests.get(url, cookies=cookie, headers=header)
            if page.status_code == 200:
                return page
            else:
                return "Error"

        #EXTRACT ASIN
        # collect the 'data-asin' attribute of every matching result <div>
        data_asin = []
        response = getAmazonSearch(search_query)
        soup = BeautifulSoup(response.content)
        for i in soup.findAll(
                "div",
            {
                'class': [
                    "sg-col-4-of-12 s-result-item s-asin sg-col-4-of-16 sg-col sg-col-4-of-20",
                    "s-result-item s-asin sg-col-0-of-12 sg-col-16-of-20 sg-col sg-col-12-of-16"
                ]
            }):
            data_asin.append(i['data-asin'])

        #extract price & avg rating
        # one product-page request per ASIN; price and average_rating get
        # exactly one entry per ASIN, with 0 meaning "not found on the page"
        price = []
        average_rating = []
        for i in range(len(data_asin)):
            response = Searchasin(data_asin[i])

            soup = BeautifulSoup(response.content)

            h1 = 0
            for l in soup.findAll("span", {'data-hook': "rating-out-of-text"}):
                # keep the text before the first space and parse it as float
                h1 = l.text
                h1 = h1[:h1.index(' ')]
                h1 = float(h1)
                #average_rating.append(l.text)
            average_rating.append(h1)

            h2 = 0
            for l in soup.findAll(
                    "span",
                {'id': ["priceblock_ourprice", "priceblock_dealprice"]}):
                # remove commas, drop the first character and everything from
                # the first '.' onwards, then parse the remainder as float
                h2 = l.text
                h2 = h2.replace(',', '')
                h2 = h2[1:h2.index('.')]
                h2 = float(h2)
                #price.append("₹"+l.text[2:])
            price.append(h2)

        #EXTRACT PRODUCT NAME
        # the search page is fetched a second time to read the product titles

        product_name = []
        response = getAmazonSearch(search_query)
        soup = BeautifulSoup(response.content)
        for i in soup.findAll(
                "span", {
                    'class': [
                        "a-size-medium a-color-base a-text-normal",
                        "a-size-base-plus a-color-base a-text-normal"
                    ]
                }):
            product_name.append(i.text)

        #make length of all equal
        # keep only products for which every field was found
        # NOTE(review): product_name is indexed with positions of data_asin,
        # which assumes both lists have the same length and order -- confirm.
        data_asin1 = []
        product_name1 = []
        price1 = []
        average_rating1 = []

        for i in range(len(data_asin)):
            if (data_asin[i] == '0' or product_name[i] == '0' or price[i] == 0
                    or average_rating[i] == 0):
                # placeholder statement: incomplete products are skipped
                o = 0
            else:
                data_asin1.append(data_asin[i])
                product_name1.append(product_name[i])
                price1.append(price[i])
                average_rating1.append(average_rating[i])

        #EXTRACT SEE ALL REVIEW LINK
        # every remaining product page is requested again to find its
        # "see all reviews" link

        link = []
        data_asin2 = []
        product_name2 = []
        price2 = []
        average_rating2 = []

        for i in range(len(data_asin1)):
            response = Searchasin(data_asin1[i])
            soup = BeautifulSoup(response.content)

            for l in soup.findAll("a",
                                  {'data-hook': "see-all-reviews-link-foot"}):
                if (
                        l['href']
                ):  #choose only those products whose see all reviews option is available
                    link.append(l['href'])
                    data_asin2.append(data_asin1[i])
                    product_name2.append(product_name1[i])
                    price2.append(price1[i])
                    average_rating2.append(average_rating1[i])

        # order-preserving de-duplication of the review links (the list
        # comprehension is used only for its append side effect)
        link1 = []
        [link1.append(x) for x in link if x not in link1]

        # de-duplicate the per-product lists by ASIN: an entry is kept only
        # when the same ASIN does not occur again later in the list
        data_asin3 = []
        product_name3 = []
        price3 = []
        average_rating3 = []
        k = 1

        for i in range(len(data_asin2)):
            c = data_asin2[i]
            k = 1
            for j in range(i + 1, len(data_asin2)):
                if (c == data_asin2[j]):
                    k = 0
                    break

            if (k == 1):
                data_asin3.append(data_asin2[i])
                product_name3.append(product_name2[i])
                price3.append(price2[i])
                average_rating3.append(average_rating2[i])

        reviews = []
        brand_name = []
        brand_name1 = []
        #product_name=[]
        product_name4 = []
        rating = []
        review_length = [0]
        length1 = 0
        price4 = []
        average_rating4 = []

        # review pages 1 and 2 are fetched for every unique review link
        # NOTE(review): price3 / product_name3 / average_rating3 are indexed
        # with positions of link1, which assumes link1 lines up with the
        # de-duplicated product lists -- confirm.
        for j in range(len(link1)):
            #for j in range(6):
            for k in range(1, 3):
                response = Searchreviews(link1[j] + '&pageNumber=' + str(k))
                soup = BeautifulSoup(response.content)

                #for i in soup.findAll("a",{'class':"a-size-base a-link-normal"}):
                #brand_name.append(i.text)

                #for i in soup.findAll("a",{'data-hook':"product-link"}):
                #product_name.append(i.text)

                for i in soup.findAll("span", {'data-hook': "review-body"}):
                    reviews.append(i.text)
                    price4.append(price3[j])
                    product_name4.append(product_name3[j])
                    average_rating4.append(average_rating3[j])

                    # brand = part of the link between its first character
                    # and the first '-'
                    pos = link1[j].index('-')
                    brand_name1.append(link1[j][1:pos])

                    #pos1 = link[j].index('/',1)
                    #product_name1.append(link[j][1:pos1])

                review_length.append(len(reviews))

                #for h in range(review_length[j+1]-review_length[j]):
                #product_name1.append(product_name[j])
                #brand_name1.append(brand_name[j])

                for i in soup.findAll("i", {
                        'data-hook':
                    ["review-star-rating", "cmps-review-star-rating"]
                }):
                    if (i.text):
                        rating.append(i.text)

        # numeric star rating = first character of each rating text;
        # average_rating5 is average_rating4 cut to the number of ratings
        rating1 = []
        average_rating5 = []
        for i in range(len(rating)):
            rating1.append(int(rating[i][0]))
            average_rating5.append(average_rating4[i])

        # NOTE(review): the lists are filled by separate loops, so their
        # lengths may differ; presumably DataFrame construction fails in that
        # case -- confirm.
        rev = {
            'Brand': brand_name1,
            'Product': product_name4,
            'Price': price4,
            'Average Rating': average_rating5,
            'Reviews': reviews,
            'Review Rating': rating1
        }

        review_data = pd.DataFrame.from_dict(rev)
        pd.set_option('max_colwidth', 800)

        review_data['Brand'] = review_data['Brand'].str.upper()
        review_data['Product'] = review_data['Product'].str.upper()

        # ratings above 2 count as positive, everything else as negative
        def sentiment(n):
            if n > 2:
                #return 1
                return 'Positive'
            else:
                #return 0
                return 'Negative'
            #return 1 if n >= 3 return 0 elif n==3 else 0

        review_data['Sentiment'] = review_data['Review Rating'].apply(
            sentiment)

        # leading rows (DataFrame.head() default) shown on the result page
        reviews = review_data.head()

        #Most common rating in reviews.

        plt.figure(figsize=(11, 6))
        sns.countplot(review_data['Review Rating'])
        #review_data['rating'].value_counts().sort_index().plot(kind='bar')
        plt.title('Distribution of Rating')
        plt.xlabel('Rating')
        plt.ylabel('Number of Reviews')
        #plt.savefig(/usr/src/app/\\templates\\rating_distribution.png')
        plt.savefig('./static/rating_distribution.png')

        # With more than two distinct brands the charts are grouped by brand,
        # otherwise by product; both variants write to the same file names.
        if (review_data.Brand.nunique() > 2):

            #Most reviewed brand
            plt.figure(figsize=(11, 6))
            sns.countplot(
                y="Brand",
                data=review_data,
                order=review_data['Brand'].value_counts().iloc[:10].index,
                palette="Wistia_r")
            plt.title('Distribution of Brands')
            plt.ylabel('Brands')
            plt.xlabel('Number of Reviews')
            plt.savefig('./static/brand_distribution.png')
            plt.close()

        else:

            #Most reviewed product
            plt.figure(figsize=(11, 6))
            sns.countplot(
                y="Product",
                data=review_data,
                order=review_data['Product'].value_counts().iloc[:10].index,
                palette="Wistia_r")
            plt.title('Distribution of Brands')
            plt.ylabel('Product Name')
            plt.xlabel('Number of Reviews')
            plt.savefig('./static/brand_distribution.png')
            plt.close()

        #Price Distribution
        plt.figure(figsize=(11, 6))
        plt.title('Price Distribution')
        x = review_data['Price']
        sns.distplot(x, bins='auto', kde=False, color='g')
        plt.ylabel('No of Products')
        plt.xlabel('Price (₹)')
        plt.savefig('./static/price_distribution.png')

        if (review_data.Brand.nunique() > 2):

            #Highest avg_rating
            # all rows sorted by 'Average Rating' (descending); a plain barh
            # and a seaborn barplot are drawn on the same axes
            plt.figure(figsize=(11, 6))
            x = review_data.nlargest(len(review_data), ['Average Rating'])
            plt.barh(x['Brand'], x['Average Rating'], color='navajowhite')
            sns.barplot(y="Brand",
                        x="Average Rating",
                        data=review_data,
                        palette="cool_r")
            #plt.xticks(rotation=90)
            plt.ylabel('Brand Name')
            plt.xlabel('Rating')
            plt.title('Average Ratings of Brands')
            plt.savefig('./static/avgrating_brands.png')

        else:

            #Highest avg_rating
            plt.figure(figsize=(11, 6))
            x = review_data.nlargest(len(review_data), ['Average Rating'])
            plt.barh(x['Product'], x['Average Rating'], color='navajowhite')
            sns.barplot(y="Product",
                        x="Average Rating",
                        data=review_data,
                        palette="cool_r")
            #plt.xticks(rotation=90)
            plt.ylabel('Product Name')
            plt.xlabel('Rating')
            plt.title('Average Ratings of Products')
            plt.savefig('./static/avgrating_brands.png')

        if (review_data.Brand.nunique() > 2):

            #Sentiment
            plt.figure(figsize=(11, 6))
            sns.countplot(
                y="Brand",
                data=review_data,
                hue="Sentiment",
                order=review_data['Brand'].value_counts().iloc[:10].index,
                palette="Reds_r")
            plt.title('Sentiments of Brands')
            plt.ylabel('Brands')
            plt.xlabel('Number of Reviews')
            plt.savefig('./static/sentiment.png')

        else:

            #Sentiment
            plt.figure(figsize=(11, 6))
            sns.countplot(
                y="Product",
                data=review_data,
                hue="Sentiment",
                order=review_data['Product'].value_counts().iloc[:10].index,
                palette="Reds_r")
            plt.title('Sentiments of Brands')
            plt.ylabel('Product')
            plt.xlabel('Number of Reviews')
            plt.savefig('./static/sentiment.png')

        # Word cloud over all review text.
        # NOTE(review): the lower-cased text is stored in a new 'Review'
        # column, but the split on the next line reads the original 'Reviews'
        # column -- confirm which one was intended.
        review_data['Review'] = review_data['Reviews'].str.lower()
        all_reviews = review_data['Reviews'].str.split(' ')
        all_reviews_cleaned = []

        # strip leading/trailing punctuation from every word
        for text in all_reviews:
            text = [x.strip(string.punctuation) for x in text]
            all_reviews_cleaned.append(text)

        text_review = [" ".join(text) for text in all_reviews_cleaned]
        final_text_review = " ".join(text_review)

        wordcloud_spam = WordCloud(
            background_color="white").generate(final_text_review)
        plt.figure(figsize=(11, 6))
        plt.imshow(wordcloud_spam, interpolation='bilinear')
        plt.axis("off")
        plt.title('Most common words appearing in the reviews')
        plt.savefig('./static/word_cloud.png')

        # Text normaliser: keeps letters only, lower-cases, optionally removes
        # English stop words and stems; returns a word list when split_text
        # is True and a space-joined string otherwise.
        def cleanText(raw_text,
                      remove_stopwords=True,
                      stemming=False,
                      split_text=False):

            #text = BeautifulSoup(raw_text, 'lxml').get_text()  #remove html
            letters_only = re.sub("[^a-zA-Z]", " ",
                                  raw_text)  # remove non-character
            words = letters_only.lower().split()  # convert to lower case
            if remove_stopwords:  # remove stopword
                stops = set(stopwords.words("english"))
                words = [w for w in words if not w in stops]

            if stemming == True:  # stemming
                stemmer = PorterStemmer()
                #stemmer = SnowballStemmer('english')
                words = [stemmer.stem(w) for w in words]

            if split_text == True:  # split text
                return (words)

            return (" ".join(words))

        # 80/20 split of review text and sentiment label, fixed random seed
        X_train, X_test, y_train, y_test = train_test_split(
            review_data['Reviews'],
            review_data['Sentiment'],
            test_size=0.2,
            random_state=0)

        # Preprocess text data in training set and validation set
        # NOTE(review): the cleaned lists are built here, but the vectorizer
        # below is fitted on the raw X_train / X_test -- confirm intent.
        X_train_cleaned = []
        X_test_cleaned = []

        for d in X_train:
            X_train_cleaned.append(cleanText(d))

        for d in X_test:
            X_test_cleaned.append(cleanText(d))

        # TF-IDF features + logistic regression on the sentiment label
        tfid = TfidfVectorizer()
        tf_xtr = tfid.fit_transform(X_train)
        tf_xte = tfid.transform(X_test)
        model_tf = LogisticRegression()
        model_tf.fit(tf_xtr, y_train)
        feature_names = np.array(tfid.get_feature_names())
        # feature indices ordered from smallest to largest coefficient
        sorted_coef_index = model_tf.coef_[0].argsort()

        # 30 features with the smallest coefficients, and 30 with the largest
        # (largest first)
        negative = feature_names[sorted_coef_index[:30]].tolist()
        positive = feature_names[sorted_coef_index[:-31:-1]].tolist()

        #text_review = [" ".join(text) for text in negative]
        final_text_review = " ".join(positive)

        wordcloud_spam = WordCloud(
            max_font_size=50,
            background_color="white").generate(final_text_review)
        plt.figure(figsize=(11, 6))
        plt.imshow(wordcloud_spam, interpolation='bilinear')
        plt.axis("off")
        plt.title('Most common words appearing in positive reviews')
        plt.savefig('./static/positive.png')

        final_text_review = " ".join(negative)

        wordcloud_spam = WordCloud(
            max_font_size=50,
            background_color="white").generate(final_text_review)
        plt.figure(figsize=(11, 6))
        plt.imshow(wordcloud_spam, interpolation='bilinear')
        plt.axis("off")
        plt.title('Most common words appearing in negative reviews')
        plt.savefig('./static/negative.png')

        #reviews = data.to_dict()
        # rows of the preview frame as plain lists for the template
        reviews = reviews.values.tolist()

        return flask.render_template('general_search.html',
                                     search_query=search_query,
                                     reviews=reviews,
                                     length=len(reviews),
                                     negative=negative,
                                     positive=positive)
Example #43
0
# Quick look at the raw table
wbcd.head()

wbcd.shape

wbcd.describe()

# The 'id' column is dropped before any modelling
wbcd = wbcd.drop('id', axis=1)

wbcd.head()

# Class balance of the target column
wbcd['diagnosis'].value_counts()

# Missing-value mask as a heat map
sns.heatmap(wbcd.isnull(), cmap='viridis', cbar=False, yticklabels=False)

# Rows per diagnosis. Passed by keyword because current seaborn treats the
# first positional argument as `data`, not as `x`.
sns.countplot(x=wbcd['diagnosis'])

from imblearn.over_sampling import SMOTE

# Features: columns at positions 1..29.
# NOTE(review): the stop index 30 is exclusive, so the column at position 30
# (if the table has one) is left out -- confirm this is intended.
x = wbcd.iloc[:, 1:30]

x.head()

# Target: the first column
y = wbcd.iloc[:, 0]

y.head()

x.columns

# Hold out 20% of the rows for testing (no fixed random seed)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
def main():
    """Train the VGG16-based COVID classifier on an under-sampled data set.

    Loads the train and test images with ``get_data``, shows a histogram of
    one training image, balances both splits with ``RandomUnderSampler``,
    plots the label distribution before and after balancing, and hands the
    balanced arrays to ``pretrainedNetwork`` together with a VGG16 base.

    Returns:
        int: always ``1``.
    """
    map_characters1 = {0: 'No COVID', 1: 'Yes COVID'}
    print(map_characters1)

    weight_path1 = './drive/My Drive/Colab Notebooks/weights/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5'

    train_dir = "./drive/My Drive/Colab Notebooks/covid_dataset/train/"
    test_dir = "./drive/My Drive/Colab Notebooks/covid_dataset/test/"

    with tf.device('/device:GPU:0'):
        X_train, y_train = get_data(train_dir)
        X_test, y_test = get_data(test_dir)

        plotHistogram(X_train[1])
        plt.show()

        pretrained_model_1 = VGG16(weights=weight_path1,
                                   include_top=False,
                                   input_shape=(299, 299, 3))

        # `learning_rate` is the current keyword (the old `lr` alias is
        # deprecated) and matches the spelling used elsewhere in this file.
        optimizer2 = keras.optimizers.Adam(learning_rate=0.01, epsilon=0.0001)

        # Deal with imbalanced class sizes below.
        # The sampler needs one row per sample, so flatten every image.
        X_trainFlat = X_train.reshape(X_train.shape[0], -1)
        X_testFlat = X_test.reshape(X_test.shape[0], -1)

        ros = RandomUnderSampler(sampling_strategy='auto')

        # `fit_resample` replaces the removed `fit_sample` alias.
        X_trainRos, Y_trainRos = ros.fit_resample(X_trainFlat, y_train)
        # NOTE(review): the test split is under-sampled as well, so evaluation
        # runs on a balanced subset rather than on the original distribution.
        X_testRos, Y_testRos = ros.fit_resample(X_testFlat, y_test)

        # Encode labels to hot vectors (ex : 1 -> [0, 1])
        Y_trainRosHot = to_categorical(Y_trainRos, num_classes=2)
        Y_testRosHot = to_categorical(Y_testRos, num_classes=2)

        # Label distribution before balancing. Each distribution gets its own
        # figure; without the show() both count plots land on the same axes.
        sns.countplot(x=pd.Series(y_train, name="labels"))
        plt.show()

        # Restore the image layout in a single reshape, reusing the shape that
        # was flattened above instead of a separately maintained image size.
        X_trainRosReshaped = X_trainRos.reshape(
            (len(X_trainRos),) + X_train.shape[1:])
        X_testRosReshaped = X_testRos.reshape(
            (len(X_testRos),) + X_test.shape[1:])

        # Label distribution after balancing.
        sns.countplot(x=pd.Series(Y_trainRos, name="labels"))
        plt.show()

        # Keyword arguments are required by current scikit-learn releases and
        # are accepted by older ones too.
        class_weight2 = class_weight.compute_class_weight(
            'balanced', classes=np.unique(Y_trainRos), y=Y_trainRos)
        print("New Class Weights: ", class_weight2)
        pretrainedNetwork(X_trainRosReshaped, Y_trainRosHot, X_testRosReshaped,
                          Y_testRosHot, pretrained_model_1, weight_path1,
                          class_weight2, 2, 100, optimizer2, map_characters1)

    return 1
Example #45
0
# Axis labels for the Agility panel (its scatter call precedes this section).
ax2.set_xlabel("Sprint Speed")
ax2.set_ylabel("Agility")

# Remaining panels: sprint speed against balance and against dribbling.
for panel, attribute, attribute_label in ((ax3, y_balance, "Balance"),
                                          (ax4, y_dribbling, "Dribbling")):
    panel.scatter(x_SprintSpeed, attribute, s=23)
    panel.set_xlabel("Sprint Speed")
    panel.set_ylabel(attribute_label)

plt.subplots_adjust(top=0.5, right=0.8)

plt.show()

# One count plot per categorical column, each shown as its own figure.
for column in ('Preferred Foot', 'Weak Foot'):
    p = sns.countplot(x=column, data=df)
    plt.show()

# Position has many levels, so its tick labels are turned vertical.
p = sns.countplot(x='Position', data=df)
_ = plt.setp(p.get_xticklabels(), rotation=90)
plt.show()

# Finishing score of the first ten rows of the frame.
top_10 = df.head(10)
p = sns.barplot(x='Name', y='Finishing', data=top_10)
_ = plt.setp(p.get_xticklabels(), rotation=90)
plt.show()

plt.figure(1, figsize=(15, 7))
# Config
os.chdir("/home/jovyan/work")
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
plt.rcParams["figure.figsize"] = (12, 3)

# Preparation
data = pd.read_csv("./data/times_magazine.csv")
print(tabulate(data.head(), headers="keys", tablefmt="psql"))

# Modeling: a change-point model with one Poisson rate before `tau` and
# another one from `tau` onward.
N = len(data.Female)
lam_ = data.Female.mean()
with pm.Model() as model:
    # NOTE(review): pm.Exponential takes a rate, so passing the sample mean
    # gives a prior with mean 1 / lam_. If the prior mean is meant to match
    # the data mean, the rate should be 1.0 / lam_ -- confirm the intent.
    lam_1 = pm.Exponential("lam_1", lam_)
    lam_2 = pm.Exponential("lam_2", lam_)
    tau = pm.DiscreteUniform("tau", lower=1923, upper=1923+N)
    # One index value per observation, starting at 1923.
    idx = np.arange(1923, 1923+N)
    lam = pm.math.switch(tau > idx, lam_1, lam_2)
    female = pm.Poisson("female", lam, observed=data.Female)
    step = pm.Metropolis()
    trace = pm.sample(20000, tune=5000, step=step)

# Plot: both rate posteriors share the left panel, the change point goes on
# the right panel.
fig, ax = plt.subplots(nrows=1, ncols=2)
sns.distplot(trace["lam_1"], label="λ1", ax=ax[0])
sns.distplot(trace["lam_2"], label="λ2", ax=ax[0])
# The labels above are only rendered once a legend is requested.
ax[0].legend()
# Pass the samples as `x=`; recent seaborn releases treat the first
# positional argument as `data`.
sns.countplot(x=trace["tau"], ax=ax[1])
plt.xticks(rotation=90)
plt.tight_layout()
plt.savefig("./results/3-14-times-magazine.png")
    "Date", "Death (1 = Yes)", "Injury (1 = Yes)", "Time", "State",
    "WaterConditions", "Wind", "Visibility", "DayofWeek", "AccidentCause",
    "AccidentEvent", "OperatorGender", "VesselType", "Operation", "Activity",
    "DeceasedGender", "CauseofDeath", "DeceasedPFDWorn", "DeceasedRole",
    "InjuredGender", "InjuryType", "InjuredRole", "TotalDamage"
]]  # creating as smaller dataframe to work catagorical features
# Quick interactive look at the categorical subset: first rows, dimensions
# and column names.
dfcat.head()
dfcat.shape
dfcat.columns

# Frequency plots for the accident event and the accident cause. Categories
# follow the order of value_counts(), i.e. from most to least frequent, and
# the tick labels are rotated and shrunk so long category names stay legible.
for column, axis_label in (("AccidentEvent", "AccidentEvent"),
                           ("AccidentCause", "Accident Cause")):
    sns.set(style='white')
    frequency_order = dfcat[column].value_counts().index
    chart = sns.countplot(x=column, data=dfcat, order=frequency_order)
    chart.set(xlabel=axis_label, ylabel="Count")
    plt.xticks(rotation=90, size=7)

#Day of week
sns.set(style='white')
order = [
    "Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday",
# Cleaning step 1 - TotalCharges: blank strings become NaN, the column is
# cast to float, and rows left without a value are discarded.
total_charges = telecom_df["TotalCharges"].replace(" ", np.nan)
telecom_df["TotalCharges"] = total_charges.astype(float)
has_total = telecom_df["TotalCharges"].notnull()
telecom_df = telecom_df[has_total]
# Renumber the rows; selecting the original columns drops the helper column
# that reset_index() adds.
original_columns = telecom_df.columns
telecom_df = telecom_df.reset_index()[original_columns]

# Cleaning step 2 - MultipleLines: "No phone service" is folded into "No".
phone_service_map = {"No phone service": "No"}
telecom_df["MultipleLines"] = telecom_df["MultipleLines"].replace(
    phone_service_map)
telecom_df["MultipleLines"].unique()

cols_list = [
    "OnlineSecurity", "OnlineBackup", "DeviceProtection", "TechSupport",
    "StreamingTV", "StreamingMovies"
]

# Show the raw value counts of every internet add-on column first ...
for addon in cols_list:
    sns.countplot(data=telecom_df, x=addon)
    plt.show()

# ... then reduce each of them to the two values 'Yes' and 'No'.
internet_service_map = {"No internet service": "No"}
for addon in cols_list:
    telecom_df[addon] = telecom_df[addon].replace(internet_service_map)

# Cleaning step 3 - tenure: inspect the integer values and their histogram
# before the column is binned into categories.
print(telecom_df["tenure"].unique())
plt.hist(telecom_df['tenure'])
plt.show()


# Function to create categorical column for tenure
def tenure_cat(telecom_df):
    if telecom_df["tenure"] <= 12:
        return "tenure-0-12"
    elif (telecom_df["tenure"] > 12) & (telecom_df["tenure"] <= 24):
import pandas as pd
import numpy as np

# IMPORTING DATASET
data = pd.read_csv(
    '/content/gdrive/My Drive/Colab Notebooks/CreditCard_Fraud_Detection/creditcard.csv',
    index_col=False)
# `df` is an alias of `data`, not a copy: the in-place changes below are
# visible through both names.
df = data

# ANALYSIS OF BIAS IN DATASET
import matplotlib.pyplot as plt
import seaborn as sns

print('Distribution of the Classes in the subsample dataset')
print(df['Class'].value_counts() / len(df))
# The column name must be passed as `x=`; recent seaborn releases treat the
# first positional argument as `data`.
sns.countplot(x='Class', data=df)
plt.title('Unequally Distributed Classes', fontsize=14)
plt.show()

# SINCE THE DATASET IS HIGHLY UNBALANCED - APPLY RESAMPLING AND SHUFFLING
# TECHNIQUES AND THEN ANALYSE THE BIAS IN THE RESAMPLED AND RESHUFFLED DATA

# STEP 1 - SCALING THE TIME AND AMOUNT ATTRIBUTES USING ROBUSTSCALER
from sklearn import preprocessing
rob_scalar = preprocessing.RobustScaler()
# The scaler is re-fitted for each column, so the two scalings are independent.
df['Amt_Scaled'] = rob_scalar.fit_transform(df['Amount'].values.reshape(-1, 1))
df['Time_Scaled'] = rob_scalar.fit_transform(df['Time'].values.reshape(-1, 1))
df.drop(['Time', 'Amount'], axis=1, inplace=True)

# STEP 2 - SMALL TWEAK IN DATASET - JUST PIVOTING THE POSITIONS OF TIME AND AMOUNT
amt_scl = df['Amt_Scaled']
time_scl = df['Time_Scaled']
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn import metrics
from scipy.stats import mode
from scipy import stats

insurance = pd.read_csv(
    'C:/Users/Eswar Chowdary/Desktop/Projects/Health insurance-1/Insurance Dataset .csv'
)

# Class balance of the target column.
insurance['result'].value_counts()

# Number of missing values per column.
insurance.isnull().sum()

# Count plot for every column of interest, in a fixed order.
count_columns = (
    'areaservice',
    'certificatenum',
    'hospitalcounty',
    'hospitalid',
    'hospitalname',
    'age',
    'zipcode',
    'gender',
    'culturalgroup',
    'ethnicity',
    'daysspendhospital',
    'admissiontype',
    'homeorselfcare',
    'yeardischarge',
    'ccsdiagnosiscode',
    'ccsdiagnosisdescription',
)
for column_name in count_columns:
    sb.countplot(x=column_name, data=insurance, palette='hls')
Example #51
0
print(relation)
sns.pairplot(X_train)

# Code ends here

# --------------
import seaborn as sns
import matplotlib.pyplot as plt

# Code starts here
# One count plot per categorical feature, split by the target, laid out on a
# 2x2 grid that is filled row by row.
cols = ['children', 'sex', 'region', 'smoker']
fig, axes = plt.subplots(2, 2)
for col, panel in zip(cols, axes.ravel()):
    sns.countplot(x=X_train[col], hue=y_train, ax=panel)

# Code ends here

# --------------
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# parameters for grid search
parameters = {'C': [0.1, 0.5, 1, 5]}

# Code starts here
# Search the regularisation strength C of a logistic regression.
lr = LogisticRegression(random_state=9)
grid = GridSearchCV(estimator=lr, param_grid=parameters)
grid.fit(X=X_train, y=y_train)
Example #52
0
# CABIN
print('Percent of missing "Cabin" records is %.2f%%' %
      ((train_df['Cabin'].isnull().sum() / train_df.shape[0]) * 100))
# Since 77% of the Cabin data is missing, this column is left out of the model.

# EMBARKED
print('Per  cent of missing "Embarked" records is %.2f%%' %
      ((train_df['Embarked'].isnull().sum() / train_df.shape[0]) * 100))
# Only 2 values are missing out of 1200, so the most popular port is imputed.

print(
    'Boarded passengers grouped by port of embarkation (C = Cherbourg, Q = Queenstown, S = Southampton):'
)
print(train_df['Embarked'].value_counts())
# palette='Set2' defines the style
sns.countplot(x='Embarked', data=train_df, palette='Set2')
# plt.show()
# The most popular port is 'S' (Southampton), so that value is imputed into
# the missing entries.

# DATA wrangling
train_data = train_df.copy()
# Fill missing ages with the median age. The result is assigned back to the
# column: `train_data['Age'].fillna(..., inplace=True)` operates on a
# temporary object and does not update the frame under pandas Copy-on-Write.
train_data['Age'] = train_data['Age'].fillna(
    train_df['Age'].median(skipna=True))
# Fill missing ports with the most frequent one.
train_data['Embarked'] = train_data['Embarked'].fillna(
    train_df['Embarked'].value_counts().idxmax())
train_data.drop('Cabin', axis='columns', inplace=True)

# check if all the NULL values are gone
print(train_data.isnull().sum())

# Merging two variables which are possibly multicollinear into one variable ("TravellingAlone")
Example #53
0
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from imblearn.metrics import geometric_mean_score, specificity_score, sensitivity_score

# Glass Type Identification KEEL Dataset (~65%)
gls_df = pd.read_csv(
    'Glass2.csv',
    names=["RI", "Na", "Mg", "Al", "Si", "K", "Ca", "Ba", "Fe", "Class"],
    index_col=False)

# Check for Dataset Imbalance and Plot Class Distribution
gls_total = gls_df["Class"]
# Count each class once and derive both shares from the same result.
gls_counts = gls_total.value_counts()
# gls_neg / gls_pos are fractions in [0, 1] of class 0 and class 1.
gls_neg = gls_counts[0] / len(gls_total)
gls_pos = gls_counts[1] / len(gls_total)
# The column name must be passed as `x=`; recent seaborn releases treat the
# first positional argument as `data`.
sns.countplot(x='Class', data=gls_df)
plt.title('Glass Dataset Class Distribution')
plt.savefig('Glass_Imbalance_Dist.png', dpi=300)
# The values are printed with a "%" suffix, so scale the fractions by 100.
print("Class '0': ", round(gls_neg * 100, 2), "%")
print("Class '1': ", round(gls_pos * 100, 2), "%")
print("Imbalance Ratio: ", round(gls_neg / gls_pos, 2))

# Split dataset into 70/30 train-test ratio
x = ["RI", "Na", "Mg", "Al", "Si", "K", "Ca", "Ba", "Fe"]
y = ["Class"]
x_gls = gls_df[x]
# Selecting with a list keeps y_gls a one-column DataFrame rather than a Series.
y_gls = gls_df[y]
x_train, x_test, y_train, y_test = train_test_split(x_gls,
                                                    y_gls,
                                                    test_size=0.3)
# Each name first holds the CSV location and is then rebound to the loaded frame.
maillists = pd.read_csv(maillists)

migrations = 'https://raw.githubusercontent.com/Nedu/RecommenderSystemTeamJava/master/data/migrations.csv'
migrations = pd.read_csv(migrations)

thoughts = 'https://raw.githubusercontent.com/Nedu/RecommenderSystemTeamJava/master/data/thoughts.csv'
thoughts = pd.read_csv(thoughts)

user_settings = 'https://raw.githubusercontent.com/Nedu/RecommenderSystemTeamJava/master/data/user_settings.csv'
user_settings = pd.read_csv(user_settings)

thoughts.head()    # first 5 rows

thoughts.nunique()  # number of unique values in each column

sns.countplot(x='user_id',data=thoughts)  # how many times each user_id appears

corr=thoughts.corr()
# Plot the correlation matrix itself; calling .corr() on it a second time
# would show correlations between the rows of the correlation matrix.
sns.heatmap(corr,cmap='coolwarm',annot=True)   # positive vs. negative correlation

plt.figure(figsize = (15,10))
# Number of rows per created_at value, split by user_id through hue.
sns.countplot(y='created_at',hue ='user_id',data=thoughts)

plt.figure(figsize = (15,10))
# Same breakdown for updated_at.
sns.countplot(y='updated_at',hue ='user_id',data=thoughts)

contact_settings.head()

extfeeds.head()

extfeeds.shape

# In[5]:


test_data.describe()


# In[6]:


# Start with gender alone as the predictor.
# Both data sets have missing values in the Age column; these rows can
# either be dropped or filled with the mean value.
import seaborn as sns
sns.countplot(data=train_data,x='Sex',hue='Survived')


# In[7]:


# Fill missing ages with the mean age. The result is assigned back to the
# column: `train_data['Age'].fillna(..., inplace=True)` operates on a
# temporary object and does not update the frame under pandas Copy-on-Write.
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].mean())


# In[8]:


train_data.describe(include='all')


# In[9]:
Example #56
0
saveplot_highlow_balstatus(data, 'loan')
# => Among people with a loan, the share of low-balance customers is higher
#    than the share of high-balance customers.
saveplot_highlow_balstatus(data, 'marital')
# => Among divorced people, the share of low-balance customers is higher
#    than the share of high-balance customers.
saveplot_highlow_balstatus(data, 'job')
# => Retirees, management, students and the self-employed show the largest
#    gap between the high-balance and low-balance shares.
saveplot_highlow_balstatus(data, 'default')
# => Among people in default, the share of low-balance customers is higher
#    than the share of high-balance customers.
saveplot_highlow_balstatus(data, 'housing')
# => Among people with housing, the share of low-balance customers is higher
#    than the share of high-balance customers.

# create new column "duration_level"
# The mean is computed once instead of once per row inside the lambda.
mean_duration = data['duration'].mean()
data['duration_level'] = data['duration'].apply(
    lambda x: 'above' if x >= mean_duration else 'below')

# Fix the category order explicitly so every bar can be mapped back to its
# deposit group, and count the rows of each group once.
deposit_levels = data['deposit'].dropna().unique().tolist()
deposit_totals = data['deposit'].value_counts()
ax = sns.countplot(y='deposit',
                   hue='duration_level',
                   data=data,
                   order=deposit_levels)
# show percentage on countplot
# Bars are not guaranteed to be stored group by group (seaborn draws them one
# hue level at a time), so the deposit group is derived from the vertical
# centre of each bar: category k is drawn around y == k.
for p in ax.patches:
    if not p.get_width() > 0:
        # Skip empty placeholder patches; they carry no count to annotate.
        continue
    level_idx = int(round(p.get_y() + p.get_height() / 2))
    total = deposit_totals[deposit_levels[level_idx]]
    percentage = '{:.1f}%'.format(100 * p.get_width() / total)
    x = p.get_x() + p.get_width() + 0.02
    y = p.get_y() + p.get_height() / 2
    ax.annotate(percentage, (x, y))
plt.title('The distribubtion of deposit by duration status')
plt.savefig('deposit_count_bydurationstatus.png', dpi=300)
# => Among people who opened a term deposit, 56.2% had a call duration above
#    the average;
Example #57
0
# In[7]:

# If the season column is missing then the entry is most likely a movie;
# add another column to the dataframe that records this.
# The null mask is mapped in one vectorised step instead of a row-by-row
# apply(axis=1); this also yields a proper (empty) column for an empty frame.
df['show_type'] = df['season'].isnull().map({True: 'Movie', False: 'TV Show'})
df

# In[ ]:

# In[8]:

plt.figure(figsize=(12, 8))
sns.countplot(x='show_type', data=df)

# ### Moro watches significantly more TV shows than movies.

# In[9]:

# Ten most frequent show names, most watched first.
plt.figure(figsize=(18, 10))
sns.countplot(y='show_name',
              data=df,
              order=df['show_name'].value_counts().iloc[:10].index)

# ### I thought 'Friends' would be taking the lead, but look at that... Moro must really love Fresh Prince ;)

# In[10]:

plt.figure(figsize=(12, 8))
Example #58
0
             ax=axes[0]).set_title('Employee Satisfaction Distribution')
axes[0].set_ylabel('Employee Count')

# Histograms (no KDE) for the evaluation score and the average monthly
# hours, one per remaining panel, each with the same y-axis label.
histogram_specs = (
    (1, 'evaluation', "r", 'Employee Evaluation Distribution'),
    (2, 'averageMonthlyHours', "b",
     'Employee Average Monthly Hours Distribution'),
)
for panel_idx, attribute, shade, heading in histogram_specs:
    sns.distplot(getattr(df, attribute), kde=False, color=shade,
                 ax=axes[panel_idx]).set_title(heading)
    axes[panel_idx].set_ylabel('Employee Count')

# Salary level against turnover, on a fresh wide figure.
f, ax = plt.subplots(figsize=(15, 4))
salary_plot = sns.countplot(y="salary", hue='turnover', data=df)
salary_plot.set_title('Employee Salary Turnover Distribution')

# Head count per department, using a hand-picked colour list.
color_types = [
    '#78C850', '#F08030', '#6890F0', '#A8B820', '#A8A878', '#A040A0',
    '#F8D030', '#E0C068', '#EE99AC', '#C03028', '#F85888', '#B8A038',
    '#705898', '#98D8D8', '#7038F8'
]
f, ax = plt.subplots(figsize=(15, 4))

department_plot = sns.countplot(x='department', data=df, palette=color_types)
department_plot.set_title('Employee Department Distribution')
Example #59
0
plt.xlabel("t")
plt.ylabel("Passengers")
# Passengers against the squared time index.
plt.plot(airline["t_square"], airline["Passengers"], "bo")
plt.xlabel("t_square")
plt.ylabel("Passengers")
# Cross-tabulations of each derived column against Passengers
pd.crosstab(airline["log_Passengers"], airline["Passengers"])
pd.crosstab(airline["t"], airline["Passengers"])
pd.crosstab(airline["t_square"], airline["Passengers"])
# Bar plots of the same cross-tabulations
pd.crosstab(airline["log_Passengers"], airline["Passengers"]).plot(kind="bar",
                                                                   width=1.85)
pd.crosstab(airline["t"], airline["Passengers"]).plot(kind="bar", width=1.85)
pd.crosstab(airline["t_square"], airline["Passengers"]).plot(kind="bar",
                                                             width=1.85)
# Value frequencies of each column
sns.countplot(x="Passengers", data=airline, palette="hls")
sns.countplot(x="log_Passengers", data=airline, palette="hls")
sns.countplot(x="t", data=airline, palette="hls")
sns.countplot(x="t_square", data=airline, palette="hls")
# Box plots of Passengers for each value of log_Passengers, t and t_square
sns.boxplot(x="log_Passengers", y="Passengers", data=airline, palette="hls")
sns.boxplot(x="t", y="Passengers", data=airline, palette="hls")
sns.boxplot(x="t_square", y="Passengers", data=airline, palette="hls")
sns.pairplot(
    airline.iloc[:, 0:17]
)  # histogram of each column and scatter plot of each variable with respect to other columns
# `height` is the current name of the facet-size argument; the former `size`
# keyword is no longer accepted by seaborn.
sns.pairplot(airline, hue="Passengers", height=2)
airline["Passengers"].value_counts()
airline["log_Passengers"].value_counts()
airline["t"].value_counts()
airline["t_square"].value_counts()
# Flag every row from label '2020-09-22' onward in the 'rac' column.
# NOTE(review): the label slice presumably relies on df_tmp being indexed by
# date/time at this point -- confirm against the code that builds df_tmp.
df_tmp.loc['2020-09-22':, 'rac'] = True  # type: ignore
# Move the index back into a regular column and derive the hour of day from
# the 'datetime' column.
df_tmp = df_tmp.reset_index()
df_tmp['hour'] = df_tmp['datetime'].dt.hour

# Horizontal bar plot of 'count' per hour, split by the 'rac' flag.
plt.figure(figsize=(10, 10))
sns.set(style="whitegrid",
        palette=sns.color_palette("muted", n_colors=6, desat=1.0))
sns.barplot(y=df_tmp['hour'], x=df_tmp['count'], hue=df_tmp['rac'], orient='h')
plt.draw()

# %%
# Long-format copy of the 'rac' column so its values can be counted.
df_melt = pd.melt(df, value_vars=['rac'], value_name='ractopamine')
plt.figure(figsize=(10, 10))
sns.set(style="whitegrid",
        palette=sns.color_palette("muted", n_colors=6, desat=1.0))
ax = sns.countplot(data=df_melt, x='ractopamine', hue='ractopamine')

# Write each bar's height inside the bar: the text is anchored at the bar top
# with va='top', and the leading newline pushes it one line further down.
for p in ax.patches:
    ax.annotate(f'\n{p.get_height()}', (p.get_x() + 0.2, p.get_height()),
                ha='center',
                va='top',
                color='white',
                size=18)

plt.draw()

# %%
# using sklearn's MinMaxScaler
scaler = sklearn.preprocessing.MinMaxScaler(feature_range=(0, 1))

# Independent copy of the columns from position 3 up to, but excluding, the
# last column.
df_train = df.iloc[:, 3:-1].copy()