# Example #1
0
def get_outliers(year_str, master_dict):
    """Return the employees whose salary for ``year_str`` is a high outlier.

    A salary is treated as an outlier when it is more than two standard
    deviations above the mean of the ``year_str`` column of the frame
    returned by ``df_allYears_salary(master_dict)``.

    Args:
        year_str: Year to inspect. Used as a column label of the all-years
            salary frame and as a key of ``master_dict``.
        master_dict: Mapping of year -> dict holding the index-aligned lists
            ``'salary_float'``, ``'employee_list'`` and ``'position_list'``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Name``, ``Position`` and
        ``Salary``, sorted by ``Salary`` from highest to lowest.  The frame
        is empty when no salary exceeds the bound.
    """
    # Complete frame of all years; one column per year.
    df_allSalary = df_allYears_salary(master_dict)

    # Upper bound for top outliers: mean + 2 standard deviations.
    year_column = df_allSalary.loc[:, year_str]
    upper_bound = year_column.mean() + (2 * year_column.std())

    year_data = master_dict[year_str]
    year_salary = year_data['salary_float']
    year_names = year_data['employee_list']
    year_positions = year_data['position_list']

    # The placeholder test runs BEFORE the comparison: on Python 3,
    # `"" > upper_bound` raises TypeError.  The original condition
    # `x != "" or 0` parsed as `(x != "") or 0`, so the zero placeholder was
    # never actually filtered; `not in ("", 0)` expresses the intended check.
    data = [
        (year_names[i], year_positions[i], salary)
        for i, salary in enumerate(year_salary)
        if salary not in ("", 0) and salary > upper_bound
    ]
    # Debug output kept from the original; `data` is a real list so the rows
    # (not a lazy zip object) are printed.
    pprint.pprint(data)

    df = pd.DataFrame(data, columns=["Name", "Position", "Salary"])
    # DataFrame.sort() was removed from pandas; sort_values() replaces it.
    return df.sort_values('Salary', ascending=False)
def get_outliers(year_str, master_dict):
    """Return the employees whose salary for ``year_str`` is a high outlier.

    A salary is treated as an outlier when it is more than two standard
    deviations above the mean of the ``year_str`` column of the frame
    returned by ``df_allYears_salary(master_dict)``.

    Args:
        year_str: Year to inspect. Used as a column label of the all-years
            salary frame and as a key of ``master_dict``.
        master_dict: Mapping of year -> dict holding the index-aligned lists
            ``'salary_float'``, ``'employee_list'`` and ``'position_list'``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Name``, ``Position`` and
        ``Salary``, sorted by ``Salary`` from highest to lowest.  The frame
        is empty when no salary exceeds the bound.
    """
    # Complete frame of all years; one column per year.
    df_allSalary = df_allYears_salary(master_dict)

    # Upper bound for top outliers: mean + 2 standard deviations.
    year_column = df_allSalary.loc[:, year_str]
    upper_bound = year_column.mean() + (2 * year_column.std())

    year_data = master_dict[year_str]
    year_salary = year_data['salary_float']
    year_names = year_data['employee_list']
    year_positions = year_data['position_list']

    # The placeholder test runs BEFORE the comparison: on Python 3,
    # `"" > upper_bound` raises TypeError.  The original condition
    # `x != "" or 0` parsed as `(x != "") or 0`, so the zero placeholder was
    # never actually filtered; `not in ("", 0)` expresses the intended check.
    data = [
        (year_names[i], year_positions[i], salary)
        for i, salary in enumerate(year_salary)
        if salary not in ("", 0) and salary > upper_bound
    ]
    # Debug output kept from the original; `data` is a real list so the rows
    # (not a lazy zip object) are printed.
    pprint.pprint(data)

    df = pd.DataFrame(data, columns=["Name", "Position", "Salary"])
    # DataFrame.sort() was removed from pandas; sort_values() replaces it.
    return df.sort_values('Salary', ascending=False)
def boxplot_salary_allYears():
    """Draw a box plot of the all-years salary frame and show it.

    The master dictionary from ``allYears_listData_dict()`` is converted to
    a salary frame by ``df_allYears_salary()``; that frame is plotted on a
    new matplotlib figure, labelled, and displayed with a blocking
    ``plt.show``.
    """
    # Master dictionary of all the data -> frame of salaries for every year.
    salary_frame = df_allYears_salary(allYears_listData_dict())

    plt.figure()
    salary_frame.boxplot(return_type='axes')

    # Title and axis labels for the current axes.
    plt.title("Distribution of UVM employee salary from 1996 - 2014")
    plt.xlabel("School Year")
    plt.ylabel('Salary (USD)')

    # Blocking show is needed when this runs outside of Canopy.
    plt.show(block=True)
def boxplot_salary_allYears():
    """Draw a box plot of the all-years salary frame and show it.

    The master dictionary from ``allYears_listData_dict()`` is converted to
    a salary frame by ``df_allYears_salary()``; that frame is plotted on a
    new matplotlib figure, labelled, and displayed with a blocking
    ``plt.show``.
    """
    # Master dictionary of all the data -> frame of salaries for every year.
    salary_frame = df_allYears_salary(allYears_listData_dict())

    plt.figure()
    salary_frame.boxplot(return_type='axes')

    # Title and axis labels for the current axes.
    plt.title("Distribution of UVM employee salary from 1996 - 2014")
    plt.xlabel("School Year")
    plt.ylabel('Salary (USD)')

    # Blocking show is needed when this runs outside of Canopy.
    plt.show(block=True)