def main():
    """Show one histogram per feature of ``dataset_train.csv``.

    The identification columns and the house label are discarded,
    ``Best Hand`` is encoded as 1 (Left) / 2 (Right), and ``Birthday`` is
    expanded into ``Year``/``Month``/``Day`` columns before being replaced
    by a Unix timestamp.  Every remaining column then gets its own subplot
    in a four-row grid.
    """
    df = utils.dataframe('dataset_train.csv')
    colors = utils.colors()
    df = df.drop(
        columns=['Index', 'Hogwarts House', 'First Name', 'Last Name'])
    df['Best Hand'] = df['Best Hand'].replace(to_replace=['Left', 'Right'],
                                              value=[1, 2])

    # Birthdays are "YYYY-MM-DD" strings: slice the parts out first, then
    # overwrite the column itself with a numeric timestamp.
    birthdays = df['Birthday']
    df['Year'] = birthdays.apply(lambda text: int(text[0:4]))
    df['Month'] = birthdays.apply(lambda text: int(text[5:7]))
    df['Day'] = birthdays.apply(lambda text: int(text[8:10]))
    df['Birthday'] = birthdays.apply(lambda text: time.mktime(
        datetime.datetime.strptime(text, "%Y-%m-%d").timetuple()))

    grid_columns = math.ceil(len(df.columns) / 4 + 1)
    for position, column in enumerate(df, start=1):
        plt.subplot(4, grid_columns, position)
        # NOTE(review): `bin` is presumably a bin-count helper defined
        # elsewhere in this file that shadows the builtin -- confirm.
        plt.hist(df[column],
                 bins=bin(df[column]),
                 color=colors[column],
                 label=column,
                 alpha=0.75,
                 edgecolor='black',
                 linewidth=0.5)
        plt.legend(loc='upper right')
    plt.show()
def main():
    """Draw a seaborn pair plot of ``dataset_train.csv`` coloured by house.

    The identification, birthday and handedness columns are removed; every
    remaining column takes part in the pair plot, with ``Hogwarts House``
    used as the hue.
    """
    frame = utils.dataframe('dataset_train.csv')
    house_colors = utils.colors_house()
    frame = frame.drop(columns=[
        'Index',
        'First Name',
        'Last Name',
        'Birthday',
        'Best Hand',
    ])

    # NOTE(review): the palette is positional, so this order presumably has
    # to match the order in which seaborn assigns the hue levels -- confirm.
    house_order = ('Ravenclaw', 'Slytherin', 'Gryffindor', 'Hufflepuff')
    palette = [house_colors[house] for house in house_order]
    sns.set_palette(sns.color_palette(palette))

    sns.pairplot(frame, hue="Hogwarts House")
    plt.show()
def main():
    """Scatter-plot every pair of four selected courses, coloured by house.

    Reads ``dataset_train.csv``, removes the columns that are not plotted,
    and draws one subplot per pair taken from ``iterable`` on a
    three-column grid using the dark matplotlib style.
    """
    iterable = [
        'Astronomy',
        'Herbology',
        'Ancient Runes',
        'Divination',
    ]
    plt.style.use('dark_background')
    colors_house = utils.colors_house()
    df = utils.dataframe('dataset_train.csv')
    df = df.drop(columns=[
        'Index',
        'First Name',
        'Last Name',
        'Arithmancy',
        'Defense Against the Dark Arts',
        'Muggle Studies',
        'History of Magic',
        'Potions',
        'Care of Magical Creatures',
        'Flying',
        'Charms',
        'Transfiguration',
    ])

    # The per-point colours are the same for every subplot, so map the
    # houses to colours once instead of once per pair.
    point_colors = df['Hogwarts House'].map(colors_house)

    pairs = list(itertools.combinations(iterable, 2))
    grid_rows = math.ceil(len(pairs) / 3 + 1)
    for position, (x_name, y_name) in enumerate(pairs, start=1):
        plt.subplot(grid_rows, 3, position)
        plt.scatter(df[x_name],
                    df[y_name],
                    c=point_colors,
                    alpha=0.25,
                    # Names are cut to 10 characters to keep the legend small.
                    label=[x_name[:10], y_name[:10]],
                    marker='o',
                    s=2)
        plt.xticks([])
        plt.yticks([])
        plt.legend(loc='upper right', fontsize=5)
    plt.show()
def main():
    """Compute summary statistics for the numeric columns of a CSV file.

    The CSV path is taken from ``sys.argv[1]``.  For every column whose
    dtype is neither ``str`` nor ``object`` the count, mean, standard
    deviation, minimum, quartiles, maximum and running total are collected
    in a dictionary keyed by column name, which is then handed to
    ``describe``.

    Exits with status 1 after printing an error message when the script is
    not given exactly one argument.
    """
    if len(sys.argv) != 2:
        print('\033[91m' + '✘ Error: ' + '\033[0m' +
              'CSV file is missing, please add his path as argument')
        # Non-zero status so that callers can detect the usage error.
        sys.exit(1)

    df = utils.dataframe(sys.argv[1])
    numeric_labels = [
        label for label in df.columns
        if not (df[label].dtypes == str or df[label].dtypes == object)
    ]
    dico_numerals = {
        label: {
            'count': 0,
            'mean': 0,
            'std': 0,
            'min': float('inf'),
            '25%': 0,
            '50%': 0,
            '75%': 0,
            'max': float('-inf'),
            'total': 0,
        }
        for label in numeric_labels
    }

    # Walk the rows once and update every numeric column as we go, rather
    # than re-running the row iteration for each column.
    for _, row in df.iterrows():
        for label in numeric_labels:
            value = row[label]
            stats = dico_numerals[label]
            # Comparisons against NaN are always false, so missing values
            # never replace the current extrema.
            if value > stats['max']:
                stats['max'] = value
            if value < stats['min']:
                stats['min'] = value
            if not np.isnan(value):
                stats['count'] += 1
                stats['total'] += value

    for label in numeric_labels:
        stats = dico_numerals[label]
        if stats['count'] > 0:
            stats['mean'] = stats['total'] / stats['count']
        present = df[label].dropna()
        stats['25%'] = utils.calc_quantile(present, 0.25)
        stats['50%'] = utils.calc_quantile(present, 0.5)
        stats['75%'] = utils.calc_quantile(present, 0.75)
        stats['std'] = utils.stdev(present)

    describe(dico_numerals)
def main():
    """Scatter-plot every pair of features of ``dataset_train.csv``.

    ``Best Hand`` is encoded as 1 (Left) / 2 (Right) and ``Birthday`` is
    split into ``Year``/``Month``/``Day`` before being turned into a Unix
    timestamp.  Each pair taken from ``iterable`` gets its own subplot on a
    seven-row grid, drawn in a blend of the two features' colours.
    """
    iterable = [
        "Birthday", "Best Hand", "Arithmancy", "Astronomy", "Herbology",
        "Defense Against the Dark Arts", "Divination", "Muggle Studies",
        "Ancient Runes", "History of Magic", "Transfiguration", "Potions",
        "Care of Magical Creatures", "Charms", "Flying", "Year", "Month",
        "Day"
    ]
    df = utils.dataframe('dataset_train.csv')
    colors = utils.colors()
    df = df.drop(
        columns=['Index', 'Hogwarts House', 'First Name', 'Last Name'])
    df['Best Hand'] = df['Best Hand'].replace(to_replace=['Left', 'Right'],
                                              value=[1, 2])

    # Birthdays are "YYYY-MM-DD" strings: slice the parts out first, then
    # overwrite the column itself with a numeric timestamp.
    birthdays = df['Birthday']
    df['Year'] = birthdays.apply(lambda text: int(text[0:4]))
    df['Month'] = birthdays.apply(lambda text: int(text[5:7]))
    df['Day'] = birthdays.apply(lambda text: int(text[8:10]))
    df['Birthday'] = birthdays.apply(lambda text: time.mktime(
        datetime.datetime.strptime(text, "%Y-%m-%d").timetuple()))

    pairs = list(itertools.combinations(iterable, 2))
    grid_columns = math.ceil(len(pairs) / 7 + 1)
    for position, (x_name, y_name) in enumerate(pairs, start=1):
        plt.subplot(7, grid_columns, position)
        blended = utils.combine_hex_values(colors[x_name], colors[y_name])
        plt.scatter(df[x_name],
                    df[y_name],
                    color=blended,
                    alpha=0.25,
                    label=[x_name, y_name],
                    s=3)
        plt.xticks([])
        plt.yticks([])
        plt.legend(loc='upper right', fontsize=5)
    plt.show()
def main():
    """Compute summary statistics for the text columns of a CSV file.

    The CSV path is taken from ``sys.argv[1]``.  For every column whose
    dtype is ``str`` or ``object`` the number of non-missing values, the
    number of distinct values, the most frequent value and its frequency
    are collected (together with the full value -> occurrences table under
    ``'all'``) in a dictionary keyed by column name, which is then handed
    to ``describe``.

    Exits with status 1 after printing an error message when the script is
    not given exactly one argument.
    """
    if len(sys.argv) != 2:
        print('\033[91m' + '✘ Error: ' + '\033[0m' +
              'CSV file is missing, please add his path as argument')
        # Non-zero status so that callers can detect the usage error.
        sys.exit(1)

    df = utils.dataframe(sys.argv[1])
    object_labels = [
        label for label in df.columns
        if df[label].dtypes == str or df[label].dtypes == object
    ]
    dico_objects = {
        label: {
            'count': 0,
            'unique': 0,
            'top': 0,
            'freq': 0,
            'all': {},
        }
        for label in object_labels
    }

    # Walk the rows once and update every text column as we go, rather
    # than re-running the row iteration for each column.
    for _, row in df.iterrows():
        for label in object_labels:
            value = row[label]
            # Missing entries show up as float NaN; they are not counted.
            if isinstance(value, float) and np.isnan(value):
                continue
            stats = dico_objects[label]
            stats['count'] += 1
            stats['all'][value] = stats['all'].get(value, 0) + 1

    for stats in dico_objects.values():
        occurrences = stats['all']
        # Strict '>' keeps the first value seen when several share the
        # highest frequency.
        for value, seen in occurrences.items():
            if seen > stats['freq']:
                stats['freq'] = seen
                stats['top'] = value
        stats['unique'] = len(occurrences)

    describe(dico_objects)
def main():
    """Pair-plot features derived from names, birthday and handedness.

    Builds a frame from ``dataset_train.csv`` holding the house label, the
    encoded ``Best Hand`` (1 = Left, 2 = Right), the birthday as
    year/month/day and as a Unix timestamp, and for both the first and the
    last name its length, vowel count and consonant count.  The frame is
    shown as a seaborn pair plot with ``Hogwarts House`` as the hue.
    """
    b_df = utils.dataframe('dataset_train.csv')
    house_colors = utils.colors_house()
    # NOTE(review): the palette is positional, so this order presumably has
    # to match the order in which seaborn assigns the hue levels -- confirm.
    palette = [
        house_colors['Ravenclaw'],
        house_colors['Slytherin'],
        house_colors['Gryffindor'],
        house_colors['Hufflepuff'],
    ]
    sns.set_palette(sns.color_palette(palette))

    birthdays = b_df['Birthday']  # "YYYY-MM-DD" strings
    first_names = b_df['First Name']
    last_names = b_df['Last Name']

    # Key order is the column order of the plotted frame.
    df = pd.DataFrame({
        'Hogwarts House': b_df['Hogwarts House'],
        'Best Hand': b_df['Best Hand'].replace(to_replace=['Left', 'Right'],
                                               value=[1, 2]),
        'Year': birthdays.apply(lambda text: int(text[0:4])),
        'Month': birthdays.apply(lambda text: int(text[5:7])),
        'Day': birthdays.apply(lambda text: int(text[8:10])),
        'Birthday': birthdays.apply(lambda text: time.mktime(
            datetime.datetime.strptime(text, "%Y-%m-%d").timetuple())),
        'First Name': first_names.apply(len),
        'Last Name': last_names.apply(len),
        'Last Name Vowel': last_names.apply(
            lambda name: int(count_vowel(name))),
        'First Name Vowel': first_names.apply(
            lambda name: int(count_vowel(name))),
        'Last Name Consonant': last_names.apply(
            lambda name: int(len(name) - count_vowel(name))),
        'First Name Consonant': first_names.apply(
            lambda name: int(len(name) - count_vowel(name))),
    })

    sns.pairplot(df, hue="Hogwarts House")
    plt.show()
def main():
    """Scatter-plot pairs of four courses and highlight a fixed set of rows.

    Reads ``dataset_train.csv`` and draws one subplot per pair taken from
    ``iterable`` on a three-column grid, coloured by house, using the dark
    matplotlib style.  On each subplot a hard-coded list of rows is drawn
    again with an ``x`` marker so that those rows stand out.
    """
    iterable = [
        'Astronomy',
        'Herbology',
        'Ancient Runes',
        'Divination',
    ]
    plt.style.use('dark_background')
    colors_house = utils.colors_house()
    df = utils.dataframe('dataset_train.csv')

    # NOTE(review): these row labels are presumably the samples a model got
    # wrong (hence "errors") -- confirm where the list comes from.
    df_errors = df.loc[[
        184, 200, 255, 339, 443, 445, 456, 504, 515, 618, 681, 704, 815,
        820, 824, 915, 941, 1078, 1098, 1113, 1191, 1282, 1419, 1435, 1444,
        1446, 1448, 1515, 1525, 1596
    ]]

    # Both colour mappings are identical for every subplot, so compute them
    # once up front instead of once per pair.
    point_colors = df['Hogwarts House'].map(colors_house)
    error_colors = df_errors['Hogwarts House'].map(colors_house)

    pairs = list(itertools.combinations(iterable, 2))
    grid_rows = math.ceil(len(pairs) / 3 + 1)
    for position, (x_name, y_name) in enumerate(pairs, start=1):
        plt.subplot(grid_rows, 3, position)
        plt.scatter(df[x_name],
                    df[y_name],
                    c=point_colors,
                    alpha=0.25,
                    # Names are cut to 10 characters to keep the legend small.
                    label=[x_name[:10], y_name[:10]],
                    marker='o',
                    s=2)
        plt.scatter(df_errors[x_name],
                    df_errors[y_name],
                    c=error_colors,
                    marker='x')
        plt.xticks([])
        plt.yticks([])
        plt.legend(loc='upper right', fontsize=5)
    plt.show()
    # Emit a blank line on stdout once the figure has been shown.
    print()