def convert_data(df, col):
    '''
    Placeholder for converting the data in a column.

    The conversion itself is not implemented: the function only reports
    that the feature is unavailable and then calls cli.cls(). Neither
    df nor col is read or modified.
    '''
    # TODO: replace these notices with the real conversion when finished
    notices = (
        "Unfortunately this feature is not currently supported",
        "Stay tuned for future releases and updates",
    )
    for notice in notices:
        tprint(notice)
    cli.cls()
def modify_column_names(df):
    '''
    Interactively rename columns of df, one at a time, until the user
    declines to modify another.

    On every pass df.columns is reassigned from fix_current_columns(df).
    The user picks a column by its 1-based number, types a new name, the
    name is passed through make_compatible(), and after confirmation
    change_one_column(new, df, i) is called with the 0-based index.

    df: DataFrame whose column labels are edited in place
    '''
    modding = True
    while modding:
        # TODO: this needs to become a function call so it can also be
        # used when named is False
        df.columns = fix_current_columns(df)
        tprint("Would you like to modify a column's name?")
        modding = cli.ask_yes(False)
        if modding:
            print()
            col = input(
                "Choose the number of the column to modify (1, 2, 3...): ")
            try:
                i = int(col.strip()) - 1
                # Reject 0 and negative numbers explicitly: a negative
                # index would otherwise silently pick a column counted
                # from the end instead of reporting an error.
                if not 0 <= i < len(df.columns):
                    raise IndexError("column number out of range")
                cli.cls(verbose=False)
                tprint("Renaming column: " + df.columns[i])
                print()
                new = input("Type new name: ").strip()
                new = make_compatible(new)
                if new:
                    tprint("Converting name for compatibility")
                    tprint("Would you like to rename " + df.columns[i] +
                           " to " + new + " ?")
                    if cli.ask_yes(True):
                        change_one_column(new, df, i)
                else:
                    tprint("No new column name detected")
            except Exception:
                # Deliberately best-effort: any failure while renaming is
                # reported and the user is returned to the prompt.
                tprint("Error renaming column")
        cli.cls()
def clean_numeric(df, col):
    '''
    Offer to show summary statistics for a numeric column.

    If the user accepts, the result of df[col].describe() is passed to
    tprint(). cli.cls() is called afterwards either way.

    df: DataFrame holding the data
    col: label of the column to summarise
    '''
    tprint(
        "Would you like to see summary statistics for the data in this column?"
    )
    wants_summary = cli.ask_yes(True)
    if wants_summary:
        summary = df[col].describe()
        tprint(summary)
    cli.cls()
def select_value(df, col):
    '''
    Let the user pick one value from a column and confirm the choice.

    show_values(df, col, output=True) is called repeatedly; each result
    is handed to cli.ask_meant() and the first confirmed result is
    returned. After a rejected choice cli.cls(verbose=False) is called
    before asking again.

    Returns whatever show_values returned for the confirmed choice.
    '''
    print()
    while True:
        picked = show_values(df, col, output=True)
        if cli.ask_meant(picked, default=True):
            return picked
        cli.cls(verbose=False)
def show_values(df, col, output=False, values=None):
    '''
    Print values ten per page, optionally letting the user select one.

    df, col: DataFrame and column label; only read when values is empty,
        in which case the sorted unique values of df[col] are shown
    output: when False the values are just listed, pausing after every
        tenth value to ask whether to continue; when True the user is
        prompted after each page to type the number of a value
    values: optional sequence of values to show instead of df[col]

    Returns the selected value converted to str when output is True and
    a valid number was entered; returns None otherwise.
    '''
    if not values:
        values = sorted(df[col].unique())
    total = len(values)
    pages = math.ceil(total / 10)
    print("Current unique values: (Page 1 of %d)" % pages)
    for i, v in enumerate(values, start=1):
        # str(v) so that non-string values (numbers, dates) can be listed
        print("\t" + str(i) + ") " + str(v))
        page_full = i % 10 == 0
        more = i < total
        if not output:
            if page_full and more:
                tprint("Show more values?")
                if not cli.ask_yes(True):
                    break
                cli.cls(False)
                print("Current unique values: (Page %d of %d)" %
                      (i // 10 + 1, pages))
            continue
        if not page_full and more:
            # middle of a page: keep listing before prompting
            continue
        print()
        # Loop until a valid selection is returned or the user asks for
        # the next page (break).
        while True:
            if more:
                c = input(
                    "Type the number of a value (1, 2...) or press Enter to view more values: "
                )
            else:
                c = input(
                    "Type the number of a value to select it (1, 2, 3...): "
                )
            try:
                choice = int(c)
            except ValueError:
                choice = 0
            # Only 1..total is a valid selection; 0 and negative numbers
            # must not wrap around to the end of the list.
            if 1 <= choice <= total:
                return str(values[choice - 1])
            tprint("No value selected")
            if more:
                tprint("View more values?")
                if cli.ask_yes(True):
                    cli.cls(False)
                    print("Current unique values: (Page %d of %d)" %
                          (i // 10 + 1, pages))
                    break
def clean_columns(df):
    '''
    Drive the per-column cleaning loop.

    Each pass prints a heading and asks choose_col() for a column and a
    continue flag. When both are truthy, drop_na() is run on the column,
    then assess_data_type(df[col]) decides between clean_numeric() and
    check_mixed_data(). The loop ends when choose_col() returns a falsy
    flag.

    df: DataFrame being cleaned
    '''
    # NOTE: could also simply iterate through the columns
    keep_going = True
    while keep_going:
        print("DATA CLEANING")
        col, keep_going = choose_col(df, keep_going)
        cli.cls()
        if not (keep_going and col):
            continue
        drop_na(df, col)
        cli.cls()
        cleaner = clean_numeric if assess_data_type(
            df[col]) else check_mixed_data
        cleaner(df, col)
def start_record(df, col, default=True):
    '''
    Ask whether to create a new organization record and, if so, name it.

    When the user declines, cli.cls(verbose=False) is called and the
    function returns. Otherwise the user is asked for a name (upper-cased
    and stripped) until they confirm it; a file name is built from it by
    replacing spaces with underscores and appending ".txt", and
    add_new_fixes() is called.

    df, col: passed through to add_new_fixes
    default: default answer handed to cli.ask_yes for the first question

    Returns None.
    '''
    print("Would you like to create a new organization?")
    wants_new = cli.ask_yes(default)
    print()
    if not wants_new:
        cli.cls(verbose=False)
        return
    while True:
        name = input("Type organization name or abbreviation:\t")
        name = name.upper().strip()
        print()
        print("Do you want to name this record: %s ?" % name)
        if cli.ask_yes(True):
            break
    cli.cls(verbose=False)
    fname = name.replace(" ", "_") + ".txt"
    # TODO: maybe put a failsafe guess here, or (more likely) offer an
    # option to merge records later.
    # NOTE(review): fname is passed in the position add_new_fixes names
    # `previous` -- confirm this is intended.
    add_new_fixes(df, col, fname)
def give_new_names(df):
    '''
    Interactively choose a name for every column of df.

    For each column the default name "column_<n>" is announced,
    show_first_values(df, i) is called, and the user either types a name
    or presses Enter. A typed name is lower-cased with spaces replaced
    by underscores and must be confirmed; the confirmed, converted name
    is what gets stored. A name that is already taken is rejected and
    the user is asked again. Declining a name, or pressing Enter,
    defers to ask_default(names, default_name, flag).
    NOTE(review): ask_default presumably appends the default name to
    names when it returns True -- confirm against its definition.

    df: DataFrame whose columns attribute is replaced with the new names
    '''
    names = []
    num_cols = len(df.columns)
    for i in range(num_cols):
        position = str(i + 1)
        default_name = "column_" + position
        print("Column " + position + " of " + str(num_cols) +
              " is currently named: " + default_name)
        show_first_values(df, i)
        chosen = False
        while not chosen:
            name = input(
                "Enter a new name for this column or press Enter to keep the default: "
            ).strip()
            if not name:
                chosen = ask_default(names, default_name, True)
                continue
            com_name = name.lower().replace(" ", "_")
            tprint("Modifying input for compatibility: " + name + " -> " +
                   com_name)
            tprint("Would you like to name this column " + com_name + " ?")
            if not cli.ask_yes(True):
                chosen = ask_default(names, default_name, False)
                continue
            if com_name in names:
                # Leave chosen False so the user is asked again; exiting
                # here would leave this column without a name.
                print(
                    "Name already used for a column. Please choose another"
                )
                continue
            tprint("Setting column name")
            # Store the name the user actually confirmed.
            names.append(com_name)
            chosen = True
        cli.cls()
    # setting the new names
    tprint("Adding column names")
    df.columns = names
    cli.cls()
def review_guesses(df, col, guesses):
    '''
    Walk the user through groups of similar values found in a column.

    For every group the values are listed with the number of rows of
    df[col] equal to each. Groups where at most one value actually
    occurs in the data are skipped after cli.cls(False). For the others
    the user is asked whether to update them, fix_guess(g, df, col) is
    called on a yes, and cli.cls(True) follows.

    df: DataFrame holding the data
    col: label of the column under review
    guesses: iterable of groups, each an iterable of string values
    '''
    # TODO: think about consolidating when there are many similar groups
    for group in guesses:
        print("Similar Value (Number in data):")
        # Counting how many values are really present is a stop-gap for
        # consolidation not filtering such groups out beforehand.
        present = 0
        for position, value in enumerate(group, start=1):
            count = len(df[df[col] == value])
            if count:
                present += 1
            print("\t" + str(position) + ") " + value + "\t(%d)" % count)
        if present > 1:
            tprint("Would you like to update one or more of these values?")
            if cli.ask_yes(False):
                fix_guess(group, df, col)
            cli.cls(True)
        else:
            cli.cls(False)
def clean_strings(df, col):
    '''
    Clean the string values of a column, first automatically and then
    with the user's help.

    Steps:
      1. Strip surrounding whitespace, replace " ," and ",," with ",",
         and collapse every run of spaces/tabs into one space.
      2. Report the number of unique values (with a warning above
         WARN_LEVEL) and offer an automatic search for likely errors via
         consolidate_guesses(guess_linkages(...)) / review_guesses().
      3. List the current unique values and offer manual fixes through
         fix_manual().

    df: DataFrame, modified in place
    col: label of a string column
    '''
    tprint("Removing excess white space from values in %s" % col)
    cleaned = df[col].str.strip()
    # regex=False pins literal matching regardless of the pandas default
    cleaned = cleaned.str.replace(" ,", ",", regex=False)
    cleaned = cleaned.str.replace(",,", ",", regex=False)
    # Collapse runs of spaces and tabs; replacing a single space with a
    # single space would leave repeated spaces untouched.
    cleaned = cleaned.str.replace(r"[ \t]+", " ", regex=True)
    df[col] = cleaned
    cli.cls()

    u = df[col].unique()
    num_unique = len(u)
    print("Column " + col + " contains " + str(num_unique) +
          " unique values")
    if num_unique > WARN_LEVEL:
        tprint(
            "Large numbers of unique values may take significant time to analyze, depending on their length, your system and settings"
        )
    tprint("Would you like to search for possible errors in the data?")
    guesses = []
    if cli.ask_yes(True):
        cli.cls(False)
        guesses = consolidate_guesses(guess_linkages(u))
        if guesses:
            review_guesses(df, col, guesses)
            tprint("All automatic revisions reviewed")
        else:
            tprint("Our review did not detect any likely errors")
    cli.cls()

    # Manual pass
    print("Current unique values in %s:" % col)
    show_values(df, col)
    cli.cls()
    print("Would you like to further clean or modify the values in %s?" %
          col)
    if cli.ask_yes(True):
        # TODO: fill previous with the confirmed automatic fixes
        # (this will take some work)
        previous = []
        fix_manual(df, col, previous)
def fix_manual(df, col, previous=None):
    '''
    Repeatedly collect value replacements from the user and apply them
    to a column.

    On each pass the user may load a record of earlier changes via
    choose_record(); if they decline, or the record yields nothing,
    add_new_fixes() is used instead. Every old -> new pair is then
    applied to df[col], the resulting values are shown, and the user
    decides whether to go round again.

    df: DataFrame, modified in place
    col: label of the column to fix
    previous: earlier fixes passed through to choose_record and
        add_new_fixes; defaults to an empty list
    '''
    # A fresh list per call avoids sharing one default list between calls.
    previous = [] if previous is None else previous
    working = True
    while working:
        tprint("Would you like to load a record of previously used changes?")
        if cli.ask_yes(True):
            fixes = choose_record(df, col, previous)
            if not fixes:
                fixes = add_new_fixes(df, col, previous)
        else:
            cli.cls()
            fixes = add_new_fixes(df, col, previous)
        print("Applying fixes")
        # Compute every row mask before writing anything, so that a pair
        # such as A->B followed by B->C does not also turn the former A
        # rows into C.
        pending = [(df[col] == old, new) for old, new in fixes.items()]
        for mask, new in pending:
            df.loc[mask, [col]] = new
        tprint("Fixes applied")
        cli.cls()
        show_values(df, col)
        tprint("Would you like to further modify the values in %s?" % col)
        working = cli.ask_yes(True)
        cli.cls()
def main():
    '''
    Run the interactive session in an endless loop.

    Each pass shows the intro, loads a file through initial.load_in(),
    renames columns, cleans the columns, writes the result with
    final.write_csv() and finally calls cli.outro(). cli.cls() is called
    between the steps.
    '''
    # TODO: maybe wrap the whole loop body in a try/except
    while True:
        # Intro (loads a file with the intro text)
        cli.cls(verbose=False)
        cli.load_cli_intro()
        cli.cls()

        # Open the input file
        loaded = initial.load_in()
        df, named, fname = loaded
        cli.cls()

        # Name or rename the columns
        columns.rename_cols(df, named)
        cli.cls()

        # TODO: adding ages (calcdates.add_age) is being rolled into the
        # cleaning step and is disabled for now.

        # Clean the data
        clean.clean_columns(df)
        cli.cls()

        # Write to CSV (done, but needs testing)
        final.write_csv(df, fname)
        cli.cls()

        # Finish or restart
        cli.outro()
def add_new_fixes(df, col, previous):
    '''
    Interactively build a mapping of replacements for values in a column
    and optionally save it as a record file.

    The user selects an existing value to replace, then either selects
    another existing value or types a custom one as the replacement.
    Confirmed pairs are collected; once at least one exists the user can
    add more or save the set to "data/<name>.txt", one "old///new" pair
    per line.

    df, col: DataFrame and column label handed to select_value
    previous: iterable of pairs; when the user opts to include them,
        each pair p is added to the mapping as fixes[p[1]] = p[0]

    Returns the dict of {old value: new value} (possibly empty).
    '''
    finished = False
    fixes = {}
    while not finished:
        # TODO: factor much of this out so it can be reused with
        # existing records
        tprint("Choose a value to replace")
        old = select_value(df, col)
        tprint(
            "Would you like to choose another existing value to replace: %s ?"
            % old)
        print(
            "(Otherwise you will be prompted to enter a custom replacement value)"
        )
        if cli.ask_yes(True):
            cli.cls(False)
            tprint("Choose a value to replace '%s'" % old)
            new = select_value(df, col)
        else:
            chosen = False
            while not chosen:
                new = input("Enter custom value to replace %s:\t" % old)
                if new:
                    tprint("Use %s ?" % new)
                    chosen = cli.ask_yes(True)
        cli.cls(verbose=False)

        if old and new:
            tprint("You chose: " + old + " -> " + new)
            tprint("Confirm this replacement?")
            if cli.ask_yes(True):
                tprint("Confirmed")
                fixes[old] = new
        cli.cls()

        if fixes:
            print("Your chosen replacements:")
            tprint("\tCurrent\tReplaced")
            sort_fixes = sorted(fixes.items())
            for current, replacement in sort_fixes:
                print("\t" + current + "\t" + replacement)
            tprint("Would you like to add another replacement?")
            if cli.ask_yes(True):
                cli.cls()
                # restart the loop without marking the session finished
                continue
            tprint(
                "Would you like to save a record of these replacements for future use?"
            )
            if cli.ask_yes(True):
                if previous:
                    tprint(
                        "Would you like to include the changes you selected from our suggestions in this record?"
                    )
                    if cli.ask_yes():
                        for p in previous:
                            fixes[p[1]] = p[0]
                        sort_fixes = sorted(fixes.items())
                cli.cls()
                named = False
                while not named:
                    name = input("Choose a name for this record:\t")
                    name = name.lower().strip()
                    tprint("Do you want to name this record: %s ?" % name)
                    named = cli.ask_yes(True)
                cli.cls(verbose=False)
                with open("data/" + name + ".txt", 'w') as f:
                    # join puts the separator only between lines; the
                    # previous per-line check compared a string with a
                    # tuple and so always wrote a trailing newline.
                    f.write("\n".join(current + '///' + replacement
                                      for current, replacement in sort_fixes))
        finished = True
    return fixes