def modify_column_names(df):
    '''
    Interactively rename columns of df until the user declines.

    Each pass reassigns df.columns from fix_current_columns(df), asks whether
    a column should be renamed and, if so, prompts for a 1-based column
    number and a new name. The typed name is passed through make_compatible
    and, once confirmed, applied with change_one_column.

    df: DataFrame whose column labels are modified in place.
    Returns None.
    '''
    modding = True
    while modding:
        #THIS NEEDS TO BECOME A FXN CALL... SO CAN USE W/ named = False
        df.columns = fix_current_columns(df)
        tprint("Would you like to modify a column's name?")
        modding = cli.ask_yes(False)
        if modding:
            print()
            col = input(
                "Choose the number of the column to modify (1, 2, 3...): ")
            try:
                i = int(col.strip()) - 1
                # Reject 0 and negative numbers explicitly: df.columns[-1]
                # would otherwise silently pick a column counted from the end.
                if not 0 <= i < len(df.columns):
                    raise IndexError("column number out of range")
                cli.cls(verbose=False)
                tprint("Renaming column: " + df.columns[i])
                print()
                new = input("Type new name: ").strip()
                new = make_compatible(new)
                if new:
                    tprint("Converting name for compatibility")
                    tprint("Would you like to rename " + df.columns[i] +
                           " to " + new + " ?")
                    if cli.ask_yes(True):
                        change_one_column(new, df, i)
                else:
                    tprint("No new column name detected")
            except Exception:
                # Deliberately broad (best effort): any failure while
                # renaming is reported and the prompt loop carries on.
                tprint("Error renaming column")
        cli.cls()
def show_values(df, col, output=False, values=None):
    '''
    Print a numbered list of values, ten per page.

    df, col: used only when values is empty; the list shown is then the
        sorted unique values of df[col].
    output: when False the list is only displayed, pausing after every full
        page to ask whether to show more. When True the user is prompted at
        the end of each page to pick a value by number.
    values: optional list of values to show instead of the column's values.

    Returns the chosen value as a str when output is True and a value was
    picked; otherwise returns None.
    '''
    if not values:
        #THIS IS A BIT SILLY
        values = sorted(df[col].unique())
    total = len(values)
    pages = math.ceil(total / 10)
    print("Current unique values: (Page 1 of %d)" % pages)
    for i, v in enumerate(values, start=1):
        # str(v) so that non-text values (numbers, dates...) can be listed
        # instead of raising TypeError on concatenation.
        print("\t" + str(i) + ") " + str(v))
        page_full = i % 10 == 0
        more = i < total
        if not output:
            if page_full and more:
                tprint("Show more values?")
                if not cli.ask_yes(True):
                    break
                cli.cls(False)
                print("Current unique values: (Page %d of %d)" %
                      (i // 10 + 1, pages))
            continue
        if not page_full and more:
            # mid-page: keep listing before prompting for a choice
            continue
        print()
        while True:
            #exits via return (value picked) or break (next page requested)
            if more:
                c = input(
                    "Type the number of a value (1, 2...) or press Enter to view more values: "
                )
            else:
                c = input(
                    "Type the number of a value to select it (1, 2, 3...): "
                )
            try:
                choice = int(c)
                # 0 and negatives would index from the end of the list
                if not 1 <= choice <= total:
                    raise IndexError("choice out of range")
                return str(values[choice - 1])
            except (ValueError, IndexError):
                tprint("No value selected")
            if more:
                tprint("View more values?")
                if cli.ask_yes(True):
                    cli.cls(False)
                    print("Current unique values: (Page %d of %d)" %
                          (i // 10 + 1, pages))
                    break
def fix_guess(g, df, col):
    '''
    Ask which value in the group g is correct and convert the others to it.

    The user either types the 1-based number of a value in g, or anything
    else (including just Enter) to supply a custom replacement. The actual
    conversions are offered one by one by switch_data.

    g: indexable group of similar values found in df[col].
    df, col: DataFrame and column name passed through to switch_data.
    Returns None.
    '''
    #BUILD SOMETHING TO SAVE THESE CHANGES
    #NEEDS RETURNS HERE AND IN SWITCH DATA
    print()
    c = input(
        "Choose the number of the correct value (1, 2...) or press Enter to use a custom value: "
    )
    try:
        idx = int(c.strip()) - 1
    except ValueError:
        idx = -1
    # Only accept numbers that point into g; 0 or negatives would wrap
    # around to the end of the group instead of being treated as "no choice".
    if 0 <= idx < len(g):
        switch_data(df, col, g, idx, g[idx])
        return

    chosen = False
    while not chosen:
        custom = input(
            "Enter a new correct value or press Enter to cancel: ")
        custom = custom.strip()
        if custom:
            tprint("Use %s ?" % custom)
            chosen = cli.ask_yes(True)
            if chosen:
                #-1 is jank fix for how switch_data currently works
                # (no position in g equals -1, so every value is offered)
                switch_data(df, col, g, -1, custom)
        else:
            # empty input cancels
            chosen = True
def drop_na(df, col):
    '''
    Offer to remove the rows of df that have no value in column col.

    df: DataFrame, modified in place when the user agrees.
    col: name of the column checked for missing values.
    Returns None.
    '''
    tprint(
        "Would you like to remove the rows that do not contain a value for: %s?"
        % col)
    if cli.ask_yes(True):
        # dropna returns a new frame by default; without inplace=True the
        # result was thrown away and the caller's df never changed.
        df.dropna(subset=[col], inplace=True)
def clean_numeric(df, col):
    '''
    Offer to display summary statistics for df[col], then clear the screen.

    df: DataFrame holding the column.
    col: name of the column to summarise with describe().
    Returns None.
    '''
    tprint(
        "Would you like to see summary statistics for the data in this column?"
    )
    wants_summary = cli.ask_yes(True)
    if wants_summary:
        summary = df[col].describe()
        tprint(summary)
    cli.cls()
def ask_default(names, default_name, default=True):
    '''
    Ask whether the default column name should be used.

    names: list that default_name is appended to when the user agrees.
    default_name: the name to append.
    default: passed straight to cli.ask_yes (presumably the answer assumed
        on empty input -- confirm against cli).

    Returns True if default_name was appended, False otherwise.
    '''
    tprint("Would you like to use the default name?")
    if not cli.ask_yes(default):
        return False
    names.append(default_name)
    return True
def ask_convert(df, col):
    '''
    Ask whether a text column is really meant to hold dates or numbers.

    df, col: accepted for interface symmetry; not used here.
    Returns True when the user answers yes, False otherwise.
    '''
    print("This column seems to contain text data")
    tprint("Is this column supposed to contain dates or numeric data?")
    should_convert = bool(cli.ask_yes(False))
    #cli.cls()
    return should_convert
def clean_strings(df, col):
    '''
    Clean the text values of df[col], first automatically, then interactively.

    Steps:
      1. Strip surrounding whitespace, remove spaces before commas, merge
         doubled commas and collapse runs of spaces/tabs into one space.
      2. Report the number of unique values (warning above WARN_LEVEL) and
         offer an automatic search for likely errors, reviewed through
         review_guesses.
      3. Show the remaining unique values and offer manual fixes through
         fix_manual.

    df: DataFrame, modified in place.
    col: name of the text column to clean.
    Returns None.
    '''
    tprint("Removing excess white space from values in %s" % col)
    cleaned = df[col].str.strip()
    # regex=False: these are literal substrings, and the default for
    # `regex` differs between pandas versions.
    cleaned = cleaned.str.replace(" ,", ",", regex=False)
    cleaned = cleaned.str.replace(",,", ",", regex=False)
    # Replacing " " with " " was a no-op; collapse any run of spaces and
    # tabs to a single space instead (this also covers the tab replacement).
    cleaned = cleaned.str.replace(r"[ \t]+", " ", regex=True)
    df[col] = cleaned
    cli.cls()

    u = df[col].unique()
    num_unique = len(u)
    print("Column " + col + " contains " + str(num_unique) +
          " unique values")
    if num_unique > WARN_LEVEL:
        tprint(
            "Large numbers of unique values may take significant time to analyze, depending on their length, your system and settings"
        )
    tprint("Would you like to search for possible errors in the data?")
    if cli.ask_yes(True):
        cli.cls(False)
        #doing a lot w/ generator
        guesses = consolidate_guesses(guess_linkages(u))
        if guesses:
            review_guesses(df, col, guesses)
            tprint("All automatic revisions reviewed")
        else:
            tprint("Our review did not detect any likely errors")
    cli.cls()

    ######################
    #DO THE MANUAL VERSION
    print("Current unique values in %s:" % col)
    show_values(df, col)
    cli.cls()
    print("Would you like to further clean or modify the values in %s?" %
          col)
    if cli.ask_yes(True):
        #make previous update w/ confirmed automatic fixes...
        # ^ this will take some work
        previous = []
        fix_manual(df, col, previous)
def switch_data(df, col, g, c, correct):
    '''
    Offer to replace each value of g, except the one at position c, with
    correct throughout df[col].

    df, col: DataFrame and column whose matching cells are overwritten.
    g: indexable group of candidate values (concatenated into the prompts,
        so they are expected to be strings).
    c: position in g to skip; a position that never occurs (e.g. -1) means
        every value is offered.
    correct: replacement value.

    Any exception raised while prompting or assigning is caught and
    reported through tprint. Returns None.
    '''
    try:
        for position, candidate in enumerate(g):
            if position == c:
                continue
            tprint("Would you like to convert " + candidate + " to " +
                   correct + "?")
            if not cli.ask_yes(True):
                continue
            matches = df[col] == candidate
            df.loc[matches, [col]] = correct
            tprint("Converting " + candidate + " -> " + correct, 0.5)
    except Exception:
        tprint("Error converting data. Data not converted")
def start_record(df, col, default=True):
    '''
    Offer to create a new organization record and collect its name.

    df, col: passed through to add_new_fixes.
    default: passed to cli.ask_yes for the initial question.

    If the user declines, the screen is cleared and nothing else happens.
    Otherwise an upper-cased name is requested until confirmed, turned into
    a "<NAME>.txt" file name (spaces become underscores) and handed to
    add_new_fixes. Returns None.
    '''
    print("Would you like to create a new organization?")
    wants_new = cli.ask_yes(default)
    print()
    if not wants_new:
        #cli.ask_continue(False)
        cli.cls(verbose=False)
        return

    while True:
        name = input("Type organization name or abbreviation:\t")
        name = name.upper().strip()
        print()
        print("Do you want to name this record: %s ?" % name)
        confirmed = cli.ask_yes(True)
        cli.cls(verbose=False)
        if confirmed:
            break

    fname = name.replace(" ", "_") + ".txt"
    #maybe put failsafe guess here... or have option to merge records later
    #^ probably latter tbh.
    # NOTE(review): fname is passed as add_new_fixes' third argument, which
    # that function iterates as a list of previous fixes -- confirm intent.
    add_new_fixes(df, col, fname)
def fix_manual(df, col, previous=None):
    '''
    Repeatedly collect and apply manual value replacements to df[col].

    Each pass asks whether to load a saved record of replacements
    (choose_record) or to enter new ones (add_new_fixes; also used when the
    loaded record is empty), applies the resulting old -> new mapping to the
    column, shows the updated values and asks whether to continue.

    df: DataFrame, modified in place.
    col: name of the column to fix.
    previous: optional list of earlier fixes, passed through to
        choose_record / add_new_fixes. Defaults to an empty list.
    Returns None.
    '''
    # None instead of a shared mutable [] default
    if previous is None:
        previous = []
    working = True
    while working:
        tprint("Would you like to load a record of previously used changes?")
        if cli.ask_yes(True):
            fixes = choose_record(df, col, previous)  #REDO/RENAME
            if not fixes:
                fixes = add_new_fixes(df, col, previous)
        else:
            cli.cls()
            fixes = add_new_fixes(df, col, previous)
        print("Applying fixes")
        # Applied one at a time and in order, so a later fix can act on the
        # result of an earlier one.
        for old, new in fixes.items():
            df.loc[df[col] == old, [col]] = new
        tprint("Fixes applied")
        cli.cls()
        show_values(df, col)
        tprint("Would you like to further modify the values in %s?" % col)
        working = cli.ask_yes(True)
        cli.cls()
def choose_col(df, working):
    '''
    Prompt for the 1-based number of a column to clean.

    df: DataFrame whose columns are listed (via columns.fix_current_columns
        with list_only=True) and chosen from.
    working: current "keep going" flag; returned unchanged when a column is
        picked, otherwise replaced by the user's answer to whether to
        continue cleaning.

    Returns (col_name, working); col_name is "" when nothing valid was
    selected.
    '''
    #really ugly fix in columns atm
    columns.fix_current_columns(df, list_only=True)
    print()
    col = input(
        "Choose the number of a column to clean (1, 2, 3...) or press Enter: ")
    try:
        i = int(col.strip()) - 1
        # 0 or a negative number would index from the end of df.columns
        # and silently select the wrong column.
        if not 0 <= i < len(df.columns):
            raise IndexError("column number out of range")
        #maybe should do something w/ the name...
        col_name = df.columns[i]
    except (ValueError, IndexError):
        col_name = ""
        tprint("No column selected")
        tprint("Would you like to continue cleaning columns?")
        working = cli.ask_yes(True)
    return col_name, working
def add_age(df):
    '''
    Offer to add an "age" column computed from "date" and "birthdate".

    df: DataFrame with "date" and "birthdate" columns; _convert_dates(df) is
        called first and must return a truthy value for ages to be added.
        The column values are expected to expose .year/.month/.day after
        that call.

    Age is the number of completed years at "date": the year difference,
    minus one when the birthday has not yet occurred in that year.
    Returns None; df gains an "age" column on success.
    '''
    print("Would you like to calculate participants' ages?")
    if not cli.ask_yes(True):
        return
    print("Adding ages")
    if _convert_dates(df):
        # A bare year difference over-counts by one for anyone whose
        # birthday falls later in the year than "date"; the tuple comparison
        # yields True (== 1) exactly in that case.
        df["age"] = [
            seen.year - born.year -
            ((seen.month, seen.day) < (born.month, born.day))
            for seen, born in zip(df["date"], df["birthdate"])
        ]
        print("Ages added")
        print()
        #May want try/except and prompt to continue
    else:
        print("Error converting dates. Ages not calculated")
        print()
def give_new_names(df):
    '''
    Interactively name every column of df.

    For each column the default name "column_<n>" and its first values
    (show_first_values) are shown. The user may type a name, which is
    lower-cased with spaces turned into underscores and must be confirmed,
    or fall back to the default name via ask_default. Exactly one name is
    recorded per column before moving on.

    df: DataFrame whose columns attribute is replaced with the chosen names.
    Returns None.
    '''
    names = []
    num_cols = len(df.columns)
    for i in range(num_cols):
        default_name = "column_" + str(i + 1)
        print("Column " + str(i + 1) + " of " + str(num_cols) +
              " is currently named: " + default_name)
        show_first_values(df, i)
        chosen = False
        while not chosen:
            name = input(
                "Enter a new name for this column or press Enter to keep the default: "
            )
            name = name.strip()
            if not name:
                chosen = ask_default(names, default_name, True)
                continue
            com_name = name.lower().replace(" ", "_")
            tprint("Modifying input for compatibility: " + name + " -> " +
                   com_name)
            tprint("Would you like to name this column " + com_name + " ?")
            if not cli.ask_yes(True):
                chosen = ask_default(names, default_name, False)
                continue
            if com_name in names:
                # Stay in the loop: leaving without appending would make
                # names shorter than the number of columns.
                print(
                    "Name already used for a column. Please choose another"
                )
                continue
            tprint("Setting column name")
            # store the compatible form the user just confirmed
            names.append(com_name)
            chosen = True
        cli.cls()
    #setting the new names
    tprint("Adding column names")
    df.columns = names
    cli.cls()
def review_guesses(df, col, guesses):
    '''
    Walk through groups of similar values and offer to reconcile each one.

    df, col: DataFrame and column the values come from.
    guesses: iterable of groups; each group is a sequence of similar string
        values.

    Every group is listed with the number of rows holding each value. Groups
    with at most one value still present in the data are skipped; for the
    rest the user may hand the group to fix_guess. Returns None.
    '''
    #need to think about consolidating if there are a bunch of similar
    #build so g can contain 2+ values
    for g in guesses:
        print("Similar Value (Number in data):")
        #exists is a p-jank solution for not having consolidate_guesses...
        exists = 0
        for number, value in enumerate(g, start=1):
            occurrences = len(df[df[col] == value])
            if occurrences:
                exists += 1
            print("\t" + str(number) + ") " + value + "\t(%d)" % occurrences)
        if exists > 1:
            tprint("Would you like to update one or more of these values?")
            if cli.ask_yes(False):
                fix_guess(g, df, col)
            cli.cls(True)
        else:
            # nothing to reconcile: fewer than two of the values remain
            cli.cls(False)
def choose_record(df, col, previous):
    '''
    Let the user pick a saved record from the "data" directory and load it.

    Records are "<name>.txt" files whose lines have the form
    "old///new". Lines that do not split into exactly two parts are
    skipped with a notice.

    df, col: passed to show_values, which displays the record names.
    previous: unused here; kept for interface compatibility.

    Returns a dict of old -> new. The dict is empty when there are no
    records, or when loading failed and the user chose to start a new
    record instead.
    '''
    # TODO: the earlier organization-selection flow (suggesting a likely
    # match for a mistyped record name) was disabled; reinstate if needed.
    suffix = ".txt"
    rv = {}
    chosen = False
    while not chosen:
        #REDO STRINGS/SELECTION
        try:
            found = os.listdir("data")
        except FileNotFoundError:
            found = []
        # Slice the suffix off: str.strip(".txt") removes any leading or
        # trailing '.', 't' and 'x' characters and mangles names such as
        # "next.txt".
        records = [n[:-len(suffix)] for n in found if n.endswith(suffix)]
        if not records:
            # show_values would fall back to listing the column's own
            # values if handed an empty list, so stop here instead.
            tprint("No saved records found")
            return rv

        org = show_values(df, col, output=True, values=records)
        rows = None
        if org is not None:
            try:
                with open("data/" + org + suffix) as f:
                    rows = f.readlines()
            except FileNotFoundError:
                rows = None
        if rows is None:
            tprint("Error loading record")
            tprint("Would you like to start a new record?")
            if cli.ask_yes(True):
                chosen = True
            continue

        for row in rows:
            try:
                old, fix = row.split("///")
            except ValueError:
                #This may hurt abstraction too much as is
                tprint("Bypassing incorrectly formatted data")
                continue
            rv[old] = fix.strip()
        chosen = True
        tprint(org + " data loaded")
    return rv
def add_new_fixes(df, col, previous):
    '''
    Interactively build a mapping of value replacements for df[col].

    For each replacement the user selects an existing value (select_value)
    and either another existing value or a typed custom value to replace it
    with, then confirms. After the last replacement the mapping can be
    saved to "data/<name>.txt", one "old///new" pair per line.

    df, col: DataFrame and column the values are selected from.
    previous: earlier accepted suggestions; when non-empty the user may add
        them to the saved record. Each entry p is stored as
        fixes[p[1]] = p[0] (presumably (new, old) pairs -- confirm against
        the producer).

    Returns the dict of old -> new replacements (including any entries
    merged in from previous).
    '''
    fixes = {}
    while True:
        #MAKE A FUNCTION FROM A BUNCH OF THIS SO CAN USE WITH EXISTING...
        tprint("Choose a value to replace")
        old = select_value(df, col)
        tprint(
            "Would you like to choose another existing value to replace: %s ?"
            % old)
        print(
            "(Otherwise you will be prompted to enter a custom replacement value)"
        )
        if cli.ask_yes(True):
            cli.cls(False)
            tprint("Choose a value to replace '%s'" % old)
            new = select_value(df, col)
        else:
            chosen = False
            while not chosen:
                new = input("Enter custom value to replace %s:\t" % old)
                if new:
                    tprint("Use %s ?" % new)
                    chosen = cli.ask_yes(True)
        cli.cls(verbose=False)

        if old and new:
            tprint("You chose: " + old + " -> " + new)
            tprint("Confirm this replacement?")
            if cli.ask_yes(True):
                tprint("Confirmed")
                fixes[old] = new
        cli.cls()

        if fixes:
            print("Your chosen replacements:")
            tprint("\tCurrent\tReplaced")
            for current, replaced in sorted(fixes.items()):
                print("\t" + current + "\t" + replaced)

        tprint("Would you like to add another replacement?")
        if cli.ask_yes(True):
            cli.cls()
            continue
        break

    tprint(
        "Would you like to save a record of these replacements for future use?"
    )
    if cli.ask_yes(True):
        if previous:
            tprint(
                "Would you like to include the changes you selected from our suggestions in this record?"
            )
            if cli.ask_yes():
                for p in previous:
                    fixes[p[1]] = p[0]
        cli.cls()

        name = ""
        named = False
        while not named:
            name = input("Choose a name for this record:\t")
            name = name.lower().strip()
            if not name:
                # an empty name would create the file "data/.txt"
                continue
            tprint("Do you want to name this record: %s ?" % name)
            named = cli.ask_yes(True)
        cli.cls(verbose=False)

        # Sort at write time so saving works even when no fix was confirmed,
        # and join so that only the separators between lines are newlines.
        lines = [
            current + '///' + replaced
            for current, replaced in sorted(fixes.items())
        ]
        with open("data/" + name + ".txt", 'w') as f:
            f.write("\n".join(lines))
    return fixes