Esempio n. 1
0
def modify_column_names(df):
    '''
    Interactively rename columns of df until the user declines.

    On every pass df.columns is reassigned from fix_current_columns(df),
    then the user is asked whether a column should be renamed. If so, a
    1-based column number and a new name are read from stdin; the name is
    passed through make_compatible and, once confirmed, applied with
    change_one_column.

    df: object exposing a .columns attribute (presumably a pandas
        DataFrame -- confirm against callers). Modified in place.

    Returns None. Any error while renaming is reported to the user and the
    loop continues.
    '''
    modding = True
    while modding:
        #THIS NEEDS TO BECOME A FXN CALL... SO CAN USE W/ named = False
        df.columns = fix_current_columns(df)
        tprint("Would you like to modify a column's name?")
        modding = cli.ask_yes(False)
        if modding:
            print()
            col = input(
                "Choose the number of the column to modify (1, 2, 3...): ")
            try:
                i = int(col.strip()) - 1
                # Reject 0 and negative numbers explicitly: a negative index
                # would otherwise silently pick a column from the end.
                if not 0 <= i < len(df.columns):
                    raise IndexError("column number out of range")
                cli.cls(verbose=False)
                tprint("Renaming column: " + df.columns[i])
                print()
                new = input("Type new name: ").strip()
                new = make_compatible(new)
                if new:
                    tprint("Converting name for compatibility")
                    tprint("Would you like to rename " + df.columns[i] +
                           " to " + new + " ?")
                    if cli.ask_yes(True):
                        change_one_column(new, df, i)
                else:
                    tprint("No new column name detected")
            except Exception:
                # Deliberate best-effort: report the failure and keep the
                # interactive session alive.
                tprint("Error renaming column")
            cli.cls()
Esempio n. 2
0
def show_values(df, col, output=False, values=None):
    '''
    Print values ten per page, optionally letting the user pick one.

    df: DataFrame-like object; only read when values is empty.
    col: column label used to look up the unique values of df[col].
    output: when false, values are only displayed and the user is asked
        between pages whether to show more. When true, the user is
        prompted after each page to type the number of a value.
    values: optional sequence to display instead of the sorted unique
        values of df[col].

    Returns the selected value converted to str when output is true,
    otherwise None.
    '''
    if not values:
        values = sorted(df[col].unique())
    total = len(values)
    pages = math.ceil(total / 10)
    print("Current unique values: (Page 1 of %d)" % pages)
    for i, v in enumerate(values, 1):
        # str(v) so that non-string values (numbers, dates) can be listed.
        print("\t" + str(i) + ") " + str(v))
        page_full = i % 10 == 0
        more = i < total
        if not output:
            if page_full and more:
                tprint("Show more values?")
                if cli.ask_yes(True):
                    cli.cls(False)
                    print("Current unique values: (Page %d of %d)" %
                          (i // 10 + 1, pages))
                else:
                    break
        elif page_full or not more:
            print()
            # Keep prompting until a valid number is typed or, when more
            # pages exist, the user asks to see the next one.
            while True:
                if more:
                    c = input(
                        "Type the number of a value (1, 2...) or press Enter to view more values: "
                    )
                else:
                    c = input(
                        "Type the number of a value to select it (1, 2, 3...):  "
                    )
                try:
                    c = int(c)
                    # 0 and negative numbers would index from the end of
                    # the list, so treat them as invalid selections.
                    if not 1 <= c <= total:
                        raise IndexError("selection out of range")
                    return str(values[c - 1])
                except (ValueError, IndexError):
                    tprint("No value selected")
                    if more:
                        tprint("View more values?")
                        if cli.ask_yes(True):
                            cli.cls(False)
                            print(
                                "Current unique values: (Page %d of %d)" %
                                (i // 10 + 1, pages))
                            break
Esempio n. 3
0
def fix_guess(g, df, col):
    '''
    Ask which value in g is correct and offer to convert the others to it.

    The user types the 1-based number of the correct value in g. Any other
    input (including Enter) switches to a prompt for a custom replacement
    value, which can be cancelled by pressing Enter again.

    g: indexable sequence of candidate values.
    df, col: passed through to switch_data, which performs the conversion.

    Returns None.
    '''
    #BUILD SOMETHING TO SAVE THESE CHANGES
    #NEEDS RETURNS HERE AND IN SWITCH DATA
    print()
    c = input(
        "Choose the number of the correct value (1, 2...) or press Enter to use a custom value:  "
    )
    try:
        c = int(c.strip()) - 1
        # 0 or a negative number would index g from the end; treat it as
        # "no valid choice" instead.
        if c < 0:
            raise IndexError("choice out of range")
        correct = g[c]
    except (ValueError, IndexError):
        while True:
            custom = input(
                "Enter a new correct value or press Enter to cancel:  ")
            custom = custom.strip()
            if not custom:
                break
            tprint("Use %s ?" % custom)
            if cli.ask_yes(True):
                switch_data(df, col, g, -1, custom)
                #-1 is jank fix for how switch_data currently works
                break
    else:
        switch_data(df, col, g, c, correct)
Esempio n. 4
0
def drop_na(df, col):
    '''
    Offer to remove the rows of df that have no value in col.

    df: pandas DataFrame, modified in place when the user agrees.
    col: label of the column to check for missing values.

    Returns None.
    '''
    tprint(
        "Would you like to remove the rows that do not contain a value for: %s?"
        % col)
    if cli.ask_yes(True):
        # dropna returns a new frame by default; inplace=True is needed for
        # the caller's df to actually lose the rows.
        df.dropna(subset=[col], inplace=True)
Esempio n. 5
0
def clean_numeric(df, col):
    '''
    Offer to show df[col].describe() to the user, then call cli.cls().

    df: object supporting df[col].describe() (presumably a pandas
        DataFrame).
    col: label of the column to summarise.

    Returns None.
    '''
    tprint(
        "Would you like to see summary statistics for the data in this column?"
    )
    wants_summary = cli.ask_yes(True)
    if wants_summary:
        summary = df[col].describe()
        tprint(summary)
    cli.cls()
Esempio n. 6
0
def ask_default(names, default_name, default=True):
    '''
    Ask whether the default column name should be used.

    names: list that default_name is appended to when the user agrees.
    default_name: the name to append.
    default: value forwarded to cli.ask_yes.

    Returns True when default_name was appended, otherwise False.
    '''
    tprint("Would you like to use the default name?")
    if not cli.ask_yes(default):
        return False
    names.append(default_name)
    return True
Esempio n. 7
0
def ask_convert(df, col):
    '''
    Ask whether a text column is meant to hold dates or numeric data.

    df, col: accepted but not used by this function.

    Returns True when the user answers yes, otherwise False.
    '''
    print("This column seems to contain text data")
    tprint("Is this column supposed to contain dates or numeric data?")
    answer = cli.ask_yes(False)
    #cli.cls()
    return bool(answer)
Esempio n. 8
0
def clean_strings(df, col):
    '''
    Normalise whitespace in df[col], then walk the user through cleaning.

    Steps, in order:
      1. Tabs become spaces, runs of spaces collapse to one, " ," becomes
         ",", ",," becomes ",", and surrounding whitespace is stripped.
      2. The number of unique values is reported, with a warning above
         WARN_LEVEL.
      3. Optionally, guess_linkages/consolidate_guesses look for likely
         errors and review_guesses lets the user act on them.
      4. The current values are listed with show_values and the user may
         continue with fix_manual.

    df: pandas DataFrame, modified in place.
    col: label of a string column in df.

    Returns None.
    '''
    tprint("Removing excess white space from values in %s" % col)
    # Tabs are converted first so the space-collapsing pass also sees them;
    # doing it afterwards could reintroduce doubled spaces.
    cleaned = df[col].str.replace("\t", " ", regex=False)
    # Collapse runs of any length, not just a single pair per pass.
    cleaned = cleaned.str.replace(r" {2,}", " ", regex=True)
    # regex is passed explicitly because its default differs between
    # pandas versions.
    cleaned = cleaned.str.replace(" ,", ",", regex=False)
    cleaned = cleaned.str.replace(",,", ",", regex=False)
    df[col] = cleaned.str.strip()
    cli.cls()
    u = df[col].unique()
    num_unique = len(u)
    print("Column " + col + " contains " + str(num_unique) + " unique values")
    if num_unique > WARN_LEVEL:
        tprint(
            "Large numbers of unique values may take significant time to analyze, depending on their length, your system and settings"
        )
    tprint("Would you like to search for possible errors in the data?")
    guesses = []
    if cli.ask_yes(True):
        cli.cls(False)
        guesses = consolidate_guesses(
            guess_linkages(u))  #doing a lot w/ generator
        if guesses:
            review_guesses(df, col, guesses)
            tprint("All automatic revisions reviewed")
        else:
            tprint("Our review did not detect any likely errors")
    cli.cls()
    ######################
    #DO THE MANUAL VERSION
    print("Current unique values in %s:" % col)
    show_values(df, col)
    cli.cls()
    print("Would you like to further clean or modify the values in %s?" % col)
    if cli.ask_yes(True):
        previous = []  #make previous update w/ confirmed automatic fixes...
        # ^ this will take some work
        fix_manual(df, col, previous)
Esempio n. 9
0
def switch_data(df, col, g, c, correct):
    '''
    Offer to replace each value in g, except the one at index c, by correct.

    df: DataFrame updated in place via df.loc.
    col: label of the column whose values are replaced.
    g: sequence of string values to consider.
    c: index in g to skip (callers pass -1 to skip nothing).
    correct: replacement string.

    Returns None. Any exception is caught and reported to the user.
    '''
    try:
        for position, candidate in enumerate(g):
            if position == c:
                continue
            tprint("Would you like to convert " + candidate + " to " +
                   correct + "?")
            if not cli.ask_yes(True):
                continue
            df.loc[df[col] == candidate, [col]] = correct
            tprint("Converting " + candidate + " -> " + correct, 0.5)
    except Exception:
        tprint("Error converting data. Data not converted")
Esempio n. 10
0
def start_record(df, col, default=True):
    '''
    Ask for a new organization name and hand off to add_new_fixes.

    df, col: forwarded to add_new_fixes.
    default: value forwarded to cli.ask_yes for the first question.

    Returns None, whether or not a record is started.
    '''
    print("Would you like to create a new organization?")
    wants_new = cli.ask_yes(default)
    print()
    if not wants_new:
        #cli.ask_continue(False)
        cli.cls(verbose=False)
        return
    # Keep asking until the user confirms the typed name.
    while True:
        name = input("Type organization name or abbreviation:\t")
        name = name.upper().strip()
        print()
        print("Do you want to name this record: %s ?" % name)
        confirmed = cli.ask_yes(True)
        cli.cls(verbose=False)
        if confirmed:
            break
    fname = name.replace(" ", "_") + ".txt"
    #maybe put failsafe guess here... or have option to merge records later
    #^ probably latter tbh.
    # NOTE(review): fname is passed as the third argument, which
    # add_new_fixes names "previous" -- confirm this is intended.
    add_new_fixes(df, col, fname)
Esempio n. 11
0
def fix_manual(df, col, previous=None):
    '''
    Repeatedly collect replacement fixes and apply them to df[col].

    Each pass either loads a saved record with choose_record or builds new
    fixes with add_new_fixes (also used as the fallback when no record is
    loaded). The resulting old -> new mapping is applied to df[col], the
    values are displayed, and the user decides whether to go again.

    df: pandas DataFrame, modified in place.
    col: label of the column being cleaned.
    previous: optional collection of earlier fixes, forwarded to
        choose_record and add_new_fixes. Defaults to an empty list.

    Returns None.
    '''
    # A fresh list per call avoids sharing one mutable default object
    # between invocations.
    if previous is None:
        previous = []
    working = True
    while working:
        tprint("Would you like to load a record of previously used changes?")
        if cli.ask_yes(True):
            fixes = choose_record(df, col, previous)  #REDO/RENAME
            if not fixes:
                fixes = add_new_fixes(df, col, previous)
        else:
            cli.cls()
            fixes = add_new_fixes(df, col, previous)
        print("Applying fixes")
        # Applied one at a time, in mapping order, so a later fix sees the
        # result of an earlier one.
        for old, new in fixes.items():
            df.loc[df[col] == old, [col]] = new
        tprint("Fixes applied")
        cli.cls()
        show_values(df, col)
        tprint("Would you like to further modify the values in %s?" % col)
        working = cli.ask_yes(True)
        cli.cls()
Esempio n. 12
0
def choose_col(df, working):
    '''
    Prompt for the 1-based number of a column to clean.

    df: object exposing .columns (presumably a pandas DataFrame).
    working: current "keep cleaning" flag; returned unchanged when a
        column is selected, otherwise replaced by the user's answer to
        whether cleaning should continue.

    Returns (col_name, working). col_name is "" when no valid column was
    chosen.
    '''
    columns.fix_current_columns(
        df, list_only=True)  #really ugly fix in columns atm
    print()
    col = input(
        "Choose the number of a column to clean (1, 2, 3...) or press Enter: ")
    try:
        i = int(col.strip()) - 1  #maybe should do something w/ the name...
        # 0 or a negative number would silently select a column counted
        # from the end; treat it as no selection.
        if i < 0:
            raise IndexError("column number out of range")
        col_name = df.columns[i]
    except (ValueError, IndexError):
        col_name = ""
        tprint("No column selected")
        tprint("Would you like to continue cleaning columns?")
        working = cli.ask_yes(True)
    return col_name, working
Esempio n. 13
0
def add_age(df):
    '''
    Offer to add an "age" column computed from "date" and "birthdate".

    After the user agrees, _convert_dates(df) is called; when it reports
    success the age in whole years on "date" is stored in df["age"].

    df: pandas DataFrame with "date" and "birthdate" columns, modified in
        place. Values are assumed to expose .year/.month/.day after
        _convert_dates -- confirm against that helper.

    Returns None.
    '''
    print("Would you like to calculate participants' ages?")
    if not cli.ask_yes(True):
        return
    print("Adding ages")
    if _convert_dates(df):
        ages = []
        for when, born in zip(df["date"], df["birthdate"]):
            years = when.year - born.year
            # A plain year difference overstates the age by one when the
            # birthday has not yet occurred in the year of "date".
            if (when.month, when.day) < (born.month, born.day):
                years -= 1
            ages.append(years)
        df["age"] = ages
        print("Ages added")
        print()
        #May want try/except and prompt to continue
    else:
        print("Error converting dates. Ages not calculated")
        print()
Esempio n. 14
0
def give_new_names(df):
    '''
    Interactively choose a name for every column of df.

    For each column the user may type a name (lower-cased, spaces replaced
    by underscores) or keep the default "column_<n>". A typed name that is
    already in use is rejected and the user is asked again, so exactly one
    name is collected per column before df.columns is assigned.

    df: object exposing .columns (presumably a pandas DataFrame), modified
        in place.

    Returns None.
    '''
    names = []
    num_cols = len(df.columns)
    for i in range(num_cols):
        default_name = "column_" + str(i + 1)
        print("Column " + str(i + 1) + " of " + str(num_cols) +
              " is currently named: " + default_name)
        show_first_values(df, i)
        chosen = False
        while not chosen:
            name = input(
                "Enter a new name for this column or press Enter to keep the default: "
            )
            name = name.strip()
            if not name:
                chosen = ask_default(names, default_name, True)
                continue
            com_name = name.lower().replace(" ", "_")
            tprint("Modifying input for compatibility: " + name + " -> " +
                   com_name)
            tprint("Would you like to name this column " + com_name + " ?")
            if not cli.ask_yes(True):
                chosen = ask_default(names, default_name, False)
            elif com_name in names:
                # Stay in the loop: leaving without appending would make
                # names shorter than the number of columns.
                print(
                    "Name already used for a column. Please choose another"
                )
            else:
                tprint("Setting column name")
                # Store the compatible form that was shown to the user.
                names.append(com_name)
                chosen = True
        cli.cls()
    #setting the new names
    tprint("Adding column names")
    df.columns = names
    cli.cls()
Esempio n. 15
0
def review_guesses(df, col, guesses):
    '''
    Show each group of similar values and offer to fix it.

    For every group in guesses the values are listed with the number of
    rows in df[col] equal to each one. Groups where at most one value is
    actually present are skipped; otherwise the user may call fix_guess on
    the group.

    df: DataFrame being cleaned.
    col: label of the column the guesses refer to.
    guesses: iterable of groups of string values.

    Returns None.
    '''
    #need to think about consolidating if there are a bunch of similar
    #build so g can contain 2+ values
    for g in guesses:
        print("Similar Value (Number in data):")
        #present is a p-jank solution for not having consolidate_guesses...
        present = 0
        for position, v in enumerate(g, 1):
            count = len(df[df[col] == v])
            if count:
                present += 1
            print("\t" + str(position) + ") " + v + "\t(%d)" % count)
        if present <= 1:
            cli.cls(False)
            continue
        tprint("Would you like to update one or more of these values?")
        if cli.ask_yes(False):
            fix_guess(g, df, col)
        cli.cls(True)
Esempio n. 16
0
def choose_record(df, col, previous):
    '''
    Let the user pick a saved record from the "data" directory and load it.

    Record files are named "<name>.txt"; each line holds one replacement
    in the form "old///new". Lines that do not split into exactly two
    parts are skipped with a notice.

    df, col: forwarded to show_values for the selection prompt.
    previous: accepted for interface compatibility; not used here.

    Returns a dict of old -> new. The dict is empty when no record files
    exist or when loading fails and the user opts to start a new record.
    '''
    rv = {}
    while True:
        try:
            listing = os.listdir("data")
        except FileNotFoundError:
            listing = []
        # splitext drops only the extension; str.strip(".txt") would also
        # eat leading/trailing '.', 't' and 'x' characters of the name.
        records = [
            os.path.splitext(n)[0] for n in listing if n.endswith(".txt")
        ]
        if not records:
            # With an empty list show_values would fall back to listing the
            # values of df[col], which are not record names.
            tprint("No saved records found")
            return rv
        org = show_values(df, col, output=True, values=records)
        try:
            with open("data/" + org + ".txt") as f:
                data = f.readlines()
        except FileNotFoundError:
            tprint("Error loading record")
            tprint("Would you like to start a new record?")
            if cli.ask_yes(True):
                return rv
            continue
        for row in data:
            try:
                old, fix = row.split("///")
            except ValueError:
                #This may hurt abstraction too much as is
                tprint("Bypassing incorrectly formatted data")
                continue
            rv[old] = fix.strip()
        tprint(org + " data loaded")
        return rv
Esempio n. 17
0
def add_new_fixes(df, col, previous):
    '''
    Interactively build a mapping of replacements for values in df[col].

    The user picks a value to replace with select_value, then either picks
    another existing value or types a custom replacement. Confirmed pairs
    accumulate in a dict. Once at least one pair exists the user may add
    more, and may save the pairs to "data/<name>.txt" with one
    "old///new" entry per line.

    df, col: forwarded to select_value.
    previous: collection of earlier fixes; when non-empty the user may
        merge them into the saved record. Each item is indexed as
        p[0] / p[1] and stored as fixes[p[1]] = p[0].

    Returns the dict of old -> new replacements (including any merged
    previous fixes).
    '''
    finished = False
    fixes = {}
    while not finished:
        #MAKE A FUNCTION FROM A BUNCH OF THIS SO CAN USE WITH EXISTING...
        tprint("Choose a value to replace")
        old = select_value(df, col)
        tprint(
            "Would you like to choose another existing value to replace: %s ?"
            % old)
        print(
            "(Otherwise you will be prompted to enter a custom replacement value)"
        )
        if cli.ask_yes(True):
            cli.cls(False)
            tprint("Choose a value to replace '%s'" % old)
            new = select_value(df, col)
        else:
            chosen = False
            while not chosen:
                new = input("Enter custom value to replace %s:\t" % old)
                if new:
                    tprint("Use %s ?" % new)
                    chosen = cli.ask_yes(True)
        cli.cls(verbose=False)
        if old and new:
            tprint("You chose: " + old + " -> " + new)
            tprint("Confirm this replacement?")
            if cli.ask_yes(True):
                tprint("Confirmed")
                fixes[old] = new
            cli.cls()
        if fixes:
            print("Your chosen replacements:")
            tprint("\tCurrent\tReplaced")
            sort_fixes = sorted(fixes.items())
            for current, replacement in sort_fixes:
                print("\t" + current + "\t" + replacement)
            tprint("Would you like to add another replacement?")
            if cli.ask_yes(True):
                cli.cls()
                continue  #*Slightly* jank
            tprint(
                "Would you like to save a record of these replacements for future use?"
            )
            if cli.ask_yes(True):
                if previous:
                    tprint(
                        "Would you like to include the changes you selected from our suggestions in this record?"
                    )
                    if cli.ask_yes():
                        for p in previous:
                            fixes[p[1]] = p[0]
                        sort_fixes = sorted(fixes.items())
                cli.cls()
                named = False
                while not named:
                    name = input("Choose a name for this record:\t")
                    name = name.lower().strip()
                    if not name:
                        # An empty name would create the file "data/.txt".
                        continue
                    tprint("Do you want to name this record:  %s  ?" % name)
                    named = cli.ask_yes(True)
                    cli.cls(verbose=False)
                with open("data/" + name + ".txt", 'w') as f:
                    # One newline-terminated entry per line. The previous
                    # "skip the last newline" test compared a str with a
                    # tuple and was always true, so this is the format the
                    # file already had.
                    f.writelines(current + '///' + replacement + "\n"
                                 for current, replacement in sort_fixes)
            finished = True
    return fixes