Example #1
0
def convert_xCalibre_matrix_for_PCI(matrix_file,
                                    corresponding_control_file=False,
                                    id_length=8,
                                    include_id=False):
    """Convert a fixed-width xCalibre score matrix into comma-separated CSVs.

    Writes <stem>__c.csv (one comma-joined character per response, optionally
    headed by item IDs from column 0 of the control file) and <stem>_T_c.csv
    (its transpose), then returns the re-transposed DataFrame.

    :param matrix_file: path to the matrix file; each line is an
        id_length-character examinee ID followed by one character per response.
    :param corresponding_control_file: optional CSV whose first column holds
        the item IDs used as the header row.
    :param id_length: number of leading characters forming the examinee ID.
    :param include_id: when True, keep the ID prefix on each output row.
    """
    #similar file in h_stats
    ret = []
    first_row = ""
    if corresponding_control_file:
        df = pd.read_csv(corresponding_control_file, header=None)
        item_ids = df.loc[:, 0]
        for id in item_ids:
            first_row += id + ","
        first_row = first_row[:-1]  # drop the trailing comma
        ret = [first_row]
    lines = hfh.get_lines(matrix_file)
    for line in lines:
        ret_line = ""
        if include_id:
            ret_line = line[:id_length]
        answer_string = line[id_length:]
        for c in answer_string:
            ret_line += c + ','
        # Strips the trailing ",\n," produced when the source line ends with
        # '\n'. NOTE(review): a final line WITHOUT a trailing newline would
        # lose its last response character here — confirm every input line is
        # newline-terminated.
        ret_line = ret_line[:-3]
        ret_line += '\n'
        ret.append(ret_line)
    name = hfh.get_stem(matrix_file) + "__c.csv"
    hfh.write_lines_to_text(ret, name)
    translated_name = hfh.get_stem(matrix_file) + "_T_c.csv"
    df = pd.read_csv(name, header=None)
    # NOTE(review): debug artifact — writes a copy into the working directory.
    df.to_csv("pickme.csv")
    df = df.T
    df.to_csv(translated_name, index=False)
    return df.T
def process_paired_files(pe_file, data_file, project_folder, select_Domain=False):
    """Build a default _c.csv control file from a PE item file and convert its
    paired data file to iteman format.

    :param pe_file: item/form file (xlsx or delimited) with Position, AccNum,
        CorrectAnswer and Domain columns.
    :param data_file: matching response data file.
    :param project_folder: project root; output goes to <root>/processed_data/.
    :param select_Domain: when set, items in any other domain are marked
        include='N' instead of the default 'Y'.
    :returns: 1 on completion.
    """
    #   todo: add in domain processing here so that sets without domain names are still separated
    if hfh.get_extension(pe_file) == 'xlsx':
        pe_df = hfh.get_df_from_xlsx(pe_file)
    else:
        pe_df = hfh.get_df(pe_file, header=0)

    pe_df.Position = pe_df.Position.astype(float)
    pe_df = pe_df.sort_values("Position", ascending=True)

    pe_df = pe_df[["AccNum", 'CorrectAnswer', 'Domain']]
    #pe_df.Domain = pe_df.Domain.apply(str)
    #pe_df["AccNum"] = str(pe_df["AccNum"]) + "_" + pe_df["Domain"]

    # Default control values: 4 options, group 1, included, multiple choice.
    pe_df['number_of_options'] = 4
    pe_df['group'] = 1
    pe_df["include"] = 'Y'
    pe_df['type'] = 'M'

    # BUG FIX: the 'N' marks used to be applied BEFORE the blanket
    # pe_df["include"] = 'Y' assignment, which silently discarded the domain
    # filter. Apply the exclusion after the default is set.
    if select_Domain:
        pe_df.loc[pe_df.Domain != select_Domain, 'include'] = 'N'

    pe_df = pe_df.drop(['Domain'], axis=1)

    pe_df = pe_df[['AccNum', 'CorrectAnswer', 'number_of_options', 'group', 'include', 'type']]
    processed_path = project_folder + "/processed_data/"
    c_path = processed_path + hfh.get_stem(pe_file) + "_c.csv"
    pe_df.to_csv(c_path, header=False, index=False)
    h.convert_default_data_to_iteman(data_file, processed_path, new_name=hfh.get_stem(pe_file))
    return 1
def create_c_from_LXR_Test(file_path, destination_path=None):
    """Build a default _c.csv control file from an LXR test printout.

    Item lines start with a digit ("<n>. <subject> <number> ...") and the
    answer key is read from the following line's "Key: " marker. Each record
    is [bank_id, key, '4', '1', 'Y', 'M'].

    :param destination_path: output folder; defaults to the file's parent.
    :returns: the DataFrame that was written to <stem>_c.csv.
    """
    if destination_path is None:
        destination_path = hfh.get_parent_folder(file_path)
    lines = hfh.get_lines(file_path)
    ret = []
    counter = 0
    for line in lines:
        counter += 1
        if line[0].isnumeric():
            entry = line.split()
            # Unused, but line.index('.') validates the line contains a dot.
            test_id = line[:line.index('.')]
            bank_id = entry[1] + '_' + entry[2]

            if len(entry) == 4:
                # Three name parts: subject is the first two joined.
                subject = entry[1] + "_" + entry[2]
                bank_id = subject + entry[3]
            # `counter` is already one past this line, so this is the NEXT line.
            key_line = lines[counter]
            key_i = key_line.find('Key: ')
            if key_i > -1:
                key = key_line[key_i + len("Key: ")]
            else:
                # NOTE(review): when the marker is missing, `key` keeps its
                # value from the previous item (or is unbound on the first).
                print("hello")
            record = [bank_id, key, '4', '1', 'Y', 'M']
            ret.append(record)
    df = pd.DataFrame(ret)

    name = hfh.get_stem(file_path) + "_c.csv"
    # df.sort_values(df[1])
    df.to_csv(destination_path + "/" + name, index=False, header=False)
    return df
Example #4
0
def process_karen_data(path_to_data_files,
                       path_to_test_bank_files,
                       destination_path,
                       removed_suffix="FullAdmin",
                       suffix="Test"):
    """Process Karen-format files: build _L maps from the Test files, convert
    the data files to iteman format, then merge each control file with its
    matching item-bank map.

    BUG FIXES: the removed_suffix default was misspelled ("FullAmdin" — same
    length, so only the slicing below was unaffected); the folder searches now
    use the suffix parameters instead of hard-coded strings; a redundant first
    pass that passed True as destination_path and discarded its result was
    removed.
    """
    #   todo: this relies on the name Test for the bank information. It is assumed that the data file is the same
    #   todo: as the test file with the exception of the suffix. i.e. XXX12Data and XXX12Test
    data_files = hfh.get_all_file_names_in_folder(path_to_data_files,
                                                  target_string=removed_suffix)
    test_bank_files = hfh.get_all_file_names_in_folder(path_to_test_bank_files,
                                                       target_string=suffix)

    test_bank_lists = []
    for file in test_bank_files:
        a = create_mapping_from_Karen_test_data(
            file, destination_path=destination_path, create_csv=True)
        test_bank_lists.append(a)

    for file in data_files:
        convert_iteman_format(file, destination_path=destination_path)

    for file in data_files:
        # Data stem minus the removed suffix plus `suffix` names the _L file.
        matching_control_name = destination_path + hfh.get_stem(
            file) + "_c.csv"
        matching_item_bank_name = destination_path + hfh.get_stem(
            file)[:-len(removed_suffix)] + suffix + "_L.csv"
        merge_control_and_bank_info(matching_control_name,
                                    matching_item_bank_name)
Example #5
0
def convert_2016_format(file_name, destination_path="", pretest_cutoff=False):
    """Convert a 2016-format data file into an _f.txt data file plus a
    default control file.

    Expected layout: the first line ends with the answer-key string after its
    last space; line 1 is skipped; lines 2+ are examinee responses, possibly
    carrying a trailing R/F status flag.

    :returns: True when the produced file passes is_valid_data, else False.
    """
    #   of form:
    #   PT1   PT116MAR   BB... correct
    #   answers
    #   todo: rename this

    lines = hfh.get_lines(file_name)
    # Answer key = everything after the LAST space on the first line.
    start_of_answers = lines[0].rfind(' ')
    answers = lines[0][start_of_answers + 1:]

    ret = []

    for line in lines[2:]:
        # remove R or F at end
        # NOTE(review): inspects the second-to-last character (just before
        # the newline); flagged lines lose their final 5 characters — confirm
        # the status suffix is always 4 characters plus the flag.
        last_entry = line[len(line) - 2]
        if last_entry == 'R' or last_entry == 'F':
            line = line[:-5] + '\n'
        ret.append(line)
    name = hfh.get_stem(file_name)
    new = hfh.get_stem(destination_path + "/" + name) + "_f.txt"
    hfh.write_lines_to_text(ret, new)

    if is_valid_data(new):
        convert_answers_to_default_control(name, answers, destination_path,
                                           pretest_cutoff)
        return True
    return False
Example #6
0
def convert_default_data_to_iteman(file_name,
                                   processed_data_path,
                                   new_name=False):
    """Normalize a default data file into iteman _f.txt format.

    Skips the first two (header) lines, pads every examinee ID to the
    standard 8 characters, and writes the result to
    <processed_data_path>/<stem>_f.txt (using new_name's stem when given).
    """
    # Trim a trailing slash so the join below stays single-slashed.
    if processed_data_path[-1] == '/':
        processed_data_path = processed_data_path[:-1]
    source_lines = hfh.get_lines(file_name)
    converted = [set_standard_id_length_for_line(raw, 8)
                 for raw in source_lines[2:]]

    stem_source = new_name if new_name else file_name
    out_path = processed_data_path + "/" + hfh.get_stem(stem_source) + "_f.txt"
    hfh.write_lines_to_text(converted, out_path)
def create_reports(path_to_stats, destination_path, is_cs=False):
    """Write high/low/flagged/aggregate item-report CSVs for each stats file.

    Searches path_to_stats for '_cs' files (is_cs=True) or 'Stats' files,
    loads each into a DataFrame, and writes four report CSVs per file into
    destination_path.
    """
    # todo: make this smarter. Look through for any stats files in folder or subfolders and then munch on them.
    destination_path += '/'
    target = "_cs" if is_cs else "Stats"
    files = get_all_file_names_in_folder(path_to_stats, target_string=target)

    for stats_file in files:
        if is_cs:
            frame = get_df(stats_file, header=0)
        else:
            frame = convert_stats_to_df(stats_file, destination_path)
        high = get_top_n_rpbis(frame, 20)
        low = get_low_performing_items(frame)
        flagged = get_flagged_items(frame)
        agg = get_aggregate_report_items(frame)
        # parameters = get_parameters(frame)
        base = destination_path + get_stem(stats_file) + "_c"
        high.to_csv(base + "_high_.csv", index=False)
        low.to_csv(base + "_low_.csv", index=False)
        flagged.to_csv(base + "_flagged_.csv", index=False)
        agg.to_csv(base + "_agg.csv", index=False)
def create_FINAL_from_CULLED(project_path):
    """Produce FINAL calibration files (_FINAL_f.csv / _FINAL_c.csv) by
    removing culled items from the CAL_f / CAL_c pair.

    Expects exactly one CULLED.csv under <project>/reports and exactly one
    CAL_f.csv and CAL_c.csv under <project>/calibration; prints a message and
    returns False otherwise.
    """
    culled_path = hfh.get_all_files(project_path + '/reports',
                                    target_string='CULLED.csv')
    cal_f_path = hfh.get_all_files(project_path + '/calibration',
                                   target_string='CAL_f.csv')
    cal_c_path = hfh.get_all_files(project_path + '/calibration',
                                   target_string='CAL_c.csv')
    # Product is 1 only when all three searches found exactly one file.
    if len(culled_path) * len(cal_f_path) * len(cal_c_path) == 1:
        f_df = hfh.get_df(cal_f_path[0])
        c_df = hfh.get_df(cal_c_path[0])
        #c_df.columns = ['AccNum','A','B','C','D','E']
        culled_df = hfh.get_df(culled_path[0], header=0)
    else:
        print("invalid call CULLED, CAL_f and CAL_c")
        return False
    # Transpose the response matrix so rows align with items, tag each row
    # with its AccNum from the control file, filter, then transpose back.
    ids = f_df.iloc[:, 0]
    f_df = f_df.drop(columns=[0])
    f_df = f_df.T
    f_df = f_df.reset_index()
    f_df = f_df.drop(columns='index')
    c_df.columns = ['AccNum', 'B', 'C', 'D', 'E', 'F']
    f_df.insert(0, 'AccNum', c_df['AccNum'])

    # Keep only items that survived the cull.
    c_df = c_df[c_df['AccNum'].isin(culled_df['AccNum'])]
    f_df = f_df[f_df['AccNum'].isin(culled_df['AccNum'])]
    f_df = f_df.drop(columns=['AccNum'])
    f_df = f_df.T
    f_df.insert(0, 'ID', ids)
    # Strip the trailing "_CAL_f" from the stem to rebuild the base name.
    name = hfh.get_stem(cal_f_path[0])[:-6]
    path = project_path + '/calibration/' + name + '_FINAL_'
    f_df.to_csv(path + 'f.csv', header=False, index=False)
    c_df.to_csv(path + 'c.csv', header=False, index=False)

    # NOTE(review): leftover debug print.
    print('hello')
Example #9
0
def convert_delimited_to_iteman(file, destination_path, delimiter=','):
    """Convert a delimiter-separated response file to iteman fixed format.

    The first field of each line is treated as the examinee ID and padded
    with three spaces; 'Y'/'M' tokens in the remaining fields are dropped.
    Writes <stem>_f.txt to destination_path.

    :returns: True when the file had lines (output written), False when empty.
    """
    #verify is CSV
    lines = hfh.get_lines(file)
    if len(lines) == 0:
        print(file, "is empty")
        return False

    ret = []
    removed_non_answer = False  # True once any Y/M token has been stripped
    if len(lines[0].split(delimiter)) > 1:
        # Looks delimited: the first line splits into multiple fields.
        for line in lines:
            new_line = ""
            fields = line.split(delimiter)
            id_handled = False
            for token in fields:
                if not id_handled:
                    # First field is the ID; append the three-space
                    # separator iteman expects.
                    token += '   '
                    id_handled = True
                if token == 'Y' or token == 'M':
                    # BUG FIX: this flag was never set, and the warning below
                    # fired whenever nothing had been removed (inverted test);
                    # on non-delimited input the flag was unbound (NameError).
                    removed_non_answer = True
                else:
                    new_line += token
            ret.append(new_line)

    if removed_non_answer:
        print(
            file,
            "non answer character in data response string. It was removed."
        )

    name = hfh.create_name(hfh.get_stem(file), destination_path, 'txt',
                           '_f')
    hfh.write_lines_to_text(ret, name)
    return True
def get_theta_from_passing_score(passing_score, path_to_tif):
    """Return (tif stem, theta) for the TIF row whose raw passing score is
    closest to passing_score; theta is truncated to two decimal places."""
    tif = pd.read_csv(path_to_tif)
    score_col = "TRF as Number Correct"
    theta_col = "Theta"
    # Row whose raw-score column is nearest the requested passing score.
    nearest = tif.iloc[(tif[score_col] - passing_score).abs().argsort()[:1]]
    theta = nearest[theta_col].values[0].astype(float)
    # Truncate (not round) to two decimals, matching downstream expectations.
    return get_stem(path_to_tif), int(theta * 100) / 100
def create_form_from_c(c_file, destination_path):
    """Write an _L.csv form file (Sequence, AccNum) derived from a _c control
    file; Sequence is simply the 1-based row position."""
    control = hfh.get_df(c_file)
    form = hfh.pd.DataFrame({'AccNum': control[0]})
    form.insert(0, 'Sequence', form.index.values + 1)
    # Drop the "_c" suffix from the stem and tag the output as an _L file.
    out_name = hfh.get_stem(c_file)[:-2] + '_L.csv'
    form.to_csv(destination_path + '/' + out_name, header=True, index=False)
def create_forms_from_bank(project_path,
                           operational=True,
                           create_bank_L=False):
    """Create form CSVs (Position, AccNum) from every file in
    <project_path>/bank_files.

    BANK files are skipped unless create_bank_L is True, in which case the
    whole bank becomes a sequentially-numbered form and a trailing "_NNN" in
    the stem is parsed as the passing cut. Otherwise operational items (or
    all positioned items) are exported. Output goes to
    <project_path>/forms/<full|operational>/<stem>_LF|_LO<cut>.csv.
    """
    bank_files = hfh.get_all_files(project_path + '/bank_files')
    print("creating forms")
    for file in bank_files:
        if not create_bank_L and file.find("BANK") > 0:
            # BANK files are only processed when a bank _L was requested.
            continue
        b_df = pd.read_excel(file, header=0)
        cut = ""
        form = None
        name = hfh.get_stem(file)
        if create_bank_L and file.find('BANK') > -1:
            # BUG FIX: the underscore index used to come from the full path
            # (file.find) but was applied to the stem, and the slice kept the
            # underscore so int() could never succeed.
            i = name.find('_')
            if i > -1:
                cut = name[i + 1:]
                try:
                    cut = int(cut)
                except ValueError:
                    print(
                        file,
                        "contains an underscore but does not provide a cut"
                    )
            form = pd.DataFrame([])
            form['AccNum'] = b_df['AccNum']
            form.insert(0, 'Position', range(1, len(form) + 1))

        elif operational:
            if 'UseCode' in b_df.columns:
                form = b_df[b_df['UseCode'] == 'Operational']
                form = form[['Position', 'AccNum']]
        elif 'Position' in b_df.columns:
            form = b_df[['Position', 'AccNum']]

        if form is None:
            # ROBUSTNESS: this previously fell through to an unbound `form`
            # and raised NameError; skip the file with a message instead.
            print(file, "could not be converted to a form (no usable columns)")
            continue

        suffix = '_LF'
        prefix = 'full'
        if operational:
            suffix = '_LO' + str(cut)
            prefix = 'operational'
        form.to_csv(project_path + '/forms/' + prefix + '/' + name +
                    suffix + '.csv',
                    index=False)
Example #13
0
def create_mapping_from_Karen_test_data(file_path,
                                        destination_path="",
                                        create_csv=False,
                                        add_underscore=False):
    """Parse a Karen-format test file into a mapping DataFrame.

    Assumes target lines look like "<number>. <subject> <bank id>" and that
    no other line starts with a digit. Resulting columns: form, test_id,
    subject, bank_id_number, bank_id. When create_csv is True the frame is
    also written to <destination_path>/<stem[:8]>_L.csv.

    :returns: the mapping DataFrame, or None when the file has no lines
        (previously this path raised UnboundLocalError on `df`).
    """
    lines = hfh.get_lines(file_path)
    if not lines:
        # BUG FIX: used to fall through with `df` unbound.
        print(file_path + " does not contain lines.")
        return None

    ret = []
    for line in lines:
        if line[0].isnumeric():
            entry = line.split()
            test_name = hfh.get_stem(file_path)
            test_id = line[:line.index('.')]
            subject = entry[1]
            bank_id = entry[2]
            underscore = "_" if add_underscore else ""
            ret.append([
                test_name, test_id, subject, bank_id,
                subject + underscore + str(bank_id)
            ])

    df = pd.DataFrame(ret)
    df.columns = [
        'form', 'test_id', 'subject', 'bank_id_number', 'bank_id'
    ]

    if create_csv:
        name = hfh.get_stem(file_path)[:8]
        #df.sort_values(df[1])
        file_name = destination_path + "/" + name + "_L.csv"
        df.to_csv(file_name, index=False, header=0)

    return df
def process_raw_data(master_folder, raw_data):
    """Back up raw form and data files into <master_folder>/data, then hand
    each matched (form, data) pair to process_paired_files."""
    form_files = hfh.get_all_files(raw_data, extension="csv")
    data_files = hfh.get_all_files(raw_data, extension='txt')
    if len(form_files) != len(data_files):
        # Counts disagree: assume the forms were delivered as xlsx instead.
        form_files = hfh.get_all_file_names_in_folder(raw_data, extension="xlsx")

    backup_folder = master_folder + "/data"
    for form_file in form_files:
        backup_name = hfh.get_stem(form_file) + '_raw_backup_form.' + hfh.get_extension(form_file)
        hfh.copy_file_and_write_to_destination(form_file, backup_folder, modified_name=backup_name)

    for data_file in data_files:
        backup_name = hfh.get_stem(data_file) + '_raw_backup_data.' + hfh.get_extension(data_file)
        hfh.copy_file_and_write_to_destination(data_file, backup_folder, modified_name=backup_name)

    pairs = pair_files(form_files, data_files)
    if pairs:
        for pair in pairs:
            process_paired_files(pair[0], pair[1], master_folder)
Example #15
0
def convert_first_line_answers_to_default_control_and_data(
        file_name, comma_delimited=False, id_length=8, id_spaces=3):
    #todo: handle cutoff for not included
    lines = hfh.get_lines(file_name)
    correct = lines[0]
    new = []
    counter = 0
    if comma_delimited:
        correct = correct.replace(',', '')
    for a in correct:
        counter += 1
        if not a == '\n':
            include = 'y'
            new.append(str(counter) + "," + a + ",4,1," + include + ",M\n")

    name = hfh.get_stem(file_name)
    hfh.write_lines_to_text(new, name + "_c.csv")
    #   contains a random F at the end will test to see if it matters
    formatted = []
    if comma_delimited:
        for line in lines[1:]:
            split_line = line.split(',')
            id = split_line[0]
            new_id = ""
            if len(id) < id_length:
                short = id_length - len(id)
                for i in range(short):
                    new_id += "_"
            new_id += id

            response_string = line[len(id):-2].replace(',', '') + '\n'
            ret_line = new_id + "   " + response_string
            formatted.append(ret_line)

    else:
        for line in lines[2:]:
            # todo: could be problematic
            id_end = line.find(',')
            id = line[:id_end]
            new_id = ""
            characters_short = id_length - len(id)
            for c in range(characters_short):
                new_id += "_"
            new_id += id
            for i in range(id_spaces):
                new_id += " "
            new_line = new_id + line
            formatted.append(
                new_line)  #no clue why : is here perhaps I will remove it.

    hfh.write_lines_to_text(formatted, name + "_f.txt")
def get_theta_from_passing_percent_correct(percent_to_pass,
                                           path_to_tif,
                                           tif_df=None,
                                           return_values=True):
    """Return (tif stem, theta) for the TIF row whose TRF (percent correct)
    is closest to percent_to_pass; theta is truncated to two decimals.
    Returns None when return_values is False."""
    tif = pd.read_csv(path_to_tif) if tif_df is None else tif_df
    score_col = "TRF"
    theta_col = "Theta"
    # Row whose percent-correct column is nearest the requested value.
    nearest = tif.iloc[(tif[score_col] - percent_to_pass).abs().argsort()[:1]]
    theta = nearest[theta_col].values[0].astype(float)
    if return_values:
        return get_stem(path_to_tif), int(theta * 100) / 100
Example #17
0
def fix_format_of_data_file(file, destination):
    """Ensure a data file lands at destination in _f.txt format, trying each
    known format converter in turn until one succeeds."""
    status = is_valid_data(file)
    if status:
        # Already valid: just copy it under the standard _f.txt name.
        target_name = hfh.get_stem(file) + "_f.txt"
        hfh.copy_file_and_write_to_destination(file,
                                               destination,
                                               modified_name=target_name)
        return status
    status = convert_iteman_format(file, destination)
    if not status:
        status = convert_2016_format(file, destination)
    if not status:
        print("could not convert " + file)
    return status
def convert_stats_to_df(path_to_stats, destination_path):
    """Extract the tabular section of a stats report into a DataFrame.

    Slices the lines from the first stats row to the next blank line, writes
    them to <destination_path><stem>_cs.csv, and re-reads them as strings.

    :returns: the DataFrame, or None when extraction fails (previously a bare
        ``except: pass`` left ``df`` unbound and this raised
        UnboundLocalError at the return).
    """
    df = None  # ROBUSTNESS: guarantee a defined return value
    first_line = get_first_line_of_stats(path_to_stats)
    try:
        blank_line = get_next_blank_line_after_index(path_to_stats,
                                                     first_line - 1)
        table_lines = get_lines_from_X_to_Y_from_file(path_to_stats,
                                                      first_line - 1,
                                                      blank_line)
        name = destination_path + get_stem(path_to_stats) + "_cs.csv"
        write_lines_to_text(table_lines, name)
        df = get_df(name, header=0, dtype="string")
    except Exception as err:
        # BUG FIX: was a bare `except: pass`, which both hid the error and
        # left `df` unbound. Report it and fall through to return None.
        print("convert_stats_to_df failed for", path_to_stats, ":", err)

    return df
Example #19
0
def convert_answers_to_default_control(file_name,
                                       answers,
                                       destination_path,
                                       cutoff_for_pretest=175):
    """Write a default _c.csv control file from an answer-key string.

    Each key character becomes "<position>,<key>,4,1,<include>,M"; positions
    past cutoff_for_pretest are marked 'n' (pretest). A falsy cutoff disables
    pretest marking. Newline characters still consume a position number but
    produce no row.
    """
    rows = []
    for position, key in enumerate(answers, start=1):
        if key == '\n':
            continue
        include = 'n' if cutoff_for_pretest and position > cutoff_for_pretest else 'y'
        rows.append(str(position) + "," + key + ",4,1," + include + ",M\n")
    out_base = destination_path + "/" + hfh.get_stem(file_name)
    hfh.write_lines_to_text(rows, out_base + "_c.csv")
def find_month(file_name):
    """Return the month name (or 3-letter abbreviation) embedded in a file name.

    Full month names take precedence; an abbreviation is only recorded while
    no match has been found yet. Returns False when no month — or more than
    one — is found (previously zero matches raised IndexError on ret[0]).
    """
    months = ["JANUARY", "FEBRUARY", "MARCH", "APRIL", "MAY", "JUNE", "JULY", "AUGUST", "SEPTEMBER","OCTOBER","NOVEMBER","DECEMBER"]
    ret = []
    file_name = hfh.get_stem(file_name).upper()
    for month in months:
        full = file_name.find(month)
        if full > -1:
            ret.append(month)
        mon = month[:3]
        three = file_name.find(mon)
        # Abbreviations only count while nothing has matched yet.
        if len(ret) == 0 and three > -1:
            ret.append(mon)
    if len(ret) == 1:
        return ret[0]
    # BUG FIX: zero matches used to fall into `return ret[0]` -> IndexError.
    return False
def get_percent_from_theta(theta,
                           path_to_tif=None,
                           tif_df=None,
                           return_values=True,
                           return_df=False):
    """Look up the TRF (percent correct) closest to a theta in a TIF table.

    :param theta: ability value to look up.
    :param path_to_tif: CSV with 'Theta' and 'TRF' columns; read when tif_df
        is not supplied.
    :param tif_df: pre-loaded TIF DataFrame (takes precedence over the path).
    :param return_values: when True, returns (stem, TRF truncated to two
        decimals); otherwise the function returns None.
    :param return_df: accepted but currently unused in this function.
    """
    if path_to_tif is None and tif_df is None:
        print("get_percent_from_theta was not passed a tif path or df.")
    else:
        if tif_df is None:
            df = pd.read_csv(path_to_tif)
        else:
            df = tif_df
        SCORE_COL = "TRF"
        THETA_COL = "Theta"
        # Row whose theta is nearest the requested value.
        a = df.iloc[(df[THETA_COL].astype(float) - theta).abs().argsort()[:1]]
        a = a[SCORE_COL].values[0]
        if return_values:
            # NOTE(review): hfh.get_stem(None) will fail when only tif_df was
            # supplied — confirm callers always pass path_to_tif here.
            return hfh.get_stem(path_to_tif), int(a * 100) / 100
def process_LXR_key(key_file,
                    get_c_df_AS_0=False,
                    get_L_df_AS_1=False,
                    destination_path_c=None,
                    destination_path_L=None):
    """Parse an LXR key file into a control (_c) and a form (_L) DataFrame.

    Lines containing '.' contribute item IDs (spaces become underscores);
    lines containing ':' contribute keys. When either get_* flag is set the
    pair (c_df, L_df) is returned; otherwise CSVs are written to whichever
    destination paths are provided.
    """
    name = hfh.get_stem(key_file)
    ids = []
    keys = []
    for raw_line in hfh.get_lines(key_file):
        dot_parts = raw_line.split('.')
        if len(dot_parts) > 1:
            ids.append(dot_parts[1].strip().replace(' ', '_'))
        else:
            colon_parts = raw_line.split(':')
            if len(colon_parts) > 1:
                keys.append(colon_parts[1].strip())

    df = hfh.pd.DataFrame([ids, keys]).T
    # Default control values: 4 options, group 1, included, multiple choice.
    df[2] = 4
    df[3] = 1
    df[4] = 'Y'
    df[5] = 'M'
    df[6] = df.index.values + 1  # 1-based sequence for the _L frame

    c_df = df[[0, 1, 2, 3, 4, 5]]
    L_df = df[[6, 0]]

    if get_c_df_AS_0 or get_L_df_AS_1:
        # Either flag returns BOTH frames (c first, L second).
        return c_df, L_df
    if destination_path_c:
        c_df.to_csv(destination_path_c + '/' + name + '_c.csv',
                    index=False,
                    header=False)
    if destination_path_L:
        L_df.to_csv(destination_path_L + '/' + name + '_L.csv',
                    index=False,
                    header=False)
Example #23
0
def clean_stats_csv(path, create_csv=True, get_df=False):
    """Extract the stats table from a report file.

    The table starts at the row whose first comma-separated field is
    'Sequence' and ends at the first blank line after it. Optionally writes
    <parent>/<stem[:-6]>.cleaned_stats and/or returns the DataFrame.
    """
    lines = hfh.get_lines(path)
    #assumes report starts with Sequence
    ret_lines = []
    beginning = -1
    for i, line in enumerate(lines):
        fields = line.split(',')
        if fields[0] == 'Sequence':
            beginning = i
        if beginning > -1:
            if line == '\n':
                # BUG FIX: the old while/for construct kept appending every
                # line AFTER the blank terminator, and looped forever when
                # 'Sequence' never appeared. Stop at the first blank line.
                break
            ret_lines.append(fields)
    df = pd.DataFrame(ret_lines[1:])
    # Header row ends with a newline-bearing field; drop it for column names.
    df.columns = ret_lines[0][:-1]
    df = df.drop(columns='4 SD')
    if create_csv:
        new_path = hfh.get_parent_folder(path)
        name = new_path + "/" + hfh.get_stem(path)[:-6] + ".cleaned_stats"
        df.to_csv(name)
    if get_df:
        # NOTE: the get_df parameter shadows the module-level get_df helper
        # within this function; kept for interface compatibility.
        return df


#set_standard_id_length_in_data_files("PT_IRT/PT_processed_data", 8)
#convert_xCalibre_matrix_for_PCI("PT_data/score_matrices/PT1_18_m.txt")
#process_karen_data("LCLE_IRT","LCLE_IRT","LCLE_IRT/processed_data/")
#merge_control_and_bank_info(a,b)
#path_to_files = "data_files"
#convertOldFormatToNew("LCLE_IRT/LCLEApr2019FullAdmin.txt")
#processNewFileFormat("LCLE_IRT/lcea1_18c.csv","LCLE_IRT/lcea1_18.txt")
#convert_first_line_answers_to_default_control_and_data(path_to_files+"/pt1_16_n.txt")
#create_control_files(path_to_files)
#update_control_files_with_item_bank_key("data_files/item_map.csv", "data_files")
#convert_2016_format("data_files/pt3_16.txt")
def is_valid_name(path, harsh=False):
    """Validate that a file name follows the <TAG><YEAR><MONTH> convention.

    Only .txt/.csv files are considered. Returns the stem when it contains a
    recognizable month and year (with harsh=True, only when the stem matches
    tag+year+month exactly, or tag+year+month plus a 2-character AB suffix);
    returns False otherwise.
    """
    acceptable_extensions = ['txt', 'csv']
    ext = hfh.get_extension(path)
    if ext not in acceptable_extensions:
        return False
    name = hfh.get_stem(path)
    month = hfh.find_month(name)
    year = hfh.find_year(name)
    if month and year:
        i_year = name.find(year)
        tag = name[:i_year]
        full_name = tag + year + month
        if harsh:
            if full_name == name:
                return name
        else:
            return name
        # Reached only when harsh=True and the exact match above failed
        # (the original "currently unreachable" note was mistaken): accepts
        # names carrying a trailing 2-character form suffix.
        if full_name == name[:-2]:
            print("AB form detected")
            return name
    return False
Example #25
0
def create_key_df_from_csv(file_path):
    """Build a key-mapping DataFrame from a bank CSV.

    Expects header columns including Position and AccNum; each AccNum is
    split into a 3-character subject prefix and the remaining bank number.
    Columns: form, test_id, subject, bank_id_number, bank_id.
    """
    #   assumes format
    #   Position,Domain,AccNum,UseCode,CorrectAnswer,Content Area,...
    #   1,02,LLE669,Operational,C,0202,44,,56,0.18,,,Bank,InUse,1,,,101,AUG19(P),,,0,1,0,No,,1, ,5/13/2019,

    source = hfh.get_df(file_path, header=0)
    form = hfh.get_stem(file_path)
    records = []
    for test_id, bank_id in zip(source['Position'].tolist(),
                                source['AccNum'].tolist()):
        subject = bank_id[:3]
        records.append([form, test_id, subject, bank_id[3:], bank_id])
    key_df = pd.DataFrame(records)
    key_df.columns = ['form', 'test_id', 'subject', 'bank_id_number', 'bank_id']

    # name = hfh.create_name("silly","LPCC_IRT/keys/L_files","csv",'_L')
    # key_df.to_csv(name, index=None)
    return key_df
def process_response_strings_for_IRT(path_to_raw_data,
                                     processed=None,
                                     bank=None,
                                     verbose=False,
                                     get_f_df=False):
    """Derive project folders for a raw-data file and hand it to
    process_response_string_file.

    When the path contains a 'raw_data' segment, the processed_data and
    bank_files sibling folders are derived from it and the file name is
    validated against the CCCYYMON convention (prompting interactively until
    a conforming name is entered).

    :param processed: destination folder; overwritten when the path contains
        'raw_data'.
    :param bank: bank folder; likewise overwritten when possible.
    :param verbose: accepted but currently unused in this function.
    :param get_f_df: accepted but currently unused in this function.
    """
    #todo edited while tired confirm it works later

    path = path_to_raw_data
    if path is not False:
        lines = hfh.get_lines(path)

        r = path.find('raw_data')
        #assumes that raw_data exists in IRT model
        name = path
        if r > -1:
            project_directory = path[:r]
            name = project_directory + "/processed_data/" + hfh.get_stem(path)
            processed = project_directory + '/processed_data/'
            bank = project_directory + '/bank_files/'
            valid = is_valid_name(path)
            # NOTE(review): the renamed `name` is validated but `path` is
            # still what gets processed below — confirm intent.
            while valid is False:
                print(
                    path +
                    " is a raw data name which does not conform to convention of CCCYYMON."
                )
                name = input("enter an appropriate name here")
                valid = is_valid_name(name)

        if lines is False:
            print(
                "Error in determine response string.\n Path request error in path "
                + path)
        else:
            process_response_string_file(path,
                                         bank,
                                         write_csv=True,
                                         destination_path=processed)
Example #27
0
def update_control_files_with_item_bank_key(path_to_item_bank_csv,
                                            path_to_control):
    """Replace sequence numbers in control CSVs with AccNum values looked up
    from an item-bank CSV (keyed by testFORM/testSeq).

    The test form is reconstructed from the control file's stem, which is
    assumed shaped like "ptN_YY..." — form number at index 2, two-digit year
    at indices 4:6 (TODO: confirm).
    """
    #   todo: this is only good for the pt format... general application should be more robust.
    # NOTE(review): 'foler' looks like a typo — confirm the hfh helper is
    # really named get_all_file_names_in_foler.
    control_file_paths = hfh.get_all_file_names_in_foler(
        path_to_control, "csv")
    item_bank_df = pd.DataFrame(pd.read_csv(path_to_item_bank_csv))
    for file_path in control_file_paths:
        if not file_path == path_to_item_bank_csv:
            control_df = pd.DataFrame(pd.read_csv(file_path, header=None))
            name = hfh.get_stem(file_path)
            n = name[2]
            y = name[4:6]
            test_form = "20" + y + "_" + n
            # Bank rows belonging to this test form.
            form_relevant = item_bank_df.loc[item_bank_df['testFORM'] ==
                                             test_form]
            testSeq = form_relevant["testSeq"]
            testSeq_c = control_df[0]
            for i in testSeq:
                if control_df[0].__contains__(i):
                    value = form_relevant[form_relevant["testSeq"] ==
                                          i]["AccNum"]

            # NOTE(review): this sits OUTSIDE the loop above, so only the
            # final `i`/`value` pair is applied, and `value` may be unbound
            # when testSeq is empty — likely an indentation bug; confirm.
            control_df[0] = np.where(control_df[0] == i, value, control_df[0])
            control_df.to_csv(file_path, index=False, header=0)
def create_upload_from_processed(c_file,
                                 f_file,
                                 path=None,
                                 c_has_header=True,
                                 to_csv=False):
    """Compute per-item statistics (a _P.csv upload) from a processed control
    file and a response-matrix file.

    For every item: the key (KEY), key point-biserial (K_r), difficulty (P),
    response count (N), and per option A-D the endorsement proportion (_p),
    point-biserial (_r), and mean total score of endorsers (_m).

    :param c_file: control file; first column is AccNum and, when
        c_has_header is True, a 'Key' column must be present.
    :param f_file: processed response matrix (index column 0).
    :param path: output folder; when None the file name carries no directory
        prefix (current working directory).
    :param c_has_header: whether the control file has a header row.
    :param to_csv: when True, writes <f stem minus 2 chars>_P.csv.
    :returns: the statistics DataFrame indexed by AccNum.
    """
    #todo: decide if _c files have headers or not...
    #todo: perhaps a different _x to indicate header or not... the only time I don't want a header is xCalibre...

    if c_has_header:
        c_df = hfh.get_df(c_file, header=0)
    else:
        c_df = hfh.get_df(c_file)

    f_df = hfh.get_df(f_file, index_col=0)
    stats_df = hfh.pd.DataFrame([])
    stats_df['AccNum'] = c_df.iloc[:, 0]
    graded_df = hr.grade_examination(f_df,
                                     c_df,
                                     grading_processed=True,
                                     correct=1,
                                     incorrect=0)

    # Total score per examinee.
    score = graded_df.sum(axis=1)

    # NOTE(review): this value is never used — it is recomputed per column in
    # the loops below; candidate for removal.
    pbis = graded_df[graded_df.columns[0]].corr(score)

    # Indicator frames: 1 where the examinee chose the given option.
    A = get_option_df(f_df, 'A')
    B = get_option_df(f_df, 'B')
    C = get_option_df(f_df, 'C')
    D = get_option_df(f_df, 'D')

    options = ['A', 'B', 'C', 'D']
    dfs = [A, B, C, D]
    counter = -1

    # N: non-missing response count per item.
    N = ~f_df.isna()
    N = N.sum()
    N = N.reset_index(drop=True)

    for option in options:

        counter += 1
        a_ret = []  # option point-biserials
        b_ret = []  # option endorsement proportions
        c_ret = []  # mean total score of examinees endorsing the option

        df = dfs[counter]

        for column in A.columns:
            mask = df[column] == 1
            mean_score = graded_df[mask].mean().mean()
            c_ret.append(mean_score)
            pbis = df[column].corr(score)
            endorse_p = df[column].sum() / df.shape[0]
            a_ret.append(pbis)
            b_ret.append(endorse_p)
        stats_df[option + '_r'] = hfh.pd.Series(a_ret, index=stats_df.index)
        stats_df[option + '_p'] = hfh.pd.Series(b_ret, index=stats_df.index)
        stats_df[option + '_m'] = hfh.pd.Series(c_ret, index=stats_df.index)

    # Key point-biserial per item column.
    k_ret = []
    for i in range(graded_df.shape[1]):
        pbis = graded_df[graded_df.columns[i]].corr(score)
        k_ret.append(pbis)
    stats_df['K_r'] = hfh.pd.Series(k_ret, index=stats_df.index)
    stats_df['KEY'] = c_df['Key']
    stats_df['N'] = N
    # P: classical difficulty (mean correctness per item).
    p = graded_df.mean(axis=0)
    stats_df = stats_df.set_index('AccNum', drop=True)
    stats_df['P'] = p
    if path is None:
        name = hfh.get_stem(f_file)[:-2] + '_P.csv'
    else:
        name = path + '/' + hfh.get_stem(f_file)[:-2] + '_P.csv'
    stats_df = stats_df[[
        'KEY', 'K_r', 'P', 'A_p', 'A_r', 'A_m', 'B_p', 'B_r', 'B_m', 'C_p',
        'C_r', 'C_m', 'D_p', 'D_r', 'D_m', 'N'
    ]]
    if to_csv:
        stats_df.to_csv(name)
    return stats_df
def process_response_string_file(f_path,
                                 bank_path=None,
                                 destination_path=None,
                                 write_csv=False,
                                 get_df=True,
                                 create_c=True,
                                 paired_bank_xlsx=None):
    """Parse a raw response-string file into response (f) and control (c) DataFrames.

    Probes the file layout with the is_type_* detectors (types A-K, checked in
    priority order), dispatches to the matching process* parser, optionally
    builds or reconciles the control DataFrame from an item bank, and
    optionally writes the results as ``<stem>_f.csv`` / ``<stem>_c.csv``.

    :param f_path: path to the raw response-string file.
    :param bank_path: folder of bank xlsx files (or a bank file) used to build
        or replace the control DataFrame; optional.
    :param destination_path: output folder for the csv files; required when
        ``create_c`` is True.
    :param write_csv: when True, write f_df (and c_df if present) to
        ``destination_path``.
    :param get_df: when True, return the processed f_df (otherwise returns None).
    :param create_c: when True, attempt to produce a control DataFrame.
    :param paired_bank_xlsx: unused; retained for backward compatibility.
    :return: the processed f_df when ``get_df`` is True, else None.
    """
    if create_c:
        assert destination_path is not None, "process response string needs to know where to put the processed data"
    name = hfh.get_stem(f_path)
    lines = hfh.get_lines(f_path)
    assert len(lines) > 0, "asked to process empty file:" + f_path

    c_df = None
    f_df = None

    # (detector, parser, yields_control) triples, checked in the same priority
    # order as the original if/elif ladder.  yields_control marks parsers that
    # return a (c_df, f_df) pair instead of a bare f_df.
    _dispatch = [
        (is_type_K, processK, False),
        (is_type_A, processA, True),
        (is_type_B, processB, False),
        (is_type_C, processC, True),
        (is_type_D, processD, False),
        (is_type_E, processE, True),
        (is_type_F, processF, False),
        (is_type_G, processG, True),
        (is_type_H, processH, False),
        (is_type_I, processI, False),
        (is_type_J, processJ, False),
    ]
    for detector, parser, yields_control in _dispatch:
        if detector(lines):
            processed_lines = parser(lines)
            if yields_control:
                c_df = processed_lines[0]
                f_df = processed_lines[1]
            else:
                f_df = processed_lines
            break
    else:
        # No detector matched: assume the file is already in strict format.
        print(f_path + " is already formatted")
        is_formatteed(lines)
        f_df = hfh.get_df(f_path)

    if c_df is not None and bank_path:
        # Replace the sequence-based control frame with one keyed by AccNum,
        # keeping the keys recovered from the response file.
        b_df = create_c_df_from_bank(bank_path)
        b_df['Key'] = c_df['Key']
        c_df = b_df
    if c_df is None and bank_path is not None and create_c:
        #todo: consider respecting the correct answer at the time vs the bank or just destroy it
        bank_files = hfh.get_all_files(bank_path, extension='xlsx')
        pair = hfh.pair_files([f_path], bank_files)
        if len(pair) == 0:
            print(
                "could not find matching bank file and no default control information present."
            )
        if len(pair) == 1:
            # todo: may evaluate differences between bank and response string if desired
            c_df = create_c_df_from_bank(pair[0][1])
        if len(pair) > 1:
            print("more than one file matched for bank", f_path)

    # Promote the id column to the index.  The column label may be the int 0
    # or the string '0' depending on how the frame was parsed.
    # BUG FIX: the original always indexed f_df[0] / drop(columns=0), which
    # raised KeyError when only the string column '0' existed.
    id_col = 0 if 0 in f_df.columns else ('0' if '0' in f_df.columns else None)
    if id_col is not None:
        f_df = f_df.set_index(f_df[id_col], drop=True)
        f_df = f_df.drop(columns=id_col)
    if write_csv:
        #todo changed index... need to make sure all processed items spit out the same... in this case they are pre-strict.

        f_df.to_csv(destination_path + '/' + name + '_f.csv',
                    index=True,
                    header=False)
        if c_df is not None:
            c_df.to_csv(destination_path + '/' + name + '_c.csv',
                        index=None,
                        header=False)
    if get_df:
        return f_df
def create_CAL(project_path,
               processed_path=None,
               destination_path=None,
               pair_full=True,
               debug=True):
    """Combine paired ``_f.csv``/``_c.csv`` files into one CAL data set.

    Pairs every response file with its control file, grades each pair with
    strict_grade (correct -> '1', incorrect -> '2'), stacks the graded frames,
    normalizes cell values ('1'/'2', omitted -> '-', blank -> 'X'), writes
    ``<prefix>_FINAL_``/``_INITIAL_`` ``f.csv``/``c.csv`` files, and returns
    the combined data frame with a synthesized control frame.

    :param project_path: project root; ``<project_path>/processed_data`` is
        scanned when ``processed_path`` is not given.
    :param processed_path: explicit folder of processed _f/_c csv files.
    :param destination_path: output folder; '_INITIAL_' naming is used when
        the path contains 'initial', otherwise '_FINAL_'.
    :param pair_full: forwarded to hfh.pair_files.
    :param debug: unused; retained for backward compatibility.
    :return: (df, c_df) on success, None when no file pairs were found.
    """
    DATA_FILE = 0
    CONTROL_FILE = 1
    # BUG FIX: the original only assigned `path` in the default branch, so an
    # explicit processed_path left `path` unbound and raised NameError.
    if processed_path is None:
        path = project_path + '/processed_data'
    else:
        path = processed_path
    data_files = hfh.get_all_file_names_in_folder(path, target_string='_f.csv')
    control_files = hfh.get_all_file_names_in_folder(path,
                                                     target_string='_c.csv')
    control_dfs = []
    graded_frames = []  # collect, then concat once (avoids quadratic re-copying)
    paired_files = hfh.pair_files(data_files,
                                  control_files,
                                  pair_full=pair_full)

    for pair in paired_files:
        print("CREATING CAL FROM PAIR: " + pair[0] + " " + pair[1])
        print(pair)
        control_path = pair[CONTROL_FILE]
        data_path = pair[DATA_FILE]
        f_df = hfh.get_df(data_path, header=None)
        c_df = hfh.get_df(control_path, header=None)

        control_dfs.append(c_df)
        f_df = get_strict_format_f_df(c_df, f_df)
        graded = strict_grade(f_df=f_df,
                              c_df=c_df,
                              operational=False,
                              correct='1',
                              incorrect='2')
        # strict_grade signals failure with False rather than raising.
        if graded is not False:
            graded_frames.append(graded)

    print("replacing")
    if len(paired_files) > 0:
        assert destination_path is not None, "create_CAL needs destination_path to name its output"
        if graded_frames:
            df = pd.concat(graded_frames, axis=0)
        else:
            df = pd.DataFrame([])
        # Synthesize a minimal control frame: every item keyed '1', 2 options,
        # one domain, included, multiple-choice.
        c_df = pd.DataFrame([])
        c_df['AccNum'] = df.columns
        c_df['Key'] = '1'
        c_df['Options'] = '2'
        c_df['Domain'] = '1'
        c_df['Include'] = 'Y'
        c_df['Type'] = 'M'
        cal_type = "_FINAL_"  # renamed from `type` (shadowed the builtin)
        # BUG FIX: was `.find('initial') > 0`, which missed a match at index 0.
        if 'initial' in destination_path:
            cal_type = "_INITIAL_"
        # NOTE(review): `pair` here is the last pair from the loop above —
        # the 3-char prefix is assumed identical across all pairs; confirm.
        name = destination_path + '/' + hfh.get_stem(pair[0])[:3] + cal_type

        print("replacing 2.0")
        df = df.replace(2.0, '2')
        print("replacing 1.0")
        df = df.replace(1.0, '1')
        print("filling NA")
        df = df.fillna(value='-')
        print("replacing empty with X")
        df = df.replace(" ", "X")
        print("writing csvs")
        df.to_csv(name + 'f.csv', index=True, header=False)
        c_df.to_csv(name + 'c.csv', index=None, header=False)

        return df, c_df
    else:
        print("failed to pair files in create_CAL")