def create_FINAL_from_CULLED(project_path):
    culled_path = hfh.get_all_files(project_path + '/reports',
                                    target_string='CULLED.csv')
    cal_f_path = hfh.get_all_files(project_path + '/calibration',
                                   target_string='CAL_f.csv')
    cal_c_path = hfh.get_all_files(project_path + '/calibration',
                                   target_string='CAL_c.csv')
    if len(culled_path) == len(cal_f_path) == len(cal_c_path) == 1:
        f_df = hfh.get_df(cal_f_path[0])
        c_df = hfh.get_df(cal_c_path[0])
        culled_df = hfh.get_df(culled_path[0], header=0)
    else:
        print("invalid call: expected exactly one CULLED, CAL_f, and CAL_c file")
        return False
    # CAL_f stores one examinee per row with IDs in column 0; stash the IDs,
    # then transpose so each row is an item and AccNum labels from CAL_c can
    # be attached for filtering
    ids = f_df.iloc[:, 0]
    f_df = f_df.drop(columns=[0])
    f_df = f_df.T
    f_df = f_df.reset_index(drop=True)
    c_df.columns = ['AccNum', 'B', 'C', 'D', 'E', 'F']
    f_df.insert(0, 'AccNum', c_df['AccNum'])

    c_df = c_df[c_df['AccNum'].isin(culled_df['AccNum'])]
    f_df = f_df[f_df['AccNum'].isin(culled_df['AccNum'])]
    f_df = f_df.drop(columns=['AccNum'])
    f_df = f_df.T
    f_df.insert(0, 'ID', ids)
    # strip the 6-character '_CAL_f' suffix to recover the base name
    name = hfh.get_stem(cal_f_path[0])[:-6]
    path = project_path + '/calibration/' + name + '_FINAL_'
    f_df.to_csv(path + 'f.csv', header=False, index=False)
    c_df.to_csv(path + 'c.csv', header=False, index=False)
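# A minimal, self-contained sketch of the transpose-filter-transpose pattern
# used above: the response frame is transposed so each item becomes a row,
# rows are filtered against the kept AccNums, and the result is transposed
# back. Toy data; names are illustrative, not the project's real layout.
import pandas as pd

def _filter_items_by_accnum(f_df, keep_accnums):
    # f_df: examinees as rows, one column per item (column label = AccNum)
    t = f_df.T                          # one item per row
    t = t[t.index.isin(keep_accnums)]   # drop culled items
    return t.T                          # back to examinees-as-rows

_toy = pd.DataFrame({'LLE1': ['A', 'B'], 'LLE2': ['C', 'D'], 'LLE3': ['A', 'A']})
print(_filter_items_by_accnum(_toy, ['LLE1', 'LLE3']))   # LLE2 is dropped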
def add_new_form_to_data(new_c_path, new_f_path, calibrated_c_path,
                         calibrated_f_path, calibrated_stats):
    checking = True

    new_f_df = hfh.get_df(new_f_path)
    new_c_df = hfh.get_df(new_c_path)
    cal_f_df = hfh.get_df(calibrated_f_path)
    cal_c_df = hfh.get_df(calibrated_c_path)
    c_ids = cal_f_df.iloc[:, 0]
    n_ids = new_f_df.iloc[:, 0]

    combined_c = update_control(new_c_df, cal_c_df, calibrated_stats)
    combined_f = update_formatted(combined_c, new_f_df, new_c_df, cal_f_df,
                                  cal_c_df)
    if checking:
        # score invariance check: rescoring the calibrated rows of the
        # combined frame must reproduce the original calibrated scores
        check = cal_f_df.drop(columns=0)
        check = check.apply(hfh.pd.to_numeric, errors='coerce')
        check = check.replace(2.0, 0.0)
        check['SCORE'] = check.sum(axis=1)
        check['ID'] = c_ids
        check_it = check[['ID', 'SCORE']].copy()

        check = combined_f.drop(columns=0)
        check = check.apply(hfh.pd.to_numeric, errors='coerce')
        check = check.replace(2.0, 0.0)
        check['SCORE'] = check.sum(axis=1)
        check_it['SCORE_2'] = check['SCORE']
        mismatches = check_it[check_it['SCORE_2'] != check_it['SCORE']]
        if len(mismatches) > 0:
            print("ERROR IN COMBINATION!!!!")
            return False
    combined_f.to_csv("COMBINED_F.csv", index=False, header=False)
    combined_c.to_csv("COMBINED_C.csv", index=True, header=False)
    return True
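# The checking block above enforces score invariance: rescoring the combined
# frame must reproduce the calibrated scores. The same idea as a reusable
# helper, sketched under this file's conventions (2.0 codes an incorrect
# response and is zeroed before summing); the helper name is illustrative:
def scores_match(before_df, after_df, id_col=0):
    def _scores(df):
        g = df.drop(columns=id_col).apply(hfh.pd.to_numeric, errors='coerce')
        return g.replace(2.0, 0.0).sum(axis=1)
    return bool((_scores(before_df).values == _scores(after_df).values).all())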
def process_paired_files(pe_file, data_file, project_folder, select_Domain=False):
    #   todo: add in domain processing here so that sets without domain names are still separated
    if hfh.get_extension(pe_file) == 'xlsx':
        pe_df = hfh.get_df_from_xlsx(pe_file)
    else:
        pe_df = hfh.get_df(pe_file, header=0)

    pe_df.Position = pe_df.Position.astype(float)
    pe_df = pe_df.sort_values("Position", ascending=True)

    pe_df = pe_df[["AccNum", 'CorrectAnswer', 'Domain']]

    # default control values; 'include' must be set before the domain filter
    # below so the filter is not overwritten
    pe_df['number_of_options'] = 4
    pe_df['group'] = 1
    pe_df["include"] = 'Y'
    pe_df['type'] = 'M'

    if select_Domain:
        pe_df.loc[pe_df.Domain != select_Domain, 'include'] = 'N'

    pe_df = pe_df.drop(['Domain'], axis=1)
    pe_df = pe_df[['AccNum', 'CorrectAnswer', 'number_of_options', 'group', 'include', 'type']]
    processed_path = project_folder + "/processed_data/"
    c_path = processed_path + hfh.get_stem(pe_file) + "_c.csv"
    pe_df.to_csv(c_path, header=False, index=False)
    h.convert_default_data_to_iteman(data_file, processed_path, new_name=hfh.get_stem(pe_file))
    return 1
def get_f_df_repeat_status(f_path):
    # places that repeat information lives...
    #   end of string
    #   Pearson file type, third column
    #   ... other things I have not come across
    ids_with_repeat_status = []
    lines = hfh.get_lines(f_path)
    if is_type_K(lines):
        df = hfh.get_df(f_path, header=0)
        df = df.drop(0)
        df['Attempt'] = df['Attempt'].replace(['1'], 'F')
        df['Attempt'] = df['Attempt'].replace(['2'], 'R')
        ids_with_repeat_status = df['ClientID'] + '_' + df['Attempt']
    else:
        for line in lines:
            ending_character = line.strip()[-1]
            if ending_character in ['F', 'R']:
                repeat_status = ending_character
                line = line.strip()
                split_line = line.split()
                _id = None
                if len(split_line) > 1:
                    _id = split_line[0]
                else:
                    split_line = line.split(',')
                    if len(split_line) > 1:
                        _id = split_line[0]
                if _id is None:
                    assert False, "can not assign repeat status to file " + f_path
                ids_with_repeat_status.append(_id + '_' + repeat_status)

    f_df = process_response_string_file(f_path, create_c=False)
    f_df = f_df.set_index(ids_with_repeat_status)
    # note: the re-indexed frame is not returned; callers consume the ID list
    return ids_with_repeat_status
def check_p(pe_file, data_file, threshold = .4):
    pe_df = hfh.get_df(pe_file)
    d_df = h.convert_response_string_to_csv_and_get_df(data_file, id_length=8, number_of_spaces=3).T
    pe_df_t = pe_df.T.iloc[1]
    attempted = 0
    correct_answers = 0

    # tally matches between each examinee's responses and the key
    for test_taker_id in range(d_df.shape[1]):
        attempted += pe_df_t.size
        for count in range(pe_df_t.size):
            if d_df.iloc[count, test_taker_id] == pe_df_t.iloc[count]:
                correct_answers += 1
    p = correct_answers / attempted
    if p < threshold:
        return False
    return True
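# The nested loops in check_p can be collapsed into one vectorized comparison.
# A sketch under the same layout assumptions (d_df: items as rows, examinees
# as columns; key: one correct answer per item, in row order):
import pandas as pd

def proportion_correct(d_df, key):
    key = pd.Series(list(key), index=d_df.index)   # align the key to item rows
    correct = d_df.eq(key, axis=0)                 # compare every examinee column to the key
    return correct.to_numpy().mean()               # overall proportion correct

# proportion_correct(d_df, pe_df_t) >= threshold mirrors check_p's return value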
def create_reports(path_to_stats, destination_path, is_cs=False):
    # todo: make this smarter: find any stats files in the folder or its subfolders and process them.
    destination_path += '/'
    if is_cs:
        files = get_all_file_names_in_folder(path_to_stats,
                                             target_string="_cs")
    else:
        files = get_all_file_names_in_folder(path_to_stats,
                                             target_string="Stats")

    for file in files:
        if is_cs:
            df = get_df(file, header=0)
        else:
            df = convert_stats_to_df(file, destination_path)
        high = get_top_n_rpbis(df, 20)
        low = get_low_performing_items(df)
        flagged = get_flagged_items(df)
        agg = get_aggregate_report_items(df)
        # parameters = get_parameters(df)
        name = get_stem(file) + "_c"
        high.to_csv(destination_path + name + "_high_.csv", index=False)
        low.to_csv(destination_path + name + "_low_.csv", index=False)
        flagged.to_csv(destination_path + name + "_flagged_.csv", index=False)
        agg.to_csv(destination_path + name + "_agg.csv", index=False)
def create_form_from_c(c_file, destination_path):
    c_df = hfh.get_df(c_file)
    form = hfh.pd.DataFrame([])
    form['AccNum'] = c_df[0]
    form['Sequence'] = form.index.values + 1
    form = form[['Sequence', 'AccNum']]
    name = hfh.get_stem(c_file)[:-2] + '_L.csv'
    form.to_csv(destination_path + '/' + name, header=True, index=False)
Example #8
0
def merge_control_and_bank_info(control_path, bank_path):
    control_df = hfh.get_df(control_path, header=0)
    bank_df = hfh.get_df(bank_path, header=0)
    # check that id is not already merged
    sequence_value = control_df['sequence'][1]
    if sequence_value != '2':
        print(control_path + " expected sequence = 2, got " + str(sequence_value) +
              "; file was not merged. Check for repairs.")
        valid = is_valid_control(control_path)
        if valid:
            print(control_path + " is a valid control file.")

    else:
        new = pd.merge(control_df, bank_df, on='sequence', how='right')
        new = new[[
            'bank_id', 'key', 'number_of_options', 'group', 'include',
            'scoring_option'
        ]]
        new.to_csv(control_path, header=False, index=False)
def convert_stats_to_df(path_to_stats, destination_path):
    first_line = get_first_line_of_stats(path_to_stats)
    try:
        blank_line = get_next_blank_line_after_index(path_to_stats,
                                                     first_line - 1)
        f = get_lines_from_X_to_Y_from_file(path_to_stats, first_line - 1,
                                            blank_line)
        name = destination_path + get_stem(path_to_stats) + "_cs.csv"
        write_lines_to_text(f, name)
        df = get_df(name, header=0, dtype="string")
    except Exception as e:
        # a bare except/pass here would leave df unbound on failure
        print("could not convert stats file " + path_to_stats + ": " + str(e))
        return None

    return df
def validate_c_file_header(c_file, debug=True):
    #first line should be AccNum...
    lines = hfh.get_lines(c_file)
    if debug:
        print("validating: " + c_file)
    assert len(lines) > 0, "validate_c_file was fed an empty file"
    if lines[0] == E.C_HEADER_S + '\n':
        return True
    if len(lines) > 1 and lines[1] == E.C_HEADER_S + '\n':
        assert False, "validate_c_file detected 2 headers"
    #   no header on the first line; write one
    df = hfh.get_df(c_file)
    df.columns = E.C_HEADER_L
    df.to_csv(c_file, index=False)
    return True
def create_item_level_analysis(stats_R_path, stats_2_path, tif_path,
                               matrix_path, bank_path, passing_proportion):
    bank = hfh.get_df_from_xlsx(bank_path)
    stats_R_df = hfh.get_stats_df(stats_R_path, bank)
    stats_2_df = hfh.get_stats_df(stats_2_path, bank)
    tif_df = hfh.get_df(tif_path, header=0)
    matrix_df = core.h_format_manipulators.convert_xCalibre_matrix_for_PCI(
        matrix_path)
    a = ha.get_residuals_by_domain(matrix_df, stats_R_df, tif_df)
    b = ha.evaluate_in_fit_out_fit(stats_R_df)
    c = ha.evaluate_item_discriminatsion(stats_2_df)
    d = hr.get_count_summary(b, 'Domain', 'IN_DEV_EVAL')
    e = hr.get_count_summary(b, 'Domain', 'OUT_DEV_EVAL')
    f = hr.get_count_summary(b, 'Domain', 'B_EVAL')
    g = hr.get_count_summary(c, 'Domain', 'A_DEV_EVAL')
    # distinct name so the h module used elsewhere is not shadowed
    res_by_difficulty = ha.get_residuals_by_difficulty(matrix_df, stats_R_df,
                                                       tif_df, a[1])
    j = ha.evaluate_max_info(stats_R_df, tif_df, passing_proportion)
    return a, b, c, d, e, f, g, res_by_difficulty, j
def remove_accNum_from_f_and_c(accNum, name, program_path, reason=None):
    # create backup_processed_data folder
    backup_processed_folder = program_path + '/' + E.BACKUP_PROCESSED_DATA_P
    processed_folder = program_path + '/processed_data'
    hfh.create_dir(backup_processed_folder)
    # create notation of removal with reason
    f_df = hfh.get_single_file(processed_folder,
                               target_string=name + '_f.csv',
                               as_df=True,
                               strict=True)
    c_file = hfh.get_single_file(processed_folder,
                                 target_string=name + '_c.csv',
                                 strict=True)
    c_df = hfh.get_df(c_file, header=get_header_argument(c_file))

    s_ret = get_strict_format_f_df(c_df, f_df, get_c_df=True)
    c_df = s_ret[0]
    f_df = s_ret[1].T
    f_df = f_df.drop(accNum)
    c_df = c_df.set_index(['AccNum'])
    c_df = c_df.drop(accNum)
    c_df = c_df.reset_index(drop=False)
    f_df = f_df.T
    strict_grade(c_df, f_df, operational=False)  # solely for validation
    f_df.to_csv(program_path + '/processed_data/' + name + '_f.csv',
                header=False,
                index=True)
    c_df.to_csv(program_path + '/processed_data/' + name + '_c.csv',
                index=False,
                header=False)

    removed_report_path = hfh.get_all_files(program_path + "/" + E.REPORTS_P +
                                            '/',
                                            target_string=E.REMOVED_ITEMS_R)

    entry = accNum + " was removed from " + name
    if reason is not None:
        entry += " because of a " + reason
    if len(removed_report_path) == 0:
        removed_report_path = program_path + "/" + E.REPORTS_P + '/' + E.REMOVED_ITEMS_R
        hfh.write_lines_to_text([entry + '\n'], removed_report_path)
    else:
        hfh.add_lines_to_csv(removed_report_path[0], [entry])
def update_c_from_bank(project_path):
    # assumes that updated _c files have position instead of accNum
    bank_directory = project_path + '/bank_files'
    processed_directory = project_path + '/processed_data'
    c_files = hfh.get_all_files(processed_directory, target_string='_c.csv')
    b_files = hfh.get_all_files(bank_directory, target_string='.xlsx')
    pairs = hfh.pair_files(c_files, b_files)
    for pair in pairs:
        c_file = pair[0]
        b_file = pair[1]
        c_df = hfh.get_df(c_file)
        b_df = pd.read_excel(b_file)
        c_df.columns = [
            'AccNum', 'Key', 'Options', 'Domain', 'Include', 'Type'
        ]
        c_df['AccNum'] = b_df['AccNum']
        c_df = c_df[['AccNum', 'Key', 'Options', 'Domain', 'Include', 'Type']]

        c_df.to_csv(c_file, index=False, header=False)
def grade_responses(data_file, control_file):
    # todo: this has been back burnered so assumption item assessment can be finished.
    # this assumes that the data has been processed
    assert os.path.isfile(data_file), "missing data file: " + data_file
    assert os.path.isfile(control_file), "missing control file: " + control_file
    data_df = h.get_data_df(data_file)
    control_df = hfh.get_df(control_file)
    ids = data_df.iloc[:, 0]
    responses = data_df.drop(columns=0)
    # key: one correct answer per item, aligned to the response columns
    key = control_df.iloc[:, 1].copy()
    key.index = responses.columns
    # vectorized grade: 1 where a response matches the key, else 0
    graded = responses.eq(key, axis='columns').astype(int)
    graded.insert(0, 'ID', ids)
    return graded
def create_key_df_from_csv(file_path):
    #   assumes format
    #   Position,Domain,AccNum,UseCode,CorrectAnswer,Content Area,...
    #   1,02,LLE669,Operational,C,0202,44,,56,0.18,,,Bank,InUse,1,,,101,AUG19(P),,,0,1,0,No,,1, ,5/13/2019,

    df = hfh.get_df(file_path, header=0)
    ret = []
    test_ids = df['Position'].tolist()
    bank_ids = df['AccNum'].tolist()
    for i in range(len(test_ids)):
        form = hfh.get_stem(file_path)
        test_id = test_ids[i]
        bank_id = bank_ids[i]
        subject = bank_id[:3]
        bank_id_number = bank_id[3:]
        ret.append([form, test_id, subject, bank_id_number, bank_id])
    df = pd.DataFrame(ret)
    df.columns = ['form', 'test_id', 'subject', 'bank_id_number', 'bank_id']

    # name = hfh.create_name("silly","LPCC_IRT/keys/L_files","csv",'_L')

    #df.to_csv(name, index=None)
    return df
    def pd_validate_inputs(self, raw_required = False, xCalibre_required = False):
        parent_path = self.report_path.get()
        raw_path = self.raw_data_path.get()
        valid = 1
        master_folder = parent_path
        if not os.path.isdir(parent_path):
            self.log.append("Path invalid:" + master_folder)
            valid = 0

        #   assume that parent path is a project folder e.g. ...Desktop/LEX
        if self.report_name.get() == "":
            report_path = parent_path + "/reports"
            xCalibre_path = parent_path + "/xCalibreOutput"
            master_name = master_folder[master_folder.rfind('/')+1:]

        else:
            master_name = self.report_name.get()
            master_folder = parent_path + "/" + master_name
            report_path = master_folder + "/reports"
            xCalibre_path = master_folder + "/xCalibreOutput"
            #self.report_name.set(hfh.get_parent_folder(master_folder))
        if raw_required and raw_path == "":
            self.log.append("raw path required")
            valid = 0

        if raw_required and valid:
            data_files = hfh.get_all_files(raw_path, extension='txt')
            form_files = hfh.get_all_files(raw_path, extension='csv')
            if len(form_files) == 0:
                #   assume just xlsx files
                form_files = hfh.get_all_file_names_in_folder(raw_path, extension='xlsx')

            if data_files is None or form_files is None:
                valid = 0
                if data_files is None:
                    self.log.append("data files are missing")
                if form_files is None:
                    self.log.append("form files are missing")
            if valid:
                if len(data_files) == 0 or len(form_files) == 0:
                    valid = 0
                    self.log.append("Raw data does not contain both txt and csv files.")
                    self.log.append("Raw Path:" + raw_path)
                if len(data_files) != len(form_files):
                    valid = 0
                    self.log.append("There are unequal data and form files in raw data.")
                    self.log.append("Found " + str(len(data_files)) + " data files and " + str(len(form_files)) + " form files.")
                    self.log.append("data:\n" + '\n'.join(data_files))
                    self.log.append("form:\n" + '\n'.join(form_files))
                    self.log.append("Raw Path:" + raw_path)

                for file in data_files:
                    can_read = hfh.file_is_readable(file)
                    if not can_read:
                        valid = 0
                        self.log.append("read access denied for data file:" + file)

                for file in form_files:
                    can_read = hfh.file_is_readable(file)
                    if not can_read:
                        valid = 0
                        self.log.append("read access denied for form file:" + file)
                    if valid:
                        if hfh.get_extension(file) == 'csv':
                            test_df = hfh.get_df(file,header=0)
                        else:
                            test_df = hfh.get_df_from_xlsx(file)
                        required_columns = ["Domain", "AccNum", "CorrectAnswer"]
                        for column in required_columns:
                            if column not in test_df.columns:
                                self.log.append("______")
                                self.log.append(file)
                                self.log.append("pro exam file does not contain " + column + ".\nReset form and then download from Proexam.")
                                self.log.append("______")
                                valid = 0

        if valid:
            master_report_path = master_folder + "/" + 'reports'
            if not os.path.isdir(master_report_path):
                self.log.append("Path does not contain reports folder. \nPath:" + master_folder)
                valid = 0

        if valid:
            if not os.path.isdir(xCalibre_path) and xCalibre_required:
                self.log.append("Path does not contain xCalibreOutput folder. \nPath:" + master_folder)
                valid = 0

        if valid:
            if os.path.isfile(master_name):
                self.log.append("Folder name is a file. It should be a directory, i.e. no extension.")
                valid = 0

        if valid and raw_required:
            if not hr.get_confirm_on_pairing(raw_path):
                valid = 0
                self.log.append("User said pairing was wrong.")

        if valid and xCalibre_required:
            stats_files = hfh.get_all_file_names_in_folder(xCalibre_path, target_string="Stats")
            if len(stats_files) == 0:
                self.log.append("Report path does not contain xCalibreOutput reports")
                valid = 0

            #   check that can write reports
            aggregate_name = report_path + "/" + master_name + "_aggregate_.csv"
            complete_name = report_path + "/" + master_name + "_complete_.csv"

            if os.path.isfile(aggregate_name):
                if not hfh.file_is_writable(aggregate_name):
                    valid = 0
                    self.log.append("No access to " + aggregate_name)
            if os.path.isfile(complete_name):
                if not hfh.file_is_writable(complete_name):
                    valid = 0
                    self.log.append("No access to " + complete_name)

        if valid:
            self.log.append("validated call")
            self.d_log()
        return valid
def create_general_report_table(aggregate_cs_path, name, destination_path):
    flags = ['La', 'Lb', 'K', 'Ha', 'Hb']  # todo: sloppy
    suggestions = [
        "caution", "low_biserial", "negative_biserial", "high_B_range",
        "keyed_error", "good_irt"
    ]
    df = get_df(aggregate_cs_path, header=0)
    unique_df = unique(df['Item ID']).tolist()

    lines = [[
        "Item ID", "TR mean", "TR min", "TR max", "TR range", "SR mean",
        "SR min", "SR max", "SR range", "B mean", "B min", "B max", "B range",
        "count"
    ]]
    for item in flags:
        lines[0].append(item)
    for s in suggestions:
        lines[0].append(s)
    good_irt = 0
    cautions = 0
    for id in unique_df:
        subset = df[df['Item ID'] == id]
        s = subset.copy()
        TR = get_descriptives(s['T-Rpbis'])
        SR = get_descriptives(s['S-Rpbis'])
        B = get_descriptives(s['b'])
        flags = subset["Flags"]
        flag_count = get_flag_count(flags)
        suggestion = get_suggestions(SR, TR, B, flag_count)
        if suggestion[1]: good_irt += 1
        if suggestion[2]: cautions += 1
        suggestion = suggestion[0]
        line = [
            id, TR[0], TR[1], TR[2], TR[3], SR[0], SR[1], SR[2], SR[3], B[0],
            B[1], B[2], B[3],
            len(subset)
        ]
        for flag in flag_count:
            line.append(flag)
        for s in suggestion:
            line.append(s)
        lines.append(line)

    print("IRT:", good_irt, "of", len(lines), "=",
          str(round(good_irt * 100 / (len(lines) - 1), 2)) + "%")
    print("Cautions:", cautions, "of", len(lines), "=",
          str(round(cautions * 100 / (len(lines) - 1), 2)) + "%")
    report_df = pd.DataFrame(lines)
    final_report_name = destination_path + "/" + name + "_complete_.csv"
    pro_exam_report = destination_path + "/" + name + "_pro_exam_.txt"
    # header=False keeps the embedded header row (lines[0]) as the first line
    report_df.to_csv(final_report_name, index=False, header=False)
    report_df.to_csv(pro_exam_report, index=False, header=False)

    _df = pd.read_csv(final_report_name)
    # boolean masks keep matching rows even when other cells are blank
    caution = _df[_df['caution'] == True]
    irt = _df[_df['good_irt'] == True]
    caution.to_csv(destination_path + "/" + name + "_caution_.csv")
    irt.to_csv(destination_path + "/" + name + "_high_performers_for_irt_.csv")
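# create_general_report_table indexes get_descriptives(...) as
# [mean, min, max, range], matching the "TR mean"/"TR min"/"TR max"/"TR range"
# header above. A plausible minimal implementation, offered as an assumption
# about that helper rather than its actual source:
import pandas as pd

def get_descriptives(series):
    s = pd.to_numeric(series, errors='coerce')
    return [s.mean(), s.min(), s.max(), s.max() - s.min()]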
def create_CAL(project_path,
               processed_path=None,
               destination_path=None,
               pair_full=True,
               debug=True):
    DATA_FILE = 0
    CONTROL_FILE = 1
    if processed_path is None:
        processed_path = project_path + '/processed_data'
    data_files = hfh.get_all_file_names_in_folder(processed_path,
                                                  target_string='_f.csv')
    control_files = hfh.get_all_file_names_in_folder(processed_path,
                                                     target_string='_c.csv')
    df = pd.DataFrame([])
    control_dfs = []
    paired_files = hfh.pair_files(data_files,
                                  control_files,
                                  pair_full=pair_full)

    for pair in paired_files:
        print("CREATING CAL FROM PAIR: " + pair[0] + " " + pair[1])
        print(pair)
        control_path = pair[CONTROL_FILE]
        data_path = pair[DATA_FILE]
        f_df = hfh.get_df(data_path, header=None)
        c_df = hfh.get_df(control_path, header=None)

        control_dfs.append(c_df)
        f_df = get_strict_format_f_df(c_df, f_df)
        graded = strict_grade(f_df=f_df,
                              c_df=c_df,
                              operational=False,
                              correct='1',
                              incorrect='2')
        if graded is not False:
            df = pd.concat([df, graded], axis=0)

    print("replacing")
    if len(paired_files) > 0:
        c_df = pd.DataFrame([])
        c_df['AccNum'] = df.columns
        c_df['Key'] = '1'
        c_df['Options'] = '2'
        c_df['Domain'] = '1'
        c_df['Include'] = 'Y'
        c_df['Type'] = 'M'
        type = "_FINAL_"
        if destination_path.find('initial') > 0:
            type = "_INITIAL_"
        name = destination_path + '/' + hfh.get_stem(pair[0])[:3] + type

        print("replacing 2.0")
        df = df.replace(2.0, '2')
        print("replacing 1.0")
        df = df.replace(1.0, '1')
        print("filling NA")
        df = df.fillna(value='-')
        print("replacing empty with X")
        df = df.replace(" ", "X")
        print("writing csvs")
        df.to_csv(name + 'f.csv', index=True, header=False)
        c_df.to_csv(name + 'c.csv', index=False, header=False)

        return df, c_df
    else:
        print("failed to pair files in create_CAL")
def process_response_string_file(f_path,
                                 bank_path=None,
                                 destination_path=None,
                                 write_csv=False,
                                 get_df=True,
                                 create_c=True,
                                 paired_bank_xlsx=None):
    if create_c:
        assert destination_path is not None, "process response string needs to know where to put the processed data"
    name = hfh.get_stem(f_path)
    lines = hfh.get_lines(f_path)
    assert len(lines) > 0, "asked to process empty file:" + f_path

    c_df = None
    f_df = None

    if is_type_K(lines):
        processed_lines = processK(lines)
        f_df = processed_lines
    elif is_type_A(lines):
        processed_lines = processA(lines)
        c_df = processed_lines[0]
        f_df = processed_lines[1]
    elif is_type_B(lines):
        processed_lines = processB(lines)
        f_df = processed_lines
    elif is_type_C(lines):
        processed_lines = processC(lines)
        c_df = processed_lines[0]
        f_df = processed_lines[1]
    elif is_type_D(lines):
        processed_lines = processD(lines)
        f_df = processed_lines
    elif is_type_E(lines):
        processed_lines = processE(lines)
        c_df = processed_lines[0]
        f_df = processed_lines[1]
    elif is_type_F(lines):
        processed_lines = processF(lines)
        f_df = processed_lines
    elif is_type_G(lines):
        processed_lines = processG(lines)
        c_df = processed_lines[0]
        f_df = processed_lines[1]
    elif is_type_H(lines):
        processed_lines = processH(lines)
        f_df = processed_lines
    elif is_type_I(lines):
        processed_lines = processI(lines)
        f_df = processed_lines
    elif is_type_J(lines):
        processed_lines = processJ(lines)
        f_df = processed_lines

    else:
        print(f_path + " is already formatted")
        is_formatteed(lines)
        f_df = hfh.get_df(f_path)

    if c_df is not None and bank_path:
        # add AccNum instead of sequence
        b_df = create_c_df_from_bank(bank_path)
        b_df['Key'] = c_df['Key']
        c_df = b_df
    if c_df is None and bank_path is not None and create_c:
        #todo: consider respecting the correct answer at the time vs the bank or just destroy it
        bank_files = hfh.get_all_files(bank_path, extension='xlsx')
        pair = hfh.pair_files([f_path], bank_files)
        if len(pair) == 0:
            print(
                "could not find matching bank file and no default control information present."
            )
        if len(pair) == 1:
            # todo: may evaluate differences between bank and response string if desired
            c_df = create_c_df_from_bank(pair[0][1])
        if len(pair) > 1:
            print("more than one file matched for bank", f_path)

    #   confirm the ID column is the index
    id_col = 0 if 0 in f_df.columns else ('0' if '0' in f_df.columns else None)
    if id_col is not None:
        f_df = f_df.set_index(f_df[id_col], drop=True)
        f_df = f_df.drop(columns=id_col)
    if write_csv:
        #todo changed index... need to make sure all processed items spit out the same... in this case they are pre-strict.

        f_df.to_csv(destination_path + '/' + name + '_f.csv',
                    index=True,
                    header=False)
        if c_df is not None:
            c_df.to_csv(destination_path + '/' + name + '_c.csv',
                        index=False,
                        header=False)
    if get_df:
        return f_df
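# The is_type_X / processX ladder above can be collapsed into a dispatch table,
# which keeps detection order explicit and makes adding a format a one-line
# change. A sketch, assuming the detector/processor pairs already defined in
# this module; the third tuple element records whether the processor also
# yields a c_df:
_FORMAT_DISPATCH = [
    (is_type_K, processK, False),
    (is_type_A, processA, True),
    (is_type_B, processB, False),
    (is_type_C, processC, True),
    (is_type_D, processD, False),
    (is_type_E, processE, True),
    (is_type_F, processF, False),
    (is_type_G, processG, True),
    (is_type_H, processH, False),
    (is_type_I, processI, False),
    (is_type_J, processJ, False),
]

def _detect_and_process(lines):
    # returns (c_df, f_df); (None, None) means no known format matched
    for detect, process, yields_c in _FORMAT_DISPATCH:
        if detect(lines):
            out = process(lines)
            return (out[0], out[1]) if yields_c else (None, out)
    return None, None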
def create_upload_from_processed(c_file,
                                 f_file,
                                 path=None,
                                 c_has_header=True,
                                 to_csv=False):
    #todo: decide if _c files have headers or not...
    #todo: perhaps a different _x to indicate header or not... the only time I don't want a header is xCalibre...

    if c_has_header:
        c_df = hfh.get_df(c_file, header=0)
    else:
        c_df = hfh.get_df(c_file)

    f_df = hfh.get_df(f_file, index_col=0)
    stats_df = hfh.pd.DataFrame([])
    stats_df['AccNum'] = c_df.iloc[:, 0]
    graded_df = hr.grade_examination(f_df,
                                     c_df,
                                     grading_processed=True,
                                     correct=1,
                                     incorrect=0)

    score = graded_df.sum(axis=1)


    A = get_option_df(f_df, 'A')
    B = get_option_df(f_df, 'B')
    C = get_option_df(f_df, 'C')
    D = get_option_df(f_df, 'D')

    options = ['A', 'B', 'C', 'D']
    dfs = [A, B, C, D]

    # item-level N: count of non-missing responses per item
    N = ~f_df.isna()
    N = N.sum()
    N = N.reset_index(drop=True)

    for option, df in zip(options, dfs):
        a_ret = []
        b_ret = []
        c_ret = []

        for column in df.columns:
            mask = df[column] == 1
            # mean of the graded matrix over examinees who endorsed this option
            mean_score = graded_df[mask].mean().mean()
            c_ret.append(mean_score)
            pbis = df[column].corr(score)
            endorse_p = df[column].sum() / df.shape[0]
            a_ret.append(pbis)
            b_ret.append(endorse_p)
        stats_df[option + '_r'] = hfh.pd.Series(a_ret, index=stats_df.index)
        stats_df[option + '_p'] = hfh.pd.Series(b_ret, index=stats_df.index)
        stats_df[option + '_m'] = hfh.pd.Series(c_ret, index=stats_df.index)

    k_ret = []
    for i in range(graded_df.shape[1]):
        pbis = graded_df[graded_df.columns[i]].corr(score)
        k_ret.append(pbis)
    stats_df['K_r'] = hfh.pd.Series(k_ret, index=stats_df.index)
    stats_df['KEY'] = c_df['Key']
    stats_df['N'] = N
    p = graded_df.mean(axis=0)
    stats_df = stats_df.set_index('AccNum', drop=True)
    stats_df['P'] = p
    if path is None:
        name = hfh.get_stem(f_file)[:-2] + '_P.csv'
    else:
        name = path + '/' + hfh.get_stem(f_file)[:-2] + '_P.csv'
    stats_df = stats_df[[
        'KEY', 'K_r', 'P', 'A_p', 'A_r', 'A_m', 'B_p', 'B_r', 'B_m', 'C_p',
        'C_r', 'C_m', 'D_p', 'D_r', 'D_m', 'N'
    ]]
    if to_csv:
        stats_df.to_csv(name)
    return stats_df
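# create_upload_from_processed treats get_option_df(f_df, 'A') as a 0/1
# indicator frame: a 1 marks an examinee who chose that option for that item.
# A minimal implementation consistent with how it is used above (masking,
# endorsement proportions, option-score correlations), stated as an assumption:
def get_option_df(f_df, option):
    # 1 where the raw response equals the option letter, else 0
    return (f_df == option).astype(int)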
def get_residuals_from_item_person(irt_folder, report_name):
    '''
    #   get df for factor analysis

    #   irt_folder has reports, xCalibreOutput and processed_data
    #   get theta from matrix
    #   get items from Stats    *not really needed for this analysis* only relevant if looking up aggregate
    #   get difficulties from Stats
    #       a different analysis would get difficulty from aggregate

    #   general process thoughts:
    #       I think I just check the raw score and then associate.
    #       residual = observed - expected (0-1)
    #       observed is 1 or 0 and comes from matrix file
    #       expected is calculated from difficulty and theta
    #       results in list of residuals which will be a row

    #  steps
        #   get matrix
        #   for person
        #       calculate person score
        #       get theta for person
        #       check each item for residual needs item difficulty
        #       write results to row
    '''

    matrix_file = irt_folder + "/xCalibreOutput/" + report_name + ' Matrix.txt'
    if os.path.isfile(matrix_file):
        matrix_df = h.convert_xCalibre_matrix_for_PCI(matrix_file, include_id=True)
        matrix_df = matrix_df.apply(pd.to_numeric, errors='ignore')

    else:
        print("matrix file does not exist")
        return 0

    test_df = matrix_df
    test_df = test_df.drop(columns=0)
    test_df.to_csv("fa_test.csv")
    r = s.run_factor_analysis(test_df)
    test_df['ID'] = matrix_df[0]
    matrix_df = matrix_df.drop(columns=0)
    matrix_df['SCORE'] = matrix_df.sum(axis=1)
    matrix_df['ID'] = test_df['ID']

    cleaned_stats = irt_folder + "/xCalibreOutput/" + report_name + ".cleaned_stats"
    if os.path.isfile(cleaned_stats):
        stats_df = hfh.get_df(cleaned_stats, header=0)
    else:
        unclean_path = irt_folder + '/' + "xCalibreOutput/" + report_name + " Stats.csv"
        stats_df = h.clean_stats_csv(unclean_path, get_df=True)

    stats_df = stats_df.apply(pd.to_numeric, errors='ignore')
    tif_path = irt_folder + "/xCalibreOutput/" + report_name + " TIF.csv"
    tif_df = hfh.get_df(tif_path, header=0)
    tif_df = tif_df.apply(pd.to_numeric, errors='ignore')
    matrix_df = set_theta_from_score_in_matrix_df(tif_df, matrix_df)

    residuals_df = s.get_residuals(matrix_df, stats_df)
    residuals_df = residuals_df.apply(pd.to_numeric, errors='ignore')
    residuals_df = residuals_df.reset_index()
    residuals_df = residuals_df.drop(columns='index')
    residuals_df.to_csv("resid_test.csv", index=True)
    residuals_df.to_excel("resid_test.xlsx", index=True)
    print(can_use_factor_analysis(residuals_df))
    again = s.run_factor_analysis(residuals_df)
    again_df = pd.DataFrame(again).head(5)
    return residuals_df
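# The notes at the top of get_residuals_from_item_person pin the recipe down:
# observed (0/1) comes from the matrix, expected comes from theta and item
# difficulty, and residual = observed - expected. Under a Rasch model the
# expected probability is 1 / (1 + exp(-(theta - b))). A self-contained sketch
# of the per-person residual row described in the steps; names are illustrative:
import numpy as np

def rasch_residual_row(observed, theta, difficulties):
    # observed: 0/1 item scores for one person; difficulties: item b values
    expected = 1.0 / (1.0 + np.exp(-(theta - np.asarray(difficulties, dtype=float))))
    return np.asarray(observed, dtype=float) - expected

# e.g. rasch_residual_row([1, 0, 1], theta=0.4, difficulties=[-1.0, 0.0, 1.2])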
def process_response_string_for_classical_upload(data_file, control_file):
    data_df = hfh.get_df(data_file)
    control_df = hfh.get_df(control_file)