def create_FINAL_from_CULLED(project_path):
    """Restrict the calibration CAL_f/CAL_c files to items listed in the CULLED report.

    Expects exactly one CULLED.csv in <project>/reports and exactly one
    CAL_f.csv / CAL_c.csv in <project>/calibration. Writes
    <stem>_FINAL_f.csv and <stem>_FINAL_c.csv to the calibration folder.

    Returns True on success, False when the expected files are not found.
    """
    culled_path = hfh.get_all_files(project_path + '/reports', target_string='CULLED.csv')
    cal_f_path = hfh.get_all_files(project_path + '/calibration', target_string='CAL_f.csv')
    cal_c_path = hfh.get_all_files(project_path + '/calibration', target_string='CAL_c.csv')
    # product == 1 iff each search found exactly one file
    if len(culled_path) * len(cal_f_path) * len(cal_c_path) == 1:
        f_df = hfh.get_df(cal_f_path[0])
        c_df = hfh.get_df(cal_c_path[0])
        culled_df = hfh.get_df(culled_path[0], header=0)
    else:
        print("invalid call CULLED, CAL_f and CAL_c")
        return False
    # first column of the f file holds candidate ids; set them aside while the
    # frame is transposed to one-row-per-item so items can be filtered.
    ids = f_df.iloc[:, 0]
    f_df = f_df.drop(columns=[0])
    f_df = f_df.T
    f_df = f_df.reset_index()
    f_df = f_df.drop(columns='index')
    c_df.columns = ['AccNum', 'B', 'C', 'D', 'E', 'F']
    f_df.insert(0, 'AccNum', c_df['AccNum'])
    # keep only items that survived the cull
    c_df = c_df[c_df['AccNum'].isin(culled_df['AccNum'])]
    f_df = f_df[f_df['AccNum'].isin(culled_df['AccNum'])]
    f_df = f_df.drop(columns=['AccNum'])
    # transpose back to one-row-per-candidate and restore the ids
    f_df = f_df.T
    f_df.insert(0, 'ID', ids)
    name = hfh.get_stem(cal_f_path[0])[:-6]
    path = project_path + '/calibration/' + name + '_FINAL_'
    f_df.to_csv(path + 'f.csv', header=False, index=False)
    c_df.to_csv(path + 'c.csv', header=False, index=False)
    # BUG FIX: removed leftover debug print('hello'); signal success explicitly
    return True
def add_new_form_to_data(new_c_path, new_f_path, calibrated_c_path, calibrated_f_path, calibrated_stats):
    """Merge a new form's control/formatted files into the calibrated set.

    Writes COMBINED_F.csv / COMBINED_C.csv to the working directory.
    Returns True on success, False when the per-candidate score check fails.
    """
    checking = True
    new_f_df = hfh.get_df(new_f_path)
    new_c_df = hfh.get_df(new_c_path)
    cal_f_df = hfh.get_df(calibrated_f_path)
    cal_c_df = hfh.get_df(calibrated_c_path)
    c_ids = cal_f_df.iloc[:, 0]
    combined_c = update_control(new_c_df, cal_c_df, calibrated_stats)
    combined_f = update_formatted(combined_c, new_f_df, new_c_df, cal_f_df, cal_c_df)
    if checking:
        # Sanity check: each calibrated candidate's score must be unchanged by
        # the merge. Graded values appear to use 2 = incorrect, so 2 -> 0
        # before summing gives number correct. (assumes that coding — TODO confirm)
        check = cal_f_df.drop(columns=0)
        check = check.apply(hfh.pd.to_numeric, errors='coerce')
        check = check.replace(2.0, 0.0)
        check['SCORE'] = check.sum(axis=1)
        check['ID'] = c_ids
        # BUG FIX: take an explicit copy; the original assigned into a slice
        # via check_it[['SCORE_2']] which triggers chained-assignment behavior
        check_it = check[['ID', 'SCORE']].copy()
        check = combined_f.drop(columns=0)
        check = check.apply(hfh.pd.to_numeric, errors='coerce')
        check = check.replace(2.0, 0.0)
        check['SCORE'] = check.sum(axis=1)
        check_it['SCORE_2'] = check['SCORE']
        mismatched = check_it[check_it['SCORE_2'] != check_it['SCORE']]
        if len(mismatched) > 0:
            print("ERROR IN COMBINATION!!!!")
            return False
    combined_f.to_csv("COMBINED_F.csv", index=False, header=None)
    combined_c.to_csv("COMBINED_C.csv", index=True, header=None)
    return True
def process_paired_files(pe_file, data_file, project_folder, select_Domain=False):
    """Convert a pro-exam form file + raw data file into iteman-style processed data.

    Writes <stem>_c.csv to <project>/processed_data and converts the data file
    via h.convert_default_data_to_iteman. When select_Domain is given, items
    outside that domain are marked include='N'.

    Returns 1 on completion.
    """
    # todo: add in domain processing here so that sets without domain names are still separated
    if hfh.get_extension(pe_file) == 'xlsx':
        pe_df = hfh.get_df_from_xlsx(pe_file)
    else:
        pe_df = hfh.get_df(pe_file, header=0)
    pe_df.Position = pe_df.Position.astype(float)
    pe_df = pe_df.sort_values("Position", ascending=True)
    pe_df = pe_df[["AccNum", 'CorrectAnswer', 'Domain']]
    # defaults for the control-file columns
    pe_df['number_of_options'] = 4
    pe_df['group'] = 1
    pe_df["include"] = 'Y'
    pe_df['type'] = 'M'
    # BUG FIX: the default include='Y' must be set BEFORE the domain filter;
    # previously it was assigned afterwards and overwrote every 'N' mark.
    if select_Domain:
        pe_df.loc[pe_df.Domain != select_Domain, 'include'] = 'N'
    pe_df = pe_df.drop(['Domain'], axis=1)
    pe_df = pe_df[['AccNum', 'CorrectAnswer', 'number_of_options', 'group', 'include', 'type']]
    processed_path = project_folder + "/processed_data/"
    c_path = processed_path + hfh.get_stem(pe_file) + "_c.csv"
    pe_df.to_csv(c_path, header=False, index=False)
    h.convert_default_data_to_iteman(data_file, processed_path, new_name=hfh.get_stem(pe_file))
    return 1
def get_f_df_repeat_status(f_path):
    """Return ids tagged with repeat status ('<id>_F' first attempt, '<id>_R' repeat).

    Repeat information can live in several places:
      - end of the response string
      - pearson file type, third column (type K)
      - ... other layouts not yet encountered
    """
    ids_with_repeat_status = []
    lines = hfh.get_lines(f_path)
    if is_type_K(lines):
        df = hfh.get_df(f_path, header=0)
        df = df.drop(0)
        df['Attempt'] = df['Attempt'].replace(['1'], 'F')
        df['Attempt'] = df['Attempt'].replace(['2'], 'R')
        ids_with_repeat_status = df['ClientID'] + '_' + df['Attempt']
    else:
        for line in lines:
            line = line.strip()
            if not line:
                # BUG FIX: a blank/trailing empty line previously raised
                # IndexError on [-1]
                continue
            ending_character = line[-1]
            if ending_character in ['F', 'R']:
                repeat_status = ending_character
                split_line = line.split()
                _id = None
                if len(split_line) > 1:
                    _id = split_line[0]
                else:
                    # fall back to comma-delimited layout
                    split_line = line.split(',')
                    if len(split_line) > 1:
                        _id = split_line[0]
                if _id is None:
                    assert False, "can not assign repeat status to file " + f_path
                ids_with_repeat_status.append(_id + '_' + repeat_status)
    ret = process_response_string_file(f_path, create_c=False)
    f_df = ret
    # NOTE(review): the re-indexed frame is discarded; only the id list is
    # returned — confirm whether f_df was meant to be returned as well.
    f_df = f_df.set_index(ids_with_repeat_status)
    return ids_with_repeat_status
def check_p(pe_file, data_file, threshold=.4):
    """Return True when the overall proportion-correct across all candidates
    and items is at least *threshold*, else False.

    Row 1 of the transposed pe file is taken as the answer key.
    """
    pe_df = hfh.get_df(pe_file)
    d_df = h.convert_response_string_to_csv_and_get_df(data_file, id_length=8, number_of_spaces=3).T
    key = pe_df.T.iloc[1]
    # BUG FIX: the original appended the key as a 'correct' column to d_df and
    # then iterated range(d_df.shape[1]), so the key column itself was graded
    # as one extra (perfect-scoring) test taker.
    n_items = key.size
    attempted = 0
    correct = 0
    for taker in range(d_df.shape[1]):
        attempted += n_items
        for item_index in range(n_items):
            if d_df.iloc[item_index, taker] == key.iloc[item_index]:
                correct += 1
    if attempted == 0:
        return False  # no responses at all: treat as below threshold
    # BUG FIX: p was previously re-evaluated after each candidate, so an early
    # low scorer could fail the whole file before all data was counted.
    return correct / attempted >= threshold
def create_reports(path_to_stats, destination_path, is_cs=False):
    """Write high/low/flagged/aggregate item reports for every stats file found.

    When is_cs is True the source files are pre-cleaned '_cs' csvs; otherwise
    raw xCalibre 'Stats' files are cleaned first via convert_stats_to_df.
    """
    # todo: make this smarter. Look through for any stats files in folder or subfolders and then munch on them.
    destination_path += '/'
    target = "_cs" if is_cs else "Stats"
    files = get_all_file_names_in_folder(path_to_stats, target_string=target)
    for file in files:
        df = get_df(file, header=0) if is_cs else convert_stats_to_df(file, destination_path)
        # parameters = get_parameters(df)
        reports = {
            "_high_.csv": get_top_n_rpbis(df, 20),
            "_low_.csv": get_low_performing_items(df),
            "_flagged_.csv": get_flagged_items(df),
            "_agg.csv": get_aggregate_report_items(df),
        }
        base = destination_path + get_stem(file) + "_c"
        for suffix, report in reports.items():
            report.to_csv(base + suffix, index=False)
def create_form_from_c(c_file, destination_path):
    """Derive a <stem>_L.csv form listing (Sequence, AccNum) from a _c file."""
    c_df = hfh.get_df(c_file)
    form = hfh.pd.DataFrame({'AccNum': c_df[0]})
    # sequence numbers are 1-based positions
    form.insert(0, 'Sequence', form.index.values + 1)
    out_name = hfh.get_stem(c_file)[:-2] + '_L.csv'
    form.to_csv(destination_path + '/' + out_name, header=True, index=False)
def merge_control_and_bank_info(control_path, bank_path):
    """Replace sequence numbers in a control file with bank item ids.

    A control file that has not been merged yet still carries raw sequence
    numbers, so row 1 of its 'sequence' column is expected to be 2. If it is
    not, the file is assumed to be merged already and is only validated.
    """
    control_df = hfh.get_df(control_path, header=0)
    bank_df = hfh.get_df(bank_path, header=0)
    # check that id is not already merged
    sequence_value = control_df['sequence'][1]
    # BUG FIX: compare/concatenate via str() — pandas may infer the column as
    # int, in which case `sequence_value == str(2)` was always False and the
    # `sequence_value + ", ..."` concatenation raised TypeError.
    if str(sequence_value) != str(2):
        print(control_path, "expected sequence = 2 got", str(sequence_value) + ", file was not merged. Check for repairs.")
        valid = is_valid_control(control_path)
        if valid:
            print(control_path + " is a valid control file.")
    else:
        new = pd.merge(control_df, bank_df, on='sequence', how='right')
        new = new[['bank_id', 'key', 'number_of_options', 'group', 'include', 'scoring_option']]
        new.to_csv(control_path, header=None, index=False)
def convert_stats_to_df(path_to_stats, destination_path):
    """Extract the stats table from an xCalibre Stats file.

    Saves the extracted table as <destination>/<stem>_cs.csv and returns it as
    a DataFrame, or None when extraction fails.
    """
    # BUG FIX: previously a bare `except: pass` left df unbound, so the final
    # `return df` raised NameError whenever extraction failed.
    df = None
    first_line = get_first_line_of_stats(path_to_stats)
    try:
        blankLine = get_next_blank_line_after_index(path_to_stats, first_line - 1)
        f = get_lines_from_X_to_Y_from_file(path_to_stats, first_line - 1, blankLine)
        name = destination_path + get_stem(path_to_stats) + "_cs.csv"
        write_lines_to_text(f, name)
        df = get_df(name, header=0, dtype="string")
    except Exception as e:
        # narrowed from bare except; report instead of swallowing silently
        print("convert_stats_to_df failed for", path_to_stats, ":", e)
    return df
def validate_c_file_header(c_file, debug=True):
    """Ensure a _c csv starts with the standard AccNum header, adding it if missing.

    Returns True once the header is present; asserts on an empty file or a
    file carrying two headers.
    """
    # first line should be AccNum...
    lines = hfh.get_lines(c_file)
    if debug:
        print("validating: " + c_file)
    assert len(lines) > 0, "validate_c_file was fed an empty file"
    if lines[0] == E.C_HEADER_S + '\n':
        return True
    # BUG FIX: guard against a single-line file before peeking at lines[1]
    if len(lines) > 1 and lines[1] == E.C_HEADER_S + '\n':
        assert False, "validate_c_file detected 2 headers"
    # no header present: rewrite the file with the standard header row
    df = hfh.get_df(c_file)
    df.columns = E.C_HEADER_L
    df.to_csv(c_file, index=False)
    return True
def create_item_level_analysis(stats_R_path, stats_2_path, tif_path, matrix_path, bank_path, passing_proportion):
    """Run a battery of item-level IRT analyses (residuals, fit, discrimination,
    per-domain count summaries, max info).

    NOTE(review): every result is bound to a throwaway local and never returned
    or written — presumably unfinished, or exercised purely for side effects of
    the helpers. Confirm before relying on this.
    """
    bank = hfh.get_df_from_xlsx(bank_path)
    stats_R_df = hfh.get_stats_df(stats_R_path, bank)
    stats_2_df = hfh.get_stats_df(stats_2_path, bank)
    tif_df = hfh.get_df(tif_path, header=0)
    matrix_df = core.h_format_manipulators.convert_xCalibre_matrix_for_PCI(matrix_path)
    a = ha.get_residuals_by_domain(matrix_df, stats_R_df, tif_df)
    b = ha.evaluate_in_fit_out_fit(stats_R_df)
    c = ha.evaluate_item_discriminatsion(stats_2_df)  # (sic) helper name as spelled in the project
    d = hr.get_count_summary(b, 'Domain', 'IN_DEV_EVAL')
    e = hr.get_count_summary(b, 'Domain', 'OUT_DEV_EVAL')
    f = hr.get_count_summary(b, 'Domain', 'B_EVAL')
    g = hr.get_count_summary(c, 'Domain', 'A_DEV_EVAL')
    # NOTE(review): local 'h' shadows the module-style name 'h' used elsewhere in this file
    h = ha.get_residuals_by_difficulty(matrix_df, stats_R_df, tif_df, a[1])
    j = ha.evaluate_max_info(stats_R_df, tif_df, passing_proportion)
def remove_accNum_from_f_and_c(accNum, name, program_path, reason=None):
    """Remove one item (by AccNum) from a form's processed _f/_c files and log the removal.

    The processed files are rewritten in place; the removal (with optional
    reason) is appended to the removed-items report, which is created on first use.
    """
    # create backup_processed_data folder
    report_folder = program_path + '/reports'  # NOTE(review): unused local
    backup_processed_folder = program_path + '/' + E.BACKUP_PROCESSED_DATA_P
    processed_folder = program_path + '/processed_data'
    hfh.create_dir(backup_processed_folder)
    # create notation of removal with reason
    f_df = hfh.get_single_file(processed_folder, target_string=name + '_f.csv', as_df=True, strict=True)
    c_file = hfh.get_single_file(processed_folder, target_string=name + '_c.csv', strict=True)
    c_df = hfh.get_df(c_file, header=get_header_argument(c_file))
    s_ret = get_strict_format_f_df(c_df, f_df, get_c_df=True)
    c_df = s_ret[0]
    # transpose so item rows can be dropped by AccNum label
    f_df = s_ret[1].T
    f_df = f_df.drop(accNum)
    c_df = c_df.set_index(['AccNum'])
    c_df = c_df.drop(accNum)
    c_df = c_df.reset_index(drop=False)
    f_df = f_df.T
    strict_grade(c_df, f_df, operational=False)  # solely for validation
    f_df.to_csv(program_path + '/processed_data/' + name + '_f.csv', header=None, index=True)
    c_df.to_csv(program_path + '/processed_data/' + name + '_c.csv', index=False, header=None)
    removed_report_path = hfh.get_all_files(program_path + "/" + E.REPORTS_P + '/', target_string=E.REMOVED_ITEMS_R)
    entry = accNum + " was removed from " + name
    if reason is not None:
        entry += " because of a " + reason
    if len(removed_report_path) == 0:
        # report does not exist yet: create it with this first entry
        removed_report_path = program_path + "/" + E.REPORTS_P + '/' + E.REMOVED_ITEMS_R
        hfh.write_lines_to_text([entry + '\n'], removed_report_path)
    else:
        hfh.add_lines_to_csv(removed_report_path[0], [entry])
def update_c_from_bank(project_path):
    """Refresh the AccNum column of every processed _c file from its paired bank xlsx.

    Assumes the updated _c files carry positions instead of AccNums; each _c
    file is rewritten in place (no header, no index).
    """
    bank_directory = project_path + '/bank_files'
    processed_directory = project_path + '/processed_data'
    c_files = hfh.get_all_files(processed_directory, target_string='_c.csv')
    b_files = hfh.get_all_files(bank_directory, target_string='.xlsx')
    column_names = ['AccNum', 'Key', 'Options', 'Domain', 'Include', 'Type']
    for c_file, b_file in hfh.pair_files(c_files, b_files):
        c_df = hfh.get_df(c_file)
        b_df = pd.read_excel(b_file)
        c_df.columns = column_names
        # positional overwrite: row order is assumed to match the bank file
        c_df['AccNum'] = b_df['AccNum']
        c_df = c_df[column_names]
        c_df.to_csv(c_file, index=None, header=None)
def grade_responses(data_file, control_file):
    # todo: this has been back burnered so assumption item assessment can be finished.
    # this assumes that data has been processed
    # NOTE(review): work in progress — the loops only print; no grading is
    # actually performed yet and nothing is returned.
    a = os.path.isfile(data_file)
    b = os.path.isfile(control_file)
    data_df = h.get_data_df(data_file)
    control_df = hfh.get_df(control_file)
    ret_df = data_df
    answers = control_df.iloc[:, 1]
    ids = data_df.iloc[:, 0]
    data_df = data_df.drop(columns=0)
    data_df.index = data_df.index + 1
    # NOTE(review): loop bounds look transposed (shape[0] vs shape[1]) — verify
    # intent before finishing this function.
    for col in range(data_df.shape[0] - 1):
        for row in range(data_df.shape[1]):
            individuals_answers = data_df.loc[row + 1]
            # data_df[data_df.loc[row+1]==individuals_answers,'q'+str(row+1)] = 1
            # b = control_df.iloc[row,1]
            print(a, b)
            #ret_df.loc[row][data_df[col] == control_df.iloc[row,1], "q"+str(col)] = 1
    print("hello")
def create_key_df_from_csv(file_path):
    """Build a key DataFrame (form, test_id, subject, bank_id_number, bank_id).

    Assumes a pro-exam style csv, e.g.:
        Position,Domain,AccNum,UseCode,CorrectAnswer,Content Area,...
        1,02,LLE669,Operational,C,0202,44,,56,0.18,...
    The AccNum is split into a 3-character subject prefix and the remaining
    bank id number.
    """
    df = hfh.get_df(file_path, header=0)
    form = hfh.get_stem(file_path)  # hoisted: invariant across rows
    rows = []
    # zip Position/AccNum pairs instead of indexing two parallel lists
    for test_id, bank_id in zip(df['Position'].tolist(), df['AccNum'].tolist()):
        rows.append([form, test_id, bank_id[:3], bank_id[3:], bank_id])
    key_df = pd.DataFrame(rows)
    key_df.columns = ['form', 'test_id', 'subject', 'bank_id_number', 'bank_id']
    # name = hfh.create_name("silly","LPCC_IRT/keys/L_files","csv",'_L')
    # df.to_csv(name, index=None)
    return key_df
def pd_validate_inputs(self, raw_required = False, xCalibre_required = False):
    """Validate the GUI-selected project/report/raw-data paths before processing.

    Appends every problem found to self.log, flushes the log via self.d_log(),
    and returns 1 (valid) or 0 (invalid).
    """
    parent_path = self.report_path.get()
    raw_path = self.raw_data_path.get()
    valid = 1
    master_folder = parent_path
    if not os.path.isdir(parent_path):
        self.log.append("Path invalid:" + master_folder)
        valid = 0
    # assume that parent path is a project folder e.g. ...Desktop/LEX
    # NOTE(review): 'get' is not called here — this compares a bound method to
    # "" and is therefore always False, so the first branch never runs.
    # Likely should be self.report_name.get() == "". Flagged, not changed.
    if self.report_name.get == "":
        report_path = parent_path + "/reports"
        xCalibre_path = parent_path + "/xCalibreOutput"
        master_name = master_folder[master_folder.rfind('/')+1:]
    else:
        master_name = self.report_name.get()
        master_folder = parent_path + "/" + master_name
        report_path = master_folder + "/reports"
        xCalibre_path = master_folder + "/xCalibreOutput"
        #self.report_name.set(hfh.get_parent_folder(master_folder))
    if raw_required and raw_path == "":
        self.log.append("raw path required")
        valid = 0
    if raw_required and valid:
        data_files = hfh.get_all_files(raw_path, extension='txt')
        form_files = hfh.get_all_files(raw_path, extension='csv')
        if len(form_files) == 0:
            # assume just xlsx files
            form_files = hfh.get_all_file_names_in_folder(raw_path, extension='xlsx')
        if data_files is None or form_files is None:
            valid = 0
            if data_files is None:
                self.log.append("data files are missing")
            if form_files is None:
                self.log.append("form files are missing")
        if valid:
            if len(data_files) == 0 or len(form_files) == 0:
                valid = 0
                self.log.append("Raw data does not contain both txt and csv files.")
                self.log.append("Raw Path:" + raw_path)
            if not len(data_files) == len(form_files):
                # report the mismatch in detail so the user can repair pairing
                valid = 0
                self.log.append("There are unequal data and form files in raw data.")
                self.log.append("Found " + str(len(data_files)) + " data files and " + str(len(form_files)) + " form files.")
                d = "data:\n"
                fm = "form:\n"
                for f in data_files:
                    d += f + '\n'
                for f in form_files:
                    fm += f + '\n'
                self.log.append(d)
                self.log.append(fm)
                self.log.append("Raw Path:" + raw_path)
            for file in data_files:
                can_read = hfh.file_is_readable(file)
                if not can_read:
                    valid = 0
                    self.log.append("read access denied for data file:" + file)
            for file in form_files:
                can_read = hfh.file_is_readable(file)
                if not can_read:
                    valid = 0
                    self.log.append("read access denied for form file:" + file)
                if valid:
                    # form files must carry the pro-exam columns
                    if hfh.get_extension(file) == 'csv':
                        test_df = hfh.get_df(file, header=0)
                    else:
                        test_df = hfh.get_df_from_xlsx(file)
                    required_columns = ["Domain", "AccNum", "CorrectAnswer"]
                    for column in required_columns:
                        if column not in test_df.columns:
                            self.log.append("______")
                            self.log.append(file)
                            self.log.append("pro exam file does not contain " + column + ".\nReset form and then download from Proexam.")
                            self.log.append("______")
                            valid = 0
    if valid:
        master_report_path = master_folder + "/" + 'reports'
        if not os.path.isdir(master_report_path):
            self.log.append("Path does not contain reports folder. \nPath:" + master_folder)
            valid = 0
    if valid:
        if not os.path.isdir(xCalibre_path) and xCalibre_required:
            self.log.append("Path does not contain xCalibreOutput folder. \nPath:" + master_folder)
            valid = 0
    if valid:
        if os.path.isfile(master_name):
            self.log.append("Folder name is a file. It should be a directory, i.e. no extension.")
            valid = 0
    if valid and raw_required:
        if not hr.get_confirm_on_pairing(raw_path):
            valid = 0
            self.log.append("User said pairing was wrong.")
    if valid and xCalibre_required:
        stats_files = hfh.get_all_file_names_in_folder(xCalibre_path, target_string="Stats")
        if len(stats_files) == 0:
            self.log.append("Report path does not contain xCalibreOutput reports")
            valid = 0
    # check that can write reports
    aggregate_name = report_path + "/" + master_name + "_aggregate_.csv"
    complete_name = report_path + "/" + master_name + "_complete_.csv"
    if os.path.isfile(aggregate_name):
        if not hfh.file_is_writable(aggregate_name):
            valid = 0
            self.log.append("No access to " + aggregate_name)
    if os.path.isfile(complete_name):
        if not hfh.file_is_writable(complete_name):
            valid = 0
            self.log.append("No access to " + complete_name)
    if valid:
        self.log.append("validated call")
    self.d_log()
    return valid
def create_general_report_table(aggregate_cs_path, name, destination_path):
    """Summarize per-item descriptives (T-Rpbis, S-Rpbis, b) across administrations
    and write the complete / pro-exam / caution / high-performer reports.

    Reads the aggregate _cs csv, groups rows by 'Item ID', and appends flag
    counts and rule-based suggestions to each summary row.
    """
    flags = ['La', 'Lb', 'K', 'Ha', 'Hb']
    # todo: sloppy
    suggestions = ["caution", "low_biserial", "negative_biserial", "high_B_range", "keyed_error", "good_irt"]
    df = get_df(aggregate_cs_path, header=0)
    unique_df = unique(df['Item ID']).tolist()
    # header row; flag and suggestion column names are appended below
    lines = [["Item ID", "TR mean", "TR min", "TR max", "TR range", "SR mean", "SR min", "SR max", "SR range", "B mean", "B min", "B max", "B range", "count"]]
    for item in flags:
        lines[0].append(item)
    for s in suggestions:
        lines[0].append(s)
    good_irt = 0
    cautions = 0
    pro_exam_lines = []  # NOTE(review): never populated — dead local
    for id in unique_df:
        subset = df[df['Item ID'] == id]
        s = subset.copy()
        TR = get_descriptives(s['T-Rpbis'])
        SR = get_descriptives(s['S-Rpbis'])
        B = get_descriptives(s['b'])
        # NOTE(review): rebinding 'flags' here shadows the flag-name list above
        flags = subset["Flags"]
        flag_count = get_flag_count(flags)
        suggestion = get_suggestions(SR, TR, B, flag_count)
        # suggestion is (row_values, is_good_irt, is_caution) — per usage below
        if suggestion[1]:
            good_irt += 1
        if suggestion[2]:
            cautions += 1
        suggestion = suggestion[0]
        line = [id, TR[0], TR[1], TR[2], TR[3], SR[0], SR[1], SR[2], SR[3], B[0], B[1], B[2], B[3], len(subset)]
        for flag in flag_count:
            line.append(flag)
        for s in suggestion:
            line.append(s)
        lines.append(line)
    # len(lines) includes the header row, hence the -1 in the percentages
    print("IRT:", good_irt, "of", len(lines), "=", str(round(good_irt * 100 / (len(lines) - 1), 2)) + "%")
    print("Cautions:", cautions, "of", len(lines), "=", str(round(cautions * 100 / (len(lines) - 1), 2)) + "%")
    report_df = pd.DataFrame(lines)
    final_report_name = destination_path + "/" + name + "_complete_.csv"
    pro_exam_report = destination_path + "/" + name + "_pro_exam_.txt"
    report_df.to_csv(final_report_name, index=False, header=0)
    report_df.to_csv(pro_exam_report, index=False, header=0)
    # round-trip through csv so the first row becomes the header
    _df = pd.read_csv(final_report_name)
    for i in _df:
        print(i)  # NOTE(review): debug print of column names
    caution = _df.where(_df['caution'] == True)
    irt = _df.where(_df['good_irt'] == True)
    caution = caution.dropna()
    irt = irt.dropna()
    caution.to_csv(destination_path + "/" + name + "_caution_.csv")
    irt.to_csv(destination_path + "/" + name + "_high_performers_for_irt_.csv")
def create_CAL(project_path, processed_path=None, destination_path=None, pair_full=True, debug=True):
    """Combine all paired processed _f/_c files into one graded CAL data set.

    Writes <prefix>_FINAL_f.csv / _c.csv (or _INITIAL_ when the destination
    path contains 'initial') and returns (df, c_df); returns None when no
    pairs were found.
    """
    DATA_FILE = 0
    CONTROL_FILE = 1
    # NOTE(review): when processed_path is provided, 'path' is never assigned
    # and the calls below raise NameError — presumably an `else: path =
    # processed_path` branch is missing. Flagged, not changed.
    if processed_path == None:
        path = project_path + '/processed_data'
    data_files = hfh.get_all_file_names_in_folder(path, target_string='_f.csv')
    control_files = hfh.get_all_file_names_in_folder(path, target_string='_c.csv')
    df = pd.DataFrame([])
    control_dfs = []
    paired_files = hfh.pair_files(data_files, control_files, pair_full=pair_full)
    for pair in paired_files:
        print("CREATING CAL FROM PAIR: " + pair[0] + " " + pair[1])
        print(pair)
        control_path = pair[CONTROL_FILE]
        data_path = pair[DATA_FILE]
        f_df = hfh.get_df(data_path, header=None)
        c_df = hfh.get_df(control_path, header=None)
        control_dfs.append(c_df)
        f_df = get_strict_format_f_df(c_df, f_df)
        graded = strict_grade(f_df=f_df, c_df=c_df, operational=False, correct='1', incorrect='2')
        if graded is not False:
            a = graded.columns.duplicated()  # NOTE(review): computed but unused
            # stack each form's graded candidates under the shared item columns
            df = pd.concat([df, graded], axis=0)
            print("replacing")
    if len(paired_files) > 0:
        # synthesize a default control file for the combined item set
        c_df = pd.DataFrame([])
        c_df['AccNum'] = df.columns
        c_df['Key'] = '1'
        c_df['Options'] = '2'
        c_df['Domain'] = '1'
        c_df['Include'] = 'Y'
        c_df['Type'] = 'M'
        # NOTE(review): 'type' shadows the builtin; destination_path=None (the
        # default) would crash on .find here — confirm callers always pass it.
        type = "_FINAL_"
        if destination_path.find('initial') > 0:
            type = "_INITIAL_"
        name = destination_path + '/' + hfh.get_stem(pair[0])[:3] + type
        print("replacing 2.0")
        df = df.replace(2.0, '2')
        print("replacing 1.0")
        df = df.replace(1.0, '1')
        print("filling NA")
        df = df.fillna(value='-')
        print("replacing empty with X")
        df = df.replace(" ", "X")
        print("writing csvs")
        df.to_csv(name + 'f.csv', index=True, header=False)
        c_df.to_csv(name + 'c.csv', index=None, header=False)
        return df, c_df
    else:
        print("failed to pair files in create_CAL")
def process_response_string_file(f_path, bank_path=None, destination_path=None, write_csv=False, get_df=True, create_c=True, paired_bank_xlsx=None):
    """Detect the layout of a raw response-string file and convert it to the
    internal formatted (_f) and control (_c) representations.

    Dispatches on the is_type_* detectors (types A..K); types A/C/E/G also
    yield a control frame. When a bank path is supplied, sequence numbers in
    the control frame are replaced with bank AccNums. Optionally writes
    <name>_f.csv / <name>_c.csv to destination_path and returns the formatted
    frame when get_df is True.
    """
    if create_c:
        assert destination_path is not None, "process response string needs to know where to put the processed data"
    name = hfh.get_stem(f_path)
    lines = hfh.get_lines(f_path)
    assert len(lines) > 0, "asked to process empty file:" + f_path
    c_df = None
    f_df = None
    if is_type_K(lines):
        processed_lines = processK(lines)
        f_df = processed_lines
    elif is_type_A(lines):
        processed_lines = processA(lines)
        c_df = processed_lines[0]
        f_df = processed_lines[1]
    elif is_type_B(lines):
        processed_lines = processB(lines)
        f_df = processed_lines
    elif is_type_C(lines):
        processed_lines = processC(lines)
        c_df = processed_lines[0]
        f_df = processed_lines[1]
    elif is_type_D(lines):
        processed_lines = processD(lines)
        f_df = processed_lines
    elif is_type_E(lines):
        processed_lines = processE(lines)
        c_df = processed_lines[0]
        f_df = processed_lines[1]
    elif is_type_F(lines):
        processed_lines = processF(lines)
        f_df = processed_lines
    elif is_type_G(lines):
        processed_lines = processG(lines)
        c_df = processed_lines[0]
        f_df = processed_lines[1]
    elif is_type_H(lines):
        processed_lines = processH(lines)
        f_df = processed_lines
    elif is_type_I(lines):
        processed_lines = processI(lines)
        f_df = processed_lines
    elif is_type_J(lines):
        processed_lines = processJ(lines)
        f_df = processed_lines
    else:
        # no detector matched: assume the file is already in formatted shape
        print(f_path + " is already formatted")
        is_formatteed(lines)  # (sic) helper name as spelled in the project
        f_df = hfh.get_df(f_path)
    if c_df is not None and bank_path:
        # add AccNum instead of sequence
        b_df = create_c_df_from_bank(bank_path)
        b_df['Key'] = c_df['Key']
        c_df = b_df
    if c_df is None and bank_path is not None and create_c:
        #todo: consider respecting the correct answer at the time vs the bank or just destroy it
        bank_files = hfh.get_all_files(bank_path, extension='xlsx')
        pair = hfh.pair_files([f_path], bank_files)
        if len(pair) == 0:
            print("could not find matching bank file and no default control information present.")
        if len(pair) == 1:
            # todo: may evaluate differences between bank and response string if desired
            c_df = create_c_df_from_bank(pair[0][1])
        if len(pair) > 1:
            print("more than one file matched for bank", f_path)
    #confirm_id_as_index
    if 0 in f_df.columns or '0' in f_df.columns:
        f_df = f_df.set_index(f_df[0], drop=True)
        f_df = f_df.drop(columns=0)
    if write_csv:
        #todo changed index... need to make sure all processed items spit out the same... in this case they are pre-strict.
        f_df.to_csv(destination_path + '/' + name + '_f.csv', index=True, header=False)
        if c_df is not None:
            c_df.to_csv(destination_path + '/' + name + '_c.csv', index=None, header=False)
    if get_df:
        return f_df
def create_upload_from_processed(c_file, f_file, path=None, c_has_header=True, to_csv=False):
    """Build a classical item-statistics upload table from processed _c/_f files.

    For each item: key point-biserial (K_r), proportion correct (P), and for
    each option A-D the endorsement proportion (_p), option point-biserial
    (_r), and mean score of endorsers (_m), plus response count N.
    Returns the stats DataFrame (indexed by AccNum); optionally writes
    <stem>_P.csv when to_csv is True.
    """
    #todo: decide if _c files have headers or not...
    #todo: perhaps a different _x to indicate header or not... the only time I don't want a header is xCalibre...
    if c_has_header:
        c_df = hfh.get_df(c_file, header=0)
    else:
        c_df = hfh.get_df(c_file)
    f_df = hfh.get_df(f_file, index_col=0)
    stats_df = hfh.pd.DataFrame([])
    stats_df['AccNum'] = c_df.iloc[:, 0]
    graded_df = hr.grade_examination(f_df, c_df, grading_processed=True, correct=1, incorrect=0)
    # total score per candidate, used for all point-biserial correlations
    score = graded_df.sum(axis=1)
    pbis = graded_df[graded_df.columns[0]].corr(score)  # NOTE(review): overwritten below before use
    # one indicator frame per response option
    A = get_option_df(f_df, 'A')
    B = get_option_df(f_df, 'B')
    C = get_option_df(f_df, 'C')
    D = get_option_df(f_df, 'D')
    options = ['A', 'B', 'C', 'D']
    dfs = [A, B, C, D]
    counter = -1
    # N = number of non-missing responses per item
    N = ~f_df.isna()
    N = N.sum()
    N = N.reset_index(drop=True)
    for option in options:
        counter += 1
        a_ret = []
        b_ret = []
        c_ret = []
        df = dfs[counter]
        for column in A.columns:
            mask = df[column] == 1
            # mean total score of candidates who endorsed this option
            mean_score = graded_df[mask].mean().mean()
            c_ret.append(mean_score)
            pbis = df[column].corr(score)
            endorse_p = df[column].sum() / df.shape[0]
            a_ret.append(pbis)
            b_ret.append(endorse_p)
        stats_df[option + '_r'] = hfh.pd.Series(a_ret, index=stats_df.index)
        stats_df[option + '_p'] = hfh.pd.Series(b_ret, index=stats_df.index)
        stats_df[option + '_m'] = hfh.pd.Series(c_ret, index=stats_df.index)
    # keyed-response point-biserial per item
    k_ret = []
    for i in range(graded_df.shape[1]):
        pbis = graded_df[graded_df.columns[i]].corr(score)
        k_ret.append(pbis)
    stats_df['K_r'] = hfh.pd.Series(k_ret, index=stats_df.index)
    stats_df['KEY'] = c_df['Key']
    stats_df['N'] = N
    p = graded_df.mean(axis=0)
    stats_df = stats_df.set_index('AccNum', drop=True)
    stats_df['P'] = p
    if path is None:
        name = hfh.get_stem(f_file)[:-2] + '_P.csv'
    else:
        name = path + '/' + hfh.get_stem(f_file)[:-2] + '_P.csv'
    stats_df = stats_df[['KEY', 'K_r', 'P', 'A_p', 'A_r', 'A_m', 'B_p', 'B_r', 'B_m', 'C_p', 'C_r', 'C_m', 'D_p', 'D_r', 'D_m', 'N']]
    if to_csv:
        stats_df.to_csv(name)
    return stats_df
def get_residuals_from_item_person(irt_folder, report_name):
    '''
    # get df for factor analysis
    # irt_folder has reports, xCalibreOutput and processed_data
    # get theta from matrix
    # get items from Stats *not really needed for this analysis* only relevant if looking up aggregate
    # get difficulties from Stats
    # a different analysis would get difficulty from aggregate
    # general process thoughts:
    # I think I just check the raw score and then associate.
    # residual = observed - expected (0-1)
    # observed is 1 or 0 and comes from matrix file
    # expected is calculated from difficulty and theta
    # results in list of residuals which will be a row
    # steps
    # get matrix
    # for person
    #   calculate person score
    #   get theta for person
    #   check each item for residual needs item difficulty
    # write results to row

    NOTE(review): exploratory code — writes fa_test.csv / resid_test.csv /
    resid_test.xlsx to the working directory, prints debug output, and
    returns None on success (0 when the matrix file is missing).
    '''
    matrix_file = irt_folder + "/xCalibreOutput/" + report_name + ' Matrix.txt'
    if os.path.isfile(matrix_file):
        matrix_df = h.convert_xCalibre_matrix_for_PCI(matrix_file, include_id=True)
        matrix_df = matrix_df.apply(pd.to_numeric, errors='ignore')
        print("hello")  # NOTE(review): debug leftover
    else:
        print("matrix file does not exist")
        return 0
    # factor-analysis pass on the raw response matrix (ids dropped)
    test_df = matrix_df
    test_df = test_df.drop(columns=0)
    test_df.to_csv("fa_test.csv")
    r = s.run_factor_analysis(test_df)  # NOTE(review): result unused
    test_df['ID'] = matrix_df[0]
    matrix_df = matrix_df.drop(columns=0)
    matrix_df['SCORE'] = matrix_df.sum(axis=1)
    matrix_df['ID'] = test_df['ID']
    # prefer a previously cleaned stats file; otherwise clean the raw one now
    cleaned_stats = irt_folder + "/xCalibreOutput/" + report_name + ".cleaned_stats"
    if os.path.isfile(cleaned_stats):
        stats_df = hfh.get_df(cleaned_stats, header=0)
    else:
        unclean_path = irt_folder + '/' + "xCalibreOutput/" + report_name + " Stats.csv"
        stats_df = h.clean_stats_csv(unclean_path, get_df=True)
    stats_df = stats_df.apply(pd.to_numeric, errors='ignore')
    tif_path = irt_folder + "/xCalibreOutput/" + report_name + " TIF.csv"
    tif_df = hfh.get_df(tif_path, header=0)
    tif_df = tif_df.apply(pd.to_numeric, errors='ignore')
    # map each person's raw score to a theta via the TIF, then residualize
    matrix_df = set_theta_from_score_in_matrix_df(tif_df, matrix_df)
    residuals_df = s.get_residuals(matrix_df, stats_df)
    residuals_df = residuals_df.apply(pd.to_numeric, errors='ignore')
    residuals_df = residuals_df.reset_index()
    residuals_df = residuals_df.drop(columns='index')
    residuals_df.to_csv("resid_test.csv", index=True)
    residuals_df.to_excel("resid_test.xlsx", index=True)
    print(can_use_factor_analysis(residuals_df))
    again = s.run_factor_analysis(residuals_df)
    again_df = pd.DataFrame(again).head(5)  # NOTE(review): computed but never used/returned
def process_response_string_for_classical_upload(data_file, control_file):
    """Stub: load the data and control csvs for a classical upload.

    NOTE(review): unfinished — both frames are read but nothing is processed
    or returned yet.
    """
    data_df = hfh.get_df(data_file)
    control_df = hfh.get_df(control_file)