def create_FINAL_from_CULLED(project_path):
    """Write the ``_FINAL_f.csv`` / ``_FINAL_c.csv`` pair limited to culled items.

    Looks for exactly one ``CULLED.csv`` under ``<project_path>/reports`` and
    exactly one ``CAL_f.csv`` and ``CAL_c.csv`` under
    ``<project_path>/calibration``. Items whose ``AccNum`` is not listed in
    the culled file are removed from both calibration frames, and the results
    are written next to the calibration files.

    Returns ``False`` (after printing a message) when the three inputs are not
    each found exactly once; otherwise returns ``None``.
    """
    calibration_dir = project_path + '/calibration'
    culled_matches = hfh.get_all_files(project_path + '/reports', target_string='CULLED.csv')
    f_matches = hfh.get_all_files(calibration_dir, target_string='CAL_f.csv')
    c_matches = hfh.get_all_files(calibration_dir, target_string='CAL_c.csv')

    # The product of the three counts equals 1 only when every search
    # returned exactly one file.
    if len(culled_matches) * len(f_matches) * len(c_matches) != 1:
        print("invalid call CULLED, CAL_f and CAL_c")
        return False

    responses = hfh.get_df(f_matches[0])
    control = hfh.get_df(c_matches[0])
    culled = hfh.get_df(culled_matches[0], header=0)

    # Column 0 of the response frame is set aside as the ID column and
    # re-attached at the end; the rest is transposed so rows can be filtered.
    examinee_ids = responses.iloc[:, 0]
    by_item = responses.drop(columns=[0]).T
    by_item = by_item.reset_index().drop(columns='index')

    control.columns = ['AccNum', 'B', 'C', 'D', 'E', 'F']
    by_item.insert(0, 'AccNum', control['AccNum'])

    kept_items = culled['AccNum']
    control = control[control['AccNum'].isin(kept_items)]
    by_item = by_item[by_item['AccNum'].isin(kept_items)]

    final_f = by_item.drop(columns=['AccNum']).T
    final_f.insert(0, 'ID', examinee_ids)

    # Drops the last six characters of the stem (presumably the "_CAL_f"
    # suffix -- TODO confirm against hfh.get_stem).
    stem = hfh.get_stem(f_matches[0])[:-6]
    out_prefix = project_path + '/calibration/' + stem + '_FINAL_'
    final_f.to_csv(out_prefix + 'f.csv', header=False, index=False)
    control.to_csv(out_prefix + 'c.csv', header=False, index=False)
    # NOTE(review): looks like leftover debug output; kept so console output
    # is unchanged.
    print('hello')
def get_confirm_on_pairing(raw_data, ui = True):
    """Ask the user to confirm the number of raw files found in *raw_data*.

    Counts the ``csv``, ``xlsx`` and ``txt`` files reported by
    ``hfh.get_all_files`` and, when *ui* is true, shows those counts in a
    yes/no dialog titled "Matching".

    Returns ``1`` when the user confirms (or when *ui* is false, in which case
    nothing is asked) and ``0`` otherwise.
    """
    pe_files = hfh.get_all_files(raw_data, extension='csv')
    data_files = hfh.get_all_files(raw_data, extension='txt')
    xlsx_files = hfh.get_all_files(raw_data, extension='xlsx')

    confirm = True
    if ui:
        # The excel clause previously contained a stray literal "+"
        # ("3 + excel pro exam files"), which leaked into the dialog text.
        confirm_message = (
            "We found " + str(len(pe_files)) + " csv pro exam files, "
            + str(len(xlsx_files)) + " excel pro exam files, and "
            + str(len(data_files)) + " data files.\n"
        )
        confirm_message += "Is this the correct number for each?"
        confirm = u.get_yes_no_response("Matching", confirm_message)

    return 1 if confirm else 0
def create_reports(self, remove_rtf = True):
    """Validate the current inputs and build the master report files.

    Parameters:
        remove_rtf: when true (the default), every ``.rtf`` file found in the
            project's ``xCalibreOutput`` folder is deleted after the master
            file has been created. Previously this flag was accepted but
            ignored and the files were always deleted.

    Returns the result of ``h2p.create_master_file`` when validation passes,
    otherwise ``0``. The log is displayed and cleared in both cases.
    """
    print("report call")
    self.log.clear()
    # Disable the report button and schedule it to be re-enabled after 200 ms.
    self.b_report.config(state=tk.DISABLED)
    self.gui.after(200, lambda: self.b_report.config(state=tk.NORMAL))

    valid = self.pd_validate_inputs(xCalibre_required=True)
    parent_path = self.report_path.get()
    master_name = self.report_name.get()
    master_path = parent_path + "/" + master_name
    xCalibre_path = master_path + "/xCalibreOutput"

    if not valid:
        self.log.append("Invalid function call.")
        self.d_log()
        self.log.clear()
        return 0

    report_log = str("report creation successful \nreports:" + master_path + "/reports")
    self.log.append(report_log)
    results = h2p.create_master_file(master_folder=master_path)

    if remove_rtf:
        for rtf_file in hfh.get_all_files(xCalibre_path, extension='rtf'):
            os.remove(rtf_file)

    self.d_log()
    self.log.clear()
    return results
def update_c_from_bank(project_path):
    """Overwrite the first column of each ``_c.csv`` with the bank's AccNum.

    Pairs every ``_c.csv`` under ``<project_path>/processed_data`` with an
    ``.xlsx`` under ``<project_path>/bank_files`` (via ``hfh.pair_files``),
    replaces the control file's first column with the bank's ``AccNum``
    column, and rewrites the control file in place without header or index.
    """
    # assumes that updated _c files have position instead of accNum
    control_columns = ['AccNum', 'Key', 'Options', 'Domain', 'Include', 'Type']

    control_files = hfh.get_all_files(project_path + '/processed_data', target_string='_c.csv')
    bank_files = hfh.get_all_files(project_path + '/bank_files', target_string='.xlsx')

    for matched in hfh.pair_files(control_files, bank_files):
        control_file, bank_file = matched[0], matched[1]
        control = hfh.get_df(control_file)
        bank = pd.read_excel(bank_file)

        control.columns = list(control_columns)
        control['AccNum'] = bank['AccNum']
        control = control[list(control_columns)]
        control.to_csv(control_file, index=None, header=None)
def process_raw_data(master_folder, raw_data):
    """Back up the raw form/data files and process each matched pair.

    Form files are the ``csv`` files in *raw_data*; if their count differs
    from the number of ``txt`` data files, the ``xlsx`` files are used as the
    form files instead. Every form and data file is copied into
    ``<master_folder>/data`` under a ``_raw_backup_form`` /
    ``_raw_backup_data`` name, then the files are paired with ``pair_files``
    and each pair is handed to ``process_paired_files``.
    """
    form_files = hfh.get_all_files(raw_data, extension="csv")
    data_files = hfh.get_all_files(raw_data, extension='txt')
    if len(form_files) != len(data_files):
        # assume that we are using xlsx files
        form_files = hfh.get_all_file_names_in_folder(raw_data, extension="xlsx")

    backup_dir = master_folder + "/data"
    for label, group in (('form', form_files), ('data', data_files)):
        for source in group:
            backup_name = (
                hfh.get_stem(source) + '_raw_backup_' + label + '.' + hfh.get_extension(source)
            )
            hfh.copy_file_and_write_to_destination(source, backup_dir, modified_name=backup_name)

    paired_files = pair_files(form_files, data_files)
    if not paired_files:
        return
    for matched in paired_files:
        process_paired_files(matched[0], matched[1], master_folder)
def create_forms_from_bank(project_path, operational=True, create_bank_L=False):
    """Create Position/AccNum form csv files from the project's bank files.

    For every file under ``<project_path>/bank_files``:

    * files whose path contains ``BANK`` (past index 0) are skipped unless
      *create_bank_L* is true;
    * with *create_bank_L*, a ``BANK`` file yields a form listing every
      ``AccNum`` with positions ``1..n``; an integer after the last
      underscore of the file stem is taken as the cut;
    * otherwise, when *operational* is true, the form is the
      ``Position``/``AccNum`` columns, limited to rows whose ``UseCode`` is
      ``'Operational'`` when that column exists.

    The form is written to ``<project_path>/forms/<prefix>/<stem><suffix>.csv``
    where prefix/suffix are ``operational``/``_LO<cut>`` when *operational*
    is true and ``full``/``_LF`` otherwise.

    Fixes relative to the previous version:

    * the underscore was searched in the full path but used to slice the
      stem, and the slice kept the leading ``_``, so the cut never parsed;
    * a file for which no form could be built reused the previous file's
      form (or raised ``NameError`` on the first file); it is now skipped
      with a message.
    """
    bank_files = hfh.get_all_files(project_path + '/bank_files')
    print("creating forms")
    for file in bank_files:
        if not create_bank_L and file.find("BANK") > 0:
            continue

        b_df = pd.read_excel(file, header=0)
        name = hfh.get_stem(file)
        cut = ""
        form = None  # reset per file so a previous file's form is never reused

        if create_bank_L and file.find('BANK') > -1:
            # check if passing is present
            if '_' in name:
                cut_text = name.rpartition('_')[2]
                try:
                    cut = int(cut_text)
                except ValueError:
                    # cut stays "" so the output suffix carries no junk text
                    print(file, "contains and underscore but does not provide a cut")
            form = pd.DataFrame([])
            form['AccNum'] = b_df['AccNum']
            form_length = len(form) + 1
            form.insert(0, 'Position', range(1, form_length))
        elif operational:
            if 'UseCode' in b_df.columns:
                form = b_df[b_df['UseCode'] == 'Operational']
                form = form[['Position', 'AccNum']]
            elif 'Position' in b_df.columns:
                form = b_df[['Position', 'AccNum']]

        if form is None:
            print(file, "could not be converted to a form and was skipped")
            continue

        suffix = '_LF'
        prefix = 'full'
        if operational:
            suffix = '_LO' + str(cut)
            prefix = 'operational'
        form.to_csv(project_path + '/forms/' + prefix + '/' + name + suffix + '.csv', index=False)
def remove_accNum_from_f_and_c(accNum, name, program_path, reason=None):
    """Remove one item from a form's ``_f``/``_c`` files and record the removal.

    Loads ``<name>_f.csv`` and ``<name>_c.csv`` from
    ``<program_path>/processed_data``, converts them with
    ``get_strict_format_f_df``, drops *accNum* from both, re-grades the
    result purely as a validation step, and writes both files back. A line
    describing the removal (with *reason*, when given) is then added to the
    removed-items report in the reports folder, creating it if it does not
    exist yet.
    """
    processed_dir = program_path + '/processed_data'

    # create backup_processed_data folder
    hfh.create_dir(program_path + '/' + E.BACKUP_PROCESSED_DATA_P)

    responses = hfh.get_single_file(
        processed_dir, target_string=name + '_f.csv', as_df=True, strict=True
    )
    control_file = hfh.get_single_file(processed_dir, target_string=name + '_c.csv', strict=True)
    control = hfh.get_df(control_file, header=get_header_argument(control_file))

    strict_frames = get_strict_format_f_df(control, responses, get_c_df=True)

    # The response frame is transposed so the item can be dropped by label,
    # then transposed back.
    responses = strict_frames[1].T.drop(accNum).T
    control = strict_frames[0].set_index(['AccNum']).drop(accNum).reset_index(drop=False)

    strict_grade(control, responses, operational=False)  # solely for validation

    responses.to_csv(processed_dir + '/' + name + '_f.csv', header=None, index=True)
    control.to_csv(processed_dir + '/' + name + '_c.csv', index=False, header=None)

    # create notation of removal with reason
    reports_dir = program_path + "/" + E.REPORTS_P + '/'
    existing_reports = hfh.get_all_files(reports_dir, target_string=E.REMOVED_ITEMS_R)

    entry = accNum + " was removed from " + name
    if reason is not None:
        entry += " because of a " + reason

    if len(existing_reports) > 0:
        hfh.add_lines_to_csv(existing_reports[0], [entry])
    else:
        hfh.write_lines_to_text([entry + '\n'], reports_dir + E.REMOVED_ITEMS_R)
def create_master_file(report_name = False, xCalibre_output_path = False, reports_path = False, master_folder = False):
    """Build the master report from the Stats files in an xCalibreOutput folder.

    Parameters:
        report_name: title of the report; the user is prompted when missing.
        xCalibre_output_path: folder containing the xCalibre Stats files; the
            user is prompted when missing.
        reports_path: folder to write reports into; the user is prompted when
            missing.
        master_folder: project folder. When given it overrides the other three
            arguments: ``<master_folder>/reports``,
            ``<master_folder>/xCalibreOutput`` and the folder's own name are
            used.

    Returns an ``R`` built from the function id and the list of
    ``[constant, message]`` entries collected along the way (a single
    ``R.VALID`` entry when nothing went wrong).

    Fixes relative to the previous version:

    * the prompted report title was discarded instead of assigned;
    * Stats files were searched in ``xCalibre_output_path`` even when the
      folder had been chosen interactively (leaving that argument ``False``);
    * the invalid-report-path message printed the xCalibre path, and both
      "invalid path" messages failed on a non-string selection;
    * folder-name checks and the Stats search are skipped when no folder was
      selected.
    """
    function_id = "h_2p_report_analysis|create_master_file"
    r_entries = []  # list of [constant, message] pairs

    if master_folder:
        # confirm master folder does not end in /
        if master_folder[-1] == '/':
            master_folder = master_folder[:-1]
        reports_path = master_folder + "/reports"
        xCalibre_output_path = master_folder + "/xCalibreOutput"
        search_name = master_folder[:-1]
        report_name = master_folder[search_name.rfind('/') + 1:]

    if not report_name:
        report_name = u.get_string("What would you like to title the report?")

    path = xCalibre_output_path
    if not xCalibre_output_path:
        path = u.get_folder("Locate the xCalibreOutput reports folder", required=False)
        if not path:
            r_entries.append([R.PATH_INVALID, "Invalid xCalibrePath:" + str(path)])
        elif not hfh.get_parent_folder(path) == 'xCalibreOutput':
            r_entries.append([R.WRONG_FOLDER, " must select xCalibreOutput folder"])

    if not reports_path:
        reports_path = u.get_folder("Choose a folder for generated reports.", required=False)
        if not reports_path:
            r_entries.append([R.PATH_INVALID, "Invalid report path:" + str(reports_path)])
        elif hfh.get_parent_folder(reports_path) != "xCalibreOutput":
            r_entries.append([R.WRONG_FOLDER, "Not xCalibreOutput"])

    if path and reports_path and os.path.isdir(reports_path):
        stats_files = hfh.get_all_files(path, "Stats")
        if len(stats_files) == 0:
            r_entries.append([R.NO_STATS_FILES, "xCalibreOutput reports:" + path])
        else:
            process_stats_files(stats_files, reports_path, report_name)

    if len(r_entries) == 0:
        r_entries.append([R.VALID, "create_master_file executed"])
    return R(function_id, r_entries)
def has_acceptable_correct_percentage(xCalibre_report_path, id_length=8, debug=True):
    """Check that every Matrix file has at least a 50% correct rate.

    For each file returned by ``hfh.get_all_files(xCalibre_report_path,
    "Matrix")`` every line is sliced as ``line[id_length:-1]`` and each
    remaining character is counted as one response; characters that parse as
    an integer add their value to the correct count, others add nothing.

    Parameters:
        xCalibre_report_path: folder searched for Matrix files.
        id_length: number of leading characters on each line to skip.
        debug: when true, print each file with its percentage.

    Returns ``False`` as soon as a file scores below 50% or contains no
    response characters at all (previously the latter raised
    ``ZeroDivisionError``); ``True`` otherwise.
    """
    files = hfh.get_all_files(xCalibre_report_path, "Matrix")
    for file in files:
        total = 0
        correct = 0
        for line in hfh.get_lines(file):
            for x in line[id_length:-1]:
                total += 1
                try:
                    correct += int(x)
                except ValueError:
                    # non-numeric response: counts toward the total only
                    pass

        if total == 0:
            print(file, "has no scored responses.")
            return False

        percent_correct = round(correct / total * 100, 4)
        if debug:
            print(file, percent_correct)
        if percent_correct < 50:
            print(file, "has low correct rate.")
            return False
    return True
def process_response_string_file(f_path, bank_path=None, destination_path=None, write_csv=False, get_df=True, create_c=True, paired_bank_xlsx=None):
    """Convert one raw response-string file into response/control frames.

    The file's lines are tested against the ``is_type_*`` detectors in the
    order K, A, B, C, D, E, F, G, H, I, J and handed to the first matching
    ``process*`` function. Types A, C, E and G return a ``(c_df, f_df)``
    pair; the others return only the response frame. A file matching no type
    is reported as already formatted and loaded with ``hfh.get_df``.

    Parameters:
        f_path: path of the response-string file.
        bank_path: optional bank location used to build or replace the
            control frame.
        destination_path: folder for the written csv files; required whenever
            *create_c* is true.
        write_csv: write ``<stem>_f.csv`` (and ``<stem>_c.csv`` when a control
            frame exists) into *destination_path*.
        get_df: return the response frame.
        create_c: look for a paired bank file when the raw file carried no
            control information.
        paired_bank_xlsx: accepted but not used by this function.

    Returns the response frame when *get_df* is true, otherwise ``None``.

    Raises ``AssertionError`` when *create_c* is true without a
    *destination_path*, or when the file has no lines.

    Fix relative to the previous version: the id column is now looked up
    under whichever label (``0`` or ``'0'``) is actually present; previously
    a frame with only the string label ``'0'`` raised ``KeyError``.
    """
    if create_c:
        assert destination_path is not None, "process response string needs to know where to put the processed data"
    name = hfh.get_stem(f_path)
    lines = hfh.get_lines(f_path)
    assert len(lines) > 0, "asked to process empty file:" + f_path

    c_df = None
    f_df = None

    # (detector, processor, processor returns a (c_df, f_df) pair)
    handlers = (
        (is_type_K, processK, False),
        (is_type_A, processA, True),
        (is_type_B, processB, False),
        (is_type_C, processC, True),
        (is_type_D, processD, False),
        (is_type_E, processE, True),
        (is_type_F, processF, False),
        (is_type_G, processG, True),
        (is_type_H, processH, False),
        (is_type_I, processI, False),
        (is_type_J, processJ, False),
    )
    for detect, process, returns_pair in handlers:
        if detect(lines):
            processed_lines = process(lines)
            if returns_pair:
                c_df = processed_lines[0]
                f_df = processed_lines[1]
            else:
                f_df = processed_lines
            break
    else:
        print(f_path + " is already formatted")
        is_formatteed(lines)
        f_df = hfh.get_df(f_path)

    if c_df is not None and bank_path:
        # add AccNum instead of sequence
        b_df = create_c_df_from_bank(bank_path)
        b_df['Key'] = c_df['Key']
        c_df = b_df

    if c_df is None and bank_path is not None and create_c:
        # todo: consider respecting the correct answer at the time vs the bank or just destroy it
        bank_files = hfh.get_all_files(bank_path, extension='xlsx')
        pair = hfh.pair_files([f_path], bank_files)
        if len(pair) == 0:
            print("could not find matching bank file and no default control information present.")
        if len(pair) == 1:
            # todo: may evaluate differences between bank and response string if desired
            c_df = create_c_df_from_bank(pair[0][1])
        if len(pair) > 1:
            print("more than one file matched for bank", f_path)

    # confirm_id_as_index
    id_column = next((label for label in (0, '0') if label in f_df.columns), None)
    if id_column is not None:
        f_df = f_df.set_index(f_df[id_column], drop=True)
        f_df = f_df.drop(columns=id_column)

    if write_csv:
        # todo changed index... need to make sure all processed items spit out the same... in this case they are pre-strict.
        f_df.to_csv(destination_path + '/' + name + '_f.csv', index=True, header=False)
        if c_df is not None:
            c_df.to_csv(destination_path + '/' + name + '_c.csv', index=None, header=False)

    if get_df:
        return f_df
def validate_raw_files(project_path):
    """Print how many bank files pair up with raw response-string files.

    Bank files are read from ``<project_path>/bank_files`` and response
    strings from ``<project_path>/raw_data``; the two lists are matched with
    ``hfh.pair_files`` and the number of pairs is printed.

    Fix relative to the previous version: the raw data folder was built as
    ``project_path + 'raw_data'`` without the path separator, so a sibling
    path such as ``.../LEXraw_data`` was searched instead of the project's
    own ``raw_data`` folder.
    """
    bank_files = hfh.get_all_files(project_path + '/bank_files')
    response_strings = hfh.get_all_files(project_path + '/raw_data')
    pairs = hfh.pair_files(bank_files, response_strings)
    print("valid pairs of raw data = " + str(len(pairs)))
def pd_validate_inputs(self, raw_required = False, xCalibre_required = False):
    """Validate the paths and files selected in the UI before a run.

    Every problem found is appended to ``self.log``; the log is displayed
    with ``self.d_log()`` before returning.

    Parameters:
        raw_required: also validate the raw data folder (matching counts of
            data/form files, readability, required pro exam columns) and ask
            the user to confirm the pairing.
        xCalibre_required: also require an ``xCalibreOutput`` folder that
            contains Stats files.

    Returns ``1`` when everything checked out, ``0`` otherwise.

    Fixes relative to the previous version:

    * ``self.report_name.get == ""`` compared the bound method itself with a
      string and was therefore never true; the getter is now called, so an
      empty report name makes the parent path itself the project folder;
    * the form file list is checked for ``None`` before ``len()`` is taken.
    """
    parent_path = self.report_path.get()
    raw_path = self.raw_data_path.get()
    valid = 1
    master_folder = parent_path
    if not os.path.isdir(parent_path):
        self.log.append("Path invalid:" + master_folder)
        valid = 0

    # assume that parent path is a project folder e.g. ...Desktop/LEX
    if self.report_name.get() == "":
        report_path = parent_path + "/reports"
        xCalibre_path = parent_path + "/xCalibreOutput"
        master_name = master_folder[master_folder.rfind('/') + 1:]
    else:
        master_name = self.report_name.get()
        master_folder = parent_path + "/" + master_name
        report_path = master_folder + "/reports"
        xCalibre_path = master_folder + "/xCalibreOutput"

    if raw_required and raw_path == "":
        self.log.append("raw path required")
        valid = 0

    if raw_required and valid:
        data_files = hfh.get_all_files(raw_path, extension='txt')
        form_files = hfh.get_all_files(raw_path, extension='csv')
        if form_files is not None and len(form_files) == 0:
            # assume just xlsx files
            form_files = hfh.get_all_file_names_in_folder(raw_path, extension='xlsx')

        if data_files is None or form_files is None:
            valid = 0
            if data_files is None:
                self.log.append("data files are missing")
            if form_files is None:
                self.log.append("form files are missing")

        if valid:
            if len(data_files) == 0 or len(form_files) == 0:
                valid = 0
                self.log.append("Raw data does not contain both txt and csv files.")
                self.log.append("Raw Path:" + raw_path)

            if not len(data_files) == len(form_files):
                valid = 0
                self.log.append("There are unequal data and form files in raw data.")
                self.log.append("Found " + str(len(data_files)) + " data files and " + str(len(form_files)) + " form files.")
                self.log.append("data:\n" + "".join(f + '\n' for f in data_files))
                self.log.append("form:\n" + "".join(f + '\n' for f in form_files))
                self.log.append("Raw Path:" + raw_path)

            for file in data_files:
                if not hfh.file_is_readable(file):
                    valid = 0
                    self.log.append("read access denied for data file:" + file)

            required_columns = ["Domain", "AccNum", "CorrectAnswer"]
            for file in form_files:
                if not hfh.file_is_readable(file):
                    valid = 0
                    self.log.append("read access denied for form file:" + file)
                if valid:
                    # Only inspect columns while everything so far is valid.
                    if hfh.get_extension(file) == 'csv':
                        test_df = hfh.get_df(file, header=0)
                    else:
                        test_df = hfh.get_df_from_xlsx(file)
                    for column in required_columns:
                        if column not in test_df.columns:
                            self.log.append("______")
                            self.log.append(file)
                            self.log.append("pro exam file does not contain " + column + ".\nReset form and then download from Proexam.")
                            self.log.append("______")
                            valid = 0

    if valid:
        master_report_path = master_folder + "/" + 'reports'
        if not os.path.isdir(master_report_path):
            self.log.append("Path does not contain reports folder. \nPath:" + master_folder)
            valid = 0

    if valid:
        if not os.path.isdir(xCalibre_path) and xCalibre_required:
            self.log.append("Path does not contain xCalibreOutput folder. \nPath:" + master_folder)
            valid = 0

    if valid:
        if os.path.isfile(master_name):
            self.log.append("Folder name is a file. It should be a directory, i.e. no extension.")
            valid = 0

    if valid and raw_required:
        if not hr.get_confirm_on_pairing(raw_path):
            valid = 0
            self.log.append("User said pairing was wrong.")

    if valid and xCalibre_required:
        stats_files = hfh.get_all_file_names_in_folder(xCalibre_path, target_string="Stats")
        if len(stats_files) == 0:
            self.log.append("Report path does not contain xCalibreOutput reports")
            valid = 0

    # check that can write reports
    aggregate_name = report_path + "/" + master_name + "_aggregate_.csv"
    complete_name = report_path + "/" + master_name + "_complete_.csv"
    for report_file in (aggregate_name, complete_name):
        if os.path.isfile(report_file):
            if not hfh.file_is_writable(report_file):
                valid = 0
                self.log.append("No access to " + report_file)

    if valid:
        self.log.append("validated call")
    self.d_log()
    return valid