def convert_xCalibre_matrix_for_PCI(matrix_file, corresponding_control_file=False, id_length=8, include_id=False):
    #similar file in h_stats
    """Convert an xCalibre score-matrix text file into comma-separated CSVs for PCI.

    Writes three files as a side effect: ``<stem>__c.csv`` (one comma-separated
    row per data line), ``pickme.csv`` (an untransposed copy — presumably a
    debug artifact; TODO confirm it is still needed), and ``<stem>_T_c.csv``
    (the transposed matrix).

    :param matrix_file: path to the raw matrix text file; the first
        ``id_length`` characters of each line are treated as the examinee id.
    :param corresponding_control_file: optional control CSV whose first column
        supplies a header row of item ids.
    :param id_length: number of leading id characters on each data line.
    :param include_id: when True, the id prefix is kept at the start of each
        output row.
    :return: the matrix DataFrame, transposed twice (i.e. back to the
        original orientation) after the CSV round trip.
    """
    ret = []
    first_row = ""
    if corresponding_control_file:
        # Build the header row from column 0 of the control file.
        df = pd.read_csv(corresponding_control_file, header=None)
        item_ids = df.loc[:, 0]
        for id in item_ids:
            first_row += id + ","
        first_row = first_row[:-1]  # drop trailing comma
        ret = [first_row]
    lines = hfh.get_lines(matrix_file)
    for line in lines:
        ret_line = ""
        if include_id:
            ret_line = line[:id_length]
        answer_string = line[id_length:]
        for c in answer_string:
            ret_line += c + ','
        # NOTE(review): [:-3] drops the last three characters — presumably the
        # trailing ",\n," produced when the source line ends in a newline.
        # TODO confirm behavior for lines without a trailing newline.
        ret_line = ret_line[:-3]
        ret_line += '\n'
        ret.append(ret_line)
    name = hfh.get_stem(matrix_file) + "__c.csv"
    hfh.write_lines_to_text(ret, name)
    translated_name = hfh.get_stem(matrix_file) + "_T_c.csv"
    df = pd.read_csv(name, header=None)
    df.to_csv("pickme.csv")
    df = df.T
    df.to_csv(translated_name, index=False)
    return df.T
def process_paired_files(pe_file, data_file, project_folder, select_Domain=False):
    # todo: add in domain processing here so that sets without domain names are still separated
    """Build an Iteman control file (_c.csv) from a position/key export and
    convert the matching data file.

    :param pe_file: xlsx or delimited export with Position, AccNum,
        CorrectAnswer and Domain columns.
    :param data_file: raw response-string file handed to
        ``h.convert_default_data_to_iteman``.
    :param project_folder: project root; output goes to
        ``<project_folder>/processed_data/``.
    :param select_Domain: when given, items from other domains are marked
        include='N' instead of 'Y'.
    :return: 1 on completion.
    """
    if hfh.get_extension(pe_file) == 'xlsx':
        pe_df = hfh.get_df_from_xlsx(pe_file)
    else:
        pe_df = hfh.get_df(pe_file, header=0)
    pe_df.Position = pe_df.Position.astype(float)
    pe_df = pe_df.sort_values("Position", ascending=True)
    pe_df = pe_df[["AccNum", 'CorrectAnswer', 'Domain']]
    # Default control columns first...
    pe_df['number_of_options'] = 4
    pe_df['group'] = 1
    pe_df["include"] = 'Y'
    pe_df['type'] = 'M'
    # ...then apply the domain filter. BUG FIX: the original set include='N'
    # BEFORE assigning include='Y' to every row, so the filter was always
    # clobbered and every item stayed included.
    if select_Domain:
        pe_df.loc[pe_df.Domain != select_Domain, 'include'] = 'N'
    pe_df = pe_df.drop(['Domain'], axis=1)
    pe_df = pe_df[['AccNum', 'CorrectAnswer', 'number_of_options', 'group', 'include', 'type']]
    processed_path = project_folder + "/processed_data/"
    c_path = processed_path + hfh.get_stem(pe_file) + "_c.csv"
    pe_df.to_csv(c_path, header=False, index=False)
    h.convert_default_data_to_iteman(data_file, processed_path, new_name=hfh.get_stem(pe_file))
    return 1
def create_c_from_LXR_Test(file_path, destination_path=None):
    """Build an Iteman-style control file (_c.csv) from an LXR test export.

    Each numbered line (``<seq>. <subject> <id> [...]``) yields one control
    record; the answer key is read from the *following* line's ``Key: X``.

    Fixes over the original: removed the dead bare-expression statement
    ``entry[1]``; a missing key now yields an empty key and a meaningful
    warning instead of the leftover debug ``print("hello")`` and a stale
    ``key`` carried over from the previous item.

    :param file_path: path to the LXR export.
    :param destination_path: output folder; defaults to the file's parent.
    :return: the control DataFrame that was written.
    """
    if destination_path is None:
        destination_path = hfh.get_parent_folder(file_path)
    lines = hfh.get_lines(file_path)
    ret = []
    counter = 0
    for line in lines:
        counter += 1
        if line[0].isnumeric():
            entry = line.split()
            bank_id = entry[1] + '_' + entry[2]
            if len(entry) == 4:
                # four-token lines carry subject in two pieces plus the id
                subject = entry[1] + "_" + entry[2]
                bank_id = subject + entry[3]
            key = ''
            key_line = lines[counter]  # the line after the item line
            key_i = key_line.find('Key: ')
            if key_i > -1:
                key = key_line[key_i + len("Key: ")]  # single answer character
            else:
                print(file_path, "no 'Key:' line found for item", bank_id)
            record = [bank_id, key, '4', '1', 'Y', 'M']
            ret.append(record)
    df = pd.DataFrame(ret)
    name = hfh.get_stem(file_path) + "_c.csv"
    # df.sort_values(df[1])
    df.to_csv(destination_path + "/" + name, index=False, header=False)
    return df
def process_karen_data(path_to_data_files, path_to_test_bank_files, destination_path, removed_suffix="FullAdmin", suffix="Test"):
    # todo: this relies on the name Test for the bank information. It is assumed that the data file is the same
    # todo: as the test file with the exception of the suffix. i.e. XXX12Data and XXX12Test
    """Convert Karen-format admin data plus test-bank files into merged control files.

    Fixes over the original: the default ``removed_suffix`` was misspelled
    "FullAmdin" (same length as the correct spelling, so slicing behavior is
    unchanged), and a redundant first pass over the bank files — which passed
    ``True`` as ``destination_path`` and discarded every result — is removed.

    :param path_to_data_files: folder containing '*FullAdmin*' data files.
    :param path_to_test_bank_files: folder containing '*Test*' bank files.
    :param destination_path: folder that receives the converted _c/_L files.
    :param removed_suffix: suffix stripped from data-file stems to locate the
        matching bank file (only its length matters).
    :param suffix: suffix of the bank files.
    """
    data_files = hfh.get_all_file_names_in_folder(path_to_data_files, target_string='FullAdmin')
    test_bank_files = hfh.get_all_file_names_in_folder(path_to_test_bank_files, target_string='Test')
    test_bank_lists = []
    for file in test_bank_files:
        a = create_mapping_from_Karen_test_data(file, destination_path=destination_path, create_csv=True)
        test_bank_lists.append(a)
    for file in data_files:
        convert_iteman_format(file, destination_path=destination_path)
    for file in data_files:
        matching_control_name = destination_path + hfh.get_stem(file) + "_c.csv"
        # swap the data suffix for the bank suffix to find the paired _L file
        matching_item_bank_name = destination_path + hfh.get_stem(file)[:-len(removed_suffix)] + suffix + "_L.csv"
        merge_control_and_bank_info(matching_control_name, matching_item_bank_name)
def convert_2016_format(file_name, destination_path="", pretest_cutoff=False):
    # of form:
    # PT1 PT116MAR BB... correct answers
    # todo: rename this
    """Reformat a 2016-era data file into an _f.txt and, when valid, emit the
    matching default control file from the key on the first line.

    Returns True when the reformatted file passed validation, else False.
    """
    lines = hfh.get_lines(file_name)
    # the answer key is the last space-separated token on the first line
    key_start = lines[0].rfind(' ')
    answers = lines[0][key_start + 1:]
    cleaned = []
    for line in lines[2:]:
        # strip a trailing R/F status flag (char just before the newline)
        if line[len(line) - 2] in ('R', 'F'):
            line = line[:-5] + '\n'
        cleaned.append(line)
    stem = hfh.get_stem(file_name)
    out_name = hfh.get_stem(destination_path + "/" + stem) + "_f.txt"
    hfh.write_lines_to_text(cleaned, out_name)
    if is_valid_data(out_name):
        convert_answers_to_default_control(stem, answers, destination_path, pretest_cutoff)
        return True
    return False
def convert_default_data_to_iteman(file_name, processed_data_path, new_name=False):
    """Rewrite a default-format data file as an Iteman _f.txt, padding every
    line's id to the standard 8-character width.

    :param new_name: optional replacement stem for the output file name.
    """
    if processed_data_path[-1] == '/':
        processed_data_path = processed_data_path[:-1]
    formatted = [set_standard_id_length_for_line(line, 8) for line in hfh.get_lines(file_name)[2:]]
    stem_source = new_name if new_name else file_name
    out_path = processed_data_path + "/" + hfh.get_stem(stem_source) + "_f.txt"
    hfh.write_lines_to_text(formatted, out_path)
def create_reports(path_to_stats, destination_path, is_cs=False):
    # todo: make this smarter. Look through for any stats files in folder or subfolders and then munch on them.
    """Emit high/low/flagged/aggregate item reports for every stats file found.

    :param is_cs: when True, process pre-cleaned '_cs' CSVs directly instead of
        raw 'Stats' dumps.
    """
    destination_path += '/'
    target = "_cs" if is_cs else "Stats"
    files = get_all_file_names_in_folder(path_to_stats, target_string=target)
    for file in files:
        df = get_df(file, header=0) if is_cs else convert_stats_to_df(file, destination_path)
        base = destination_path + get_stem(file) + "_c"
        # parameters = get_parameters(df)
        get_top_n_rpbis(df, 20).to_csv(base + "_high_.csv", index=False)
        get_low_performing_items(df).to_csv(base + "_low_.csv", index=False)
        get_flagged_items(df).to_csv(base + "_flagged_.csv", index=False)
        get_aggregate_report_items(df).to_csv(base + "_agg.csv", index=False)
def create_FINAL_from_CULLED(project_path):
    """Produce *_FINAL_f.csv / *_FINAL_c.csv calibration files containing only
    the items that survived culling.

    Expects exactly one CULLED.csv under <project>/reports and exactly one
    CAL_f.csv / CAL_c.csv pair under <project>/calibration.

    Fixes over the original: removed the leftover debug ``print('hello')`` and
    added an explicit ``return True`` on success (was an implicit None).

    :return: True on success, False when the expected files are not found.
    """
    culled_path = hfh.get_all_files(project_path + '/reports', target_string='CULLED.csv')
    cal_f_path = hfh.get_all_files(project_path + '/calibration', target_string='CAL_f.csv')
    cal_c_path = hfh.get_all_files(project_path + '/calibration', target_string='CAL_c.csv')
    # all three must match exactly one file each
    if len(culled_path) * len(cal_f_path) * len(cal_c_path) == 1:
        f_df = hfh.get_df(cal_f_path[0])
        c_df = hfh.get_df(cal_c_path[0])
        culled_df = hfh.get_df(culled_path[0], header=0)
    else:
        print("invalid call CULLED, CAL_f and CAL_c")
        return False
    ids = f_df.iloc[:, 0]          # examinee ids, re-attached at the end
    f_df = f_df.drop(columns=[0])
    f_df = f_df.T
    f_df = f_df.reset_index(drop=True)
    c_df.columns = ['AccNum', 'B', 'C', 'D', 'E', 'F']
    # items are rows after the transpose, so AccNum can be used to filter
    f_df.insert(0, 'AccNum', c_df['AccNum'])
    c_df = c_df[c_df['AccNum'].isin(culled_df['AccNum'])]
    f_df = f_df[f_df['AccNum'].isin(culled_df['AccNum'])]
    f_df = f_df.drop(columns=['AccNum'])
    f_df = f_df.T
    f_df.insert(0, 'ID', ids)
    name = hfh.get_stem(cal_f_path[0])[:-6]  # strip the '_CAL_f' suffix
    path = project_path + '/calibration/' + name + '_FINAL_'
    f_df.to_csv(path + 'f.csv', header=False, index=False)
    c_df.to_csv(path + 'c.csv', header=False, index=False)
    return True
def convert_delimited_to_iteman(file, destination_path, delimiter=','):
    #verify is CSV
    """Convert a delimited response file into an Iteman _f.txt (id, space,
    concatenated responses with Y/M markers removed).

    Fixes over the original: ``non_answer_characters`` was never set to True
    and the warning condition was inverted, so the removal warning printed for
    every line whether or not anything was removed.

    :return: True when converted, False when the file is empty, None when the
        first line does not look delimited.
    """
    lines = hfh.get_lines(file)
    if len(lines) == 0:
        print(file, "is empty")
        return False
    if len(lines[0].split(delimiter)) > 1:
        #is CSV
        ret = []
        for line in lines:
            new_line = ""
            fields = line.split(delimiter)
            id_handled = False
            non_answer_characters = False
            for i in fields:
                if not id_handled:
                    i += ' '  # separate the id from the response string
                    id_handled = True
                if not i == 'Y' and not i == 'M':
                    new_line += i
                else:
                    non_answer_characters = True  # fix: flag the removal
            ret.append(new_line)
            if non_answer_characters:  # fix: warn only when something was removed
                print(file, "non answer character in data response string. It was removed.")
        name = hfh.create_name(hfh.get_stem(file), destination_path, 'txt', '_f')
        hfh.write_lines_to_text(ret, name)
        return True
def get_theta_from_passing_score(passing_score, path_to_tif):
    """Return (tif stem, theta) for the TIF row whose raw passing score is
    nearest to ``passing_score``; theta is truncated to two decimals."""
    tif = pd.read_csv(path_to_tif)
    score_col = "TRF as Number Correct"
    theta_col = "Theta"
    nearest = tif.iloc[(tif[score_col] - passing_score).abs().argsort()[:1]]
    theta = nearest[theta_col].values[0].astype(float)
    return get_stem(path_to_tif), int(theta * 100) / 100
def create_form_from_c(c_file, destination_path):
    """Derive a form file (<stem>_L.csv with Sequence, AccNum columns) from a
    control file's first column."""
    control = hfh.get_df(c_file)
    form = hfh.pd.DataFrame({
        'Sequence': control.index.values + 1,  # 1-based position
        'AccNum': control[0],
    })
    out_name = hfh.get_stem(c_file)[:-2] + '_L.csv'  # drop the '_c' suffix
    form.to_csv(destination_path + '/' + out_name, header=True, index=False)
def create_forms_from_bank(project_path, operational=True, create_bank_L=False):
    """Generate form files (Position, AccNum) from every bank xlsx under
    <project>/bank_files, writing to <project>/forms/full or /operational.

    NOTE(review): several hazards in this routine, left as-is:
    - ``cut = name[i:]`` keeps the underscore itself, so ``int(cut)`` can
      never succeed — presumably ``name[i + 1:]`` was intended; confirm.
    - ``form`` is only bound on some branches; a bank file with neither a
      matching BANK name nor UseCode/Position columns would raise NameError
      at ``form.to_csv``.
    """
    bank_files = hfh.get_all_files(project_path + '/bank_files')
    print("creating forms")
    for file in bank_files:
        if not create_bank_L and file.find("BANK") > 0:
            # skip whole-bank files unless a bank _L was requested
            pass
        else:
            b_df = pd.read_excel(file, header=0)
            cut = ""
            if create_bank_L and file.find('BANK') > -1:
                # check if passing is present
                i = file.find('_')
                if i > -1:
                    name = hfh.get_stem(file)
                    cut = name[i:]
                    try:
                        cut = int(cut)
                    except ValueError:
                        print(file, "contains and underscore but does not provide a cut")
                # bank form: positions are just 1..N in bank order
                form = pd.DataFrame([])
                form['AccNum'] = b_df['AccNum']
                form_length = len(form) + 1
                form.insert(0, 'Position', range(1, form_length))
            elif operational:
                if 'UseCode' in b_df.columns:
                    # keep only operational items
                    form = b_df[b_df['UseCode'] == 'Operational']
                    form = form[['Position', 'AccNum']]
                elif 'Position' in b_df.columns:
                    form = b_df[['Position', 'AccNum']]
            name = hfh.get_stem(file)
            suffix = '_LF'
            prefix = 'full'
            if operational:
                suffix = '_LO' + str(cut)
                prefix = 'operational'
            form.to_csv(project_path + '/forms/' + prefix + '/' + name + suffix + '.csv', index=False)
def create_mapping_from_Karen_test_data(file_path, destination_path="", create_csv=False, add_underscore=False):
    """Parse a Karen test file into a mapping DataFrame
    (form, test_id, subject, bank_id_number, bank_id).

    Assumes files are of format::

        number. Name NN

    where only the target lines start with a number.

    Fixes over the original: when the file is empty the function now returns
    None instead of raising UnboundLocalError on ``df`` (and no CSV is
    attempted).

    :param create_csv: when True, also writes <stem[:8]>_L.csv to
        ``destination_path``.
    :return: the mapping DataFrame, or None when the file had no lines.
    """
    lines = hfh.get_lines(file_path)
    df = None
    if lines:
        ret = []
        for line in lines:
            if line[0].isnumeric():
                entry = line.split()
                test_name = hfh.get_stem(file_path)
                test_id = line[:line.index('.')]
                subject = entry[1]
                bank_id = entry[2]
                underscore = "_" if add_underscore else ""
                ret.append([test_name, test_id, subject, bank_id,
                            subject + underscore + str(bank_id)])
        df = pd.DataFrame(ret)
        df.columns = ['form', 'test_id', 'subject', 'bank_id_number', 'bank_id']
    else:
        print(file_path + "does not contain lines.")
    if create_csv and df is not None:
        name = hfh.get_stem(file_path)[:8]
        #df.sort_values(df[1])
        file_name = destination_path + "/" + name + "_L.csv"
        df.to_csv(file_name, index=False, header=0)
    return df
def process_raw_data(master_folder, raw_data):
    """Back up raw form/data files into <master_folder>/data, then process each
    matched (form, data) pair."""
    form_files = hfh.get_all_files(raw_data, extension="csv")
    data_files = hfh.get_all_files(raw_data, extension='txt')
    if len(form_files) != len(data_files):
        # counts differ: assume the forms were delivered as xlsx instead of csv
        form_files = hfh.get_all_file_names_in_folder(raw_data, extension="xlsx")
    backup_groups = [(form_files, '_raw_backup_form.'), (data_files, '_raw_backup_data.')]
    for files, tag in backup_groups:
        for file in files:
            backup_name = hfh.get_stem(file) + tag + hfh.get_extension(file)
            hfh.copy_file_and_write_to_destination(file, master_folder + "/data", modified_name=backup_name)
    paired_files = pair_files(form_files, data_files)
    if paired_files:
        for pe_file, data_file in paired_files:
            process_paired_files(pe_file, data_file, master_folder)
def convert_first_line_answers_to_default_control_and_data(
        file_name, comma_delimited=False, id_length=8, id_spaces=3):
    #todo: handle cutoff for not included
    """Split a file whose first line is the answer key into a default control
    file (<stem>_c.csv) and a fixed-width data file (<stem>_f.txt).

    Ids shorter than ``id_length`` are left-padded with underscores.

    NOTE(review): in the comma-delimited branch the data rows start at
    lines[1], in the other branch at lines[2] — confirm both offsets match the
    corresponding source formats.
    """
    lines = hfh.get_lines(file_name)
    correct = lines[0]
    new = []
    counter = 0
    if comma_delimited:
        correct = correct.replace(',', '')
    # one control record per answer character
    for a in correct:
        counter += 1
        if not a == '\n':
            include = 'y'
            new.append(str(counter) + "," + a + ",4,1," + include + ",M\n")
    name = hfh.get_stem(file_name)
    hfh.write_lines_to_text(new, name + "_c.csv")
    # contains a random F at the end will test to see if it matters
    formatted = []
    if comma_delimited:
        for line in lines[1:]:
            split_line = line.split(',')
            id = split_line[0]
            new_id = ""
            if len(id) < id_length:
                # pad short ids with leading underscores
                short = id_length - len(id)
                for i in range(short):
                    new_id += "_"
            new_id += id
            # drop the id and the trailing two chars, strip commas
            response_string = line[len(id):-2].replace(',', '') + '\n'
            ret_line = new_id + " " + response_string
            formatted.append(ret_line)
    else:
        for line in lines[2:]:
            # todo: could be problematic
            id_end = line.find(',')
            id = line[:id_end]
            new_id = ""
            characters_short = id_length - len(id)
            for c in range(characters_short):
                new_id += "_"
            new_id += id
            for i in range(id_spaces):
                new_id += " "
            new_line = new_id + line
            formatted.append(new_line)  #no clue why : is here perhaps I will remove it.
    hfh.write_lines_to_text(formatted, name + "_f.txt")
def get_theta_from_passing_percent_correct(percent_to_pass, path_to_tif, tif_df=None, return_values=True):
    """Return (tif stem, theta) for the TIF row whose TRF percent is nearest
    to ``percent_to_pass``; theta is truncated to two decimals.

    Returns None when ``return_values`` is False.
    """
    tif = pd.read_csv(path_to_tif) if tif_df is None else tif_df
    score_col = "TRF"
    theta_col = "Theta"
    nearest = tif.iloc[(tif[score_col] - percent_to_pass).abs().argsort()[:1]]
    theta = nearest[theta_col].values[0].astype(float)
    if return_values:
        return get_stem(path_to_tif), int(theta * 100) / 100
def fix_format_of_data_file(file, destination):
    """Normalize a data file into _f.txt form, trying known converters in
    order (already valid -> iteman -> 2016). Returns the truthy conversion
    result, or False when nothing worked."""
    valid = is_valid_data(file)
    if valid:
        # already in the right shape: just copy it under the _f name
        name = hfh.get_stem(file) + "_f.txt"
        hfh.copy_file_and_write_to_destination(file, destination, modified_name=name)
        return valid
    valid = convert_iteman_format(file, destination)
    if not valid:
        valid = convert_2016_format(file, destination)
    if not valid:
        print("could not convert " + file)
    return valid
def convert_stats_to_df(path_to_stats, destination_path):
    """Extract the item-statistics table from a stats dump into a DataFrame,
    also writing the slice to <destination><stem>_cs.csv.

    Fixes over the original: the bare ``except: pass`` followed by
    ``return df`` raised UnboundLocalError whenever extraction failed; the
    function now reports the error and returns None instead.

    :return: the extracted DataFrame, or None on failure.
    """
    df = None
    first_line = get_first_line_of_stats(path_to_stats)
    try:
        blank_line = get_next_blank_line_after_index(path_to_stats, first_line - 1)
        table_lines = get_lines_from_X_to_Y_from_file(path_to_stats, first_line - 1, blank_line)
        name = destination_path + get_stem(path_to_stats) + "_cs.csv"
        write_lines_to_text(table_lines, name)
        df = get_df(name, header=0, dtype="string")
    except Exception as e:  # narrowed from bare except; don't swallow silently
        print("convert_stats_to_df failed for", path_to_stats, ":", e)
    return df
def convert_answers_to_default_control(file_name, answers, destination_path, cutoff_for_pretest=175):
    """Write a default control file (<stem>_c.csv) from an answer-key string.

    Positions past ``cutoff_for_pretest`` are marked include='n' (pretest
    items); pass a falsy cutoff to include everything.
    """
    rows = []
    position = 0
    for answer in answers:
        position += 1
        if answer == '\n':
            continue
        include = 'y'
        if position > cutoff_for_pretest and cutoff_for_pretest:
            include = 'n'
        rows.append(str(position) + "," + answer + ",4,1," + include + ",M\n")
    base = destination_path + "/" + hfh.get_stem(file_name)
    hfh.write_lines_to_text(rows, base + "_c.csv")
def find_month(file_name):
    """Find the month named in a file stem.

    Returns the full month name when present, the 3-letter abbreviation when
    only that matched, and False when no month — or more than one — was found.

    Fixes over the original: returning with an empty match list raised
    IndexError on ``ret[0]``; zero matches now returns False.
    """
    months = ["JANUARY", "FEBRUARY", "MARCH", "APRIL", "MAY", "JUNE", "JULY",
              "AUGUST", "SEPTEMBER", "OCTOBER", "NOVEMBER", "DECEMBER"]
    matches = []
    stem = hfh.get_stem(file_name).upper()
    for month in months:
        if stem.find(month) > -1:
            matches.append(month)
        abbreviation = month[:3]
        # only fall back to the abbreviation while nothing has matched yet
        if len(matches) == 0 and stem.find(abbreviation) > -1:
            matches.append(abbreviation)
    if len(matches) == 1:
        return matches[0]
    return False  # none found, or ambiguous (multiple months)
def get_percent_from_theta(theta, path_to_tif=None, tif_df=None, return_values=True, return_df=False):
    """Return (tif stem, TRF percent) for the TIF row whose theta is nearest
    to ``theta``; the percent is truncated to two decimals.

    NOTE(review): ``return_df`` is currently unused — confirm intent.
    """
    if path_to_tif is None and tif_df is None:
        print("get_percent_from_theta was not passed a tif path or df.")
        return
    tif = pd.read_csv(path_to_tif) if tif_df is None else tif_df
    score_col = "TRF"
    theta_col = "Theta"
    nearest = tif.iloc[(tif[theta_col].astype(float) - theta).abs().argsort()[:1]]
    percent = nearest[score_col].values[0]
    if return_values:
        return hfh.get_stem(path_to_tif), int(percent * 100) / 100
def process_LXR_key(key_file, get_c_df_AS_0=False, get_L_df_AS_1=False, destination_path_c=None, destination_path_L=None):
    """Parse an LXR key file into a control DataFrame and a form DataFrame.

    Lines containing '.' contribute item ids; lines containing ':' contribute
    answer keys. Returns (c_df, L_df) when either get_* flag is set, otherwise
    writes the requested CSVs.
    """
    name = hfh.get_stem(key_file)
    ids = []
    keys = []
    for line in hfh.get_lines(key_file):
        id_parts = line.split('.')
        if len(id_parts) > 1:
            ids.append(id_parts[1].strip().replace(' ', '_'))
            continue
        key_parts = line.split(':')
        if len(key_parts) > 1:
            keys.append(key_parts[1].strip())
    df = hfh.pd.DataFrame([ids, keys]).T
    # default control columns: options, group, include, type, then sequence
    df[2] = 4
    df[3] = 1
    df[4] = 'Y'
    df[5] = 'M'
    df[6] = df.index.values + 1
    c_df = df[[0, 1, 2, 3, 4, 5]]
    L_df = df[[6, 0]]
    if get_c_df_AS_0 or get_L_df_AS_1:
        return c_df, L_df
    if destination_path_c:
        c_df.to_csv(destination_path_c + '/' + name + '_c.csv', index=False, header=False)
    if destination_path_L:
        L_df.to_csv(destination_path_L + '/' + name + '_L.csv', index=False, header=False)
def clean_stats_csv(path, create_csv=True, get_df=False):
    """Extract the item table (starting at the row whose first cell is
    'Sequence', ending at the first blank line) from a stats CSV, drop the
    '4 SD' column, and optionally write/return it.

    Fixes over the original: the outer ``while`` loop never terminated when no
    'Sequence' row existed; a missing header now reports and returns None.

    :return: the cleaned DataFrame when ``get_df`` is True, else None.
    """
    lines = hfh.get_lines(path)
    #assumes report starts with Sequence
    found_header = False
    ret_lines = []
    for line in lines:
        split_line = line.split(',')
        if not found_header and split_line[0] == 'Sequence':
            found_header = True
        if found_header:
            if line == '\n':
                break  # blank line terminates the table
            ret_lines.append(split_line)
    if not found_header:
        print(path, "contains no 'Sequence' header row; nothing to clean")
        return None
    df = pd.DataFrame(ret_lines[1:])
    df.columns = ret_lines[0][:-1]  # drop the trailing (newline) header cell
    df = df.drop(columns='4 SD')
    if create_csv:
        new_path = hfh.get_parent_folder(path)
        name = new_path + "/" + hfh.get_stem(path)[:-6] + ".cleaned_stats"
        df.to_csv(name)
    if get_df:
        return df


#set_standard_id_length_in_data_files("PT_IRT/PT_processed_data", 8)
#convert_xCalibre_matrix_for_PCI("PT_data/score_matrices/PT1_18_m.txt")
#process_karen_data("LCLE_IRT","LCLE_IRT","LCLE_IRT/processed_data/")
#merge_control_and_bank_info(a,b)
#path_to_files = "data_files"
#convertOldFormatToNew("LCLE_IRT/LCLEApr2019FullAdmin.txt")
#processNewFileFormat("LCLE_IRT/lcea1_18c.csv","LCLE_IRT/lcea1_18.txt")
#convert_first_line_answers_to_default_control_and_data(path_to_files+"/pt1_16_n.txt")
#create_control_files(path_to_files)
#update_control_files_with_item_bank_key("data_files/item_map.csv", "data_files")
#convert_2016_format("data_files/pt3_16.txt")
def is_valid_name(path, harsh=False):
    """Validate that a data-file name follows the <tag><year><month> naming
    convention (e.g. CCCYYMON) with a txt/csv extension.

    :param harsh: when True, require the stem to be exactly tag+year+month;
        otherwise any stem containing a recognizable month and year passes.
    :return: the stem when valid, False otherwise.

    NOTE(review): the inline "currently unreachable" comment and the actual
    control flow disagree — the indentation below pairs the ``else`` with
    ``if harsh`` (so non-harsh names with month+year are accepted), which
    makes the AB-form check reachable in harsh mode. Confirm against the
    original intent.
    """
    acceptable_extensions = ['txt', 'csv']
    ext = hfh.get_extension(path)
    if ext not in acceptable_extensions:
        return False
    name = hfh.get_stem(path)
    month = hfh.find_month(name)
    year = hfh.find_year(name)
    if month and year:
        i_year = name.find(year)
        tag = name[:i_year]
        full_name = tag + year + month
        if harsh:
            if full_name == name:
                return name
        else:
            return name
        # currently unreachable consider removing or incorporating
        if full_name == name[:-2]:
            print("AB form detected")
            return name
    return False
def create_key_df_from_csv(file_path):
    # assumes format
    # Position,Domain,AccNum,UseCode,CorrectAnswer,Content Area,...
    # 1,02,LLE669,Operational,C,0202,44,,56,0.18,,,Bank,InUse,1,,,101,AUG19(P),,,0,1,0,No,,1, ,5/13/2019,
    """Build a mapping DataFrame (form, test_id, subject, bank_id_number,
    bank_id) from a bank CSV's Position and AccNum columns."""
    source = hfh.get_df(file_path, header=0)
    form = hfh.get_stem(file_path)
    records = []
    for test_id, bank_id in zip(source['Position'].tolist(), source['AccNum'].tolist()):
        # AccNum is a 3-char subject prefix followed by the numeric part
        records.append([form, test_id, bank_id[:3], bank_id[3:], bank_id])
    mapping = pd.DataFrame(records)
    mapping.columns = ['form', 'test_id', 'subject', 'bank_id_number', 'bank_id']
    # name = hfh.create_name("silly","LPCC_IRT/keys/L_files","csv",'_L')
    #df.to_csv(name, index=None)
    return mapping
def process_response_strings_for_IRT(path_to_raw_data, processed=None, bank=None, verbose=False, get_f_df=False):
    #todo edited while tired confirm it works later
    """Locate the project folders around a raw-data file and hand it to
    process_response_string_file.

    NOTE(review): the name entered interactively when validation fails is
    never used downstream (processing still uses ``path``), and the
    ``verbose`` / ``get_f_df`` parameters are unused — confirm intent.
    """
    path = path_to_raw_data
    if path is not False:
        lines = hfh.get_lines(path)
        r = path.find('raw_data')
        #assumes that raw_data exists in IRT model
        name = path
        if r > -1:
            # derive the sibling processed_data/bank_files folders
            project_directory = path[:r]
            name = project_directory + "/processed_data/" + hfh.get_stem(path)
            processed = project_directory + '/processed_data/'
            bank = project_directory + '/bank_files/'
        valid = is_valid_name(path)
        while valid is False:
            print(path + " is a raw data name which does not conform to convention of CCCYYMON.")
            name = input("enter an appropriate name here")
            valid = is_valid_name(name)
        if lines is False:
            print("Error in determine response string.\n Path request error in path " + path)
        else:
            process_response_string_file(path, bank, write_csv=True, destination_path=processed)
def update_control_files_with_item_bank_key(path_to_item_bank_csv, path_to_control):
    # todo: this is only good for the pt format... general application should be more robust.
    """Replace sequence ids in every control CSV under ``path_to_control`` with
    the bank AccNums mapped in the item-bank CSV, rewriting each file in place.

    Fix over the original: the helper was called as
    ``hfh.get_all_file_names_in_foler`` (misspelled); every other call site in
    this module uses ``get_all_file_names_in_folder``.
    """
    control_file_paths = hfh.get_all_file_names_in_folder(path_to_control, "csv")
    item_bank_df = pd.DataFrame(pd.read_csv(path_to_item_bank_csv))
    for file_path in control_file_paths:
        if not file_path == path_to_item_bank_csv:  # skip the bank file itself
            control_df = pd.DataFrame(pd.read_csv(file_path, header=None))
            name = hfh.get_stem(file_path)
            # PT naming: test number at index 2, 2-digit year at 4:6
            n = name[2]
            y = name[4:6]
            test_form = "20" + y + "_" + n
            form_relevant = item_bank_df.loc[item_bank_df['testFORM'] == test_form]
            testSeq = form_relevant["testSeq"]
            for i in testSeq:
                if control_df[0].__contains__(i):
                    value = form_relevant[form_relevant["testSeq"] == i]["AccNum"]
                    control_df[0] = np.where(control_df[0] == i, value, control_df[0])
            control_df.to_csv(file_path, index=False, header=0)
def create_upload_from_processed(c_file, f_file, path=None, c_has_header=True, to_csv=False):
    #todo: decide if _c files have headers or not...
    #todo: perhaps a different _x to indicate header or not... the only time I don't want a header is xCalibre...
    """Compute a per-item statistics table (key, point-biserials, endorsement
    proportions, option mean scores, N, P) from a processed control/_f pair.

    :param path: output folder for the <stem>_P.csv; defaults to the cwd name.
    :param to_csv: when True, also writes the table to disk.
    :return: the statistics DataFrame indexed by AccNum.
    """
    if c_has_header:
        c_df = hfh.get_df(c_file, header=0)
    else:
        c_df = hfh.get_df(c_file)
    f_df = hfh.get_df(f_file, index_col=0)
    stats_df = hfh.pd.DataFrame([])
    stats_df['AccNum'] = c_df.iloc[:, 0]
    graded_df = hr.grade_examination(f_df, c_df, grading_processed=True, correct=1, incorrect=0)
    score = graded_df.sum(axis=1)  # total score per examinee
    # NOTE(review): this assignment is dead — pbis is recomputed below.
    pbis = graded_df[graded_df.columns[0]].corr(score)
    # per-option indicator matrices (1 where that option was chosen)
    A = get_option_df(f_df, 'A')
    B = get_option_df(f_df, 'B')
    C = get_option_df(f_df, 'C')
    D = get_option_df(f_df, 'D')
    options = ['A', 'B', 'C', 'D']
    dfs = [A, B, C, D]
    counter = -1
    # N = count of non-missing responses per item
    N = ~f_df.isna()
    N = N.sum()
    N = N.reset_index(drop=True)
    for option in options:
        counter += 1
        a_ret = []  # option point-biserial per item
        b_ret = []  # option endorsement proportion per item
        c_ret = []  # mean total score of endorsers per item
        df = dfs[counter]
        for column in A.columns:
            mask = df[column] == 1
            mean_score = graded_df[mask].mean().mean()
            c_ret.append(mean_score)
            pbis = df[column].corr(score)
            endorse_p = df[column].sum() / df.shape[0]
            a_ret.append(pbis)
            b_ret.append(endorse_p)
        stats_df[option + '_r'] = hfh.pd.Series(a_ret, index=stats_df.index)
        stats_df[option + '_p'] = hfh.pd.Series(b_ret, index=stats_df.index)
        stats_df[option + '_m'] = hfh.pd.Series(c_ret, index=stats_df.index)
    # keyed-response point-biserial per item
    k_ret = []
    for i in range(graded_df.shape[1]):
        pbis = graded_df[graded_df.columns[i]].corr(score)
        k_ret.append(pbis)
    stats_df['K_r'] = hfh.pd.Series(k_ret, index=stats_df.index)
    stats_df['KEY'] = c_df['Key']
    stats_df['N'] = N
    p = graded_df.mean(axis=0)  # classical difficulty per item
    stats_df = stats_df.set_index('AccNum', drop=True)
    stats_df['P'] = p
    if path is None:
        name = hfh.get_stem(f_file)[:-2] + '_P.csv'
    else:
        name = path + '/' + hfh.get_stem(f_file)[:-2] + '_P.csv'
    stats_df = stats_df[[
        'KEY', 'K_r', 'P', 'A_p', 'A_r', 'A_m', 'B_p', 'B_r', 'B_m',
        'C_p', 'C_r', 'C_m', 'D_p', 'D_r', 'D_m', 'N'
    ]]
    if to_csv:
        stats_df.to_csv(name)
    return stats_df
def process_response_string_file(f_path, bank_path=None, destination_path=None, write_csv=False, get_df=True, create_c=True, paired_bank_xlsx=None):
    """Detect the raw format of a response-string file (types A..K) and run the
    matching converter; optionally derive a control df from a paired bank file
    and write the results as _f.csv / _c.csv.

    Converters A, C, E and G also yield a control df; the rest yield only the
    response df. NOTE(review): ``paired_bank_xlsx`` is unused — confirm intent.
    """
    if create_c:
        assert destination_path is not None, "process response string needs to know where to put the processed data"
    name = hfh.get_stem(f_path)
    lines = hfh.get_lines(f_path)
    assert len(lines) > 0, "asked to process empty file:" + f_path
    c_df = None
    f_df = None
    # Probe the known raw formats in order until one matches.
    if is_type_K(lines):
        processed_lines = processK(lines)
        f_df = processed_lines
    elif is_type_A(lines):
        processed_lines = processA(lines)
        c_df = processed_lines[0]
        f_df = processed_lines[1]
    elif is_type_B(lines):
        processed_lines = processB(lines)
        f_df = processed_lines
    elif is_type_C(lines):
        processed_lines = processC(lines)
        c_df = processed_lines[0]
        f_df = processed_lines[1]
    elif is_type_D(lines):
        processed_lines = processD(lines)
        f_df = processed_lines
    elif is_type_E(lines):
        processed_lines = processE(lines)
        c_df = processed_lines[0]
        f_df = processed_lines[1]
    elif is_type_F(lines):
        processed_lines = processF(lines)
        f_df = processed_lines
    elif is_type_G(lines):
        processed_lines = processG(lines)
        c_df = processed_lines[0]
        f_df = processed_lines[1]
    elif is_type_H(lines):
        processed_lines = processH(lines)
        f_df = processed_lines
    elif is_type_I(lines):
        processed_lines = processI(lines)
        f_df = processed_lines
    elif is_type_J(lines):
        processed_lines = processJ(lines)
        f_df = processed_lines
    else:
        # no raw format matched; assume the file is already formatted
        print(f_path + " is already formatted")
        is_formatteed(lines)  # NOTE(review): name looks misspelled — confirm the helper exists
        f_df = hfh.get_df(f_path)
    if c_df is not None and bank_path:
        # add AccNum instead of sequence
        b_df = create_c_df_from_bank(bank_path)
        b_df['Key'] = c_df['Key']
        c_df = b_df
    if c_df is None and bank_path is not None and create_c:
        #todo: consider respecting the correct answer at the time vs the bank or just destroy it
        bank_files = hfh.get_all_files(bank_path, extension='xlsx')
        pair = hfh.pair_files([f_path], bank_files)
        if len(pair) == 0:
            print("could not find matching bank file and no default control information present.")
        if len(pair) == 1:
            # todo: may evaluate differences between bank and response string if desired
            c_df = create_c_df_from_bank(pair[0][1])
        if len(pair) > 1:
            print("more than one file matched for bank", f_path)
    #confirm_id_as_index
    if 0 in f_df.columns or '0' in f_df.columns:
        f_df = f_df.set_index(f_df[0], drop=True)
        f_df = f_df.drop(columns=0)
    if write_csv:
        #todo changed index... need to make sure all processed items spit out the same... in this case they are pre-strict.
        f_df.to_csv(destination_path + '/' + name + '_f.csv', index=True, header=False)
        if c_df is not None:
            c_df.to_csv(destination_path + '/' + name + '_c.csv', index=None, header=False)
    if get_df:
        return f_df
def create_CAL(project_path, processed_path=None, destination_path=None, pair_full=True, debug=True):
    """Grade every paired (_f.csv, _c.csv) file under the processed-data folder
    and concatenate the results into a single calibration matrix plus a
    synthetic control file, written as <prefix>_FINAL_/_INITIAL_ f/c.csv.

    NOTE(review): several hazards, left as-is:
    - when ``processed_path`` is supplied, ``path`` is never assigned and the
      next line raises NameError — confirm the intended behavior;
    - ``destination_path`` defaults to None but ``.find('initial')`` is called
      on it unconditionally;
    - ``type`` shadows the builtin, ``debug`` and ``a`` are unused.
    """
    DATA_FILE = 0
    CONTROL_FILE = 1
    if processed_path == None:
        path = project_path + '/processed_data'
    data_files = hfh.get_all_file_names_in_folder(path, target_string='_f.csv')
    control_files = hfh.get_all_file_names_in_folder(path, target_string='_c.csv')
    df = pd.DataFrame([])
    control_dfs = []
    paired_files = hfh.pair_files(data_files, control_files, pair_full=pair_full)
    for pair in paired_files:
        print("CREATING CAL FROM PAIR: " + pair[0] + " " + pair[1])
        print(pair)
        control_path = pair[CONTROL_FILE]
        data_path = pair[DATA_FILE]
        f_df = hfh.get_df(data_path, header=None)
        c_df = hfh.get_df(control_path, header=None)
        control_dfs.append(c_df)
        f_df = get_strict_format_f_df(c_df, f_df)
        graded = strict_grade(f_df=f_df, c_df=c_df, operational=False, correct='1', incorrect='2')
        if graded is not False:
            a = graded.columns.duplicated()
            # stack this administration's graded rows under the running matrix
            df = pd.concat([df, graded], axis=0)
            print("replacing")
    if len(paired_files) > 0:
        # synthesize a control file: one row per item column, all defaults
        c_df = pd.DataFrame([])
        c_df['AccNum'] = df.columns
        c_df['Key'] = '1'
        c_df['Options'] = '2'
        c_df['Domain'] = '1'
        c_df['Include'] = 'Y'
        c_df['Type'] = 'M'
        type = "_FINAL_"
        if destination_path.find('initial') > 0:
            type = "_INITIAL_"
        name = destination_path + '/' + hfh.get_stem(pair[0])[:3] + type
        print("replacing 2.0")
        df = df.replace(2.0, '2')
        print("replacing 1.0")
        df = df.replace(1.0, '1')
        print("filling NA")
        df = df.fillna(value='-')
        print("replacing empty with X")
        df = df.replace(" ", "X")
        print("writing csvs")
        df.to_csv(name + 'f.csv', index=True, header=False)
        c_df.to_csv(name + 'c.csv', index=None, header=False)
        return df, c_df
    else:
        print("failed to pair files in create_CAL")