def check_types(filename, config_file):
    """Check every cell of a CSV file against its column's regex.

    The CSV is loaded with ``read_csv`` and the per-column patterns with
    ``regex_from_config_file``.  Columns without a pattern are reported once
    (with the first value seen) and otherwise skipped.  Every cell that does
    not match its column's pattern is printed together with its full row.

    Args:
        filename: Path of the CSV file to validate.
        config_file: Path of the config file describing the column regexes.

    Raises:
        AssertionError: If at least one cell fails its column's pattern.
    """
    headers, rows = read_csv(filename=filename)
    regex_checks = regex_from_config_file(config_file)
    mismatched_count = 0
    unchecked_columns = set()

    for row in rows:
        for i, column in enumerate(row):
            name = headers[i]
            if name not in regex_checks:
                # Report each unchecked column only the first time it is seen.
                if name not in unchecked_columns:
                    unchecked_columns.add(name)
                    print(" ---> No type check for {}: {}".format(
                        name, column))
                continue
            if not regex_checks[name].match(column):
                print(" ---> Mismatch on {}: {} {}".format(
                    name, column, row))
                mismatched_count += 1

    print(" ---> Mismatched count: {}".format(mismatched_count))
    # Raise explicitly instead of using ``assert``: assert statements are
    # removed under ``python -O``, which would make the check silently pass.
    if mismatched_count != 0:
        raise AssertionError(
            "{} value(s) failed their column type check".format(
                mismatched_count))
def importFromCSV(csv_filename, database_filename):
    """Used to convert from old .csv format to database.

    Reads ``csv_filename`` with ``file_util.read_csv``, skips its first row,
    and turns every sufficiently recent, full-width row into one
    ``(name, time, count)`` tuple per non-date column.  The tuples are
    inserted into the ``server_status`` table of ``database_filename`` and the
    resulting column view is printed.

    Args:
        csv_filename: Path of the legacy CSV file.
        database_filename: Path of the database to insert into.
    """
    import file_util

    data = list(file_util.read_csv(csv_filename))

    # The header row stored in the file is ignored; these fixed series names
    # are used instead.
    header = [
        'date',
        'Weather At Home European region (hadam3p_eu) Tasks ready to send',
        'Weather At Home Pacific North West region (hadam3p_pnw) Tasks ready to send',
        'Weather At Home Australia New Zealand region (hadam3p_anz) Tasks ready to send',
        'hadcm3n Tasks ready to send',
        # Alternative label, disabled in the original:
        # 'RAPIT project (hadcm3n) Tasks ready to send',
        'hadam3p (Global model only) with MOSES II land scheme Tasks ready to send',
        'Total Tasks ready to send',
        'Tasks in progress',
    ]

    # Hard-coded cutoff; presumably a Unix timestamp in seconds minus one
    # week -- TODO confirm the unit of the 'date' column.
    cutoff = 1406206802 - 7 * 24 * 60 * 60

    new_entries = []
    for row in data[1:]:
        if len(row) == len(header) and int(row[0]) > cutoff:
            # One (name, time, count) entry per non-date column.
            new_entries.extend(
                (name.strip(), row[0], count)
                for name, count in zip(header[1:], row[1:]))
        elif len(row) == 1:
            continue  # ignore
        else:
            # Wrong width, or older than the cutoff: report and skip.
            # Single-argument print(...) emits the same text on Python 2 and 3.
            print("{} {}".format(len(row), row))

    database = Database(database_filename, 'server_status')
    database.insert(new_entries)

    view_header, view_rows = database.select_column_view()
    print(view_header)
    for entry in view_rows:
        print(entry)
for line in output_csv: completed = False for i, item in enumerate(line): if headers[i] == 'completed' and item == "True": total_completed += 1 completed = True for i, item in enumerate(line): if headers[i] == 'course_id': course_student_counts[item] += 1 course_completion_rates[item]['completed' if completed else 'attempted'] += 1 completion_rate = total_completed/len(output_csv) print(" ---> Completion rate overall: %-7s %s%%" % (len(output_csv), round(completion_rate*100, 2))) for course_id, rates in course_completion_rates.items(): student_count = course_student_counts[course_id] completion_rate = rates['completed']/(rates['completed']+rates['attempted']) # print(" ---> Completion rate for %-30s: %-7s %s%%" % (course_id, student_count, round(completion_rate*100, 2))) return output_csv if __name__ == '__main__': if len(sys.argv) < 4: print('Usage: python k_suppress.py infile.csv configfile outfile.csv k') sys.exit(1) headers, rows = read_csv(filename=sys.argv[1]) delete_columns, qi_columns = columns_from_config_file(sys.argv[2]) out_filename = sys.argv[3] k = int(sys.argv[4]) k_suppress(headers, rows, delete_columns, qi_columns, out_filename, k)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on 2019-02-18

@author Susan

Get the number of unique values in each column of a dataset.
"""
import sys

from file_util import read_csv, columns_from_config_file
from deidentifier_util import count_column_uniques

if __name__ == '__main__':
    if len(sys.argv) < 3:
        print('Usage: python count_column_uniques.py file config_file')
        sys.exit(1)
    config = sys.argv[2]
    headers, rows = read_csv(sys.argv[1])
    # NOTE(review): the parsed config values are not used below; the call is
    # kept so the script still reads the config file exactly as before.
    deleted, qi_columns = columns_from_config_file(config)
    unique_values = count_column_uniques(rows, headers)
    for col in headers:
        # Columns with more than 10 distinct values print only the count;
        # smaller ones print the collected values themselves.
        if len(unique_values[col]) > 10:
            print(col, len(unique_values[col]))
            continue
        print(col, unique_values[col])