def _drop_trailing_blanks(values):
    """Pop empty strings off the end of the list `values` in place; return it."""
    while values and values[-1] == '':
        values.pop()
    return values


def read_csv_file(filename, required_fieldnames=None, varlen=False):
    """
    Read a CSV file and return its data rows as a list of dicts.

    The first row of the file supplies the field names. Every field name
    and every data value is passed through ids.clean_id, and trailing
    empty entries are dropped from the header and from each data row.

    Args:
        filename: path of the CSV file to read.
        required_fieldnames: optional iterable of field names that must all
            appear in the header (compared after ids.clean_id). If given,
            field names in the file beyond these trigger a warning; their
            values are nevertheless still present in the returned dicts.
        varlen: if False, each row is truncated (with a warning) or padded
            with "" so that it has exactly one value per field name.
            If True, rows are variable-length: the last field name maps to
            a tuple holding all values from that column onward (possibly
            empty), and rows that are too short to reach the column just
            before the last one are skipped.

    Returns:
        A list with one dict per retained data row, mapping field name to
        value. An empty file yields an empty list (unless
        required_fieldnames makes the missing header an error).

    Raises:
        Exception: if the header contains a duplicate field name, or if a
            required field name is missing from the header.
    """
    # newline='' lets the csv module do its own line-ending handling, so
    # quoted fields that contain line breaks are parsed correctly.
    with open(filename, newline='') as file:
        rows = list(csv.reader(file))

    # An empty file has no header row. Treat it as "no field names, no data"
    # instead of failing with an IndexError; the required-fieldname check
    # below still reports the problem when specific fields are expected.
    if rows:
        header, data_rows = rows[0], rows[1:]
    else:
        header, data_rows = [], []

    # gather, clean, and trim field names, eliminating trailing blanks
    fieldnames = _drop_trailing_blanks([ids.clean_id(name) for name in header])
    if len(set(fieldnames)) != len(fieldnames):
        raise Exception("Duplicate field name:" + str(fieldnames))

    # data rows
    row_dicts = []
    for row in data_rows:
        row = _drop_trailing_blanks(
            ["" if item is None else ids.clean_id(item) for item in row])

        if varlen:
            if len(row) < len(fieldnames) - 1:
                # Warn only for rows that actually hold data: a row that is
                # empty after trimming is just a blank line and is skipped
                # quietly. (Testing ">= 0" here would always be true and
                # would warn on every blank line as well.)
                if row:
                    warnings.warn("Ignoring too-short row:"+str(row))
                continue
            row_dict = dict(zip(fieldnames, row))
            # The last field collects everything from its column onward.
            row_dict[fieldnames[-1]] = tuple(row[len(fieldnames)-1:])
        else:
            if len(row) > len(fieldnames):
                warnings.warn("Ignoring extra values in row:"+str(row))
                row = row[:len(fieldnames)]
            # Pad short rows so every field name gets a value.
            row.extend([""] * (len(fieldnames) - len(row)))
            row_dict = dict(zip(fieldnames, row))

        row_dicts.append(row_dict)

    if required_fieldnames is not None:
        # check that all required fieldnames are present
        required_fieldnames = [ids.clean_id(name) for name in required_fieldnames]
        missing_fieldnames = set(required_fieldnames).difference(fieldnames)
        if missing_fieldnames:
            raise Exception("File {} has fieldnames {}, while {} are required. Missing {}."
                            .format(filename, fieldnames, required_fieldnames, missing_fieldnames))
        # check to see if extra fieldnames present; warn user if so
        extra_fieldnames = set(fieldnames).difference(required_fieldnames)
        if extra_fieldnames:
            warnings.warn("File {} has extra fieldnames (ignored): {}"
                          .format(filename, extra_fieldnames))

    return row_dicts
def test_clean_id():
    """
    Check ids.clean_id on a few whitespace cases: leading and trailing
    whitespace is removed, and internal whitespace becomes a single space.
    """
    # (raw input, expected cleaned id)
    cases = (
        (" ab", "ab"),
        ("ab ", "ab"),
        (" ab cd ", "ab cd"),
        ("\t ab\n cd\n", "ab cd"),
    )
    for raw, expected in cases:
        assert ids.clean_id(raw) == expected