def parse(self, filename):
    """Load the workbook *filename* and register its domains and variables.

    Every sheet except "Legend-Key" is treated as an observation class and
    kept in ``self.observationclasses`` under the sheet name.  Inside a
    sheet, a row whose first cell starts with "SDTM v3.1" is the header row:
    its cells from column index 4 onwards are registered with
    ``add_domain``.  For every other row with a non-empty second cell, a
    variable is added to each domain registered for a column, unless that
    column's cell reads "not used" (case-insensitive).

    A progress line per sheet and a final per-domain summary are printed.

    Raises:
        SystemExit: with status 1 when the workbook cannot be loaded.
    """
    try:
        workbook = openpyxl.reader.excel.load_workbook(filename)
    except Exception as exc:
        # Report why loading failed and exit non-zero so callers/shells can
        # tell the run did not succeed.
        print("Loading %s failed: %s" % (filename, exc))
        sys.exit(1)

    for sheetname in workbook.get_sheet_names():
        if sheetname == "Legend-Key":
            continue
        obsclass = self.observationclasses.setdefault(
            sheetname, SDTMObservationClass(sheetname))
        sheet = workbook.get_sheet_by_name(sheetname)
        for row in sheet.rows:
            if si(row[0]).startswith("SDTM v3.1"):
                # header row: columns 0-3 describe the variable, the rest
                # name the domains
                for col_idx, cell in enumerate(row):
                    if col_idx >= 4:
                        obsclass.add_domain(col_idx, si(cell))
                continue
            if si(row[1]) == "":
                continue
            for col_idx, cell in enumerate(row):
                domain = obsclass.domains.get(col_idx)
                if not domain:
                    continue
                usage = si(cell)
                if usage.lower() != "not used":
                    # add the column to the dataset
                    # NOTE(review): the first four arguments are passed as
                    # raw cells, not si() values - confirm add_variable
                    # expects that.
                    domain.add_variable(row[0], row[1], row[2], row[3], usage)
        print("Loaded %s" % sheetname)

    for obsclass in self.observationclasses.values():
        for domain in obsclass.domains.values():
            print("%s: %s (%s variables)" % (obsclass.name, domain.name,
                                             len(domain.variables)))
def load_files(self, path):
    """Load the generic template sheets from the workbooks under *path*.

    Only ``*.xlsx`` files directly inside *path* whose basename matches
    "<name> Template.xlsx" and does not start with "~" are opened.  In each
    workbook, only sheets named "Generic <NAME> Template" are read:

    * the "Domain" row names the object being described; a name containing
      "GENERIC" (case-insensitive) yields an ``SDTMObservationClass``,
      anything else an ``SDTMDomain``;
    * the "Variable Name" row marks the start of the variable list;
    * each following row with a non-empty first cell is passed to
      ``add_row`` as a dict keyed by ``COLUMNS``.

    The resulting object is stored in ``self.domains`` under its ``name``.

    Raises:
        SystemExit: with status 1 when *path* does not exist or a workbook
            cannot be loaded.
    """
    # the SHARE content exists in a number of sheets
    if not os.path.exists(path):
        print("No such path %s" % path)
        sys.exit(1)

    for document in glob.glob(os.path.join(path, "*.xlsx")):
        if not re.match(r"^[^~](.*) Template.xlsx$", os.path.basename(document)):
            continue
        print("Checking %s" % document)
        try:
            # glob already returns paths that include *path*; joining it a
            # second time breaks relative directories (path/path/file.xlsx).
            workbook = openpyxl.reader.excel.load_workbook(document)
        except Exception as exc:
            print("Loading %s failed: %s" % (document, exc))
            sys.exit(1)

        for sheetname in workbook.get_sheet_names():
            if not re.match(r"^Generic ([A-Z\-\s]+) Template$", sheetname.strip()):
                continue
            sheet = workbook.get_sheet_by_name(sheetname)
            domain_or_class = None
            # index of the "Variable Name" row; None until it has been seen,
            # so no row is mistaken for data before the header appears
            header_row = None
            for row_idx, row in enumerate(sheet.rows):
                label = si(row[0])
                if label == "Domain":
                    for name in [si(cell) for cell in row[1:]]:
                        if name == "":
                            continue
                        if "GENERIC" in name.upper():
                            domain_or_class = SDTMObservationClass(name)
                        else:
                            domain_or_class = SDTMDomain(name)
                elif label == "Variable Name":
                    header_row = row_idx
                elif (header_row is not None and label != ""
                        and domain_or_class is not None):
                    domain_or_class.add_row(
                        dict(zip(COLUMNS, [si(cell) for cell in row])))

            if domain_or_class is None:
                # without a "Domain" row there is nothing to register
                print("No domain found in sheet %s" % sheetname)
                continue
            # register the domain once all rows of the sheet are consumed
            self.domains[domain_or_class.name] = domain_or_class