예제 #1
0
 def parse(self, filename):
   """Read the workbook at *filename* into ``self.observationclasses``.

   Every sheet except "Legend-Key" is mapped to an SDTMObservationClass,
   keyed by sheet name (an existing entry for that name is reused).
   Within a sheet:

   * a row whose first cell starts with "SDTM v3.1" is the header row;
     its cells from column index 4 onwards are registered as domains;
   * a row whose second cell is empty is skipped;
   * in any other row, each column that has a registered domain and whose
     text is not "not used" (case-insensitive) adds a variable to that
     domain, passing the row's first four cells plus the column text.

   Progress and a per-domain variable count are printed to stdout.

   Raises:
     SystemExit: with status 1 when the workbook cannot be loaded.
   """
   try:
     workbook = openpyxl.reader.excel.load_workbook(filename)
   except Exception as exc:
     # ``except Exception`` (rather than a bare ``except``) lets Ctrl-C and
     # SystemExit propagate; the reason is reported so the failure can be
     # diagnosed, and the exit status is non-zero because this is an error.
     print("Loading %s failed: %s" % (filename, exc))
     sys.exit(1)
   for sheetname in workbook.get_sheet_names():
     if sheetname == "Legend-Key":
       continue
     obsclass = self.observationclasses.setdefault(sheetname, SDTMObservationClass(sheetname))
     sheet = workbook.get_sheet_by_name(sheetname)
     for row in sheet.rows:
       # Rows narrower than four cells cannot contribute anything: a header
       # only registers columns >= 4 and a data row needs cells 0-3.
       # Skipping them avoids an IndexError on the lookups below.
       if len(row) < 4:
         continue
       # header row
       if si(row[0]).startswith("SDTM v3.1"):
         for (col_idx, col) in enumerate(row):
           if col_idx >= 4:
             obsclass.add_domain(col_idx, si(col))
       elif si(row[1]) == "":
         continue
       else:
         for (col_idx, col) in enumerate([si(x) for x in row]):
           domain = obsclass.domains.get(col_idx)
           if not domain or col.lower() == "not used":
             continue
           # add the column to the dataset
           # NOTE(review): the first four arguments are the raw cells, not
           # their si() text -- confirm add_variable expects cells.
           domain.add_variable(row[0], row[1], row[2], row[3], col)
     print("Loaded %s" % sheetname)
   for obclass in self.observationclasses.values():
     for domain in obclass.domains.values():
       print("%s: %s (%s variables)" % (obclass.name, domain.name, len(domain.variables)))
예제 #2
0
 def load_files(self, path):
   """Load every "<name> Template.xlsx" workbook in *path* into ``self.domains``.

   Workbooks whose file name starts with "~" or does not end in
   " Template.xlsx" are ignored. In each workbook, only sheets named
   "Generic <UPPERCASE NAME> Template" are read. Within such a sheet:

   * the row whose first cell is "Domain" names the target: the last
     non-empty cell after the first creates an SDTMObservationClass when it
     contains "GENERIC" (case-insensitive), otherwise an SDTMDomain;
   * the row whose first cell is "Variable Name" marks the start of the
     variable rows;
   * every later row with a non-empty first cell is added to the target via
     ``add_row`` as a dict of COLUMNS zipped with the row's cell text.

   The resulting object is stored in ``self.domains`` under its ``name``.
   Sheets without a "Domain" row are reported and skipped.

   Raises:
     SystemExit: with status 1 when *path* does not exist or a workbook
       cannot be loaded.
   """
   # the SHARE content exists in a number of sheets
   if not os.path.exists(path):
     print("No such path %s" % path)
     sys.exit(1)
   workbook_name = re.compile(r"^[^~](.*) Template\.xlsx$")
   template_sheet = re.compile(r"^Generic ([A-Z\-\s]+) Template$")
   for document in glob.glob(os.path.join(path, "*.xlsx")):
     if not workbook_name.match(os.path.basename(document)):
       continue
     print("Checking %s" % document)
     try:
       # glob already returns the path including *path*; joining it again
       # would produce "dir/dir/file.xlsx" for a relative directory.
       workbook = openpyxl.reader.excel.load_workbook(document)
     except Exception as exc:
       print("Loading %s failed: %s" % (document, exc))
       sys.exit(1)
     for sheetname in workbook.get_sheet_names():
       if not template_sheet.match(sheetname.strip()):
         continue
       sheet = workbook.get_sheet_by_name(sheetname)
       domain_or_class = None
       # Becomes True once the "Variable Name" header has been seen; an
       # explicit flag (instead of a "large" row offset) means rows are
       # never mistaken for variables when the header is late or missing.
       in_variables = False
       for row in sheet.rows:
         if not row:
           continue
         label = si(row[0])
         if label == "Domain":
           for col in [si(x) for x in row[1:]]:
             if col == "":
               continue
             if "GENERIC" in col.upper():
               domain_or_class = SDTMObservationClass(col)
             else:
               domain_or_class = SDTMDomain(col)
         elif label == "Variable Name":
           in_variables = True
         elif in_variables and label != "" and domain_or_class is not None:
           domain_or_class.add_row(dict(zip(COLUMNS, [si(x) for x in row])))
       if domain_or_class is None:
         # Without a "Domain" row there is nothing to register; say so
         # rather than failing with an AttributeError on None.
         print("Skipping sheet %s: no Domain row found" % sheetname)
         continue
       self.domains[domain_or_class.name] = domain_or_class