def xls_value_to_unicode(value, value_type):
    """
    Take a xls formatted value and try to make a unicode string
    representation.

    value -- the raw cell value.
    value_type -- the cell type, compared against the xlrd.XL_CELL_*
        constants.

    Booleans become u"TRUE" / u"FALSE", whole numbers are rendered without
    a fractional part, and dates are rendered through datetime (time-only
    cells as a datetime.time). Any other value is converted to unicode with
    non-breaking spaces replaced by ordinary spaces.

    Note: ``workbook`` is not a parameter; it is looked up in the enclosing
    scope and supplies the datemode used for date conversion.
    """
    if value_type == xlrd.XL_CELL_BOOLEAN:
        return u"TRUE" if value else u"FALSE"
    elif value_type == xlrd.XL_CELL_NUMBER:
        # Try to display as an int if possible.
        int_value = int(value)
        if int_value == value:
            return unicode(int_value)
        else:
            return unicode(value)
    elif value_type == xlrd.XL_CELL_DATE:
        # Compared with == (not ``is``): cell types are plain ints and
        # identity of ints is an implementation detail.
        #
        # Warn that it is better to single quote as a string.
        # error_location = cellFormatString % (ss_row_idx, ss_col_idx)
        # raise Exception(
        #   "Cannot handle excel formatted date at " + error_location)
        datetime_or_time_only = xlrd.xldate_as_tuple(
            value, workbook.datemode)
        if datetime_or_time_only[:3] == (0, 0, 0):
            # must be time only
            return unicode(datetime.time(*datetime_or_time_only[3:]))
        return unicode(datetime.datetime(*datetime_or_time_only))
    else:
        # ensure unicode and replace nbsp spaces with normal ones
        # to avoid this issue:
        # https://github.com/modilabs/pyxform/issues/83
        return unicode(value).replace(unichr(160), ' ')
def parse(self, xml_str):
    """
    Parse an XML string and populate the parsed representations on self.

    Sets self._xml_obj (the minidom document), self._root_node (its
    document element), self._dict (the result of _xml_node_to_dict on the
    root) and self._flat_dict (a mapping from u"/"-joined paths, with the
    first path component dropped, to values), then calls
    self._set_attributes().
    """
    # Drop surrounding whitespace and any whitespace between adjacent tags
    # before handing the text to minidom.
    compact_xml = re.sub(unicode(r">\s+<"), unicode("><"), xml_str.strip())

    document = minidom.parseString(compact_xml)
    self._xml_obj = document
    self._root_node = document.documentElement
    self._dict = _xml_node_to_dict(self._root_node)

    flattened = self._flat_dict = {}
    for path, value in _flatten_dict(self._dict, []):
        # path[0] is skipped so the stored keys do not start with it.
        flattened[u"/".join(path[1:])] = value

    self._set_attributes()
def csv_to_dict(path_or_file):
    """
    Read a csv file into an ordered dict of sheets.

    path_or_file -- a path (opened here in binary mode) or an already open
        file object.

    The first column of each row optionally names a sheet; the remaining
    columns are that row's content. The first content row seen after a
    sheet name is treated as the header row and stored under
    u"<sheet name>_header"; every later content row becomes an OrderedDict
    of header -> stripped value (empty values are omitted) appended to the
    list stored under the sheet name.

    The file object is always closed before returning, including when
    parsing raises.
    """
    if isinstance(path_or_file, basestring):
        csv_data = open(path_or_file, 'rb')
    else:
        csv_data = path_or_file

    _dict = OrderedDict()

    def first_column_as_sheet_name(row):
        """Split a row into (sheet name or None, content list or None)."""
        if len(row) == 0:
            return None, None
        elif len(row) == 1:
            return row[0], None
        else:
            s_or_c = row[0]
            content = row[1:]
            if s_or_c == '':
                s_or_c = None
            # content is a list of empty strings when joining it
            # yields an empty string
            if ''.join(content) == '':
                content = None
            return s_or_c, content

    # try/finally so the handle is released even if a row fails to parse.
    try:
        reader = csv.reader(csv_data, encoding='utf-8')
        sheet_name = None
        current_headers = None
        for row in reader:
            survey_or_choices, content = first_column_as_sheet_name(row)
            if survey_or_choices is not None:
                sheet_name = survey_or_choices
                if sheet_name not in _dict:
                    _dict[unicode(sheet_name)] = []
                # A new sheet name means the next content row is a header.
                current_headers = None
            if content is not None:
                if current_headers is None:
                    current_headers = content
                    _dict[u"%s_header" % sheet_name] = \
                        _list_to_dict_list(current_headers)
                else:
                    _d = OrderedDict()
                    for key, val in zip(current_headers, content):
                        if val != "":
                            # Slight modification so values are striped
                            # this is because csvs often spaces following
                            # commas (but the csv reader might already
                            # handle that.)
                            _d[unicode(key)] = unicode(val.strip())
                    _dict[sheet_name].append(_d)
    finally:
        csv_data.close()
    return _dict
def _flatten_dict(d, prefix): """ Return a list of XPath, value pairs. """ assert type(d) == dict assert type(prefix) == list for key, value in d.items(): new_prefix = prefix + [key] if type(value) == dict: for pair in _flatten_dict(value, new_prefix): yield pair elif type(value) == list: for i, item in enumerate(value): item_prefix = list(new_prefix) # make a copy # note on indexing xpaths: IE5 and later has # implemented that [0] should be the first node, but # according to the W3C standard it should have been # [1]. I'm adding 1 to i to start at 1. item_prefix[-1] += u"[%s]" % unicode(i + 1) if type(item) == dict: for pair in _flatten_dict(item, item_prefix): yield pair else: yield (item_prefix, item) else: yield (new_prefix, value)
def _generate_static_instances(self):
    """
    Generates <instance> elements for static data
    (e.g. choices for select type questions)

    Yields one "instance" node per entry of self.choices, with its id set
    to the list name and one "item" child per choice.
    """
    for list_name, choice_list in self.choices.items():
        items = []
        for position, choice in enumerate(choice_list):
            # Every item starts with a unique id so that itext can
            # reference it.
            itext_id = '-'.join(['static_instance', list_name, str(position)])
            item_children = [node("itextId", itext_id)]

            # Only string-valued properties other than the label are
            # written out as child elements.
            for prop_name, prop_value in choice.items():
                if prop_name == 'label':
                    continue
                if not isinstance(prop_value, basestring):
                    continue
                item_children.append(node(prop_name, unicode(prop_value)))

            items.append(node("item", *item_children))

        yield node("instance", node("root", *items), id=list_name)
def insert_xpaths(self, text):
    """
    Replace all instances of ${var} with the xpath to var.

    The replacement for each match is produced by
    self._var_repl_function; text is converted to unicode first.
    """
    variable_pattern = re.compile(r"\$\{(.*?)\}")
    source_text = unicode(text)
    return variable_pattern.sub(self._var_repl_function, source_text)
def __init__(self, path_or_file):
    """
    Load a workbook from a path or from an open file object.

    path_or_file -- a filesystem path, or an object exposing a ``name``
        attribute holding the path (such as an open file).

    The workbook is parsed into self._dict, and the id, name, print name
    and title are all initialised from the file name.
    """
    path = path_or_file
    # Detect file objects by their ``name`` attribute rather than by exact
    # type, so file-like objects and subclasses are handled as well.
    try:
        path = path.name
    except AttributeError:
        pass
    self._dict = parse_file_to_workbook_dict(path)
    self._path = path
    self._id = unicode(get_filename(path))
    self._name = self._print_name = self._title = self._id
def xml_instance(self):
    """
    Build the instance node for this element.

    The node is named self.name and carries the u'instance' attributes
    with every value passed through the root survey's insert_xpaths.
    When a u"default" value is set it becomes the node's content.
    """
    survey = self.get_root()
    raw_attributes = dict(self.get(u'instance', {}))
    attributes = dict(
        (key, survey.insert_xpaths(value))
        for key, value in raw_attributes.items())

    default_value = self.get(u"default")
    if not default_value:
        return node(self.name, **attributes)
    return node(self.name, unicode(default_value), **attributes)
def __init__(self, path_or_file):
    """
    Load a workbook from a path or from an object with a ``name``.

    When path_or_file has a ``name`` attribute that value is used as the
    path; otherwise path_or_file itself is the path. The workbook is
    parsed into self._dict and the id, name, print name and title are all
    set from the file name.
    """
    path = getattr(path_or_file, "name", path_or_file)

    self._dict = parse_file_to_workbook_dict(path)
    self._path = path

    identifier = unicode(get_filename(path))
    self._id = identifier
    self._title = identifier
    self._print_name = identifier
    self._name = identifier
def xml_instance(self):
    """
    Return the instance node for this element.

    Attributes come from the u'instance' entry, each value rewritten by
    the root survey's insert_xpaths. A truthy u"default" entry is used,
    converted to unicode, as the node content.
    """
    survey = self.get_root()

    attributes = {}
    for key, value in dict(self.get(u'instance', {})).items():
        attributes[key] = survey.insert_xpaths(value)

    default_value = self.get(u"default")
    node_args = [self.name]
    if default_value:
        node_args.append(unicode(default_value))
    return node(*node_args, **attributes)
def parse_file_to_json(path, default_name=None, default_language=u"default",
                       warnings=None, file_object=None):
    """
    A wrapper for workbook_to_json

    Parses the file at path (or file_object) into a workbook dict and
    converts it. default_name falls back to the file name of path, and
    warnings falls back to a fresh list.
    """
    collected_warnings = [] if warnings is None else warnings
    workbook_dict = parse_file_to_workbook_dict(path, file_object)
    form_name = default_name
    if form_name is None:
        form_name = unicode(get_filename(path))
    return workbook_to_json(
        workbook_dict, form_name, default_language, collected_warnings)
def detail(handler, note):
    """
    Render a single markdown note as an HTML page.

    handler -- the request handler used to send the response.
    note -- the note's path relative to config.note_dir.

    Responds with 404 when the note is not a regular file inside
    config.note_dir; otherwise renders the markdown through the
    'detail.html' template and sends it with simple_response.
    """
    note_root = os.path.abspath(config.note_dir)
    local_path = os.path.abspath(os.path.join(note_root, note))

    # Refuse anything that resolves outside the notes directory (for
    # example a note containing ".." or an absolute path), and anything
    # that is not a regular file, since open() would fail on a directory.
    inside_root = local_path.startswith(os.path.join(note_root, ''))
    if not inside_root or not os.path.isfile(local_path):
        handler.send_response(404)
        return

    with open(local_path) as f:
        md = f.read()
    md = utils.unicode(md)
    content = markdown.markdown(md, ['toc'])
    html = template_engine.render('detail.html', {'content': content})
    headers = {
        'Content-Type': 'text/html;charset=UTF-8',
        # NOTE(review): len(html) counts characters when html is unicode;
        # confirm simple_response does not need the encoded byte length.
        'Content-Length': len(html)
    }
    handler.simple_response(headers, html)
def parse_file_to_json(path, default_name=None, default_language=u"default",
                       warnings=None, file_object=None):
    """
    A wrapper for workbook_to_json

    The workbook is read from path (or file_object). When default_name is
    None the file name of path is used; when warnings is None a new list
    is used.
    """
    if warnings is None:
        warnings = []

    workbook_dict = parse_file_to_workbook_dict(path, file_object)

    name = (unicode(get_filename(path))
            if default_name is None else default_name)

    return workbook_to_json(workbook_dict, name, default_language, warnings)
def insert_output_values(self, text):
    """
    Replace all the ${variables} in text with xpaths.
    Returns that and a boolean indicating if there were any ${variables}
    present.
    """
    # There was a bug where escaping is completely turned off in labels
    # where variable replacement is used.
    # For example, `${name} < 3` causes an error but `< 3` does not.
    # This is a hacky fix for it, which does string escaping prior to
    # variable replacement:
    escaping_node = PatchedText()
    escaping_node.data = text
    xml_text = escaping_node.toxml()

    # Only substitute when there is something that could be a variable.
    # xml_text has already been XML-escaped at this point, so when there
    # is nothing to replace the original, unescaped text is returned
    # (presumably so that it is not escaped a second time later on).
    escaped = unicode(xml_text)
    if u'{' not in escaped:
        return text, False

    replaced = re.sub(
        r"\$\{(.*?)\}", self._var_repl_output_function, escaped)
    return replaced, replaced != xml_text
def xml_instance(self):
    """
    Return the Section instance node decorated with survey attributes.

    Custom attributes from self.attribute are applied first so that they
    cannot overwrite the id, xmlns and version set afterwards.
    """
    instance_node = Section.xml_instance(self)

    # set these first to prevent overwriting id and version
    for attribute_name, attribute_value in self.attribute.items():
        instance_node.setAttribute(unicode(attribute_name), attribute_value)

    instance_node.setAttribute(u"id", self.id_string)

    # xmlns and version are only written when they are set
    optional_attributes = (
        (u"xmlns", self.instance_xmlns),
        (u"version", self.version),
    )
    for attribute_name, attribute_value in optional_attributes:
        if attribute_value:
            instance_node.setAttribute(attribute_name, attribute_value)

    return instance_node
def writerow(self, row):
    """
    Write one row to the target stream in the target encoding.

    Truthy cells are converted to unicode and encoded as UTF-8 (with
    'replace' error handling); falsy cells are passed through unchanged.
    The row is written to the intermediate queue, re-encoded with
    self.encoder and written to self.stream.
    """
    # If this breaks, experiment with the workaround code in my
    # utils.unicode module....
    encoded_row = [
        unicode(cell).encode('utf-8', 'replace') if cell else cell
        for cell in row
    ]
    self.writer.writerow(encoded_row)

    # Fetch UTF-8 output from the queue, reencode it into the target
    # encoding and write it to the target stream.
    utf8_output = self.queue.getvalue()
    reencoded = self.encoder.encode(utf8_output.decode("utf-8"))
    self.stream.write(reencoded)

    # empty queue
    self.queue.truncate(0)
def next(self):
    """Return the next row of self.reader with each cell decoded from UTF-8."""
    decoded_row = []
    for cell in self.reader.next():
        decoded_row.append(unicode(cell, "utf-8"))
    return decoded_row
def get_abbreviated_xpath(self):
    """
    Return the u"/"-joined names of this element's lineage without the
    first entry, or the first entry's name when the lineage has fewer
    than two entries.
    """
    lineage = self.get_lineage()
    if len(lineage) < 2:
        return lineage[0].name
    names_below_first = [unicode(element.name) for element in lineage[1:]]
    return u"/".join(names_below_first)
def __repr__(self):
    """Return the unicode conversion of this object as its representation."""
    text = unicode(self)
    return text
def workbook_to_json(workbook_dict, form_name=None,
                     default_language=u"default", warnings=None):
    """
    workbook_dict -- nested dictionaries representing a spreadsheet.
                    should be similar to those returned by xls_to_dict
    form_name -- The spreadsheet's filename
    default_language -- default_language does two things:
    1. In the xform the default language is the language reverted to when
       there is no translation available for some itext element. Because
       of this every itext element must have a default language translation.
    2. In the workbook if media/labels/hints that do not have a language
       suffix will be treated as though their suffix is the default language.
       If the default language is used as a suffix for media/labels/hints,
       then the suffixless version will be overwritten.
    warnings -- an optional list which warnings will be appended to

    returns a nested dictionary equivalent to the format specified in the
    json form spec.

    raises PyXFormError when the survey sheet is missing, empty or lacks a
    'type' column, and for malformed rows (missing type, name or
    calculation, invalid names, unmatched begin/end statements, unknown
    list names, choice names with spaces in multiple choice selects).
    """
    # ensure required headers are present
    if warnings is None:
        warnings = []
    is_valid = any('type' in row for row in workbook_dict.get('survey', []))
    if not is_valid:
        raise PyXFormError(
            u"The survey sheet is either empty or missing important "
            u"column headers.")
    row_format_string = '[row : %s]'

    # Make sure the passed in vars are unicode
    form_name = unicode(form_name)
    default_language = unicode(default_language)

    # We check for double colons to determine whether to use them
    # or single colons to delimit grouped headers.
    # Single colons are bad because they conflict with the xform namespace
    # syntax (i.e. jr:constraintMsg),
    # so we only use them if we have to for backwards compatibility.
    use_double_colons = has_double_colon(workbook_dict)

    # Break the spreadsheet dict into easier to access objects
    # (settings, choices, survey_sheet):
    # ########## Settings sheet ##########
    settings_sheet = dealias_and_group_headers(
        workbook_dict.get(constants.SETTINGS, []),
        aliases.settings_header, use_double_colons)
    settings = settings_sheet[0] if len(settings_sheet) > 0 else {}

    default_language = settings.get(constants.DEFAULT_LANGUAGE,
                                    default_language)

    # add_none_option is a boolean that when true,
    # indicates a none option should automatically be added to selects.
    # It should probably be deprecated but I haven't checked yet.
    if u"add_none_option" in settings:
        settings[u"add_none_option"] = aliases.yes_no.get(
            settings[u"add_none_option"], False)

    # Here we create our json dict root with default settings:
    id_string = settings.get(constants.ID_STRING, form_name)
    sms_keyword = settings.get(constants.SMS_KEYWORD, id_string)
    json_dict = {
        constants.TYPE: constants.SURVEY,
        constants.NAME: form_name,
        constants.TITLE: id_string,
        constants.ID_STRING: id_string,
        constants.SMS_KEYWORD: sms_keyword,
        constants.DEFAULT_LANGUAGE: default_language,
        # By default the version is based on the date and time yyyymmddhh
        # Leaving default version out for now since it might cause
        # problems for formhub.
        # constants.VERSION : datetime.datetime.now().strftime("%Y%m%d%H"),
        constants.CHILDREN: []
    }
    # Here the default settings are overridden by those in the settings sheet
    json_dict.update(settings)

    # ########## Choices sheet ##########
    # Columns and "choices and columns" sheets are deprecated,
    # but we combine them with the choices sheet for backwards-compatibility.
    choices_and_columns_sheet = workbook_dict.get(
        constants.CHOICES_AND_COLUMNS, {})
    choices_and_columns_sheet = dealias_and_group_headers(
        choices_and_columns_sheet, aliases.list_header,
        use_double_colons, default_language)

    columns_sheet = workbook_dict.get(constants.COLUMNS, [])
    columns_sheet = dealias_and_group_headers(columns_sheet,
                                              aliases.list_header,
                                              use_double_colons,
                                              default_language)

    choices_sheet = workbook_dict.get(constants.CHOICES, [])
    choices_sheet = dealias_and_group_headers(choices_sheet,
                                              aliases.list_header,
                                              use_double_colons,
                                              default_language)
    # ########## Cascading Select sheet ###########
    cascading_choices = workbook_dict.get(constants.CASCADING_CHOICES, [])
    if len(cascading_choices):
        if 'choices' in cascading_choices[0]:
            choices_sheet = choices_sheet + cascading_choices[0]['choices']

    combined_lists = group_dictionaries_by_key(
        choices_and_columns_sheet + choices_sheet + columns_sheet,
        constants.LIST_NAME)

    choices = combined_lists
    # Make sure all the options have the required properties:
    warnedabout = set()
    for list_name, options in choices.items():
        for option in options:
            if 'name' not in option:
                info = "[list_name : " + list_name + ']'
                raise PyXFormError("On the choices sheet there is "
                                   "a option with no name. " + info)
            if 'label' not in option:
                info = "[list_name : " + list_name + ']'
                warnings.append(
                    "On the choices sheet there is a option with no label. " +
                    info)
            # chrislrobert's fix for a cryptic error message:
            # see: https://code.google.com/p/opendatakit/issues/detail?id=832&start=200 # noqa
            # Iterate over a copy of the keys because entries are deleted
            # from the option inside the loop.
            option_keys = list(option.keys())
            for headername in option_keys:
                # Using warnings and removing the bad columns
                # instead of throwing errors because some forms
                # use choices column headers for notes.
                if ' ' in headername:
                    if headername not in warnedabout:
                        warnedabout.add(headername)
                        warnings.append("On the choices sheet there is " +
                                        "a column (\"" +
                                        headername +
                                        "\") with an illegal header. " +
                                        "Headers cannot include spaces.")
                    del option[headername]
                elif headername == '':
                    warnings.append("On the choices sheet there is a value" +
                                    " in a column with no header.")
                    del option[headername]

    # ########## Survey sheet ###########
    if constants.SURVEY not in workbook_dict:
        raise PyXFormError("You must have a sheet named (case-sensitive): " +
                           constants.SURVEY)
    survey_sheet = workbook_dict[constants.SURVEY]
    # Process the headers:
    clean_text_values_enabled = aliases.yes_no.get(
        settings.get("clean_text_values", "true()"))
    if clean_text_values_enabled:
        survey_sheet = clean_text_values(survey_sheet)
    survey_sheet = dealias_and_group_headers(survey_sheet,
                                             aliases.survey_header,
                                             use_double_colons,
                                             default_language)
    survey_sheet = dealias_types(survey_sheet)

    osm_sheet = workbook_dict.get(constants.OSM, [])
    osm_tags = group_dictionaries_by_key(osm_sheet, constants.LIST_NAME)
    # #################################

    # Parse the survey sheet while generating a survey in our json format:

    row_number = 1  # We start at 1 because the column header row is not
    #                 included in the survey sheet (presumably).
    # A stack is used to keep track of begin/end expressions
    stack = [(None, json_dict.get(constants.CHILDREN))]
    # If a group has a table-list appearance flag
    # this will be set to the name of the list
    table_list = None
    # For efficiency we compile all the regular expressions
    # that will be used to parse types:
    end_control_regex = re.compile(r"^(?P<end>end)(\s|_)(?P<type>(" +
                                   '|'.join(aliases.control.keys()) + r"))$")
    begin_control_regex = re.compile(r"^(?P<begin>begin)(\s|_)(?P<type>(" +
                                     '|'.join(aliases.control.keys()) +
                                     r"))( (over )?(?P<list_name>\S+))?$")
    select_regexp = re.compile(
        r"^(?P<select_command>(" + '|'.join(aliases.select.keys()) +
        r")) (?P<list_name>\S+)" +
        "( (?P<specify_other>(or specify other|or_other|or other)))?$")
    cascading_regexp = re.compile(r"^(?P<cascading_command>(" +
                                  '|'.join(aliases.cascading.keys()) +
                                  r")) (?P<cascading_level>\S+)?$")
    osm_regexp = re.compile(r"(?P<osm_command>(" +
                            '|'.join(aliases.osm.keys()) +
                            r')) (?P<list_name>\S+)')

    def replace_prefix(d, prefix):
        """Replace '$PREFIX$' in every string nested inside d, in place."""
        for k, v in d.items():
            if isinstance(v, basestring):
                d[k] = v.replace('$PREFIX$', prefix)
            elif isinstance(v, dict):
                d[k] = replace_prefix(v, prefix)
            elif isinstance(v, list):
                # A real list (not a lazy map object) so the replacement
                # happens immediately and the result stays serialisable.
                d[k] = [replace_prefix(x, prefix) for x in v]
        return d

    for row in survey_sheet:
        row_number += 1
        prev_control_type, parent_children_array = stack[-1]
        # Disabled should probably be first
        # so the attributes below can be disabled.
        if u"disabled" in row:
            warnings.append(
                row_format_string % row_number +
                " The 'disabled' column header is not part of the current" +
                " spec. We recommend using relevant instead.")
            disabled = row.pop(u"disabled")
            if aliases.yes_no.get(disabled):
                continue

        # skip empty rows
        if len(row) == 0:
            continue

        # Get question type
        question_type = row.get(constants.TYPE)
        if not question_type:
            # if name and label are also missing,
            # then its a comment row, and we skip it with warning
            if not ((constants.NAME in row) or (constants.LABEL in row)):
                warnings.append(
                    row_format_string % row_number +
                    " Row without name, text, or label is being skipped:\n" +
                    str(row))
                continue
            raise PyXFormError(row_format_string % row_number +
                               " Question with no type.\n" + str(row))

        if question_type == 'calculate':
            calculation = row.get('bind', {}).get('calculate')
            if not calculation:
                raise PyXFormError(row_format_string % row_number +
                                   " Missing calculation.")

        # Check if the question is actually a setting specified
        # on the survey sheet
        settings_type = aliases.settings_header.get(question_type)
        if settings_type:
            json_dict[settings_type] = unicode(row.get(constants.NAME))
            continue

        # Try to parse question as a end control statement
        # (i.e. end loop/repeat/group):
        end_control_parse = end_control_regex.search(question_type)
        if end_control_parse:
            parse_dict = end_control_parse.groupdict()
            if parse_dict.get("end") and "type" in parse_dict:
                control_type = aliases.control[parse_dict["type"]]
                if prev_control_type != control_type or len(stack) == 1:
                    raise PyXFormError(
                        row_format_string % row_number +
                        " Unmatched end statement. Previous control type: " +
                        str(prev_control_type) +
                        ", Control type: " + str(control_type))
                stack.pop()
                table_list = None
                continue

        # Make sure the row has a valid name
        if constants.NAME not in row:
            if row['type'] == 'note':
                # autogenerate names for notes without them
                row['name'] = "generated_note_name_" + str(row_number)
            # elif 'group' in row['type'].lower():
            #     # autogenerate names for groups without them
            #     row['name'] = "generated_group_name_" + str(row_number)
            else:
                raise PyXFormError(row_format_string % row_number +
                                   " Question or group with no name.")
        question_name = unicode(row[constants.NAME])
        if not is_valid_xml_tag(question_name):
            error_message = row_format_string % row_number
            error_message += " Invalid question name [" + \
                             question_name.encode('utf-8') + "] "
            error_message += "Names must begin with a letter, colon,"\
                             + " or underscore. "
            error_message += "Subsequent characters can include numbers," \
                             + " dashes, and periods."
            raise PyXFormError(error_message)

        if constants.LABEL not in row and \
           row.get(constants.MEDIA) is None and \
           question_type not in aliases.label_optional_types:
            # TODO: Should there be a default label?
            #      Not sure if we should throw warnings for groups...
            #      Warnings can be ignored so I'm not too concerned
            #      about false positives.
            warnings.append(row_format_string % row_number +
                            " Question has no label: " + str(row))

        # Try to parse question as begin control statement
        # (i.e. begin loop/repeat/group):
        begin_control_parse = begin_control_regex.search(question_type)
        if begin_control_parse:
            parse_dict = begin_control_parse.groupdict()
            if parse_dict.get("begin") and "type" in parse_dict:
                # Create a new json dict with children, and the proper type,
                # and add it to parent_children_array in place of a question.
                # parent_children_array will then be set to its children array
                # (so following questions are nested under it)
                # until an end command is encountered.
                control_type = aliases.control[parse_dict["type"]]
                new_json_dict = row.copy()
                new_json_dict[constants.TYPE] = control_type
                child_list = list()
                new_json_dict[constants.CHILDREN] = child_list
                # Compared by value: string identity is not guaranteed.
                if control_type == constants.LOOP:
                    if not parse_dict.get("list_name"):
                        # TODO: Perhaps warn and make repeat into a group?
                        raise PyXFormError(row_format_string % row_number +
                                           " Repeat loop without list name.")
                    list_name = parse_dict["list_name"]
                    if list_name not in choices:
                        raise PyXFormError(
                            row_format_string % row_number +
                            " List name not in columns sheet: " + list_name)
                    new_json_dict[constants.COLUMNS] = choices[list_name]

                # Generate a new node for the jr:count column so
                # xpath expressions can be used.
                repeat_count_expression = \
                    new_json_dict.get('control', {}).get('jr:count')
                if repeat_count_expression:
                    generated_node_name = new_json_dict['name'] + "_count"
                    parent_children_array.append({
                        "name": generated_node_name,
                        "bind": {
                            "readonly": "true()",
                            "calculate": repeat_count_expression,
                        },
                        "type": "calculate",
                    })
                    new_json_dict['control']['jr:count'] = \
                        "${" + generated_node_name + "}"

                # Code to deal with table_list appearance flags
                # (for groups of selects)
                ctrl_ap = new_json_dict.get(u"control", {}).get(u"appearance")
                if ctrl_ap == constants.TABLE_LIST:
                    table_list = True
                    new_json_dict[u"control"][u"appearance"] = u"field-list"
                    # Generate a note label element so hints and labels
                    # work as expected in table-lists.
                    # see https://github.com/modilabs/pyxform/issues/62
                    if 'label' in new_json_dict or 'hint' in new_json_dict:
                        generated_label_element = {
                            "type": "note",
                            "name":
                                "generated_table_list_label_" +
                                str(row_number)
                        }
                        if 'label' in new_json_dict:
                            generated_label_element[constants.LABEL] = \
                                new_json_dict[constants.LABEL]
                            del new_json_dict[constants.LABEL]
                        if 'hint' in new_json_dict:
                            generated_label_element['hint'] = \
                                new_json_dict['hint']
                            del new_json_dict['hint']
                        child_list.append(generated_label_element)

                if 'intent' in new_json_dict:
                    new_json_dict['control'] = \
                        new_json_dict.get(u"control", {})
                    new_json_dict['control']['intent'] = \
                        new_json_dict['intent']

                parent_children_array.append(new_json_dict)
                stack.append((control_type, child_list))
                continue

        # try to parse as a cascading select
        cascading_parse = cascading_regexp.search(question_type)
        if cascading_parse:
            parse_dict = cascading_parse.groupdict()
            if parse_dict.get("cascading_command"):
                cascading_level = parse_dict["cascading_level"]
                cascading_prefix = row.get(constants.NAME)
                if not cascading_prefix:
                    raise PyXFormError(row_format_string % row_number +
                                       " Cascading select needs a name.")
                # cascading_json = get_cascading_json(
                # cascading_choices, cascading_prefix, cascading_level)
                if len(cascading_choices) <= 0 or \
                        'questions' not in cascading_choices[0]:
                    # %s formatting because cascading_level is an optional
                    # regex group and may be None.
                    raise PyXFormError(
                        "Found a cascading_select %s, but could not find "
                        "%s in cascades sheet." %
                        (cascading_level, cascading_level))
                cascading_json = cascading_choices[0]['questions']
                json_dict['choices'] = choices
                include_bindings = False
                if 'bind' in row:
                    include_bindings = True
                for cq in cascading_json:
                    # include bindings
                    if include_bindings:
                        cq['bind'] = row['bind']
                    parent_children_array.append(
                        replace_prefix(cq, cascading_prefix))
                continue  # so the row isn't put in as is

        # Try to parse question as a select:
        select_parse = select_regexp.search(question_type)
        if select_parse:
            parse_dict = select_parse.groupdict()
            if parse_dict.get("select_command"):
                select_type = aliases.select[parse_dict["select_command"]]
                if select_type == 'select one external' \
                        and 'choice_filter' not in row:
                    warnings.append(row_format_string % row_number +
                                    u" select one external is only meant for"
                                    u" filtered selects.")
                    select_type = aliases.select['select_one']
                list_name = parse_dict["list_name"]
                list_file_name, file_extension = os.path.splitext(list_name)

                if list_name not in choices \
                        and select_type != 'select one external' \
                        and file_extension not in ['.csv', '.xml']:
                    if not choices:
                        raise PyXFormError(
                            u"There should be a choices sheet in this xlsform."
                            u" Please ensure that the choices sheet name is "
                            u"all in small caps and has columns 'list name', "
                            u"'name', and 'label' (or aliased column names).")
                    raise PyXFormError(row_format_string % row_number +
                                       " List name not in choices sheet: " +
                                       list_name)

                # Validate select_multiple choice names by making sure
                # they have no spaces (will cause errors in exports).
                if select_type == constants.SELECT_ALL_THAT_APPLY \
                        and file_extension not in ['.csv', '.xml']:
                    for choice in choices[list_name]:
                        if ' ' in choice[constants.NAME]:
                            raise PyXFormError(
                                "Choice names with spaces cannot be added "
                                "to multiple choice selects. See [" +
                                choice[constants.NAME] + "] in [" +
                                list_name + "]")

                specify_other_question = None
                if parse_dict.get("specify_other") is not None:
                    select_type += u" or specify other"
                    # An alternative that appends an 'other' choice and a
                    # generated "<name>_specify_other" text question was
                    # drafted here but is disabled: it depends on being
                    # able to use choice filters and xpath expressions
                    # that return empty sets. specify_other_question
                    # therefore stays None.

                new_json_dict = row.copy()
                new_json_dict[constants.TYPE] = select_type

                if row.get('choice_filter'):
                    if select_type == 'select one external':
                        new_json_dict['query'] = list_name
                    else:
                        new_json_dict['itemset'] = list_name
                        json_dict['choices'] = choices
                elif file_extension in ['.csv', '.xml']:
                    new_json_dict['itemset'] = list_name
                else:
                    new_json_dict[constants.CHOICES] = choices[list_name]

                # Code to deal with table_list appearance flags
                # (for groups of selects)
                if table_list is not None:
                    # Then this row is the first select in a table list
                    if not isinstance(table_list, basestring):
                        table_list = list_name
                        table_list_header = {
                            constants.TYPE: select_type,
                            # Adding row number for uniqueness
                            constants.NAME:
                                "reserved_name_for_field_list_labels_" +
                                str(row_number),
                            constants.CONTROL: {
                                u"appearance": u"label"
                            },
                            constants.CHOICES: choices[list_name],
                            # Do we care about filtered selects in table lists?
                            # 'itemset' : list_name,
                        }
                        parent_children_array.append(table_list_header)

                    if table_list != list_name:
                        error_message = row_format_string % row_number
                        error_message += " Badly formatted table list," \
                                         " list names don't match: " + \
                                         table_list + " vs. " + list_name
                        raise PyXFormError(error_message)

                    control = new_json_dict[u"control"] = \
                        new_json_dict.get(u"control", {})
                    control[u"appearance"] = "list-nolabel"
                parent_children_array.append(new_json_dict)
                if specify_other_question:
                    parent_children_array.append(specify_other_question)
                continue

        # Try to parse question as osm:
        osm_parse = osm_regexp.search(question_type)
        if osm_parse:
            parse_dict = osm_parse.groupdict()
            new_dict = row.copy()
            new_dict['type'] = constants.OSM

            if parse_dict.get('list_name') is not None:
                tags = osm_tags.get(parse_dict.get('list_name'))
                if tags is None:
                    # Report an unknown list as a form error instead of
                    # failing with a TypeError when iterating None.
                    raise PyXFormError(row_format_string % row_number +
                                       " List name not in osm sheet: " +
                                       parse_dict.get('list_name'))
                for tag in tags:
                    if osm_tags.get(tag.get('name')):
                        tag['choices'] = osm_tags.get(tag.get('name'))
                new_dict['tags'] = tags

            parent_children_array.append(new_dict)
            continue

        # TODO: Consider adding some question_type validation here.

        # Put the row in the json dict as is:
        parent_children_array.append(row)

    if len(stack) != 1:
        raise PyXFormError("Unmatched begin statement: " +
                           str(stack[-1][0]))

    if settings.get('flat', False):
        # print "Generating flattened instance..."
        add_flat_annotations(stack[0][1])

    meta_children = []

    if aliases.yes_no.get(settings.get("omit_instanceID")):
        if settings.get("public_key"):
            raise PyXFormError(
                "Cannot omit instanceID, it is required for encryption.")
    else:
        # Automatically add an instanceID element:
        meta_children.append({
            "name": "instanceID",
            "bind": {
                "readonly": "true()",
                "calculate": settings.get("instance_id",
                                          "concat('uuid:', uuid())"),
            },
            "type": "calculate",
        })

    if 'instance_name' in settings:
        # Automatically add an instanceName element:
        meta_children.append({
            "name": "instanceName",
            "bind": {
                "calculate": settings['instance_name']
            },
            "type": "calculate",
        })

    if len(meta_children) > 0:
        meta_element = \
            {
                "name": "meta",
                "type": "group",
                "control": {
                    "bodyless": True
                },
                "children": meta_children
            }
        noop, survey_children_array = stack[0]
        survey_children_array.append(meta_element)

    # print_pyobj_to_json(json_dict)
    return json_dict
def _replace_prefix(d, prefix):
    """
    Return a copy of the dict ``d`` in which every occurrence of the
    '$PREFIX$' placeholder in string values has been replaced by ``prefix``.

    Nested dicts and lists of dicts are processed recursively. Values of any
    other type are carried over unchanged. ``d`` itself is not modified, so
    the same template can be expanded again with a different prefix.
    """
    result = d.copy()
    for k, v in d.items():
        if isinstance(v, basestring):
            result[k] = v.replace('$PREFIX$', prefix)
        elif isinstance(v, dict):
            result[k] = _replace_prefix(v, prefix)
        elif isinstance(v, list):
            # A real list (not ``map``) so the result is the same on
            # Python 2 and Python 3.
            result[k] = [_replace_prefix(x, prefix) for x in v]
    return result


def workbook_to_json(
        workbook_dict, form_name=None,
        default_language=u"default", warnings=None):
    """
    Convert a spreadsheet-like nested dictionary into the json form spec.

    workbook_dict -- nested dictionaries representing a spreadsheet.
                    should be similar to those returned by xls_to_dict.
                    Sheet names are matched case-insensitively (the keys
                    are lower-cased before use).
    form_name -- The spreadsheet's filename
    default_language -- default_language does two things:
    1. In the xform the default language is the language reverted to when
       there is no translation available for some itext element. Because
       of this every itext element must have a default language
       translation.
    2. In the workbook if media/labels/hints that do not have a language
       suffix will be treated as though their suffix is the default
       language.
       If the default language is used as a suffix for media/labels/hints,
       then the suffixless version will be overwritten.
       A default language given on the settings sheet takes precedence
       over this argument.
    warnings -- an optional list which warnings will be appended to

    returns a nested dictionary equivalent to the format specified in the
    json form spec.

    raises PyXFormError when the survey sheet is empty or lacks a "type"
    column, when a choice has no name, or when a survey row is malformed
    (missing type/name/calculation, invalid name, unknown list name,
    unmatched begin/end statement, badly formed table list, ...).
    """
    # ensure required headers are present
    if warnings is None:
        warnings = []
    is_valid = False
    workbook_dict = {x.lower(): y for x, y in workbook_dict.items()}
    for row in workbook_dict.get(constants.SURVEY, []):
        is_valid = 'type' in [z.lower() for z in row]
        if is_valid:
            break
    if not is_valid:
        raise PyXFormError(
            u"The survey sheet is either empty or missing important "
            u"column headers.")
    row_format_string = '[row : %s]'

    # Make sure the passed in vars are unicode
    form_name = unicode(form_name)
    default_language = unicode(default_language)

    # We check for double columns to determine whether to use them
    # or single colons to delimit grouped headers.
    # Single colons are bad because they conflict with with the xform
    # namespace syntax (i.e. jr:constraintMsg),
    # so we only use them if we have to for backwards compatibility.
    use_double_colons = has_double_colon(workbook_dict)

    # Break the spreadsheet dict into easier to access objects
    # (settings, choices, survey_sheet):
    # ########## Settings sheet ##########
    settings_sheet = dealias_and_group_headers(
        workbook_dict.get(constants.SETTINGS, []),
        aliases.settings_header, use_double_colons)
    settings = settings_sheet[0] if len(settings_sheet) > 0 else {}
    replace_smart_quotes_in_dict(settings)

    default_language = settings.get(
        constants.DEFAULT_LANGUAGE, default_language)

    # add_none_option is a boolean that when true,
    # indicates a none option should automatically be added to selects.
    # It should probably be deprecated but I haven't checked yet.
    if u"add_none_option" in settings:
        settings[u"add_none_option"] = aliases.yes_no.get(
            settings[u"add_none_option"], False)

    # Here we create our json dict root with default settings:
    id_string = settings.get(constants.ID_STRING, form_name)
    sms_keyword = settings.get(constants.SMS_KEYWORD, id_string)
    json_dict = {
        constants.TYPE: constants.SURVEY,
        constants.NAME: form_name,
        constants.TITLE: id_string,
        constants.ID_STRING: id_string,
        constants.SMS_KEYWORD: sms_keyword,
        constants.DEFAULT_LANGUAGE: default_language,
        # By default the version is based on the date and time yyyymmddhh
        # Leaving default version out for now since it might cause
        # problems for formhub.
        # constants.VERSION : datetime.datetime.now().strftime("%Y%m%d%H"),
        constants.CHILDREN: []
    }
    # Here the default settings are overridden by those in the settings
    # sheet
    json_dict.update(settings)

    # ########## Choices sheet ##########
    # Columns and "choices and columns" sheets are deprecated,
    # but we combine them with the choices sheet for
    # backwards-compatibility.
    choices_and_columns_sheet = workbook_dict.get(
        constants.CHOICES_AND_COLUMNS, [])
    choices_and_columns_sheet = dealias_and_group_headers(
        choices_and_columns_sheet, aliases.list_header,
        use_double_colons, default_language)

    columns_sheet = workbook_dict.get(constants.COLUMNS, [])
    columns_sheet = dealias_and_group_headers(
        columns_sheet, aliases.list_header,
        use_double_colons, default_language)

    choices_sheet = workbook_dict.get(constants.CHOICES, [])
    for choice_item in choices_sheet:
        replace_smart_quotes_in_dict(choice_item)
    choices_sheet = dealias_and_group_headers(
        choices_sheet, aliases.list_header, use_double_colons,
        default_language)

    # ########## Cascading Select sheet ###########
    cascading_choices = workbook_dict.get(constants.CASCADING_CHOICES, [])
    if len(cascading_choices):
        if 'choices' in cascading_choices[0]:
            choices_sheet = choices_sheet + cascading_choices[0]['choices']

    combined_lists = group_dictionaries_by_key(
        choices_and_columns_sheet + choices_sheet + columns_sheet,
        constants.LIST_NAME)

    choices = combined_lists

    # Make sure all the options have the required properties:
    warnedabout = set()
    for list_name, options in choices.items():
        for option in options:
            if 'name' not in option:
                info = "[list_name : " + list_name + ']'
                raise PyXFormError("On the choices sheet there is "
                                   "a option with no name. " + info)
            if 'label' not in option:
                info = "[list_name : " + list_name + ']'
                warnings.append(
                    "On the choices sheet there is a option with no label. "
                    + info)
            # chrislrobert's fix for a cryptic error message:
            # see: https://code.google.com/p/opendatakit/issues/detail?id=832&start=200 # noqa
            # Iterate over a snapshot of the keys because entries are
            # deleted from the option inside the loop.
            option_keys = list(option.keys())
            for headername in option_keys:
                # Using warnings and removing the bad columns
                # instead of throwing errors because some forms
                # use choices column headers for notes.
                if ' ' in headername:
                    if headername not in warnedabout:
                        warnedabout.add(headername)
                        warnings.append("On the choices sheet there is " +
                                        "a column (\"" +
                                        headername +
                                        "\") with an illegal header. " +
                                        "Headers cannot include spaces.")
                    del option[headername]
                elif headername == '':
                    warnings.append("On the choices sheet there is a value" +
                                    " in a column with no header.")
                    del option[headername]

    # ########## Survey sheet ###########
    # Defensive only: the header check at the top of this function already
    # raises when the survey sheet is missing.
    if constants.SURVEY not in workbook_dict:
        raise PyXFormError(
            "You must have a sheet named (case-sensitive): " +
            constants.SURVEY)
    survey_sheet = workbook_dict[constants.SURVEY]
    # Process the headers:
    clean_text_values_enabled = aliases.yes_no.get(
        settings.get("clean_text_values", "true()"))
    if clean_text_values_enabled:
        survey_sheet = clean_text_values(survey_sheet)
    survey_sheet = dealias_and_group_headers(
        survey_sheet, aliases.survey_header,
        use_double_colons, default_language)
    survey_sheet = dealias_types(survey_sheet)

    osm_sheet = dealias_and_group_headers(
        workbook_dict.get(constants.OSM, []), aliases.list_header, True)
    osm_tags = group_dictionaries_by_key(osm_sheet, constants.LIST_NAME)
    # #################################

    # Parse the survey sheet while generating a survey in our json format:

    row_number = 1  # We start at 1 because the column header row is not
    #                 included in the survey sheet (presumably).
    # A stack is used to keep track of begin/end expressions.
    # Each entry is a (control type, children list) pair; the bottom entry
    # is the survey root.
    stack = [(None, json_dict.get(constants.CHILDREN))]
    # If a group has a table-list appearance flag
    # this will be set to the name of the list
    table_list = None
    # For efficiency we compile all the regular expressions
    # that will be used to parse types:
    end_control_regex = re.compile(
        r"^(?P<end>end)(\s|_)(?P<type>(" +
        '|'.join(aliases.control.keys()) + r"))$")
    begin_control_regex = re.compile(
        r"^(?P<begin>begin)(\s|_)(?P<type>(" +
        '|'.join(aliases.control.keys()) +
        r"))( (over )?(?P<list_name>\S+))?$")
    select_regexp = re.compile(
        r"^(?P<select_command>(" + '|'.join(aliases.select.keys()) +
        r")) (?P<list_name>\S+)" +
        "( (?P<specify_other>(or specify other|or_other|or other)))?$")
    cascading_regexp = re.compile(
        r"^(?P<cascading_command>(" +
        '|'.join(aliases.cascading.keys()) +
        r")) (?P<cascading_level>\S+)?$")
    # Raw string so that "\S" reaches the regex engine untouched.
    osm_regexp = re.compile(
        r"(?P<osm_command>(" + '|'.join(aliases.osm.keys()) +
        r')) (?P<list_name>\S+)')

    # Rows from the survey sheet that should be nested in meta
    survey_meta = []

    for row in survey_sheet:
        row_number += 1
        prev_control_type, parent_children_array = stack[-1]
        # Disabled should probably be first
        # so the attributes below can be disabled.
        if u"disabled" in row:
            warnings.append(
                row_format_string % row_number +
                " The 'disabled' column header is not part of the current" +
                " spec. We recommend using relevant instead.")
            disabled = row.pop(u"disabled")
            if aliases.yes_no.get(disabled):
                continue

        # skip empty rows
        if len(row) == 0:
            continue

        # Get question type
        question_type = row.get(constants.TYPE)
        if not question_type:
            # if name and label are also missing,
            # then its a comment row, and we skip it with warning
            if not ((constants.NAME in row) or (constants.LABEL in row)):
                warnings.append(
                    row_format_string % row_number +
                    " Row without name, text, or label is being skipped:\n" +
                    str(row))
                continue
            raise PyXFormError(
                row_format_string % row_number +
                " Question with no type.\n" + str(row))

        # Pull out questions that will go in meta block
        if question_type == 'audit':
            # Force audit name to always be "audit" to follow XForms spec
            if 'name' in row and row['name'] not in [None, '', 'audit']:
                raise PyXFormError(
                    row_format_string % row_number +
                    " Audits must always be named 'audit.'" +
                    " The name column should be left blank.")

            row['name'] = 'audit'
            survey_meta.append(row)
            continue

        if question_type == 'calculate':
            calculation = row.get('bind', {}).get('calculate')
            if not calculation:
                raise PyXFormError(
                    row_format_string % row_number + " Missing calculation.")

        # Check if the question is actually a setting specified
        # on the survey sheet
        settings_type = aliases.settings_header.get(question_type)
        if settings_type:
            json_dict[settings_type] = unicode(row.get(constants.NAME))
            continue

        # Try to parse question as a end control statement
        # (i.e. end loop/repeat/group):
        end_control_parse = end_control_regex.search(question_type)
        if end_control_parse:
            parse_dict = end_control_parse.groupdict()
            if parse_dict.get("end") and "type" in parse_dict:
                control_type = aliases.control[parse_dict["type"]]
                if prev_control_type != control_type or len(stack) == 1:
                    raise PyXFormError(
                        row_format_string % row_number +
                        " Unmatched end statement. Previous control type: " +
                        str(prev_control_type) +
                        ", Control type: " + str(control_type))
                stack.pop()
                table_list = None
                continue

        # Make sure the row has a valid name
        if constants.NAME not in row:
            if row['type'] == 'note':
                # autogenerate names for notes without them
                row['name'] = "generated_note_name_" + str(row_number)
            # elif 'group' in row['type'].lower():
            #     # autogenerate names for groups without them
            #     row['name'] = "generated_group_name_" + str(row_number)
            else:
                raise PyXFormError(row_format_string % row_number +
                                   " Question or group with no name.")
        question_name = unicode(row[constants.NAME])
        if not is_valid_xml_tag(question_name):
            error_message = row_format_string % row_number
            error_message += " Invalid question name [" + \
                             question_name.encode('utf-8') + "] "
            error_message += "Names must begin with a letter, colon,"\
                             + " or underscore."
            error_message += "Subsequent characters can include numbers," \
                             + " dashes, and periods."
            raise PyXFormError(error_message)

        if constants.LABEL not in row and \
                row.get(constants.MEDIA) is None and \
                question_type not in aliases.label_optional_types:
            # TODO: Should there be a default label?
            #       Not sure if we should throw warnings for groups...
            #       Warnings can be ignored so I'm not too concerned
            #       about false positives.
            warnings.append(
                row_format_string % row_number +
                " Question has no label: " + str(row))

        # Try to parse question as begin control statement
        # (i.e. begin loop/repeat/group):
        begin_control_parse = begin_control_regex.search(question_type)
        if begin_control_parse:
            parse_dict = begin_control_parse.groupdict()
            if parse_dict.get("begin") and "type" in parse_dict:
                # Create a new json dict with children, and the proper type,
                # and add it to parent_children_array in place of a question.
                # parent_children_array will then be set to its children
                # array (so following questions are nested under it)
                # until an end command is encountered.
                control_type = aliases.control[parse_dict["type"]]
                new_json_dict = row.copy()
                new_json_dict[constants.TYPE] = control_type
                child_list = list()
                new_json_dict[constants.CHILDREN] = child_list
                # Compare by value: identity of equal strings is not
                # guaranteed.
                if control_type == constants.LOOP:
                    if not parse_dict.get("list_name"):
                        # TODO: Perhaps warn and make repeat into a group?
                        raise PyXFormError(
                            row_format_string % row_number +
                            " Repeat loop without list name.")
                    list_name = parse_dict["list_name"]
                    if list_name not in choices:
                        raise PyXFormError(
                            row_format_string % row_number +
                            " List name not in columns sheet: " + list_name)
                    new_json_dict[constants.COLUMNS] = choices[list_name]

                # Generate a new node for the jr:count column so
                # xpath expressions can be used.
                repeat_count_expression = new_json_dict.get(
                    'control', {}).get('jr:count')
                if repeat_count_expression:
                    generated_node_name = new_json_dict['name'] + "_count"
                    parent_children_array.append({
                        "name": generated_node_name,
                        "bind": {
                            "readonly": "true()",
                            "calculate": repeat_count_expression,
                        },
                        "type": "calculate",
                    })
                    new_json_dict['control']['jr:count'] = \
                        "${" + generated_node_name + "}"

                # Code to deal with table_list appearance flags
                # (for groups of selects)
                ctrl_ap = new_json_dict.get(u"control", {}).get(u"appearance")
                if ctrl_ap == constants.TABLE_LIST:
                    # True marks "table list opened, list name not yet
                    # known"; the first select in the group replaces it
                    # with its list name.
                    table_list = True
                    new_json_dict[u"control"][u"appearance"] = u"field-list"
                    # Generate a note label element so hints and labels
                    # work as expected in table-lists.
                    # see https://github.com/modilabs/pyxform/issues/62
                    if 'label' in new_json_dict or 'hint' in new_json_dict:
                        generated_label_element = {
                            "type": "note",
                            "name":
                                "generated_table_list_label_" +
                                str(row_number)
                        }
                        if 'label' in new_json_dict:
                            generated_label_element[constants.LABEL] = \
                                new_json_dict[constants.LABEL]
                            del new_json_dict[constants.LABEL]
                        if 'hint' in new_json_dict:
                            generated_label_element['hint'] = \
                                new_json_dict['hint']
                            del new_json_dict['hint']
                        child_list.append(generated_label_element)

                if 'intent' in new_json_dict:
                    new_json_dict['control'] = \
                        new_json_dict.get(u"control", {})
                    new_json_dict['control']['intent'] = \
                        new_json_dict['intent']

                parent_children_array.append(new_json_dict)
                stack.append((control_type, child_list))
                continue

        # try to parse as a cascading select
        cascading_parse = cascading_regexp.search(question_type)
        if cascading_parse:
            parse_dict = cascading_parse.groupdict()
            if parse_dict.get("cascading_command"):
                # May be None: the level group of the regex is optional.
                cascading_level = parse_dict["cascading_level"]
                cascading_prefix = row.get(constants.NAME)
                if not cascading_prefix:
                    raise PyXFormError(
                        row_format_string % row_number +
                        " Cascading select needs a name.")
                # cascading_json = get_cascading_json(
                # cascading_choices, cascading_prefix, cascading_level)
                if len(cascading_choices) <= 0 or \
                        'questions' not in cascading_choices[0]:
                    # %-formatting so a missing (None) level cannot turn
                    # this into a TypeError.
                    raise PyXFormError(
                        "Found a cascading_select %s, but could not find %s"
                        " in cascades sheet." %
                        (cascading_level, cascading_level))
                cascading_json = cascading_choices[0]['questions']
                json_dict['choices'] = choices
                include_bindings = 'bind' in row
                for cq in cascading_json:
                    # Work on a copy so the question templates keep their
                    # '$PREFIX$' placeholders for any later cascading
                    # select row.
                    question = cq.copy()
                    # include bindings
                    if include_bindings:
                        question['bind'] = row['bind']
                    parent_children_array.append(
                        _replace_prefix(question, cascading_prefix))
                continue  # so the row isn't put in as is

        # Try to parse question as a select:
        select_parse = select_regexp.search(question_type)
        if select_parse:
            parse_dict = select_parse.groupdict()
            if parse_dict.get("select_command"):
                select_type = aliases.select[parse_dict["select_command"]]
                if select_type == 'select one external' \
                        and 'choice_filter' not in row:
                    warnings.append(
                        row_format_string % row_number +
                        u" select one external is only meant for"
                        u" filtered selects.")
                    select_type = aliases.select['select_one']
                list_name = parse_dict["list_name"]
                file_extension = os.path.splitext(list_name)[1]

                if list_name not in choices \
                        and select_type != 'select one external' \
                        and file_extension not in ['.csv', '.xml']:
                    if not choices:
                        raise PyXFormError(
                            u"There should be a choices sheet in this xlsform."
                            u" Please ensure that the choices sheet name is "
                            u"all in small caps and has columns 'list name', "
                            u"'name', and 'label' (or aliased column names).")
                    raise PyXFormError(
                        row_format_string % row_number +
                        " List name not in choices sheet: " + list_name)

                # Validate select_multiple choice names by making sure
                # they have no spaces (will cause errors in exports).
                if select_type == constants.SELECT_ALL_THAT_APPLY \
                        and file_extension not in ['.csv', '.xml']:
                    for choice in choices[list_name]:
                        if ' ' in choice[constants.NAME]:
                            raise PyXFormError(
                                "Choice names with spaces cannot be added "
                                "to multiple choice selects. See [" +
                                choice[constants.NAME] + "] in [" +
                                list_name + "]")

                specify_other_question = None
                if parse_dict.get("specify_other") is not None:
                    select_type += u" or specify other"
                    # With this code we no longer need to handle or_other
                    # questions in survey builder.
                    # However, it depends on being able to use choice
                    # filters and xpath expressions that return empty sets.
                    # choices[list_name].append(
                    # {
                    #     'name': 'other',
                    #     'label': {default_language : 'Other'},
                    #     'orOther': 'true',
                    # })
                    # or_other_xpath = 'isNull(orOther)'
                    # if 'choice_filter' in row:
                    #   row['choice_filter'] += ' or ' + or_other_xpath
                    # else:
                    #   row['choice_filter'] = or_other_xpath

                    # specify_other_question = \
                    # {
                    #       'type':'text',
                    #       'name': row['name'] + '_specify_other',
                    #       'label':
                    #        'Specify Other for:\n"' + row['label'] + '"',
                    #       'bind' : {'relevant':
                    #                "selected(../%s, 'other')" % row['name']},
                    #     }

                new_json_dict = row.copy()
                new_json_dict[constants.TYPE] = select_type

                if row.get('choice_filter'):
                    if select_type == 'select one external':
                        new_json_dict['query'] = list_name
                    else:
                        new_json_dict['itemset'] = list_name
                        json_dict['choices'] = choices
                elif file_extension in ['.csv', '.xml']:
                    new_json_dict['itemset'] = list_name
                else:
                    new_json_dict[constants.CHOICES] = choices[list_name]

                # Code to deal with table_list appearance flags
                # (for groups of selects)
                if table_list is not None:
                    # Then this row is the first select in a table list
                    if not isinstance(table_list, basestring):
                        table_list = list_name
                        table_list_header = {
                            constants.TYPE: select_type,
                            constants.NAME:
                                "reserved_name_for_field_list_labels_" +
                                str(row_number),
                            # Adding row number for uniqueness # noqa
                            constants.CONTROL: {u"appearance": u"label"},
                            constants.CHOICES: choices[list_name],
                            # Do we care about filtered selects in table
                            # lists?
                            # 'itemset' : list_name,
                        }
                        parent_children_array.append(table_list_header)

                    if table_list != list_name:
                        error_message = row_format_string % row_number
                        error_message += " Badly formatted table list," \
                                         " list names don't match: " + \
                                         table_list + " vs. " + list_name
                        raise PyXFormError(error_message)

                    control = new_json_dict[u"control"] = \
                        new_json_dict.get(u"control", {})
                    control[u"appearance"] = "list-nolabel"
                parent_children_array.append(new_json_dict)
                if specify_other_question:
                    parent_children_array.append(specify_other_question)
                continue

        # Try to parse question as osm:
        osm_parse = osm_regexp.search(question_type)
        if osm_parse:
            parse_dict = osm_parse.groupdict()
            new_dict = row.copy()
            new_dict['type'] = constants.OSM

            osm_list_name = parse_dict.get('list_name')
            if osm_list_name is not None:
                tags = osm_tags.get(osm_list_name)
                if tags is None:
                    # Report an unknown list as a form error instead of
                    # failing with a TypeError on the loop below.
                    raise PyXFormError(
                        row_format_string % row_number +
                        " List name not in the %s sheet: %s" %
                        (constants.OSM, osm_list_name))
                for tag in tags:
                    if osm_tags.get(tag.get('name')):
                        tag['choices'] = osm_tags.get(tag.get('name'))
                new_dict['tags'] = tags

            parent_children_array.append(new_dict)

            continue

        # range question_type
        if question_type == 'range':
            new_dict = process_range_question_type(row)
            parent_children_array.append(new_dict)
            continue

        # TODO: Consider adding some question_type validation here.

        # Put the row in the json dict as is:
        parent_children_array.append(row)

    if len(stack) != 1:
        raise PyXFormError("Unmatched begin statement: " + str(stack[-1][0]))

    if settings.get('flat', False):
        # print "Generating flattened instance..."
        add_flat_annotations(stack[0][1])

    # Copy so that survey_meta itself is left untouched.
    meta_children = [] + survey_meta

    if aliases.yes_no.get(settings.get("omit_instanceID")):
        if settings.get("public_key"):
            raise PyXFormError(
                "Cannot omit instanceID, it is required for encryption.")
    else:
        # Automatically add an instanceID element:
        meta_children.append({
            "name": "instanceID",
            "bind": {
                "readonly": "true()",
                "calculate": settings.get(
                    "instance_id", "concat('uuid:', uuid())"),
            },
            "type": "calculate",
        })

    if 'instance_name' in settings:
        # Automatically add an instanceName element:
        meta_children.append({
            "name": "instanceName",
            "bind": {
                "calculate": settings['instance_name']
            },
            "type": "calculate",
        })

    if len(meta_children) > 0:
        meta_element = \
            {
                "name": "meta",
                "type": "group",
                "control": {
                    "bodyless": True
                },
                "children": meta_children
            }
        survey_children_array = stack[0][1]
        survey_children_array.append(meta_element)

    # print_pyobj_to_json(json_dict)

    return json_dict