def __init__(self, content, name, resource_dir):
    """
    Initialise the spec parser for a list of parsed question objects.

    content      -- question objects to process; must not be None
    name         -- name stored on the instance
    resource_dir -- directory handed to load_resources()

    Raises Exception when content is None.
    """
    self.log = logging.getLogger()
    if content is None:
        raise Exception("SpecParser cannot be initialized, content is None")

    self.content, self.name = content, name

    # resource_list is reset first, then populated from resource_dir
    self.resource_list = None
    self.load_resources(resource_dir)

    self.answer_code_spec = AnswerCodeSpec()

    # error bookkeeping: nothing has gone wrong yet
    self.errors, self.is_error = None, False

    # matches image placeholders such as <img_3.MD.1_003>
    self.img_content_re = re.compile(r"(<img_[A-Za-z0-9._]+>)")
def __init__(self, target_file, path, serialize_raw=False):
    """
    Initialise parser state and deflate target_file into path.

    target_file   -- file passed to deflate()
    path          -- directory passed to deflate(); stored as _path
    serialize_raw -- flag stored on the instance
    """
    self.log = logging.getLogger()

    # input locations
    self.target_file, self._path = target_file, path
    self.CONTENT_FILE = 'content.xml'

    # parse state
    self.position = 0
    self.content = []
    self.styles = {}
    self._style_element = None
    self._text_element = None

    # error bookkeeping
    self.errors, self.is_error = None, False

    self.answer_code_spec = AnswerCodeSpec()
    self.serialize_raw = serialize_raw

    # unpack the document so its content.xml can be read later
    deflate(self.target_file, self._path)
class ODTParser:
    """
    Parse an ODT document into a list of question dictionaries.

    - Deflates target_file on the given path.
    - Reads content.xml and loads a flat list of data objects. Each object
      contains text and style info in the following format:
          {'text': str,
           'style': style attributes/values (or None),
           'position': text position,
           'is_mathml': True}      # key only present for MathML entries
    - Slices the flat data list into sublists based on the qstart/qend
      markers, i.e. a sublist contains the data for one question.
    - Converts each question list into a question object in which each
      answer code section is separated. For example:
          {'qstart': [data objects],
           'anstype': [data objects],
           'prompt': [data objects],
           ...}

    For now a question object is a plain dict.
    """

    def __init__(self, target_file, path, serialize_raw=False):
        """
        Store the parser configuration and deflate target_file into path.

        target_file   -- file passed to deflate()
        path          -- directory the file is deflated into
        serialize_raw -- stored on the instance; the raw-text dump that
                         used it is currently disabled
        """
        self.log = logging.getLogger()
        self.target_file = target_file
        self._path = path
        self.position = 0
        self.content = []
        self.styles = {}
        self._style_element = None
        self._text_element = None
        self.errors = None
        self.is_error = False
        self.CONTENT_FILE = 'content.xml'
        self.answer_code_spec = AnswerCodeSpec()
        self.serialize_raw = serialize_raw
        deflate(self.target_file, self._path)

    def parse(self):
        """
        Run the full pipeline: XML -> flat data list -> question dicts.

        Returns a (content, errors) tuple. When a stage fails, content is
        whatever had been produced up to that stage and errors is the list
        of collected messages; errors is None when nothing failed.
        """
        self.parse_xml()
        if self.is_error:
            return self.content, self.errors

        # make a list of questions (each one a list of data objects)
        raw_list = self.slice_dice()
        if self.is_error:
            self.log.error("Slicing data into [] of questions FAILED")
            return self.content, self.errors
        self.log.debug("Slicing data into [] of questions. Done. Questions found %s."
                       % len(raw_list))

        self.content = self.convert_to_dict(raw_list)
        if self.is_error:
            self.log.error("Converting question [] to object FAILED")
        return self.content, self.errors

    def parse_xml(self):
        """
        Walk the document body and fill self.content with data objects.

        Returns self.content.
        """
        self.load_components()
        for child in self._text_element:
            is_simple = self.is_p_tag(child) or self.is_list_tag(child)
            if is_simple and self._is_leaf(child):
                self.extract_leaf(child)
            else:
                self.dig(child)
        return self.content

    def slice_dice(self):
        """
        [] -> [][]
        Make question slices based on the question boundaries.

        Every slice starts with the 'qstart:' data object and ends with the
        'qend:' data object. Entries without text that sit outside a
        question are ignored. An error is recorded (and an incomplete
        result returned) when a question is not closed before the next one
        starts or before the data ends, or when text appears outside a
        question.
        """
        questions = []
        current = None
        for data in self.content:
            text = data.get('text')
            if text is None and current is None:
                # ignoring blank lines outside question boundaries
                continue
            if self.is_qstart(text):
                if current is not None:
                    msg = "Question End 'qend' not found"
                    self.log.error(msg)
                    self.add_error(msg)
                    break
                current = [data]
                continue
            if text is not None and current is None:
                msg = "Question Start 'qstart' not found"
                self.log.error(msg)
                self.add_error(msg)
                break
            current.append(data)
            if self.is_qend(text):
                questions.append(current)
                current = None
        else:
            # Data ran out while a question was still open: report it the
            # same way as a missing 'qend' in the middle of the document
            # instead of silently dropping the last question.
            if current is not None:
                msg = "Question End 'qend' not found"
                self.log.error(msg)
                self.add_error(msg)
        return questions

    def convert_to_dict(self, qlist):
        """
        Turn each question slice into a dict keyed by answer code.

        A code's values are stored when the NEXT code line is met, so the
        final code of a slice (normally 'qend') is never stored itself.
        Keys are the code text stripped, with trailing ':' removed, and
        lowered.
        """
        questions = []
        for que_list in qlist:
            question = {}
            code = None
            values = []
            for line_obj in que_list:
                text = line_obj.get('text')
                if not self.answer_code_spec.is_code_like(text):
                    values.append(line_obj)
                    continue
                if code is not None:
                    # codes are ':' stripped and lowered
                    question[code.strip().rstrip(':').lower()] = values
                    values = []
                code = text
            questions.append(question)
        return questions

    def load_components(self):
        """
        Parse content.xml and remember the automatic-styles element and the
        first child of the body element.
        """
        tree = etree.parse(os.path.join(self._path, self.CONTENT_FILE))
        for element in tree.getroot():
            if 'automatic-styles' in element.tag:
                self._style_element = element
            elif 'body' in element.tag:
                self._text_element = element[0]

    def dig(self, elem):
        """
        Recursively descend into a non-leaf element and extract its leaves.

        Lists are delegated to process_list(); paragraphs and spans are
        walked node by node so mixed text/element content keeps its order;
        frames are treated as references to an embedded object whose
        content.xml is stored as MathML.
        """
        if self.is_list_tag(elem):
            self.process_list(elem)
        elif self.is_p_tag(elem) or self.is_span_tag(elem):
            if self._is_leaf(elem):
                self.extract_leaf(elem)
                return
            for child in elem.xpath("./node()"):
                # TODO: confirm unicode transformation
                if isinstance(child, etree._ElementStringResult):
                    self.extract_leaf(elem, text=str(child))
                elif isinstance(child, etree._ElementUnicodeResult):
                    self.extract_leaf(elem, text=repr(child))
                elif isinstance(child, etree._Element):
                    if self._is_leaf(child):
                        self.extract_leaf(child)
                    else:
                        self.dig(child)
        elif self.is_equation_tag(elem):
            obj_ref = elem[0].get("{http://www.w3.org/1999/xlink}href")
            if not obj_ref:
                return
            obj_path = self._path + '/' + obj_ref + '/' + 'content.xml'
            if os.path.exists(obj_path):
                # 'with' guarantees the handle is closed even if read() fails
                with open(obj_path) as ref_file:
                    mathml = ref_file.read()
                self.extract_leaf(None, mathml=mathml)
            else:
                msg = "Invalid Object referenced '%s'" % obj_ref
                self.log.error(msg)
                self.add_error(msg)

    def extract_leaf(self, element, text=None, mathml=None):
        """
        Append one data object to self.content and advance the position.

        element -- source element; may be None when mathml is given
        text    -- explicit text; when empty the element's own text is used
                   (paragraph text is wrapped in newlines)
        mathml  -- MathML markup; when given, a MathML entry is stored and
                   element/text are ignored

        Soft page breaks are skipped without consuming a position.
        """
        if mathml is None and element.tag.endswith('soft-page-break'):
            return
        self.position += 1

        # 'is not None' (rather than truthiness) so that an empty embedded
        # object does not fall through to the element branch with
        # element=None and crash there.
        if mathml is not None:
            self.content.append({'text': mathml,
                                 'style': None,
                                 'position': self.position,
                                 'is_mathml': True})
            return

        if not text:
            if self.is_p_tag(element):
                if element.text is not None:
                    text = '\n' + element.text + '\n'
            else:
                text = element.text
        self.content.append({'text': text,
                             'style': self.get_style(element),
                             'position': self.position})

    def process_list(self, element):
        """Extract every child of the list's first item."""
        # todo Test variations in multiple documents
        for child in element[0]:
            if self._is_leaf(child):
                self.extract_leaf(child)
            else:
                self.dig(child)

    def get_style(self, element):
        """
        Return the style attributes for the element's text style-name.

        Looks the name up in the self.styles cache first; otherwise collects
        all attributes of the matching automatic style's text-properties and
        paragraph-properties elements, adds 'style_name', caches and returns
        the result. Returns None (and logs at info level unless the name is
        None or 'Standard') when no style attributes are found.
        """
        style_name = element.get(
            "{urn:oasis:names:tc:opendocument:xmlns:text:1.0}style-name")
        cached = self.styles.get(style_name)
        if cached:
            return cached

        name_attr = "{urn:oasis:names:tc:opendocument:xmlns:style:1.0}name"
        # bold/italic etc. live in the fo:font-weight / fo:font-style
        # attributes of these property elements; all attributes are loaded
        target_property_tag_names = ['text-properties', 'paragraph-properties']
        for child in self._style_element:
            if child.get(name_attr) != style_name:
                continue
            style_info = {}
            for tag_name in target_property_tag_names:
                prop_elem = self._get_child(child, tag_name)
                if prop_elem is not None:
                    for attrib_name in prop_elem.keys():
                        style_info[attrib_name] = prop_elem.get(attrib_name)
            if style_info:
                style_info['style_name'] = style_name
                self.styles[style_name] = style_info

        if self.styles.get(style_name) is not None:
            return self.styles.get(style_name)
        elif style_name not in [None, 'Standard']:
            self.log.info('No style found with name "%s"' % style_name)
        return None

    def _get_child(self, elem, name):
        """Return the first element from elem.iter() whose tag ends with name, else None."""
        for sub in elem.iter():
            if sub.tag.endswith(name):
                return sub
        return None

    def _is_leaf(self, element):
        """Return True when the element has no child elements."""
        return len(element) == 0

    def is_p_tag(self, element):
        """Return True when the element's tag ends with '}p'."""
        return element.tag.endswith('}p')

    def is_span_tag(self, element):
        """Return True when the element's tag ends with '}span'."""
        return element.tag.endswith('}span')

    def is_list_tag(self, element):
        """Return True when the element's tag ends with '}list'."""
        return element.tag.endswith('}list')

    def is_equation_tag(self, element):
        """Return True when the element's tag ends with '}frame'."""
        return element.tag.endswith('}frame')

    def add_error(self, msg):
        """Record msg in self.errors and flag the parser as failed."""
        if self.errors is None:
            self.errors = []
        self.is_error = True
        self.errors.append(msg)

    def is_qstart(self, text):
        """Return True when text is the 'qstart:' marker (case/whitespace insensitive)."""
        return bool(text) and text.strip().lower() == 'qstart:'

    def is_qend(self, text):
        """Return True when text is the 'qend:' marker (case/whitespace insensitive)."""
        return bool(text) and text.strip().lower() == 'qend:'
class SpecParser:
    """
    Accepts a list of question objects parsed by ODTParser in content,
    flattens plain-text codes, and validates every question against the
    answer code spec.
    """

    IMG_PREFIX = "<img_"
    IMG_SUFFIX = ">"

    def __init__(self, content, name, resource_dir):
        """
        content      -- list of question dicts; must not be None
        name         -- name stored on the instance
        resource_dir -- directory whose file names are loaded as the
                        available image resources

        Raises Exception when content is None.
        """
        self.log = logging.getLogger()
        if content is None:
            raise Exception("SpecParser cannot be initialized, content is None")
        self.content = content
        self.name = name
        self.resource_list = None
        self.load_resources(resource_dir)
        self.answer_code_spec = AnswerCodeSpec()
        self.errors = None
        self.is_error = False
        self.img_content_re = re.compile(r"(<img_[A-Za-z0-9._]+>)")

    def process(self):
        """
        Merge/clean the questions, then validate them.

        Returns a (content, errors) tuple; content is None when merging
        failed, errors is None when nothing went wrong.
        """
        self.content = self.merge_clean(self.content)
        if self.is_error:
            return self.content, self.errors
        if not self.validate():
            self.log.info("Validation Failed")
        return self.content, self.errors

    def validate(self):
        """Validate every question (no short-circuit); True when all pass."""
        # todo Apply spec validation
        validated = True
        for index, question in enumerate(self.content):
            if not self.validate_question(question, index + 1):
                validated = False
        return validated

    def validate_question(self, question, queno):
        """
        Validate one question against the spec of its answer type.

        Checks that every code required by the spec is present, that
        restricted plain-text codes hold an allowed value, and that image
        placeholders in rich codes refer to available resources. Each
        present code's data is replaced by its cleaned form.

        Returns True when the question is valid. When no spec is defined
        for the answer type a warning is logged and the question is
        accepted without further checks.
        """
        anstype = self.get_question_anstype(question)
        anstype_spec = self.answer_code_spec.get_anstype_spec(anstype)
        if not anstype_spec:
            # Nothing to validate against; continuing would dereference
            # the missing spec.
            self.log.warning("Answer type spec not defined for '%s'" % anstype)
            return True

        is_valid = True
        for code in anstype_spec.keys():
            # Question must contain empty or non empty code
            if code not in question:
                msg = "Q#[%s]: Incomlete question. Code '%s' is required. Codes found are %s" % \
                      (queno, code, question.keys())
                self.log.error(msg)
                self.add_error(msg)
                is_valid = False
                continue

            code_data = question.get(code)
            code_spec = anstype_spec.get(code)
            if code_spec.get('plain_text', False) and code_spec.get('values', []):
                values = code_spec.get('values')
                if not self.validate_value(code_data, values):
                    msg = "Q#[%s]: Invalid value '%s'. Code '%s' can only have values %s" % \
                          (queno, code_data[0].get('text'), code, values)
                    self.log.error(msg)
                    self.add_error(msg)
                    is_valid = False
            if code_spec.get('plain_text') is None:
                if self.validate_for_images(code_data, queno) is False:
                    is_valid = False
            question[code] = self.clean(code_data)
        return is_valid

    def validate_value(self, data, values):
        """Return True when the stripped text of the first data object is in values."""
        return data[0].get('text').strip() in values

    def merge_clean(self, questions):
        """
        Flatten plain-text codes and normalise/validate the answer type.

        Returns the (mutated) questions list, or None after recording an
        error when a question lacks 'anstype', uses a code unknown to the
        default spec, or has an invalid answer type value.
        """
        default_spec = self.answer_code_spec.get_default_spec()
        for queno, question in enumerate(questions, 1):
            if 'anstype' not in question:
                msg = "Q#[%s]: Mandatory code 'anstype' not found" % queno
                self.log.error(msg)
                self.add_error(msg)
                return None

            for code in list(question.keys()):
                qdata = question[code]
                # An unknown code must end up as the "Invalid Code" error
                # below, not as an uncaught KeyError.
                try:
                    spec = default_spec[code]
                except KeyError:
                    spec = None
                if spec is None:
                    msg = "Q#[%s]: Invalid Code %s" % (queno, code)
                    self.log.error(msg)
                    self.add_error(msg)
                    return None
                if spec.get('plain_text', False):
                    # single object instead of a list
                    question[code] = self._merge_as_plain(qdata)

            # anstype value should be flattened by now
            anstype = question.get('anstype')[0].get('text')
            if anstype:
                anstype = anstype.strip()
                question.get('anstype')[0]['text'] = anstype
            if not self.is_valid_anstype(anstype):
                msg = "Q#[%s]: Invalid answer type value '%s'. Valid values are %s" % \
                      (queno, anstype, AnswerCodeSpec.ANSWER_TYPES)
                self.log.error(msg)
                self.add_error(msg)
                return None
        return questions

    def clean(self, code_data):
        """
        Drop data objects whose text is missing or blank and remove the
        'style' key from the ones that are kept (objects are mutated).
        """
        squeezed = []
        for data in code_data:
            text = data.get('text')
            if text and text.strip():
                data.pop('style', None)
                squeezed.append(data)
        return squeezed

    def validate_for_images(self, text, queno):
        """
        Check that every image placeholder in the data objects exists.

        Placeholders map to png files, e.g.
        <img_3.MD.1_003> = 3.MD.1_003.png, <img_3.MD.1_002a> = 3.MD.1_002a.png
        Records one error per missing image; returns False when any is
        missing, True otherwise.
        """
        is_valid = True
        for line in text:
            if not line.get('text'):
                continue
            for img in self.img_content_re.findall(line.get('text')):
                if self.ref_object_exist(img) is False:
                    is_valid = False
                    msg = "Q#[%s]: Image not available '%s'" % (queno, img)
                    self.log.error(msg)
                    self.add_error(msg)
        return is_valid

    def is_valid_anstype(self, anstype):
        """Return True when anstype is one of AnswerCodeSpec.ANSWER_TYPES."""
        return anstype in AnswerCodeSpec.ANSWER_TYPES

    def ref_object_exist(self, name):
        """
        Return True when the placeholder's png file is in the resource list.

        name -- placeholder such as '<img_3.MD.1_003>'. Returns False when
        no resources were loaded.
        """
        if self.resource_list is None:
            return False
        # Remove the literal wrapper. str.lstrip("<img_") would strip any
        # run of the characters '<', 'i', 'm', 'g', '_' and so eat the
        # start of names such as 'img001' or 'geometry_1'.
        if name.startswith(self.IMG_PREFIX):
            name = name[len(self.IMG_PREFIX):]
        if name.endswith(self.IMG_SUFFIX):
            name = name[:-len(self.IMG_SUFFIX)]
        return name + ".png" in self.resource_list

    def get_question_anstype(self, question):
        """Return the text of the first 'anstype' data object of the question."""
        return question.get('anstype')[0].get('text')

    def load_resources(self, dir):
        """
        Add the names of all entries in dir to self.resource_list.

        Logs a warning and leaves the list untouched when dir is empty or
        is not an existing directory. (The parameter shadows the builtin
        'dir'; the name is kept for keyword-argument callers.)
        """
        if not dir or not os.path.isdir(dir):
            self.log.warning("Image dir not found")
            return
        if self.resource_list is None:
            self.resource_list = []
        self.resource_list.extend(os.listdir(dir))

    def _merge_as_plain(self, data):
        """
        Concatenate the text of all data objects into a single-element
        list; objects without text contribute a newline.
        """
        parts = []
        for obj in data:
            t = obj.get('text')
            parts.append('\n' if t is None else t)
        return [{'text': ''.join(parts)}]

    def add_error(self, msg):
        """Record msg in self.errors and flag the parser as failed."""
        if self.errors is None:
            self.errors = []
        self.is_error = True
        self.errors.append(msg)