Ejemplo n.º 1
0
class ODTParser:
    """Parse the content.xml of an ODT document into question objects.

    Processing steps:

    - ``deflate(target_file, path)`` is called on construction (presumably
      unpacking the document into ``path`` -- the helper is defined elsewhere).
    - ``content.xml`` is read and flattened into a list of data objects. Each
      object holds text and style info in the following format::

          {'text': str or None,
           'style': dict of style attributes/values (or None),
           'position': running 1-based index of the fragment,
           'is_mathml': True}   # key only present for MathML fragments

    - The flat list is sliced into sub-lists on ``qstart:``/``qend:`` markers,
      i.e. one sub-list holds the data objects of one question.
    - Each question sub-list is converted into a question object in which the
      data objects are grouped by answer code, for example::

          {'qstart': [data objects],
           'anstype': [data objects],
           'prompt': [data objects],
           ...
          }

      For now a question object is a plain dict.

    Problems are collected in ``self.errors`` (``self.is_error`` is set)
    instead of being raised.
    """

    def __init__(self, target_file, path, serialize_raw=False):
        """Store the configuration and deflate ``target_file`` into ``path``.

        :param target_file: document handed to ``deflate``.
        :param path: directory in which ``content.xml`` is looked up.
        :param serialize_raw: stored on the instance; no method of this class
            currently acts on it.
        """
        self.log = logging.getLogger()
        self.target_file = target_file
        self._path = path
        self.position = 0
        self.content = []
        self.styles = {}
        self._style_element = None
        self._text_element = None
        self.errors = None
        self.is_error = False
        self.CONTENT_FILE = 'content.xml'
        self.answer_code_spec = AnswerCodeSpec()
        self.serialize_raw = serialize_raw
        deflate(self.target_file, self._path)

    def parse(self):
        """Run the whole pipeline.

        :returns: ``(content, errors)``. On success ``content`` is the list of
            question dicts and ``errors`` is ``None``. If XML loading or
            slicing fails, ``content`` is still the flat list of data objects
            and ``errors`` is the list of messages.
        """
        self.parse_xml()
        if self.is_error:
            return self.content, self.errors

        # Make a list of questions (each one a list of data objects).
        raw_list = self.slice_dice()
        if self.is_error:
            self.log.error("Slicing data into [] of questions FAILED")
            return self.content, self.errors
        self.log.debug("Slicing data into [] of questions. Done. Questions found %s." % len(raw_list))

        self.content = self.convert_to_dict(raw_list)
        if self.is_error:
            self.log.error("Converting question [] to object FAILED")
        return self.content, self.errors

    def parse_xml(self):
        """Load content.xml and flatten the document body into ``self.content``.

        :returns: ``self.content`` (the flat list of data objects).
        """
        self.load_components()
        if self._text_element is None:
            # Without this guard the loop below would fail with a TypeError;
            # recording the problem lets parse() stop via is_error instead.
            self._report_error("No document body found in '%s'" % self.CONTENT_FILE)
            return self.content

        for child in self._text_element:
            if (self.is_p_tag(child) or self.is_list_tag(child)) and self._is_leaf(child):
                self.extract_leaf(child)
            else:
                self.dig(child)
        return self.content

    def slice_dice(self):
        """
        [] -> [][] - Make question slices based on question boundaries.

        Every slice starts with the ``qstart:`` data object and ends with the
        ``qend:`` data object. Blank entries (``text is None``) outside a
        question are ignored. A boundary violation is recorded through
        ``add_error`` and stops the scan; the questions completed so far are
        returned.
        """
        questions = []
        sub_list = None
        for data in self.content:
            text = data.get('text')
            if text is None and sub_list is None:
                # Ignore blank lines outside question boundaries.
                continue
            if self.is_qstart(text):
                if sub_list is not None:
                    self._report_error("Question End 'qend' not found")
                    break
                sub_list = [data]
                continue
            if sub_list is None:
                # text cannot be None here (handled above), so this is real
                # content outside of any question.
                self._report_error("Question Start 'qstart' not found")
                break
            sub_list.append(data)
            if self.is_qend(text):
                questions.append(sub_list)
                sub_list = None
        else:
            # Reached the end of the document without a break: a question that
            # is still open was never closed and must not be dropped silently.
            if sub_list is not None:
                self._report_error("Question End 'qend' not found")
        return questions

    def convert_to_dict(self, qlist):
        """Group the data objects of each question by answer code.

        A data object whose text is code-like (per ``self.answer_code_spec``)
        opens a new section; the objects that follow belong to it until the
        next code. Section keys are the code text stripped, without the
        trailing ':' and lower-cased.

        :param qlist: list of question slices as produced by ``slice_dice``.
        :returns: list of dicts, one per question.
        """
        questions = []
        is_code_like = self.answer_code_spec.is_code_like
        for que_list in qlist:
            question = {}
            code = None
            values = []
            for line_obj in que_list:
                text = line_obj.get('text')
                if not is_code_like(text):
                    values.append(line_obj)
                    continue
                if code is not None:
                    # A new code closes the previous section.
                    question[code.strip().rstrip(':').lower()] = values
                    values = []
                code = text
            # NOTE(review): the section opened by the last code is never
            # stored. Presumably that last code is always 'qend:' (which has
            # no values) -- confirm against AnswerCodeSpec.is_code_like.
            questions.append(question)
        return questions

    def load_components(self):
        """Locate the automatic-styles element and the body content element.

        Sets ``self._style_element`` and ``self._text_element`` (the first
        child of the body element). Either stays ``None`` when the matching
        element is missing from content.xml or the body has no children.
        """
        tree = etree.parse(os.path.join(self._path, self.CONTENT_FILE))
        root = tree.getroot()
        for element in root:
            if 'automatic-styles' in element.tag:
                self._style_element = element
            elif 'body' in element.tag and len(element):
                self._text_element = element[0]

    def dig(self, elem):
        """Recursively walk ``elem`` and extract every leaf fragment below it.

        Lists are delegated to ``process_list``; paragraphs and spans are
        walked node by node (text nodes and child elements in document
        order); frames are treated as references to embedded objects.
        Elements of any other kind are ignored.
        """
        if self.is_list_tag(elem):
            self.process_list(elem)
        elif self.is_p_tag(elem) or self.is_span_tag(elem):
            if self._is_leaf(elem):
                self.extract_leaf(elem)
                return
            for child in elem.xpath("./node()"):
                # TODO: confirm unicode transformation
                if isinstance(child, etree._ElementStringResult):
                    self.extract_leaf(elem, text=str(child))
                elif isinstance(child, etree._ElementUnicodeResult):
                    # NOTE(review): repr() yields the quoted/escaped form of
                    # the text, not the text itself -- confirm this is what
                    # downstream consumers expect before changing it.
                    self.extract_leaf(elem, text=repr(child))
                elif isinstance(child, etree._Element):
                    if self._is_leaf(child):
                        self.extract_leaf(child)
                    else:
                        self.dig(child)
        elif self.is_equation_tag(elem):
            self._extract_equation(elem)

    def _extract_equation(self, elem):
        """Record the content.xml of the object referenced by a frame."""
        if len(elem) == 0:
            # An empty frame references nothing.
            return
        obj_ref = elem[0].get("{http://www.w3.org/1999/xlink}href")
        if not obj_ref:
            return
        obj_path = self._path + '/' + obj_ref + '/' + 'content.xml'
        if not os.path.exists(obj_path):
            self._report_error("Invalid Object referenced '%s'" % obj_ref)
            return
        # The context manager closes the file even if read() fails.
        with open(obj_path) as ref_file:
            mathml = ref_file.read()
        self.extract_leaf(None, mathml=mathml)

    def extract_leaf(self, element, text=None, mathml=None):
        """Append one data object to ``self.content``.

        :param element: source element; may be ``None`` only when ``mathml``
            is given.
        :param text: explicit text; when empty/None the element's own text is
            used (wrapped in newlines for paragraphs).
        :param mathml: MathML markup; when given (even if empty) it is
            recorded as an ``is_mathml`` fragment and ``element``/``text``
            are not consulted.

        Soft page breaks are skipped and do not consume a position.
        """
        if mathml is not None:
            # Checked with "is not None" so that an empty object file does not
            # fall through to the element branch with element=None.
            self.position += 1
            self.content.append({'text': mathml, 'style': None,
                                 'position': self.position, 'is_mathml': True})
            return
        if element.tag.endswith('soft-page-break'):
            return
        self.position += 1
        if not text:
            if self.is_p_tag(element):
                if element.text is not None:
                    text = '\n' + element.text + '\n'
            else:
                text = element.text
        self.content.append({'text': text, 'style': self.get_style(element),
                             'position': self.position})

    def process_list(self, element):
        """Extract the children of the first child of a list element.

        NOTE(review): only ``element[0]`` is walked, so any further children
        of the list are not visited -- confirm whether multi-item lists are
        expected in the input documents.
        """
        # todo Test variations in multiple documents
        if len(element) == 0:
            return
        for child in element[0]:
            if self._is_leaf(child):
                self.extract_leaf(child)
            else:
                self.dig(child)

    def get_style(self, element):
        """Return the automatic-style attributes referenced by ``element``.

        The result combines all attributes of the style's text-properties and
        paragraph-properties elements plus a ``'style_name'`` entry; it is
        an empty dict when the style defines neither. Results are cached in
        ``self.styles``.

        :returns: dict of style attributes, or ``None`` when the element has
            no style name or no automatic style with that name exists.
        """
        style_name = element.get("{urn:oasis:names:tc:opendocument:xmlns:text:1.0}style-name")
        if style_name is None:
            # No style referenced, so there is nothing to look up.
            return None
        if style_name in self.styles:
            # Membership test so that cached empty results are reused too.
            return self.styles[style_name]

        name_attr = "{urn:oasis:names:tc:opendocument:xmlns:style:1.0}name"
        style_elements = self._style_element if self._style_element is not None else ()
        for child in style_elements:
            if child.get(name_attr) != style_name:
                continue
            style_info = {}
            for tag_name in ('text-properties', 'paragraph-properties'):
                prop_elem = self._get_child(child, tag_name)
                if prop_elem is not None:
                    # Load all style info, e.g. font-weight / font-style.
                    style_info.update(prop_elem.items())
            if style_info:
                style_info['style_name'] = style_name
            self.styles[style_name] = style_info

        if style_name in self.styles:
            return self.styles[style_name]
        if style_name != 'Standard':
            self.log.info('No style found with name "%s"' % style_name)
        return None

    def _get_child(self, elem, name):
        """Return the first element in ``elem.iter()`` whose tag ends with ``name``.

        ``iter()`` yields ``elem`` itself first, then its descendants.
        Returns ``None`` when nothing matches.
        """
        for sub in elem.iter():
            if sub.tag.endswith(name):
                return sub
        return None

    def _is_leaf(self, element):
        """Return True if ``element`` has no child elements."""
        return len(element) == 0

    def is_p_tag(self, element):
        """Return True if the element's tag ends with '}p'."""
        return element.tag.endswith('}p')

    def is_span_tag(self, element):
        """Return True if the element's tag ends with '}span'."""
        return element.tag.endswith('}span')

    def is_list_tag(self, element):
        """Return True if the element's tag ends with '}list'."""
        return element.tag.endswith('}list')

    def is_equation_tag(self, element):
        """Return True if the element's tag ends with '}frame'."""
        return element.tag.endswith('}frame')

    def add_error(self, msg):
        """Record ``msg`` in ``self.errors`` and set ``self.is_error``."""
        if self.errors is None:
            self.errors = []
        self.is_error = True
        self.errors.append(msg)

    def _report_error(self, msg):
        """Log ``msg`` at error level and record it via ``add_error``."""
        self.log.error(msg)
        self.add_error(msg)

    def is_qstart(self, text):
        """Return True if ``text`` is the 'qstart:' marker (case/space-insensitive)."""
        return bool(text) and text.strip().lower() == 'qstart:'

    def is_qend(self, text):
        """Return True if ``text`` is the 'qend:' marker (case/space-insensitive)."""
        return bool(text) and text.strip().lower() == 'qend:'