Ejemplo n.º 1
0
 def __init__(self, content, name, resource_dir):
     """
     Prepare the parser state for a list of question objects.

     content      -- parsed question data; must not be None
     name         -- stored as-is on the instance
     resource_dir -- handed to load_resources()

     Raises Exception when content is None.
     """
     self.log = logging.getLogger()

     # Refuse to work without any content at all.
     if content is None:
         raise Exception("SpecParser cannot be initialized, content is None")

     self.content, self.name = content, name

     # The resource list starts out unset; load_resources() is called
     # right after with the given directory.
     self.resource_list = None
     self.load_resources(resource_dir)

     self.answer_code_spec = AnswerCodeSpec()

     # Error bookkeeping: nothing collected yet.
     self.errors, self.is_error = None, False

     # Pattern for image placeholders such as <img_3.MD.1_003>.
     self.img_content_re = re.compile(r"(<img_[A-Za-z0-9._]+>)")
Ejemplo n.º 2
0
 def __init__(self, target_file, path, serialize_raw=False):
     """
     Initialise the parser state and deflate target_file into path.

     target_file   -- file passed to deflate()
     path          -- directory passed to deflate()
     serialize_raw -- flag stored on the instance
     """
     self.log = logging.getLogger()

     # Where the document comes from and where it is unpacked to.
     self.target_file, self._path = target_file, path
     self.serialize_raw = serialize_raw
     self.CONTENT_FILE = 'content.xml'

     # Parsing state, filled in later.
     self.position = 0
     self.content = []
     self.styles = {}
     self._style_element = self._text_element = None

     # Error bookkeeping: nothing collected yet.
     self.errors, self.is_error = None, False

     self.answer_code_spec = AnswerCodeSpec()

     # Unpack last, once every attribute is in place.
     deflate(self.target_file, self._path)
Ejemplo n.º 3
0
class ODTParser:
    """
    Parse the ``content.xml`` of a deflated ODT document into question dicts.

    Processing steps:

    - Deflate ``target_file`` into ``path`` (done in ``__init__``).
    - Read ``content.xml`` and load a flat list of data objects. Each object
      holds text and style information in the following format::

          {'text': str,
           'style': dict of style attributes/values (None for MathML),
           'position': running position of the text,
           'is_mathml': True (only present for MathML entries)}

    - Slice that flat list into sub-lists based on the ``qstart:``/``qend:``
      markers, i.e. one sub-list holds the data of one question.
    - Convert each question sub-list into a question object. Each answer
      code section is separated, for example::

          {'qstart': [data objects],
           'anstype': [data objects],
           'prompt': [data objects],
           ...}

      For now a question object is a plain dict.

    Problems are collected in ``self.errors`` (``self.is_error`` is set as
    soon as one is recorded) instead of being raised.
    """

    def __init__(self, target_file, path, serialize_raw=False):
        """
        Initialise the parser state and deflate ``target_file`` into ``path``.

        target_file   -- file passed to ``deflate``
        path          -- directory the document is deflated into and where
                         ``content.xml`` is read from
        serialize_raw -- flag stored on the instance (not used by this class)
        """
        self.log = logging.getLogger()
        self.target_file = target_file
        self._path = path
        self.position = 0
        self.content = []
        self.styles = {}
        self._style_element = None
        self._text_element = None
        self.errors = None
        self.is_error = False
        self.CONTENT_FILE = 'content.xml'
        self.answer_code_spec = AnswerCodeSpec()
        self.serialize_raw = serialize_raw
        deflate(self.target_file, self._path)

    def parse(self):
        """
        Run the whole pipeline: read the XML, slice it, build question dicts.

        Returns a ``(content, errors)`` tuple. ``errors`` is None when nothing
        went wrong. When a step fails, ``content`` is whatever had been
        produced up to that step.
        """
        self.parse_xml()
        if self.is_error:
            return self.content, self.errors

        # make a list of questions (each one a list of data objects)
        raw_list = self.slice_dice()
        if self.is_error:
            self.log.error("Slicing data into [] of questions FAILED")
            return self.content, self.errors
        self.log.debug("Slicing data into [] of questions. Done. Questions found %s." % len(raw_list))

        self.content = self.convert_to_dict(raw_list)
        if self.is_error:
            self.log.error("Converting question [] to object FAILED")
        return self.content, self.errors

    def parse_xml(self):
        """
        Load ``content.xml`` and flatten the body into ``self.content``.

        Returns ``self.content``. Records an error (instead of failing with a
        TypeError) when the document has no usable body element.
        """
        self.load_components()
        if self._text_element is None:
            msg = "Document body not found in '%s'" % self.CONTENT_FILE
            self.log.error(msg)
            self.add_error(msg)
            return self.content

        for child in self._text_element:
            if (self.is_p_tag(child) or self.is_list_tag(child)) and self._is_leaf(child):
                self.extract_leaf(child)
            else:
                self.dig(child)
        return self.content

    def slice_dice(self):
        """
        [] -> [][] - Make question slices based on question boundaries.

        Walks ``self.content`` and groups everything from a ``qstart:`` marker
        up to and including the matching ``qend:`` marker into one sub-list.
        Entries without text that lie outside a question are ignored.

        Errors recorded (the scan stops at the first one):
        - a ``qstart:`` inside an open question, or an open question at the
          end of the data ("Question End 'qend' not found")
        - text outside any question ("Question Start 'qstart' not found")

        Returns the list of completed question sub-lists.
        """
        questions = []
        current = None  # data objects of the question being collected
        for data in self.content:
            text = data.get('text')
            if text is None and current is None:
                # ignoring blank lines outside question boundaries
                continue
            if self.is_qstart(text):
                if current is not None:
                    msg = "Question End 'qend' not found"
                    self.log.error(msg)
                    self.add_error(msg)
                    break
                current = [data]
                continue
            if current is None:
                # text is not None here, so this is content before 'qstart'
                msg = "Question Start 'qstart' not found"
                self.log.error(msg)
                self.add_error(msg)
                break
            current.append(data)
            if self.is_qend(text):
                questions.append(current)
                current = None
        else:
            # Only reached when the scan was not aborted by an error above.
            # A question still open here would otherwise be dropped silently.
            if current is not None:
                msg = "Question End 'qend' not found"
                self.log.error(msg)
                self.add_error(msg)
        return questions

    def convert_to_dict(self, qlist):
        """
        Turn each question sub-list into a dict keyed by answer code.

        A line that ``answer_code_spec.is_code_like`` accepts starts a new
        section; the lines that follow are that section's values. Keys are the
        code text stripped, without trailing ':' and lower-cased.

        NOTE(review): the last code of a question is never stored, because a
        section is only written when the *next* code shows up. Presumably that
        last code is always 'qend' and dropping it is intended - confirm.
        """
        questions = []
        for que_list in qlist:
            question = {}
            code = None
            values = []
            for line_obj in que_list:
                text = line_obj.get('text')
                if not self.answer_code_spec.is_code_like(text):
                    values.append(line_obj)
                    continue
                if code is not None:
                    # codes are ':' stripped and lowered
                    question[code.strip().rstrip(':').lower()] = values
                    values = []
                code = text
            questions.append(question)
        return questions

    def load_components(self):
        """
        Parse ``content.xml`` and remember the style and text root elements.

        ``_style_element`` becomes the element whose tag contains
        'automatic-styles'; ``_text_element`` becomes the first child of the
        element whose tag contains 'body'. Either stays None when not found.
        """
        tree = etree.parse(os.path.join(self._path, self.CONTENT_FILE))
        root = tree.getroot()
        for element in root:
            if 'automatic-styles' in element.tag:
                self._style_element = element
            elif 'body' in element.tag:
                # an empty body has no first child to pick
                if len(element):
                    self._text_element = element[0]

    def dig(self, elem):
        """
        Recursively extract leaf content below ``elem``.

        Lists go to ``process_list``; paragraphs/spans are walked node by node
        (text nodes and child elements); frames are treated as references to
        embedded objects. Any other element is ignored.
        """
        if self.is_list_tag(elem):
            self.process_list(elem)
        elif self.is_p_tag(elem) or self.is_span_tag(elem):
            if self._is_leaf(elem):
                self.extract_leaf(elem)
                return
            for child in elem.xpath("./node()"):
                # TODO: confirm unicode transformation
                if isinstance(child, etree._ElementStringResult):
                    self.extract_leaf(elem, text=str(child))
                elif isinstance(child, etree._ElementUnicodeResult):
                    # NOTE(review): repr() yields the quoted/escaped form of
                    # the text (e.g. u'...' on Python 2), not the text itself.
                    # Kept as-is; see the TODO above.
                    self.extract_leaf(elem, text=repr(child))
                elif isinstance(child, etree._Element):
                    if self._is_leaf(child):
                        self.extract_leaf(child)
                    else:
                        self.dig(child)
        elif self.is_equation_tag(elem):
            self._extract_equation(elem)

    def _extract_equation(self, frame):
        """Load the ``content.xml`` of the object a frame refers to as MathML."""
        if len(frame) == 0:
            # a frame without children references nothing
            return
        obj_ref = frame[0].get("{http://www.w3.org/1999/xlink}href")
        if not obj_ref:
            return
        obj_path = self._path + '/' + obj_ref + '/' + self.CONTENT_FILE
        if not os.path.exists(obj_path):
            msg = "Invalid Object referenced '%s'" % obj_ref
            self.log.error(msg)
            self.add_error(msg)
            return
        # 'with' guarantees the handle is closed even if read() fails
        with open(obj_path) as ref_file:
            mathml = ref_file.read()
        self.extract_leaf(None, mathml=mathml)

    def extract_leaf(self, element, text=None, mathml=None):
        """
        Append one data object to ``self.content``.

        element -- source element; may be None when ``mathml`` is given
        text    -- explicit text; when empty/None the element's own text is
                   used (wrapped in newlines for paragraphs)
        mathml  -- MathML markup; when non-empty it is stored as the text with
                   ``'is_mathml': True`` and no style

        Soft page breaks are skipped. A call with neither an element nor
        MathML content (e.g. an empty MathML file) is ignored.
        """
        if mathml:
            self.position += 1
            self.content.append(
                {'text': mathml, 'style': None, 'position': self.position, 'is_mathml': True})
            return
        if element is None:
            # nothing to read text or style from
            return
        if mathml is None and element.tag.endswith('soft-page-break'):
            return

        self.position += 1
        if not text:
            if self.is_p_tag(element):
                # paragraphs without text keep whatever 'text' was passed in
                if element.text is not None:
                    text = '\n' + element.text + '\n'
            else:
                text = element.text
        self.content.append(
            {'text': text, 'style': self.get_style(element), 'position': self.position})

    def process_list(self, element):
        """Extract the children of the first child of a list element."""
        # todo Test variations in multiple documents
        if len(element) == 0:
            return
        for child in element[0]:
            if self._is_leaf(child):
                self.extract_leaf(child)
            else:
                self.dig(child)

    def get_style(self, element):
        """
        Return the style info dict for ``element`` or None.

        The style is looked up by the element's ``text:style-name`` among the
        children of the automatic-styles element. All attributes of its
        'text-properties' and 'paragraph-properties' are collected, plus a
        'style_name' entry when anything was found. Results are cached in
        ``self.styles``; unknown style names are not cached.
        """
        style_name = element.get("{urn:oasis:names:tc:opendocument:xmlns:text:1.0}style-name")
        if style_name is None:
            # without a name there is nothing to look up (and a None name
            # must not match style entries that merely lack a name attribute)
            return None
        if style_name in self.styles:
            # membership test so that cached empty dicts are reused as well
            return self.styles[style_name]

        if self._style_element is not None:
            name_attr = "{urn:oasis:names:tc:opendocument:xmlns:style:1.0}name"
            for child in self._style_element:
                if child.get(name_attr) != style_name:
                    continue
                style_info = {}
                for tag_name in ('text-properties', 'paragraph-properties'):
                    prop_elem = self._get_child(child, tag_name)
                    if prop_elem is not None:
                        # Load all style info, e.g.
                        # BOLD "{urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0}font-weight"
                        # ITALIC "{urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0}font-style"
                        style_info.update(prop_elem.items())
                if style_info:
                    style_info['style_name'] = style_name
                self.styles[style_name] = style_info

        if style_name in self.styles:
            return self.styles[style_name]
        if style_name != 'Standard':
            self.log.info('No style found with name "%s"' % style_name)
        return None

    def _get_child(self, elem, name):
        """Return the first element in ``elem``'s subtree whose tag ends with ``name``, else None."""
        for sub in elem.iter():
            if sub.tag.endswith(name):
                return sub
        return None

    def _is_leaf(self, element):
        """True when the element has no child elements."""
        return len(element) == 0

    def is_p_tag(self, element):
        """True when the element's tag ends with '}p'."""
        return element.tag.endswith('}p')

    def is_span_tag(self, element):
        """True when the element's tag ends with '}span'."""
        return element.tag.endswith('}span')

    def is_list_tag(self, element):
        """True when the element's tag ends with '}list'."""
        return element.tag.endswith('}list')

    def is_equation_tag(self, element):
        """True when the element's tag ends with '}frame'."""
        return element.tag.endswith('}frame')

    def add_error(self, msg):
        """Record ``msg`` in ``self.errors`` and flag the parser as failed."""
        if self.errors is None:
            self.errors = []
        self.is_error = True
        self.errors.append(msg)

    def is_qstart(self, text):
        """True when ``text`` is the question start marker 'qstart:' (case/whitespace insensitive)."""
        return bool(text) and text.strip().lower() == 'qstart:'

    def is_qend(self, text):
        """True when ``text`` is the question end marker 'qend:' (case/whitespace insensitive)."""
        return bool(text) and text.strip().lower() == 'qend:'
Ejemplo n.º 4
0
class SpecParser:
    """
    Accepts a list of Question objects parsed by ODTParser in content.

    Each question is a dict mapping an answer code to a list of data objects
    (dicts with at least a 'text' entry). ``process`` flattens plain-text
    codes, checks the answer type and validates every question against the
    answer code spec. Problems are collected in ``self.errors``
    (``self.is_error`` is set as soon as one is recorded).
    """

    def __init__(self, content, name, resource_dir):
        """
        content      -- list of question dicts; must not be None
        name         -- stored as-is on the instance
        resource_dir -- directory whose file names are the known resources

        Raises Exception when content is None.
        """
        self.log = logging.getLogger()
        if content is None:
            raise Exception("SpecParser cannot be initialized, content is None")
        self.content = content
        self.name = name
        self.resource_list = None
        self.load_resources(resource_dir)
        self.answer_code_spec = AnswerCodeSpec()
        self.errors = None
        self.is_error = False
        # image placeholders look like <img_3.MD.1_003>
        self.img_content_re = re.compile(r"(<img_[A-Za-z0-9._]+>)")

    def process(self):
        """
        Merge/clean the questions, then validate them.

        Returns a ``(content, errors)`` tuple. ``content`` is None when the
        merge step failed; ``errors`` is None when nothing went wrong.
        """
        self.content = self.merge_clean(self.content)
        if self.is_error:
            return self.content, self.errors

        if not self.validate():
            self.log.info("Validation Failed")
        return self.content, self.errors

    def validate(self):
        """Validate every question; return True only when all of them pass."""
        #todo Apply spec validation
        validated = True
        # no short-circuit: every question is checked so all errors are collected
        for queno, question in enumerate(self.content, 1):
            if not self.validate_question(question, queno):
                validated = False
        return validated

    def validate_question(self, question, queno):
        """
        Validate one question against the spec of its answer type.

        For every code of the spec: the code must be present in the question;
        plain-text codes with a 'values' list must hold one of those values;
        codes whose spec has no 'plain_text' entry are checked for missing
        images and then cleaned (see ``clean``).

        question -- question dict (modified in place for cleaned codes)
        queno    -- 1-based question number used in messages

        Returns True when the question is valid. When no spec is defined for
        the answer type only a warning is logged and the question is accepted,
        since there is nothing to validate against.
        """
        anstype = self.get_question_anstype(question)
        anstype_spec = self.answer_code_spec.get_anstype_spec(anstype)
        if not anstype_spec:
            self.log.warning("Answer type spec not defined for '%s'" % anstype)
            return True

        is_valid = True
        for code in anstype_spec.keys():
            # Question must contain empty or non empty code
            if code not in question:
                msg = "Q#[%s]: Incomlete question. Code '%s' is required. Codes found are %s" % \
                      (queno, code, list(question.keys()))
                self.log.error(msg)
                self.add_error(msg)
                is_valid = False
                continue
            code_data = question.get(code)
            code_spec = anstype_spec.get(code)
            if code_spec.get('plain_text', False) and code_spec.get('values', []):
                values = code_spec.get('values')
                if not self.validate_value(code_data, values):
                    found = code_data[0].get('text') if code_data else None
                    msg = "Q#[%s]: Invalid value '%s'. Code '%s' can only have values %s" % (queno, found, code, values)
                    self.log.error(msg)
                    self.add_error(msg)
                    is_valid = False

            if code_spec.get('plain_text') is None:
                if not self.validate_for_images(code_data, queno):
                    is_valid = False
                question[code] = self.clean(code_data)

        return is_valid

    def validate_value(self, data, values):
        """True when the stripped text of the first data object is one of ``values``."""
        if not data:
            return False
        text = data[0].get('text')
        return text is not None and text.strip() in values

    def merge_clean(self, questions):
        """
        Flatten plain-text codes and check the answer type of every question.

        Every code must be known to the default spec; codes marked
        'plain_text' there are merged into a single data object. The
        'anstype' text is stripped and must be a valid answer type.

        Returns ``questions`` (modified in place), or None after recording an
        error for the first question that fails.
        """
        default_spec = self.answer_code_spec.get_default_spec()
        for queno, question in enumerate(questions, 1):
            if 'anstype' not in question:
                msg = "Q#[%s]: Mandatory code 'anstype' not found" % queno
                self.log.error(msg)
                self.add_error(msg)
                return None
            for code in list(question.keys()):
                # an unknown code must be reported, not blow up with KeyError
                try:
                    spec = default_spec[code]
                except KeyError:
                    spec = None
                if spec is None:
                    msg = "Q#[%s]: Invalid Code %s" % (queno, code)
                    self.log.error(msg)
                    self.add_error(msg)
                    return None
                if spec.get('plain_text', False):
                    # single object instead of a list
                    question[code] = self._merge_as_plain(question[code])

            # anstype value should be flattened by now; an empty list simply
            # counts as "no answer type" and is rejected below
            anstype_data = question.get('anstype')
            anstype = anstype_data[0].get('text') if anstype_data else None
            if anstype:
                anstype = anstype.strip()
                anstype_data[0]['text'] = anstype
            if not self.is_valid_anstype(anstype):
                msg = "Q#[%s]: Invalid answer type value '%s'. Valid values are %s" % \
                      (queno, anstype, AnswerCodeSpec.ANSWER_TYPES)
                self.log.error(msg)
                self.add_error(msg)
                return None

        return questions

    def clean(self, code_data):
        """
        Keep only data objects with non-blank text and drop their 'style'.

        The kept objects are modified in place; a new list is returned.
        """
        squeezed = []
        for data in code_data:
            text = data.get('text')
            if text and text.strip():
                data.pop('style', None)
                squeezed.append(data)
        return squeezed

    def validate_for_images(self, text, queno):
        """
        Check that every image placeholder in the given lines has a resource.

        text  -- list of data objects whose 'text' may contain placeholders,
                 e.g. <img_3.MD.1_003> = 3.MD.1_003.png,
                 <img_3.MD.1_002a> = 3.MD.1_002a.png
        queno -- question number used in messages

        Records one error per missing image; returns True when none is missing.
        """
        is_valid = True
        for line in text:
            content = line.get('text')
            if not content:
                continue
            for img in self.img_content_re.findall(content):
                if not self.ref_object_exist(img):
                    is_valid = False
                    msg = "Q#[%s]: Image not available '%s'" % (queno, img)
                    self.log.error(msg)
                    self.add_error(msg)
        return is_valid

    def is_valid_anstype(self, anstype):
        """True when ``anstype`` is one of ``AnswerCodeSpec.ANSWER_TYPES``."""
        return anstype in AnswerCodeSpec.ANSWER_TYPES

    def ref_object_exist(self, name):
        """
        True when the image behind a placeholder like ``<img_NAME>`` exists,
        i.e. ``NAME.png`` is in the resource list.
        """
        if self.resource_list is None:
            return False
        # Remove the wrapper as prefix/suffix. str.lstrip/rstrip take a *set*
        # of characters and would also eat leading i/m/g/_ of the name itself.
        prefix, suffix = "<img_", ">"
        if name.startswith(prefix):
            name = name[len(prefix):]
        if name.endswith(suffix):
            name = name[:-len(suffix)]
        return name + ".png" in self.resource_list

    def get_question_anstype(self, question):
        """Return the text of the first 'anstype' data object of the question."""
        return question.get('anstype')[0].get('text')

    def load_resources(self, dir):
        """
        Fill ``self.resource_list`` with the entry names of directory ``dir``.

        Logs a warning and leaves the list untouched when ``dir`` is missing
        or not a directory. An already loaded list is not reloaded.
        """
        if not dir or not os.path.isdir(dir):
            self.log.warning("Image dir not found")
            return
        if self.resource_list is None:
            self.resource_list = list(os.listdir(dir))

    def _merge_as_plain(self, data):
        """Join the texts of all data objects (None counts as a newline) into one object."""
        parts = []
        for obj in data:
            t = obj.get('text')
            parts.append('\n' if t is None else t)
        return [{'text': ''.join(parts)}]

    def add_error(self, msg):
        """Record ``msg`` in ``self.errors`` and flag the parser as failed."""
        if self.errors is None:
            self.errors = []
        self.is_error = True
        self.errors.append(msg)