def __init__(self, file_stream, url):
    """Read a METS document from ``file_stream`` and parse it.

    Parameters:
        file_stream: passed on to ``DocumentParser.__init__``; the content
            is then read through ``self._file_stream`` (presumably set by
            the base initializer -- confirm in ``DocumentParser``).
        url: stored unchanged on ``self._url``.

    Raises:
        ParserError.InvalidDocument: if ``parseString`` fails on the
            content, or if ``self._check_xml()`` does not return ``True``.
    """
    DocumentParser.__init__(self, file_stream)
    self._url = url
    self._namespace_URI = 'http://www.loc.gov/METS/'
    self._mods_namespace_URI = 'http://www.loc.gov/mods/v3'

    # Read the whole content of the file.
    self._content_str = self._file_stream.read()

    # These start out unset; nothing in this initializer fills them in.
    self._logical_structure = None
    self._physical_structure = None
    self._meta_data = None
    self._relation = None
    self._file_list = None

    # NOTE: some METS files contain uppercase METS/MODS directives.  A
    # normalisation step (e.g. replacing 'METS=' with 'mets=' and 'MODS='
    # with 'mods=') used to live here as commented-out code and is
    # currently disabled.

    # The original message was built from two adjacent literals without a
    # separating space and read "(is itcorrupted?)".
    invalid_message = "The file is invalid. (is it corrupted?)"

    try:
        self._doc = parseString(self._content_str)
    except Exception:
        # Any parsing failure is reported as an invalid document.
        raise ParserError.InvalidDocument(invalid_message)

    # Only an explicit ``True`` from the check counts as a valid document.
    if self._check_xml() is not True:
        raise ParserError.InvalidDocument(invalid_message)
def parse_verb(word_list):
    """Return the verb at the front of ``word_list``.

    ``skip(word_list, 'stop')`` is called first (presumably to drop leading
    stop words -- see ``skip``).  The result is whatever
    ``match(word_list, 'verb')`` returns.

    Raises:
        ParserError: if ``peek(word_list)`` does not report ``'verb'``.
    """
    skip(word_list, 'stop')

    # Guard clause: anything but a verb at this position is an error.
    if peek(word_list) != 'verb':
        raise ParserError("Expected a verb next.")

    return match(word_list, 'verb')
def _parse_assignment(self):
    """Parse an assignment and return it as an ``Assignment`` node.

    The current token must be ``=``.  The left side is popped from
    ``self.stack`` and must be an ``Identifier`` or ``ArrayDeclaration``.
    The right side is parsed according to the type of the first token
    after optional whitespace/newlines, and a ``;`` must follow it.

    Raises:
        ParserError: if the popped left side has an unexpected type.
        ParsingError: if the right side starts with an unsupported token.
    """
    equals_token = self.expect(TokenType.EQUALS)
    self.next()

    target = self.stack.pop()
    if not isinstance(target, (Identifier, ArrayDeclaration)):
        raise ParserError(
            'unexpected left side of assignment, expected Identifier or ArrayDeclaration, but got {}'.format(
                repr(target)))

    self.skip_whitespaces(include_newlines=True)
    token = self.token()
    kind = token.token_type

    # Pick the sub-parser from the first token of the right side.
    if kind == TokenType.WORD:
        value = self._parse_identifier()
    elif kind in (TokenType.QUOTE, TokenType.DOUBLE_QUOTES):
        value = self._parse_string_literal()
    elif kind == TokenType.NUMBER:
        value = self._parse_constant()
    elif kind == TokenType.L_CURLY:
        value = self._parse_array()
    else:
        # NOTE(review): this branch raises ``ParsingError`` while the
        # left-side check above raises ``ParserError`` -- confirm that the
        # different exception class is intentional.
        raise ParsingError('unexpected right side of assignment: {}'.format(repr(token)))

    semicolon_token = self.expect(TokenType.SEMICOLON)
    self.index += 1  # presumably steps past the ';' -- confirm against expect()
    return Assignment(target, equals_token, value, semicolon_token)
def _parse_array(self):
    """Parse a ``{ ... }`` array literal and return it as an ``Array`` node.

    The current token must be ``{``.  Elements may be numbers, quoted
    strings, words (identifiers) or nested arrays; after each element,
    optional whitespace/newlines are skipped and either ``,`` or ``}``
    must follow.

    Raises:
        ParserError: if an element starts with an unsupported token.
    """
    whitespace_kinds = (TokenType.WHITESPACE, TokenType.TAB, TokenType.NEWLINE)
    string_kinds = (TokenType.QUOTE, TokenType.DOUBLE_QUOTES)

    l_curly_token = self.expect(TokenType.L_CURLY)
    token = self.next()
    elements = []

    while token.token_type != TokenType.R_CURLY and self.index < len(self.tokens):
        # Skip layout in front of the element, then re-read the token.
        if token.token_type in whitespace_kinds:
            self.skip_whitespaces(include_newlines=True)
            token = self.token()

        kind = token.token_type
        if kind == TokenType.NUMBER:
            element = self._parse_constant()
        elif kind in string_kinds:
            element = self._parse_string_literal()
        elif kind == TokenType.WORD:
            element = self._parse_identifier()
        elif kind == TokenType.L_CURLY:
            element = self._parse_array()
        else:
            raise ParserError('encountered unexpected token while parsing array: {}'.format(repr(token)))
        elements.append(element)

        self.skip_whitespaces(include_newlines=True)
        self.expect([TokenType.COMMA, TokenType.R_CURLY])

        # No comma after the element: leave the loop; the closing brace is
        # checked below.
        if self.token().token_type != TokenType.COMMA:
            break
        token = self.next()

    r_curly_token = self.expect(TokenType.R_CURLY)
    self.index += 1  # presumably steps past the '}' -- confirm against expect()
    return Array(l_curly_token, elements, r_curly_token)
def _get_record(self):
    """Return the single ``mods`` element of the XML file.

    The file stream is rewound and parsed again on every call.  Elements
    named ``mods`` in the ``self._namespace_URI`` namespace are looked up,
    and exactly one must be present.

    Raises:
        ParserError.InvalidDocument: if the document contains no ``mods``
            element, or more than one.
    """
    stream = self._file_stream
    stream.seek(0)
    document = parseString(stream.read())

    found = document.getElementsByTagNameNS(self._namespace_URI, 'mods')
    count = len(found)

    # Exactly one record is accepted.
    if count == 0:
        raise ParserError.InvalidDocument(
            "XML/Mods Core document should contains at lease one record!")
    if count > 1:
        raise ParserError.InvalidDocument(
            "XML/Mods Core document should not contains more than "
            "one record!")

    return found[0]
def parse_object(word_list):
    """Return the noun or direction at the front of ``word_list``.

    ``skip(word_list, 'stop')`` is called first (presumably to drop leading
    stop words -- see ``skip``).  The result is whatever ``match`` returns
    for the word type found.

    Raises:
        ParserError: if ``peek(word_list)`` reports neither ``'noun'`` nor
            ``'direction'``.
    """
    skip(word_list, 'stop')
    kind = peek(word_list)

    if kind not in ('noun', 'direction'):
        raise ParserError("Expected a noun or direction")

    # ``kind`` equals either 'noun' or 'direction' here.
    return match(word_list, kind)
def parse_subject(word_list):
    """Return the subject at the front of ``word_list``.

    ``skip(word_list, 'stop')`` is called first (presumably to drop leading
    stop words -- see ``skip``).  A noun is returned via
    ``match(word_list, 'noun')``.  When a verb comes next instead, the
    subject defaults to ``('noun', 'player')`` and ``match`` is not called.

    Raises:
        ParserError: if ``peek(word_list)`` reports neither ``'noun'`` nor
            ``'verb'``.
    """
    skip(word_list, 'stop')
    kind = peek(word_list)

    if kind == 'noun':
        return match(word_list, 'noun')

    if kind == 'verb':
        # Implicit subject: no noun was given before the verb.
        return ('noun', 'player')

    # NOTE(review): the message mentions only a verb although a noun is
    # accepted here as well.
    raise ParserError("Expected a verb next.")
def _parse_array_declaration(self):
    """Parse the ``[]`` after an identifier into an ``ArrayDeclaration``.

    The current token must be ``[`` and the next one ``]``.  The
    identifier is popped from ``self.stack``.

    Raises:
        ParserError: if the popped value is not an ``Identifier``.
    """
    opening = self.expect(TokenType.L_SQUARE)
    closing = self.expect_next(TokenType.R_SQUARE)
    self.index += 1  # presumably steps past the ']' -- confirm against expect_next()

    target = self.stack.pop()
    if isinstance(target, Identifier):
        return ArrayDeclaration(target, opening, closing)

    raise ParserError('expected identifier for array declaration, but got {}'.format(repr(target)))
def parse(version_str):
    """Parse a ``JDK<family>u<update>`` string into a ``JavaVersion``.

    The family number must not start with ``0``; the update number is one
    or more digits and may be ``0``.  Both are converted to ``int`` before
    being passed to ``JavaVersion``.

    Raises:
        ParserError: if ``version_str`` does not match the expected form.
    """
    # Verbose-mode pattern; whitespace and the in-pattern comments are
    # ignored by the regex engine.  Those comments are part of the string
    # literal and are left as written.  In English, line by line:
    #   - literal "JDK" at the start
    #   - family number: first digit non-zero, then any number of digits
    #   - the fixed character "u"
    #   - update number: one or more digits (zero allowed)
    pattern = r"""
        ^JDK            # 先頭JDK
        ([1-9][0-9]*)   # familyNumber1桁目は0位外の数字で始まりn桁の数字
        u               # 固定文字 u
        ([0-9]+)$       # 1桁以上の数字(0可)
    """
    matched = re.match(pattern, version_str, re.VERBOSE)
    if matched is None:
        raise ParserError("invalid version string.")

    family_text, update_text = matched.groups()
    return JavaVersion(int(family_text), int(update_text))
def __init__(self, node, argument):
    '''Initialise the error with a message prefixed by the node position.

    The message passed to ``ParserError.__init__`` is ``argument``
    preceded by ``"(Line <lineno>, Col <col_offset>) "``, with both
    numbers read from ``node``.
    '''
    position = "(Line %i, Col %i) " % (node.lineno, node.col_offset)
    ParserError.__init__(self, position + argument)