def parse_rtf(self):
    """
    Parse the file by running it through the conversion stages in order.

    Every stage works on the temporary copy of the input file
    (self.__temp_file); the final stages convert the result to tags and
    hand it to the Output object.

    Requires:
        Nothing
    Returns:
        self.__exit_level. The converted document itself is emitted by the
        Output object (built from self.__out_dir / self.__out_file), not
        returned.
    Raises:
        InvalidRtfException -- token processing failed. The temporary file
            is removed first, and the message notes when the input does
            not appear to be correctly encoded.
        RtfInvalidCodeException -- the file looks like older RTF and
            self.__run_level is greater than 5.
    """
    self.__temp_file = self.__make_temp_file(self.__file)

    # If self.__debug_dir is set, create a copy object, point it at that
    # directory, remove the files there, and copy the new temporary file
    # into it.
    if self.__debug_dir:
        copy_obj = copy.Copy(
            bug_handler=RtfInvalidCodeException,
        )
        copy_obj.set_dir(self.__debug_dir)
        copy_obj.remove_files()
        copy_obj.copy_file(self.__temp_file, "original_file")

    # Object used to check that brackets are well handled.
    if self.__debug_dir or self.__run_level > 2:
        self.__check_brack_obj = check_brackets.CheckBrackets(
            file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
        )

    # Keyword arguments shared by most of the stage constructors below.
    stage_args = dict(
        in_file=self.__temp_file,
        bug_handler=RtfInvalidCodeException,
        copy=self.__copy,
        run_level=self.__run_level,
    )

    # Convert Macintosh and Windows line endings to Unix line endings.
    line_obj = line_endings.FixLineEndings(
        replace_illegals=self.__replace_illegals,
        **stage_args
    )
    self.__return_code(line_obj.fix_endings())

    tokenize_obj = tokenize.Tokenize(**stage_args)
    tokenize_obj.tokenize()

    process_tokens_obj = process_tokens.ProcessTokens(
        exception_handler=InvalidRtfException,
        **stage_args
    )
    try:
        process_tokens_obj.process_tokens()
    except InvalidRtfException as err:
        # Check to see if the file is correctly encoded.
        encode_obj = default_encoding.DefaultEncoding(
            in_file=self.__temp_file,
            run_level=self.__run_level,
            bug_handler=RtfInvalidCodeException,
            check_raw=True,
            default_encoding=self.__default_encoding,
        )
        # Called before get_codepage(), exactly as before; the returned
        # values are not needed in this handler.
        encode_obj.find_default_encoding()
        check_encoding_obj = check_encoding.CheckEncoding(
            bug_handler=RtfInvalidCodeException,
        )
        # TODO: to check if cp is a good idea or if I should use a dict to convert
        enc = 'cp' + encode_obj.get_codepage()
        msg = '%s\nException in token processing' % unicode_type(err)
        if check_encoding_obj.check_encoding(self.__file, enc):
            # Keep the name as text: interpolating bytes with %s would
            # render as "b'...'" in the message.
            if isinstance(self.__file, bytes):
                file_name = self.__file.decode('utf-8', 'replace')
            else:
                file_name = self.__file
            msg += '\nFile %s does not appear to be correctly encoded.\n' % file_name
        # Best-effort cleanup of the temporary file before giving up.
        try:
            os.remove(self.__temp_file)
        except OSError:
            pass
        raise InvalidRtfException(msg)

    delete_info_obj = delete_info.DeleteInfo(**stage_args)
    # found destination means {\*\destination
    # if found, the RTF should be newer RTF
    found_destination = delete_info_obj.delete_info()
    self.__bracket_match('delete_data_info')

    # put picts in a separate file
    pict_obj = pict.Pict(
        orig_file=self.__file,
        out_file=self.__out_file,
        **stage_args
    )
    pict_obj.process_pict()
    self.__bracket_match('pict_data_info')

    combine_obj = combine_borders.CombineBorders(**stage_args)
    combine_obj.combine_borders()
    self.__bracket_match('combine_borders_info')

    # Footnotes and headers are separated here and joined back near the end.
    footnote_obj = footnote.Footnote(**stage_args)
    footnote_obj.separate_footnotes()
    self.__bracket_match('separate_footnotes_info')

    header_obj = header.Header(**stage_args)
    header_obj.separate_headers()
    self.__bracket_match('separate_headers_info')

    list_numbers_obj = list_numbers.ListNumbers(**stage_args)
    list_numbers_obj.fix_list_numbers()
    self.__bracket_match('list_number_info')

    preamble_div_obj = preamble_div.PreambleDiv(**stage_args)
    list_of_lists = preamble_div_obj.make_preamble_divisions()
    self.__bracket_match('make_preamble_divisions')

    encode_obj = default_encoding.DefaultEncoding(
        in_file=self.__temp_file,
        run_level=self.__run_level,
        bug_handler=RtfInvalidCodeException,
        default_encoding=self.__default_encoding,
    )
    platform, code_page, default_font_num = encode_obj.find_default_encoding()

    # First hex-to-UTF-8 pass covers the preamble; the same object is
    # re-targeted at the body further down.
    hex2utf_obj = hex_2_utf8.Hex2Utf8(
        area_to_convert='preamble',
        char_file=self.__char_data,
        default_char_map=code_page,
        invalid_rtf_handler=InvalidRtfException,
        **stage_args
    )
    hex2utf_obj.convert_hex_2_utf8()
    self.__bracket_match('hex_2_utf_preamble')

    fonts_obj = fonts.Fonts(
        default_font_num=default_font_num,
        **stage_args
    )
    special_font_dict = fonts_obj.convert_fonts()
    self.__bracket_match('fonts_info')

    color_obj = colors.Colors(**stage_args)
    color_obj.convert_colors()
    self.__bracket_match('colors_info')

    style_obj = styles.Styles(**stage_args)
    style_obj.convert_styles()
    self.__bracket_match('styles_info')

    info_obj = info.Info(**stage_args)
    info_obj.fix_info()

    default_font = special_font_dict.get('default-font')
    preamble_rest_obj = preamble_rest.Preamble(
        file=self.__temp_file,
        copy=self.__copy,
        bug_handler=RtfInvalidCodeException,
        platform=platform,
        default_font=default_font,
        code_page=code_page,
    )
    preamble_rest_obj.fix_preamble()
    self.__bracket_match('preamble_rest_info')

    old_rtf_obj = OldRtf(
        in_file=self.__temp_file,
        bug_handler=RtfInvalidCodeException,
        run_level=self.__run_level,
    )
    # RTF can actually have destination groups and old RTF.
    # BAH!
    old_rtf = old_rtf_obj.check_if_old_rtf()
    if old_rtf:
        if self.__run_level > 5:
            msg = 'Older RTF\n' \
                'self.__run_level is "%s"\n' % self.__run_level
            raise RtfInvalidCodeException(msg)
        if self.__run_level > 1:
            sys.stderr.write('File could be older RTF...\n')
        if found_destination:
            if self.__run_level > 1:
                sys.stderr.write(
                    'File also has newer RTF.\n'
                    'Will do the best to convert...\n'
                )
        add_brackets_obj = add_brackets.AddBrackets(**stage_args)
        add_brackets_obj.add_brackets()

    fields_small_obj = fields_small.FieldsSmall(**stage_args)
    fields_small_obj.fix_fields()
    self.__bracket_match('fix_small_fields_info')

    fields_large_obj = fields_large.FieldsLarge(**stage_args)
    fields_large_obj.fix_fields()
    self.__bracket_match('fix_large_fields_info')

    sections_obj = sections.Sections(**stage_args)
    sections_obj.make_sections()
    self.__bracket_match('sections_info')

    paragraphs_obj = paragraphs.Paragraphs(
        write_empty_para=self.__empty_paragraphs,
        **stage_args
    )
    paragraphs_obj.make_paragraphs()
    self.__bracket_match('paragraphs_info')

    # NOTE(review): unlike the .get() lookup above, this raises KeyError
    # when 'default-font' is missing -- confirm whether that is intended.
    default_font = special_font_dict['default-font']
    paragraph_def_obj = paragraph_def.ParagraphDef(
        default_font=default_font,
        **stage_args
    )
    list_of_styles = paragraph_def_obj.make_paragraph_def()

    body_styles_obj = body_styles.BodyStyles(
        list_of_styles=list_of_styles,
        **stage_args
    )
    body_styles_obj.insert_info()
    self.__bracket_match('body_styles_info')
    self.__bracket_match('paragraph_def_info')

    table_obj = table.Table(**stage_args)
    table_data = table_obj.make_table()
    self.__bracket_match('table_info')

    table_info_obj = table_info.TableInfo(
        table_data=table_data,
        **stage_args
    )
    table_info_obj.insert_info()
    self.__bracket_match('table__data_info')

    # Optional stages, each switched on by its own instance flag.
    if self.__form_lists:
        make_list_obj = make_lists.MakeLists(
            headings_to_sections=self.__headings_to_sections,
            list_of_lists=list_of_lists,
            **stage_args
        )
        make_list_obj.make_lists()
        self.__bracket_match('form_lists_info')

    if self.__headings_to_sections:
        headings_to_sections_obj = headings_to_sections.HeadingsToSections(
            **stage_args
        )
        headings_to_sections_obj.make_sections()
        self.__bracket_match('headings_to_sections_info')

    if self.__group_styles:
        group_styles_obj = group_styles.GroupStyles(wrap=1, **stage_args)
        group_styles_obj.group_styles()
        self.__bracket_match('group_styles_info')

    if self.__group_borders:
        group_borders_obj = group_borders.GroupBorders(wrap=1, **stage_args)
        group_borders_obj.group_borders()
        self.__bracket_match('group_borders_info')

    inline_obj = inline.Inline(**stage_args)
    inline_obj.form_tags()
    self.__bracket_match('inline_info')

    # Second hex-to-UTF-8 pass: re-target the converter at the body.
    hex2utf_obj.update_values(
        file=self.__temp_file,
        area_to_convert='body',
        copy=self.__copy,
        char_file=self.__char_data,
        convert_caps=self.__convert_caps,
        convert_symbol=self.__convert_symbol,
        convert_wingdings=self.__convert_wingdings,
        convert_zapf=self.__convert_zapf,
        symbol=1,
        wingdings=1,
        dingbats=1,
    )
    hex2utf_obj.convert_hex_2_utf8()

    header_obj.join_headers()
    footnote_obj.join_footnotes()

    tags_obj = convert_to_tags.ConvertToTags(
        dtd_path=self.__dtd_path,
        indent=self.__indent,
        no_dtd=self.__no_dtd,
        encoding=encode_obj.get_codepage(),
        **stage_args
    )
    tags_obj.convert_to_tags()

    output_obj = output.Output(
        file=self.__temp_file,
        orig_file=self.__file,
        output_dir=self.__out_dir,
        out_file=self.__out_file,
    )
    output_obj.output()

    os.remove(self.__temp_file)
    return self.__exit_level
def parse_rtf(self): """ Parse the file by calling on other classes. Requires: Nothing Returns: A parsed file in XML, either to standard output or to a file, depending on the value of 'output' when the instance was created. """ self.__temp_file = self.__make_temp_file(self.__file) # if the self.__deb_dir is true, then create a copy object, # set the directory to write to, remove files, and copy # the new temporary file to this directory if self.__debug_dir: copy_obj = copy.Copy( bug_handler=RtfInvalidCodeException, ) copy_obj.set_dir(self.__debug_dir) copy_obj.remove_files() copy_obj.copy_file(self.__temp_file, "original_file") # Function to check if bracket are well handled if self.__debug_dir or self.__run_level > 2: self.__check_brack_obj = check_brackets.CheckBrackets( file=self.__temp_file, bug_handler=RtfInvalidCodeException, ) # convert Macintosh and Windows line endings to Unix line endings # why do this if you don't wb after? line_obj = line_endings.FixLineEndings( in_file=self.__temp_file, bug_handler=RtfInvalidCodeException, copy=self.__copy, run_level=self.__run_level, replace_illegals=self.__replace_illegals, ) return_value = line_obj.fix_endings() # calibre return what? 
self.__return_code(return_value) tokenize_obj = tokenize.Tokenize( bug_handler=RtfInvalidCodeException, in_file=self.__temp_file, copy=self.__copy, run_level=self.__run_level) tokenize_obj.tokenize() process_tokens_obj = process_tokens.ProcessTokens( in_file=self.__temp_file, bug_handler=RtfInvalidCodeException, copy=self.__copy, run_level=self.__run_level, exception_handler=InvalidRtfException, ) try: return_value = process_tokens_obj.process_tokens() except InvalidRtfException, msg: # Check to see if the file is correctly encoded encode_obj = default_encoding.DefaultEncoding( in_file=self.__temp_file, run_level=self.__run_level, bug_handler=RtfInvalidCodeException, check_raw=True, default_encoding=self.__default_encoding, ) platform, code_page, default_font_num = encode_obj.find_default_encoding() check_encoding_obj = check_encoding.CheckEncoding( bug_handler=RtfInvalidCodeException, ) enc = encode_obj.get_codepage() # TODO: to check if cp is a good idea or if I should use a dict to convert enc = 'cp' + enc msg = '%s\nException in token processing' % str(msg) if check_encoding_obj.check_encoding(self.__file, enc): file_name = self.__file if isinstance(self.__file, str) \ else self.__file.encode('utf-8') msg +='\nFile %s does not appear to be correctly encoded.\n' % file_name try: os.remove(self.__temp_file) except OSError: pass raise InvalidRtfException(msg)