def update_file_regress(text, filename, check_regression): ''' Behaves like jlib.update_file(), but if check_regression is true and <filename> already exists with different content from <text>, we show a diff and raise an exception. ''' text_old = jlib.update_file(text, filename, check_regression) if text_old: jlib.log( 'jlib.update_file() => {len(text_old)=}. {filename=} {check_regression}' ) if check_regression: if text_old is not None: # Existing content differs and <check_regression> is true. with open(f'{filename}-2', 'w') as f: f.write(text) jlib.log('Output would have changed: {filename}') jlib.system(f'diff -u {filename} {filename}-2', verbose=True, raise_errors=False, prefix=f'diff {os.path.relpath(filename)}: ', out='log') return Exception(f'Output would have changed: {filename}') else: jlib.log('Generated file unchanged: {filename}')
def cmd_run_multiple(commands, prefix=None): ''' Windows-only. Runs multiple commands joined by &&, using cmd.exe if we are running under Cygwin. We cope with commands that already contain double-quote characters. ''' if state_.cygwin: command = 'cmd.exe /V /C @ ' + ' "&&" '.join(commands) else: command = ' && '.join(commands) jlib.system(command, verbose=1, out='log', prefix=prefix)
def test_swig(): ''' For testing different swig .i constructs. ''' test_i = textwrap.dedent(''' %include argcargv.i %apply (int ARGC, char **ARGV) { (int retainlen, const char **retainlist) } %apply (int ARGC, char **ARGV) { (const char **retainlist, int retainlen) } %apply (int ARGC, char **ARGV) { (const char *retainlist[], int retainlen) } %clear double a, int ARGC, char **ARGV; %clear double a, int argc, char *argv[]; %clear int ARGC, char **ARGV; %clear (double a, int ARGC, char **ARGV); %clear (double a, int argc, char *argv[]); %clear (int ARGC, char **ARGV); %clear int retainlen, const char **retainlist; int bar( int argc, char* argv[]); int foo( double a, int argc, char* argv[]); int qwe( double a, int argc, const char** argv); void ppdf_clean_file( char *infile, char *outfile, char *password, pdf_write_options *opts, int retainlen, const char **retainlist); void ppdf_clean_file2(char *infile, char *outfile, char *password, pdf_write_options *opts, const char **retainlist, int retainlen); void ppdf_clean_file3(char *infile, char *outfile, char *password, pdf_write_options *opts, const char *retainlist[], int retainlen); ''') jlib.update_file( test_i, 'test.i') jlib.system( textwrap.dedent( ''' swig -Wall -c++ -python -module test -outdir . -o test.cpp test.i ''').replace( '\n', ' \\\n') )
def system( command, raise_errors=True, return_output=False, prefix=None, caller=1, bufsize=-1, ): ''' Runs a command. See jlib.system()'s docs for details. ''' return jlib.system( command, verbose=not return_output, raise_errors=raise_errors, out='return' if return_output else 'log', prefix=prefix, caller=caller+1, bufsize=bufsize, )
def build_swig( state_, build_dirs, generated, language='python', swig_command='swig', check_regress=False, force_rebuild=False, ): ''' Builds python or C# wrappers for all mupdf_* functions and classes, by creating a .i file that #include's our generated C++ header files and running swig. build_dirs A BuildDirs instance. generated. A Generated instance. language The output language, must be 'python' or 'csharp'. swig Location of swig binary. check_regress If true, we fail with error if generated .i file already exists and differs from our new content. ''' assert isinstance(state_, state.State) jlib.log('{=build_dirs type(build_dirs)}') assert isinstance(build_dirs, state.BuildDirs), type(build_dirs) assert isinstance(generated, cpp.Generated), type(generated) assert language in ('python', 'csharp') # Find version of swig. (We use quotes around <swig> to make things work on # Windows.) try: t = jlib.system(f'"{swig_command}" -version', out='return') except Exception as e: if state_.windows: raise Exception( 'swig failed; on Windows swig can be auto-installed with: --swig-windows-auto' ) from e else: raise m = re.search('SWIG Version ([0-9]+)[.]([0-9]+)[.]([0-9]+)', t) assert m swig_major = int(m.group(1)) # Create a .i file for SWIG. # common = f''' #include <stdexcept> #include "mupdf/functions.h" #include "mupdf/classes.h" #include "mupdf/classes2.h" ''' if language == 'csharp': common += textwrap.dedent(f''' /* This is required otherwise compiling the resulting C++ code fails with: error: use of undeclared identifier 'SWIG_fail' But no idea whether it is the 'correct' thing to do; seems odd that SWIG doesn't define SWIG_fail itself. */ #define SWIG_fail throw std::runtime_error( e.what()); ''') if language == 'python': common += textwrap.dedent(f''' /* Support for extracting buffer data into a Python bytes. */ PyObject* buffer_extract_bytes(fz_buffer* buffer) {{ unsigned char* c = NULL; /* We mimic the affects of fz_buffer_extract(), which leaves the buffer with zero capacity. */ size_t len = mupdf::buffer_storage(buffer, &c); PyObject* ret = PyBytes_FromStringAndSize((const char*) c, (Py_ssize_t) len); if (ret) {{ mupdf::clear_buffer(buffer); mupdf::trim_buffer(buffer); }} return ret; }} /* Creates Python bytes from copy of raw data. */ PyObject* raw_to_python_bytes(const unsigned char* c, size_t len) {{ return PyBytes_FromStringAndSize((const char*) c, (Py_ssize_t) len); }} /* Creates Python bytes from copy of raw data. */ PyObject* raw_to_python_bytes(const void* c, size_t len) {{ return PyBytes_FromStringAndSize((const char*) c, (Py_ssize_t) len); }} /* The SWIG wrapper for this function returns a SWIG proxy for a 'const unsigned char*' pointing to the raw data of a python bytes. This proxy can then be passed from Python to functions that take a 'const unsigned char*'. For example to create a MuPDF fz_buffer* from a copy of a Python bytes instance: bs = b'qwerty' buffer_ = mupdf.new_buffer_from_copied_data(mupdf.python_bytes_data(bs), len(bs)) */ const unsigned char* python_bytes_data(const unsigned char* PYTHON_BYTES_DATA, size_t PYTHON_BYTES_SIZE) {{ return PYTHON_BYTES_DATA; }} /* Casts an integer to a pdf_obj*. Used to convert SWIG's int values for PDF_ENUM_NAME_* into PdfObj's. */ pdf_obj* obj_enum_to_obj(int n) {{ return (pdf_obj*) (intptr_t) n; }} /* SWIG-friendly alternative to ppdf_set_annot_color(). */ void ppdf_set_annot_color2(pdf_annot *annot, int n, float color0, float color1, float color2, float color3) {{ float color[] = {{ color0, color1, color2, color3 }}; return mupdf::ppdf_set_annot_color(annot, n, color); }} /* SWIG-friendly alternative to ppdf_set_annot_color(). */ void ppdf_set_annot_interior_color2(pdf_annot *annot, int n, float color0, float color1, float color2, float color3) {{ float color[] = {{ color0, color1, color2, color3 }}; return mupdf::ppdf_set_annot_color(annot, n, color); }} /* SWIG-friendly alternative to mfz_fill_text(). */ void mfz_fill_text2( mupdf::Device& dev, const mupdf::Text& text, mupdf::Matrix& ctm, const mupdf::Colorspace& colorspace, float color0, float color1, float color2, float color3, float alpha, mupdf::ColorParams& color_params ) {{ float color[] = {{color0, color1, color2, color3}}; return mfz_fill_text(dev, text, ctm, colorspace, color, alpha, color_params); }} std::vector<unsigned char> mfz_memrnd2(int length) {{ std::vector<unsigned char> ret(length); mupdf::mfz_memrnd(&ret[0], length); return ret; }} ''') common += textwrap.dedent(f''' /* SWIG-friendly alternative to fz_runetochar(). */ std::vector<unsigned char> runetochar2(int rune) {{ std::vector<unsigned char> buffer(10); int n = mupdf::runetochar((char*) &buffer[0], rune); assert(n < sizeof(buffer)); buffer.resize(n); return buffer; }} /* SWIG-friendly alternatives to fz_make_bookmark() and fz_lookup_bookmark(), using long long instead of fz_bookmark because SWIG appears to treat fz_bookmark as an int despite it being a typedef for intptr_t, so ends up slicing. */ long long unsigned make_bookmark2(fz_document* doc, fz_location loc) {{ fz_bookmark bm = mupdf::make_bookmark(doc, loc); return (long long unsigned) bm; }} long long unsigned mfz_make_bookmark2(fz_document* doc, fz_location loc) {{ return make_bookmark2(doc, loc); }} fz_location lookup_bookmark2(fz_document *doc, long long unsigned mark) {{ return mupdf::lookup_bookmark(doc, (fz_bookmark) mark); }} fz_location mfz_lookup_bookmark2(fz_document *doc, long long unsigned mark) {{ return lookup_bookmark2(doc, mark); }} struct convert_color2_dv {{ float dv0; float dv1; float dv2; float dv3; }}; /* SWIG-friendly alternative for fz_convert_color(). */ void convert_color2( fz_colorspace *ss, const float *sv, fz_colorspace *ds, convert_color2_dv* dv, fz_colorspace *is, fz_color_params params ) {{ mupdf::convert_color(ss, sv, ds, &dv->dv0, is, params); }} /* SWIG-friendly support for fz_set_warning_callback() and fz_set_error_callback(). */ struct SetWarningCallback {{ SetWarningCallback( void* user=NULL) {{ this->user = user; mupdf::set_warning_callback( s_print, this); }} virtual void print( const char* message) {{ }} static void s_print( void* self0, const char* message) {{ SetWarningCallback* self = (SetWarningCallback*) self0; return self->print( message); }} void* user; }}; struct SetErrorCallback {{ SetErrorCallback( void* user=NULL) {{ this->user = user; mupdf::set_error_callback( s_print, this); }} virtual void print( const char* message) {{ }} static void s_print( void* self0, const char* message) {{ SetErrorCallback* self = (SetErrorCallback*) self0; return self->print( message); }} void* user; }}; ''') common += generated.swig_cpp common += translate_ucdn_macros(build_dirs) text = '' if state_.windows: # 2022-02-24: Director classes break Windows builds at the moment. pass else: text += '%module(directors="1") mupdf\n' for i in generated.virtual_fnptrs: text += f'%feature("director") {i};\n' text += f'%feature("director") SetWarningCallback;\n' text += f'%feature("director") SetErrorCallback;\n' text += textwrap.dedent(''' %feature("director:except") { if ($error != NULL) { throw Swig::DirectorMethodException(); } } ''') for fnname in generated.c_functions: if fnname in ('pdf_annot_type', 'pdf_widget_type'): # These are also enums which we don't want to ignore. SWIGing the # functions is hopefully harmless. pass elif 0 and fnname == 'pdf_string_from_annot_type': # causes duplicate symbol with classes2.cpp and python. pass else: text += f'%ignore {fnname};\n' for i in ( 'fz_append_vprintf', 'fz_error_stack_slot', 'fz_format_string', 'fz_vsnprintf', 'fz_vthrow', 'fz_vwarn', 'fz_write_vprintf', ): text += f'%ignore {i};\n' text += f'%ignore m{i};\n' text += textwrap.dedent(f''' // Not implemented in mupdf.so: fz_colorspace_name_process_colorants %ignore fz_colorspace_name_process_colorants; %ignore fz_open_file_w; %ignore {util.rename.function('fz_append_vprintf')}; %ignore {util.rename.function('fz_error_stack_slot_s')}; %ignore {util.rename.function('fz_format_string')}; %ignore {util.rename.function('fz_vsnprintf')}; %ignore {util.rename.function('fz_vthrow')}; %ignore {util.rename.function('fz_vwarn')}; %ignore {util.rename.function('fz_write_vprintf')}; %ignore {util.rename.function('fz_vsnprintf')}; %ignore {util.rename.function('fz_vthrow')}; %ignore {util.rename.function('fz_vwarn')}; %ignore {util.rename.function('fz_append_vprintf')}; %ignore {util.rename.function('fz_write_vprintf')}; %ignore {util.rename.function('fz_format_string')}; %ignore {util.rename.function('fz_open_file_w')}; // SWIG can't handle this because it uses a valist. %ignore {util.rename.function('Memento_vasprintf')}; // asprintf() isn't available on windows, so exclude Memento_asprintf because // it is #define-d to asprintf. %ignore {util.rename.function('Memento_asprintf')}; // Might prefer to #include mupdf/exceptions.h and make the // %exception block below handle all the different exception types, // but swig-3 cannot parse 'throw()' in mupdf/exceptions.h. // // So for now we just #include <stdexcept> and handle // std::exception only. %include "typemaps.i" %include "cpointer.i" // This appears to allow python to call fns taking an int64_t. %include "stdint.i" %{{ {common} %}} %include exception.i %include std_string.i %include carrays.i %include cdata.i %include std_vector.i {"%include argcargv.i" if language=="python" else ""} %array_class(unsigned char, uchar_array); %include <cstring.i> %cstring_output_allocate(char **OUTPUT, free($1)); namespace std {{ %template(vectoruc) vector<unsigned char>; %template(vectori) vector<int>; %template(vectors) vector<std::string>; %template(vectorq) vector<mupdf::{util.rename.class_("fz_quad")}>; }}; // Make sure that operator++() gets converted to __next__(). // // Note that swig already seems to do: // // operator* => __ref__ // operator== => __eq__ // operator!= => __ne__ // operator-> => __deref__ // // Just need to add this method to containers that already have // begin() and end(): // def __iter__( self): // return CppIterator( self) // %rename(__increment__) *::operator++; %array_functions(unsigned char, bytes); ''') text += textwrap.dedent(f''' %exception {{ try {{ $action }} ''') if not state_.windows: # Directors not currently supported on Windows. text += textwrap.dedent(f''' catch (Swig::DirectorException &e) {{ SWIG_fail; }} ''') text += textwrap.dedent(f''' catch(std::exception& e) {{ SWIG_exception(SWIG_RuntimeError, e.what()); }} catch(...) {{ SWIG_exception(SWIG_RuntimeError, "Unknown exception"); }} }} ''') text += textwrap.dedent(f''' // Ensure SWIG handles OUTPUT params. // %include "cpointer.i" // Don't wrap raw fz_*() functions. %rename("$ignore", regexmatch$name="^fz_", %$isfunction, %$not %$ismember) ""; ''') if swig_major < 4: text += textwrap.dedent(f''' // SWIG version is less than 4 so swig is not able to copy // across comments from header file into generated code. The // next best thing is to use autodoc to make swig at least show // some generic information about arg types. // %feature("autodoc", "3"); ''') text += textwrap.dedent(f''' // Tell swig about pdf_clean_file()'s (int,argv)-style args: %apply (int ARGC, char **ARGV) {{ (int retainlen, char *retainlist[]) }} ''') if language == 'python': text += textwrap.dedent(''' %include pybuffer.i /* Convert Python bytes to (const unsigned char*, size_t) pair for python_bytes_data(). */ %pybuffer_binary(const unsigned char* PYTHON_BYTES_DATA, size_t PYTHON_BYTES_SIZE); ''') text += common if language == 'python': text += textwrap.dedent(f''' %pointer_functions(int, pint); %pythoncode %{{ def Document_lookup_metadata(self, key): """ Python implementation override of Document.lookup_metadata(). Returns string or None if not found. """ e = new_pint() ret = lookup_metadata(self.m_internal, key, e) e = pint_value(e) if e < 0: return None return ret Document.lookup_metadata = Document_lookup_metadata def PdfDocument_lookup_metadata(self, key): """ Python implementation override of PdfDocument.lookup_metadata(). Returns string or None if not found. """ e = new_pint() ret = ppdf_lookup_metadata(self.m_internal, key, e) e = pint_value(e) if e < 0: return None return ret PdfDocument.lookup_metadata = PdfDocument_lookup_metadata ''') if language == 'python': # Make some additions to the generated Python module. # # E.g. python wrappers for functions that take out-params should return # tuples. # text += generated.swig_python text += textwrap.dedent(''' import re # Wrap parse_page_range() to fix SWIG bug where a NULL return # value seems to mess up the returned list - we end up with ret # containing two elements rather than three, e.g. [0, 2]. This # occurs with SWIG-3.0; maybe fixed in SWIG-4? # w_parse_page_range = parse_page_range def parse_page_range(s, n): ret = w_parse_page_range(s, n) if len(ret) == 2: return None, 0, 0 else: return ret[0], ret[1], ret[2] # Provide native python implementation of format_output_path() (-> # fz_format_output_path). # def format_output_path( format, page): m = re.search( '(%[0-9]*d)', format) if m: ret = format[ :m.start(1)] + str(page) + format[ m.end(1):] else: dot = format.rfind( '.') if dot < 0: dot = len( format) ret = format[:dot] + str(page) + format[dot:] return ret class IteratorWrap: """ This is a Python iterator for containers that have C++-style begin() and end() methods that return iterators. Iterators must have the following methods: __increment__(): move to next item in the container. __ref__(): return reference to item in the container. Must also be able to compare two iterators for equality. """ def __init__( self, container): self.container = container self.pos = None self.end = container.end() def __iter__( self): return self def __next__( self): # for python2. if self.pos is None: self.pos = self.container.begin() else: self.pos.__increment__() if self.pos == self.end: raise StopIteration() return self.pos.__ref__() def next( self): # for python3. return self.__next__() # The auto-generated Python class method Buffer.buffer_extract() # returns (size, data). # # But these raw values aren't particularly useful to Python code so # we change the method to return a Python bytes instance instead, # using the special C function buffer_storage_bytes() defined # above. # # We make the original method available as # Buffer.buffer_extract_raw(); this can be used to create a # mupdf.Stream by passing the raw values back to C++ with: # # data, size = buffer_.buffer_extract_raw() # stream = mupdf.Stream(data, size)) # # We don't provide a similar wrapper for Buffer.buffer_storage() # because we can't create a Python bytes object that # points into the buffer's storage. We still provide # Buffer.buffer_storage_raw() just in case there is a need for # Python code that can pass the raw (data, size) back in to C. # Buffer.buffer_extract_raw = Buffer.buffer_extract def Buffer_buffer_extract(self): """ Returns buffer data as a Python bytes instance, leaving the buffer empty. Note that this will make a copy of the underlying data. """ return buffer_extract_bytes(self.m_internal) Buffer.buffer_extract = Buffer_buffer_extract Buffer.buffer_storage_raw = Buffer.buffer_storage #delattr(Buffer, 'buffer_storage') def Buffer_buffer_storage(self): raise Exception("Buffer.buffer_storage() is not available; use Buffer.buffer_storage_raw() to get (size, data) where <data> is SWIG wrapper for buffer's 'unsigned char*' storage") Buffer.buffer_storage = Buffer_buffer_storage # Overwrite Buffer.new_buffer_from_copied_data() to take Python Bytes instance. # def Buffer_new_buffer_from_copied_data(bytes_): buffer_ = new_buffer_from_copied_data(python_bytes_data(bytes_), len(bytes_)) return Buffer(buffer_) Buffer.new_buffer_from_copied_data = Buffer_new_buffer_from_copied_data def mpdf_dict_getl(obj, *tail): """ Python implementation of pdf_dict_getl(fz_context *ctx, pdf_obj *obj, ...), because SWIG doesn't handle variadic args. """ for key in tail: if not obj.m_internal: break obj = obj.dict_get(key) assert isinstance(obj, PdfObj) return obj PdfObj.dict_getl = mpdf_dict_getl def mpdf_dict_putl(obj, val, *tail): """ Python implementation of pdf_dict_putl(fz_context *ctx, pdf_obj *obj, pdf_obj *val, ...) because SWIG doesn't handle variadic args. """ if obj.is_indirect(): obj = obj.resolve_indirect_chain() if not obj.is_dict(): raise Exception(f'not a dict: {obj}') if not tail: return doc = obj.get_bound_document() for key in tail[:-1]: next_obj = obj.dict_get(key) if not next_obj.m_internal: # We have to create entries next_obj = doc.new_dict(1) obj.dict_put(key, next_obj) obj = next_obj key = tail[-1] obj.dict_put(key, val) PdfObj.dict_putl = mpdf_dict_putl def mpdf_dict_putl_drop(obj, *tail): raise Exception('mupdf.PdfObj.dict_putl_drop() is unsupported and unnecessary in Python because reference counting is automatic. Instead use mupdf.PdfObj.dict_putl()') PdfObj.dict_putl_drop = mpdf_dict_putl_drop def ppdf_set_annot_color(annot, color): """ Python implementation of pdf_set_annot_color() using ppdf_set_annot_color2(). """ if isinstance(color, float): ppdf_set_annot_color2(annot, 1, color, 0, 0, 0) elif len(color) == 1: ppdf_set_annot_color2(annot, 1, color[0], 0, 0, 0) elif len(color) == 2: ppdf_set_annot_color2(annot, 2, color[0], color[1], 0, 0) elif len(color) == 3: ppdf_set_annot_color2(annot, 3, color[0], color[1], color[2], 0) elif len(color) == 4: ppdf_set_annot_color2(annot, 4, color[0], color[1], color[2], color[3]) else: raise Exception( f'Unexpected color should be float or list of 1-4 floats: {color}') # Override PdfAnnot.set_annot_color() to use the above. def mpdf_set_annot_color(self, color): return ppdf_set_annot_color(self.m_internal, color) PdfAnnot.set_annot_color = mpdf_set_annot_color def ppdf_set_annot_interior_color(annot, color): """ Python version of pdf_set_annot_color() using ppdf_set_annot_color2(). """ if isinstance(color, float): ppdf_set_annot_interior_color2(annot, 1, color, 0, 0, 0) elif len(color) == 1: ppdf_set_annot_interior_color2(annot, 1, color[0], 0, 0, 0) elif len(color) == 2: ppdf_set_annot_interior_color2(annot, 2, color[0], color[1], 0, 0) elif len(color) == 3: ppdf_set_annot_interior_color2(annot, 3, color[0], color[1], color[2], 0) elif len(color) == 4: ppdf_set_annot_interior_color2(annot, 4, color[0], color[1], color[2], color[3]) else: raise Exception( f'Unexpected color should be float or list of 1-4 floats: {color}') # Override PdfAnnot.set_interiorannot_color() to use the above. def mpdf_set_annot_interior_color(self, color): return ppdf_set_annot_interior_color(self.m_internal, color) PdfAnnot.set_annot_interior_color = mpdf_set_annot_interior_color # Override mfz_fill_text() to handle color as a Python tuple/list. def mfz_fill_text(dev, text, ctm, colorspace, color, alpha, color_params): """ Python version of mfz_fill_text() using mfz_fill_text2(). """ color = tuple(color) + (0,) * (4-len(color)) assert len(color) == 4, f'color not len 4: len={len(color)}: {color}' return mfz_fill_text2(dev, text, ctm, colorspace, *color, alpha, color_params) Device.fill_text = mfz_fill_text # Override set_warning_callback() and set_error_callback() to # use Python classes derived from our SWIG Director classes # SetWarningCallback and SetErrorCallback (defined in C), so # that fnptrs can call Python code. # set_warning_callback_s = None set_error_callback_s = None def set_warning_callback2( printfn): class Callback( SetWarningCallback): def print( self, message): printfn( message) global set_warning_callback_s set_warning_callback_s = Callback() # Override set_error_callback(). def set_error_callback2( printfn): class Callback( SetErrorCallback): def print( self, message): printfn( message) global set_error_callback_s set_error_callback_s = Callback() set_warning_callback = set_warning_callback2 set_error_callback = set_error_callback2 ''') # Add __iter__() methods for all classes with begin() and end() methods. # for classname in generated.container_classnames: text += f'{classname}.__iter__ = lambda self: IteratorWrap( self)\n' # For all wrapper classes with a to_string() method, add a __str__() # method to the underlying struct's Python class, which calls # to_string_<structname>(). # # E.g. this allows Python code to print a mupdf.fz_rect instance. # # [We could instead call our generated to_string() and rely on overloading, # but this will end up switching on the type in the SWIG code.] # for struct_name in generated.to_string_structnames: text += f'{struct_name}.__str__ = lambda s: to_string_{struct_name}(s)\n' # For all wrapper classes with a to_string() method, add a __str__() method # to the Python wrapper class, which calls the class's to_string() method. # # E.g. this allows Python code to print a mupdf.Rect instance. # for struct_name in generated.to_string_structnames: text += f'{util.rename.class_(struct_name)}.__str__ = lambda self: self.to_string()\n' text += '%}\n' if 1: # lgtm [py/constant-conditional-expression] # This is a horrible hack to avoid swig failing because # include/mupdf/pdf/object.h defines an enum which contains a #include. # # Would like to pre-process files in advance so that swig doesn't see # the #include, but this breaks swig in a different way - swig cannot # cope with some code in system headers. # # So instead we copy include/mupdf/pdf/object.h into # {build_dirs.dir_mupdf}/platform/python/include/mupdf/pdf/object.h, # manually expanding the #include using a Python .replace() call. Then # we specify {build_dirs.dir_mupdf}/platform/python/include as the # first include path so that our modified mupdf/pdf/object.h will get # included in preference to the original. # os.makedirs( f'{build_dirs.dir_mupdf}/platform/python/include/mupdf/pdf', exist_ok=True) with open(f'{build_dirs.dir_mupdf}/include/mupdf/pdf/object.h') as f: o = f.read() with open( f'{build_dirs.dir_mupdf}/include/mupdf/pdf/name-table.h') as f: name_table_h = f.read() oo = o.replace('#include "mupdf/pdf/name-table.h"\n', name_table_h) assert oo != o jlib.update_file( oo, f'{build_dirs.dir_mupdf}/platform/python/include/mupdf/pdf/object.h' ) swig_i = f'{build_dirs.dir_mupdf}/platform/{language}/mupdfcpp_swig.i' include1 = f'{build_dirs.dir_mupdf}/include/' include2 = f'{build_dirs.dir_mupdf}/platform/c++/include' swig_cpp = f'{build_dirs.dir_mupdf}/platform/{language}/mupdfcpp_swig.cpp' swig_py = f'{build_dirs.dir_so}/mupdf.py' os.makedirs(f'{build_dirs.dir_mupdf}/platform/{language}', exist_ok=True) os.makedirs(f'{build_dirs.dir_so}', exist_ok=True) util.update_file_regress(text, swig_i, check_regress) # Try to disable some unhelpful SWIG warnings;. unfortunately this doesn't # seem to have any effect. disable_swig_warnings = [ 201, # Warning 201: Unable to find 'stddef.h' 314, # Warning 314: 'print' is a python keyword, renaming to '_print' 312, # Warning 312: Nested union not currently supported (ignored). 321, # Warning 321: 'max' conflicts with a built-in name in python 362, # Warning 362: operator= ignored 451, # Warning 451: Setting a const char * variable may leak memory. 503, # Warning 503: Can't wrap 'operator <<' unless renamed to a valid identifier. 512, # Warning 512: Overloaded method mupdf::DrawOptions::internal() const ignored, using non-const method mupdf::DrawOptions::internal() instead. ] disable_swig_warnings = map(str, disable_swig_warnings) disable_swig_warnings = '-w' + ','.join(disable_swig_warnings) if language == 'python': # Need -D_WIN32 on Windows because as of 2022-03-17, C++ code for # SWIG Directors support doesn't work on Windows so is inside #ifndef # _WIN32...#endif. # # Maybe use '^' on windows as equivalent to unix '\\' for multiline # ending? command = (textwrap.dedent(f''' "{swig_command}" {"-D_WIN32" if state_.windows else ""} -Wall -c++ {"-doxygen" if swig_major >= 4 else ""} -python {disable_swig_warnings} -module mupdf -outdir {os.path.relpath(build_dirs.dir_so)} -o {os.path.relpath(swig_cpp)} -includeall -I{os.path.relpath(build_dirs.dir_mupdf)}/platform/python/include -I{os.path.relpath(include1)} -I{os.path.relpath(include2)} -ignoremissing {os.path.relpath(swig_i)} ''').strip().replace('\n', "" if state_.windows else "\\\n")) rebuilt = jlib.build( (swig_i, include1, include2), (swig_cpp, swig_py), command, force_rebuild, ) jlib.log('{rebuilt=}') if rebuilt: swig_py_tmp = f'{swig_py}-' jlib.remove(swig_py_tmp) os.rename(swig_py, swig_py_tmp) with open(swig_py_tmp) as f: swig_py_content = f.read() if state_.openbsd: # Write Python code that will automatically load the required # .so's when mupdf.py is imported. Unfortunately this doesn't # work on Linux. prefix = textwrap.dedent(f''' import ctypes import os import importlib # The required .so's are in the same directory as this # Python file. On OpenBSD we can explicitly load these # .so's here using ctypes.cdll.LoadLibrary(), which # avoids the need for LD_LIBRARY_PATH to be defined. # # Unfortunately this doesn't work on Linux. # for leaf in ('libmupdf.so', 'libmupdfcpp.so', '_mupdf.so'): path = os.path.abspath(f'{{__file__}}/../{{leaf}}') #print(f'path={{path}}') #print(f'exists={{os.path.exists(path)}}') ctypes.cdll.LoadLibrary( path) #print(f'have loaded {{path}}') ''') swig_py_content = prefix + swig_py_content elif state_.windows: jlib.log('Adding prefix to {swig_cpp=}') prefix = '' postfix = '' with open(swig_cpp) as f: swig_py_content = prefix + swig_py_content + postfix # Change all our PDF_ENUM_NAME_* enums so that they are actually # PdfObj instances so that they can be used like any other PdfObj. # jlib.log('{len(generated.c_enums)=}') for enum_type, enum_names in generated.c_enums.items(): for enum_name in enum_names: if enum_name.startswith('PDF_ENUM_NAME_'): swig_py_content += f'{enum_name} = PdfObj( obj_enum_to_obj( {enum_name}))\n' with open(swig_py_tmp, 'w') as f: f.write(swig_py_content) os.rename(swig_py_tmp, swig_py) elif language == 'csharp': outdir = os.path.relpath(f'{build_dirs.dir_mupdf}/platform/csharp') os.makedirs(outdir, exist_ok=True) # Looks like swig comes up with 'mupdfcpp_swig_wrap.cxx' leafname. # # We include platform/python/include in order to pick up the modified # include/mupdf/pdf/object.h that we generate elsewhere. dllimport = 'mupdfcsharp.so' if state_.windows: # Would like to specify relative path to .dll with: # dllimport = os.path.relpath( f'{build_dirs.dir_so}/mupdfcsharp.dll') # but Windows/.NET doesn't seem to support this, despite # https://stackoverflow.com/questions/31807289 "how can i add a # swig generated c dll reference to a c sharp project". # dllimport = 'mupdfcsharp.dll' command = (textwrap.dedent(f''' "{swig_command}" {"-D_WIN32" if state_.windows else ""} -Wall -c++ -csharp {disable_swig_warnings} -module mupdf -namespace mupdf -dllimport {dllimport} -outdir {outdir} -outfile mupdf.cs -o {os.path.relpath(swig_cpp)} -includeall -I{os.path.relpath(build_dirs.dir_mupdf)}/platform/python/include -I{os.path.relpath(include1)} -I{os.path.relpath(include2)} -ignoremissing {os.path.relpath(swig_i)} ''').strip().replace('\n', "" if state_.windows else "\\\n")) rebuilt = jlib.build( (swig_i, include1, include2), (f'{outdir}/mupdf.cs', os.path.relpath(swig_cpp)), command, force_rebuild, ) # fixme: use <rebuilt> line with language=='python' to avoid multiple # modifications to unchanged mupdf.cs? # # For classes that have our to_string() method, override C#'s # ToString() to call to_string(). with open(f'{outdir}/mupdf.cs') as f: cs = f.read() cs2 = re.sub( '(( *)public string to_string[(][)])', '\\2public override string ToString() { return to_string(); }\n\\1', cs, ) jlib.log('{len(cs)=}') jlib.log('{len(cs2)=}') assert cs2 != cs, f'Failed to add toString() methods.' jlib.log('{len(generated.swig_csharp)=}') assert len(generated.swig_csharp) cs2 += generated.swig_csharp jlib.update_file(cs2, f'{build_dirs.dir_so}/mupdf.cs') #jlib.copy(f'{outdir}/mupdf.cs', f'{build_dirs.dir_so}/mupdf.cs') jlib.log('{rebuilt=}') else: assert 0
def _try_init_clang(self, version): if state_.openbsd: clang_bin = glob.glob(f'/usr/local/bin/clang-{version}') if not clang_bin: jlib.log('Cannot find {clang_bin=}', 1) return clang_bin = clang_bin[0] self.clang_version = version libclang_so = glob.glob(f'/usr/local/lib/libclang.so*') assert len(libclang_so) == 1 self.libclang_so = libclang_so[0] self.resource_dir = jlib.system( f'{clang_bin} -print-resource-dir', out='return', ).strip() self.include_path = os.path.join(self.resource_dir, 'include') #logx('{self.libclang_so=} {self.resource_dir=} {self.include_path=}') if os.environ.get('VIRTUAL_ENV'): clang.cindex.Config.set_library_file(self.libclang_so) return True for p in os.environ.get('PATH').split(':'): clang_bins = glob.glob(os.path.join(p, f'clang-{version}*')) if not clang_bins: continue clang_bins.sort() for clang_bin in clang_bins: e, clang_search_dirs = jlib.system( f'{clang_bin} -print-search-dirs', #verbose=log, out='return', raise_errors=False, ) if e: jlib.log('[could not find {clang_bin}: {e=}]') return if version == 10: m = re.search('\nlibraries: =(.+)\n', clang_search_dirs) assert m clang_search_dirs = m.group(1) clang_search_dirs = clang_search_dirs.strip().split(':') for i in ['/usr/lib', '/usr/local/lib'] + clang_search_dirs: for leaf in f'libclang-{version}.*so*', f'libclang.so.{version}.*': p = os.path.join(i, leaf) p = os.path.abspath(p) jlib.log('{p=}') libclang_so = glob.glob(p) if not libclang_so: continue # We have found libclang.so. self.libclang_so = libclang_so[0] jlib.log('Using {self.libclang_so=}') clang.cindex.Config.set_library_file(self.libclang_so) self.resource_dir = jlib.system( f'{clang_bin} -print-resource-dir', out='return', ).strip() self.include_path = os.path.join( self.resource_dir, 'include') self.clang_version = version return True
import re import sys import jlib try: try: import clang.cindex except ModuleNotFoundError as e: # On devuan, clang-python isn't on python3's path, but python2's # clang-python works fine with python3, so we deviously get the path by # running some python 2. # e, clang_path = jlib.system( 'python2 -c "import clang; print clang.__path__[0]"', out='return', raise_errors=0) if e == 0: jlib.log( 'Retrying import of clang using info from python2 {clang_path=}' ) sys.path.append(os.path.dirname(clang_path)) import clang.cindex else: raise except Exception as e: jlib.log( 'Warning: failed to import clang.cindex: {e=}\n' f'We need Clang Python to build MuPDF python.\n'
def extract( extract_text_exe, mupdf_shared_dir, path_template, path_in, valgrind, squeeze, failat, method, ): ''' Extracts text, and compares .docx's word/document.xml if reference file exists. ''' log(f'Doing text extraction with {path_in}, method={method}') path_out = f'{path_in}-{method}.docx' path_content = f'{path_out}.content.xml' path_intermediate = f'{path_in}.intermediate.xml' executable = None command = None if method == 'trace': command = f'build/debug/mutool draw -F trace -o {path_intermediate} {path_in}' #jlib.system( command, out=log, verbose=1, prefix=' ') elif method == 'raw' or method == 'stext': command = f'build/debug/mutool draw -F raw -o {path_intermediate} {path_in}' # Run mutool.py to get intermediate xml. #command = '' #command += f'LD_LIBRARY_PATH={mupdf_shared_dir} PYTHONPATH={mupdf_shared_dir}' #command += f' scripts/mutool.py draw -F raw -o {path_intermediate} {path_in}' else: assert 0 if command: jlib.system( command, out=log, verbose=1, prefix=' ') command = '' command += ('' f' LD_LIBRARY_PATH=/home/jules/artifex/libbacktrace/.libs' f' MEMENTO_HIDE_MULTIPLE_REALLOCS=1' ) if squeeze: command += f' {squeeze}' if failat: command += f' {failat}' if valgrind: command += f' valgrind --leak-check=full' command += ( f' ./{extract_text_exe}' f' -i {path_intermediate}' f' -t {path_template}' f' -p 1' # preserve .docx temporary directory. f' -c {path_content}' f' -m {method}' f' -o {path_out}' ) if squeeze: command += ' 2>&1 | tee >(perl ../ghostpdl/toolbin/squeeze2html.pl | gzip -9 -c > squeeze.html.gz) | grep "Memory squeezing @"' executable='bash' jlib.system( command, out=log, verbose=1, prefix=' ', executable=executable) path_content = f'{path_out}.content.xml' path_content_ref = f'{path_out}.content.ref.xml' if os.path.exists(path_content_ref): jlib.system(f'diff -u {path_content_ref} {path_content}', out=log, verbose=1, prefix=' ') else: log(f'*** No reference content {path_content_ref} to compare with generated {path_content}. os.getcwd()={os.getcwd()}') path_document_xml = f'{path_out}.dir/word/document.xml' path_document_xml_ref = f'{path_out}.word.document.ref.xml' if os.path.exists(path_document_xml_ref): jlib.system(f'diff -u {path_document_xml_ref} {path_document_xml}', out=log, verbose=1, prefix=' ') else: log(f'*** No reference document {path_document_xml_ref} to compare with generated {path_document_xml}')
def test(mupdf_shared_dir, so_build, valgrind, squeeze, failat): if so_build: with jlib.LogPrefixScope('building mupdf.so: '): # Build mupdf.so and python wrapper. # command = f'./scripts/mupdfwrap.py -d build/shared-debug -b {so_build}' jlib.system( command, out=log, verbose=1, prefix=' ') with jlib.LogPrefixScope('building extract_text.exe: '): # Build extract_text.exe. # extract_text_c = 'source/tools/extract_text.c' extract_text_cc = 'source/tools/extract_text.c.c' extract_text_exe = 'extract_text.c.exe' memento_c = 'source/fitz/memento.c' memento_cc = 'source/fitz/memento.cc' if 0: jlib.build( extract_text_c, extract_text_cc, f'cc -E -dD -g -o {memento_cc} -DMEMENTO {memento_c} -pthread -I include -I /usr/local/include -W -Wall -Wno-unused-parameter -Wno-unused-variable -Wno-unused-function build/shared-debug/libmupdf.so -Wl,--export-dynamic -L /usr/local/lib -lm -lexecinfo', out=log, ) jlib.build( extract_text_c, extract_text_cc, f'cc -E -dD -g -o {extract_text_cc} -DMEMENTO {extract_text_c} -pthread -I include -I /usr/local/include -W -Wall -Wno-unused-parameter -Wno-unused-variable -Wno-unused-function build/shared-debug/libmupdf.so -Wl,--export-dynamic -L /usr/local/lib -lm -lexecinfo', out=log, ) command = ( f'cc -g' f' -o {extract_text_exe}' f' -DMEMENTO' f' {extract_text_c}' f' source/fitz/memento.c' f' -pthread' f' -I include' f' -I /usr/local/include' f' -W -Wall -Wno-unused-parameter -Wno-unused-variable -Wno-unused-function' f' build/shared-debug/libmupdf.so' f' -lm' ) if os.uname()[0] == 'OpenBSD': command += ' -Wl,--export-dynamic -L /usr/local/lib -lexecinfo' else: command += ' -DHAVE_LIBDL -ldl' jlib.build( (extract_text_c, memento_c), extract_text_exe, command, out=log, ) # Extract text from various input files. # path_template = '../Untitled1.docx' assert os.path.isfile(path_template), '*** We require an empty .docx document template called %s' % path_template for in_pdf in ( f'{mupdf_root}/../ghostpdl/zlib/zlib.3.pdf', f'{mupdf_root}/../Python2.pdf', ): in_pdf_rel = os.path.relpath(in_pdf) # 2020-07-27: trace broken by recent changes. # for method in 'raw', 'stext', 'trace': # for method in 'raw', 'stext': with jlib.LogPrefixScope(f'{in_pdf_rel} method={method}: '): extract( extract_text_exe, mupdf_shared_dir, path_template, in_pdf_rel, valgrind=valgrind, squeeze=squeeze, failat=failat, method=method, ) log( 'finished')