def create_from_file(cls, f: io.BufferedReader, doc):
    char = f.peek(1)[0:1]  # TODO:
    if char == b't' or char == b'f':
        return PdfBooleanObject.create_from_file(f)
    elif char in [b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7', b'8', b'9', b'+', b'-', b'.']:
        o = f.tell()
        n = PdfNumericObject.create_from_file(f)
        if n.value < 0 or n.value - int(n.value) != 0:
            # a decimal or a negative number, never an indirect obj
            return n
        o2 = f.tell()
        utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
        n2, _ = utils.read_until(f, syntax.DELIMS + syntax.WHITESPACES)
        if re.match(rb'\d+$', n2) is None:
            # next token not a number, never an indirect obj
            f.seek(o2, io.SEEK_SET)
            return n
        else:
            utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
            s, _ = utils.read_until(f, syntax.DELIMS + syntax.WHITESPACES)
            if s == b'obj':
                # all 3 tokens are correct, an indirect obj
                f.seek(o, io.SEEK_SET)
                return PdfIndirectObject.create_from_file(f, doc)
            elif s == b'R':
                # all 3 tokens are correct, an indirect reference
                f.seek(o, io.SEEK_SET)
                return PdfReferenceObject.create_from_file(f, doc)
            else:
                f.seek(o2, io.SEEK_SET)
                return n
    elif char == b'(':
        return PdfLiteralStringObject.create_from_file(f)
    elif char == b'<':
        char = utils.peek_at_least(f, 2)[0:2]
        if char == b'<<':
            dictobj = PdfDictionaryObject.create_from_file(f, doc)
            return dictobj
        else:
            return PdfHexStringObject.create_from_file(f)
    elif char == b'/':
        return PdfNameObject.create_from_file(f)
    elif char == b'[':
        return PdfArrayObject.create_from_file(f, doc)
    elif char == b'n':
        return PdfNullObject.create_from_file(f)
    else:
        raise Exception(f'Unknown token at {f.tell()}')
def authenticate_user(self, user, password):
    if user == "":
        user = self.myuid
    need_su = self.myuid != user
    if not utils.check_ssh(user, self.hutch):
        if self.model.userIO != None:
            try:
                os.close(self.model.userIO)
            except:
                pass
            self.model.userIO = None
        # a span of the source is redacted ("******") in the following statement;
        # only the tail of the redacted block (the forked child's exec branches) is visible
        self.ui.userLabel.setText("User: "******"/usr/bin/ssh",
                      ["ssh", user + "@" + utils.COMMITHOST, "/bin/tcsh", "-if"])
            else:
                os.execv("/usr/bin/ssh", ["ssh", utils.COMMITHOST, "/bin/tcsh", "-if"])
        except:
            pass
        print "Say what? execv failed?"
        sys.exit(0)
    l = utils.read_until(fd, "(assword:|> )").group(1)
    if l != "> ":
        os.write(fd, password + "\n")
        l = utils.read_until(fd, "> ")
    if utils.KINIT != None and password != "":
        os.write(fd, utils.KINIT + "\n")
        l = utils.read_until(fd, ": ")
        os.write(fd, password + "\n")
        l = utils.read_until(fd, "> ")
    self.model.user = user
    if self.model.userIO != None:
        try:
            os.close(self.model.userIO)
        except:
            pass
    self.model.userIO = fd
    if need_su:
        self.utimer.start(10 * 60000)  # Let's go for 10 minutes.
    self.ui.userLabel.setText("User: " + user)
def create_from_file(cls, f: io.BufferedReader):
    org_pos = f.tell()
    token: bytes = f.read(1)
    if token != b'<':
        f.seek(org_pos, io.SEEK_SET)
        raise Exception(
            f'Parse Error: Not a valid hexadecimal string at offset {org_pos}.')
    token, endtoken = utils.read_until(f, [b'>'])
    if re.match(br'^[0-9A-Fa-f]*$', token) is None or endtoken != b'>':
        f.seek(org_pos, io.SEEK_SET)
        raise Exception(
            f'Parse Error: Not a valid hexadecimal string at offset {org_pos}.')
    if len(token) == 0:
        return PdfHexStringObject(b'')
    else:
        f.read(1)  # read '>'
        # bytes.fromhex only accepts str. Moreover, we need to cater for odd length
        token = token.decode('iso-8859-1')
        if len(token) % 2 != 0:
            # PDF Reference 3.2.3, Hexadecimal Strings: if there is an odd number
            # of digits, the final digit is assumed to be 0
            token += '0'
        return PdfHexStringObject(bytes.fromhex(token))
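# A minimal, self-contained illustration of the odd-length padding rule applied
# above (PDF Reference 3.2.3: a missing final hex digit is assumed to be 0).
# The value '901FA' is only an example, not taken from the parser itself.
token = '901FA'
if len(token) % 2 != 0:
    token += '0'
assert bytes.fromhex(token) == b'\x90\x1f\xa0'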
def create_from_file(cls, f: io.BufferedReader):
    org_pos = f.tell()
    token: bytes = f.read(1)
    if token != b'/':
        raise Exception(
            f'Parse Error: Not a valid name object at offset {org_pos}.')
    result, _ = utils.read_until(f, syntax.DELIMS + syntax.WHITESPACES)
    return PdfNameObject(result)
def create_from_file(cls, f: io.BufferedReader):
    org_pos = f.tell()
    token, _ = utils.read_until(f, syntax.DELIMS + syntax.WHITESPACES)
    # group the two alternatives so that ^ and $ anchor both forms of a number
    if re.match(br'^(?:[+-]?\d+(?:\.\d*)?|[+-]?\.\d+)$', token) is None:
        f.seek(org_pos, io.SEEK_SET)
        raise Exception(
            f'Parse Error: Not a valid Numeric object at offset {org_pos}.')
    return PdfNumericObject(Decimal(token.decode('iso-8859-1')))
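# Quick standalone check of the grouped numeric pattern used above; the sample
# tokens here are illustrative only.
import re
_NUM = re.compile(br'^(?:[+-]?\d+(?:\.\d*)?|[+-]?\.\d+)$')
assert _NUM.match(b'-3.62') and _NUM.match(b'+17') and _NUM.match(b'.5')
assert _NUM.match(b'12.3.4') is None  # trailing garbage is rejected once anchored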
def get_xref_trailer_at_offset(self, f, offset):
    # read xref; the trailer should directly follow, and MUST be read TOGETHER with the xref
    # a linearized PDF specifies that the last appearing trailer DOES NOT have a Prev entry,
    # and startxref points to the first-page xref table near the start of the file,
    # which has its own trailer, making the last trailer technically the 'first' trailer
    # therefore, searching for a trailer dict from the end of the file would get the wrong trailer dict
    # moreover, in an xref stream, the xref and trailer dict are lumped together as the stream object
    if offset in self.offset_xref_trailer:
        return self.offset_xref_trailer[offset]
    f.seek(offset, io.SEEK_SET)
    temp, _ = utils.read_until(f, syntax.EOL)
    f.seek(offset, io.SEEK_SET)
    # TODO: catch exception for parsing PdfXRefSection
    if temp == b'xref':
        # uncompressed xref section
        utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
        xref_section = PdfXRefSection(f)
        # find trailer dict and Prev
        # trailer dict CAN contain references
        utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
        temp, _ = utils.read_until(f, syntax.EOL)
        if temp == b'trailer':
            utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
            trailer_dict = PdfDictionaryObject.create_from_file(f, self)
            self.offset_xref_trailer[offset] = (xref_section, trailer_dict)
        else:
            # TODO: check for objects between xref and trailer dict, and between trailer dict and startxref?
            raise Exception(
                f'trailer dict not found after xref table at {f.tell() - 7}')
    else:
        # may be a compressed xref stream
        # trailer dict IS the stream dict, and CANNOT contain references
        try:
            xref_stream = PdfIndirectObject.create_from_file(f, self)
        except Exception as ex:
            raise Exception('Invalid xref stream') from ex
        xref_section = PdfXRefSection.from_xrefstm(xref_stream)
        self.offset_xref_trailer[offset] = (xref_section, xref_stream.value.dict)
    return self.offset_xref_trailer[offset]
def create_from_file(cls, f: io.BufferedReader):
    org_pos = f.tell()
    token, _ = utils.read_until(f, syntax.DELIMS + syntax.WHITESPACES)
    if token == b'true':
        return PdfBooleanObject(True)
    elif token == b'false':
        return PdfBooleanObject(False)
    else:
        f.seek(org_pos, io.SEEK_SET)
        raise Exception(
            f'Parse Error: Not a valid Boolean object at offset {org_pos}.')
def create_from_file(cls, f: io.BufferedReader, doc):
    org_pos = f.tell()
    num, _ = utils.read_until(f, syntax.WHITESPACES)
    if re.match(rb'\d+$', num) is None:
        f.seek(org_pos, io.SEEK_SET)
        raise Exception(
            f'Parse Error: Not a valid reference at offset {org_pos}.')
    obj_no = int(num)
    utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
    num, _ = utils.read_until(f, syntax.WHITESPACES)
    if re.match(rb'\d+$', num) is None:
        f.seek(org_pos, io.SEEK_SET)
        raise Exception(
            f'Parse Error: Not a valid reference at offset {org_pos}.')
    gen_no = int(num)
    utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
    tok, _ = utils.read_until(f, syntax.DELIMS + syntax.WHITESPACES)
    if tok != b'R':
        f.seek(org_pos, io.SEEK_SET)
        raise Exception(
            f'Parse Error: Not a valid reference at offset {org_pos}.')
    return PdfReferenceObject(doc, obj_no, gen_no)
def __init__(self, f):
    '''Initialize a PdfXRefSection from an opened PDF file f.
    The file object's current position should be at the beginning of the line
    with the sole keyword 'xref'
    '''
    self.subsections = []
    org_pos = f.tell()
    s, eol_marker = utils.read_until(f, syntax.EOL)
    if s != b'xref':
        f.seek(org_pos, io.SEEK_SET)
        raise Exception(
            f"cross-reference section should begin with keyword 'xref' at offset {org_pos}"
        )
    f.seek(len(eol_marker), io.SEEK_CUR)
    # Following the 'xref' line are one or more cross-reference subsections
    while True:
        s, eol_marker = utils.read_until(f, syntax.EOL)
        matches = re.match(rb'^\s*(\d+)\s+(\d+)\s*$', s)
        f.seek(-len(s), io.SEEK_CUR)
        if matches:
            # start of subsection
            self.subsections += [PdfXRefSubSection(f)]
        else:
            break
def parse_normal(self, f, progress_cb=None):
    '''Initialize a PdfDocument from an opened PDF file f by reading xref and trailers.
    After this is called, offset_obj, offset_obj_streams, compressed_obj,
    offset_xref_trailer, and all xref sections are ready'''
    f.seek(0, io.SEEK_SET)
    filesize = os.fstat(f.fileno()).st_size
    # First line is header
    s, eol_marker = utils.read_until(f, syntax.EOL)
    header = re.match(rb'%PDF-(\d+\.\d+)', s)
    if header:
        self.version = Decimal(header.group(1).decode('iso-8859-1'))
        f.seek(len(eol_marker), io.SEEK_CUR)
    else:
        raise Exception('Not a PDF file')
    # read from end of file, find xref
    eof_found = -1
    startxref_found = -1
    temp_line = b''
    temp_count = 2
    temp_offset = 0
    for line in utils.rlines(f):
        temp_offset -= len(line)
        if line.rstrip() == b'%%EOF':
            eof_found = temp_offset
        if eof_found != -1 and temp_count == 0:
            if line.rstrip() == b'startxref':
                startxref_found = temp_offset
                break
            else:
                raise Exception(
                    'startxref not found at 2 lines before EOF marker')
        elif eof_found != -1:
            temp_count -= 1
            temp_line = line
    xref_offset = int(temp_line.decode('iso-8859-1'))
    self.startxref = xref_offset
    # The only required part for a trailer (and what marks the end of an increment)
    # is startxref and %%EOF
    self.increments[-1]['startxref'] = xref_offset
    self.increments[-1]['eof'] = True
    inuse_count = 0
    while True:
        f.seek(xref_offset, io.SEEK_SET)
        xref_section, trailer = self.get_xref_trailer_at_offset(f, xref_offset)
        self.offset_xref_trailer[xref_offset] = (xref_section, trailer)
        for subsec in xref_section.subsections:
            inuse_count += len(subsec.inuse_entry)
        self.increments[0]['xref_section'] = xref_section
        self.increments[0]['trailer'] = trailer
        if trailer.get('Prev') is None:
            break
        if trailer['Prev'].value - int(trailer['Prev'].value) != 0:
            raise Exception(
                f'Prev must be an integer, in trailer dict at offset {xref_offset}')
        xref_offset = int(trailer['Prev'].value)  # must not be indirect
        self.increments = [{
            'body': [],
            'xref_section': None,
            'trailer': None,
            'startxref': None,
            'eof': False
        }] + self.increments
        self.increments[0]['startxref'] = xref_offset
    self.ready = True
    inuse_parsed_count = 0
    # parse each in-use obj num
    for inc in self.increments:
        for subsec in inc['xref_section'].subsections:
            for entry in subsec.inuse_entry:
                if entry.get('compressed'):
                    inuse_parsed_count += 1
                    continue
                offset = entry['offset']
                f.seek(offset, io.SEEK_SET)
                new_obj = PdfObject.create_from_file(f, self)
                if not isinstance(new_obj, PdfIndirectObject) \
                        or new_obj.obj_no != entry['obj_no'] \
                        or new_obj.gen_no != entry['gen_no']:
                    raise Exception(
                        f'Invalid obj referenced by xref at offset {offset}')
                self.offset_obj[offset] = new_obj
                if isinstance(new_obj.value, PdfStreamObject) and \
                        new_obj.value.dict.get('Type') == 'ObjStm':
                    self.offset_obj_streams[offset] = new_obj
                inuse_parsed_count += 1
                print('', end="\r")
                print(f'{inuse_parsed_count / inuse_count * 100:5.2f}% processed',
                      end='', flush=True)
                if progress_cb is not None:
                    progress_cb(
                        f'{inuse_parsed_count / inuse_count * 100:5.2f}% processed',
                        read=inuse_parsed_count, total=inuse_count)
    print('Decoding object streams...')
    if progress_cb is not None:
        progress_cb('Decoding object streams...',
                    read=inuse_parsed_count, total=inuse_count)
    for k in self.offset_obj_streams:
        from objstm import decode_objstm
        self.compressed_obj = {
            **(self.compressed_obj),
            **(decode_objstm(self.offset_obj_streams[k], self))
        }
    print('', end="\r")
    print('100% processed      ')
    if progress_cb is not None:
        progress_cb('100% processed', read=inuse_parsed_count, total=inuse_count)
    print('Done')
    if progress_cb is not None:
        progress_cb('Done', read=inuse_parsed_count, total=inuse_count)
def create_from_file(cls, f: io.BufferedReader):
    org_pos = f.tell()
    token: bytes = f.read(1)
    if token != b'(':
        f.seek(org_pos, io.SEEK_SET)
        raise Exception(
            f'Parse Error: Not a valid string at offset {org_pos}.')
    result = bytearray(b'')
    stack = 1  # 1 for initial (
    while True:
        # Balanced pairs of parentheses within a string require no special treatment.
        # backslash ( \ ) is used as an escape character for various purposes,
        # such as to include ... unbalanced parentheses ...
        token, endtoken = utils.read_until(f, [b'(', b')', b'\\(', b'\\)'])
        result.extend(token)
        if endtoken in [b'\\(', b'\\)']:
            # escaped, read it for now
            f.seek(2, io.SEEK_CUR)
            result.extend(endtoken)
            continue
        elif endtoken == b'(':
            # open bracket, stack += 1, read it for now
            stack += 1
            f.seek(1, io.SEEK_CUR)
            result.extend(endtoken)
            continue
        elif endtoken == b')':
            # close bracket, stack -= 1, read it for now if string not done
            stack -= 1
            f.seek(1, io.SEEK_CUR)
            if stack == 0:
                # string is done
                break
            else:
                result.extend(endtoken)
                continue
        elif endtoken == b'' and stack > 0:
            f.seek(org_pos, io.SEEK_SET)
            raise Exception(
                f'Parse Error: Not a valid string at offset {org_pos}.')
    # If a string is too long to be conveniently placed on a single line, it may be split
    # across multiple lines by using the backslash character at the end of a line to
    # indicate that the string continues on the following line. The backslash and the
    # end-of-line marker following it are not considered part of the string.
    result = result.replace(b'\\\r\n', b'')
    result = result.replace(b'\\\r', b'')
    result = result.replace(b'\\\n', b'')
    # If an end-of-line marker appears within a literal string without a preceding
    # backslash, the result is equivalent to \n (regardless of whether the end-of-line
    # marker was a carriage return, a line feed, or both).
    result = result.replace(b'\r\n', b'\n')
    result = result.replace(b'\r', b'\n')
    # TABLE 3.2 Escape sequences in literal strings
    result = result.replace(b'\\n', b'\n')
    result = result.replace(b'\\r', b'\r')
    result = result.replace(b'\\t', b'\t')
    result = result.replace(b'\\b', b'\b')
    result = result.replace(b'\\f', b'\f')
    result = result.replace(b'\\(', b'(')
    result = result.replace(b'\\)', b')')
    result = result.replace(b'\\\\', b'\\')
    result = re.sub(
        rb'\\([0-7]{1,3})',
        lambda m: chr(int(m.group(1), 8)).encode('iso-8859-1')
        if int(m.group(1), 8) < 256 else b'\\' + m.group(1),
        result)
    return PdfLiteralStringObject(result.decode('iso-8859-1'))
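# Standalone illustration of the octal-escape substitution performed above; the
# input bytes here are only an example.
import re
sample = b'A\\101\\12B'  # contains the escapes \101 (65, 'A') and \12 (10, '\n')
out = re.sub(
    rb'\\([0-7]{1,3})',
    lambda m: chr(int(m.group(1), 8)).encode('iso-8859-1')
    if int(m.group(1), 8) < 256 else b'\\' + m.group(1),
    sample)
assert out == b'AA\nB'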
def __init__(self, f):
    '''Initialize a PdfXRefSubSection from an opened PDF file f.
    The file object's current position should be at the line with two numbers,
    the object number of the first object in this subsection and the number of
    entries, separated by a space'''
    self.inuse_entry = []
    self.free_entry = []
    self.entries = []
    org_pos = f.tell()
    s, eol_marker = utils.read_until(f, syntax.EOL)
    matches = re.match(rb'^\s*(\d+)\s+(\d+)\s*$', s)
    if matches is None:
        f.seek(org_pos, io.SEEK_SET)
        raise Exception(
            f"cross-reference subsection should begin with two numbers at offset {org_pos}"
        )
    self.first_objno = int(matches.group(1))
    count = int(matches.group(2))
    if self.first_objno < 0 or count < 0:
        f.seek(org_pos, io.SEEK_SET)
        raise Exception(
            f"cross-reference subsection at offset {org_pos} has invalid object number or object count"
        )
    f.seek(len(eol_marker), io.SEEK_CUR)
    # Each entry is exactly 20 bytes long, including EOL marker.
    for i in range(count):
        entry = f.read(20)
        current_obj_no = self.first_objno + i
        # nnnnnnnnnn ggggg n/f
        matches = re.match(rb'^(\d{10})\s(\d{5})\s([nf])(?: \r| \n|\r\n)', entry)
        if matches is None:
            f.seek(org_pos, io.SEEK_SET)
            raise Exception(
                f"cross-reference subsection contains an invalid entry at offset {f.tell() - 20}"
            )
        # obj no 0 is always free and has a generation number of 65535
        if current_obj_no == 0 and int(matches.group(2)) != 65535:
            f.seek(org_pos, io.SEEK_SET)
            raise Exception(
                f"cross-reference subsection contains an invalid entry at offset {f.tell() - 20}"
            )
        # in-use entry: 1st 10-digit number is the byte offset;
        # free entry: 1st 10-digit number is the obj no of the next free object
        if matches.group(3) == b'n':
            self.inuse_entry += [{
                'obj_no': current_obj_no,
                'gen_no': int(matches.group(2)),
                'used': True,
                'offset': int(matches.group(1))
            }]
            self.entries += [self.inuse_entry[-1]]
        elif matches.group(3) == b'f':
            if (len(self.free_entry) > 0
                    and self.free_entry[-1]['next_free_obj_no'] != current_obj_no):
                f.seek(org_pos, io.SEEK_SET)
                raise Exception(
                    f"cross-reference subsection contains an invalid entry at offset {f.tell() - 20}"
                )
            self.free_entry += [{
                'obj_no': current_obj_no,
                'gen_no': int(matches.group(2)),
                'used': False,
                'next_free_obj_no': int(matches.group(1))
            }]
            self.entries += [self.free_entry[-1]]
        else:
            f.seek(org_pos, io.SEEK_SET)
            raise Exception(
                f"cross-reference subsection contains an invalid entry at offset {f.tell() - 20}"
            )
    # last free entry (the tail of the linked list) links back to obj no 0
    if len(self.free_entry) > 0 and self.free_entry[-1]['next_free_obj_no'] != 0:
        f.seek(org_pos, io.SEEK_SET)
        raise Exception(
            f"cross-reference subsection contains an invalid entry at offset {f.tell() - 20}"
        )
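# Self-contained sketch of the fixed-width 20-byte entry format parsed above
# ("nnnnnnnnnn ggggg n" plus a 2-byte EOL); the entry bytes are an example only.
import re
entry = b'0000000017 00000 n \r'
m = re.match(rb'^(\d{10})\s(\d{5})\s([nf])(?: \r| \n|\r\n)', entry)
assert m is not None
assert int(m.group(1)) == 17      # byte offset for an in-use ('n') entry
assert int(m.group(2)) == 0       # generation number
assert m.group(3) == b'n'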
def parse_multipart(content_type, content_length, bodystream, encoding):
    """
    Parse the body of a multipart form request as specified in rfc1867.
    Ignores case in header-names and supports both '\\n' and '\\r\\n' as a
    separator (but not a mix in the same bodystream).

    @param content_type: The content type header of the HTTP request.
    @param content_length: content-length as a long. "content_length" bytes
        are read from the bodystream, unless EOF is reached.
    @param bodystream: A file like object. Must have a read() method which
        supports the "size" parameter.
    @param encoding: The encoding to use when converting parameters to unicode.
        Files are not converted to unicode.
    @return: (files, var) Where both are L{FormParams}. C{files} contains
        L{MultipartFile} instances, and C{var} contains unicode objects.
    """
    files = FormParams()
    var = FormParams()
    bytes_left = content_length
    extra = "--"
    # parse content-type
    try:
        ignore, boundary = content_type.split("=", 1)
    except ValueError:
        raise FormParseClientError(
            "multipart/form-data POST request without a boundary.")
    boundary = extra + boundary
    # detect separator
    bytes_read = len(boundary) + 1
    buf = bodystream.read(bytes_read)
    bytes_left -= bytes_read
    if buf[len(boundary)] == "\r":
        sep = "\r\n"
        bodystream.read(1)  # also read \n
        bytes_left -= 1
    else:
        sep = "\n"
    rest = sep
    while True:
        # extract header
        ostream = StringIO()
        bytes_read, success, rest = read_until(
            bodystream, ostream, sep + sep, before=rest, maxread=bytes_left)
        bytes_left -= bytes_read
        if not success:
            break
        headers = Parser().parsestr(
            ostream.getvalue()[len(sep):], headersonly=True)
        # parse content-disposition header
        try:
            x = headers["content-disposition"].split(";")
        except AttributeError:
            continue  # ignore blocks without a content-disposition header
        filename = None
        name = None
        for param in x[1:]:
            p = param.split("=", 1)
            if not len(p) == 2:
                continue  # ignore invalid params
            key = p[0].strip()
            val = p[1][1:-1]
            if key == "name":
                name = val
            elif key == "filename":
                filename = val
        if not name:
            break
        # parse body into buffer
        if filename:
            ostream = TemporaryFile()
        else:
            ostream = StringIO()
        bytes_read, success, rest = read_until(
            bodystream, ostream, sep + boundary, before=rest, maxread=bytes_left)
        bytes_left -= bytes_read
        if not success:
            break
        # add to correct container
        if filename:
            ostream.seek(0)
            v = MultipartFile(ostream, filename, headers)
            files.add(unicode(name, encoding), v)
        else:
            var.add(
                unicode(name, encoding), unicode(ostream.getvalue(), encoding))
    return (files, var)
def create_from_file(cls, f: io.BufferedReader, doc):
    org_pos = f.tell()
    num, _ = utils.read_until(f, syntax.WHITESPACES)
    if re.match(rb'\d+$', num) is None:
        f.seek(org_pos, io.SEEK_SET)
        raise Exception(
            f'Parse Error: Not a valid indirect object at offset {org_pos}.')
    obj_no = int(num)
    utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
    num, _ = utils.read_until(f, syntax.WHITESPACES)
    if re.match(rb'\d+$', num) is None:
        f.seek(org_pos, io.SEEK_SET)
        raise Exception(
            f'Parse Error: Not a valid indirect object at offset {org_pos}.')
    gen_no = int(num)
    utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
    tok, _ = utils.read_until(f, syntax.DELIMS + syntax.WHITESPACES)
    if tok != b'obj':
        f.seek(org_pos, io.SEEK_SET)
        raise Exception(
            f'Parse Error: Not a valid indirect object at offset {org_pos}.')
    utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
    inner_content_pos = f.tell()

    # parse inner object
    def inner2():
        f.seek(inner_content_pos, io.SEEK_SET)
        obj = PdfObject.create_from_file(f, doc)
        utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
        return obj

    inner_obj = inner2()
    # if the inner object is a dict and is followed by a stream extent, then the object should be a stream object
    # otherwise, if there is no endobj token, it is an error
    temp = f.tell()
    utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
    token, endtoken = utils.read_until(f, syntax.DELIMS + syntax.WHITESPACES, maxsize=7)
    f.seek(temp, io.SEEK_SET)
    if not (token == b'endobj' and (endtoken != b'' or endtoken is None)):
        # endtoken None to indicate EOF
        if utils.peek_at_least(f, 7)[0:7] == b'stream\n' \
                or utils.peek_at_least(f, 8)[0:8] == b'stream\r\n':
            f.seek(inner_content_pos, io.SEEK_SET)
            streamObj = PdfStreamObject.create_from_file(f, doc)
            utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
            token, endtoken = utils.read_until(f, syntax.DELIMS + syntax.WHITESPACES, maxsize=7)
            if not (token == b'endobj' and (endtoken != b'' or endtoken is None)):
                f.seek(org_pos, io.SEEK_SET)
                raise Exception(
                    f'Parse Error: Not a valid indirect object at offset {org_pos}.')
            inner_obj = streamObj
            # if streamObj.dict.get('Type') == 'ObjStm':
            #     # Object Stream, decode and parse the content
        else:
            f.seek(org_pos, io.SEEK_SET)
            raise Exception(
                f'Parse Error: Not a valid indirect object at offset {org_pos}.')
    else:
        f.seek(6, io.SEEK_CUR)
    return PdfIndirectObject(inner_obj, obj_no, gen_no)
def create_from_file(cls, f: io.BufferedReader, doc):
    org_pos = f.tell()
    stream_dict = PdfObject.create_from_file(f, doc)
    utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
    if not (utils.peek_at_least(f, 7)[0:7] == b'stream\n'
            or utils.peek_at_least(f, 8)[0:8] == b'stream\r\n'):
        f.seek(org_pos, io.SEEK_SET)
        raise Exception(
            f'Parse Error: Not a valid stream object at offset {org_pos}.')
    token, _ = utils.read_until(f, syntax.DELIMS + syntax.WHITESPACES)
    utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
    # check if dict has the required key /Length with valid values
    if not isinstance(stream_dict, PdfDictionaryObject):
        f.seek(org_pos, io.SEEK_SET)
        raise Exception(
            f'Parse Error: Not a valid stream object at offset {org_pos}.')
    if stream_dict.get('Length') is None:
        raise Exception(
            f'Parse Error: Not a valid stream object at offset {org_pos}.')
    if isinstance(stream_dict['Length'], PdfReferenceObject):
        size = stream_dict['Length'].deref().value
    elif isinstance(stream_dict['Length'], PdfNumericObject):
        size = stream_dict['Length'].value
    else:
        raise Exception(
            f'Parse Error: Not a valid stream object at offset {org_pos}.')
    if size.as_integer_ratio()[0] <= 0 or size.as_integer_ratio()[1] != 1:
        raise Exception(
            f'Parse Error: Not a valid stream object at offset {org_pos}.')
    size = size.as_integer_ratio()[0]
    # check for filters
    filt = None
    if stream_dict.get('Filter') is not None:
        # TODO: Remove the assumption that Filter value is always a direct obj
        filt = stream_dict['Filter']
        if isinstance(filt, PdfArrayObject):
            if any(not isinstance(x, PdfNameObject) for x in filt.value):
                raise Exception(
                    f'Parse Error: Not a valid stream object at offset {org_pos}.')
            filt = filt.value[0]
        # /Filter (or the first element of the array) must specify a Name
        if not isinstance(filt, PdfNameObject):
            raise Exception(
                f'Parse Error: Not a valid stream object at offset {org_pos}.')
    # read only /Length bytes
    # the filter implementation is responsible for checking if the data length is correct,
    # e.g. if any needed end-of-data marker is present at only the end
    raw = f.read(size)
    # check if stream ends with b'endstream', optionally preceded by b'\r', b'\n' or b'\r\n'
    if utils.peek_at_least(f, 2)[0:2] == b'\r\n':
        f.seek(2, io.SEEK_CUR)
    elif utils.peek_at_least(f, 1)[0:1] == b'\r' or utils.peek_at_least(f, 1)[0:1] == b'\n':
        f.seek(1, io.SEEK_CUR)
    token, _ = utils.read_until(f, syntax.DELIMS + syntax.WHITESPACES)
    if token != b'endstream':
        raise Exception(
            f'Parse Error: Not a valid stream object at offset {org_pos}.')
    # actual decoding is done in the constructor
    return PdfStreamObject(stream_dict, raw)
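# Standalone check of the /Length validation idea used above: a Decimal is a
# positive integer exactly when as_integer_ratio() returns (n, 1) with n > 0
# (Decimal.as_integer_ratio requires Python 3.6+). The values are examples only.
from decimal import Decimal
assert Decimal('42').as_integer_ratio() == (42, 1)
assert Decimal('4.2').as_integer_ratio()[1] != 1  # not a whole number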
def parse_linear(self, f, progress_cb=None):
    '''Initialize a PdfDocument from an opened PDF file f, reading from the beginning'''
    def print_progress():
        print('', end="\r")
        print(f'{f.tell() / filesize * 100:5.2f}% processed', end='', flush=True)
        if progress_cb is not None:
            progress_cb(f'{f.tell() / filesize * 100:5.2f}% processed',
                        read=f.tell(), total=filesize)

    f.seek(0, io.SEEK_SET)
    filesize = os.fstat(f.fileno()).st_size
    print_progress()
    # First line is header
    s, eol_marker = utils.read_until(f, syntax.EOL)
    header = re.match(rb'%PDF-(\d+\.\d+)', s)
    if header:
        self.version = Decimal(header.group(1).decode('iso-8859-1'))
        f.seek(len(eol_marker), io.SEEK_CUR)
    else:
        raise Exception('Not a PDF file')
    while True:
        utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=False)
        if f.tell() >= filesize:
            break
        org_pos = f.tell()
        s, eol_marker = utils.read_until(f, syntax.EOL)
        if s == b'startxref':
            # the last startxref always overrides the ones before
            utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
            t, _ = utils.read_until(f, syntax.EOL)
            self.startxref = int(t)
            self.increments[-1]['startxref'] = self.startxref
            continue
        elif s == b'xref':
            f.seek(-4, io.SEEK_CUR)
            self.increments[-1]['xref_section'] = PdfXRefSection(f)
            self.offset_xref[org_pos] = self.increments[-1]['xref_section']
            continue
        elif s == b'trailer':
            utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
            self.increments[-1]['trailer'] = PdfDictionaryObject.create_from_file(f, self)
            continue
        elif s == b'%%EOF':
            # TODO: check if trailer dict immediately precedes %%EOF
            # since we are seeking until non-ws, the only case the EOF marker
            # does not appear by itself is when it is preceded by some
            # whitespace, which should be ignored
            self.increments[-1]['eof'] = True
            f.seek(5 + len(eol_marker), io.SEEK_CUR)
            continue
        elif s[0:1] == b'%':
            # otherwise, it is a comment, ignore the whole remaining line
            utils.seek_until(f, syntax.EOL)
            continue
        # else:
        f.seek(org_pos, io.SEEK_SET)
        if self.increments[-1]['eof']:
            self.increments += [{
                'body': [],
                'xref_section': None,
                'trailer': None,
                'startxref': None,
                'eof': False
            }]
        # TODO: how to handle object parse error?
        new_obj = PdfObject.create_from_file(f, self)
        self.increments[-1]['body'] += [new_obj]
        self.offset_obj[org_pos] = new_obj
        if isinstance(new_obj.value, PdfStreamObject) \
                and new_obj.value.dict.get('Type') == 'ObjStm':
            self.offset_obj_streams[org_pos] = new_obj
        print_progress()
    print('', end="\r")
    print('100% processed      ')
    if progress_cb is not None:
        progress_cb('100% processed', read=f.tell(), total=filesize)
    self.ready = True
    print('Decoding object streams...')
    if progress_cb is not None:
        progress_cb('Decoding object streams...', read=f.tell(), total=filesize)
    for k in self.offset_obj_streams:
        from objstm import decode_objstm
        self.compressed_obj = {
            **(self.compressed_obj),
            **(decode_objstm(self.offset_obj_streams[k], self))
        }
    print('Done')
    if progress_cb is not None:
        progress_cb('Done', read=f.tell(), total=filesize)
def authenticate_user(self, user, password):
    if user == "":
        user = self.myuid
    need_su = self.myuid != user
    if not utils.check_ssh(user, self.hutch):
        if self.model.userIO != None:
            try:
                os.close(self.model.userIO)
            except:
                pass
            self.model.userIO = None
        # a span of the source is redacted ("******") in the following statement;
        # only the tail of the redacted block (the forked child's exec branches) is visible
        self.ui.userLabel.setText("User: "******".")[0]:
                    os.execv("/usr/bin/su", ["su", user, "-c", "/bin/tcsh -if"])
                else:
                    os.execv("/usr/bin/ssh",
                             ["ssh", user + "@" + utils.COMMITHOST, "/bin/tcsh", "-if"])
            else:
                if utils.COMMITHOST == socket.gethostname().split(".")[0]:
                    print "C"
                    os.execv("/bin/tcsh", ["tcsh", "-if"])
                else:
                    print "D"
                    os.execv("/usr/bin/ssh", ["ssh", utils.COMMITHOST, "/bin/tcsh", "-if"])
        except:
            pass
        print "Say what? execv failed?"
        sys.exit(0)
    l = utils.read_until(fd, "(assword:|> )").group(1)
    if l != "> ":
        os.write(fd, password + "\n")
        l = utils.read_until(fd, "> ")
    if utils.KINIT != None and password != "":
        os.write(fd, utils.KINIT + "\n")
        l = utils.read_until(fd, ": ")
        os.write(fd, password + "\n")
        l = utils.read_until(fd, "> ")
    #
    # Sigh.  Someone once had a file named time.py in their home
    # directory.  So let's go somewhere where we know the files.
    #
    os.write(fd, "cd %s\n" % utils.TMP_DIR)
    l = utils.read_until(fd, "> ")
    self.model.user = user
    if self.model.userIO != None:
        try:
            os.close(self.model.userIO)
        except:
            pass
    self.model.userIO = fd
    if need_su:
        self.utimer.start(10 * 60000)  # Let's go for 10 minutes.
    self.ui.userLabel.setText("User: " + user)
def read_until(self, pattern, size=None):
    return read_until(self.buffer, pattern, size)