Example #1
0
    def create_from_file(cls, f: io.BufferedReader, doc):
        """Parse whichever PDF object starts at the current position of f.

        The first byte is peeked (not consumed) and selects the concrete
        parser. A non-negative integer needs extra lookahead because it may be
        the start of "N G obj" (indirect object) or "N G R" (indirect
        reference); when it is neither, the file is repositioned right after
        the number and the number itself is returned.

        Raises Exception when the first byte does not start any known object.
        """
        lead = f.peek(1)[0:1]
        numeric_leads = [bytes([c]) for c in b'0123456789+-.']

        if lead in (b't', b'f'):
            return PdfBooleanObject.create_from_file(f)
        if lead == b'(':
            return PdfLiteralStringObject.create_from_file(f)
        if lead == b'/':
            return PdfNameObject.create_from_file(f)
        if lead == b'[':
            return PdfArrayObject.create_from_file(f, doc)
        if lead == b'n':
            return PdfNullObject.create_from_file(f)
        if lead == b'<':
            # '<<' opens a dictionary, a single '<' opens a hex string
            if utils.peek_at_least(f, 2)[0:2] == b'<<':
                return PdfDictionaryObject.create_from_file(f, doc)
            return PdfHexStringObject.create_from_file(f)
        if lead not in numeric_leads:
            raise Exception(f'Unknown token at {f.tell()}')

        start = f.tell()
        number = PdfNumericObject.create_from_file(f)
        # a negative or fractional number can never start an indirect obj/ref
        if number.value < 0 or number.value - int(number.value) != 0:
            return number
        after_number = f.tell()

        utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
        second, _ = utils.read_until(f, syntax.DELIMS + syntax.WHITESPACES)
        if re.match(rb'\d+$', second) is not None:
            # two integers in a row: the third token decides what this is
            utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
            keyword, _ = utils.read_until(f,
                                          syntax.DELIMS + syntax.WHITESPACES)
            if keyword == b'obj':
                f.seek(start, io.SEEK_SET)
                return PdfIndirectObject.create_from_file(f, doc)
            if keyword == b'R':
                f.seek(start, io.SEEK_SET)
                return PdfReferenceObject.create_from_file(f, doc)

        # plain number: undo the lookahead
        f.seek(after_number, io.SEEK_SET)
        return number
Example #2
0
 def authenticate_user(self, user, password):
     # Switch the active user to `user`: answer the password prompt read from
     # `fd`, optionally run the utils.KINIT command with the same password,
     # then record the user and descriptor on self.model and update the label.
     # An empty `user` means "use self.myuid".
     #
     # NOTE(review): the text between the setText(...) call and the execv
     # argument list below appears to have been lost or redacted in this copy.
     # The statement is not valid Python as shown, and `fd` is never assigned
     # in the visible code -- presumably a pty/fork spawned the ssh shell
     # there. Recover the original text before relying on this block.
     if user == "":
         user = self.myuid
     # A timer is armed at the end only when acting as a different user.
     need_su = self.myuid != user
     if not utils.check_ssh(user, self.hutch):
         # Drop any previously stored descriptor; close errors are ignored.
         if self.model.userIO != None:
             try:
                 os.close(self.model.userIO)
             except:
                 pass
         self.model.userIO = None
         self.ui.userLabel.setText("User: "******"/usr/bin/ssh", [
                     "ssh", user + "@" + utils.COMMITHOST, "/bin/tcsh",
                     "-if"
                 ])
             else:
                 os.execv("/usr/bin/ssh",
                          ["ssh", utils.COMMITHOST, "/bin/tcsh", "-if"])
         except:
             pass
         # Only reached when execv did not replace the process.
         print "Say what?  execv failed?"
         sys.exit(0)
     # Wait for either a password prompt or the shell prompt "> ".
     l = utils.read_until(fd, "(assword:|> )").group(1)
     if l != "> ":
         os.write(fd, password + "\n")
         l = utils.read_until(fd, "> ")
     # Run the configured KINIT command and feed it the same password.
     if utils.KINIT != None and password != "":
         os.write(fd, utils.KINIT + "\n")
         l = utils.read_until(fd, ": ")
         os.write(fd, password + "\n")
         l = utils.read_until(fd, "> ")
     self.model.user = user
     # Replace any previously stored descriptor with the new one.
     if self.model.userIO != None:
         try:
             os.close(self.model.userIO)
         except:
             pass
     self.model.userIO = fd
     if need_su:
         self.utimer.start(10 * 60000)  # Let's go for 10 minutes.
     self.ui.userLabel.setText("User: " + user)
Example #3
0
 def create_from_file(cls, f: io.BufferedReader):
     """Parse a hexadecimal string object (<...>) at the current position.

     On success the file is left just past the closing '>' and a
     PdfHexStringObject holding the decoded bytes is returned. On failure the
     file position is restored and an Exception is raised.

     Fixes over the previous version:
     * the empty string '<>' now consumes its closing '>' (it used to be left
       unread, so the next parse started on '>');
     * white-space between hex digits is ignored, as the PDF Reference
       (3.2.3, Hexadecimal Strings) requires, instead of being rejected.
     """
     org_pos = f.tell()
     if f.read(1) != b'<':
         f.seek(org_pos, io.SEEK_SET)
         raise Exception(
             f'Parse Error: Not a valid hexadecimal string at offset {org_pos}.'
         )
     token, endtoken = utils.read_until(f, [b'>'])
     # PDF white-space characters: NUL, TAB, LF, FF, CR, SPACE
     digits = bytes(token).translate(None, b'\x00\t\n\x0c\r ')
     if endtoken != b'>' or re.fullmatch(br'[0-9A-Fa-f]*', digits) is None:
         f.seek(org_pos, io.SEEK_SET)
         raise Exception(
             f'Parse Error: Not a valid hexadecimal string at offset {org_pos}.'
         )
     f.read(1)  # read_until stops in front of '>', so consume it here
     # bytes.fromhex only accepts str
     text = digits.decode('iso-8859-1')
     if len(text) % 2 != 0:
         # PDF Reference 3.2.3: with an odd number of digits, the final
         # digit is assumed to be 0
         text += '0'
     return PdfHexStringObject(bytes.fromhex(text))
Example #4
0
 def create_from_file(cls, f: io.BufferedReader):
     """Parse a name object (/Name) at the current position of f.

     The leading '/' is consumed and everything up to the next delimiter or
     white-space character is passed to PdfNameObject.

     Raises Exception when the current byte is not '/'. In that case the file
     position is restored first, matching the other object parsers (the
     previous version left the file one byte further on).
     """
     org_pos = f.tell()
     if f.read(1) != b'/':
         f.seek(org_pos, io.SEEK_SET)
         raise Exception(
             f'Parse Error: Not a valid name object at offset {org_pos}.')
     result, _ = utils.read_until(f, syntax.DELIMS + syntax.WHITESPACES)
     return PdfNameObject(result)
Example #5
0
 def create_from_file(cls, f: io.BufferedReader):
     """Parse a numeric object (integer or real) at the current position.

     Accepted forms: an optional sign followed by either digits with an
     optional fractional part ("12", "+3.", "-4.5") or a bare fraction
     (".5", "-.002").

     Returns a PdfNumericObject wrapping a Decimal. Raises Exception, with the
     file position restored, when the token is not a valid number.
     """
     org_pos = f.tell()
     token, _ = utils.read_until(f, syntax.DELIMS + syntax.WHITESPACES)
     # The whole token must match. The old pattern '^A|B$' anchored each
     # alternative on one side only, so a token such as b'12abc' passed the
     # check and then blew up inside Decimal() with the position not restored.
     if re.fullmatch(br'[+-]?(?:\d+(?:\.\d*)?|\.\d+)', token) is None:
         f.seek(org_pos, io.SEEK_SET)
         raise Exception(
             f'Parse Error: Not a valid Numeric object at offset {org_pos}.'
         )
     return PdfNumericObject(Decimal(token.decode('iso-8859-1')))
Example #6
0
    def get_xref_trailer_at_offset(self, f, offset):
        """Return the (xref section, trailer dict) pair located at offset.

        Results are memoised in self.offset_xref_trailer, so each offset is
        parsed at most once.

        The xref and its trailer are always read together: in a linearized
        PDF the trailer that appears last in the file is not the one that
        belongs to the xref startxref points at, so looking for a trailer
        from the end of the file would pick the wrong one. For a xref stream
        the stream dictionary doubles as the trailer.

        Raises Exception when a classic xref table is not followed by a
        trailer, or when the object at offset is not a valid xref stream.
        """
        if offset in self.offset_xref_trailer:
            return self.offset_xref_trailer[offset]

        # peek at the first line to tell a classic table from a xref stream
        f.seek(offset, io.SEEK_SET)
        first_line, _ = utils.read_until(f, syntax.EOL)
        f.seek(offset, io.SEEK_SET)

        # TODO: catch exception for parsing PdfXRefSection
        if first_line != b'xref':
            # may be a compressed xref stream: the trailer dict IS the stream
            # dict, and CANNOT contain references
            try:
                xref_stream = PdfIndirectObject.create_from_file(f, self)
            except Exception as ex:
                raise Exception('Invalid xref stream') from ex
            xref_section = PdfXRefSection.from_xrefstm(xref_stream)
            self.offset_xref_trailer[offset] = (xref_section,
                                                xref_stream.value.dict)
            return self.offset_xref_trailer[offset]

        # uncompressed xref section
        utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
        xref_section = PdfXRefSection(f)

        # the trailer dict must follow; it CAN contain references
        utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
        keyword, _ = utils.read_until(f, syntax.EOL)
        if keyword != b'trailer':
            # TODO: check for objects between xref and trailer dict, and between trailer dict and startxref?
            raise Exception(
                f'trailer dict not found after xref table at {f.tell() - 7}'
            )
        utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
        trailer_dict = PdfDictionaryObject.create_from_file(f, self)
        self.offset_xref_trailer[offset] = (xref_section, trailer_dict)
        return self.offset_xref_trailer[offset]
Example #7
0
 def create_from_file(cls, f: io.BufferedReader):
     """Parse the keyword 'true' or 'false' at the current position of f.

     Returns the matching PdfBooleanObject. Any other token restores the file
     position and raises Exception.
     """
     org_pos = f.tell()
     keyword, _ = utils.read_until(f, syntax.DELIMS + syntax.WHITESPACES)
     is_true = keyword == b'true'
     if is_true or keyword == b'false':
         return PdfBooleanObject(is_true)
     f.seek(org_pos, io.SEEK_SET)
     raise Exception(
         f'Parse Error: Not a valid Boolean object at offset {org_pos}.'
     )
Example #8
0
 def create_from_file(cls, f: io.BufferedReader, doc):
     """Parse an indirect reference ("N G R") at the current position of f.

     Reads the object number, the generation number and the keyword R, each
     separated by white-space (comments are skipped), and returns
     PdfReferenceObject(doc, N, G). Any malformed part restores the file
     position and raises Exception.
     """
     org_pos = f.tell()

     def reject():
         # put the file back where we started before reporting the error
         f.seek(org_pos, io.SEEK_SET)
         raise Exception(
             f'Parse Error: Not a valid reference at offset {org_pos}.')

     numbers = []
     for _ in range(2):  # object number, then generation number
         digits, _end = utils.read_until(f, syntax.WHITESPACES)
         if re.match(rb'\d+$', digits) is None:
             reject()
         numbers.append(int(digits))
         utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)

     keyword, _end = utils.read_until(f, syntax.DELIMS + syntax.WHITESPACES)
     if keyword != b'R':
         reject()

     obj_no, gen_no = numbers
     return PdfReferenceObject(doc, obj_no, gen_no)
Example #9
0
    def __init__(self, f):
        '''Build a PdfXRefSection by reading it from an opened PDF file f.

        f must be positioned at the start of the line holding only the
        keyword 'xref'. Every subsection that follows is parsed into
        self.subsections; reading stops, with f left at the start of that
        line, at the first line that is not a "first_obj count" subsection
        header.

        Raises Exception (file position restored) if the 'xref' keyword is
        missing.'''
        self.subsections = []
        org_pos = f.tell()
        keyword, eol_marker = utils.read_until(f, syntax.EOL)
        if keyword != b'xref':
            f.seek(org_pos, io.SEEK_SET)
            raise Exception(
                f"cross-reference section should begin with keyword 'xref' at offset {org_pos}"
            )
        # step over the end-of-line marker that terminates the 'xref' line
        f.seek(len(eol_marker), io.SEEK_CUR)

        subsection_header = re.compile(rb'^\s*(\d+)\s+(\d+)\s*$')
        while True:
            line, _ = utils.read_until(f, syntax.EOL)
            is_header = subsection_header.match(line)
            # rewind: the subsection parser wants to read its own header line
            f.seek(-len(line), io.SEEK_CUR)
            if not is_header:
                break
            self.subsections.append(PdfXRefSubSection(f))
Example #10
0
    def parse_normal(self, f, progress_cb=None):
        '''Populate this PdfDocument from an opened PDF file f by reading its
        xref sections and trailers.

        After this is called, offset_obj, offset_obj_streams, compressed_obj,
        offset_xref_trailer and all xref sections are ready.

        f           -- binary file object positioned anywhere; it is rewound.
        progress_cb -- optional callable(message, read=..., total=...) invoked
                       as objects are parsed.

        Raises Exception if the header is missing, the line before the offset
        is not 'startxref', a Prev entry is not an integer, the Prev chain
        revisits an offset, or an xref entry points at the wrong object.
        Raises ValueError if no startxref/%%EOF pair is found at the end of
        the file or the startxref offset is not a number.
        '''
        f.seek(0, io.SEEK_SET)
        # First line is header
        s, eol_marker = utils.read_until(f, syntax.EOL)
        header = re.match(rb'%PDF-(\d+\.\d+)', s)
        if header is None:
            raise Exception('Not a PDF file')
        self.version = Decimal(header.group(1).decode('iso-8859-1'))
        f.seek(len(eol_marker), io.SEEK_CUR)

        # Read from end of file: expect "startxref", <offset>, "%%EOF".
        eof_found = False
        startxref_found = False
        offset_line = b''
        lines_to_skip = 2  # the %%EOF line and the offset line above it
        for line in utils.rlines(f):
            if line.rstrip() == b'%%EOF':
                eof_found = True
            if not eof_found:
                continue
            if lines_to_skip == 0:
                if line.rstrip() != b'startxref':
                    raise Exception(
                        'startxref not found at 2 lines before EOF marker')
                startxref_found = True
                break
            lines_to_skip -= 1
            offset_line = line
        if not startxref_found:
            # Previously this fell through to int(b'') and failed with an
            # opaque "invalid literal" ValueError; keep the type, fix the text.
            raise ValueError(
                'startxref / %%EOF marker not found at end of file')
        xref_offset = int(offset_line.decode('iso-8859-1'))
        self.startxref = xref_offset
        # The only required part for a trailer (and marks the end of an increment) is startxref and %%EOF
        self.increments[-1]['startxref'] = xref_offset
        self.increments[-1]['eof'] = True

        inuse_count = 0
        visited_offsets = set()
        while True:
            # A malformed file can make Prev point back at an earlier xref;
            # without this guard the loop never ends and self.increments
            # grows without bound.
            if xref_offset in visited_offsets:
                raise Exception(
                    f'Circular Prev chain: xref at offset {xref_offset} is referenced more than once'
                )
            visited_offsets.add(xref_offset)

            f.seek(xref_offset, io.SEEK_SET)
            xref_section, trailer = self.get_xref_trailer_at_offset(
                f, xref_offset)
            self.offset_xref_trailer[xref_offset] = (xref_section, trailer)
            for subsec in xref_section.subsections:
                inuse_count += len(subsec.inuse_entry)
            self.increments[0]['xref_section'] = xref_section
            self.increments[0]['trailer'] = trailer
            if trailer.get('Prev') is None:
                break
            if trailer['Prev'].value - int(trailer['Prev'].value) != 0:
                raise Exception(
                    f'Prev must be an integer, in trailer dict at offset {xref_offset}'
                )
            xref_offset = int(trailer['Prev'].value)  # must not be indirect
            # older increments go to the front of the list
            self.increments = [{
                'body': [],
                'xref_section': None,
                'trailer': None,
                'startxref': None,
                'eof': False
            }] + self.increments
            self.increments[0]['startxref'] = xref_offset
        self.ready = True

        inuse_parsed_count = 0
        # parse each in use obj num
        for inc in self.increments:
            for subsec in inc['xref_section'].subsections:
                for entry in subsec.inuse_entry:
                    if entry.get('compressed'):
                        # lives inside an object stream; decoded further down
                        inuse_parsed_count += 1
                        continue
                    offset = entry['offset']
                    f.seek(offset, io.SEEK_SET)
                    new_obj = PdfObject.create_from_file(f, self)
                    if (not isinstance(new_obj, PdfIndirectObject)
                            or new_obj.obj_no != entry['obj_no']
                            or new_obj.gen_no != entry['gen_no']):
                        raise Exception(
                            f'Invalid obj referenced by xref at offset {offset}'
                        )
                    self.offset_obj[offset] = new_obj
                    if (isinstance(new_obj.value, PdfStreamObject)
                            and new_obj.value.dict.get('Type') == 'ObjStm'):
                        self.offset_obj_streams[offset] = new_obj
                    inuse_parsed_count += 1
                    progress = f'{inuse_parsed_count / inuse_count * 100:5.2f}% processed'
                    print('', end="\r")
                    print(progress, end='', flush=True)
                    if progress_cb is not None:
                        progress_cb(progress,
                                    read=inuse_parsed_count,
                                    total=inuse_count)

        print('Decoding object streams...')
        if progress_cb is not None:
            progress_cb('Decoding object streams...',
                        read=inuse_parsed_count,
                        total=inuse_count)
        if self.offset_obj_streams:
            # imported lazily, and only when there is something to decode
            from objstm import decode_objstm
            for k in self.offset_obj_streams:
                self.compressed_obj = {
                    **(self.compressed_obj),
                    **(decode_objstm(self.offset_obj_streams[k], self))
                }
        print('', end="\r")
        print('100% processed    ')
        if progress_cb is not None:
            progress_cb('100% processed',
                        read=inuse_parsed_count,
                        total=inuse_count)
        print('Done')
        if progress_cb is not None:
            progress_cb('Done', read=inuse_parsed_count, total=inuse_count)
Example #11
0
    def create_from_file(cls, f: io.BufferedReader):
        """Parse a literal string object, written in parentheses, from f.

        Balanced parentheses inside the string are kept as-is; the backslash
        escapes of PDF Reference 3.2.3 (Table 3.2) are decoded in a single
        left-to-right pass. Returns a PdfLiteralStringObject built from the
        decoded bytes interpreted as ISO-8859-1.

        Raises Exception, with the file position restored, when the string
        does not start with an opening parenthesis or is never closed.

        Fixes over the previous version:
        * an escaped backslash is now a scanner token, so an escaped
          backslash directly before a parenthesis is no longer mistaken for
          an escaped parenthesis;
        * escapes are decoded in one pass instead of chained replace()
          calls, which corrupted sequences such as an escaped backslash
          followed by the letter n;
        * a backslash before any other character is dropped and octal
          overflow is truncated to one byte, as the specification requires;
        * any unexpected end marker from read_until is an error instead of
          looping forever.
        """
        org_pos = f.tell()
        if f.read(1) != b'(':
            f.seek(org_pos, io.SEEK_SET)
            raise Exception(
                f'Parse Error: Not a valid string at offset {org_pos}.')

        raw = bytearray()
        depth = 1  # 1 for initial (
        while True:
            token, endtoken = utils.read_until(
                f, [b'(', b')', b'\\(', b'\\)', b'\\\\'])
            raw.extend(token)
            if endtoken in (b'\\(', b'\\)', b'\\\\'):
                # escaped pair: keep it verbatim, it is decoded further down
                f.seek(2, io.SEEK_CUR)
                raw.extend(endtoken)
            elif endtoken == b'(':
                depth += 1
                f.seek(1, io.SEEK_CUR)
                raw.extend(endtoken)
            elif endtoken == b')':
                depth -= 1
                f.seek(1, io.SEEK_CUR)
                if depth == 0:  # string is done
                    break
                raw.extend(endtoken)
            else:
                # end of input (or anything unexpected) before the string
                # was closed
                f.seek(org_pos, io.SEEK_SET)
                raise Exception(
                    f'Parse Error: Not a valid string at offset {org_pos}.')

        simple_escapes = {
            b'n': b'\n',
            b'r': b'\r',
            b't': b'\t',
            b'b': b'\b',
            b'f': b'\f',
            b'(': b'(',
            b')': b')',
            b'\\': b'\\',
        }

        def unescape(match):
            octal, continuation, other, bare_eol = match.groups()
            if octal is not None:
                # high-order overflow is ignored
                return bytes([int(octal, 8) & 0xFF])
            if continuation is not None:
                # backslash + end-of-line: the string continues on the next
                # line and neither character is part of it
                return b''
            if other is not None:
                # unknown escapes lose their backslash
                return simple_escapes.get(other, other)
            # an unescaped end-of-line marker always reads as a line feed
            return b'\n'

        decoded = re.sub(
            rb'\\(?:([0-7]{1,3})|(\r\n|\r|\n)|(.))|(\r\n|\r)',
            unescape,
            bytes(raw),
            flags=re.DOTALL)

        return PdfLiteralStringObject(decoded.decode('iso-8859-1'))
Example #12
0
    def __init__(self, f):
        '''Initialize a PdfXRefSubSection from an opened PDF file f.

        The file object's current position should be at the line with two
        numbers, the object number of the first object in this subsection and
        the number of entries, separated by a space.

        Fills self.first_objno, self.entries (all entries in file order),
        self.inuse_entry and self.free_entry.

        Raises Exception, with the file position restored to the start of the
        subsection, on a malformed header or entry.

        Fixes over the previous version:
        * the offset in "invalid entry" messages is the offset of the entry
          itself (it used to be computed after seeking back, so it reported
          the subsection start minus 20);
        * the "tail of the free list links back to object 0" check runs once
          after all entries are read, as its comment describes, instead of
          after every entry, which rejected any free entry whose link was
          not 0;
        * unreachable branches (negative numbers, entry type other than n/f)
          that the regular expressions already exclude are gone.
        '''
        self.inuse_entry = []
        self.free_entry = []
        self.entries = []
        org_pos = f.tell()
        s, eol_marker = utils.read_until(f, syntax.EOL)
        matches = re.match(rb'^\s*(\d+)\s+(\d+)\s*$', s)
        if matches is None:
            f.seek(org_pos, io.SEEK_SET)
            raise Exception(
                f"cross-reference subsection should begin with two numbers at offset {org_pos}"
            )
        self.first_objno = int(matches.group(1))
        count = int(matches.group(2))

        def invalid_entry(entry_pos):
            # restore the position, then report where the bad entry starts
            f.seek(org_pos, io.SEEK_SET)
            raise Exception(
                f"cross-reference subsection contains an invalid entry at offset {entry_pos}"
            )

        f.seek(len(eol_marker), io.SEEK_CUR)
        last_free_pos = f.tell()
        # Each entry is exactly 20 bytes long, including EOL marker.
        for i in range(count):
            entry_pos = f.tell()
            entry = f.read(20)
            current_obj_no = self.first_objno + i
            # nnnnnnnnnn ggggg n/f
            matches = re.match(rb'^(\d{10})\s(\d{5})\s([nf])(?: \r| \n|\r\n)',
                               entry)
            if matches is None:
                invalid_entry(entry_pos)
            first_field = int(matches.group(1))
            gen_no = int(matches.group(2))
            # obj no 0 is always free and has a generation number of 65535
            if current_obj_no == 0 and gen_no != 65535:
                invalid_entry(entry_pos)
            # in-use entry: 1st 10-digit number is byte offset,
            # free entry: 1st 10-digit number is the obj no of the next free object
            if matches.group(3) == b'n':
                record = {
                    'obj_no': current_obj_no,
                    'gen_no': gen_no,
                    'used': True,
                    'offset': first_field
                }
                self.inuse_entry.append(record)
            else:
                # NOTE(review): this requires the free list to be in
                # ascending order within one subsection; the free list may
                # legitimately be ordered differently or span subsections --
                # confirm whether this strictness is intended.
                if (self.free_entry
                        and self.free_entry[-1]['next_free_obj_no'] !=
                        current_obj_no):
                    invalid_entry(entry_pos)
                record = {
                    'obj_no': current_obj_no,
                    'gen_no': gen_no,
                    'used': False,
                    'next_free_obj_no': first_field
                }
                self.free_entry.append(record)
                last_free_pos = entry_pos
            self.entries.append(record)

        # last free entry (the tail of the linked list) links back to obj no 0
        if self.free_entry and self.free_entry[-1]['next_free_obj_no'] != 0:
            invalid_entry(last_free_pos)
Example #13
0
def parse_multipart(content_type, content_length, bodystream, encoding):
	""" Parse the body of a multipart form request as specified
	in rfc1867.

	Ignores case in header-names and supports both '\\n' and '\\r\\n'
	as a separator (but not a mix in the same bodystream).

	A boundary given as a quoted string in the content type header is
	accepted. A body that ends before the first boundary line is complete
	yields empty results instead of failing with an IndexError.

	@param content_type: The content type header of the HTTP request.
	@param content_length: content-length as a long. "content_length"
			bytes are read from the bodystream, unless EOF is reached.
	@param bodystream: A file like object. Must have a read() method
			which supports the "size" parameter.
	@param encoding: The encoding to use when converting parameters
			to unicode. Files are not converted to unicode.

	@return: (files, var) Where both are L{FormParams}. C{files} contains
			L{MultipartFile} instances, and C{var} contains unicode objects.
	@raise FormParseClientError: If the content type has no boundary.
	"""
	files = FormParams()
	var = FormParams()
	bytes_left = content_length
	extra = "--"

	# parse content-type
	try:
		_, boundary = content_type.split("=", 1)
	except ValueError:
		raise FormParseClientError(
			"multipart/form-data POST request without a boundary.")
	# the boundary parameter may be written as a quoted string
	boundary = boundary.strip()
	if len(boundary) >= 2 and boundary[0] == '"' and boundary[-1] == '"':
		boundary = boundary[1:-1]
	boundary = extra + boundary

	# detect separator
	buf = bodystream.read(len(boundary) + 1)
	# count what was actually read; the stream may be shorter than asked for
	bytes_left -= len(buf)
	# slicing (not indexing) so that a truncated body cannot raise IndexError
	if buf[len(boundary):] == "\r":
		sep = "\r\n"
		bytes_left -= len(bodystream.read(1)) # also read \n
	else:
		sep = "\n"

	rest = sep
	while True:

		# extract header
		ostream = StringIO()
		bytes_read, success, rest = read_until(
				bodystream, ostream, sep+sep, before=rest,
				maxread=bytes_left)
		bytes_left -= bytes_read
		if not success:
			break
		headers = Parser().parsestr(
				ostream.getvalue()[len(sep):], headersonly=True)

		# parse content-disposition header
		try:
			x = headers["content-disposition"].split(";")
		except AttributeError:
			continue # ignore blocks without a content-disposition header
		filename = None
		name = None
		for param in x[1:]:
			p = param.split("=", 1)
			if len(p) != 2:
				continue # ignore invalid params
			key = p[0].strip()
			val = p[1][1:-1] # drop the surrounding quotes
			if key == "name":
				name = val
			elif key == "filename":
				filename = val
		if not name:
			break

		# parse body into buffer
		if filename:
			ostream = TemporaryFile()
		else:
			ostream = StringIO()
		bytes_read, success, rest = read_until(
				bodystream, ostream, sep+boundary, before=rest,
				maxread=bytes_left)
		bytes_left -= bytes_read
		if not success:
			break

		# add to correct container
		if filename:
			ostream.seek(0)
			v = MultipartFile(ostream, filename, headers)
			files.add(unicode(name, encoding), v)
		else:
			var.add(
				unicode(name, encoding),
				unicode(ostream.getvalue(), encoding))

	return (files, var)
Example #14
0
def parse_multipart(content_type, content_length, bodystream, encoding):
    """ Parse the body of a multipart form request as specified
    in rfc1867.

    Ignores case in header-names and supports both '\\n' and '\\r\\n'
    as a separator (but not a mix in the same bodystream).

    A boundary given as a quoted string in the content type header is
    accepted. A body that ends before the first boundary line is complete
    yields empty results instead of failing with an IndexError.

    @param content_type: The content type header of the HTTP request.
    @param content_length: content-length as a long. "content_length"
            bytes are read from the bodystream, unless EOF is reached.
    @param bodystream: A file like object. Must have a read() method
            which supports the "size" parameter.
    @param encoding: The encoding to use when converting parameters
            to unicode. Files are not converted to unicode.

    @return: (files, var) Where both are L{FormParams}. C{files} contains
            L{MultipartFile} instances, and C{var} contains unicode objects.
    @raise FormParseClientError: If the content type has no boundary.
    """
    files = FormParams()
    var = FormParams()
    bytes_left = content_length
    extra = "--"

    # parse content-type
    try:
        _, boundary = content_type.split("=", 1)
    except ValueError:
        raise FormParseClientError(
            "multipart/form-data POST request without a boundary.")
    # the boundary parameter may be written as a quoted string
    boundary = boundary.strip()
    if len(boundary) >= 2 and boundary[0] == '"' and boundary[-1] == '"':
        boundary = boundary[1:-1]
    boundary = extra + boundary

    # detect separator
    buf = bodystream.read(len(boundary) + 1)
    # count what was actually read; the stream may be shorter than asked for
    bytes_left -= len(buf)
    # slicing (not indexing) so that a truncated body cannot raise IndexError
    if buf[len(boundary):] == "\r":
        sep = "\r\n"
        bytes_left -= len(bodystream.read(1))  # also read \n
    else:
        sep = "\n"

    rest = sep
    while True:

        # extract header
        ostream = StringIO()
        bytes_read, success, rest = read_until(bodystream,
                                               ostream,
                                               sep + sep,
                                               before=rest,
                                               maxread=bytes_left)
        bytes_left -= bytes_read
        if not success:
            break
        headers = Parser().parsestr(ostream.getvalue()[len(sep):],
                                    headersonly=True)

        # parse content-disposition header
        try:
            x = headers["content-disposition"].split(";")
        except AttributeError:
            continue  # ignore blocks without a content-disposition header
        filename = None
        name = None
        for param in x[1:]:
            p = param.split("=", 1)
            if len(p) != 2:
                continue  # ignore invalid params
            key = p[0].strip()
            val = p[1][1:-1]  # drop the surrounding quotes
            if key == "name":
                name = val
            elif key == "filename":
                filename = val
        if not name:
            break

        # parse body into buffer
        if filename:
            ostream = TemporaryFile()
        else:
            ostream = StringIO()
        bytes_read, success, rest = read_until(bodystream,
                                               ostream,
                                               sep + boundary,
                                               before=rest,
                                               maxread=bytes_left)
        bytes_left -= bytes_read
        if not success:
            break

        # add to correct container
        if filename:
            ostream.seek(0)
            v = MultipartFile(ostream, filename, headers)
            files.add(unicode(name, encoding), v)
        else:
            var.add(unicode(name, encoding),
                    unicode(ostream.getvalue(), encoding))

    return (files, var)
Example #15
0
    def create_from_file(cls, f: io.BufferedReader, doc):
        """Parse an indirect object ("N G obj ... endobj") from f.

        The header (object number, generation number, keyword obj) is read
        first, then the contained object. If that object is directly followed
        by endobj it becomes the value; otherwise a stream keyword followed by
        a line feed (or CR LF) must follow, and the content is re-parsed from
        its start as a PdfStreamObject that must itself be followed by endobj.

        Returns PdfIndirectObject(value, N, G). Every malformed case restores
        the file position and raises Exception.
        """
        org_pos = f.tell()

        def reject():
            f.seek(org_pos, io.SEEK_SET)
            raise Exception(
                f'Parse Error: Not a valid indirect object at offset {org_pos}.'
            )

        def reads_endobj():
            # consumes the next token; an empty end marker counts as failure
            word, terminator = utils.read_until(
                f, syntax.DELIMS + syntax.WHITESPACES, maxsize=7)
            return word == b'endobj' and terminator != b''

        # header: object number, generation number, keyword
        ids = []
        for _ in range(2):
            digits, _end = utils.read_until(f, syntax.WHITESPACES)
            if re.match(rb'\d+$', digits) is None:
                reject()
            ids.append(int(digits))
            utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
        obj_no, gen_no = ids
        keyword, _end = utils.read_until(f,
                                         syntax.DELIMS + syntax.WHITESPACES)
        if keyword != b'obj':
            reject()

        utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
        inner_content_pos = f.tell()

        # parse inner object
        inner_obj = PdfObject.create_from_file(f, doc)
        utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)

        # look ahead for endobj without committing to it
        after_inner = f.tell()
        utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
        closed = reads_endobj()
        f.seek(after_inner, io.SEEK_SET)

        if closed:
            f.seek(6, io.SEEK_CUR)  # step over 'endobj'
            return PdfIndirectObject(inner_obj, obj_no, gen_no)

        # not closed yet: only a stream extent may follow the inner object
        has_stream = (utils.peek_at_least(f, 7)[0:7] == b'stream\n'
                      or utils.peek_at_least(f, 8)[0:8] == b'stream\r\n')
        if not has_stream:
            reject()

        # re-read the dictionary together with its stream data
        f.seek(inner_content_pos, io.SEEK_SET)
        stream_obj = PdfStreamObject.create_from_file(f, doc)
        utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)
        if not reads_endobj():
            reject()
        #if streamObj.dict.get('Type') == 'ObjStm': # Object Stream, decode and parse the content

        return PdfIndirectObject(stream_obj, obj_no, gen_no)
Example #16
0
    def create_from_file(cls, f: io.BufferedReader, doc):
        """Parse a stream object (dictionary, ``stream`` data, ``endstream``) from *f*.

        The reader must be positioned at the start of the stream dictionary.
        Exactly ``/Length`` bytes of raw (still encoded) data are read; any
        decoding is left to the ``PdfStreamObject`` constructor.

        Args:
            f: Binary reader positioned at the stream dictionary.
            doc: Owning document, forwarded to ``PdfObject.create_from_file``.

        Returns:
            A ``PdfStreamObject`` built from the parsed dictionary and the raw
            stream bytes.

        Raises:
            Exception: If the bytes at the starting offset do not form a valid
                stream object. The reader is rewound to the starting offset
                before the exception is raised.
        """
        org_pos = f.tell()

        def invalid_stream():
            # Rewind on every failure so callers always find the reader where
            # parsing started, whichever check rejected the input.
            f.seek(org_pos, io.SEEK_SET)
            return Exception(
                f'Parse Error: Not a valid stream object at offset {org_pos}.')

        stream_dict = PdfObject.create_from_file(f, doc)
        utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=True)

        # The keyword must be followed by exactly one EOL (LF or CRLF); the
        # stream data starts immediately after it. Whitespace or '%' bytes
        # beyond that single EOL belong to the data and must not be skipped.
        keyword_pos = f.tell()
        if utils.peek_at_least(f, 7)[0:7] == b'stream\n':
            data_pos = keyword_pos + 7
        elif utils.peek_at_least(f, 8)[0:8] == b'stream\r\n':
            data_pos = keyword_pos + 8
        else:
            raise invalid_stream()

        # check if dict has the required key /Length with valid values
        if not isinstance(stream_dict, PdfDictionaryObject):
            raise invalid_stream()
        if stream_dict.get('Length') is None:
            raise invalid_stream()

        length = stream_dict['Length']
        if isinstance(length, PdfReferenceObject):
            # NOTE(review): assumes the referenced object is already
            # resolvable at this point and exposes ``.value`` - confirm.
            size = length.deref().value
        elif isinstance(length, PdfNumericObject):
            size = length.value
        else:
            raise invalid_stream()
        # /Length must be a non-negative integer; zero denotes an empty stream.
        numerator, denominator = size.as_integer_ratio()
        if numerator < 0 or denominator != 1:
            raise invalid_stream()
        size = numerator

        # check for filters
        if stream_dict.get('Filter') is not None:
            # TODO: Remove the assumption that Filter value is always a direct obj
            filt = stream_dict['Filter']
            if isinstance(filt, PdfArrayObject):
                # every element of a filter array must be a Name; an empty
                # array simply means that no filter is applied
                if any(not isinstance(x, PdfNameObject) for x in filt.value):
                    raise invalid_stream()
            elif not isinstance(filt, PdfNameObject):
                # a single /Filter value must specify a Name
                raise invalid_stream()

        # read only /Length bytes
        # filter implementation is responsible for checking if the data length is correct
        # e.g. if any needed end-of-data marker is present at only the end
        # Seek explicitly so the read starts at the first data byte regardless
        # of where resolving /Length left the reader.
        f.seek(data_pos, io.SEEK_SET)
        raw = f.read(size)

        # check if stream ends with b'endstream', optionally preceded by b'\r', b'\n' or b'\r\n'
        if utils.peek_at_least(f, 2)[0:2] == b'\r\n':
            f.seek(2, io.SEEK_CUR)
        elif utils.peek_at_least(f, 1)[0:1] in (b'\r', b'\n'):
            f.seek(1, io.SEEK_CUR)
        token, _ = utils.read_until(f, syntax.DELIMS + syntax.WHITESPACES)
        if token != b'endstream':
            raise invalid_stream()

        # actual decoding is done in constructor
        return PdfStreamObject(stream_dict, raw)
Example #17
0
    def parse_linear(self, f, progress_cb=None):
        '''Initialize a PdfDocument by scanning an opened PDF file f from the beginning.

        The header line is validated first, then the file is read front to
        back. Each ``startxref``, ``xref``, ``trailer`` and ``%%EOF`` marker
        is recorded on the current entry of ``self.increments``; anything else
        is parsed as an object and appended to that entry's ``body``. A new
        increment entry is started when an object follows a ``%%EOF`` marker.
        Finally every collected object stream is decoded into
        ``self.compressed_obj``.

        Args:
            f: Opened binary PDF file; must support ``fileno()``.
            progress_cb: Optional callable invoked as
                ``progress_cb(message, read=<offset>, total=<file size>)``.

        Raises:
            Exception: If the first line is not a ``%PDF-x.y`` header.
        '''
        def print_progress():
            # An empty file has size 0: report 0% rather than dividing by
            # zero, so the header check below can reject the file cleanly.
            percent = f.tell() / filesize * 100 if filesize else 0.0
            message = f'{percent:5.2f}% processed'
            print('', end="\r")
            print(message, end='', flush=True)
            if progress_cb is not None:
                progress_cb(message, read=f.tell(), total=filesize)

        f.seek(0, io.SEEK_SET)
        filesize = os.fstat(f.fileno()).st_size

        print_progress()

        # First line is header
        s, eol_marker = utils.read_until(f, syntax.EOL)
        header = re.match(rb'%PDF-(\d+\.\d+)', s)
        if not header:
            raise Exception('Not a PDF file')
        self.version = Decimal(header.group(1).decode('iso-8859-1'))
        # eol_marker may be None when the line ends at EOF
        f.seek(len(eol_marker or b''), io.SEEK_CUR)

        while True:
            utils.seek_until(f, syntax.NON_WHITESPACES, ignore_comment=False)
            if f.tell() >= filesize:
                break
            org_pos = f.tell()
            s, eol_marker = utils.read_until(f, syntax.EOL)
            if s == b'startxref':  # the last startxref always override the ones before
                utils.seek_until(f,
                                 syntax.NON_WHITESPACES,
                                 ignore_comment=True)
                t, _ = utils.read_until(f, syntax.EOL)
                self.startxref = int(t)
                self.increments[-1]['startxref'] = self.startxref
                continue
            elif s == b'xref':
                # rewind over the keyword so the section parser sees it
                f.seek(-4, io.SEEK_CUR)
                self.increments[-1]['xref_section'] = PdfXRefSection(f)
                self.offset_xref[org_pos] = self.increments[-1]['xref_section']
                continue
            elif s == b'trailer':
                utils.seek_until(f,
                                 syntax.NON_WHITESPACES,
                                 ignore_comment=True)
                self.increments[-1][
                    'trailer'] = PdfDictionaryObject.create_from_file(f, self)
                continue
            elif s == b'%%EOF':
                # TODO: check if trailer dict immediately precedes %%EOF
                # since we are seeking until non-ws, the only case EOF marker
                # does not appear by itself it when it is preceded by some
                # whitespaces, which should be ignored
                self.increments[-1]['eof'] = True
                # read_until already consumed the marker itself, so only the
                # EOL (if any) remains to be skipped; skipping more would eat
                # the first bytes of a following incremental update.
                f.seek(len(eol_marker or b''), io.SEEK_CUR)
                continue
            elif s[0:1] == b'%':
                # otherwise, it is a comment, ignore the whole remaining line
                utils.seek_until(f, syntax.EOL)
                continue

            # Anything else is an object: rewind to its first byte and parse.
            f.seek(org_pos, io.SEEK_SET)
            if self.increments[-1]['eof']:
                # content after %%EOF starts a new incremental update
                self.increments.append({
                    'body': [],
                    'xref_section': None,
                    'trailer': None,
                    'startxref': None,
                    'eof': False
                })

            # TODO: how to handle object parse error?
            new_obj = PdfObject.create_from_file(f, self)
            self.increments[-1]['body'].append(new_obj)
            self.offset_obj[org_pos] = new_obj
            if isinstance(new_obj.value, PdfStreamObject
                          ) and new_obj.value.dict.get('Type') == 'ObjStm':
                self.offset_obj_streams[org_pos] = new_obj
            print_progress()

        print('', end="\r")
        print('100% processed    ')
        if progress_cb is not None:
            progress_cb('100% processed', read=f.tell(), total=filesize)
        self.ready = True

        print('Decoding object streams...')
        if progress_cb is not None:
            progress_cb('Decoding object streams...',
                        read=f.tell(),
                        total=filesize)
        if self.offset_obj_streams:
            # imported lazily, and only when there is something to decode
            from objstm import decode_objstm
            for objstm_obj in self.offset_obj_streams.values():
                self.compressed_obj = {
                    **(self.compressed_obj),
                    **(decode_objstm(objstm_obj, self))
                }
        print('Done')
        if progress_cb is not None:
            progress_cb('Done', read=f.tell(), total=filesize)
Example #18
0
 def authenticate_user(self, user, password):
     """Open a tcsh session for *user* and store its descriptor on the model.

     An empty *user* means the current user (``self.myuid``).  The visible
     code drives a shell through the descriptor ``fd``: it answers a
     password prompt if one appears, optionally runs ``utils.KINIT``,
     changes into ``utils.TMP_DIR`` and finally stores ``fd`` in
     ``self.model.userIO``.

     NOTE(review): this copy of the function is damaged - part of the body
     is missing (see the note further down), so the code that creates
     ``fd`` is not visible here.
     """
     if user == "":
         user = self.myuid
     # True when the session runs as somebody other than the current user.
     need_su = self.myuid != user
     if not utils.check_ssh(user, self.hutch):
         # Drop any previously stored descriptor; close errors are ignored.
         if self.model.userIO != None:
             try:
                 os.close(self.model.userIO)
             except:
                 pass
         self.model.userIO = None
         # NOTE(review): the next line looks truncated/redacted in this copy
         # (the "******" run).  The statements that should follow it - and
         # the `if`/`try` that the `else:`/`except:` below belong to, as
         # well as the assignment of `fd` - are not present.
         self.ui.userLabel.setText("User: "******".")[0]:
                     os.execv("/usr/bin/su",
                              ["su", user, "-c", "/bin/tcsh -if"])
                 else:
                     os.execv("/usr/bin/ssh", [
                         "ssh", user + "@" + utils.COMMITHOST, "/bin/tcsh",
                         "-if"
                     ])
             else:
                 # Local tcsh when this host is utils.COMMITHOST, otherwise
                 # ssh to it.  "C"/"D" look like debug markers.
                 if utils.COMMITHOST == socket.gethostname().split(".")[0]:
                     print "C"
                     os.execv("/bin/tcsh", ["tcsh", "-if"])
                 else:
                     print "D"
                     os.execv("/usr/bin/ssh",
                              ["ssh", utils.COMMITHOST, "/bin/tcsh", "-if"])
         except:
             pass
         # os.execv does not return on success, so reaching this point means
         # the exec failed.  Presumably this runs in a child process whose
         # setup was lost in the truncation above - TODO confirm.
         print "Say what?  execv failed?"
         sys.exit(0)
     # Wait for either a password prompt or the shell prompt.
     l = utils.read_until(fd, "(assword:|> )").group(1)
     if l != "> ":
         # Got a password prompt: answer it, then wait for the shell prompt.
         os.write(fd, password + "\n")
         l = utils.read_until(fd, "> ")
     if utils.KINIT != None and password != "":
         # Run the configured kinit command and feed it the same password.
         os.write(fd, utils.KINIT + "\n")
         l = utils.read_until(fd, ": ")
         os.write(fd, password + "\n")
         l = utils.read_until(fd, "> ")
     #
     # Sigh.  Someone once had a file named time.py in their home
     # directory.  So let's go somewhere where we know the files.
     #
     os.write(fd, "cd %s\n" % utils.TMP_DIR)
     l = utils.read_until(fd, "> ")
     self.model.user = user
     # Replace any previously stored descriptor with the new session.
     if self.model.userIO != None:
         try:
             os.close(self.model.userIO)
         except:
             pass
     self.model.userIO = fd
     if need_su:
         self.utimer.start(10 * 60000)  # Let's go for 10 minutes.
     self.ui.userLabel.setText("User: " + user)
Example #19
0
 def read_until(self, pattern, size=None):
     """Call the module-level ``read_until`` on this object's buffer.

     ``pattern`` and ``size`` are forwarded unchanged, and the helper's
     result is returned as-is.
     """
     source = self.buffer
     return read_until(source, pattern, size)