def parse(self, lines):
    '''
    Parse signature file lines into signature entries.

    Comments (everything from the first '#' onward) and surrounding white
    space are removed from each line; blank lines and lines starting with
    '!' are skipped. A level 0 line starts a new signature, any other line
    is attached to the signature currently being built. Finished signatures
    are appended to self.signatures unless self._filtered() rejects them,
    and self.signatures is finally sorted by confidence, largest first.

    @lines - A list of lines from a signature file.

    Raises a ParserException if a non-level-0 line is found while no
    signature is being built.

    Returns None.
    '''
    current = None

    for raw_line in lines:
        # Drop any trailing comment and the surrounding white space
        text = raw_line.split('#', 1)[0].strip()

        # Nothing left, or an unsupported '!mime' style entry
        if not text or text[0] == '!':
            continue

        entry = SignatureLine(text)

        if entry.level == 0:
            # A new entry begins: flush the one built so far, unless its
            # title is rejected by the user-defined filter rules.
            if current and not self._filtered(current.title):
                self.signatures.append(current)

            # The current size of self.signatures doubles as the unique ID
            # of the new signature.
            current = Signature(len(self.signatures), entry)
            continue

        # A continuation line without a signature to attach it to means
        # the signature file is malformed.
        if not current:
            raise ParserException("Invalid signature line: '%s'" % text)

        current.lines.append(entry)

    # Flush the last signature of the file.
    # NOTE(review): the in-loop check filters on signature.title while this
    # one filters on signature.lines[0].format - confirm both refer to the
    # same text.
    if current and not self._filtered(current.lines[0].format):
        self.signatures.append(current)

    # Highest confidence (aka, longest magic bytes) first
    self.signatures.sort(key=lambda sig: sig.confidence, reverse=True)
def _analyze(self, signature, offset):
    '''
    Analyzes self.data for the specified signature data at the specified offset.

    Each line of the signature is applied in order. A line is only evaluated
    if its indentation level does not exceed the current maximum level, which
    grows by one each time a line matches. Matching lines contribute their
    formatted text to the description and their tags to the result.

    @signature - The signature to apply to the data.
    @offset    - The offset in self.data to apply the signature to.

    Raises a ParserException if a line offset cannot be evaluated to an
    integer, or if applying a line's operator to the data fails.

    Returns a dictionary of tags parsed from the data. It always contains
    'id', 'offset', 'invalid', 'once' and 'description'; 'display' is set to
    False when the description is empty; any tags declared by matching
    signature lines are added as well.
    '''
    description = []
    max_line_level = 0
    previous_line_end = 0
    tags = {
        'id': signature.id,
        'offset': offset,
        'invalid': False,
        'once': False
    }
    lines = signature.lines

    # Apply each line of the signature to self.data, starting at the
    # specified offset
    for n, line in enumerate(lines):
        # Ignore indentation levels above the current max indent level
        if line.level > max_line_level:
            continue

        # If the relative offset of this signature line is just an
        # integer value, use it
        if isinstance(line.offset, int):
            line_offset = line.offset
        # Else, evaluate the complex expression
        else:
            # Format the previous_line_end value into a string. Add the '+' sign to explicitly
            # state that this value is to be added to any subsequent values in the expression
            # (e.g., '&0' becomes '4+0').
            ple = '%d+' % previous_line_end
            # Allow users to use either the '&0' (libmagic) or '&+0' (explicit addition)
            # syntaxes; replace both with the ple text.
            line_offset_text = line.offset.replace('&+', ple).replace('&', ple)
            # Evaluate the expression
            line_offset = self._do_math(offset, line_offset_text)

        # Sanity check
        if not isinstance(line_offset, int):
            raise ParserException(
                "Failed to convert offset '%s' to a number: '%s'" %
                (line.offset, line.text))

        # The start of the data needed by this line is at offset + line_offset.
        # The end of the data will be line.size bytes later.
        start = offset + line_offset
        end = start + line.size

        # If the line has a packed format string, unpack it
        if line.pkfmt:
            try:
                dvalue = struct.unpack(
                    line.pkfmt,
                    binwalk.core.compat.str2bytes(self.data[start:end]))[0]
            # Not enough bytes left in self.data for the specified
            # format size
            except struct.error:
                dvalue = 0
        # Else, this is a string. Wildcard strings have line.value == None
        elif line.value is None:
            # Check to see if this is a string whose size is known and has been
            # specified on a previous signature line.
            if (binwalk.core.compat.has_key(tags, 'strlen') and
                    binwalk.core.compat.has_key(line.tags, 'string')):
                dvalue = self.data[start:(start + tags['strlen'])]
            # Else, just terminate the string at the first newline,
            # carriage return, or NULL byte
            else:
                dvalue = self.data[start:end].split(
                    '\x00')[0].split('\r')[0].split('\n')[0]
        # Non-wildcard strings have a known length, specified in
        # the signature line
        else:
            dvalue = self.data[start:end]

        # Some integer values have special operations that need to be performed on them
        # before comparison (e.g., "belong&0x0000FFFF"). Complex math expressions are
        # supported here as well.
        if line.operator:
            try:
                # If the operator value of this signature line is just
                # an integer value, use it.
                # NOTE(review): 'long' is not a Python 3 builtin; it is assumed to be
                # provided by the module (e.g. a compat import) - confirm. The
                # short-circuit 'or' is kept so that plain ints never touch it.
                if isinstance(line.opvalue, int) or isinstance(line.opvalue, long):
                    opval = line.opvalue
                # Else, evaluate the complex expression
                else:
                    opval = self._do_math(offset, line.opvalue)

                # Perform the specified operation
                if line.operator == '&':
                    dvalue &= opval
                elif line.operator == '|':
                    dvalue |= opval
                elif line.operator == '*':
                    dvalue *= opval
                elif line.operator == '+':
                    dvalue += opval
                elif line.operator == '-':
                    dvalue -= opval
                elif line.operator == '/':
                    # Floor division keeps the value an integer on Python 3 as well
                    # (identical to '/' on Python 2 ints). True division would yield
                    # a float, losing precision on 64-bit values and breaking the
                    # bitwise comparisons below.
                    dvalue //= opval
                elif line.operator == '~':
                    # NOTE(review): this discards the data value and uses the inverted
                    # operator value instead - confirm this is intended.
                    dvalue = ~opval
                elif line.operator == '^':
                    dvalue ^= opval
            except KeyboardInterrupt:
                raise
            except Exception as e:
                raise ParserException("Operation '" + str(dvalue) + " " +
                                      str(line.operator) + "= " +
                                      str(line.opvalue) + "' failed: " + str(e))

        # Does the data (dvalue) match the specified comparison?
        matched = (
            (line.value is None) or
            (line.regex and line.value.match(dvalue)) or
            (line.condition == '=' and dvalue == line.value) or
            (line.condition == '>' and dvalue > line.value) or
            (line.condition == '<' and dvalue < line.value) or
            (line.condition == '!' and dvalue != line.value) or
            (line.condition == '~' and (dvalue == ~line.value)) or
            (line.condition == '^' and (dvalue ^ line.value)) or
            (line.condition == '&' and (dvalue & line.value)) or
            (line.condition == '|' and (dvalue | line.value))
        )

        if not matched:
            # No match on the first line, abort
            if line.level == 0:
                break
            # If this line did not satisfy its comparison, then higher
            # indentation levels will not be accepted.
            max_line_level = line.level
            continue

        # Up until this point, date fields are treated as integer values,
        # but we want to display them as nicely formatted strings.
        if line.type == 'date':
            try:
                ts = datetime.datetime.utcfromtimestamp(dvalue)
                dvalue = ts.strftime("%Y-%m-%d %H:%M:%S")
            except KeyboardInterrupt:
                raise
            except Exception:
                dvalue = "invalid timestamp"

        # The same value is substituted for every format specifier found
        # in the format string.
        dvalue_tuple = (dvalue,) * sum(
            1 for _ in self.fmtstr.finditer(line.format))

        # Format the description string
        desc = line.format % dvalue_tuple

        # If there was any description string, append it to the
        # list of description string parts
        if desc:
            description.append(desc)

        # Process tag keywords specified in the signature line. These have already been
        # parsed out of the original format string so that they can be processed
        # separately from the printed description string.
        for (tag_name, tag_value) in binwalk.core.compat.iterator(line.tags):
            # If the tag value is a string, try to format it
            if isinstance(tag_value, str):
                tag_args = (dvalue,) * sum(
                    1 for _ in self.fmtstr.finditer(tag_value))
                tags[tag_name] = tag_value % tag_args
            # Else, just use the raw tag value
            else:
                tags[tag_name] = tag_value

            # Some tag values are intended to be integer values, so
            # try to convert them as such; anything that is not an integer
            # string is deliberately left untouched.
            try:
                tags[tag_name] = int(tags[tag_name], 0)
            except KeyboardInterrupt:
                raise
            except Exception:
                pass

        # Abort processing soon as this signature is marked invalid, unless invalid results
        # were explicitly requested. This means that the sooner invalid checks are made in a
        # given signature, the faster the scan can filter out false
        # positives.
        if not self.show_invalid and tags['invalid']:
            break

        # Look ahead to the next line in the signature; if its indent level is greater than
        # that of the current line, then track the end of data for the current line. This is
        # so that subsequent lines can use the '>>&0' offset syntax to specify relative
        # offsets from previous lines.
        if n + 1 < len(lines) and lines[n + 1].level > line.level:
            if line.type == 'string':
                previous_line_end = line_offset + len(dvalue)
            else:
                previous_line_end = line_offset + line.size

        # This line satisfied its comparison, +1 the max indentation level
        max_line_level = line.level + 1

    # Join the formatted description strings and remove backspace
    # characters (plus the preceding character as well)
    tags['description'] = self.bspace.sub('', " ".join(description))

    # This should never happen
    if not tags['description']:
        tags['display'] = False
        tags['invalid'] = True

    # If the formatted string contains non-printable characters, consider
    # it invalid
    if self.printable.match(tags['description']).group() != tags['description']:
        tags['invalid'] = True

    return tags
def __init__(self, line):
    '''
    Class constructor. Responsible for parsing a line from a signature file.

    A signature line consists of three or four white space separated fields:

        [offset] [data type] [comparison value] [format string]
        0        belong      0x12345678         Foo file type,
        >4       string      x                  file name: %s,

    The parsed results are stored on the instance: text, level, offset, type,
    operator, opvalue, signed, endianness, condition, value, regex, fmt,
    size, pkfmt, format and tags.

    @line - A line of text from the signature file.

    Raises a ParserException if the line is malformed: wrong number of
    fields, unknown data type, bad value or regular expression, or a
    wildcard value on a level 0 line.

    Returns None.
    '''
    self.tags = {}
    self.text = line
    self.regex = False

    # Split the line on any white space; for this to work, backslash-escaped
    # spaces ('\ ') are replaced with their escaped hex value ('\x20').
    parts = line.replace('\\ ', '\\x20').split(None, 3)

    # Sanity check on the split line
    if len(parts) not in [3, 4]:
        raise ParserException("Invalid signature line: '%s'" % line)

    # The indentation level is determined by the number of '>' characters at
    # the beginning of the signature line.
    self.level = parts[0].count('>')

    # Get rid of the indentation characters and try to convert the remaining
    # characters to an integer offset. This will fail if the offset is a complex
    # value (e.g., '(4.l+16)'), in which case it is kept as text.
    self.offset = parts[0].replace('>', '')
    try:
        self.offset = int(self.offset, 0)
    except ValueError:
        pass

    # self.type is the specified data type ('belong', 'string', etc)
    self.type = parts[1]
    self.opvalue = None
    self.operator = None

    # Each data type can specify an additional operation to be performed on the
    # data being scanned before performing a comparison (e.g., 'belong&0xFF' will
    # AND the data with 0xFF before the comparison is performed).
    #
    # We support the following operators:
    for operator in ['&', '|', '*', '+', '-', '/', '~', '^']:
        # Look for each operator in self.type
        if operator in self.type:
            # If found, split self.type into the type and operator value
            (self.type, self.opvalue) = self.type.split(operator, 1)

            # Keep a record of the specified operator
            self.operator = operator

            # Try to convert the operator value into an integer. This works for
            # simple operator values, but not for complex types (e.g.,
            # '(4.l+12)'), which are kept as text.
            try:
                self.opvalue = int(self.opvalue, 0)
            except ValueError:
                pass

            # Only one operator type is supported, so break as soon as one
            # is found
            break

    # If the specified type starts with 'u' (e.g., 'ubelong'), then it is
    # unsigned; else, it is signed. startswith() is used so that an empty
    # type (e.g. a type field of '&0xFF') ends up as a ParserException
    # below and not as an IndexError here.
    if self.type.startswith('u'):
        self.signed = False
        self.type = self.type[1:]
    else:
        self.signed = True

    # Big endian values start with 'be' ('belong'), little endian values start
    # with 'le' ('lelong'). The struct module uses '>' to denote big endian and
    # '<' to denote little endian.
    if self.type.startswith('be'):
        self.type = self.type[2:]
        self.endianness = '>'
    elif self.type.startswith('le'):
        self.endianness = '<'
        self.type = self.type[2:]
    # Assume big endian if no endianness was explicitly specified
    else:
        self.endianness = '>'

    # Check the comparison value for the type of comparison to be performed (e.g.,
    # '=0x1234', '>0x1234', etc). If no operator is specified, '=' is implied.
    if parts[2][0] in ['=', '!', '>', '<', '&', '|', '^', '~']:
        self.condition = parts[2][0]
        self.value = parts[2][1:]
    else:
        self.condition = '='
        self.value = parts[2]

    # If this is a wildcard value, explicitly set self.value to None
    if self.value == 'x':
        self.value = None
    # String values need to be decoded, as they may contain escape
    # characters (e.g., '\x20')
    elif self.type == 'string':
        # String types support multiplication to easily match large
        # repeating byte sequences
        if '*' in self.value:
            try:
                p = self.value.split('*')
                self.value = p[0]
                for n in p[1:]:
                    self.value *= int(n, 0)
            except KeyboardInterrupt:
                raise
            except Exception:
                raise ParserException(
                    "Failed to expand string '%s' with integer '%s' in line '%s'" %
                    (self.value, n, line))
        try:
            self.value = binwalk.core.compat.string_decode(self.value)
        except ValueError:
            raise ParserException(
                "Failed to decode string value '%s' in line '%s'" %
                (self.value, line))
    # If a regex was specified, compile it
    elif self.type == 'regex':
        self.regex = True

        try:
            self.value = re.compile(self.value)
        except KeyboardInterrupt:
            raise
        except Exception as e:
            raise ParserException("Invalid regular expression '%s': %s" %
                                  (self.value, str(e)))
    # Non-string types are integer values
    else:
        try:
            self.value = int(self.value, 0)
        except ValueError:
            raise ParserException(
                "Failed to convert value '%s' to an integer on line '%s'" %
                (self.value, line))

    # Sanity check to make sure the first line of a signature has an
    # explicit value
    if self.level == 0 and self.value is None:
        raise ParserException(
            "First element of a signature must specify a non-wildcard value: '%s'" %
            (line))

    # Set the size and struct format value for the specified data type.
    # This must be done, obviously, after the value has been parsed out
    # above.
    #
    # Struct format character and size in bytes of each packed data type.
    packed_types = {
        'byte': ('b', 1),
        'short': ('h', 2),
        'quad': ('q', 8),
        'long': ('i', 4),
        'date': ('i', 4),
    }

    if self.type == 'string':
        # Strings don't have a struct format value, since they don't have
        # to be unpacked
        self.fmt = None

        # If a string type has a specific value, set the comparison size to
        # the length of that string
        if self.value:
            self.size = len(self.value)
        # Else, truncate the string to self.MAX_STRING_SIZE
        else:
            self.size = self.MAX_STRING_SIZE
    elif self.type == 'regex':
        # Regular expressions don't have a struct format value, since they
        # don't have to be unpacked
        self.fmt = None
        # The size of a matching regex is unknown until it is applied to
        # some data
        self.size = self.MAX_STRING_SIZE
    elif self.type in packed_types:
        (self.fmt, self.size) = packed_types[self.type]
    else:
        raise ParserException("Unknown data type '%s' in line '%s'" %
                              (self.type, line))

    # The struct module uses the same characters for specifying signed and unsigned
    # data types, except that unsigned data types are upper case. The code above sets
    # self.fmt to the lower case (signed) values. String and regex types have no
    # struct format (self.fmt is None), so there is nothing to convert for them.
    if self.fmt and not self.signed:
        self.fmt = self.fmt.upper()

    # If a struct format was identified, create a format string to be passed to
    # struct.unpack which specifies the endianness and data type format.
    if self.fmt:
        self.pkfmt = '%c%c' % (self.endianness, self.fmt)
    else:
        self.pkfmt = None

    # Check if a format string was specified (this is optional)
    if len(parts) == 4:
        # %lld formats are only supported if Python was built with HAVE_LONG_LONG
        self.format = parts[3].replace('%ll', '%l')

        # Regex to parse out tags, which are contained within curly braces
        retag = re.compile(r'\{.*?\}')

        # Parse out tag keywords from the format string
        for match in retag.finditer(self.format):
            # Get rid of the curly braces.
            tag = match.group().replace('{', '').replace('}', '')

            # If the tag specifies a value, it will be colon delimited
            # (e.g., '{name:%s}'); a tag without a value is stored as True.
            if ':' in tag:
                (n, v) = tag.split(':', 1)
            else:
                n = tag
                v = True

            # Record the tag name and its value in self.tags
            self.tags[n] = v

        # Remove all tags from the printable format string
        self.format = retag.sub('', self.format).strip()
    else:
        self.format = ""