Python get_extension Examples, repo_scraper.filetype.get_extension Python Examples

Example #1

0

Show file

File: DiffChecker.py Project: digideskio/repo-scraper

    def check(self):
        """Run the diff checks and return a Result describing the outcome.

        The returned Result carries:
          * self.error, when that attribute is truthy;
          * FILETYPE_NOT_ALLOWED, when the extension of self.filename is
            not in self.allowed_extensions;
          * MATCH (with the matches) or NOT_MATCH otherwise.

        Side effect: self.content is replaced by the second value returned
        from m.base64_matcher(..., remove=True) -- presumably the content
        with base64 stripped; confirm against the matcher module.
        """
        #Label used in every Result except the error one: the filename
        #followed by the second entry of commit_hashes
        label = '%s (%s)' % (self.filename, self.commit_hashes[1])

        #A recorded error short-circuits the rest; note it is reported
        #against the bare filename, not the label
        if self.error:
            return Result(self.filename, self.error)

        #Only files whose extension is whitelisted get scanned
        extension = filetype.get_extension(self.filename)
        if extension not in self.allowed_extensions:
            return Result(label, FILETYPE_NOT_ALLOWED)

        #Strip base64 before matching and remember that we did so
        found_base64, self.content = m.base64_matcher(self.content, remove=True)
        notes = ['BASE64_REMOVED'] if found_base64 else []

        #Password, IP and amazonaws.com matchers are applied together
        aws_matcher = m.create_domain_matcher('amazonaws.com')
        hit, hits = m.multi_matcher(self.content, m.password_matcher, m.ip_matcher, aws_matcher)

        if not hit:
            return Result(label, NOT_MATCH, comments=notes)
        return Result(label, MATCH, matches=hits, comments=notes)

Example #2

0

Show file

File: DiffChecker.py Project: pombredanne/repo-scraper

    def check(self):
        """Check the diff content for passwords and return a Result.

        The returned Result carries:
          * self.error, when that attribute is truthy;
          * FILETYPE_NOT_ALLOWED, when the extension of self.filename is
            not in self.allowed_extensions;
          * MATCH (with the password matches) or NOT_MATCH otherwise.

        Side effect: self.content is replaced by the second value returned
        from matchers.base64_matcher(..., remove=True).
        """
        #Label naming the file and both commit hashes; used in every
        #Result except the error one
        diff_label = '%s from commit %s to commit %s' % (
            self.filename,
            self.commit_hashes[0],
            self.commit_hashes[1],
        )

        #An error recorded on the checker is reported as-is, against the
        #bare filename
        if self.error:
            return Result(self.filename, self.error)

        #Skip files whose extension is not whitelisted
        extension = filetype.get_extension(self.filename)
        if extension not in self.allowed_extensions:
            return Result(diff_label, FILETYPE_NOT_ALLOWED)

        #Remove base64 first and annotate the result when that happened
        found_base64, self.content = matchers.base64_matcher(self.content, remove=True)
        notes = ['BASE64_REMOVED'] if found_base64 else []

        #Passwords are the only rule applied in this version
        found_password, hits = matchers.password_matcher(self.content)

        if not found_password:
            return Result(diff_label, NOT_MATCH, comments=notes)
        return Result(diff_label, MATCH, matches=hits, comments=notes)

Example #3

0

Show file

File: DiffChecker.py Project: yinny/repo-scraper

    def check(self):
        """Apply the matchers to this diff and wrap the verdict in a Result.

        Outcomes, in the order they are tested:
          1. self.error is truthy -> Result(self.filename, self.error)
          2. extension not in self.allowed_extensions -> FILETYPE_NOT_ALLOWED
          3. any matcher fires -> MATCH, with the matches attached
          4. otherwise -> NOT_MATCH

        Side effect: self.content is overwritten with the second value
        returned from m.base64_matcher(..., remove=True).
        """
        #Filename plus the second commit hash identifies this diff
        tag = '%s (%s)' % (self.filename, self.commit_hashes[1])

        #Errors are reported against the plain filename, not the tag
        if self.error:
            return Result(self.filename, self.error)

        file_ext = filetype.get_extension(self.filename)
        if file_ext not in self.allowed_extensions:
            return Result(tag, FILETYPE_NOT_ALLOWED)

        #Annotations travel with the Result; currently only base64 removal
        #is recorded
        annotations = []
        base64_seen, self.content = m.base64_matcher(self.content, remove=True)
        if base64_seen:
            annotations.append('BASE64_REMOVED')

        #Run password, IP and amazonaws.com matchers in one pass
        domain_matcher = m.create_domain_matcher('amazonaws.com')
        matched, findings = m.multi_matcher(
            self.content, m.password_matcher, m.ip_matcher, domain_matcher)

        if not matched:
            return Result(tag, NOT_MATCH, comments=annotations)
        return Result(tag, MATCH, matches=findings, comments=annotations)

Example #4

0

Show file

    def check(self):
        """Scan the file at self.path and return a Result with the verdict.

        Outcomes, in the order they are tested:
          1. size on disk exceeds self.max_file_size_bytes -> BIG_FILE
             (the file is not opened)
          2. extension not in self.allowed_extensions -> FILETYPE_NOT_ALLOWED
          3. any matcher fires on the content -> MATCH, with the matches
          4. otherwise -> NOT_MATCH
        """
        #Oversized files are only flagged, never read
        if os.stat(self.path).st_size > self.max_file_size_bytes:
            return Result(self.path, BIG_FILE)

        #Only whitelisted extensions are scanned
        extension = filetype.get_extension(self.path)
        if extension not in self.allowed_extensions:
            return Result(self.path, FILETYPE_NOT_ALLOWED)

        #Small enough and allowed: load the whole file as text
        with open(self.path, 'r') as handle:
            text = handle.read()

        #Strip base64 before matching and note that it happened
        found_base64, text = m.base64_matcher(text, remove=True)
        notes = ['BASE64_REMOVED'] if found_base64 else []

        #Password, IP and amazonaws.com matchers are applied together
        aws_matcher = m.create_domain_matcher('amazonaws.com')
        hit, hits = m.multi_matcher(text, m.password_matcher,
                                    m.ip_matcher, aws_matcher)

        if not hit:
            return Result(self.path, NOT_MATCH, comments=notes)
        return Result(self.path, MATCH, matches=hits, comments=notes)

Example #5

0

Show file

File: FileChecker.py Project: digideskio/repo-scraper

    def check(self):
        """Check the file at self.path and return a Result.

        The returned Result carries:
          * BIG_FILE, when the file's size exceeds self.max_file_size_bytes
            (in which case the file is never opened);
          * FILETYPE_NOT_ALLOWED, when its extension is not in
            self.allowed_extensions;
          * MATCH (with the matches) or NOT_MATCH otherwise.
        """
        #Size gate comes first so large files are never read into memory
        size_in_bytes = os.stat(self.path).st_size
        too_big = size_in_bytes > self.max_file_size_bytes
        if too_big:
            return Result(self.path, BIG_FILE)

        #Extension gate
        file_ext = filetype.get_extension(self.path)
        if file_ext not in self.allowed_extensions:
            return Result(self.path, FILETYPE_NOT_ALLOWED)

        #Both gates passed: read the full content
        with open(self.path, 'r') as source:
            body = source.read()

        #Annotations returned with the Result; only base64 removal is
        #recorded at the moment
        annotations = []
        base64_seen, body = m.base64_matcher(body, remove=True)
        if base64_seen:
            annotations.append('BASE64_REMOVED')

        #Apply the password, IP and amazonaws.com matchers
        domain_matcher = m.create_domain_matcher('amazonaws.com')
        matched, findings = m.multi_matcher(body, m.password_matcher, m.ip_matcher, domain_matcher)

        if not matched:
            return Result(self.path, NOT_MATCH, comments=annotations)
        return Result(self.path, MATCH, matches=findings, comments=annotations)