Example #1
    def _fixup_column_headers(self, filename):
        '''
        If any columns were added to a file in the middle of the job, run this
        after the file is closed to make sure the first-row column names match up.
        This is the only way to ensure the file has the correct information,
        which is unfortunate given the potential expense of this operation.
        '''
        # Create a randomly named temp file in the same place as the outfile
        randomLetters = b64encode(os.urandom(16)).decode('utf-8')
        validChars = string.ascii_letters + string.digits
        randomLetters = ''.join(
            [c for c in randomLetters if c in validChars])
        tmpFileName = "_surveyor_tmp{}_{}".format(randomLetters, filename)
        tempPath = os.path.join(self._outDir, tmpFileName)

        log.msg(
            1,
            "Fixing output headers: {} ==> {}".format(tmpFileName, filename))
        with open(tempPath, 'w', encoding='utf-8') as tempFile:
            # Write new header line
            tempFile.write(
                self._delimiter.join(self._col_create_names_from_keys(
                    filename)) + '\n')
            # Move lines from original file to new, skipping header line
            oldPath = os.path.join(self._outDir, filename)
            with open(oldPath, 'r', encoding='utf-8') as oldFile:
                _header_line = oldFile.readline()
                for line in oldFile:
                    tempFile.write(line)

        shutil.move(tempPath, oldPath)
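
The pattern above (write a corrected copy next to the original, then swap it in with shutil.move) generalizes; creating the temp file in the same directory keeps the final move on one filesystem, so it stays a cheap rename. A minimal standalone sketch of the same rewrite-and-swap, with hypothetical names that are not part of Surveyor:

import os
import shutil
import tempfile

def replace_header(path, newHeader, delimiter=','):
    # Write the new header plus the old body to a temp file in the
    # same directory, then swap it over the original
    fd, tmpPath = tempfile.mkstemp(dir=os.path.dirname(path) or '.')
    with os.fdopen(fd, 'w', encoding='utf-8') as tmpFile:
        tmpFile.write(delimiter.join(newHeader) + '\n')
        with open(path, 'r', encoding='utf-8') as oldFile:
            oldFile.readline()  # discard the stale header
            shutil.copyfileobj(oldFile, tmpFile)
    shutil.move(tmpPath, path)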
Example #2
    def _is_file_survey_dupe(self, filePath, measures):
        '''
        Simple mechanism to identify duplicate and near-duplicate code by
        tracking a dictionary of file measures.  There are two modes:

        1) File Size: Build a dictionary in memory based on a hash of fileName
        and config info. In the hash buckets store a dict of file sizes for
        the first of each size seen that is not within the dupe threshold.
        If a file size is within the threshold of an existing hashed size,
        treat it as a dupe and increment the count for reporting.

        2) NBNC CRC: Use the nbnc.crc measure to identify duplicates.

        Note we ASSUME the necessary file metadata will be present in the
        measures dictionary, as basemodule.py puts it there for the Dupe option.
        '''
        firstDupeFilePath = None

        # 1) File name and Size check
        if isinstance(self._dupeThreshold, int):
            fileSize = int(measures[basemodule.METADATA_FILESIZE])
            dupeKey = (measures[basemodule.METADATA_FULLNAME] +
                        measures[basemodule.METADATA_CONFIG].replace(' ', ''))
            if dupeKey in self._dupeFileSurveys:
                for dupeFileSize, (fileCount, firstFilePath) in self._dupeFileSurveys[dupeKey].items():
                    if abs(fileSize - dupeFileSize) <= self._dupeThreshold:
                        firstDupeFilePath = firstFilePath
                        self._dupeFileSurveys[dupeKey][dupeFileSize] = (fileCount + 1, firstFilePath)
                        log.msg(1, "Dupe {} by {} of {} bytes: {}".format(
                                    fileCount, fileSize - dupeFileSize, fileSize, filePath))
                        break
            else:
                self._dupeFileSurveys[dupeKey] = {}

            if firstDupeFilePath is None:
                self._dupeFileSurveys[dupeKey][fileSize] = (1, filePath)
                log.file(2, "Added {} -- {} to dupe dictionary".format(dupeKey, fileSize))

        # 2) Code CRC check
        # Relying on nbnc.crc is brittle, because it creates both a code and a
        # runtime dependency on the Code csmodule being used. And there are
        # valid scenarios where nbnc.crc may not be present (e.g., skipping a
        # dupe file). Thus if the measure isn't present, we fail silently
        else:
            fileCrc = None
            try:
                fileCrc = measures['nbnc.crc']
            except KeyError:
                log.file(2, "CRC Dupe - nbnc.crc missing: {}".format(filePath))
            if fileCrc in self._dupeFileSurveys:
                fileCount, firstDupeFilePath = self._dupeFileSurveys[fileCrc]
                self._dupeFileSurveys[fileCrc] = (fileCount + 1, firstDupeFilePath)
                log.msg(1, "Dupe {}: {} DUPE_OF {}".format(fileCount, filePath, firstDupeFilePath))
            elif fileCrc is not None:
                self._dupeFileSurveys[fileCrc] = (1, filePath)
                log.file(2, "Added {} -- {} to dupe dictionary".format(filePath, fileCrc))

        return firstDupeFilePath
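
For reference, a sketch of the two shapes self._dupeFileSurveys takes, with made-up values:

# Mode 1 (integer threshold): key is fileName + config, value maps each
# representative file size to (count, firstFilePath)
{'parser.py-Code measure *': {1042: (3, 'src/a/parser.py')}}

# Mode 2 (CRC): key is the nbnc.crc measure, value is (count, firstFilePath)
{'8f3a9c1d': (2, 'src/a/parser.py')}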
Example #3
    def __init__(self, configStack, options, file_measured_callback,
                 status_callback):

        # Options define the life of a job and cannot be modified
        self._options = options

        # All UI output is done through the status callback
        self._status_callback = status_callback

        # Keep track of (and allow access to) raw file metrics
        self.numFolders = 0
        self.numUnfilteredFiles = 0
        self.numFilteredFiles = 0
        self.numFilesToProcess = 0

        # Exceptions that occurred in workers are collected and displayed
        # Unlike errors, exceptions will not generate rows in output
        self.exceptions = []

        # Queues to communicate with Workers, and the output thread
        self._taskQueue = multiprocessing.Queue()
        self._controlQueue = multiprocessing.Queue()
        self._outQueue = multiprocessing.Queue()
        self._outThread = jobout.OutThread(self._outQueue, self._controlQueue,
                                           self._options.profileName,
                                           file_measured_callback)

        # Create max number of workers (they will be started later as needed)
        assert self._options.numWorkers > 0, "Less than 1 worker requested!"
        context = (log.get_context(), self._options.profileName)
        self._workers = self.Workers(self._controlQueue, self._taskQueue,
                                     self._outQueue, context,
                                     self._options.numWorkers)
        log.msg(1, "Created {} workers".format(self._workers.num_max()))

        # Create our object for tracking state of folder walking
        self._pathsToMeasure = options.pathsToMeasure
        self._folderWalker = folderwalk.FolderWalker(
            options.deltaPath, configStack, options.recursive,
            options.includeFolders, options.skipFolders, options.fileFilters,
            options.skipFiles, self.add_folder_files)

        # Utility object for managing work packages; holds the state of the
        # work package that is being prepared for sending to queue
        self._workPackage = self.WorkPackage()

        # Other processing state
        self._continueProcessing = True
        self._taskPackagesSent = 0
        self._filesSinceLastSend = 0
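
The topology being wired up here is a standard fan-out/fan-in: one task queue feeding worker processes, one output queue drained by a single thread, and a control queue for signaling. A minimal sketch of that shape (illustrative names, not Surveyor's classes):

import multiprocessing
import threading

def worker(taskQueue, outQueue):
    # Pull work items until a None sentinel arrives
    for item in iter(taskQueue.get, None):
        outQueue.put(item * 2)  # stand-in for measuring a file

def drain(outQueue, expected):
    # A single consumer keeps output handling in one place
    for _ in range(expected):
        print(outQueue.get())

if __name__ == '__main__':
    taskQueue = multiprocessing.Queue()
    outQueue = multiprocessing.Queue()
    workers = [multiprocessing.Process(target=worker,
                                       args=(taskQueue, outQueue))
               for _ in range(2)]
    for w in workers:
        w.start()
    outThread = threading.Thread(target=drain, args=(outQueue, 4))
    outThread.start()
    for task in (1, 2, 3, 4):
        taskQueue.put(task)
    for _ in workers:
        taskQueue.put(None)  # one sentinel per worker
    outThread.join()
    for w in workers:
        w.join()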
Example #4
 def _validate_line(self, configEntry):
     '''
     Is the module being asked to do what it was designed to do?
     '''
     measureOk = configEntry.module.can_do_measure(
         configEntry.measureFilters)
     verbOk = configEntry.module.can_do_verb(configEntry.verb)
     if not (measureOk and verbOk):
         log.msg(
             1, "Failed module validate measureOk/verbOk: {}/{}".format(
                 measureOk, verbOk))
         raise utils.ConfigError(
             uistrings.STR_ErrorConfigInvalidMeasure.format(
                 configEntry.verb, configEntry.measureFilter))
Example #5
 def read_file(self, filePath):
     '''
     Read a Surveyor configuration file and return a list of ConfigEntrys
     to store on the configuration stack with this folder location.
     '''
     try:
         log.msg(1, "Config file: {}".format(filePath))
         configEntries = self._read_file(filePath, [])
         self._validate_file(configEntries)
         log.config(2, "Finished reading config file: {}".format(filePath))
         log.config(3, configEntries)
         return configEntries
     except Exception as e:
         raise utils.ConfigError(
             uistrings.STR_ErrorConfigFile.format(filePath, str(e)))
Example #6
def _open_file(filePath, forceAll):
    """
    Manage opening the file with the correct encoding, based on any errors
    decoding the default UTF-8 and on inspection of the file start.
    This isn't foolproof - files that switch encodings farther in may blow
    up later when decoded, but that is rare.
    """

    # Use buffering to reduce the cost of open on larger files
    fileObj = open(filePath, 'r', buffering=FILE_BUFFERING, encoding='utf_8')

    # Grab the first bytes of the file
    start = None
    try:
        try:
            start = _get_file_start(fileObj, FILE_START_UTF8_CHECK)

        except UnicodeDecodeError as e:
            fileObj.close()
            log.file(1, "UTF-8 error, using binary: {}".format(filePath))
            fileObj = open(filePath, 'rb', buffering=FILE_BUFFERING)
            start = _get_file_start(fileObj, FILE_START_CHECK)

    except Exception as e2:
        log.msg(1, "Cannot open and read {}: {}".format(filePath, e2))
        fileObj.close()

    # Do tests that look at start of the file
    if start:
        keepFileOpen = forceAll
        if not forceAll:
            if _is_noncode_file(start):
                log.file(1, "Skipping, non-code start: {}".format(filePath))
            elif not filetype.is_text_file(start):
                log.file(1, "Skipping, binary char: {}".format(filePath))
            else:
                keepFileOpen = True
        if not keepFileOpen:
            fileObj.close()
            fileObj = None
    else:
        if fileObj:
            fileObj.close()
        fileObj = None

    return fileObj
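
_get_file_start is not shown here; a plausible sketch (an assumption about its behavior, not the actual Surveyor helper) reads a fixed-size chunk and rewinds, so a UnicodeDecodeError surfaces at this call when the file is not valid UTF-8:

def _get_file_start(fileObj, numChars):
    # Hypothetical helper: read the first chunk for sniffing, then
    # rewind so later measurement sees the whole file
    start = fileObj.read(numChars)
    fileObj.seek(0)
    return start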
Example #7
    def run(self):
        log.cc(1, "STARTING: Beginning to process output queue...")
        try:
            if self._profileName is not None:
                import cProfile
                cProfile.runctx('self._run()', globals(), {'self': self},
                                self._profileName + self.name)
            else:
                self._run()
            log.cc(1, "FINISHED processing output queue")

        except KeyboardInterrupt:
            log.cc(1, "Ctrl-c occurred in OUTPUT THREAD")
            _thread.interrupt_main()
        except Exception as e:
            log.msg(1, "EXCEPTION processing output queue: " + str(e))
            log.stack()
            self._controlQueue.put_nowait(('JOB', 'EXCEPTION', e))
        finally:
            log.cc(1, "TERMINATING")
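
The cProfile.runctx call is the standard way to profile a bound method from inside its own object: the statement is evaluated with self supplied through the locals dict, and the stats land in a per-thread file. The dump can be read back with pstats (the file name below is illustrative):

import pstats

# Load the stats file runctx wrote and show the 20 hottest calls
stats = pstats.Stats('surveyorProfile' + 'OutThread-1')
stats.sort_stats('cumulative').print_stats(20)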
Example #8
 def _write_aggregates(self):
     '''
     For each set of aggregates, create an output file with aggregates
     that exceed the threshold.
     HACK - use the output writer by creating a dummy OUT file tag
     '''
     for keyName in list(self._aggregateNames.keys()):
         fileName = str(keyName).replace('.', '')
         hackOutTagMeasure = {'tag_write_aggregates': 'OUT:' + fileName}
         analysisRows = []
         for valueRow in list(self._aggregates[keyName].values()):
             writeRow = self._aggregateThresholdKey is None
             if not writeRow:
                 try:
                     writeRow = valueRow[self._aggregateThresholdKey] > self._aggregateThreshold
                 except KeyError as e:
                     raise utils.InputException(STR_AggregateThresholdKeyError.format(str(e)))
             if writeRow:
                 analysisRows.append(valueRow)
         log.msg(1, "Aggregate: {}".format(analysisRows))
         self._writer.write_items(hackOutTagMeasure, analysisRows)
Example #9
    def _put_files_in_queue(self, path, deltaPath, filesAndConfigs):
        '''
        Package files from the path into workItems that are grouped
        into workPackages and placed into the task queue for jobworkers.
        Packages are broken up if the number of files or their total size
        exceeds thresholds, to help evenly distribute load across cores.
        '''
        if not filesAndConfigs:
            return

        for fileName, configEntrys in filesAndConfigs:

            # Expensive to check file size here, but worth it for parcelling
            # widely varying file sizes out to cores for CPU-intensive jobs.
            # Profiling shows it is not worth caching this
            try:
                fileSize = utils.get_file_size(os.path.join(path, fileName))
            except Exception as e:
                # It is possible (at least in Windows) for a fileName to exist
                # in the file system but be invalid for Windows calls. This is
                # the first place the file is accessed through the file system;
                # if it blows up we don't want the job to fall apart, and this
                # is an unusual case, so don't bother with a pathway back to
                # the main application; just swallow it and provide debug output
                log.msg(1, str(e))
                log.stack()
                continue

            log.cc(3, "WorkItem: {}, {}".format(fileSize, fileName))
            self.numFilesToProcess += 1
            workItem = (path, deltaPath, fileName, configEntrys, self._options,
                        len(filesAndConfigs))
            self._workPackage.add(workItem, fileSize)

            if self._workPackage.ready_to_send() or (self._filesSinceLastSend >
                                                     MAX_FILES_BEFORE_SEND):
                self._send_current_package()

            if not self._check_command():
                break
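
WorkPackage itself is not shown; a minimal sketch of the accumulate-until-threshold behavior this loop relies on (the thresholds and names here are assumptions, not Surveyor's values):

class WorkPackage(object):
    MAX_ITEMS = 50        # hypothetical thresholds
    MAX_BYTES = 1 << 20

    def __init__(self):
        self._items = []
        self._totalSize = 0

    def add(self, workItem, fileSize):
        self._items.append(workItem)
        self._totalSize += fileSize

    def ready_to_send(self):
        # Send when either the item count or the byte total gets large,
        # so a run of big files doesn't pile onto one core
        return (len(self._items) >= self.MAX_ITEMS or
                self._totalSize >= self.MAX_BYTES)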
Example #10
    def __init__(self,
                 configFileName,
                 configOverrides,
                 defaultConfigOptions=[]):
        log.config(2, "Creating ConfigStack with {}".format(configFileName))
        self._modules = CodeSurveyorModules()
        self._reader = configreader.ConfigReader(self.load_csmodule)
        self._measureRootDir = ''

        # Stack of config files, represented as paths and lists of ConfigEntrys
        self._configStack = []

        # Cache of config file information
        # Key is path name, value is a list of entries that represent the config file
        self._configFileCache = {}

        # List of default config option tags passed by the application
        self._defaultConfigOptions = defaultConfigOptions

        # Either use overrides or try to read config files
        if configOverrides:
            log.msg(1, "Ignoring config files: {}".format(configOverrides))
            self._configName = ''
            self._setup_config_overrides(configOverrides)

        else:
            self._configName = configFileName
            # Make sure the config file name does not include a path, as the point is
            # to look for a config file in each folder we visit
            if os.path.dirname(self._configName):
                raise utils.ConfigError(
                    uistrings.STR_ErrorConfigFileNameHasPath)
            # Load the default config file to use for this job
            # First try in the root of the job folder; then in the surveyor folder
            if not self._push_file(runtime_dir()):
                if not self._push_file(surveyor_dir()):
                    log.msg(
                        1, "{} not present in default locations".format(
                            self._configName))
Example #11
    def _validate_entries(self, configEntries):
        '''
        Are all config file entries consistent with each other, to avoid silent
        double counting? Raises a ConfigError if not.
        '''
        log.config(2, "Checking for duplicate config entries")

        # Create list of all possible measure/file combos
        # Ask the module to match each measure, to catch wildcard overlap
        fileFilters = []
        possibleMeasures = []
        for entry in configEntries:
            for fileFilter in entry.fileFilters:
                fileFilters.append(fileFilter)
                possibleMeasures.append(
                    (fileFilter, entry.measureFilter, entry.moduleName,
                     entry.verb, entry.tags, entry.paramsRaw))
        log.config(4, fileFilters)
        log.config(4, possibleMeasures)

        # Check that no file type would have a measure be double counted
        # If a problem, throw an exception based on the first problem item
        if len(fileFilters) > len(set(fileFilters)):
            while possibleMeasures:
                possibleMeasureTuple = possibleMeasures.pop()
                log.config(2,
                           "possibleMeasure: {}".format(possibleMeasureTuple))
                (fileFilter, measureFilter, modName, verb, tags,
                 extraParams) = possibleMeasureTuple

                # Don't attempt to do conflict resolution on regex file
                # extensions, because it doesn't make sense
                if fileFilter.startswith(fileext.CUSTOM_FILE_REGEX):
                    continue

                # Shallow warning check for double counting by creating a list
                # of entries based on matching verb and file type
                warningList = [
                    (ff, mf, mn, v, t, ep)
                    for ff, mf, mn, v, t, ep in possibleMeasures
                    if v == verb and fileext.file_ext_match(ff, fileFilter)
                ]
                if warningList:
                    log.config(
                        1, "WARNING - Possible double-count: {}".format(
                            str(warningList)))

                    # For the deep check look at tag values and measure filter
                    dupeList = [
                        (v, modName, mn, mf, fileFilter, ff, t, tags, ep,
                         extraParams) for ff, mf, mn, v, t, ep in warningList
                        if len(t) == len(tags)
                        and len(t) == len(set(t) & set(tags))
                        and entry.module.match_measure(mf, measureFilter)
                    ]
                    if dupeList:
                        log.msg(
                            1,
                            "ERROR - Double-count: {}".format(str(dupeList)))
                        dupe = dupeList[0]
                        raise utils.ConfigError(
                            uistrings.STR_ErrorConfigDupeMeasures.format(
                                dupe[0], dupe[1], dupe[2], dupe[3], dupe[4],
                                dupe[5], dupe[6], dupe[7], dupe[8], dupe[9]))
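
fileext.file_ext_match is Surveyor's own helper; the overlap test it performs can be approximated with the standard fnmatch module (a rough sketch, since matching one wildcard pattern against another only approximates true overlap):

import fnmatch

def filters_overlap(filterA, filterB):
    # Two wildcard file filters overlap if either pattern accepts the
    # other; e.g. '*.py' and '*.p*' would double count the same files
    return (fnmatch.fnmatch(filterA, filterB) or
            fnmatch.fnmatch(filterB, filterA))

assert filters_overlap('*.py', '*.p*')
assert not filters_overlap('*.py', '*.c')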
Example #12
    def parse_args(self):
        '''
        Do simple command line parsing and set the internal state of our
        Surveyor class based on the arguments.
        For any syntax we don't recognize, or if help is requested, return
        help text. Otherwise return None, which indicates success.
        '''
        try:
            while not self.args.finished():
                self.args.move_next()

                # Disambiguation case for measurePath/fileFilter
                # A '-' may be used to replace optional arg with path/filter
                if self.args.is_cmd() and len(self.args.get_current()) == 1:
                    if self.args.is_param_next():
                        self.args.move_next()
                        self._parse_measurement_path()
                    continue

                # Assume non-Arg is a measurePath/fileFilter definition
                elif not self.args.is_cmd():
                    self._parse_measurement_path()
                    continue

                # Our processing is based on matching the first character
                # after the option prefix
                fc = self.args.get_current()[1].lower()

                # Debug and profiling support
                if fc in CMDARG_DEBUG:
                    self._parse_debug_options()
                    log.msg(2, "Args: {}".format(str(self.args)))
                elif fc in CMDARG_PROFILE:
                    self._app._profiling = True
                    self._app._profileCalls = self._get_next_int(optional=True, default=self._app._profileCalls)
                    self._app._profileCalledBy = self._get_next_int(optional=True, default=self._app._profileCalledBy)
                    self._app._profileCalled = self._get_next_int(optional=True, default=self._app._profileCalled)
                    self._app._profileThreadFilter = self._get_next_str(optional=True, default=self._app._profileThreadFilter)
                    self._app._profileNameFilter = self._get_next_str(optional=True, default=self._app._profileNameFilter)

                # Config file settings
                elif fc in CMDARG_CONFIG_CUSTOM:
                    self._parse_config_options()

                # Delta path
                elif fc in CMDARG_DELTA:
                    self._parse_delta_options()

                # Duplicate processing
                # Can have an optional integer or string after this option
                elif fc in CMDARG_DUPE_PROCESSING:
                    self._app._dupeTracking = True
                    self._metaDataOptions['DUPE'] = None
                    dupeParam = self._get_next_param(optional=True)
                    try:
                        dupeParam = int(dupeParam)
                    except (TypeError, ValueError):
                        pass
                    self._app._dupeThreshold = dupeParam

                # Scan and skip options
                elif fc in CMDARG_SCAN_ALL:
                    self._parse_scan_options()
                elif fc in CMDARG_SKIP:
                    self._parse_skip_options()
                elif fc in CMDARG_INCLUDE_ONLY:
                    self._app._jobOpt.includeFolders.extend(self._get_next_param().split(CMDLINE_SEPARATOR))

                # Output
                elif fc in CMDARG_METADATA:
                    self._parse_metadata_options()
                elif fc in CMDARG_OUTPUT_FILTER:
                    self._measureFilter = self._get_next_str()
                elif fc in CMDARG_OUTPUT_TYPE:
                    self._app._outType = self._get_next_str()
                elif fc in CMDARG_OUTPUT_FILE:
                    self._parse_output_file()
                elif fc in CMDARG_SUMMARY_ONLY:
                    self._app._summaryOnly = True
                elif fc in CMDARG_DETAILED:
                    self._app._detailed = True
                    self._app._detailedPrintSummaryMax = self._get_next_int(
                        optional=True, default=self._app._detailedPrintSummaryMax)
                elif fc in CMDARG_PROGRESS:
                    self._app._progress = True
                    self._app._printMaxWidth = self._get_next_int(
                        optional=True, default=self._app._printMaxWidth)
                elif fc in CMDARG_QUIET:
                    self._app._quiet = True

                # Other options
                elif fc in CMDARG_NUM_WORKERS:
                    self._app._jobOpt.numWorkers = self._get_next_int(validRange=range(1, MAX_WORKERS))
                elif fc in CMDARG_RECURSION:
                    self._app._jobOpt.recursive = False
                elif fc in CMDARG_BREAK_ERROR:
                    self._app._jobOpt.breakOnError = True
                elif fc in CMDARG_AGGREGATES:
                    self._parse_aggregate_options()

                # Help/invalid parameter request
                else:
                    return self._parse_help_options()

            # Setup the default measurement path if not provided
            if not self._app._jobOpt.pathsToMeasure:
                self._app._jobOpt.pathsToMeasure.append(utils.CURRENT_FOLDER)

            # Setup the default config name if not provided
            if not self.configOverrides and self.configCustom is None:
                self.configCustom = CONFIG_FILE_DEFAULT_NAME


        except Args.ArgsFinishedException as e:
            raise utils.InputException(STR_ErrorParsingEnd.format(str(e)))
        else:
            log.config(4, vars(self._app))
Example #13
 def move_next(self):
     if self.finished():
         raise self.ArgsFinishedException(self.get_current())
     self.argPos += 1
     log.msg(1, "Arg: {}".format(str(self.argList[self.argPos])))
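
For context, here is a minimal sketch of the Args wrapper this method belongs to, reconstructed from how parse_args uses it (hypothetical; the real class is not shown):

class Args(object):
    class ArgsFinishedException(Exception):
        pass

    def __init__(self, argList):
        self.argList = argList
        self.argPos = -1

    def finished(self):
        return self.argPos >= len(self.argList) - 1

    def get_current(self):
        return self.argList[self.argPos]

    def is_cmd(self):
        # Options are introduced with a dash prefix
        return self.get_current().startswith('-')

    def is_param_next(self):
        # True when the next arg exists and is not another option
        return (not self.finished() and
                not self.argList[self.argPos + 1].startswith('-'))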