Example #1
0
    def _fixup_column_headers(self, filename):
        '''
        Rewrite the first row of a closed output file so the header names
        match the final set of columns.

        If any columns were added to a file in the middle of the job, this is
        run after the file is closed to make sure that the first row has the
        right name header for each column.
        This is the only way to ensure the file has the correct information;
        which is unfortunate due to the potential expense of this operation.

        We create our own random file temp file, to ensure it is created
        in the same place as the outfile
        '''
        # Hex-encode the random bytes so the suffix is always 16 chars.
        # (The previous approach filtered raw urandom bytes for alphanumerics,
        # which could yield a very short -- even empty -- suffix and risk
        # temp-name collisions.)
        randomLetters = ''.join(
            '{0:02x}'.format(ord(char)) for char in os.urandom(8))
        tmpFileName = "_surveyor_tmp{0}_{1}".format(randomLetters, filename)
        tempPath = os.path.join(self._outDir, tmpFileName)
        trace.msg(
            1,
            "Fixing output headers: {0} ==> {1}".format(tmpFileName, filename))

        oldPath = os.path.join(self._outDir, filename)
        rowList = self._col_create_names_from_keys(filename)

        # Context managers guarantee both files are closed even if a write
        # fails part-way through the copy
        with open(tempPath, 'wb') as tempFile:
            self._write_row(tempFile, rowList)
            with open(oldPath, 'rb') as oldFile:
                # Skip the stale header row, then copy the rest verbatim
                _ = oldFile.readline()
                for line in oldFile:
                    tempFile.write(line)
        shutil.move(tempPath, oldPath)
Example #2
0
    def __init__(self, configFileName, configOverrides, defaultConfigOptions=None):
        '''
        Set up the config stack and load the initial configuration.

        configFileName -- name (no path) of the config file to look for
        configOverrides -- if truthy, config files are ignored entirely
        defaultConfigOptions -- optional list of default config option tags
        '''
        trace.config(2, "Creating ConfigStack with {0}".format(configFileName))
        self._modules = CodeSurveyorModules()
        self._reader = configreader.ConfigReader(self.load_csmodule)
        self._measureRootDir = ''

        # Stack of config files, represented as paths and lists of ConfigEntrys
        self._configStack = []

        # Cache of config file information
        # Key is path name, value is list entries that represent the config file
        self._configFileCache = {}

        # List of default config option tags passed by the application
        # (default is None rather than [] so a single mutable list is not
        # shared across every instance constructed with the default)
        self._defaultConfigOptions = defaultConfigOptions if defaultConfigOptions is not None else []

        # We either use overrides or try to read config files
        if configOverrides:
            trace.msg(1, "Ignoring config files: {0}".format(configOverrides))
            self._configName = ''
            self._setup_config_overrides(configOverrides)

        else:
            self._configName = configFileName
            # Make sure the config file name does not include a path, as the point is
            # to look for a config file in each folder we visit
            if os.path.dirname(self._configName) != '':
                raise utils.ConfigError(uistrings.STR_ErrorConfigFileNameHasPath)
            # Load the default config file to use for this job
            # First try in the root of the job folder; then in the surveyor folder
            if not self._push_file(utils.runtime_dir()):
                if not self._push_file(utils.surveyor_dir()):
                    trace.msg(1, "{0} not present in default locations".format(
                            self._configName))
Example #3
0
    def _fixup_column_headers(self, filename):
        '''
        Rewrite the first row of a closed output file so the header names
        match the final set of columns.

        If any columns were added to a file in the middle of the job, this is
        run after the file is closed to make sure that the first row has the
        right name header for each column.
        This is the only way to ensure the file has the correct information;
        which is unfortunate due to the potential expense of this operation.

        We create our own random file temp file, to ensure it is created
        in the same place as the outfile
        '''
        # Hex-encode the random bytes so the suffix is always 16 chars.
        # (Filtering raw urandom bytes for alphanumerics could yield a very
        # short -- even empty -- suffix, risking temp-name collisions.)
        randomLetters = ''.join(
            '{0:02x}'.format(ord(char)) for char in os.urandom(8))
        tmpFileName = "_surveyor_tmp{0}_{1}".format(randomLetters, filename)
        tempPath = os.path.join(self._outDir, tmpFileName)
        trace.msg(1, "Fixing output headers: {0} ==> {1}".format(tmpFileName, filename))

        oldPath = os.path.join(self._outDir, filename)
        rowList = self._col_create_names_from_keys(filename)

        # Context managers guarantee both files are closed even if a write
        # fails part-way through the copy
        with open(tempPath, 'wb') as tempFile:
            self._write_row(tempFile, rowList)
            with open(oldPath, 'rb') as oldFile:
                # Skip the stale header row, then copy the rest verbatim
                _ = oldFile.readline()
                for line in oldFile:
                    tempFile.write(line)
        shutil.move(tempPath, oldPath)
Example #4
0
    def _is_file_survey_dupe(self, filePath, measures):
        '''
        Simple mechanism to identify duplicate and near-duplicate code by tracking
        a dictionary of files we see as measures.  There are two modes:

        1) File Size: Build a dictionary in memory based on a hash of fileName
        and config info. In the hash buckets we store a dict of file sizes for
        the first of each size we see that is not within the dupe threshold.
        If we see a file size within the threshold of one of our existing
        hashed sizes, we treat it as a dupe and increment count for reporting.

        2) NBNC CRC: We use the nbnc.crc measure to identify duplicates

        Note that we ASSUME the necessary file metadata will be present in the
        measures dictionary, as basemodule.py puts it there for the Dupe option.

        Returns the path of the first file this one duplicates, or None.
        '''
        firstDupeFilePath = None

        # 1) File name and Size check
        if isinstance(self._dupeThreshold, int):
            fileSize = int(measures[basemodule.METADATA_FILESIZE])
            dupeKey = (measures[basemodule.METADATA_FULLNAME] +
                        measures[basemodule.METADATA_CONFIG].replace(' ', ''))
            if dupeKey in self._dupeFileSurveys:
                for dupeFileSize, (fileCount, firstFilePath) in self._dupeFileSurveys[dupeKey].iteritems():
                    # A size within the threshold window counts as a dupe
                    if abs(fileSize - dupeFileSize) <= self._dupeThreshold:
                        firstDupeFilePath = firstFilePath
                        self._dupeFileSurveys[dupeKey][dupeFileSize] = (fileCount + 1, firstFilePath)
                        trace.msg(1, "Dupe {0} by {1} of {2} bytes: {3}".format(
                                    fileCount, fileSize - dupeFileSize, fileSize, filePath))
                        break
            else:
                self._dupeFileSurveys[dupeKey] = {}

            if firstDupeFilePath is None:
                self._dupeFileSurveys[dupeKey][fileSize] = (1, filePath)
                trace.file(2, "Added {0} -- {1} to dupe dictionary".format(dupeKey, fileSize))

        # 2) Code CRC check
        # Our relying on the nbnc.crc is brittle, because it is both a code and runtime
        # dependency on the Code csmodule being used. And there are valid scenarios
        # where nbnc.crc may not be present (e.g., skipping dupe file). Thus if the
        # measure isn't present, we fail silently
        else:
            fileCrc = None
            try:
                fileCrc = measures['nbnc.crc']
            except KeyError:
                # Only the missing-measure case is expected; any other error
                # should surface rather than be silently swallowed
                trace.file(2, "CRC Dupe - nbnc.crc missing: {0}".format(filePath))
            if fileCrc in self._dupeFileSurveys:
                fileCount, firstDupeFilePath = self._dupeFileSurveys[fileCrc]
                self._dupeFileSurveys[fileCrc] = (fileCount + 1, firstDupeFilePath)
                trace.msg(1, "Dupe {0}: {1} DUPE_OF {2}".format(fileCount, filePath, firstDupeFilePath))
            elif fileCrc is not None:
                self._dupeFileSurveys[fileCrc] = (1, filePath)
                trace.file(2, "Added {0} -- {1} to dupe dictionary".format(filePath, fileCrc))

        return firstDupeFilePath
Example #5
0
    def _validate_entries(self, configEntries):
        '''
        Are all config file entries consistent with each other, to avoid silent
        double counting? Throws an error exception if not.
        '''
        trace.config(2, "Checking for duplicate config entries")

        # Create list of all possible measure/file combos
        # We ask the module to match each measure, to catch wildcard overlap
        fileFilters = []
        possibleMeasures = []
        for entry in configEntries:
            for fileFilter in entry.fileFilters:
                fileFilters.append(fileFilter)
                possibleMeasures.append((fileFilter, entry.measureFilter,
                        entry.moduleName, entry.verb, entry.tags, entry.paramsRaw))
        trace.config(4, fileFilters)
        trace.config(4, possibleMeasures)

        # Check that no file type would have a measure be double counted
        # If we have a problem, throw an exception based on the first problem item
        # (a repeated raw file filter is a cheap necessary condition; only then
        # do we pay for the pairwise overlap checks below)
        if len(fileFilters) > len(set(fileFilters)):
            while possibleMeasures:
                possibleMeasureTuple = possibleMeasures.pop()
                trace.config(2, "possibleMeasure: {0}".format(possibleMeasureTuple))
                (fileFilter, measureFilter, modName, verb, tags, extraParams) = possibleMeasureTuple

                # We don't attempt to do conflict resolution on regex file
                # extensions, because overlap matching doesn't make sense there
                if fileFilter.startswith(fileext.CUSTOM_FILE_REGEX):
                    continue

                # Shallow warning check for double counting by creating a list of entries
                # based on matching verb and file type
                warningList = [
                        (ff, mf, mn, v, t, ep)
                            for ff, mf, mn, v, t, ep in possibleMeasures if
                                v == verb and
                                fileext.file_ext_match(ff, fileFilter) ]
                if warningList:
                    trace.config(1, "WARNING - Possible double-count: {0}".format(str(warningList)))

                    # For the deep check look at tag values and measure filter
                    # NOTE(review): 'entry' below is the leftover loop variable
                    # from the first loop (the LAST config entry), not the entry
                    # that produced measureFilter -- confirm match_measure does
                    # not depend on which module instance it is called on
                    dupeList = [
                            (v, modName, mn, mf, fileFilter, ff, t, tags, ep, extraParams)
                                for ff, mf, mn, v, t, ep in warningList if
                                    len(t) == len(tags) and
                                    len(t) == len(set(t) & set(tags)) and
                                    entry.module.match_measure(mf, measureFilter) ]
                    if dupeList:
                        trace.msg(1, "ERROR - Double-count: {0}".format(str(dupeList)))
                        dupe = dupeList[0]
                        raise utils.ConfigError(uistrings.STR_ErrorConfigDupeMeasures.format(
                            dupe[0],
                            dupe[1], dupe[2],
                            dupe[3],
                            dupe[4], dupe[5],
                            dupe[6], dupe[7],
                            dupe[8], dupe[9]))
Example #6
0
 def _validate_line(self, configEntry):
     '''
     Confirm the module referenced by this config entry supports the
     requested measures and verb; raise ConfigError if it does not.
     NOTE(review): the capability check reads configEntry.measureFilters
     while the error message reads configEntry.measureFilter -- confirm
     both attributes exist on the entry type.
     '''
     module = configEntry.module
     canMeasure = module.can_do_measure(configEntry.measureFilters)
     canVerb = module.can_do_verb(configEntry.verb)
     if canMeasure and canVerb:
         return
     trace.msg(1, "Failed module validate measureOk/verbOk: {0}/{1}".format(canMeasure, canVerb))
     raise utils.ConfigError(uistrings.STR_ErrorConfigInvalidMeasure.format(
             configEntry.verb, configEntry.measureFilter))
Example #7
0
    def __init__(self, configStack, options,
                    file_measured_callback, status_callback):
        '''
        Wire up everything a job needs: communication queues, the output
        thread, the worker pool, and the folder walker.
        Per-file results flow through file_measured_callback; all UI output
        goes through status_callback.
        '''
        # Options define the life of a job and are never modified afterwards
        self._options = options
        self._status_callback = status_callback

        # Raw file-metric counters, readable by callers
        self.numFolders = 0
        self.numFoldersMeasured = 0
        self.numUnfilteredFiles = 0
        self.numFilteredFiles = 0
        self.numFilesToProcess = 0

        # Queues for talking to workers, plus the dedicated output thread
        self._taskQueue = multiprocessing.Queue()
        self._controlQueue = multiprocessing.Queue()
        self._outQueue = multiprocessing.Queue()
        self._outThread = jobout.OutThread(
                self._outQueue, self._controlQueue,
                self._options.profileName, file_measured_callback)

        # Create max number of workers (they will be started later as needed)
        assert self._options.numWorkers > 0, "Less than 1 worker requested!"
        workerContext = (trace.get_context(), self._options.profileName)
        self._workers = self.Workers(
                self._controlQueue, self._taskQueue, self._outQueue,
                workerContext, self._options.numWorkers)
        trace.msg(1, "Created {0} workers".format(self._workers.num_max()))

        # State for walking the folder tree rooted at the measure paths
        self._pathsToMeasure = options.pathsToMeasure
        self._folderWalker = folderwalk.FolderWalker(
                options.deltaPath,
                configStack,
                options.recursive,
                options.includeFolders,
                options.skipFolders,
                options.fileFilters,
                options.skipFiles,
                self.add_folder_files)

        # Holds the state of the work package currently being assembled
        # before it is sent to the task queue
        self._workPackage = self.WorkPackage()

        # Remaining processing state
        self._continueProcessing = True
        self._taskPackagesSent = 0
        self._filesSinceLastSend = 0
Example #8
0
 def read_file(self, filePath):
     '''
     Read a Surveyor configuration file and return a list of ConfigEntrys
     to be stored on the configuration stack with this folder location.
     '''
     try:
         trace.msg(1, "Config file: {0}".format(filePath))
         configEntries = self._read_file(filePath, [])
         self._validate_file(configEntries)
         trace.config(2, "Finsihed reading config file: {0}".format(filePath))
         trace.config(3, configEntries)
         return configEntries
     except Exception, e:
         raise utils.ConfigError(uistrings.STR_ErrorConfigFile.format(filePath, str(e)))
Example #9
0
    def _put_files_in_queue(self, path, deltaPath, filesAndConfigs):
        '''
        Package files from the given folder into workItems that are then grouped
        into workPackages that are placed into the task queue for jobworkers.
        Packages are broken up if files number or total size exceeds
        thresholds to help evenly distribute load across cores
        '''
        if not filesAndConfigs:
            return
        self.numFoldersMeasured += 1

        for fileName, configEntrys in filesAndConfigs:

            # Expensive to check file size here, but it is worth it for
            # parcelling widely varying file sizes out to cores for CPU intensive
            # jobs. Profiling shows it is not worth caching this
            try:
                fileSize = utils.get_file_size(os.path.join(path, fileName))
            except Exception, e:
                # It is possible (at least in Windows) for a fileName to exist
                # in the file system but be invalid for Windows calls. This is
                # the first place we try to access the file through the file
                # system; if it blows up we don't want the job to fall apart,
                # and this is an unusual case, so unlike more likely errors,
                # we don't bother with a pathway back to the main application
                # to handle the error; we just swallow it and provide debug
                trace.msg(1, str(e))
                continue

            trace.cc(3, "WorkItem: {0}, {1}".format(fileSize, fileName))
            self.numFilesToProcess += 1
            # A workItem carries everything a worker needs to measure one file
            workItem = (path,
                        deltaPath,
                        fileName,
                        configEntrys,
                        self._options,
                        len(filesAndConfigs))
            self._workPackage.add(workItem, fileSize)

            # Flush the package when it reaches its send threshold, or when
            # too many files have accumulated since the last send
            if self._workPackage.ready_to_send() or (
                    self._filesSinceLastSend > MAX_FILES_BEFORE_SEND):
                self._send_current_package()

            # Bail out early if a control command (e.g., cancel) arrived
            if not self._check_command():
                break
Example #10
0
 def _write_aggregates(self):
     '''
     For each set of aggregates, we create an output file with aggregates
     that exceed threshold.
     HACK - We use the output writer by creating a dummy OUT file tag
     '''
     for keyName in self._aggregateNames.keys():
         fileName = str(keyName).replace('.', '')
         hackOutTagMeasure = {'tag_write_aggregates': 'OUT:' + fileName}
         analysisRows = []
         for valueRow in self._aggregates[keyName].values():
             writeRow = self._aggregateThresholdKey is None
             if not writeRow:
                 try:
                     writeRow = valueRow[self._aggregateThresholdKey] > self._aggregateThreshold
                 except KeyError, e:
                     raise utils.InputException(STR_AggregateThresholdKeyError.format(str(e)))
             if writeRow:
                 analysisRows.append(valueRow)
         trace.msg(1, "Aggregate: {0}".format(analysisRows))
         self._writer.write_items(hackOutTagMeasure, analysisRows)
Example #11
0
    def __init__(self,
                 configFileName,
                 configOverrides,
                 defaultConfigOptions=None):
        '''
        Set up the config stack and load the initial configuration.

        configFileName -- name (no path) of the config file to look for
        configOverrides -- if truthy, config files are ignored entirely
        defaultConfigOptions -- optional list of default config option tags
        '''
        trace.config(2, "Creating ConfigStack with {0}".format(configFileName))
        self._modules = CodeSurveyorModules()
        self._reader = configreader.ConfigReader(self.load_csmodule)
        self._measureRootDir = ''

        # Stack of config files, represented as paths and lists of ConfigEntrys
        self._configStack = []

        # Cache of config file information
        # Key is path name, value is list entries that represent the config file
        self._configFileCache = {}

        # List of default config option tags passed by the application
        # (default is None rather than [] so a single mutable list is not
        # shared across every instance constructed with the default)
        self._defaultConfigOptions = defaultConfigOptions if defaultConfigOptions is not None else []

        # We either use overrides or try to read config files
        if configOverrides:
            trace.msg(1, "Ignoring config files: {0}".format(configOverrides))
            self._configName = ''
            self._setup_config_overrides(configOverrides)

        else:
            self._configName = configFileName
            # Make sure the config file name does not include a path, as the point is
            # to look for a config file in each folder we visit
            if os.path.dirname(self._configName) != '':
                raise utils.ConfigError(
                    uistrings.STR_ErrorConfigFileNameHasPath)
            # Load the default config file to use for this job
            # First try in the root of the job folder; then in the surveyor folder
            if not self._push_file(utils.runtime_dir()):
                if not self._push_file(utils.surveyor_dir()):
                    trace.msg(
                        1, "{0} not present in default locations".format(
                            self._configName))
Example #12
0
    def parse_args(self):
        '''
        Do simple command line parsing and set the internal state of our
        Surveyor class based on the arguments.
        If we encounter any syntax we don't recognize or help is requested we
        bail and return help text.
        Otherwise we return None which indicates success.
        '''
        try:
            while not self.args.finished():
                self.args.move_next()

                # Disambiguation case for measurePath/fileFilter
                # A '-' may be used to replace optional arg with path/filter
                if self.args.is_cmd() and len(self.args.get_current()) == 1:
                    if self.args.is_param_next():
                        self.args.move_next()
                        self._parse_measurement_path()
                    continue

                # Assume non-Arg is a measurePath/fileFilter definition
                elif not self.args.is_cmd():
                    self._parse_measurement_path()
                    continue

                # Our processing is based on matching first character
                # ([1] skips the leading option char -- presumably '-';
                # NOTE(review): confirm against is_cmd implementation)
                fc = self.args.get_current()[1].lower()

                # Debug and profiling support
                if fc in CMDARG_DEBUG:
                    self._parse_debug_options()
                    trace.msg(2, "Args: {0}".format(str(self.args)))
                elif fc in CMDARG_PROFILE:
                    self._app._profiling = True
                    self._app._profileCalls = self._get_next_int(optional=True, default=self._app._profileCalls)
                    self._app._profileCalledBy = self._get_next_int(optional=True, default=self._app._profileCalledBy)
                    self._app._profileCalled = self._get_next_int(optional=True, default=self._app._profileCalled)
                    self._app._profileThreadFilter = self._get_next_str(optional=True, default=self._app._profileThreadFilter)
                    self._app._profileNameFilter = self._get_next_str(optional=True, default=self._app._profileNameFilter)

                # Config file settings
                elif fc in CMDARG_CONFIG_CUSTOM:
                    self._parse_config_options()

                # Delta path
                elif fc in CMDARG_DELTA:
                    self._parse_delta_options()

                # Duplicate processing
                # We can have an optional integer or string after this option
                elif fc in CMDARG_DUPE_PROCESSING:
                    self._app._dupeTracking = True
                    self._metaDataOptions['DUPE'] = None
                    dupeParam = self._get_next_param(optional=True)
                    try:
                        dupeParam = int(dupeParam)
                    except Exception, e:
                        # Non-integer params are deliberately left as-is; an
                        # int threshold selects size-based dupe detection,
                        # anything else selects CRC mode downstream
                        pass
                    self._app._dupeThreshold = dupeParam

                # Scan and skip options
                elif fc in CMDARG_SCAN_ALL:
                    self._parse_scan_options()
                elif fc in CMDARG_SKIP:
                    self._parse_skip_options()
0
 def move_next(self):
     '''
     Advance the argument cursor by one and trace the new argument;
     raises ArgsFinishedException when no arguments remain.
     '''
     if self.finished():
         raise self.ArgsFinishedException(self.get_current())
     newPos = self.argPos + 1
     self.argPos = newPos
     trace.msg(1, "Arg: {0}".format(str(self.argList[newPos])))
Example #14
0
    def parse_args(self):
        '''
        Do simple command line parsing and set the internal state of our
        Surveyor class based on the arguments.
        If we encounter any syntax we don't recognize or help is requested we
        bail and return help text.
        Otherwise we return None which indicates success.
        '''
        try:
            while not self.args.finished():
                self.args.move_next()

                # Disambiguation case for measurePath/fileFilter
                # A '-' may be used to replace optional arg with path/filter
                if self.args.is_cmd() and len(self.args.get_current()) == 1:
                    if self.args.is_param_next():
                        self.args.move_next()
                        self._parse_measurement_path()
                    continue

                # Assume non-Arg is a measurePath/fileFilter definition
                elif not self.args.is_cmd():
                    self._parse_measurement_path()
                    continue

                # Our processing is based on matching first character
                # ([1] skips the leading option char -- presumably '-';
                # NOTE(review): confirm against is_cmd implementation)
                fc = self.args.get_current()[1].lower()

                # Debug and profiling support
                if fc in CMDARG_DEBUG:
                    self._parse_debug_options()
                    trace.msg(2, "Args: {0}".format(str(self.args)))
                elif fc in CMDARG_PROFILE:
                    self._app._profiling = True
                    self._app._profileCalls = self._get_next_int(
                        optional=True, default=self._app._profileCalls)
                    self._app._profileCalledBy = self._get_next_int(
                        optional=True, default=self._app._profileCalledBy)
                    self._app._profileCalled = self._get_next_int(
                        optional=True, default=self._app._profileCalled)
                    self._app._profileThreadFilter = self._get_next_str(
                        optional=True, default=self._app._profileThreadFilter)
                    self._app._profileNameFilter = self._get_next_str(
                        optional=True, default=self._app._profileNameFilter)

                # Config file settings
                elif fc in CMDARG_CONFIG_CUSTOM:
                    self._parse_config_options()

                # Delta path
                elif fc in CMDARG_DELTA:
                    self._parse_delta_options()

                # Duplicate processing
                # We can have an optional integer or string after this option
                elif fc in CMDARG_DUPE_PROCESSING:
                    self._app._dupeTracking = True
                    self._metaDataOptions['DUPE'] = None
                    dupeParam = self._get_next_param(optional=True)
                    try:
                        dupeParam = int(dupeParam)
                    except Exception, e:
                        # Non-integer params are deliberately left as-is; an
                        # int threshold selects size-based dupe detection,
                        # anything else selects CRC mode downstream
                        pass
                    self._app._dupeThreshold = dupeParam

                # Scan and skip options
                elif fc in CMDARG_SCAN_ALL:
                    self._parse_scan_options()
                elif fc in CMDARG_SKIP:
                    self._parse_skip_options()
Example #15
0
 def move_next(self):
     '''
     Step forward to the next argument and trace it; raise
     ArgsFinishedException if the argument list is already exhausted.
     '''
     if self.finished():
         raise self.ArgsFinishedException(self.get_current())
     nextIndex = self.argPos + 1
     self.argPos = nextIndex
     trace.msg(1, "Arg: {0}".format(str(self.argList[nextIndex])))