def _fixup_column_headers(self, filename):
    '''
    If any columns were added to a file in the middle of the job, this is
    run after the file is closed to make sure that the first row has the
    right name header for each column.
    This is the only way to ensure the file has the correct information,
    which is unfortunate given the potential expense of this operation.
    We create our own randomly named temp file, to ensure it is created in
    the same place as the outfile
    '''
    # Filtering os.urandom bytes against letters/digits can leave an empty
    # suffix, so draw the random characters directly (uses the stdlib
    # random module)
    validChars = string.ascii_letters + string.digits
    randomLetters = ''.join(random.choice(validChars) for _ in range(16))
    tmpFileName = "_surveyor_tmp{0}_{1}".format(randomLetters, filename)
    tempPath = os.path.join(self._outDir, tmpFileName)
    trace.msg(1, "Fixing output headers: {0} ==> {1}".format(tmpFileName, filename))
    rowList = self._col_create_names_from_keys(filename)
    oldPath = os.path.join(self._outDir, filename)
    # Write the corrected header row, copy over the body of the old file,
    # then replace the old file with the fixed-up version
    with open(tempPath, 'wb') as tempFile:
        self._write_row(tempFile, rowList)
        with open(oldPath, 'rb') as oldFile:
            oldFile.readline()  # discard the stale header row
            for line in oldFile:
                tempFile.write(line)
    shutil.move(tempPath, oldPath)
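# A minimal, standalone sketch of the same rewrite pattern for reference:
# write the corrected header plus the original body to a temp file in the
# same directory, then replace the original in one move. The helper name
# rewrite_header and the use of tempfile here are illustrative, not part
# of Surveyor.
import os
import shutil
import tempfile

def rewrite_header(path, newHeaderLine):
    fd, tempPath = tempfile.mkstemp(dir=os.path.dirname(path) or '.')
    try:
        with os.fdopen(fd, 'wb') as tempFile, open(path, 'rb') as oldFile:
            tempFile.write(newHeaderLine)
            oldFile.readline()                  # skip the stale header
            shutil.copyfileobj(oldFile, tempFile)
    except Exception:
        os.remove(tempPath)
        raise
    shutil.move(tempPath, path)                 # same folder, so a cheap rename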
def __init__(self, configFileName, configOverrides, defaultConfigOptions=None):
    trace.config(2, "Creating ConfigStack with {0}".format(configFileName))
    self._modules = CodeSurveyorModules()
    self._reader = configreader.ConfigReader(self.load_csmodule)
    self._measureRootDir = ''
    # Stack of config files, represented as paths and lists of ConfigEntrys
    self._configStack = []
    # Cache of config file information
    # Key is the path name; value is a list of entries that represent the config file
    self._configFileCache = {}
    # List of default config option tags passed by the application
    # (a None default avoids the shared mutable default argument pitfall)
    self._defaultConfigOptions = defaultConfigOptions if defaultConfigOptions is not None else []
    # We either use overrides or try to read config files
    if configOverrides:
        trace.msg(1, "Ignoring config files: {0}".format(configOverrides))
        self._configName = ''
        self._setup_config_overrides(configOverrides)
    else:
        self._configName = configFileName
        # Make sure the config file name does not include a path, as the point
        # is to look for a config file in each folder we visit
        if os.path.dirname(self._configName):
            raise utils.ConfigError(uistrings.STR_ErrorConfigFileNameHasPath)
        # Load the default config file to use for this job
        # First try the root of the job folder, then the surveyor folder
        if not self._push_file(utils.runtime_dir()):
            if not self._push_file(utils.surveyor_dir()):
                trace.msg(1, "{0} not present in default locations".format(
                        self._configName))
def _is_file_survey_dupe(self, filePath, measures):
    '''
    Simple mechanism to identify duplicate and near-duplicate code by
    tracking a dictionary of files we see as measures. There are two modes:
    1) File Size: Build a dictionary in memory based on a hash of fileName
    and config info. In the hash buckets we store a dict of file sizes, one
    for the first of each size we see that is not within the dupe threshold.
    If we see a file size within the threshold of one of our existing hashed
    sizes, we treat it as a dupe and increment a count for reporting.
    2) NBNC CRC: We use the nbnc.crc measure to identify duplicates
    Note that we ASSUME the necessary file metadata will be present in the
    measures dictionary, as basemodule.py puts it there for the Dupe option.
    '''
    firstDupeFilePath = None
    # 1) File name and size check
    if isinstance(self._dupeThreshold, int):
        fileSize = int(measures[basemodule.METADATA_FILESIZE])
        dupeKey = (measures[basemodule.METADATA_FULLNAME] +
                measures[basemodule.METADATA_CONFIG].replace(' ', ''))
        if dupeKey in self._dupeFileSurveys:
            for dupeFileSize, (fileCount, firstFilePath) in self._dupeFileSurveys[dupeKey].iteritems():
                if abs(fileSize - dupeFileSize) <= self._dupeThreshold:
                    firstDupeFilePath = firstFilePath
                    self._dupeFileSurveys[dupeKey][dupeFileSize] = (fileCount + 1, firstFilePath)
                    trace.msg(1, "Dupe {0} by {1} of {2} bytes: {3}".format(
                            fileCount, fileSize - dupeFileSize, fileSize, filePath))
                    break
        else:
            self._dupeFileSurveys[dupeKey] = {}
        if firstDupeFilePath is None:
            self._dupeFileSurveys[dupeKey][fileSize] = (1, filePath)
            trace.file(2, "Added {0} -- {1} to dupe dictionary".format(dupeKey, fileSize))
    # 2) Code CRC check
    # Relying on nbnc.crc is brittle, because it creates both a code and a
    # runtime dependency on the Code csmodule being used. And there are valid
    # scenarios where nbnc.crc may not be present (e.g., a skipped dupe file).
    # Thus if the measure isn't present, we fail silently
    else:
        fileCrc = None
        try:
            fileCrc = measures['nbnc.crc']
        except KeyError:
            trace.file(2, "CRC Dupe - nbnc.crc missing: {0}".format(filePath))
        if fileCrc in self._dupeFileSurveys:
            fileCount, firstDupeFilePath = self._dupeFileSurveys[fileCrc]
            self._dupeFileSurveys[fileCrc] = (fileCount + 1, firstDupeFilePath)
            trace.msg(1, "Dupe {0}: {1} DUPE_OF {2}".format(fileCount, filePath, firstDupeFilePath))
        elif fileCrc is not None:
            self._dupeFileSurveys[fileCrc] = (1, filePath)
            trace.file(2, "Added {0} -- {1} to dupe dictionary".format(filePath, fileCrc))
    return firstDupeFilePath
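# A minimal sketch of the size-threshold bucketing described in mode 1 above,
# pulled out of the class for clarity. The names seen/is_dupe and the sample
# values are illustrative.
def is_dupe(seen, dupeKey, fileSize, threshold):
    # seen maps a name+config key to {firstSizeSeen: hitCount}
    buckets = seen.setdefault(dupeKey, {})
    for bucketSize in buckets:
        if abs(fileSize - bucketSize) <= threshold:
            buckets[bucketSize] += 1            # count the near-dupe
            return True
    buckets[fileSize] = 1                       # first file of this size
    return False

seen = {}
assert not is_dupe(seen, 'util.py|Code', 1000, 4)   # first sighting
assert is_dupe(seen, 'util.py|Code', 1003, 4)       # within 4 bytes => dupe
assert not is_dupe(seen, 'util.py|Code', 1010, 4)   # outside => new bucket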
def _validate_entries(self, configEntries):
    '''
    Are all config file entries consistent with each other, to avoid silent
    double counting? Throws an error exception if not.
    '''
    trace.config(2, "Checking for duplicate config entries")
    # Create a list of all possible measure/file combos
    # We ask the module to match each measure, to catch wildcard overlap
    fileFilters = []
    possibleMeasures = []
    for entry in configEntries:
        for fileFilter in entry.fileFilters:
            fileFilters.append(fileFilter)
            possibleMeasures.append((fileFilter, entry.measureFilter,
                    entry.moduleName, entry.verb, entry.tags, entry.paramsRaw,
                    entry.module))
    trace.config(4, fileFilters)
    trace.config(4, possibleMeasures)
    # Check that no file type would have a measure be double counted
    # If we have a problem, throw an exception based on the first problem item
    if len(fileFilters) > len(set(fileFilters)):
        while possibleMeasures:
            possibleMeasureTuple = possibleMeasures.pop()
            trace.config(2, "possibleMeasure: {0}".format(possibleMeasureTuple))
            (fileFilter, measureFilter, modName, verb, tags, extraParams,
                    module) = possibleMeasureTuple
            # We don't attempt conflict resolution on regex file filters,
            # because it doesn't make sense there
            if fileFilter.startswith(fileext.CUSTOM_FILE_REGEX):
                continue
            # Shallow warning check for double counting, by creating a list
            # of entries that match on verb and file type
            warningList = [
                    (ff, mf, mn, v, t, ep)
                    for ff, mf, mn, v, t, ep, _ in possibleMeasures
                    if v == verb and fileext.file_ext_match(ff, fileFilter)]
            if warningList:
                trace.config(1, "WARNING - Possible double-count: {0}".format(str(warningList)))
                # For the deep check look at tag values and the measure filter
                # (using the module from this tuple, not a leftover loop variable)
                dupeList = [
                        (v, modName, mn, mf, fileFilter, ff, t, tags, ep, extraParams)
                        for ff, mf, mn, v, t, ep in warningList
                        if len(t) == len(tags) and
                                len(t) == len(set(t) & set(tags)) and
                                module.match_measure(mf, measureFilter)]
                if dupeList:
                    trace.msg(1, "ERROR - Double-count: {0}".format(str(dupeList)))
                    dupe = dupeList[0]
                    raise utils.ConfigError(uistrings.STR_ErrorConfigDupeMeasures.format(
                            dupe[0], dupe[1], dupe[2], dupe[3], dupe[4],
                            dupe[5], dupe[6], dupe[7], dupe[8], dupe[9]))
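# A small illustration of the tag comparison used in the deep check above:
# two tag lists match when they have the same length and their set
# intersection covers all of them, i.e. they are equal as sets (assuming no
# repeated tags). The tag values here are illustrative.
t, tags = ['REV', 'SIZE'], ['SIZE', 'REV']
assert len(t) == len(tags) and len(t) == len(set(t) & set(tags))
t, tags = ['REV'], ['SIZE']
assert not (len(t) == len(tags) and len(t) == len(set(t) & set(tags)))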
def _validate_line(self, configEntry):
    '''
    Is the module being asked to do what it was designed to do?
    '''
    measureOk = configEntry.module.can_do_measure(configEntry.measureFilter)
    verbOk = configEntry.module.can_do_verb(configEntry.verb)
    if not (measureOk and verbOk):
        trace.msg(1, "Failed module validation measureOk/verbOk: {0}/{1}".format(measureOk, verbOk))
        raise utils.ConfigError(uistrings.STR_ErrorConfigInvalidMeasure.format(
                configEntry.verb, configEntry.measureFilter))
def __init__(self, configStack, options, file_measured_callback, status_callback):
    # Options define the life of a job and cannot be modified
    self._options = options
    # All UI output is done through the status callback
    self._status_callback = status_callback
    # Keep track of (and allow access to) raw file metrics
    self.numFolders = 0
    self.numFoldersMeasured = 0
    self.numUnfilteredFiles = 0
    self.numFilteredFiles = 0
    self.numFilesToProcess = 0
    # Queues to communicate with Workers, and the output thread
    self._taskQueue = multiprocessing.Queue()
    self._controlQueue = multiprocessing.Queue()
    self._outQueue = multiprocessing.Queue()
    self._outThread = jobout.OutThread(self._outQueue, self._controlQueue,
            self._options.profileName, file_measured_callback)
    # Create the max number of workers (they will be started later as needed)
    assert self._options.numWorkers > 0, "Less than 1 worker requested!"
    context = (trace.get_context(), self._options.profileName)
    self._workers = self.Workers(self._controlQueue, self._taskQueue,
            self._outQueue, context, self._options.numWorkers)
    trace.msg(1, "Created {0} workers".format(self._workers.num_max()))
    # Create our object for tracking the state of folder walking
    self._pathsToMeasure = options.pathsToMeasure
    self._folderWalker = folderwalk.FolderWalker(
            options.deltaPath, configStack, options.recursive,
            options.includeFolders, options.skipFolders,
            options.fileFilters, options.skipFiles, self.add_folder_files)
    # Utility object for managing work packages; holds the state of the
    # work package that is being prepared for sending to the queue
    self._workPackage = self.WorkPackage()
    # Other processing state
    self._continueProcessing = True
    self._taskPackagesSent = 0
    self._filesSinceLastSend = 0
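# A minimal sketch of the queue topology created above: worker processes
# consume from a task queue and push results to an output queue that a
# single thread drains. This simplified example stands in for Surveyor's
# actual worker protocol; the doubling "measurement" is a placeholder.
import multiprocessing
import threading

def worker(taskQueue, outQueue):
    for item in iter(taskQueue.get, None):      # None is the shutdown sentinel
        outQueue.put(item * 2)                  # stand-in for measuring a file

def drain(outQueue, count, results):
    for _ in range(count):
        results.append(outQueue.get())

if __name__ == '__main__':
    taskQueue, outQueue = multiprocessing.Queue(), multiprocessing.Queue()
    results = []
    outThread = threading.Thread(target=drain, args=(outQueue, 3, results))
    outThread.start()
    proc = multiprocessing.Process(target=worker, args=(taskQueue, outQueue))
    proc.start()
    for task in (1, 2, 3):
        taskQueue.put(task)
    taskQueue.put(None)                         # ask the worker to exit
    proc.join()
    outThread.join()
    print(results)                              # [2, 4, 6], in arrival order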
def read_file(self, filePath):
    '''
    Read a Surveyor configuration file and return a list of ConfigEntrys
    to be stored on the configuration stack with this folder location.
    '''
    try:
        trace.msg(1, "Config file: {0}".format(filePath))
        configEntries = self._read_file(filePath, [])
        self._validate_file(configEntries)
        trace.config(2, "Finished reading config file: {0}".format(filePath))
        trace.config(3, configEntries)
        return configEntries
    except Exception, e:
        raise utils.ConfigError(uistrings.STR_ErrorConfigFile.format(filePath, str(e)))
def _put_files_in_queue(self, path, deltaPath, filesAndConfigs):
    '''
    Package files from the given folder into workItems that are then grouped
    into workPackages and placed into the task queue for jobworkers.
    Packages are broken up if the number of files or their total size exceeds
    thresholds, to help evenly distribute load across cores
    '''
    if not filesAndConfigs:
        return
    self.numFoldersMeasured += 1
    for fileName, configEntrys in filesAndConfigs:
        # It is expensive to check file size here, but it is worth it for
        # parcelling widely varying file sizes out to cores for CPU-intensive
        # jobs. Profiling shows it is not worth caching this
        try:
            fileSize = utils.get_file_size(os.path.join(path, fileName))
        except Exception, e:
            # It is possible (at least in Windows) for a fileName to exist
            # in the file system but be invalid for Windows calls. This is
            # the first place we try to access the file through the file
            # system; if it blows up we don't want the job to fall apart,
            # and this is an unusual case, so unlike more likely errors,
            # we don't bother with a pathway back to the main application
            # to handle the error; we just swallow it and provide debug
            trace.msg(1, str(e))
            continue
        trace.cc(3, "WorkItem: {0}, {1}".format(fileSize, fileName))
        self.numFilesToProcess += 1
        workItem = (path, deltaPath, fileName, configEntrys, self._options,
                len(filesAndConfigs))
        self._workPackage.add(workItem, fileSize)
        if self._workPackage.ready_to_send() or (
                self._filesSinceLastSend > MAX_FILES_BEFORE_SEND):
            self._send_current_package()
            if not self._check_command():
                break
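# A minimal sketch of the batching rule described above: accumulate work
# items until either a byte-size or file-count threshold trips, at which
# point the caller flushes the package. The class name and threshold values
# are illustrative, not Surveyor's actual WorkPackage.
MAX_PACKAGE_BYTES = 1 << 20     # flush after ~1MB of file content
MAX_PACKAGE_FILES = 50          # or after 50 files

class WorkPackageSketch(object):
    def __init__(self):
        self.items = []
        self.totalBytes = 0

    def add(self, workItem, fileSize):
        self.items.append(workItem)
        self.totalBytes += fileSize

    def ready_to_send(self):
        return (self.totalBytes >= MAX_PACKAGE_BYTES or
                len(self.items) >= MAX_PACKAGE_FILES)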
def _write_aggregates(self):
    '''
    For each set of aggregates, we create an output file with the aggregates
    that exceed the threshold.
    HACK - We use the output writer by creating a dummy OUT file tag
    '''
    for keyName in self._aggregateNames.keys():
        fileName = str(keyName).replace('.', '')
        hackOutTagMeasure = {'tag_write_aggregates': 'OUT:' + fileName}
        analysisRows = []
        for valueRow in self._aggregates[keyName].values():
            writeRow = self._aggregateThresholdKey is None
            if not writeRow:
                try:
                    writeRow = valueRow[self._aggregateThresholdKey] > self._aggregateThreshold
                except KeyError, e:
                    raise utils.InputException(STR_AggregateThresholdKeyError.format(str(e)))
            if writeRow:
                analysisRows.append(valueRow)
        trace.msg(1, "Aggregate: {0}".format(analysisRows))
        self._writer.write_items(hackOutTagMeasure, analysisRows)
def parse_args(self):
    '''
    Do simple command line parsing and set the internal state of our Surveyor
    class based on the arguments. If we encounter any syntax we don't
    recognize, or help is requested, we bail and return help text. Otherwise
    we return None, which indicates success.
    '''
    try:
        while not self.args.finished():
            self.args.move_next()
            # Disambiguation case for measurePath/fileFilter
            # A '-' may be used to replace an optional arg with path/filter
            if self.args.is_cmd() and len(self.args.get_current()) == 1:
                if self.args.is_param_next():
                    self.args.move_next()
                    self._parse_measurement_path()
                continue
            # Assume a non-Arg is a measurePath/fileFilter definition
            elif not self.args.is_cmd():
                self._parse_measurement_path()
                continue
            # Our processing is based on matching the first character
            fc = self.args.get_current()[1].lower()
            # Debug and profiling support
            if fc in CMDARG_DEBUG:
                self._parse_debug_options()
                trace.msg(2, "Args: {0}".format(str(self.args)))
            elif fc in CMDARG_PROFILE:
                self._app._profiling = True
                self._app._profileCalls = self._get_next_int(
                        optional=True, default=self._app._profileCalls)
                self._app._profileCalledBy = self._get_next_int(
                        optional=True, default=self._app._profileCalledBy)
                self._app._profileCalled = self._get_next_int(
                        optional=True, default=self._app._profileCalled)
                self._app._profileThreadFilter = self._get_next_str(
                        optional=True, default=self._app._profileThreadFilter)
                self._app._profileNameFilter = self._get_next_str(
                        optional=True, default=self._app._profileNameFilter)
            # Config file settings
            elif fc in CMDARG_CONFIG_CUSTOM:
                self._parse_config_options()
            # Delta path
            elif fc in CMDARG_DELTA:
                self._parse_delta_options()
            # Duplicate processing
            # We can have an optional integer or string after this option
            elif fc in CMDARG_DUPE_PROCESSING:
                self._app._dupeTracking = True
                self._metaDataOptions['DUPE'] = None
                dupeParam = self._get_next_param(optional=True)
                try:
                    dupeParam = int(dupeParam)
                except (TypeError, ValueError):
                    pass  # leave non-integer params as strings
                self._app._dupeThreshold = dupeParam
            # Scan and skip options
            elif fc in CMDARG_SCAN_ALL:
                self._parse_scan_options()
            elif fc in CMDARG_SKIP:
                self._parse_skip_options()
def move_next(self):
    if self.finished():
        raise self.ArgsFinishedException(self.get_current())
    self.argPos += 1
    trace.msg(1, "Arg: {0}".format(str(self.argList[self.argPos])))