def _file_match(fileName, fileFilter):
    '''
    Performs the match check of fileName against fileFilter.
    In the case of blank detection, look for no extension.
    Otherwise use a regex comparison, using a cached version of either the
    re from fnmatch.translate or a custom RE string provided in the filter.
    '''
    if BLANK_FILE_EXT == fileFilter:
        root, ext = os.path.splitext(fileName)
        filterMatch = ('' == ext and not root.startswith('.'))
    else:
        filterRe = None
        try:
            filterRe = _FilterCache[fileFilter]
        except KeyError:
            if fileFilter.startswith(CUSTOM_FILE_REGEX):
                filterRe = re.compile(fileFilter.replace(CUSTOM_FILE_REGEX, ''), RE_OPTIONS)
            else:
                filterRe = re.compile(fnmatch.translate(fileFilter), RE_OPTIONS)
            _FilterCache[fileFilter] = filterRe
        filterMatch = filterRe.match(fileName)
        if trace.level() and filterMatch is None:
            trace.file(3, "FilterExtFilter: %s, no match: %s" % (filterRe.pattern[:10], fileName))
    # filterMatch is a bool in the blank-extension case and an re match
    # object (or None) otherwise, so normalize with bool()
    return bool(filterMatch)
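# A minimal standalone sketch of the filter-caching idea used above
# (hypothetical names, not part of the module): compile each fnmatch
# pattern once via fnmatch.translate and reuse the cached regex.
import fnmatch
import re

_exampleFilterCache = {}

def example_filter_match(fileName, fileFilter):
    try:
        filterRe = _exampleFilterCache[fileFilter]
    except KeyError:
        filterRe = re.compile(fnmatch.translate(fileFilter), re.IGNORECASE)
        _exampleFilterCache[fileFilter] = filterRe
    return filterRe.match(fileName) is not None

# example_filter_match('utils.py', '*.py')  ==> True
# example_filter_match('utils.pyc', '*.py') ==> False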
def is_text_file(fileObject):
    textChars = string.letters + string.digits + string.punctuation + string.whitespace
    bytesToCheck = 128      # Big enough window to grab, but small for speed
    startPoint = 4          # Skip start of file, for hidden text codes
    minWindowSize = 32      # Get a big enough min window to be feasible
    nonTextThreshold = 0.2  # Have some tolerance to avoid false positives

    # Grab the first bytes of the file, STRIPPING NULLS (for unicode text files)
    fileBytes = utils.strip_null_chars(
            utils.get_file_start(fileObject, bytesToCheck))

    # Special case for PDF that looks like text but isn't
    if is_pdf_file(fileObject):
        isBelowThreshold = False
        trace.file(3, "  IsTextFile({0}): {1} ==> PDF File detected".format(
                isBelowThreshold, os.path.basename(fileObject.name)))
    else:
        isBelowThreshold = utils.check_bytes_below_threshold(
                fileBytes, textChars, minWindowSize, startPoint, nonTextThreshold)
        trace.file(3, "  IsTextFile({0}): {1} ==> {2}".format(
                isBelowThreshold, os.path.basename(fileObject.name), fileBytes))
    return isBelowThreshold
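# A simplified sketch of the windowed text-detection heuristic that
# utils.check_bytes_below_threshold is assumed to implement (an
# illustration of the idea, not the actual utility code): measure the
# fraction of non-text bytes and compare it to the threshold.
import string

def example_looks_like_text(fileBytes, nonTextThreshold=0.2):
    textChars = string.letters + string.digits + string.punctuation + string.whitespace
    if not fileBytes:
        return True
    nonText = sum(1 for c in fileBytes if c not in textChars)
    return (float(nonText) / len(fileBytes)) <= nonTextThreshold

# example_looks_like_text('def foo():\n    pass\n') ==> True
# example_looks_like_text('\x00\x01\x02\x03' * 8)   ==> False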
def _run(self):
    # We keep processing the queue until the job signals it is done and
    # the queue is empty, or we receive an abort command
    while self._continue_processing():
        try:
            if self._workDone and self._outQueue.empty():
                break
            filesOutput = self._outQueue.get_nowait()
        except Empty:
            trace.cc(3, "EMPTY OUTPUT")
            time.sleep(OUTPUT_EMPTY_WAIT)
        else:
            self.taskPackagesReceived += 1
            trace.cc(2, "GOT {0} measures".format(len(filesOutput)))
            # We get a set of output for multiple files with each
            # outputQueue item. Each file has a set of output
            # and potential errors that we pack to the app
            for filePath, outputList, errorList in filesOutput:
                # Synchronous callback to application
                # Output writing and screen update occurs in this call
                self._file_measure_callback(filePath, outputList, errorList)
                if errorList:
                    trace.file(1, "ERROR measuring: {0}".format(filePath))
                    self._controlQueue.put_nowait(('JOB', 'ERROR', filePath))
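# A minimal runnable sketch of the drain-until-done queue pattern above
# (hypothetical names; OUTPUT_EMPTY_WAIT stands in for the real constant):
from Queue import Queue, Empty
import time

OUTPUT_EMPTY_WAIT = 0.1

def example_drain(outQueue, work_done):
    while True:
        try:
            if work_done() and outQueue.empty():
                break
            item = outQueue.get_nowait()
        except Empty:
            time.sleep(OUTPUT_EMPTY_WAIT)
        else:
            print item   # Stand-in for the output callback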
def _valid_folder(self, folderName):
    ''' Is this folder one we should process? '''
    if not self._skipFolders and not self._includeFolders:
        return True
    validFolder = True

    # First verify this folder is not to be skipped
    if self._skipFolders:
        _root, currentFolder = os.path.split(folderName)
        for folderPattern in self._skipFolders:
            if fnmatch.fnmatch(currentFolder, folderPattern):
                trace.file(1, "Skipping folder: %s" % folderName)
                validFolder = False
                break

    # Next verify whether it is on the include list
    if validFolder and self._includeFolders:
        includeMatch = False
        for folderPattern in self._includeFolders:
            if fnmatch.fnmatch(folderName, folderPattern):
                includeMatch = True
                break
        if not includeMatch:
            trace.file(1, "Excluding folder: %s" % folderName)
            validFolder = False

    return validFolder
def is_noncode_ext(filePath):
    rv = False
    if is_compressed_ext(filePath) or _has_ext(filePath, NonCodeFileExtensions):
        rv = True
        trace.file(3, "  NonCodeExt: {0}".format(os.path.basename(filePath)))
    return rv
def file_measured_callback(self, filePath, outputList, errorList):
    '''
    Job output thread callback to provide file measurements.
    A list of output and potential errors is provided for each file.
    Called ONCE for each file in the job; if there were multiple config
    entries for the file, outputList will have multiple items.
    '''
    self._numFilesProcessed += 1
    self._errorList.extend(errorList)
    fileTime = 0
    fileMeasured = False
    for measures, analysisResults in outputList:
        trace.file(2, "Callback: {0} -- {1}".format(filePath, measures))
        if measures.items():
            # Zero out dupe measures in place
            if self._dupeTracking:
                self._filter_dupes(filePath, measures, analysisResults)

            # Send results to the metrics writer
            fileMeasured = True
            self._numMeasures += max(1, len(analysisResults))
            if not self._summaryOnly:
                self._writer.write_items(measures, analysisResults)

            # Capture summary metrics and aggregates
            self._stash_summary_metrics(filePath, measures, analysisResults)
            self._stash_aggregates(filePath, analysisResults)

            fileTime += utils.safe_dict_get_float(measures, basemodule.METADATA_TIMING)

    self._numFilesMeasured += (1 if fileMeasured else 0)
    self._display_file_progress(filePath, fileTime)
    self._display_feedback()
def is_noncode_file(fileObject):
    maxWindowSize = 30
    fileStart = utils.get_file_start(fileObject, maxWindowSize)
    phraseFound = utils.check_start_phrases(fileStart, NonCodeFileStart)
    trace.file(3, "  NonCodeFileStart({0}): {1} ==> {2}".format(
            phraseFound, fileStart, os.path.basename(fileObject.name)))
    return phraseFound is not None
def _open_file(self, filename):
    MeasureWriter._open_file(self, filename)
    filePath = os.path.join(self._outDir, filename)
    outFile = file(filePath, 'w')
    doc = minidom.Document()
    outFile.write(doc.toprettyxml())
    trace.file(2, "Opened XML Output File: {0}".format(filePath))
    return outFile
def _open_file(self, fileName):
    MeasureWriter._open_file(self, fileName)
    filePath = os.path.join(self._outDir, fileName)
    self._rawFiles[fileName] = file(filePath, 'wb')
    outWriter = csv.writer(self._rawFiles[fileName],
            delimiter=self._delimiter, quoting=csv.QUOTE_NONNUMERIC)
    trace.file(2, "Opened Delimited Output File: {0}".format(filePath))
    return outWriter
def file_measured_callback(self, filePath, measures, analysisResults):
    '''
    Callback from the measurement module.
    We store up a list of tuples with the work output for a given file.
    '''
    assert filePath == self._currentFilePath, "Measure callback out of sync"
    trace.cc(3, "_file_measured_callback: {0}".format(filePath))
    trace.file(3, "  measures: {0}".format(measures))
    trace.file(3, "  analysis: {0}".format(analysisResults))
    self._currentFileOutput.append((measures, analysisResults))
def _remove_skip_dirs(self, root, dirs):
    '''
    Decide which child dirs should be skipped.
    Filter out dirs in place (vs. a copy), so os.walk will skip them.
    '''
    dirsToRemove = []
    for folderPattern in self._skipFolders:
        dirsToRemove += fnmatch.filter(dirs, folderPattern)
    dirsToRemove = set(dirsToRemove)
    for folder in dirsToRemove:
        trace.file(1, "Skipping over: %s\\%s" % (root, folder))
        dirs.remove(folder)
def _has_ext(filePath, extensions):
    '''
    Does the file have a matching extension, after stripping off any
    numeric-only extensions?
    '''
    fileExt = None
    while True:
        (base, extension) = os.path.splitext(filePath)
        fileExt = str(extension).strip('.')
        if fileExt.isdigit():
            filePath = base
        else:
            break
    trace.file(3, "  File Extension: {0}".format(fileExt))
    return fileExt.lower() in extensions
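# A standalone illustration of the numeric-extension stripping above
# (hypothetical helper mirroring _has_ext without the trace call):
import os

def example_real_ext(filePath):
    while True:
        base, extension = os.path.splitext(filePath)
        fileExt = extension.strip('.')
        if fileExt.isdigit():
            filePath = base
        else:
            return fileExt.lower()

# example_real_ext('backup.tar.001') ==> 'tar'
# example_real_ext('Readme.TXT')     ==> 'txt'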
def _get_files_to_process(self, folderName, fileNames, fileFilters, configPath):
    '''
    Filter the list of files based on command line options and active
    config file filters
    '''
    # If fileFilters is empty it means an empty config file, so skip all files
    if not fileFilters:
        return []

    # Optimize the most common matching of extensions by creating a cache of
    # simple '*.xxx' extensions from config filters for each config file
    filterExts = []
    try:
        filterExts = self._configFilterCache[configPath]
    except KeyError:
        filterSplits = [os.path.splitext(fileFilter) for fileFilter in fileFilters
                        if os.path.splitext(fileFilter)[0] == '*']
        filterExts = [ext for _root, ext in filterSplits]
        self._configFilterCache[configPath] = filterExts

    # Select files based on matching filters
    filesToProcess = []
    for fileName in fileNames:
        # Filter the file list by the command-line positive filter, if provided
        if fileext.file_matches_filters(fileName, self._fileExtFilters):
            # Optimize the most common case of a direct match on file extension,
            # then fall back to a full filter match on the config file filter
            _root, fileExt = os.path.splitext(fileName)
            fileFilter = None
            if fileExt in filterExts:
                fileFilter = '*' + fileExt
            else:
                fileFilter = fileext.file_matches_filters(fileName, fileFilters)
            if fileFilter is not None:
                filesToProcess.append((fileName, fileFilter))

    # Remove files that should be skipped
    if self._skipFiles:
        filesToProcess = [(fileName, fileFilter)
                for fileName, fileFilter in filesToProcess
                if not fileext.file_matches_filters(fileName, self._skipFiles)]

    # Debug tracing of files that were not measured
    if trace.level():
        filesSkipped = set(fileNames) - set([f for f, _filter in filesToProcess])
        if filesSkipped:
            trace.file(2, "SkippingFiles: %s" % filesSkipped)

    return filesToProcess
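# An illustration of the extension cache built above: only simple '*.ext'
# filters are pre-split into extensions; anything else falls back to the
# full filter match.
import os

exampleFilters = ['*.py', '*.c', 'Makefile.*', 'test_*']
exampleSplits = [os.path.splitext(f) for f in exampleFilters
                 if os.path.splitext(f)[0] == '*']
print [ext for _root, ext in exampleSplits]   # ['.py', '.c']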
def _measure_text(self, fileObject, measurements):
    '''
    Default handler
    For text-based files, go through each file line
    '''
    if self._traceLevel:
        trace.file(4, "Document: {0}".format(fileObject))
    for rawLine in fileObject:
        self.totalLines += 1
        line = utils.strip_null_chars(rawLine)

        # Detect blank lines
        if self.reBlankLine.match(line):
            self.blankLines += 1
            continue

        # Content line
        self.contentLines += 1
def walk(self, pathToMeasure):
    '''
    Walk folders while filtering, sending updates via callback.
    We may be asked to terminate in our callback.
    '''
    self._configStack.set_measure_root(pathToMeasure)
    for folderName, childFolders, fileNames in os.walk(pathToMeasure, topdown=True):
        trace.file(2, "Scanning: {0}".format(folderName))
        numUnfilteredFiles = len(fileNames)
        filesAndConfigs = []
        if fileNames and self._valid_folder(folderName):
            # Get the current set of active config filters
            fileFilters, activeConfigs, configPath = self._configStack.get_configuration(
                    folderName)

            # Filter out files by options and config items
            filesToProcess = self._get_files_to_process(
                    folderName, fileNames, fileFilters, configPath)

            # Create a list of tuples with fileName and configEntrys for each file
            for fileName, fileFilter in filesToProcess:
                configEntrys = self._get_configs_for_file(
                        fileName, fileFilter, activeConfigs, configPath)
                filesAndConfigs.append((fileName, configEntrys))

        # For a delta measure, create a fully qualified delta path name.
        # Note when we split on the path to measure, it will start with a separator
        deltaFolder = None
        if self._deltaPath is not None:
            deltaFolder = self._deltaPath + folderName[len(pathToMeasure):]

        # Call back to the job with files and configs
        continueProcessing = self._add_files_to_job(
                folderName, deltaFolder, filesAndConfigs, numUnfilteredFiles)
        if not continueProcessing or not self._expandSubdirs:
            break

        # Remove any skipped folders, and sort the rest to ensure a consistent walk
        # order across file systems (for our testing if nothing else)
        self._remove_skip_dirs(folderName, childFolders)
        childFolders.sort()
def _measure_file(self, workItem):
    '''
    Unpack workItem and run all measures requested by the configItems
    for the file
    '''
    (path, deltaPath, fileName, configItems, options, numFilesInFolder) = workItem
    self._currentFilePath = os.path.join(path, fileName)
    trace.file(3, "Processing: {0}".format(self._currentFilePath))
    deltaFilePath = None
    if deltaPath is not None:
        deltaFilePath = os.path.join(deltaPath, fileName)
    continueProcessing = True
    try:
        for configItem in configItems:
            if self._check_for_stop():
                break
            self._open_file(configItem.module, deltaFilePath)
            #
            # Synchronous delegation to the measure module defined in the config file
            #
            configItem.module.process_file(
                    self._currentFilePath, self._currentFileIterator,
                    configItem, numFilesInFolder, self.file_measured_callback)
    except utils.FileMeasureError, e:
        trace.traceback(2)
        self._currentFileErrors.append(
                uistrings.STR_ErrorMeasuringFile.format(self._currentFilePath, str(e)))
        continueProcessing = not options.breakOnError
    return continueProcessing
def _stash_aggregates(self, filePath, analysisResults):
    '''
    As we receive results for files, if we have requests to aggregate
    results, store away aggregate information.
    The aggregate functionality is based on names of items generated by
    specific csmodules; we consider it a fatal error if what is requested
    for aggregation and what is present in analysisResults are out of sync
    '''
    # For each set of aggregates we go through the results and add
    # them to the appropriate aggregate set
    for aggKey, aggNames in self._aggregateNames.iteritems():
        aggregateDict = self._aggregates.setdefault(aggKey, {})
        trace.file(2, "Aggregating {0} items in {1}".format(len(analysisResults), aggKey))
        for result in analysisResults:
            # aggKey has the name for the value from results that we
            # will be keying the aggregate dictionary on
            try:
                newKey = result[aggKey]
            except KeyError, e:
                raise utils.InputException(STR_AggregateKeyError.format(str(e)))
            else:
                aggregate = aggregateDict.setdefault(newKey, {'aggregate.count': 0})

                # Specific names can be provided to aggregate, or we can do all
                namesToAggregate = aggNames
                if isinstance(aggNames, basestring):
                    if aggNames == 'all':
                        namesToAggregate = result.keys()

                # Take each value from the result and aggregate according to type
                for itemName in namesToAggregate:
                    self._aggregate_update(itemName, result[itemName], aggregate)

                # Count the item
                aggregate['aggregate.count'] += 1

                # Update the aggregate
                aggregateDict[newKey] = aggregate

        # The dictionary for this aggKey has been updated, so stash it
        self._aggregates[aggKey] = aggregateDict
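# A toy illustration of the aggregation above with hypothetical data,
# keying on a 'file.ext' item and counting results per key:
exampleResults = [{'file.ext': 'py'}, {'file.ext': 'py'}, {'file.ext': 'c'}]
exampleAggregates = {}
for result in exampleResults:
    aggregate = exampleAggregates.setdefault(result['file.ext'], {'aggregate.count': 0})
    aggregate['aggregate.count'] += 1
print exampleAggregates
# {'py': {'aggregate.count': 2}, 'c': {'aggregate.count': 1}}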
def _get_delta_lines(self, filePath, deltaFilePath):
    '''
    Return a line buffer that represents additional lines relative to the
    delta path. We are not doing a full diff; we only take into account new
    files, and lines in existing files that are new/modified.
    '''
    self._deltaFilePath = deltaFilePath
    deltaLines = None

    # If no corresponding file exists in the delta, we do a normal file open
    if not os.path.exists(deltaFilePath):
        trace.file(1, "Delta file doesn't exist for: {0}".format(deltaFilePath))
        deltaLines = self._open_file(filePath)

    # We only do a diff if there is an identical file name that has been modified
    elif not filecmp.cmp(deltaFilePath, filePath):
        fileToMeasure = self._open_file(filePath)
        if fileToMeasure is not None:
            measureFileLines = fileToMeasure.readlines()
            fileToMeasure.close()
            deltaFileLines = None
            with open(deltaFilePath, 'rU') as deltaFile:
                deltaFileLines = deltaFile.readlines()
            # Materialize the diff generator so the empty check below works
            diffLines = list(difflib.unified_diff(deltaFileLines, measureFileLines))
            if diffLines:
                deltaLines = []
                for line in diffLines:
                    # Skip the unified diff header lines
                    if line.startswith(('+++', '---', '@@')):
                        continue
                    # Content lines carry a one-character '+'/'-' prefix
                    if line.startswith('+') or (self._deltaIncludeDeleted and
                            line.startswith('-')):
                        deltaLines.append(line[1:])
                trace.file(1, "{0} delta lines with: {1}".format(len(deltaLines), deltaFilePath))
    else:
        trace.file(1, "Delta skip: {0} == {1}".format(filePath, deltaFilePath))

    return deltaLines
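# A minimal look at the unified_diff output the delta logic consumes
# (illustrative only): added lines carry a single '+' prefix, and the
# '---'/'+++'/'@@' header lines must be filtered out.
import difflib

exampleOld = ['a\n', 'b\n']
exampleNew = ['a\n', 'b changed\n', 'c\n']
for line in difflib.unified_diff(exampleOld, exampleNew):
    print repr(line)
# '--- \n', '+++ \n', '@@ -1,2 +1,3 @@\n',
# ' a\n', '-b\n', '+b changed\n', '+c\n'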
def _is_file_survey_dupe(self, filePath, measures):
    '''
    Simple mechanism to identify duplicate and near-duplicate code by
    tracking a dictionary of files we see as measures. There are two modes:

    1) File Size: Build a dictionary in memory based on a hash of fileName
    and config info. In the hash buckets we store a dict of file sizes for
    the first of each size we see that is not within the dupe threshold.
    If we see a file size within the threshold of one of our existing hashed
    sizes, we treat it as a dupe and increment a count for reporting.

    2) NBNC CRC: We use the nbnc.crc measure to identify duplicates.

    Note that we ASSUME the necessary file metadata will be present in the
    measures dictionary, as basemodule.py puts it there for the Dupe option.
    '''
    firstDupeFilePath = None

    # 1) File name and size check
    if isinstance(self._dupeThreshold, int):
        fileSize = int(measures[basemodule.METADATA_FILESIZE])
        dupeKey = (measures[basemodule.METADATA_FULLNAME] +
                measures[basemodule.METADATA_CONFIG].replace(' ', ''))
        if dupeKey in self._dupeFileSurveys:
            for dupeFileSize, (fileCount, firstFilePath) in self._dupeFileSurveys[dupeKey].iteritems():
                if (dupeFileSize - self._dupeThreshold) <= fileSize and (
                        fileSize <= (dupeFileSize + self._dupeThreshold)):
                    firstDupeFilePath = firstFilePath
                    self._dupeFileSurveys[dupeKey][dupeFileSize] = (fileCount + 1, firstFilePath)
                    trace.msg(1, "Dupe {0} by {1} of {2} bytes: {3}".format(
                            fileCount, fileSize - dupeFileSize, fileSize, filePath))
                    break
        else:
            self._dupeFileSurveys[dupeKey] = {}
        if firstDupeFilePath is None:
            self._dupeFileSurveys[dupeKey][fileSize] = (1, filePath)
            trace.file(2, "Added {0} -- {1} to dupe dictionary".format(dupeKey, fileSize))

    # 2) Code CRC check
    # Relying on nbnc.crc is brittle, because it is both a code and runtime
    # dependency on the Code csmodule being used. And there are valid scenarios
    # where nbnc.crc may not be present (e.g., skipping a dupe file). Thus if the
    # measure isn't present, we fail silently
    else:
        fileCrc = None
        try:
            fileCrc = measures['nbnc.crc']
        except KeyError:
            trace.file(2, "CRC Dupe - nbnc.crc missing: {0}".format(filePath))
        if fileCrc in self._dupeFileSurveys:
            fileCount, firstDupeFilePath = self._dupeFileSurveys[fileCrc]
            self._dupeFileSurveys[fileCrc] = (fileCount + 1, firstDupeFilePath)
            trace.msg(1, "Dupe {0}: {1} DUPE_OF {2}".format(fileCount, filePath, firstDupeFilePath))
        elif fileCrc is not None:
            self._dupeFileSurveys[fileCrc] = (1, filePath)
            trace.file(2, "Added {0} -- {1} to dupe dictionary".format(filePath, fileCrc))

    return firstDupeFilePath
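# A standalone sketch of the CRC-based dupe tracking (mode 2 above), with
# zlib.crc32 standing in for the nbnc.crc measure:
import zlib

exampleDupeSurveys = {}

def example_is_dupe(filePath, fileContents):
    fileCrc = zlib.crc32(fileContents)
    if fileCrc in exampleDupeSurveys:
        fileCount, firstDupeFilePath = exampleDupeSurveys[fileCrc]
        exampleDupeSurveys[fileCrc] = (fileCount + 1, firstDupeFilePath)
        return firstDupeFilePath
    exampleDupeSurveys[fileCrc] = (1, filePath)
    return None

# example_is_dupe('a.py', 'x = 1\n') ==> None
# example_is_dupe('b.py', 'x = 1\n') ==> 'a.py'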
def _open_file(self, filePath, oldFileHandle=None):
    ''' Return the requested fileObject if criteria are met '''
    tryToOpen = True
    newFileHandle = None

    # Check for extensions
    if self._ignoreNonCode and filetype.is_noncode_ext(filePath):
        trace.file(1, "Skipping, non-code ext: {0}".format(filePath))
        tryToOpen = False
    # Check for size threshold
    elif self._sizeThreshold > 0:
        fileSize = utils.get_file_size(filePath)
        if self._sizeThreshold < fileSize:
            trace.file(1, "Skipping, size {0}: {1}".format(fileSize, filePath))
            tryToOpen = False

    if tryToOpen:
        # Open the file if it hasn't been opened, otherwise reset it
        if not oldFileHandle:
            # Use a universal open with line buffering to support binary files and
            # reduce the cost of open on larger files
            newFileHandle = open(filePath, 'rU', 1)
        else:
            newFileHandle = oldFileHandle
            newFileHandle.seek(0)  # Reset the file

        # Do tests that look at the start of the file
        keepFileOpen = False
        if self._ignoreNonCode and filetype.is_noncode_file(newFileHandle):
            trace.file(1, "Skipping, non-code start: {0}".format(filePath))
        elif self._ignoreBinary and not filetype.is_text_file(newFileHandle):
            trace.file(1, "Skipping, binary char: {0}".format(filePath))
        else:
            keepFileOpen = True

        if not keepFileOpen:
            # If we were NOT passed an existing file handle, close what we opened
            if not oldFileHandle and newFileHandle:
                newFileHandle.close()
            newFileHandle = None

    return newFileHandle
def process_file(self, filePath, fileLines, configEntry, numSameFiles,
        file_measured_callback):
    '''
    Inherited modules use the default implementation of process_file to
    handle calling _survey and packaging results, including any file metadata
    '''
    utils.timing_set('FILE_MEASURE_TIME')
    trace.file(2, "process_file: {0} {1}".format(self.__class__.__name__, filePath))
    trace.file(3, "  config: {0}".format(str(configEntry)))

    # Stash path for error handling in derived classes
    self._currentPath = utils.SurveyorPathParser(filePath)

    # Does the config measure filter need to be overridden?
    if self._measureFilter is not None:
        configEntry.new_measure_filter(self._measureFilter)

    # Measurements (whole file metrics) will be stored in a dictionary
    # Pack measurement data with file metadata
    measurements = {}
    self._pack_metadata_into_measures(configEntry, numSameFiles, measurements)

    # Analysis items (per line items) are a list of dictionaries
    analysis = []

    #
    # Delegate the survey work to specializations
    #
    measureResults = {}
    analysisResults = []
    if self._survey(fileLines, configEntry, measurements, analysis):
        # Pack measurements that match our measure filter
        for measureName, measure in measurements.iteritems():
            if self.match_measure(measureName, configEntry.measureFilters):
                measureResults[measureName] = measure

        # Pack analysis items into a list of dictionaries for return to the app
        # We only send analysis items that match the filter
        for analysisItem in analysis:
            analysisRow = {}
            for itemName, itemValue in analysisItem.iteritems():
                if self.match_measure(itemName, configEntry.measureFilters):
                    analysisRow[itemName] = itemValue
            if analysisRow:
                analysisResults.append(analysisRow)

    # If this is a delta comparison and there are no lines, it means the
    # delta file is an exact dupe
    if not fileLines and self._deltaFilePath:
        measureResults[METADATA_DUPE_PATH] = self._deltaFilePath

    # Add timing info
    if self.match_measure(METADATA_TIMING, configEntry.measureFilters):
        measureResults[METADATA_TIMING] = "{0:.4f}".format(utils.timing_get('FILE_MEASURE_TIME'))

    self._currentPath = None
    self._deltaFilePath = None

    # Send data back to the caller (jobworker.Worker in the default framework)
    file_measured_callback(filePath, measureResults, analysisResults)
def _survey_lines(self, linesToSurvey, params, measurements, analysis):
    '''
    Analyze the file line by line. linesToSurvey is an iterable set of lines.
    Processing is driven by the regular expressions in member variables.
    The order of processing each line is:
        - Preprocess line string
        - Detect machine vs. human code
        - Detect blank lines
        - Detect single and multi-line comments
        - Capture line measures
        - Perform line processing (searches, routines, etc.)
    '''
    # Setup dictionary for measures and searches we'll do
    self._survey_start(params)

    # If there are no lines to process, we may still want to output empty measures
    if linesToSurvey is None:
        linesToSurvey = []

    # Track whether we are inside a multi-line comment - we ignore nesting
    scanningMultiLine = False

    # Loop through the raw lines we were passed
    for bufferLine in linesToSurvey:
        self.counts['RawLines'][self._activeBlock] += 1
        if self._traceLevel:
            trace.file(4, "Raw: {0}".format(bufferLine))

        # Allow specializations to skip and/or special-case certain lines
        if self._alternate_line_processing(bufferLine):
            continue

        # If we have a line separator, apply it
        lines = [bufferLine]
        if self.addLineSep is not None:
            lines = bufferLine.split(self.addLineSep)

        #
        # Read through the lines to measure and process them one at a time
        # This is the main processing loop for all csmodules derived from NBNC
        #
        try:
            for rawLine in lines:
                self.counts['TotalLines'][self._activeBlock] += 1

                # Allow for clean up of artifacts or other pre-processing
                line = self._preprocess_line(rawLine)

                # Detect true blank lines
                if self.reTrueBlankLine.match(line):
                    self.counts['TrueBlankLines'][self._activeBlock] += 1
                    self._trace_line(line, "T")
                    continue

                # Block detection
                if len(self.blockDetectors) > 1:
                    if self._detect_block_change(line, analysis):
                        # Don't allow a multi-line comment to span blocks
                        scanningMultiLine = False

                # Determine comment state
                # This is done before blank lines to make sure we consider multi-line
                # comment syntax that will be counted as "blank", e.g., /* on its own line
                onCommentLine, scanningMultiLine = self._detect_line_comment(
                        line, scanningMultiLine)

                # Detect "blank" lines with no useful info
                if self._detect_blank_line(line):
                    continue

                # Measure and analyze -- overridden in derived classes
                self._measure_line(line, onCommentLine)
                self._analyze_line(line, analysis, onCommentLine)
        except Exception, e:
            trace.traceback()
            raise utils.FileMeasureError(
                    "Problem processing line: {0} with module: {1}\n{2}".format(
                        str(sum(self.counts['RawLines'])), self.__class__.__name__, str(e)))
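# A minimal sketch of the multi-line comment state the loop above tracks
# (an assumed shape for _detect_line_comment using C-style delimiters,
# not the module's actual implementation):
import re

exampleStart = re.compile(r'/\*')
exampleEnd = re.compile(r'\*/')

def example_detect_line_comment(line, scanningMultiLine):
    ''' Returns (onCommentLine, scanningMultiLine) '''
    if scanningMultiLine:
        # Inside a block comment; it continues unless the terminator appears
        return True, exampleEnd.search(line) is None
    if exampleStart.search(line):
        # A block comment opens here; it spans lines if not closed on this one
        return True, exampleEnd.search(line) is None
    return line.lstrip().startswith('//'), False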