Example #1
 def addTokens(path, content):
    if not self.matchesExtension(path):
       return
    fileList.append(path)
    words = list(tokenizer.tokenize(path, content))
    identifierCount[0] += sum([1 for word in words
                               if tokenizer.isIdentifier(word[0])])
    tokenCount[0] += len(words)
Example #2
 def addTokens(path, content):
     if not self.matchesExtension(path):
         return
     fileList.append(path)
     words = list(tokenizer.tokenize(path, content))
     identifierCount[0] += sum(
         [1 for word in words if tokenizer.isIdentifier(word[0])])
     tokenCount[0] += len(words)
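Note: addTokens mutates tokenCount, identifierCount, and fileList from the enclosing scope. The codebase is Python 2 (note the print statement in Example #5), which has no nonlocal keyword, so scalar counters are wrapped in one-element lists that the closure can mutate in place. A minimal sketch of the pattern, with hypothetical names makeCounter and bump:

    # Python 2 closures cannot rebind an outer scalar, but they can
    # mutate an outer object; a one-element list acts as a mutable cell.
    def makeCounter():
        count = [0]                 # stands in for a nonlocal counter
        def bump(n=1):
            count[0] += n           # mutate the cell; no rebinding needed
            return count[0]
        return bump

    bump = makeCounter()
    bump()     # -> 1
    bump(4)    # -> 5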
Example #3
 def benchmark(self, path, content, benchmarks, sample, maxTokens, filters):
     self._resetFile(path)
     logFile = Logger.logFile('log')
     wordList = self._parse(path, content, False)
     lines = self._getLines(content)
     lineIndex = 0
     tokensTested = 0
     for index, (word, loc, node) in enumerate(wordList):
         # Go backwards so we can use last prediction and input in subsequent
         # logging code
         if tokensTested >= maxTokens:
             return tokensTested
         while (loc > lines[lineIndex][1] + len(lines[lineIndex][0])):
             lineIndex += 1
         lineStart = lines[lineIndex][1]
         linePrefix = content[lineStart:loc]
         if self.benchmarkCounter % sample == 0 and \
            (not filters.onlySeen or word in self.words) and \
            len(word) >= filters.minWordLength and \
            not ('#' in linePrefix or '//' in linePrefix) and \
            (not filters.onlyIdentifiers or tokenizer.isIdentifier(word)):
             prediction = []
             for prefixSize in range(PREFIX_SIZE, -1, -1):
                 input = AnnotatedInput(
                     Input(path, content, loc, word[:prefixSize], -1),
                     wordList, index, lines, lineIndex)
                 prediction = self._predictAnnotated(input)
                 for benchmark in benchmarks:
                     benchmark.update(prediction, word, prefixSize)
             doLogging = True
             if filters.inFirst != -1:
                 predictedWords = [
                     w for (w, p) in prediction[:filters.inFirst]
                 ]
                 if word not in predictedWords:
                     doLogging = False
             if filters.notInFirst != -1:
                 predictedWords = [
                     w for (w, p) in prediction[:filters.notInFirst]
                 ]
                 if word in predictedWords:
                     doLogging = False
             if doLogging:
                 self._logPrediction(input, prediction, word)
                 tokensTested += 1
         else:
             input = AnnotatedInput(Input(path, content, loc, '', -1),
                                    wordList, index, lines, lineIndex)
         self.words.add(word)
         self._trainOneWord(input, word, False, True)
         self.tokensTrained += 1
         self.benchmarkCounter += 1
     return tokensTested
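The inner loop counts prefixSize down from PREFIX_SIZE to 0, so every sampled token is predicted against progressively shorter typed prefixes, ending with the empty prefix; after the loop, prediction therefore holds the empty-prefix result, which is what the inFirst/notInFirst checks inspect. A minimal sketch of the prefix sequence (PREFIX_SIZE = 3 is an assumed value; the real constant is defined elsewhere):

    PREFIX_SIZE = 3    # assumed for illustration
    word = 'identifier'
    for prefixSize in range(PREFIX_SIZE, -1, -1):
        print(repr(word[:prefixSize]))    # 'ide', 'id', 'i', ''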
Example #4
 def benchmark(self, path, content, benchmarks, sample, maxTokens,
               filters):
    self._resetFile(path)
    logFile = Logger.logFile('log')
    wordList = self._parse(path, content, False)
    lines = self._getLines(content)
    lineIndex = 0
    tokensTested = 0
    for index, (word, loc, node) in enumerate(wordList):
       # Go backwards so we can use last prediction and input in subsequent
       # logging code
       if tokensTested >= maxTokens:
          return tokensTested
       while (loc > lines[lineIndex][1] + len(lines[lineIndex][0])):
          lineIndex += 1
       lineStart = lines[lineIndex][1]
       linePrefix = content[lineStart:loc]
       if self.benchmarkCounter % sample == 0 and \
          (not filters.onlySeen or word in self.words) and \
          len(word) >= filters.minWordLength and \
          not ('#' in linePrefix or '//' in linePrefix) and \
          (not filters.onlyIdentifiers or tokenizer.isIdentifier(word)):
          prediction = []
          for prefixSize in range(PREFIX_SIZE, -1, -1):
             input = AnnotatedInput(Input(path, content, loc,
                word[:prefixSize], -1), wordList, index, lines, lineIndex)
             prediction = self._predictAnnotated(input)
             for benchmark in benchmarks:
                benchmark.update(prediction, word, prefixSize)
          doLogging = True
          if filters.inFirst != -1:
             predictedWords = [w for (w, p) in prediction[:filters.inFirst]]
             if word not in predictedWords:
                doLogging = False
          if filters.notInFirst != -1:
             predictedWords = [w for (w, p) in
                               prediction[:filters.notInFirst]]
             if word in predictedWords:
                doLogging = False
          if doLogging:
             self._logPrediction(input, prediction, word)
             tokensTested += 1
       else:
          input = AnnotatedInput(Input(path, content, loc, '', -1),
                wordList, index, lines, lineIndex)
       self.words.add(word)
       self._trainOneWord(input, word, False, True)
       self.tokensTrained += 1
       self.benchmarkCounter += 1
    return tokensTested
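The filters argument is read for exactly five attributes: onlySeen, minWordLength, onlyIdentifiers, inFirst, and notInFirst, with -1 disabling the two rank-based checks. A compatible stand-in, assuming no further fields are needed (the Filters name is hypothetical):

    from collections import namedtuple

    # Hypothetical container exposing just the fields the benchmark
    # reads; the real filters object may carry more.
    Filters = namedtuple('Filters', ['onlySeen', 'minWordLength',
                                     'onlyIdentifiers', 'inFirst',
                                     'notInFirst'])

    filters = Filters(onlySeen=False, minWordLength=2,
                      onlyIdentifiers=True, inFirst=-1, notInFirst=-1)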
Example #5
   def benchmark(self, dataset):
      tokenCount = [0]
      identifierCount = [0]
      fileList = []
      def addTokens(path, content):
         if not self.matchesExtension(path):
            return
         fileList.append(path)
         words = list(tokenizer.tokenize(path, content))
         identifierCount[0] += sum([1 for word in words
                                    if tokenizer.isIdentifier(word[0])])
         tokenCount[0] += len(words)
      scanDir.scanDir(dataset, addTokens)
      trainTokens = self.trainTokens if self.trainTokens \
                    else int(self.trainTokensPct * tokenCount[0])
      metrics = self.metrics
      logFile = Logger.logFile('log')
      random.shuffle(fileList)

      def train(path, content, maxTokens):
         logFile.log('Training on {}'.format(path))
         return self.classifier.train(path, content, maxTokens, False, 1)
      def weightTrain(path, content, maxTokens):
         return self.classifier.train(path, content, maxTokens, True,
                                      self.sample)
      def benchmark(path, content, maxTokens):
         return self.classifier.benchmark(path, content, metrics,
                                        self.sample, maxTokens, self.filters)
      phases = [_Phase('Training', train, trainTokens)]
      if self.weightTraining > 0:
         phases.append(_Phase('WeightTraining', weightTrain,
                              self.weightTraining))
      finalPhase = _Phase('Testing', benchmark,
                          self.maxSamples if self.maxSamples else
                          self.testingFrac * tokenCount[0])
      if not self.startAt:
         phases.append(finalPhase)

      phaseStart = [datetime.datetime.now()]
      phaseIdx = [0]
      phaseTokenCounter = [0]
      logFile.start(phases[phaseIdx[0]].name)
      def handleFile(path):
         f = open(path)
         # FixMe: [performance] Would it be better to iterate through
         # lines to avoid loading file into memory?
         content = f.read()
         f.close()
         path = os.path.relpath(path, dataset)
         if phaseIdx[0] == len(phases):
            return
         phase = phases[phaseIdx[0]]
         if phaseTokenCounter[0] < phase.tokens:
            sys.stdout.write("\r{} {:.2%}".format(phase.name,
               phaseTokenCounter[0] / float(phase.tokens)))
            sys.stdout.flush()
            phaseTokenCounter[0] += phase.func(path, content, phase.tokens -
                  phaseTokenCounter[0])
         else:
            self.classifier.logParams('post' + phase.name)
            nextPhaseStart = datetime.datetime.now()
            phaseTime = nextPhaseStart - phaseStart[0]
            phaseStart[0] = nextPhaseStart
            lowerName = phase.name[:1].lower() + phase.name[1:]
            performanceMap.put(lowerName + 'Time', phaseTime.total_seconds())
            outputMap.put(lowerName + 'Tokens', phaseTokenCounter[0])
            sys.stdout.write("\r{} 100.00%\n".format(phase.name))
            sys.stdout.flush()
            phaseIdx[0] += 1
            phaseTokenCounter[0] = 0
            logFile.end()
            if phaseIdx[0] < len(phases):
               logFile.start(phases[phaseIdx[0]].name)

      outputMap = Logger.mapFile('output')
      performanceMap = Logger.mapFile('performance')
      killed = False
      outputMap.put("totalTokens", tokenCount[0])
      outputMap.put("totalIdentifiers", identifierCount[0])

      startTime = datetime.datetime.now()
      try:
         for path in fileList:
            handleFile(path)
         if self.startAt:
            phases.append(finalPhase)
            logFile.start(phases[phaseIdx[0]].name)
            for path in fileList[int(self.startAt * len(fileList)):]:
               handleFile(path)
      except KeyboardInterrupt:
         print('^C received, stopping benchmark')
         killed = True
      endTime = datetime.datetime.now()
      totalTime = endTime - startTime

      performanceMap.put('totalTime', totalTime.total_seconds())
      performanceMap.put('memory', utils.memUsage.memory())
      performanceMap.put('stackSize', utils.memUsage.stacksize())
      performanceMap.put('resident', utils.memUsage.resident())
      outputMap.put("tokensTrained", self.classifier.tokensTrained)
      outputMap.put("uniqueTokens", len(self.classifier.words))
      outputMap.put("uniqueIdentifiers",
                    sum([1 for word in self.classifier.words
                         if tokenizer.isIdentifier(word)]))
      for metric in metrics:
         metric.output(outputMap)
      return killed
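The _Phase records are accessed through three attributes only: name (used for progress and log labels), func (a callable taking path, content, and the remaining token budget and returning how many tokens it consumed), and tokens (the budget at which the phase ends). A minimal stand-in, assuming nothing beyond those fields:

    from collections import namedtuple

    # Minimal stand-in for _Phase: handleFile reads .name for progress
    # output, calls .func(path, content, remaining) per file, and moves
    # to the next phase once .tokens tokens have been consumed.
    _Phase = namedtuple('_Phase', ['name', 'func', 'tokens'])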
Example #6
    def benchmark(self, dataset):
        tokenCount = [0]
        identifierCount = [0]
        fileList = []

        def addTokens(path, content):
            if not self.matchesExtension(path):
                return
            fileList.append(path)
            words = list(tokenizer.tokenize(path, content))
            identifierCount[0] += sum(
                [1 for word in words if tokenizer.isIdentifier(word[0])])
            tokenCount[0] += len(words)

        scanDir.scanDir(dataset, addTokens)
        trainTokens = self.trainTokens if self.trainTokens \
                      else int(self.trainTokensPct * tokenCount[0])
        metrics = self.metrics
        logFile = Logger.logFile('log')
        random.shuffle(fileList)

        def train(path, content, maxTokens):
            logFile.log('Training on {}'.format(path))
            return self.classifier.train(path, content, maxTokens, False, 1)

        def weightTrain(path, content, maxTokens):
            return self.classifier.train(path, content, maxTokens, True,
                                         self.sample)

        def benchmark(path, content, maxTokens):
            return self.classifier.benchmark(path, content, metrics,
                                             self.sample, maxTokens,
                                             self.filters)

        phases = [_Phase('Training', train, trainTokens)]
        if self.weightTraining > 0:
            phases.append(
                _Phase('WeightTraining', weightTrain, self.weightTraining))
        finalPhase = _Phase('Testing', benchmark,
                            self.maxSamples if self.maxSamples else
                            self.testingFrac * tokenCount[0])
        if not self.startAt:
            phases.append(finalPhase)

        phaseStart = [datetime.datetime.now()]
        phaseIdx = [0]
        phaseTokenCounter = [0]
        logFile.start(phases[phaseIdx[0]].name)

        def handleFile(path):
            f = open(path)
            # FixMe: [performance] Would it be better to iterate through
            # lines to avoid loading file into memory?
            content = f.read()
            f.close()
            path = os.path.relpath(path, dataset)
            if phaseIdx[0] == len(phases):
                return
            phase = phases[phaseIdx[0]]
            if phaseTokenCounter[0] < phase.tokens:
                sys.stdout.write("\r{} {:.2%}".format(
                    phase.name, phaseTokenCounter[0] / float(phase.tokens)))
                sys.stdout.flush()
                phaseTokenCounter[0] += phase.func(
                    path, content, phase.tokens - phaseTokenCounter[0])
            else:
                self.classifier.logParams('post' + phase.name)
                nextPhaseStart = datetime.datetime.now()
                phaseTime = nextPhaseStart - phaseStart[0]
                phaseStart[0] = nextPhaseStart
                lowerName = phase.name[:1].lower() + phase.name[1:]
                performanceMap.put(lowerName + 'Time',
                                   phaseTime.total_seconds())
                outputMap.put(lowerName + 'Tokens', phaseTokenCounter[0])
                sys.stdout.write("\r{} 100.00%\n".format(phase.name))
                sys.stdout.flush()
                phaseIdx[0] += 1
                phaseTokenCounter[0] = 0
                logFile.end()
                if phaseIdx[0] < len(phases):
                    logFile.start(phases[phaseIdx[0]].name)

        outputMap = Logger.mapFile('output')
        performanceMap = Logger.mapFile('performance')
        killed = False
        outputMap.put("totalTokens", tokenCount[0])
        outputMap.put("totalIdentifiers", identifierCount[0])

        startTime = datetime.datetime.now()
        try:
            for path in fileList:
                handleFile(path)
            if self.startAt:
                phases.append(finalPhase)
                logFile.start(phases[phaseIdx[0]].name)
                for path in fileList[int(self.startAt * len(fileList)):]:
                    handleFile(path)
        except KeyboardInterrupt:
            print('^C received, stopping benchmark')
            killed = True
        endTime = datetime.datetime.now()
        totalTime = endTime - startTime

        performanceMap.put('totalTime', totalTime.total_seconds())
        performanceMap.put('memory', utils.memUsage.memory())
        performanceMap.put('stackSize', utils.memUsage.stacksize())
        performanceMap.put('resident', utils.memUsage.resident())
        outputMap.put("tokensTrained", self.classifier.tokensTrained)
        outputMap.put("uniqueTokens", len(self.classifier.words))
        outputMap.put(
            "uniqueIdentifiers",
            sum([
                1 for word in self.classifier.words
                if tokenizer.isIdentifier(word)
            ]))
        for metric in metrics:
            metric.output(outputMap)
        return killed
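The metric objects implement a small two-method interface: update(prediction, word, prefixSize), called once per prefix size for each sampled token with prediction as a list of (word, probability) pairs (see the comprehensions in Examples #3 and #4), and output(outputMap), which writes the accumulated results. A hypothetical top-k accuracy metric written against that interface (TopKAccuracy is not from the source):

    class TopKAccuracy(object):
        """Hypothetical metric: fraction of update() calls whose true
        word appears among the k highest-ranked predictions."""

        def __init__(self, k):
            self.k = k
            self.hits = 0
            self.total = 0

        def update(self, prediction, word, prefixSize):
            self.total += 1
            if word in [w for (w, p) in prediction[:self.k]]:
                self.hits += 1

        def output(self, outputMap):
            accuracy = self.hits / float(self.total) if self.total else 0.0
            outputMap.put('top{}Accuracy'.format(self.k), accuracy)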