예제 #1
0
    def analyze(self, unknownDocument):
        """Score an unknown document against the trained histograms.

        The unknown document is turned into a normalized histogram, and each
        known histogram is scored as ``-sum(known[item] * log(unknown[item]))``
        over the items that occur in both histograms. Items missing from the
        unknown document's histogram are skipped.

        unknownDocument is a single doc, type Document.

        Returns a dict of scores:
          * mode "author": one entry per author, keyed by author.
          * mode "document": one entry per known document. When the per-author
            container is a mapping, entries are keyed by that mapping's keys.
            When it is a sequence of histograms, entries are keyed by
            ``(author, index)``.
            NOTE(review): the sequence case previously raised TypeError (a
            histogram dict was used as a list index and as a dict key), so the
            ``(author, index)`` key is a new convention -- confirm it suits
            the code that consumes these results.
          * any other mode: an empty dict.
        """
        unknownDocHistogram: dict = histograms.normalizeHistogram(
            histograms.generateAbsoluteHistogram(unknownDocument))

        def scoreAgainst(knownHistogram):
            # Accumulate in the known histogram's iteration order. The total
            # stays the integer 0 when no item is shared with the unknown doc.
            total = 0
            for item, weight in knownHistogram.items():
                probability = unknownDocHistogram.get(item)
                if probability is not None:
                    total -= weight * math.log(probability)
            return total

        results = dict()
        if self.mode == "author":
            # self._histograms: author -> single histogram.
            for author, authorHistogram in self._histograms.items():
                results[author] = scoreAgainst(authorHistogram)

        elif self.mode == "document":
            # self._histograms: author -> collection of per-document histograms.
            for author, knownDocs in self._histograms.items():
                if isinstance(knownDocs, dict):
                    labelled = knownDocs.items()
                else:
                    # Histograms are dicts and therefore unhashable, so they
                    # cannot serve as result keys; label them by position.
                    labelled = (
                        ((author, index), docHistogram)
                        for index, docHistogram in enumerate(knownDocs)
                    )
                for label, docHistogram in labelled:
                    results[label] = scoreAgainst(docHistogram)

        return results
예제 #2
0
 def analyze(self, unknownDocument):
     '''Compare a normalized histogram of unknownDocument against the normalized known document histograms and return a dictionary of distances.

     The returned dictionary has one entry per key of self._authorHistograms;
     each value is self.distance.distance(unknownHistogram, knownHistogram).
     '''
     # The unknown document's histogram does not depend on the author being
     # compared, so it is built once here instead of once per loop iteration.
     # NOTE(review): assumes distance.distance does not mutate its arguments.
     unknownHist = histograms.normalizeHistogram(
         histograms.generateAbsoluteHistogram(unknownDocument))
     return {
         author: self.distance.distance(unknownHist, knownHist)
         for author, knownHist in self._authorHistograms.items()
     }
예제 #3
0
 def train(self, knownDocuments):
     """Build self._histograms from the known documents according to self.mode.

     mode "author":   the normalized histogram set is reduced with
                      generateKnownDocsMeanHistograms (authors -> mean histograms).
     mode "document": the normalized histogram set is stored as is
                      (authors -> list of histograms).
     Any other mode leaves self._histograms untouched.
     """
     selectedMode = self.mode
     if selectedMode != "author" and selectedMode != 'document':
         # Unrecognised mode: nothing is computed or stored.
         return

     # Both supported modes start from the same normalized histogram set.
     normalizedSet = histograms.generateKnownDocsNormalizedHistogramSet(
         knownDocuments)

     if selectedMode == "author":
         self._histograms = histograms.generateKnownDocsMeanHistograms(
             normalizedSet)
     else:
         self._histograms = normalizedSet
예제 #4
0
	def train(self, knownDocuments):
		'''Build and store a mean normalized histogram for each known author.

		The result is kept on self.authorHistograms.
		'''
		# Step 1: normalized histogram set for the known documents.
		normalizedSet = histograms.generateKnownDocsNormalizedHistogramSet(knownDocuments)
		# Step 2: reduce that set to mean histograms and store the result.
		self.authorHistograms = histograms.generateKnownDocsMeanHistograms(normalizedSet)