Ejemplo n.º 1
0
    def handle(self, *args, **options):
        """Print each canned document followed by its ten closest documents.

        Every document yielded by ``CannedLoader().iterate()`` is profiled
        with ``TextManager.profile_stats()``.  The per-key mean and standard
        deviation across all documents are then used to convert each
        document's stats via ``_convert_to_zscores``, and every document is
        compared against every other one with ``_measure_distance``.  The ten
        entries with the smallest distance are written to ``self.stdout``.

        ``*args`` and ``**options`` are accepted but not used here.
        NOTE(review): the signature and ``self.stdout`` look like a Django
        management command entry point -- confirm against the enclosing class.
        """
        std_arrays = defaultdict(list)
        doclist = []
        canned_loader = CannedLoader()
        for doc in canned_loader.iterate():
            tm = TextManager(doc.text, doc.year)
            # Trace progress message
            self.stdout.write(doc.title + " " + doc.author)
            # Profile the text once and reuse the result for both the
            # per-key value arrays and the document list, rather than
            # calling profile_stats() twice for the same document.
            profile = tm.profile_stats()
            for key, value in sorted(profile.items()):
                std_arrays[key].append(value)
            doclist.append((doc.author + ", " + doc.title, profile))

        # Per-key statistics over the whole corpus.
        mean_averages = {
            key: numpy.mean(values) for key, values in std_arrays.items()
        }
        standard_deviations = {
            key: numpy.std(values) for key, values in std_arrays.items()
        }
        doclist = [
            (title, _convert_to_zscores(stats, mean_averages, standard_deviations))
            for title, stats in doclist
        ]

        for i, doc in enumerate(doclist):
            title, stats = doc
            # Compare against every other document; the position check keeps
            # a document from being ranked against itself.  Note that
            # _measure_distance receives the whole (title, stats) tuples.
            rankings = [
                (other[0], other[1], _measure_distance(doc, other))
                for j, other in enumerate(doclist)
                if j != i
            ]
            # Smallest distance first.
            rankings.sort(key=lambda d: d[2])
            self.stdout.write("\n\n" + title)
            self.stdout.write(repr(stats))
            for d in rankings[0:10]:
                self.stdout.write("\t%s\t%f" % (d[0], d[2]))
                self.stdout.write("\t\t" + repr(d[1]))