Example #1
    def handle(self, *args, **options):
        # handle() of a Django management command; assumes module-level imports
        # of defaultdict (collections) and numpy
        std_arrays = defaultdict(list)
        doclist = []
        canned_loader = CannedLoader()
        for doc in canned_loader.iterate():
            # Process this document
            tm = TextManager(doc.text, doc.year)
            # Trace progress message
            self.stdout.write(doc.title + " " + doc.author)
            # Collect each profile statistic into a per-key list across the corpus
            stats = tm.profile_stats()
            for key, value in sorted(stats.items()):
                std_arrays[key].append(value)
            doclist.append((doc.author + ", " + doc.title, stats))

        # Compute per-statistic means and standard deviations across the corpus,
        # then normalise every document's statistics to z-scores
        mean_averages = {key: numpy.mean(values) for key, values in std_arrays.items()}
        standard_deviations = {key: numpy.std(values) for key, values in std_arrays.items()}
        doclist = [(title, _convert_to_zscores(stats, mean_averages, standard_deviations))
                   for title, stats in doclist]

        # For each document, rank every other document by distance and
        # report the ten nearest neighbours
        for i, doc in enumerate(doclist):
            title, stats = doc
            rankings = []
            for j, doc2 in enumerate(doclist):
                if j == i:
                    continue
                distance = _measure_distance(doc, doc2)
                rankings.append((doc2[0], doc2[1], distance))
            rankings.sort(key=lambda d: d[2])
            self.stdout.write("\n\n" + title)
            self.stdout.write(repr(stats))
            for d in rankings[:10]:
                self.stdout.write("\t%s\t%f" % (d[0], d[2]))
                self.stdout.write("\t\t" + repr(d[1]))
def prepare_canned_texts():
    """
    Load the list of documents from the text file, process them,
    and save the results as canned JSON objects in the Document database.
    """
    # Assumes module-level imports of random and `from sys import stdout`
    # Truncate the existing Document database table
    Document.objects.all().delete()

    canned_loader = CannedLoader()
    for doc in canned_loader.iterate():
        # Process this document
        tm = TextManager(doc.text, doc.year)
        # Trace progress message
        stdout.write('%s, %s\n' % (doc.author, doc.title))

        # Trim string fields to each model field's max_length where necessary
        mlength = Document._meta.get_field('author').max_length
        author = doc.author[:mlength]

        mlength = Document._meta.get_field('authorsort').max_length
        author_sort = doc.author_sort[:mlength]

        mlength = Document._meta.get_field('title').max_length
        title = doc.title[:mlength]

        mlength = Document._meta.get_field('titlesort').max_length
        title_sort = doc.title_sort[:mlength]

        # Prepare database record for this document
        dbdoc = Document(author=author,
                         authorsort=author_sort,
                         title=title,
                         titlesort=title_sort,
                         year=doc.year,
                         teaser=tm.teaser(),
                         text=tm.text,
                         lemmas=tm.lemmas_datastruct(formalism='json'),
                         tokens=tm.tokens_datastruct(formalism='json'),
                         randomsort=random.randint(0, 1000),)
        # Save the database record
        dbdoc.save()
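
For reference, a rough sketch of the Document model this function writes to; only the field names come from the code above, while the field types and max_length values are placeholders:

from django.db import models

class Document(models.Model):
    author = models.CharField(max_length=255)
    authorsort = models.CharField(max_length=255)
    title = models.CharField(max_length=255)
    titlesort = models.CharField(max_length=255)
    year = models.IntegerField()
    teaser = models.TextField()
    text = models.TextField()
    lemmas = models.TextField()    # canned JSON produced by TextManager
    tokens = models.TextField()    # canned JSON produced by TextManager
    randomsort = models.IntegerField()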