def handle(self, *args, **options):
    """Report, for each canned document, the ten documents nearest to it.

    Every document yielded by ``CannedLoader`` is profiled with
    ``TextManager.profile_stats()``. The mean and standard deviation of
    each statistic across all documents are passed, together with each
    document's own statistics, to ``_convert_to_zscores``. Each document
    is then compared with every other document via ``_measure_distance``,
    and the ten entries with the smallest distance are written to
    ``self.stdout`` along with their converted statistics.

    ``args`` and ``options`` are accepted but not used.
    """
    std_arrays = defaultdict(list)
    doclist = []

    canned_loader = CannedLoader()
    for doc in canned_loader.iterate():
        # Process this document
        tm = TextManager(doc.text, doc.year)

        # Trace progress message
        self.stdout.write(doc.title + " " + doc.author)

        # Profile once and reuse the result for both the per-statistic
        # samples and the per-document record.
        profile = tm.profile_stats()
        for key, value in sorted(profile.items()):
            std_arrays[key].append(value)
        doclist.append((doc.author + ", " + doc.title, profile))

    mean_averages = {
        key: numpy.mean(values) for key, values in std_arrays.items()
    }
    standard_deviations = {
        key: numpy.std(values) for key, values in std_arrays.items()
    }

    doclist = [
        (title, _convert_to_zscores(stats, mean_averages, standard_deviations))
        for title, stats in doclist
    ]

    for i, entry in enumerate(doclist):
        title, stats = entry

        # Distance from this document to every other one (itself excluded).
        rankings = [
            (other[0], other[1], _measure_distance(entry, other))
            for j, other in enumerate(doclist)
            if j != i
        ]
        rankings.sort(key=lambda d: d[2])

        self.stdout.write("\n\n" + title)
        self.stdout.write(repr(stats))
        for d in rankings[:10]:
            self.stdout.write("\t%s\t%f" % (d[0], d[2]))
            self.stdout.write("\t\t" + repr(d[1]))
def prepare_canned_texts():
    """
    Load the list of documents from the text file, process them, and
    save the results as canned JSON objects in the Document database.

    The existing contents of the Document table are deleted first. The
    author, authorsort, title and titlesort values are truncated to the
    ``max_length`` of the corresponding model field before saving.
    """
    # The length limits belong to the model definition, so look them up
    # once instead of once per document.
    max_lengths = {
        name: Document._meta.get_field(name).max_length
        for name in ('author', 'authorsort', 'title', 'titlesort')
    }

    # Truncate the existing Document database table
    Document.objects.all().delete()

    canned_loader = CannedLoader()
    for doc in canned_loader.iterate():
        # Process this document
        tm = TextManager(doc.text, doc.year)

        # Trace progress message
        stdout.write('%s, %s\n' % (doc.author, doc.title))

        # Prepare database record for this document, trimming fields
        # where necessary
        dbdoc = Document(
            author=doc.author[:max_lengths['author']],
            authorsort=doc.author_sort[:max_lengths['authorsort']],
            title=doc.title[:max_lengths['title']],
            titlesort=doc.title_sort[:max_lengths['titlesort']],
            year=doc.year,
            teaser=tm.teaser(),
            text=tm.text,
            lemmas=tm.lemmas_datastruct(formalism='json'),
            tokens=tm.tokens_datastruct(formalism='json'),
            randomsort=random.randint(0, 1000),
        )

        # Save the database record
        dbdoc.save()