def get_word_list(self):
    """Return the de-duplicated base forms of the text's words, sorted by self.sort_key.

    Closes the underlying resource first, then tokenizes self.text on word
    characters, keeps alphabetic tokens only, reduces each to its base form
    (tokens whose base form is None are dropped), and sorts the unique bases
    by self.sort_key (presumably a frequency measure — the original comment
    said "sort by frequency"; confirm against sort_key's implementation).
    """
    self.close()
    tokens = RegexpTokenizer(r'\w+').tokenize(self.text)
    # Set comprehension de-duplicates; base_form may return None for unknown words.
    bases = {b for b in (base_form(w) for w in tokens if w.isalpha())
             if b is not None}
    # sorted(key=...) replaces the decorate-sort-undecorate pattern and avoids
    # shadowing the builtin name 'list' as the original did.
    return sorted(bases, key=self.sort_key)
def find_glossary_words(book_dir, all_words):
    """Return the sorted glossary headword base forms that also occur in all_words.

    Reads glossary.json from book_dir; returns an empty list when the book
    has no glossary file.
    """
    glossary_path = os.path.join(book_dir, 'glossary.json')
    if not os.path.exists(glossary_path):
        return []
    with open(glossary_path, 'r', encoding='utf-8') as fp:
        entries = json.load(fp)
    headword_bases = {base_form(entry['headword']) for entry in entries}
    return sorted(headword_bases & set(all_words))
def get_word_rating(request, word):
    """Return the current user's rating of a word, as JSON.

    The word is reduced to its base form before lookup. Responds with
    {'rating': <value>} when a WordModel record exists; otherwise with
    {'word': <base>, 'rating': False}.
    """
    # Compute the base form before the try-block: the except handlers reference
    # 'base', and previously a ClusiveUser.DoesNotExist raised during the user
    # lookup left 'base' unbound, causing a NameError in the handler.
    base = base_form(word)
    try:
        user = ClusiveUser.objects.get(user=request.user)
        wm = WordModel.objects.get(user=user, word=base)
        return JsonResponse({'rating': wm.rating})
    except WordModel.DoesNotExist:
        return JsonResponse({'word': base, 'rating': False})
    except ClusiveUser.DoesNotExist:
        logger.warning("No clusive user, can't fetch ratings")
        return JsonResponse({'word': base, 'rating': False})
def word_bank_remove(request, word):
    """Remove a word (by its base form) from the current user's word bank.

    Returns JSON {'success': 1} when the word model was found and the removal
    registered, {'success': 0} otherwise.
    """
    try:
        user = ClusiveUser.objects.get(user=request.user)
        base = base_form(word)
        # .get() raises DoesNotExist rather than returning None, so the
        # original 'if wm:' branch was dead code and a missing record leaked
        # an uncaught exception (HTTP 500). Catch it and report failure.
        wm = WordModel.objects.get(user=user, word=base)
        wm.register_wordbank_remove()
        return JsonResponse({'success': 1})
    except WordModel.DoesNotExist:
        return JsonResponse({'success': 0})
    except ClusiveUser.DoesNotExist:
        logger.warning("No clusive user, can't remove word")
        return JsonResponse({'success': 0})
def set_word_rating(request, word, rating):
    """Record the current user's rating of a word and emit the word_rated signal.

    The word is reduced to its base form; a WordModel row is created on first
    rating. Returns JSON {'success': 1} on success, {'success': 0} when the
    rating is invalid or no ClusiveUser exists for the request.
    """
    try:
        clusive_user = ClusiveUser.objects.get(user=request.user)
        base = base_form(word)
        model, _created = WordModel.objects.get_or_create(user=clusive_user, word=base)
        # Guard clause: reject out-of-range ratings before touching the model.
        if not WordModel.is_valid_rating(rating):
            return JsonResponse({'success': 0})
        model.register_rating(rating)
        word_rated.send(sender=GlossaryConfig.__class__, request=request,
                        word=word, rating=rating)
        return JsonResponse({'success': 1})
    except ClusiveUser.DoesNotExist:
        logger.warning("No clusive user, can't set ratings")
        return JsonResponse({'success': 0})
def init_data(self):
    """Load the book's glossary JSON into self.data.

    self.data maps each headword's base form, plus every lowercased alternate
    form, to that entry's raw glossary dict. On a missing or unreadable
    glossary file, self.data is left as an empty dict and the problem is logged.
    """
    # Single initialization: the original re-assigned self.data = {} a second
    # time inside the try-block, which was redundant.
    self.data = {}
    try:
        book = Book.objects.get(id=self.book_id)
        with open(book.glossary_storage, 'r', encoding='utf-8') as file:
            logger.debug("Reading glossary %s", file.name)
            rawdata = json.load(file)
        for worddata in rawdata:
            base = glossaryutil.base_form(worddata['headword'])
            self.data[base] = worddata
            for altform in worddata['alternateForms']:
                self.data[altform.lower()] = worddata
    except FileNotFoundError:
        # Must precede EnvironmentError (its superclass) to be reachable.
        logger.warning('Book %s has no glossary', book)
    except EnvironmentError:
        logger.error('Failed to read glossary data')
def form_valid(self, form):
    """Compute readability statistics and per-word frequency info for the pasted text.

    Results are stored in self.stats (list of name/value/desc dicts) and
    self.words (per-base-form aggregates sorted by zipf frequency), then the
    same form page is re-rendered.
    """
    text = form.cleaned_data['text']
    word_list = wf.tokenize(text, self.lang)
    self.stats = self._compute_stats(text)
    self.words = self._summarize_words(word_list)
    logger.debug('words: %s', self.words)
    # Don't do normal process of redirecting to success_url. Just stay on this form page forever.
    return self.render_to_response(self.get_context_data(form=form))

def _compute_stats(self, text):
    """Build the list of readability statistics shown for the text."""
    # Compute shared counts once instead of calling textstat repeatedly.
    word_count = textstat.lexicon_count(text)
    difficult = textstat.difficult_words(text)
    # Guard against empty text: the original divided by lexicon_count
    # unconditionally, raising ZeroDivisionError on zero words.
    difficult_pct = 100 * difficult / word_count if word_count else 0
    return [
        {'name': 'Flesch-Kincaid grade level',
         'value': textstat.flesch_kincaid_grade(text),
         'desc': 'Based on avg sentence length and syllables per word.'},
        {'name': 'Dale-Chall grade level',
         'value': textstat.dale_chall_readability_score_v2(text),
         'desc': 'Based on avg sentence length and percent difficult words.'},
        {'name': 'Number of words', 'value': word_count},
        {'name': 'Number of sentences', 'value': textstat.sentence_count(text)},
        {'name': 'Average sentence length', 'value': textstat.avg_sentence_length(text)},
        {'name': 'Average syllables per word', 'value': textstat.avg_syllables_per_word(text)},
        {'name': 'Difficult words',
         'value': "%d (%d%%): %s" % (difficult, difficult_pct,
                                     ', '.join(textstat.difficult_words_list(text)))},
    ]

def _summarize_words(self, word_list):
    """Aggregate tokens by base form: occurrence count, alternate forms, zipf frequency."""
    word_info = {}
    for word in word_list:
        base = base_form(word)
        entry = word_info.get(base)
        if entry is None:
            entry = {'hw': base, 'alts': [], 'count': 0,
                     'freq': wf.zipf_frequency(base, self.lang)}
            word_info[base] = entry
        entry['count'] += 1
        # Track inflected forms that differ from the base, without duplicates.
        if word != base and word not in entry['alts']:
            entry['alts'].append(word)
    # Least-frequent words first, matching the original ordering.
    return sorted(word_info.values(), key=lambda x: x.get('freq'))
def glossdef(request, book_id, cued, word): """Return a formatted HTML representation of a word's meaning(s).""" base = base_form(word) try: book = Book.objects.get(pk=book_id) except Book.DoesNotExist: book = None defs = lookup(book, base) vocab_lookup.send(sender=GlossaryConfig.__class__, request=request, word=base, cued=cued, source=defs['source'] if defs else None) # TODO might want to record how many meanings were found (especially if it's 0): len(defs['meanings']) if defs: context = {'defs': defs} if book: context['book_path'] = book.path return render(request, 'glossary/glossdef.html', context=context) else: return HttpResponseNotFound("<p>No definition found</p>")
def test_base_forms(self):
    """base_form reduces inflected forms to a canonical base form."""
    cases = [
        ('noun', 'noun'),
        ('noun', 'nouns'),
        ('act', 'acting'),
        ('act', 'acted'),
        ('go', 'went'),
        ('go', 'goes'),
        ('large', 'largest'),
        ('text', 'texts'),
        ('install', 'installing'),  # Not British 'instal'
        ('more', 'more'),           # alphabetically before the other possibility, "much"
        ('ooblecks', 'ooblecks'),   # unknown word is passed through as is
    ]
    for expected, word in cases:
        self.assertEqual(expected, base_form(word))