def handle(self, *args, **options):
    # write new global white lists
    verbosity = int(options['verbosity'])
    if verbosity >= 2:
        self.stderr.write('Writing new global white lists...\n')
    writeWhiteListTables(GlobalWord.objects.order_by('untranslated'))
    # update local tables
    if verbosity >= 2:
        self.stderr.write('Updating local tables...\n')
    writeLocalTables(Document.objects.all())
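For reference, a management command like this can also be invoked programmatically. A minimal usage sketch via Django's call_command; the command name 'updatewhitelists' is an assumption, since the snippet does not show the file name under management/commands/:

from django.core.management import call_command

# Run the command with verbosity 2 so the progress messages above are
# emitted. 'updatewhitelists' is a placeholder, not from the original code.
call_command('updatewhitelists', verbosity=2)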
Example #2
def confirm_conflicting_duplicates(request, grade, deferred=False):

    WordFormSet = formset_factory(ConflictingWordForm, extra=0)
    if request.method == 'POST':
        formset = WordFormSet(request.POST)
        if formset.is_valid():
            affected_documents = set()
            # save the correct words in the GlobalWord
            # FIXME: in Django 1.3+ formsets are iterable, so you can just say
            # for form in formset:
            for form in formset.forms:
                # FIXME: This is an open attack vector. A user can
                # change any word in the global dict with a carefully
                # crafted post. It might be better not to pass the id.
                word = GlobalWord(grade=grade, **form.cleaned_data)
                word.save()
                # note which documents are affected
                filter_args = dict((k, form.cleaned_data[k]) for k in ('untranslated', 'type', 'homograph_disambiguation'))
                words_to_delete = LocalWord.objects.filter(grade=grade, **filter_args)
                affected_documents.update([word.document for word in words_to_delete])
                # delete the conflicting words (and also plain
                # duplicate non-conflicting words) from the LocalWords
                words_to_delete.delete()
            writeLocalTables(list(affected_documents))
            # once we are done dealing with conflicts we go back to regular confirmation
            redirect = 'dictionary_confirm_g1' if grade == 1 else 'dictionary_confirm_g2'
            return HttpResponseRedirect(reverse(redirect))
    else:
        conflicting_words = get_conflicting_words(grade)
        braille_choices = defaultdict(set)
        global_ids = {}
        for untranslated, type, homograph_disambiguation, braille, global_id in conflicting_words:
            key = (untranslated, type, homograph_disambiguation)
            braille_choices[key].add(braille)
            if global_id > 0:
                global_ids[key] = global_id

        initial = [
            {'id': global_ids.get((untranslated, type, homograph_disambiguation)),
             'untranslated': untranslated,
             'type': type,
             'homograph_disambiguation': homograph_disambiguation,
             'braille': sorted(braille_choices[(untranslated, type, homograph_disambiguation)]),
             } for untranslated, type, homograph_disambiguation in braille_choices.keys()]
        initial = sorted(initial, key=lambda x: x['untranslated'])
        
        formset = WordFormSet(initial=initial)

    return render_to_response('dictionary/confirm_conflicting_duplicates.html', locals(), 
                              context_instance=RequestContext(request))
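One way to close the attack vector flagged in the FIXME above is to resolve the posted id against the database and update only the braille field, instead of re-instantiating GlobalWord from raw POST data. A hedged sketch; save_confirmed_word is a hypothetical helper, not part of the original view:

from django.shortcuts import get_object_or_404

def save_confirmed_word(grade, cleaned_data):
    # Hypothetical safer save path: trust only the id and braille fields
    # from the POST, and verify the id actually belongs to this grade.
    word_id = cleaned_data.get('id')
    if word_id is not None:
        word = get_object_or_404(GlobalWord, pk=word_id, grade=grade)
        word.braille = cleaned_data['braille']
    else:
        word = GlobalWord(grade=grade, **cleaned_data)
    word.save()
    return word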
Example #3
def local(request, document_id, grade):

    document = get_object_or_404(Document, pk=document_id)
    if request.method == 'POST':
        WordFormSet = modelformset_factory(
            LocalWord, 
            form=RestrictedWordForm,
            exclude=('document', 'isConfirmed', 'isDeferred', 'grade'), 
            can_delete=True)

        formset = WordFormSet(request.POST, 
                              queryset=LocalWord.objects.filter(grade=grade, document=document))
        if formset.is_valid():
            instances = formset.save()
            writeLocalTables([document])
            redirect = 'dictionary_local_g1' if grade == 1 else 'dictionary_local_g2'
            return HttpResponseRedirect(reverse(redirect, args=[document_id]))
        else:
            return render_to_response('dictionary/local.html', locals(),
                                      context_instance=RequestContext(request))

    filterform = FilterForm(request.GET)
    if filterform.is_valid():
        currentFilter = filterform.cleaned_data['filter']
    else:
        # fall back to an empty filter so currentFilter is always bound
        currentFilter = ''

    words_list = LocalWord.objects.filter(grade=grade, document=document,
                                          untranslated__contains=currentFilter).order_by('untranslated', 'type')
    paginator = Paginator(words_list, MAX_WORDS_PER_PAGE)
    try:
        page = int(request.GET.get('page', '1'))
    except ValueError:
        page = 1
    
    try:
        words = paginator.page(page)
    except InvalidPage:
        words = paginator.page(paginator.num_pages)

    WordFormSet = modelformset_factory(
        LocalWord, 
        form=RestrictedWordForm,
        exclude=('document', 'isConfirmed', 'isDeferred', 'grade'), 
        can_delete=True, extra=0)

    formset = WordFormSet(queryset=words.object_list)

    return render_to_response('dictionary/local.html', locals(), 
                              context_instance=RequestContext(request))
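The FilterForm used above is not shown in these examples. A minimal sketch that would satisfy the view; the actual definition may differ:

from django import forms

class FilterForm(forms.Form):
    # required=False keeps an empty GET valid, so cleaned_data['filter']
    # is an empty string and the untranslated__contains lookup matches all.
    filter = forms.CharField(required=False, initial='')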
Example #4
def check(request, document_id, grade):

    document = get_object_or_404(Document, pk=document_id)

    if request.method == 'POST':
        WordFormSet = modelformset_factory(
            LocalWord, 
            form=RestrictedWordForm,
            exclude=('document', 'isConfirmed', 'isDeferred', 'grade'), 
            can_delete=True)

        formset = WordFormSet(request.POST)
        if formset.is_valid():
            instances = formset.save(commit=False)
            for instance in instances:
                instance.grade = grade
                instance.document = document
                instance.save()
            writeLocalTables([document])
            redirect = 'dictionary_check_g1' if grade == 1 else 'dictionary_check_g2'
            return HttpResponseRedirect(reverse(redirect, args=[document_id]))
        else:
            return render(request, 'dictionary/words.html', locals())

    # filter some words from the xml
    content = document.latest_version().content
    content.open()
    # '-strip:none': if this flag is not set, Saxon strips whitespace
    # automatically for documents with a DOCTYPE declaration
    tree = etree.parse(
        saxon9he(content.file,
                 os.path.join(settings.PROJECT_DIR, 'dictionary', 'xslt', 'filter.xsl'),
                 '-strip:none', contraction=grade).stdout,
        parser=HUGE_TREE_PARSER)
    content.close()

    # grab the homographs
    homographs = set(("|".join(homograph.xpath('text()')).lower() 
                      for homograph in tree.xpath('//brl:homograph', namespaces=BRL_NAMESPACE)))
    duplicate_homographs = set((smart_unicode(word) for 
                                word in 
                                chain(GlobalWord.objects.filter(grade=grade).filter(type=5).filter(homograph_disambiguation__in=homographs).values_list('homograph_disambiguation', flat=True),
                                      LocalWord.objects.filter(grade=grade).filter(type=5).filter(document=document).filter(homograph_disambiguation__in=homographs).values_list('homograph_disambiguation', flat=True))))
    unknown_homographs = [{'untranslated': homograph.replace('|', ''), 
                           'braille': translate(getTables(grade), homograph.replace('|', unichr(0x250A))),
                           'type': 5,
                           'homograph_disambiguation': homograph}
                          for homograph in homographs - duplicate_homographs]
    # grab names and places
    names = set(word for name in tree.xpath('//brl:name', namespaces=BRL_NAMESPACE)
                if name.text is not None
                for word in name.text.lower().split())
    duplicate_names = set((smart_unicode(word) for 
                           word in 
                           chain(GlobalWord.objects.filter(grade=grade).filter(type__in=(1,2)).filter(untranslated__in=names).values_list('untranslated', flat=True),
                                 LocalWord.objects.filter(grade=grade).filter(type__in=(1,2)).filter(document=document).filter(untranslated__in=names).values_list('untranslated', flat=True))))
    unknown_names = [{'untranslated': name, 
                      'braille': translate(getTables(grade, name=True), name), 
                      'type': 2,
                      'homograph_disambiguation': ''}
                     for name in names - duplicate_names]
    places = set(word for place in tree.xpath('//brl:place', namespaces=BRL_NAMESPACE)
                 if place.text is not None
                 for word in place.text.lower().split())
    duplicate_places = set((smart_unicode(word) for 
                            word in 
                            chain(GlobalWord.objects.filter(grade=grade).filter(type__in=(3,4)).filter(untranslated__in=places).values_list('untranslated', flat=True),
                                  LocalWord.objects.filter(grade=grade).filter(type__in=(3,4)).filter(document=document).filter(untranslated__in=places).values_list('untranslated', flat=True))))
    unknown_places = [{'untranslated': place,
                       'braille': translate(getTables(grade, place=True), place),
                       'type': 4,
                       'homograph_disambiguation': ''}
                      for place in places - duplicate_places]

    # filter homographs, names and places from the xml
    xsl = etree.parse(os.path.join(settings.PROJECT_DIR, 'dictionary', 'xslt', 'filter_names.xsl'),
                      parser=HUGE_TREE_PARSER)
    transform = etree.XSLT(xsl)
    filtered_tree = transform(tree)
    # grab the rest of the content
    content = etree.tostring(filtered_tree, method="text", encoding=unicode)
    # filter all punctuation and replace dashes by space, so we can split by space below
    content = ''.join(
        # replace Punctuation Dash and Punctuation other (except for "'") with space
        c if c == u"\u0027" or unicodedata.category(c) not in ['Pd', 'Po'] else ' '
        for c in content 
        # drop all chars which are not letters, separators or select
        # punctuation which we replace with space later on
        if unicodedata.category(c) in ['Lu', 'Ll', 'Zs', 'Zl', 'Zp', 'Pd', 'Po']
        or c in ['\n', '\r'])

    new_words = set((w.lower() for w in content.split() if len(w) > 1))
    # FIXME: We basically do a set difference manually here. This
    # would probably be better if done inside the db. However for that
    # we would have to be able to insert the new_words into the db in
    # an efficient manner, i.e. bulk insert. For a possibility on how
    # to do this in the context of Django ORM look at
    # http://ole-laursen.blogspot.com/2010/11/bulk-inserting-django-objects.html.
    # After that we could for example do a query along the lines of
    # cursor.execute("SELECT untranslated from new_words EXCEPT SELECT
    # untranslated FROM dict_words;"). However MySQL doesn't seem to
    # support EXCEPT so it would be SELECT untranslated FROM new_words
    # w1 LEFT JOIN dict_words w2 ON w1.untranslated=w2.untranslated
    # WHERE w2.untranslated IS NULL;
    duplicate_words = set((smart_unicode(word) for 
                           word in
    # exclude types 2, 4 and 5 as these probably have different
    # translations, so we do need to show these words if they are not
    # tagged even if they have an entry in the dictionary as a name or
    # a place.
                           chain(GlobalWord.objects.filter(grade=grade).exclude(type__in=(2,4,5)).filter(untranslated__in=new_words).values_list('untranslated', flat=True),
                                 LocalWord.objects.filter(grade=grade).exclude(type__in=(2,4,5)).filter(document=document).filter(untranslated__in=new_words).values_list('untranslated', flat=True))))
    unknown_words = [{'untranslated': word, 
                      'braille': translate(getTables(grade), word),
                      'type' : 0,
                      'homograph_disambiguation': ''}
                     for word in new_words - duplicate_words]

    unknown_words = unknown_words + unknown_homographs + unknown_names + unknown_places
    unknown_words.sort(key=lambda x: x['untranslated'].lower())

    # remove words from the local words which are no longer in the document (they might have
    # been typos that slipped into the local words and were corrected subsequently)
    all_duplicates = duplicate_homographs | duplicate_names | duplicate_places | duplicate_words
    LocalWord.objects.filter(grade=grade, document=document).exclude(untranslated__in=all_duplicates).delete()

    paginator = Paginator(unknown_words, MAX_WORDS_PER_PAGE)
    try:
        page = int(request.GET.get('page', '1'))
    except ValueError:
        page = 1
    
    try:
        words = paginator.page(page)
    except InvalidPage:
        words = paginator.page(paginator.num_pages)

    WordFormSet = modelformset_factory(
        LocalWord, 
        form=RestrictedWordForm,
        exclude=('document', 'isConfirmed', 'isDeferred', 'grade'), 
        extra=len(words.object_list), can_delete=True)

    have_type = any(word['type'] != 0 for word in words.object_list)
    have_homograph_disambiguation = any(word['homograph_disambiguation'] != '' for word in words.object_list)
    formset = WordFormSet(queryset=LocalWord.objects.none(), initial=words.object_list)

    # Document statistic
    stats = DocumentStatistic(document=document, grade=grade, total=len(new_words), unknown=len(unknown_words))
    # guard against empty documents to avoid a ZeroDivisionError
    percentage = 100.0 * stats.unknown / stats.total if stats.total else 0.0
    stats.save()

    return render(request, 'dictionary/words.html', locals())
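The FIXME about computing the set difference in the database could be prototyped with the LEFT JOIN the comment suggests, since MySQL lacks EXCEPT. A rough sketch, assuming the new words have already been bulk-inserted into a scratch table; the table and column names are illustrative only:

from django.db import connection

def unknown_untranslated():
    # Hypothetical DB-side difference: words present in the scratch table
    # new_words but absent from dict_words. Both table names are
    # assumptions, not from the original schema.
    cursor = connection.cursor()
    cursor.execute(
        "SELECT w1.untranslated FROM new_words w1 "
        "LEFT JOIN dict_words w2 ON w1.untranslated = w2.untranslated "
        "WHERE w2.untranslated IS NULL")
    return [row[0] for row in cursor.fetchall()]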