def _group_words(chars, chinese_only=False): obj_list = [] loop = 0 skip = 0 for x in chars: if skip != 0: skip -= 1 loop += 1 continue obj = { 'chars': x, 'wordset': loop, } nc = False # IS IT A LINEBREAK if nc == False and x == '\n': obj['is_linebreak'] = True nc = True # IS IT A SPACE if nc == False and x == ' ': obj['is_space'] = True nc = True # IS IT PUNCTUATION if nc == False and _is_punctuation(x): obj['is_punctuation'] = True nc = True # IS IT A NUMBER? if nc == False and _is_number(x): obj['is_number'] = True number = True num = x while number == True: # if the next character is also a number, add it to this one try: next = chars[loop+1] except: break if _is_number(next): num = "%s%s" % (num, next) chars.pop(loop+1) else: break obj['chars'] = num nc = True # IS THE CHARACTER ENGLISH? if nc == False and _is_english(x): obj['is_english'] = True english = True eng_word = x while english == True: # IF THE NEXT CHAR IS ENGLISH, LETS BUILD THE ENGLISH WORD try: next = chars[loop+1] except: break if _is_english(next): eng_word = "%s%s" % (eng_word, next) chars.pop(loop+1) else: break obj['chars'] = eng_word nc = True # IF THE CHARACTER IS NOT CHINESE if nc == True: if chinese_only == False: obj_list.append(obj) loop += 1 continue search_string = [x,] # THIS LOOP WILL BUILD OUR CHINESE WORD - GUESSING WE WON'T HAVE MANY MORE THAN 10 CHARS for i in range(1,10): try: next_chars = chars[loop+i] if _is_punctuation(next_chars): next_chars = None break else: search_string.append(next_chars) except: break r_server = _get_redis() r = False while r == False and len(search_string) > 0: key = "ZH:%sC:%s" % ( len(search_string), "".join(search_string)) r = r_server.exists(key) if r: break else: try: search_string.pop() except IndexError: pass # initialise a ChineseWord object and add it to our object_list the_string = "".join(search_string) word = ChineseWord(chars=the_string) obj_list.append(word) # tells us how many characters need to be skipped before we start searching 
again # because maybe this word included the subsequent 3 chars, so let's not searhc them # again skip += (len(search_string)-1) loop += 1 return obj_list
def search(request, search_string=None, title='Search', words=None):
    """Dispatch a search request to the matching result view.

    The query comes from ``search_string`` (underscores are turned into
    spaces) or, when that is missing or empty, from a POSTed ``SearchForm``
    (field ``char``). Without any query the plain search page is rendered.

    With a query, the first matching rule wins:
      1. ``_is_ambiguous`` -> the problem page with ``messages.AMBIGUOUS_WORD``
      2. a ``settings.PINYIN_WORD_KEY`` entry exists -> ``_pinyin_search``
      3. ``_is_english`` -> ``_english_search``
      4. otherwise the query is split and grouped with ``_group_words``
         (unless ``words`` was supplied); a single result redirects to the
         ``single_word`` URL, anything else renders the word list.

    For authenticated users a ``word_searched`` signal is sent per word.

    Args:
        request: the current request object.
        search_string: query taken from the URL, or None.
        title: page title; reaches the templates through ``locals()``.
        words: pre-grouped words; when given, the query is not regrouped.

    Returns:
        The response produced by the selected view/render helper, or an
        HttpResponseRedirect for a single-word result.
    """
    # NOTE: every template below receives locals() as its context, so the
    # local variable names in this function are part of the template
    # contract -- check the templates before renaming any of them.
    r_server = _get_redis()

    # replace search string underscores with spaces
    if search_string:
        search_string = search_string.strip().replace('_', ' ')

    # An empty string carries no query. Treat it like a missing one so it
    # gets the search form instead of running the lookup pipeline on ''.
    if not search_string:
        search_string = None

    if search_string is None:
        if request.method != 'POST':
            form = SearchForm()
            return _render(request, 'website/search.html', locals())

        # POST without a URL query: the query has to come from the form.
        form = SearchForm(request.POST)
        if not form.is_valid():
            # nothing usable was posted - show the plain search page
            form = SearchForm()
            return _render(request, 'website/search.html', locals())
        search_string = form.cleaned_data['char']

    # HANDLES AN AMBIGUOUS SEARCH
    if _is_ambiguous(search_string):
        message = messages.AMBIGUOUS_WORD
        # NOTE(review): this is the only branch using render() and a
        # template outside 'website/' -- confirm that is intentional.
        return render(request, 'problem.html', locals())

    if r_server.exists(settings.PINYIN_WORD_KEY % _pinyin_to_ascii(search_string)):
        return _pinyin_search(request, search_string)

    if _is_english(search_string):
        return _english_search(request, search_string)

    # IF THE SEARCH IS OVER 10 CHARACTERS, RETURN A TEXT
    #if len(search_string) > 12:
    #    from creader.views import text
    #    return text(request, words=search_string)

    if not words:
        things = _split_unicode_chrs(search_string)
        words = _group_words(things)

    # IF THE USER WAS LOGGED IN, RECORD IT IN THEIR 'SAVED WORDS'
    if request.user.is_authenticated():
        for x in words:
            # _group_words returns plain dicts for spaces, punctuation,
            # numbers and English runs; those have no ``.chars`` attribute
            # (reading it raised AttributeError) and are not words to save.
            if isinstance(x, dict):
                continue
            word_searched.send(
                sender=word_searched,
                word=x.chars,
                time=datetime.datetime.now(),
                user_id=request.user.email
            )

    # if there's only 1 word, take us straight to the single word definition
    if len(words) == 1:
        word = words[0]
        url = reverse('single_word', args=[word])
        return HttpResponseRedirect(url)

    return _render(request, 'website/wordlist.html', locals())