def _get_random(self, number=1, chars=None, length='*', pinyin=None):
    """
    Returns random words the same length as the characters provided.

    number -- how many words to return (default 1).
    chars  -- when given, its length replaces ``length``.
    length -- word length used in the "ZH:<length>C:*" key pattern; the
              default '*' matches every length.
    pinyin -- accepted for interface compatibility; not used here.

    Returns one decoded entry when ``number`` is 1 (None when no key
    matched the pattern), otherwise a list holding up to ``number``
    decoded entries. At most the first 21 keys yielded by the scan are
    considered, so the list never holds more than 21 entries.
    """
    if chars:
        length = len(chars)

    r_server = _get_redis()
    pattern = "ZH:%sC:*" % length

    # ONLY LOOK AT THE FIRST 21 KEYS THE SCAN GIVES US
    candidates = []
    for key in r_server.scan_iter(pattern):
        candidates.append(key)
        if len(candidates) > 20:
            break

    # shuffle() already defaults to random.random; passing it explicitly
    # is rejected by Python 3.11+
    random.shuffle(candidates)

    if number == 1:
        # NOTHING MATCHED THE PATTERN: NO WORD TO GIVE BACK
        if not candidates:
            return None
        return json.loads(_search_redis(candidates[0]))

    # NEVER ASK FOR MORE KEYS THAN WE COLLECTED (AND NOTHING FOR number <= 0)
    wanted = candidates[:max(number, 0)]
    return [json.loads(_search_redis(key)) for key in wanted]
def handle_noargs(self, **options):
    """
    For every "ZH:1C:*" entry, store the words that start with it and the
    words that contain it alongside the entry's other attributes.

    Prints a running count of the entries rewritten.
    """
    # EXAMPLE DICTIONARY LINE:
    # 一事無成 一事无成 [yi1 shi4 wu2 cheng2] /to have achieved nothing/to be a total failure/to get nowhere/
    r_server = _get_redis()

    # GET ALL THE CHARACTER ENTRIES IN THE DICT (100K ROUGHLY)
    processed = 0
    for entry_key in r_server.scan_iter("ZH:1C:*"):
        # THE CHARACTERS ARE THE THIRD ':'-SEPARATED FIELD OF THE KEY
        this_word = ChineseWord(chars=smart_unicode(entry_key.split(':')[2]))

        # FIND OUT WHAT WORDS START WITH / CONTAIN THIS ONE
        beginning_with = [match.chars for match in this_word._starts_with()]
        holding = [match.chars for match in this_word._contains()]

        # SAVE THE VALUES
        this_word.starts_with = beginning_with
        this_word.contains = holding

        # vars() HANDS BACK THE INSTANCE ATTRIBUTES AS A DICT, WHICH IS WHAT GETS SERIALISED
        r_server.set(entry_key, json.dumps(vars(this_word)))

        processed += 1
        print(processed)
def _pinyin_search(request, search_string):
    """
    Render the word list for a pinyin search.

    The characters stored under the ASCII form of ``search_string`` are
    loaded as ChineseWord objects. For each meaning of each word, the word
    goes into ``words`` when the meaning's normalised pinyin equals the
    normalised search string, or when the search string carries no tone
    number at all; otherwise it goes into ``suggested``. A word with
    several meanings is therefore added once per meaning.

    NOTE: locals() is the template context, so the local names below are
    part of what the template can see.
    """
    # CLEAN UP THE INCOMING PINYIN
    clean_string = _normalize_pinyin(search_string)
    ascii_string = _pinyin_to_ascii(search_string)
    key = settings.PINYIN_WORD_KEY % ascii_string

    suggested = []
    words = []
    r_server = _get_redis()

    try:
        for o in json.loads(r_server.get(key)):
            word = ChineseWord(chars=o)
            for i in word.meanings:
                # EXACT MATCH ON THE CONVERTED PINYIN, OR NO TONE NUMBERS
                # IN THE CLEANED STRING TO COMPARE AGAINST
                if (_normalize_pinyin(i['pinyin']) == clean_string
                        or not any(ext in clean_string for ext in '12345')):
                    words.append(word)
                else:
                    suggested.append(word)
    except TypeError:
        # presumably raised by json.loads when the key is missing and get()
        # returns None; note this also hides any TypeError raised inside
        # the loop above
        pass

    return _render(request, 'website/wordlist.html', locals())
def _english_search(request, search_string):
    """
    Render the word list for an English search.

    ``words`` holds the ChineseWord objects stored under the whole search
    phrase; ``suggested`` holds those stored under each space-separated
    word of the phrase looked up on its own. A lookup that raises
    TypeError is skipped.

    NOTE: locals() is the template context, so the local names below are
    part of what the template can see.
    """
    r_server = _get_redis()

    # FIRST THE WHOLE PHRASE, KEYED BY ITS WORD COUNT AND LOWERCASED TEXT
    words = []
    try:
        key = settings.ENGLISH_WORD_KEY % (
            len(search_string.split(' ')),
            search_string.lower(),
        )
        obj = json.loads(r_server.get(key))
        for x in obj['characters']:
            words.append(ChineseWord(chars=x))
    except TypeError:
        pass

    # THEN SPLIT UP THE STRING AND SEARCH EACH WORD INDIVIDUALLY
    suggested = []
    for x in search_string.split(' '):
        try:
            key = settings.ENGLISH_WORD_KEY % (len(x.split(' ')), x.lower())
            obj = json.loads(r_server.get(key))
            # the inner loop reuses the name x; the outer iteration is
            # unaffected because it advances its own iterator
            for x in obj['characters']:
                suggested.append(ChineseWord(chars=x))
        except TypeError:
            pass

    return _render(request, 'website/wordlist.html', locals())
def text(request, hashkey=None, words=None):
    """
    Show a stored text, split into grouped words.

    Without a ``hashkey`` a random five-character one is generated and
    ``words`` is stored under "text:<hashkey>" first. The stored record is
    then looked up; when nothing is found a problem page (or a problem
    snippet for AJAX requests) is returned. When the request carries a
    ``page`` GET parameter only the page snippet is rendered.

    NOTE: locals() is the template context, so the local names below are
    part of what the templates can see.
    """
    if hashkey:
        key = 'text:%s' % hashkey
    else:
        # NO HASH GIVEN: INVENT ONE AND SAVE THE TEXT UNDER IT
        hashkey = ''.join(random.choice(string.ascii_lowercase + string.digits) for x in range(5))
        key = "text:%s" % hashkey

        user = request.user.email if request.user.is_authenticated() else '******'

        mapping = {
            'user': user,
            'title': '',
            'chars': words,
            'timestamp': time.time(),
            'hash': hashkey,
            'url' : '',
        }

        # ADD IT TO REDIS
        r_server = _get_redis()
        r_server.hmset(key, mapping)

    # ONLY LOAD THE RECORD WHEN THE lookup=False PROBE SAYS THERE IS ONE
    obj = _search_redis(key) if _search_redis(key, lookup=False) else None

    if not obj:
        if request.is_ajax():
            html = render_to_string('website/problem_snippet.html', locals())
            return HttpResponse(html)
        return _render(request, 'website/problem.html', locals())

    title = 'Article'

    # url STAYS UNSET WHEN THE RECORD HAS NO 'url' FIELD
    try:
        url = urlparse(obj['url']).netloc
    except KeyError:
        pass

    # because redis stores things as strings...
    chars = obj['chars'].decode('utf-8')
    things = _split_unicode_chrs(chars)
    obj_list = _group_words(things)

    list_template = 'creader/text_page_snippet.html'

    if request.GET.get('page'):
        template = 'creader/text_page_snippet.html'
        return render_to_response(template, locals())

    return _render(request, 'creader/text.html', locals())
def _del_keys(self, key):
    """
    Delete every Redis key returned for the pattern ``key`` and print how
    many keys a delete was issued for.
    """
    r_server = _get_redis()

    # enumerate FROM 1 SO THE LAST INDEX IS THE NUMBER OF DELETES;
    # item_count STAYS 0 WHEN NOTHING MATCHED
    item_count = 0
    for item_count, matched in enumerate(r_server.keys(key), 1):
        r_server.delete(matched)

    print("Deleted %s items matching %s" % (item_count, key))
    return
def __init__(self, email):
    """
    Load the word list stored for the user with this email address.

    Sets ``self.key`` ("PW:<email>"), ``self.user`` (looked up by email)
    and ``self.words``. When nothing is stored under the key yet, an empty
    dictionary is used and written back through ``_update_list``.
    """
    self.key = "PW:%s" % email
    self.user = User.objects.get(email=email)

    r_server = _get_redis()

    # WARNING: to completely clear the wordlist for testing, delete
    # self.key from redis at this point

    # FETCH ONCE: a second get() costs another round trip and could return
    # None if the key vanished in between
    stored = r_server.get(self.key)
    if stored:
        self.words = json.loads(stored)
    else:
        self.words = {}
        self._update_list(self.words)
def url(request):
    """
    Fetch the web page named by the ``url`` GET parameter, store its text
    in Redis under a random five-character id and redirect to the 'text'
    view for that id.

    Without a ``url`` parameter, or with one that is not an http:// or
    https:// address, the problem page is rendered instead. For logged-in
    users the ``article_saved`` signal is sent after the text is stored.

    NOTE: locals() is the template context for the problem page.
    """
    # TODO if it's already been scanned and saved, don't bother parsing it again
    if not request.GET.get('url'):
        problem = "TODO: Make a proper page here which explains the reader and how it works"
        return _render(request, 'website/problem.html', locals())

    url = request.GET.get('url')

    # THE ADDRESS COMES STRAIGHT FROM THE USER. urllib2.urlopen ALSO OPENS
    # OTHER SCHEMES SUCH AS file:, SO ONLY LET WEB ADDRESSES THROUGH.
    # NOTE(review): this does not stop requests to internal hosts.
    if not url.lower().startswith(('http://', 'https://')):
        problem = "Only http:// and https:// addresses can be read"
        return _render(request, 'website/problem.html', locals())

    # PARSE THE WEBPAGE AND RETURN A LIST OF CHARACTERS
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # DON'T LEAVE THE CONNECTION OPEN UNTIL GARBAGE COLLECTION
        response.close()
    text = readabilityParser(html)
    title = Document(html).title()
    new_text = strip_tags(text)

    # GIVE IT AN ID
    this_id = ''.join(random.choice(string.ascii_lowercase + string.digits) for x in range(5))
    key = "text:%s" % this_id

    user = request.user.email if request.user.is_authenticated() else '******'

    mapping = {
        'user': user,
        'title': title,
        'chars': new_text,
        'timestamp': time.time(),
        'hash': this_id,
        'url' : url,
    }

    # ADD IT TO REDIS
    r_server = _get_redis()
    r_server.hmset(key, mapping)

    if request.user.is_authenticated():
        article_saved.send(
            sender=article_saved,
            article_id=this_id,
            time=time.time(),
            user_id=request.user.pk,
        )

    return HttpResponseRedirect(reverse('text', args=[this_id]))
def _contains(self, chars=None):
    """
    Returns any words containing this word.

    chars -- the characters to look for; defaults to ``self.chars``.

    Scans the "ZH:*C:*<chars>*" keys, skips keys whose character part
    contains a comma, and returns the matches as ChineseWord objects
    sorted by ascending ``length``.
    """
    r_server = _get_redis()

    # FALL BACK TO OUR OWN CHARACTERS, THE SAME WAY _starts_with DOES.
    # Without this a call with no argument scans for the literal text "None".
    if not chars:
        chars = self.chars

    pattern = "ZH:*C:*%s*" % chars

    words = []
    for redis_key in r_server.scan_iter(pattern):
        # THE CHARACTERS ARE THE LAST ':'-SEPARATED FIELD OF THE KEY
        found = redis_key.split(':')[-1]
        if "," in found:
            continue
        words.append(ChineseWord(smart_unicode(found)))

    return sorted(words, key=lambda thing: thing.length)
def _starts_with(self, chars=None):
    """
    Returns all words starting with these characters.

    chars -- the leading characters to look for; ``self.chars`` is used
             when this is empty or None.

    Scans the "ZH:*C:<chars>*" keys, skips keys whose character part
    contains a comma, and returns the matches as ChineseWord objects
    sorted by ascending ``length``.
    """
    r_server = _get_redis()

    prefix = chars or self.chars
    pattern = "ZH:*C:%s*" % prefix

    matches = []
    for redis_key in r_server.scan_iter(pattern):
        # THE CHARACTERS ARE THE LAST ':'-SEPARATED FIELD OF THE KEY
        candidate = redis_key.split(':')[-1]
        if "," not in candidate:
            matches.append(ChineseWord(smart_unicode(candidate)))

    # SHORTEST WORDS FIRST
    matches.sort(key=lambda entry: entry.length)
    return matches
def handle_noargs(self, **options):
    """
    Rebuild the Chinese word (ZH), pinyin (PY) and measure word entries in
    Redis from the dictionary file at settings.DICT_FILE_LOCATION.

    All existing "ZH:*" and "PY:*" keys are deleted first. Lines starting
    with "#" are skipped. For every other line:
      * each measure word found in a "CL:" meaning gets the line's
        characters appended to its entry, and the "CL:" meaning is removed
        from the meanings text;
      * the characters are added to the entry for the line's pinyin;
      * a pinyin/meaning pair is appended to the entry for the characters.

    Prints a running count and a final total.
    """
    # EXAMPLE LINE:
    # 一事無成 一事无成 [yi1 shi4 wu2 cheng2] /to have achieved nothing/to be a total failure/to get nowhere/

    # EMPTY ALL ZH + PY KEYS
    self._del_keys('ZH:*')
    self._del_keys('PY:*')

    # ONE CONNECTION AND ONE FACTORY FOR THE WHOLE IMPORT, NOT ONE PER LINE
    r_server = _get_redis()
    f = ReadingFactory()
    pinyin_options = {'toneMarkType': 'numbers', 'yVowel': 'v', 'missingToneMark': 'fifth'}

    # NOW LETS START
    item_count = 0
    # with CLOSES THE FILE EVEN WHEN A LINE FAILS TO PARSE
    with open(settings.DICT_FILE_LOCATION) as dict_file:
        for line in dict_file:
            if line.startswith("#"):
                continue

            # GATHER ALL THE MAIN VARIABLES
            new = line.split()
            numbered_pinyin = line[(line.index('[') + 1):line.index(']')]
            tonal_pinyin = f.convert(numbered_pinyin, 'Pinyin', 'Pinyin',
                                     sourceOptions=pinyin_options)
            meanings = line[(line.index('/') + 1):line.rindex('/')]
            characters = new[1]

            # REMOVE ALL THE UGLY CHARACTERS
            if ',' in characters:
                characters = characters.replace(',', '')

            # GET AND CLEAN THE MEASURE WORD
            mws = None
            if "CL:" in meanings:
                # BUILD A NEW LIST OF THE MEANINGS WE KEEP. Popping from the
                # list being enumerated skipped the element after each pop,
                # so a second consecutive "CL:" entry was never processed.
                kept_meanings = []
                for val in meanings.split('/'):
                    if "CL:" not in val:
                        kept_meanings.append(val)
                        continue

                    if mws is None:
                        mws = []

                    for x in val.replace('CL:', '').split(','):
                        # DROP THE [pinyin] PART, THEN KEEP WHAT FOLLOWS '|'
                        x = x[:x.index('[')]
                        if '|' in x:
                            x = x[(x.index('|') + 1):]

                        # ADD THE MEASURE WORDS ENTRY
                        # ---------------------------
                        mws_key = settings.MEASURE_WORD_KEY % x
                        if r_server.exists(mws_key):
                            values = json.loads(_search_redis(mws_key))
                            values['chars'].append(characters)
                        else:
                            values = {'chars': [characters]}
                        r_server.set(mws_key, json.dumps(values))

                        mws.append(x)

                meanings = "/".join(kept_meanings)

            # NOTE(review): dividing the length by 3 presumably assumes a
            # UTF-8 byte string with three bytes per character - confirm for
            # entries containing ASCII or four-byte characters
            char_key = settings.CHINESE_WORD_KEY % ((len(characters) / 3), characters)

            # CREATE THE PRONUNCIATION/MEANING PAIR
            pair = {
                'pinyin': tonal_pinyin,
                'pinyin_numbered': _normalize_pinyin(numbered_pinyin),
                'meaning': meanings,
                'measure_words': mws,
            }

            # ADD THE PINYIN ENTRY
            # --------------------
            py_key = settings.PINYIN_WORD_KEY % _pinyin_to_ascii(numbered_pinyin)
            if r_server.exists(py_key):
                values = json.loads(_search_redis(py_key))
                if smart_unicode(characters) not in values:
                    values.append(characters)
            else:
                values = [characters]
            r_server.set(py_key, json.dumps(values))

            # ADD THE CHINESE CHARACTER ENTRY
            # -------------------------------
            if r_server.exists(char_key):
                values = json.loads(_search_redis(char_key))
                values['meanings'].append(pair)
            else:
                values = {
                    'chars': characters,
                    'meanings': [pair],
                }
            r_server.set(char_key, json.dumps(values))

            item_count += 1
            print(item_count)

    print("%s Chinese items added" % item_count)
def _absorb_run(chars, index, first, belongs):
    """
    Merge the characters that follow ``index`` into ``first`` for as long as
    ``belongs(character)`` is true, removing each merged character from
    ``chars``. Returns the merged string.
    """
    merged = first
    while index + 1 < len(chars) and belongs(chars[index + 1]):
        merged = "%s%s" % (merged, chars.pop(index + 1))
    return merged


def _group_words(chars, chinese_only=False):
    """
    Group a list of single characters into display objects.

    chars        -- list of characters. NOTE: it is modified in place;
                    the digits/letters merged into a number or English word
                    are removed from it.
    chinese_only -- when true, the non-Chinese objects are left out.

    Returns a list that holds, in order:
      * a dict for each linebreak, space, punctuation mark, number (a run
        of digits merged together) and English word (a run of letters
        merged together), with 'chars', 'wordset' (the position in
        ``chars``) and one 'is_...' flag;
      * a ChineseWord for everything else: the longest run of up to ten
        characters, cut at the first punctuation mark, that has a
        "ZH:<n>C:<chars>" key in Redis. When no key matches at all the
        single character is used on its own.
    """
    obj_list = []
    skip = 0

    for position, x in enumerate(chars):
        # STILL INSIDE THE WORD WE MATCHED ON AN EARLIER CHARACTER
        if skip:
            skip -= 1
            continue

        obj = {
            'chars': x,
            'wordset': position,
        }

        is_chinese = False
        if x == '\n':
            # IS IT A LINEBREAK
            obj['is_linebreak'] = True
        elif x == ' ':
            # IS IT A SPACE
            obj['is_space'] = True
        elif _is_punctuation(x):
            # IS IT PUNCTUATION
            obj['is_punctuation'] = True
        elif _is_number(x):
            # IS IT A NUMBER? IF THE NEXT CHARACTERS ARE NUMBERS TOO, ADD THEM ON
            obj['is_number'] = True
            obj['chars'] = _absorb_run(chars, position, x, _is_number)
        elif _is_english(x):
            # IS THE CHARACTER ENGLISH? IF SO, LETS BUILD THE ENGLISH WORD
            obj['is_english'] = True
            obj['chars'] = _absorb_run(chars, position, x, _is_english)
        else:
            is_chinese = True

        # IF THE CHARACTER IS NOT CHINESE
        if not is_chinese:
            if not chinese_only:
                obj_list.append(obj)
            continue

        # BUILD OUR CANDIDATE CHINESE WORD - GUESSING WE WON'T HAVE MANY
        # MORE THAN 10 CHARS - STOPPING AT THE FIRST PUNCTUATION MARK
        search_string = [x]
        for following in chars[position + 1:position + 10]:
            if _is_punctuation(following):
                break
            search_string.append(following)

        # SHORTEN THE CANDIDATE UNTIL IT IS IN THE DICTIONARY. Always keep at
        # least the first character: trimming down to nothing used to make
        # skip negative, after which every remaining character was skipped.
        r_server = _get_redis()
        while len(search_string) > 1:
            key = "ZH:%sC:%s" % (len(search_string), "".join(search_string))
            if r_server.exists(key):
                break
            search_string.pop()

        # INITIALISE A ChineseWord OBJECT AND ADD IT TO OUR OBJECT LIST
        obj_list.append(ChineseWord(chars="".join(search_string)))

        # THE WORD MAY HAVE SWALLOWED THE NEXT FEW CHARS, SO DON'T SEARCH
        # THOSE AGAIN
        skip = len(search_string) - 1

    return obj_list
def handle_noargs(self, **options):
    """
    Rebuild the English index ("EN:<n>W:<phrase>" keys) in Redis from the
    dictionary file at settings.DICT_FILE_LOCATION.

    All existing "EN:*" keys are deleted first. Lines starting with "#"
    are skipped. Each meaning of every other line is cleaned up (bracketed
    text, a leading "to ", "..." removed) and, when three words or fewer
    remain, indexed under its word count and lowercased text. Meanings that
    contain an EXCLUSIONS phrase, end up empty, or look like "see XYZ"
    references are ignored.

    Prints each key, a running count and a final total.
    """
    # EXAMPLE: 一中一台 [yi1 Zhong1 yi1 Tai2] /first meaning/second meaning/

    def _drop_bracketed(text, opener, closer):
        # Cut the first opener...closer span out of text, then remove any
        # bracket characters that are left. Text missing either bracket is
        # returned untouched. Cutting by position (not str.replace on the
        # inner text) leaves other occurrences of that text alone.
        start = text.find(opener)
        end = text.find(closer)
        if start == -1 or end == -1:
            return text
        if end > start:
            text = text[:start] + text[end + 1:]
        return text.replace(opener, '').replace(closer, '')

    # OPEN THE FILE BEFORE TOUCHING REDIS, SO A MISSING FILE DOESN'T LEAVE
    # US WITH AN EMPTIED INDEX; with CLOSES IT EVEN IF A LINE FAILS
    with open(settings.DICT_FILE_LOCATION) as dict_file:
        r_server = _get_redis()

        # EMPTY ALL EN KEYS FROM THE DATABASE
        item_count = 0
        for x in r_server.keys('EN:*'):
            r_server.delete(x)
            item_count += 1
        print("Deleted %s items" % item_count)

        # NOW LETS START
        item_count = 0
        for line in dict_file:
            if line.startswith("#"):
                continue

            # GATHER ALL THE MAIN VARIABLES
            # (the pinyin is not needed for the English index, so it is no
            # longer extracted and converted on every line)
            characters = line.split()[1]
            meanings = line[(line.index('/') + 1):line.rindex('/')]

            # CREATE AN INDEX: strip as much noise as possible from each
            # definition to get as close as we can to a single word to
            # index on.
            for x in meanings.split('/'):
                # REMOVE ANYTHING BETWEEN BRACKETS AND SQUARE BRACKETS
                ns = _drop_bracketed(x, '(', ')')
                ns = _drop_bracketed(ns, '[', ']')

                # IGNORE THE MEANING IF IT CONTAINS AN EXCLUDED PHRASE
                if any(phrase in ns for phrase in EXCLUSIONS):
                    continue

                # IF THE MEANING IS NOW EMPTY, IGNORE IT
                ns = ns.strip()
                if ns == '':
                    continue

                # DEAL WITH INFINITIVE VERBS LIKE "TO DO"
                if len(ns.split(' ')) <= 3 and ns.startswith('to '):
                    ns = ns.split(' ', 1)[1]

                # REMOVE ITEMS LIKE "SEE XYZ"
                if ns.split(' ')[0] == 'see' and ns[-1] not in string.ascii_letters:
                    continue

                # THERE'S ALSO SOME ANNOYING "..." MARKS TOO
                if "..." in ns:
                    ns = ns.replace('...', '')

                # FOR NOW, JUST ADD ITEMS OF UP TO 3 WORDS
                word_count = len(ns.split(' '))
                if word_count > 3:
                    continue

                key = "EN:%sW:%s" % (word_count, ns.lower())
                print(key)

                if r_server.exists(key):
                    values = json.loads(_search_redis(key))
                    values['characters'].append(characters)
                else:
                    # 'english' KEEPS THE MEANING AS IT WAS BEFORE CLEANING
                    values = {
                        'english': x,
                        'characters': [characters],
                    }
                r_server.set(key, json.dumps(values))

                item_count += 1
                print(item_count)

    print("%s English dictionary items added" % item_count)
def search(request, search_string=None, title='Search', words=None):
    """
    Search view: works out what kind of search this is and renders the
    matching page.

    search_string -- the search text from the URL (underscores stand for
                     spaces); when None, a POSTed SearchForm supplies it.
    title         -- page title, passed to the template through locals().
    words         -- pre-grouped words; when empty the search string is
                     split and grouped here.

    In order: with no search string and no valid POST the search form is
    shown; an ambiguous search shows the problem page; a search string with
    a pinyin entry goes to _pinyin_search; an English one goes to
    _english_search; anything else is grouped into words. A single
    resulting word redirects to its 'single_word' page, otherwise the word
    list is rendered. For logged-in users the word_searched signal is sent
    for every word.

    NOTE: locals() is the template context, so the local names below are
    part of what the templates can see.
    """
    r_server = _get_redis()

    # REPLACE SEARCH STRING UNDERSCORES WITH SPACES
    if search_string:
        search_string = search_string.strip().replace('_', ' ')

    if search_string is None:
        # HANDLES EMPTY OR NULL SEARCH STRING
        if request.method != 'POST':
            form = SearchForm()
            return _render(request, 'website/search.html', locals())

        # IT'S A POST REQUEST: TAKE THE SEARCH STRING FROM THE FORM
        form = SearchForm(request.POST)
        if form.is_valid():
            search_string = form.cleaned_data['char']
        else:
            # POST AND NO SEARCH STRING - SHOW THEM THE PLAIN JANE SEARCH PAGE
            form = SearchForm()
            return _render(request, 'website/search.html', locals())

    # HANDLES AN AMBIGUOUS SEARCH
    if _is_ambiguous(search_string):
        message = messages.AMBIGUOUS_WORD
        # NOTE(review): the other branches use _render and a 'website/'
        # template path - confirm 'problem.html' via render is intended
        return render(request, 'problem.html', locals())

    if r_server.exists(settings.PINYIN_WORD_KEY % _pinyin_to_ascii(search_string)):
        return _pinyin_search(request, search_string)

    if _is_english(search_string):
        return _english_search(request, search_string)

    if not words:
        things = _split_unicode_chrs(search_string)
        words = _group_words(things)

    # IF THE USER WAS LOGGED IN, RECORD IT IN THEIR 'SAVED WORDS'
    if request.user.is_authenticated():
        for x in words:
            # _group_words RETURNS PLAIN DICTS FOR SPACES, PUNCTUATION,
            # NUMBERS AND ENGLISH; THEY HAVE NO .chars ATTRIBUTE AND ARE
            # NOT DICTIONARY WORDS, SO THERE IS NOTHING TO RECORD
            if isinstance(x, dict):
                continue
            word_searched.send(
                sender=word_searched,
                word=x.chars,
                time=datetime.datetime.now(),
                user_id=request.user.email
            )

    # IF THERE'S ONLY 1 WORD, TAKE US STRAIGHT TO THE SINGLE WORD DEFINITION
    # (a lone dict is not a word and has no definition page)
    if len(words) == 1 and not isinstance(words[0], dict):
        word = words[0]
        url = reverse('single_word', args=[word])
        return HttpResponseRedirect(url)

    return _render(request, 'website/wordlist.html', locals())
def _update_list(self, wordlist):
    """
    Replace the whole stored word list with ``wordlist``.

    The dictionary is written to Redis as JSON under ``self.key``.
    """
    connection = _get_redis()
    payload = json.dumps(wordlist)
    connection.set(self.key, payload)