Example #1
0
    def _get_random(self, number=1, chars=None, length='*', pinyin=None):
        """Return random dictionary entries, optionally of a given word length.

        Only the first 21 keys yielded by the Redis scan for
        ``ZH:<length>C:*`` are collected and shuffled, so the result is
        random within that window rather than over the whole dictionary.

        Args:
            number: how many entries to return.
            chars: if given, its length replaces ``length``.
            length: length component of the key pattern; ``'*'`` matches any.
            pinyin: accepted but not used.

        Returns:
            The decoded entry when ``number == 1`` (``None`` if the scan
            found nothing); otherwise a list of at most ``number`` decoded
            entries (fewer if the scan found fewer keys).
        """
        if chars:
            length = len(chars)

        r_server = _get_redis()
        pattern = "ZH:%sC:*" % length

        # Collect a small sample of matching keys (at most 21).
        randoms = []
        for found in r_server.scan_iter(pattern):
            randoms.append(found)
            if len(randoms) > 20:
                break
        # The default generator is random.random, so no second argument is
        # needed (it no longer exists on Python 3.11+).
        random.shuffle(randoms)

        if number == 1:
            # Nothing matched: report "no word" instead of an IndexError.
            if not randoms:
                return None
            return json.loads(_search_redis(randoms[0]))

        # Never index past the sample when fewer keys were found than asked.
        wanted = randoms[:max(number, 0)]
        return [json.loads(_search_redis(name)) for name in wanted]
    def handle_noargs(self, **options):
        """Precompute and store the starts_with / contains lists of every
        single-character entry (keys matching ``ZH:1C:*``)."""
        # Sample dictionary line:
        # 一事無成 一事无成 [yi1 shi4 wu2 cheng2] /to have achieved nothing/to be a total failure/to get nowhere/

        r_server = _get_redis()

        # Walk every single-character entry in the dictionary.
        processed = 0
        for key in r_server.scan_iter("ZH:1C:*"):
            # The characters are the third colon-separated field of the key.
            this_word = ChineseWord(chars=smart_unicode(key.split(':')[2]))

            prefixed = [entry.chars for entry in this_word._starts_with()]
            containing = [entry.chars for entry in this_word._contains()]

            # Attach both lists, then persist the whole object state as JSON.
            this_word.starts_with = prefixed
            this_word.contains = containing
            r_server.set(key, json.dumps(vars(this_word)))

            processed += 1
            print(processed)
Example #3
0
def _pinyin_search(request, search_string):
    """Render the word list for a pinyin search.

    Looks up the characters stored under the tone-less (ASCII) form of
    ``search_string`` and sorts each candidate word into exactly one of
    two lists:

    * ``words``     - one of its readings equals the normalised search
                      string, or the search string carries no tone digits;
    * ``suggested`` - the search string has tone digits and none of the
                      word's readings match it.

    Each word is listed once, however many readings it has. Words without
    any meanings are ignored. All locals are handed to the template via
    ``locals()``.
    """
    # CLEAN UP THE INCOMING PINYIN
    clean_string = _normalize_pinyin(search_string)
    ascii_string = _pinyin_to_ascii(search_string)
    key = settings.PINYIN_WORD_KEY % ascii_string

    # IF THERE ARE NO TONE NUMBERS IN THE CLEANED STRING, EVERY WORD MATCHES
    has_tones = any(ext in clean_string for ext in ['1', '2', '3', '4', '5'])

    suggested = []
    words = []
    r_server = _get_redis()
    try:
        # json.loads(None) raises TypeError when the key does not exist;
        # that is treated as "no results" below.
        for o in json.loads(r_server.get(key)):
            word = ChineseWord(chars=o)
            if not word.meanings:
                continue

            # DOES ANY READING MATCH THE CLEANED SEARCH STRING EXACTLY?
            exact = False
            for i in word.meanings:
                if _normalize_pinyin(i['pinyin']) == clean_string:
                    exact = True
                    break

            # Classify the word once, not once per meaning.
            if exact or not has_tones:
                words.append(word)
            else:
                suggested.append(word)
    except TypeError:
        pass

    return _render(request, 'website/wordlist.html', locals())
Example #4
0
def _lookup_english_words(r_server, phrase):
    """Return the ChineseWord objects indexed under the English ``phrase``.

    The key is built from the number of space-separated words and the
    lower-cased phrase. A missing key makes ``json.loads`` raise
    ``TypeError``, which is treated as "nothing found"; words collected
    before such an error are still returned.
    """
    found = []
    try:
        key = settings.ENGLISH_WORD_KEY % (len(phrase.split(' ')), phrase.lower())
        obj = json.loads(r_server.get(key))
        for chars in obj['characters']:
            found.append(ChineseWord(chars=chars))
    except TypeError:
        pass
    return found


def _english_search(request, search_string):
    """Render the word list for an English search.

    ``words`` holds the hits for the whole phrase. ``suggested`` holds the
    hits for each individual word of the phrase, without repeating a word
    that is already listed in ``words`` or earlier in ``suggested``.
    All locals are handed to the template via ``locals()``.
    """
    r_server = _get_redis()
    words = _lookup_english_words(r_server, search_string)

    # Characters already shown, so suggestions never duplicate a result.
    seen = set(entry.chars for entry in words)

    suggested = []
    for token in search_string.split(' '):
        # Consecutive spaces produce empty tokens; nothing to look up.
        if not token:
            continue
        for entry in _lookup_english_words(r_server, token):
            if entry.chars in seen:
                continue
            seen.add(entry.chars)
            suggested.append(entry)

    return _render(request, 'website/wordlist.html', locals())
Example #5
0
def text(request, hashkey=None, words=None):
    """Show a stored text, creating the stored record first when no hash is given.

    Without ``hashkey`` a new 5-character id is generated and a hash with
    the given ``words`` is written to Redis under ``text:<id>``; with
    ``hashkey`` the existing ``text:<hashkey>`` record is used. The record's
    characters are split and grouped into words and rendered. If the record
    cannot be loaded, a problem page (or snippet for AJAX requests) is
    returned instead.

    NOTE: every template receives ``locals()``, so the local variable names
    in this function are part of the template context.
    """
    
    if not hashkey:
        # No id supplied: generate one from lowercase letters and digits.
        # NOTE(review): nothing here checks whether the id is already in use.
        hashkey = ''.join(random.choice(string.ascii_lowercase + string.digits) for x in range(5))
        key = "text:%s" % hashkey
    
        if request.user.is_authenticated():
            user = request.user.email
        else:
            user = '******'
            
        mapping = {
            'user': user,
            'title': '', 
            'chars': words,
            'timestamp': time.time(),
            'hash': hashkey,
            'url' : '',
        }
        
        # ADD IT TO REDIS
        r_server = _get_redis()
        r_server.hmset(key, mapping)
                        
    else:
        key = 'text:%s' % hashkey
    
    
    # Load the record only if the first call returns something truthy.
    # presumably lookup=False is an existence check - TODO confirm in _search_redis
    obj = None
    if _search_redis(key, lookup=False):
        obj = _search_redis(key)
    
    if not obj:
        # Nothing stored under this key: show the problem page/snippet.
        if request.is_ajax():
            html = render_to_string('website/problem_snippet.html', locals())
            return HttpResponse(html)
        
        return _render(request, 'website/problem.html', locals())
    
    title = 'Article'
    try:
        # Only the host part of the source URL is kept.
        url = urlparse(obj['url']).netloc
    except KeyError:
        # No 'url' field: the local ``url`` stays undefined and is therefore
        # absent from the template context.
        pass
        
    chars = obj['chars'].decode('utf-8') # because redis stores things as strings...
    things = _split_unicode_chrs(chars)
    obj_list = _group_words(things) 
    
    
    list_template = 'creader/text_page_snippet.html' 
    
    # A ?page= parameter returns only the snippet instead of the full page.
    if request.GET.get('page'):
        template = 'creader/text_page_snippet.html'
        return render_to_response(template, locals())
        
    return _render(request, 'creader/text.html', locals())
 def _del_keys(self, key):
     """Delete every Redis key matching the pattern ``key`` and print how many were removed."""
     r_server = _get_redis()
     item_count = 0
     # enumerate(…, 1) leaves item_count at the number of deleted keys.
     for item_count, name in enumerate(r_server.keys(key), 1):
         r_server.delete(name)

     print("Deleted %s items matching %s" % (item_count, key))
Example #7
0
    def __init__(self, email):
        """Load the personal word list of the user with the given ``email``.

        The list is stored as JSON under the Redis key ``PW:<email>``. When
        nothing is stored yet, an empty dict is used and written back via
        ``_update_list``. ``User.objects.get`` is called without any error
        handling, so its exceptions propagate.
        """
        self.key = "PW:%s" % email
        self.user = User.objects.get(email=email)

        r_server = _get_redis()

        # WARNING: uncomment this to completely clear wordlist for testing
        # r_server.delete(self.key)

        # Fetch once: a second GET could return None if the key vanished in
        # between, and json.loads(None) would raise.
        stored = r_server.get(self.key)
        if stored:
            self.words = json.loads(stored)
        else:
            self.words = {}
            self._update_list(self.words)
Example #8
0
def url(request):
    """Fetch the web page named by ``?url=``, store its text and redirect to it.

    The page is downloaded, reduced to its readable text, stored in Redis
    as a ``text:<id>`` hash and the client is redirected to the ``text``
    view for that id. Only ``http://`` and ``https://`` URLs are fetched;
    a missing or rejected URL renders the problem page.
    """
    # TODO if it's already been scanned and saved, don't bother parsing it again….

    requested = request.GET.get('url')

    # The URL comes straight from the client. urllib2.urlopen also handles
    # other schemes such as file:// and ftp://, so anything that is not
    # plain web traffic is refused before fetching.
    if requested and requested.lower().startswith(('http://', 'https://')):
        url = requested

        # PARSE THE WEBPAGE AND RETURN A LIST OF CHARACTERS
        html = urllib2.urlopen(url).read()
        text = readabilityParser(html)
        title = Document(html).title()
        new_text = strip_tags(text)

        # GIVE IT AN ID
        this_id = ''.join(random.choice(string.ascii_lowercase + string.digits) for x in range(5))
        key = "text:%s" % this_id

        if request.user.is_authenticated():
            user = request.user.email
        else:
            user = '******'

        mapping = {
            'user': user,
            'title': title,
            'chars': new_text,
            'timestamp': time.time(),
            'hash': this_id,
            'url' : url,
        }

        # ADD IT TO REDIS
        r_server = _get_redis()
        r_server.hmset(key, mapping)

        # Only logged-in users get the article recorded via the signal.
        if request.user.is_authenticated():
            article_saved.send(sender=article_saved, article_id=this_id, time=time.time(), user_id=request.user.pk)

        return HttpResponseRedirect(reverse('text', args=[this_id]))

    problem = "TODO: Make a proper page here which explains the reader and how it works"
    return _render(request, 'website/problem.html', locals())
Example #9
0
    def _contains(self, chars=None):
        """Return the words that contain ``chars`` (this word by default).

        Scans Redis for keys matching ``ZH:*C:*<chars>*``. Matches whose
        character part contains a comma are skipped. The result is sorted
        by each word's ``length`` attribute, shortest first.

        NOTE(review): ``chars`` is inserted into the glob pattern as is;
        glob metacharacters in it are not escaped.
        """
        r_server = _get_redis()
        # Default to this word's own characters, as _starts_with does;
        # otherwise a call without arguments would search for "None".
        if not chars:
            chars = self.chars

        key = "ZH:*C:*%s*" % chars

        words = []
        for x in r_server.scan_iter(key):
            # The characters are the last colon-separated field of the key.
            found = x.split(':')[-1]
            if "," in found:
                continue
            words.append(ChineseWord(smart_unicode(found)))

        words.sort(key=lambda thing: thing.length)
        return words
Example #10
0
    def _starts_with(self, chars=None):
        """
        Return the words that begin with ``chars`` (this word's own
        characters when none are given), ordered by ascending ``length``.
        """
        r_server = _get_redis()
        prefix = chars if chars else self.chars
        pattern = "ZH:*C:%s*" % prefix

        # The characters are the last colon-separated field of each key.
        tails = (name.split(':')[-1] for name in r_server.scan_iter(pattern))

        # Entries whose characters contain a comma are left out.
        matches = [ChineseWord(smart_unicode(tail)) for tail in tails if "," not in tail]
        matches.sort(key=lambda thing: thing.length)
        return matches
    def handle_noargs(self, **options):
        """Rebuild the Chinese-word and pinyin entries in Redis from the
        dictionary file at ``settings.DICT_FILE_LOCATION``.

        All ``ZH:*`` and ``PY:*`` keys are deleted first. Then, for every
        non-comment, non-blank line of the file:

        * each ``CL:`` (measure word) segment is removed from the meanings
          and every measure word in it is recorded under
          ``settings.MEASURE_WORD_KEY``;
        * the characters are added to the list stored under
          ``settings.PINYIN_WORD_KEY`` for the tone-less pinyin;
        * a pinyin/meaning pair is appended to the entry stored under
          ``settings.CHINESE_WORD_KEY``.

        NOTE(review): measure-word keys are not cleared here unless they
        happen to start with ``ZH:`` or ``PY:`` - confirm against settings.
        """
        # Sample dictionary line:
        # 一事無成 一事无成 [yi1 shi4 wu2 cheng2] /to have achieved nothing/to be a total failure/to get nowhere/

        # EMPTY ALL ZH + PY KEYS
        self._del_keys('ZH:*')
        self._del_keys('PY:*')

        # NOW LETS START
        item_count = 0
        # `with` closes the file even when a line fails to parse.
        with open(settings.DICT_FILE_LOCATION) as dict_file:
            for line in dict_file:
                # Skip comments, and blank lines (which have no '[' to parse).
                if line.startswith("#") or not line.strip():
                    continue

                # OPEN REDIS CONNECTION NOW
                r_server = _get_redis()

                # GATHER ALL THE MAIN VARIABLES
                new = line.split()
                numbered_pinyin = line[(line.index('[') + 1):(line.index(']'))]
                f = ReadingFactory()
                tonal_pinyin = f.convert(numbered_pinyin, 'Pinyin', 'Pinyin',
                    sourceOptions={'toneMarkType': 'numbers', 'yVowel': 'v',
                    'missingToneMark': 'fifth'})
                meanings = line[(line.index('/') + 1):(line.rindex('/'))]
                # Second whitespace-separated field of the line.
                characters = new[1]

                # REMOVE ALL THE UGLY CHARACTERS
                if ',' in characters:
                    characters = characters.replace(',', '')

                # GET AND CLEAN THE MEASURE WORD
                mws = None
                if "CL:" in meanings:
                    # Build a new list instead of popping from the list being
                    # enumerated, which skipped the segment after each CL one.
                    kept_meanings = []
                    for val in meanings.split('/'):
                        if "CL:" not in val:
                            kept_meanings.append(val)
                            continue

                        # Collect across all CL segments of the line.
                        if mws is None:
                            mws = []
                        for x in val.replace('CL:', '').split(','):
                            # Drop the bracketed pinyin, if there is one.
                            if '[' in x:
                                x = x[:(x.index('['))]
                            # Of "a|b" keep the part after the bar.
                            if '|' in x:
                                x = x[(x.index('|') + 1):]

                            # ADD THE MEASURE WORDS ENTRY
                            # ---------------------------
                            mws_key = settings.MEASURE_WORD_KEY % x
                            if r_server.exists(mws_key):
                                values = json.loads(_search_redis(mws_key))
                                values['chars'].append(characters)
                            else:
                                values = {'chars': [characters,]}
                            r_server.set(mws_key, json.dumps(values))

                            mws.append(x)
                    meanings = "/".join(kept_meanings)

                # NOTE(review): the length is the byte length divided by 3,
                # which presumably assumes a UTF-8 byte string of 3-byte
                # characters - confirm for entries with other characters.
                char_key = settings.CHINESE_WORD_KEY % ((len(characters) // 3), characters)

                # CREATE THE PRONUNCIATION/MEANING PAIR
                pair = {}
                pair['pinyin'] = tonal_pinyin
                pair['pinyin_numbered'] = _normalize_pinyin(numbered_pinyin)
                pair['meaning'] = meanings
                pair['measure_words'] = mws

                # ADD THE PINYIN ENTRY
                # --------------------
                py_key = settings.PINYIN_WORD_KEY % _pinyin_to_ascii(numbered_pinyin)
                if r_server.exists(py_key):
                    values = json.loads(_search_redis(py_key))
                    if smart_unicode(characters) not in values:
                        values.append(characters)
                else:
                    values = [characters,]

                r_server.set(py_key, json.dumps(values))

                # ADD THE CHINESE CHARACTER ENTRY
                # -------------------------------
                if r_server.exists(char_key):
                    values = json.loads(_search_redis(char_key))
                    values['meanings'].append(pair)
                else:
                    values = {
                        'chars': characters,
                        'meanings': [pair,],
                    }

                r_server.set(char_key, json.dumps(values))

                item_count += 1
                print(item_count)

        print("%s Chinese items added" % item_count)
Example #12
0
def _group_words(chars, chinese_only=False):
    obj_list = []
    loop = 0        
    skip = 0

    for x in chars:
        
        if skip != 0:
            skip -= 1
            loop += 1
            continue
        
        obj = {
             'chars': x,
             'wordset': loop,   
        }
                
        nc = False
                
        # IS IT A LINEBREAK
        if nc == False and x == '\n':
            obj['is_linebreak'] = True
            nc = True

        # IS IT A SPACE    
        if nc == False and x == ' ':
            obj['is_space'] = True
            nc = True

        # IS IT PUNCTUATION
        if nc == False and _is_punctuation(x):
            obj['is_punctuation'] = True 
            nc = True
    
        
        # IS IT A NUMBER?          
        if nc == False and _is_number(x):
            obj['is_number'] = True
            number = True
            num = x
            while number == True:
            
                # if the next character is also a number, add it to this one
                try:
                    next = chars[loop+1]
                except:
                    break
                
                if _is_number(next):
                    num = "%s%s" % (num, next)
                    chars.pop(loop+1)

                else:
                    break
                            

            obj['chars'] = num
            nc = True
        
        
        
        
        # IS THE CHARACTER ENGLISH?            
        if nc == False and _is_english(x):            
            obj['is_english'] = True
            english = True
            eng_word = x
            while english == True:
            
                # IF THE NEXT CHAR IS ENGLISH, LETS BUILD THE ENGLISH WORD
                try:
                    next = chars[loop+1]
                except:
                    break
                
                if _is_english(next):
                    eng_word = "%s%s" % (eng_word, next)
                    chars.pop(loop+1)

                else:
                    break
                            

            obj['chars'] = eng_word
            nc = True
        
        # IF THE CHARACTER IS NOT CHINESE
        if nc == True:
            if chinese_only == False:
                obj_list.append(obj)
                
            loop += 1
            continue

        search_string = [x,]
                
        # THIS LOOP WILL BUILD OUR CHINESE WORD - GUESSING WE WON'T HAVE MANY MORE THAN 10 CHARS
        for i in range(1,10):
            try:
                next_chars = chars[loop+i]
                if _is_punctuation(next_chars):
                    next_chars = None
                    break
                else:
                    search_string.append(next_chars)
            except:
                break
        
        
        r_server = _get_redis()
        r = False   
        
        
        while r == False and len(search_string) > 0:            
            
            key = "ZH:%sC:%s" % ( len(search_string), "".join(search_string))
            r = r_server.exists(key)
            
            if r:
                break
            else:
                try:
                    search_string.pop()
                except IndexError:
                    pass
        

                
        # initialise a ChineseWord object and add it to our object_list
        the_string = "".join(search_string)
        word = ChineseWord(chars=the_string)
        obj_list.append(word)
        
        
        # tells us how many characters need to be skipped before we start searching again
        # because maybe this word included the subsequent 3 chars, so let's not searhc them
        # again
        skip += (len(search_string)-1)
        loop += 1
        
     
    return obj_list  
    def handle_noargs(self, **options):
        """Rebuild the English index (``EN:<n>W:<phrase>`` keys) in Redis
        from the dictionary file at ``settings.DICT_FILE_LOCATION``.

        All ``EN:*`` keys are deleted first. Each meaning of each entry is
        reduced to a short phrase (bracketed text removed, leading "to "
        dropped from short infinitives, "..." removed); meanings containing
        an ``EXCLUSIONS`` phrase, "see ..." cross references and anything
        longer than three words are not indexed. The characters of the
        entry are appended to the list stored under the phrase's key.
        """
        import re  # only needed by this command

        # Text in round / square brackets, including the brackets.
        round_brackets = re.compile(r'\([^)]*\)')
        square_brackets = re.compile(r'\[[^\]]*\]')

        # EXAMPLE: 一中一台 [yi1 Zhong1 yi1 Tai2] /first meaning/second meaning/
        # The file is opened before anything is deleted, and `with` closes
        # it even when a line fails to parse.
        with open(settings.DICT_FILE_LOCATION) as dict_file:
            r_server = _get_redis()

            # EMPTY ALL EN KEYS FROM THE DATABASE
            item_count = 0
            for x in r_server.keys('EN:*'):
                r_server.delete(x)
                item_count += 1
            print("Deleted %s items" % item_count)

            # NOW LETS START
            item_count = 0
            for line in dict_file:
                # Skip comments, and blank lines (which have no fields).
                if line.startswith("#") or not line.strip():
                    continue

                # GATHER ALL THE MAIN VARIABLES
                # Second whitespace-separated field of the line.
                characters = line.split()[1]
                meanings = line[(line.index('/') + 1):(line.rindex('/'))]

                # CREATE AN INDEX: strip as much noise as possible from each
                # definition to get a short phrase we can index on.
                for x in meanings.split('/'):

                    # REMOVE ANYTHING BETWEEN (SQUARE) BRACKETS. Only the
                    # bracketed spans are removed, not other occurrences of
                    # the same text elsewhere in the meaning.
                    ns = round_brackets.sub('', x)
                    ns = square_brackets.sub('', ns)

                    # IGNORE THE MEANING IF IT CONTAINS AN EXCLUDED PHRASE
                    if any(phrase in ns for phrase in EXCLUSIONS):
                        continue

                    # IF THE MEANING IS NOW EMPTY, IGNORE IT
                    ns = ns.strip()
                    if ns == '':
                        continue

                    # DEAL WITH INFINITIVE VERBS LIKE "TO DO" WITH 2 WORDS
                    if len(ns.split(' ')) <= 3 and ns.startswith('to '):
                        ns = ns.split(' ', 1)[1]

                    # REMOVE ITEMS LIKE "SEE XYZ"
                    if ns.split(' ')[0] == 'see' and ns[-1] not in string.ascii_letters:
                        continue

                    # THERE'S ALSO SOME ANNOYING "..." MARKS TOO; removing
                    # them can leave nothing (or stray spaces) behind.
                    ns = ns.replace('...', '').strip()
                    if ns == '':
                        continue

                    # FOR NOW, JUST ADD ITEMS WITH UP TO 3 WORDS
                    if len(ns.split(' ')) <= 3:

                        key = "EN:%sW:%s" % (len(ns.split(' ')), ns.lower())
                        print(key)
                        if r_server.exists(key):
                            values = json.loads(_search_redis(key))
                            values['characters'].append(characters)
                        else:
                            # 'english' keeps the original, unstripped meaning.
                            values = {
                                'english': x,
                                'characters': [characters,],
                            }
                        r_server.set(key, json.dumps(values))

                        item_count += 1
                        print(item_count)

        print("%s English dictionary items added" % item_count)
Example #14
0
def search(request, search_string=None, title='Search', words=None):
    """Dispatch a search to the pinyin, English or Chinese handler.

    ``search_string`` comes from the URL (underscores stand for spaces) or,
    on POST, from ``SearchForm``. Without a search string the plain search
    page is shown. An ambiguous string renders the problem page; a string
    with a pinyin entry goes to ``_pinyin_search``; an English string goes
    to ``_english_search``. Anything else is split into characters and
    grouped into words, each word is reported via ``word_searched`` for
    logged-in users, and the result is either a redirect to the single
    word page (exactly one word) or the word list.

    All templates receive ``locals()``, so the local names used here are
    part of the template context.
    """
    r_server = _get_redis()

    # replace search string underscores with spaces
    if search_string:
        search_string = search_string.strip().replace('_', ' ')

    if search_string is None:
        # HANDLES EMPTY OR NULL SEARCH STRING
        if request.method != 'POST':
            form = SearchForm()
            return _render(request, 'website/search.html', locals())

        # POST REQUEST: TAKE THE SEARCH STRING FROM THE FORM
        form = SearchForm(request.POST)
        if form.is_valid():
            search_string = form.cleaned_data['char']
        else:
            # POST AND NO SEARCH STRING - SHOW THEM THE PLAIN JANE SEARCH PAGE
            form = SearchForm()
            return _render(request, 'website/search.html', locals())

    # HANDLES AN AMBIGUOUS SEARCH
    if _is_ambiguous(search_string):
        message = messages.AMBIGUOUS_WORD
        # NOTE(review): the other branches use _render and templates under
        # 'website/'; confirm that render / 'problem.html' is intended here.
        return render(request, 'problem.html', locals())

    if r_server.exists((settings.PINYIN_WORD_KEY % _pinyin_to_ascii(search_string))):
        return _pinyin_search(request, search_string)

    if _is_english(search_string):
        return _english_search(request, search_string)

    # IF THE SEARCH IS OVER 10 CHARACTERS, RETURN A TEXT
    #if len(search_string) > 12:
    #    from creader.views import text
    #    return text(request, words=search_string)

    if not words:
        things = _split_unicode_chrs(search_string)
        words = _group_words(things)

    # IF THE USER WAS LOGGED IN, RECORD IT IN THEIR 'SAVED WORDS'
    if request.user.is_authenticated():
        for x in words:
            # _group_words returns plain dicts for spaces, punctuation,
            # numbers and English; they have no .chars attribute.
            if isinstance(x, dict):
                continue
            word_searched.send(
                sender=word_searched,
                word=x.chars,
                time=datetime.datetime.now(),
                user_id=request.user.email
            )

    # if there's only 1 word, take us straight to the single word definition
    # (a lone non-word token such as punctuation has no definition page)
    if len(words) == 1 and not isinstance(words[0], dict):
        word = words[0]
        url = reverse('single_word', args=[word])
        return HttpResponseRedirect(url)

    return _render(request, 'website/wordlist.html', locals())
Example #15
0
 def _update_list(self, wordlist):
     """Replace the stored word list with ``wordlist``, saved as JSON under ``self.key``."""
     connection = _get_redis()
     serialised = json.dumps(wordlist)
     connection.set(self.key, serialised)