def filter_words(words):
    """Filter out stopwords, non-dictionary tokens and one-character tokens.

    A token is kept only when all of the following hold, checked in this
    order with short-circuiting:

    * it is not in the hard-coded stopword set below,
    * ``dict.is_word(token)`` is truthy (``dict`` is whatever object the
      enclosing module binds to that name),
    * it is longer than one character.

    Returns the result of ``filter`` applied to ``words``.
    """
    # TODO: use the stopwords file instead of this hard-coded set?
    stopwords = set((
        'the', 'to', 'of', 'at', 'a', 'is', 'in', 'what', 'and', 'it', 'this',
        'that', 'and',
    ))

    def keep(token):
        # Guard clauses mirror the original left-to-right ``and`` chain.
        if token in stopwords:
            return False
        if not dict.is_word(token):
            return False
        return len(token) > 1

    return filter(keep, words)
def filter_word_phrases(phrases, min_phrase_len, min_occurrence, all_phrases):
    """Prune ``phrases`` in place, then index the survivors by word.

    Args:
        phrases: dict mapping a space-separated phrase to its occurrence
            count. Mutated in place: rejected phrases are deleted.
        min_phrase_len: minimum number of space-separated words a phrase
            must contain to be kept.
        min_occurrence: minimum occurrence count a phrase must have to be
            kept.
        all_phrases: dict mapping a word to a list. For every word of every
            surviving phrase, ``APhrase.getPhrase(phrase, count)`` is
            appended to that word's list (the list is created on demand).

    Returns:
        None. Results are delivered through ``phrases`` and ``all_phrases``.
    """
    # Pass 1: drop rare, short or non-dictionary phrases. This must happen
    # before the redundancy pass so low-occurrence items cannot influence it.
    # Iterate over a snapshot because entries are deleted along the way.
    for phrase, count in list(phrases.items()):
        parts = phrase.split(' ')
        # all() tests every word, including the first one. Folding with
        # reduce() and no initial value would seed the fold with the first
        # word itself and therefore never pass it to dict.is_word.
        if (count < min_occurrence
                or len(parts) < min_phrase_len
                or not all(dict.is_word(part) for part in parts)):
            del phrases[phrase]

    # Pass 2: drop a phrase when another phrase with the same occurrence
    # count starts or ends with it. Phrases with differing counts are never
    # treated as redundant. This is quadratic in the number of phrases.
    candidates = list(phrases.items())
    redundant = []
    for short, short_count in candidates:
        for other, other_count in candidates:
            if short == other:
                # Keys are unique, so equal text means the same entry.
                continue
            if short_count == other_count and (
                    other.startswith(short) or other.endswith(short)):
                redundant.append(short)
                break
    for phrase in redundant:
        del phrases[phrase]

    # could also prune between different phrase sets based on common words..
    # the word to phrases association is pretty arbitrary; a better
    # representation may exist.
    # TODO: prune phrases that aren't common enough?
    # TODO: prune phrases that aren't mostly english words..
    # TODO: prune nonsensical things... single letters that aren't in [a, i, ...]

    # Pass 3: register each surviving phrase under every word it contains.
    for phrase, count in phrases.items():
        for word in phrase.split(' '):
            all_phrases.setdefault(word, []).append(
                APhrase.getPhrase(phrase, count))
def get_common_words(word_frequencies):
    """Return the ``(word, value)`` items of ``word_frequencies``, ranked and filtered.

    Items are ordered by ``value[0]`` descending (an ascending sort that is
    then reversed), and only items whose word passes every check below are
    kept:

    * the word is not in ``TOO_COMMON``,
    * the word matches the pattern: one ASCII letter followed by any number
      of ASCII letters, hyphens, apostrophes or double quotes,
    * ``dict.is_word(word)`` is truthy,
    * the word is longer than one character.

    NOTE(review): ``value[0]`` is presumably an occurrence count; confirm
    against the caller.
    """
    word_pattern = re.compile('^[a-zA-Z][a-zA-Z-\'"]*$', re.U)

    # Sort ascending, then reverse in place. This is deliberately not
    # ``reverse=True``, which would order ties differently.
    ranked = sorted(word_frequencies.items(), key=lambda item: item[1][0])
    ranked.reverse()

    def is_candidate(item):
        word = item[0]
        if word in TOO_COMMON:
            return False
        if not word_pattern.match(word):
            return False
        if not dict.is_word(word):
            return False
        return len(word) > 1

    return filter(is_candidate, ranked)
def GET(self):
    """Handle a GET request by printing vocabulary results for each search word.

    Flow, as visible in this method:

    1. Return immediately when ``self.restrict_access()`` is truthy.
    2. Send no-cache headers.
    3. Read the ``search``, ``page`` and ``last_time`` request inputs;
       ``last_time`` defaults to the current time as a string, and the
       literal value ``'0.0'`` is treated as "not supplied".
    4. When no search text is given, fall back to
       ``env.get_last_words(since=last_time)`` and fetch
       ``env.get_last_update()``; otherwise ``last_update`` stays 0.
    5. Split the search text on single spaces, walk the words in reverse
       order, skip empty tokens and tokens rejected by ``dict.is_word``,
       and print ``render.vocab_results(...)`` for each remaining word.

    Returns None.
    """
    if self.restrict_access():
        return

    for header_name in ("Pragma", "Cache-Control"):
        web.header(header_name, "no-cache")

    now_stamp = str(time.time())
    params = web.input(search="", page=0, last_time=now_stamp)
    if params.last_time == '0.0':
        # A '0.0' timestamp is replaced with the default.
        params.last_time = now_stamp

    # Fetched unconditionally, even when a search string was supplied.
    recent_words = env.get_last_words(since=params.last_time)
    last_update = 0
    if not params.search:
        params.search = recent_words
        last_update = env.get_last_update()

    # Pagination bookkeeping. The post count is fixed at 0 here and neither
    # index is read again in this method; kept as-is.
    # NOTE(review): this arithmetic assumes ``params.page`` is numeric -
    # confirm what web.input returns when ``page`` is supplied in the request.
    per_page = 10
    total_posts = 0
    first_idx = (params.page - 1) * per_page + 1
    last_idx = min(total_posts, params.page * per_page)
    if last_idx >= total_posts:
        first_idx = total_posts - (total_posts % per_page) + 1
        last_idx = total_posts
        params.page = total_posts / per_page + 1

    # security issue w/ making request for typed words..
    for word in reversed(params.search.split(' ')):
        if word and dict.is_word(word):
            definition = wr_definition(word)
            print(render.vocab_results(word, definition, last_update))
    return