Beispiel #1
0
def filter_words(words):
    """Filter out stop words, unknown words and one-character tokens.

    A word is kept only when it is not one of the hard-coded stop words,
    ``dict.is_word`` accepts it, and it is longer than one character (the
    checks run in that order).

    NOTE(review): ``dict`` is expected to be a project-level object that
    exposes ``is_word`` and shadows the builtin -- confirm at the import site.

    Returns whatever ``filter`` yields for ``words`` and the predicate.
    """
    # TODO: load these from the stopwords file instead of hard-coding them?
    stop_words = set((
        'the', 'to', 'of', 'at', 'a', 'is', 'in', 'what', 'and', 'it',
        'this', 'that',
    ))

    def keep(word):
        # Cheap stop-word test first; the dictionary is only consulted for
        # words that survive it.
        if word in stop_words:
            return False
        if not dict.is_word(word):
            return False
        return len(word) > 1

    return filter(keep, words)
Beispiel #2
0
def filter_word_phrases(phrases, min_phrase_len, min_occurrence, all_phrases):
    """Prune ``phrases`` in place, then index the surviving phrases by word.

    A phrase is removed when any of the following holds:

    * its count is below ``min_occurrence``;
    * it has fewer than ``min_phrase_len`` space-separated words;
    * any of its words is rejected by ``dict.is_word``;
    * another remaining phrase with the *same* count starts or ends with it
      (the shorter phrase is then considered redundant).

    Args:
        phrases: dict mapping a space-separated phrase to its occurrence
            count. Modified in place.
        min_phrase_len: minimum number of words a phrase must contain.
        min_occurrence: minimum occurrence count a phrase must have.
        all_phrases: dict mapping a word to a list. For every surviving
            phrase, ``APhrase.getPhrase(phrase, count)`` is appended to the
            list of each word in the phrase. Modified in place.

    Returns:
        None.

    NOTE(review): ``dict`` is expected to be a project-level object that
    exposes ``is_word`` and shadows the builtin -- confirm at the import site.
    """
    # This pass must run before the prefix/suffix pruning below so that
    # low-occurrence phrases cannot interfere with it. Iterate over a
    # snapshot because entries are deleted along the way.
    for phrase, count in list(phrases.items()):
        pwords = phrase.split(' ')
        # all() checks every word, including the first one. The earlier
        # reduce() without an initial value used the first word as the
        # accumulator seed and therefore never validated it.
        if (count < min_occurrence
                or len(pwords) < min_phrase_len
                or not all(dict.is_word(w) for w in pwords)):
            del phrases[phrase]

    # Drop phrases that are a prefix or suffix of another phrase with the
    # same count. Comparisons run against the snapshot, so the outcome does
    # not depend on deletion order. The match is character-level
    # (startswith/endswith), not word-level. Quadratic in the number of
    # phrases.
    candidates = list(phrases.items())
    for phrase, count in candidates:
        redundant = any(
            other != phrase
            and other_count == count
            and (other.startswith(phrase) or other.endswith(phrase))
            for other, other_count in candidates
        )
        if redundant:
            del phrases[phrase]

    # TODO: prune phrases that aren't common enough?
    # TODO: prune phrases that aren't mostly english words..
    # TODO: prune nonsensical things... single letters that aren't in [a, i, ...]

    # Register every surviving phrase under each of its words.
    for phrase, count in phrases.items():
        for word in phrase.split(' '):
            all_phrases.setdefault(word, []).append(
                APhrase.getPhrase(phrase, count))
Beispiel #3
0
def get_common_words(word_frequencies):
    """Rank the entries of ``word_frequencies`` and filter out unwanted words.

    Entries are ``(word, value)`` pairs taken from ``word_frequencies.items()``
    and ordered descending by ``value[0]`` (presumably the occurrence count --
    confirm against the caller).

    An entry is kept only when its word is not in ``TOO_COMMON``, matches the
    pattern below (a leading ASCII letter followed by letters, hyphens or
    quote characters), is accepted by ``dict.is_word`` and is longer than one
    character.

    NOTE(review): ``dict`` is expected to be a project-level object that
    exposes ``is_word`` and shadows the builtin -- confirm at the import site.

    Returns whatever ``filter`` yields for the ranked entries.
    """
    # Ascending sort followed by an in-place reverse gives descending order
    # (ties end up in reverse of their sorted order).
    ranked = sorted(word_frequencies.items(), key=lambda entry: entry[1][0])
    ranked.reverse()

    word_pattern = re.compile('^[a-zA-Z][a-zA-Z-\'"]*$', re.U)

    def is_candidate(entry):
        word = entry[0]
        if word in TOO_COMMON:
            return False
        if not word_pattern.match(word):
            return False
        if not dict.is_word(word):
            return False
        return len(word) > 1

    return filter(is_candidate, ranked)
Beispiel #4
0
    def GET(self):
        """Print rendered vocabulary results for the requested words.

        Query parameters (read through ``web.input``):
            search: space-separated words to look up. When empty, the value
                returned by ``env.get_last_words`` is used instead and
                ``env.get_last_update()`` is passed on to the template.
            last_time: forwarded as ``since`` to ``env.get_last_words``.
                Defaults to the current time; the literal ``'0.0'`` is
                treated the same as a missing value.
            page: accepted but currently ignored (paging is not implemented).

        Words are processed in reverse order; empty tokens and words rejected
        by ``dict.is_word`` are skipped. For each remaining word the
        ``vocab_results`` template is rendered and written with ``print``.
        Nothing is returned.

        NOTE(review): ``dict`` is expected to be a project-level object that
        exposes ``is_word`` and shadows the builtin -- confirm.
        """
        if self.restrict_access():
            return
        web.header("Pragma", "no-cache")
        web.header("Cache-Control", "no-cache")

        default_last_time = str(time.time())
        # Named `params` rather than `input` to avoid shadowing the builtin.
        params = web.input(search="", page=0, last_time=default_last_time)

        # Some clients send '0.0' when they have no timestamp yet.
        if params.last_time == '0.0':
            params.last_time = default_last_time

        last_words = env.get_last_words(since=params.last_time)
        last_update = 0
        if not params.search:
            params.search = last_words
            last_update = env.get_last_update()

        # Paging is intentionally not computed here: the post count was a
        # hard-coded 0 and the derived indices were never used, while the
        # arithmetic on `page` raised TypeError whenever the parameter was
        # supplied (request values arrive as strings).

        # NOTE(review): the words come straight from the request; confirm
        # that wr_definition() is safe to call with arbitrary user input.
        for w in reversed(params.search.split(' ')):
            if not w or not dict.is_word(w):
                continue

            wrdef = wr_definition(w)
            print(render.vocab_results(w, wrdef, last_update))