Example
import re
from collections import defaultdict
from datetime import datetime

from slugify import slugify  # assumed: python-slugify or similar

# assumed project-local helpers: Connection, Database, ngrams,
# distance_ngrams_same_letters, japanese_grammar, blacklist

class Searcher:
	def __init__(self, database_name="media"):
		self.conn = Connection.Instance().conn
		self.db = Database(self.conn, database_name).db
		print "[INFO] Initialized"

	def cleanup(self):
		Connection.Instance().disconnect()
		print "[INFO] Cleaned up"

	def search(self, name, start_datetime):
		# look through MongoDB and generate a list
		# sorted by relevance
		collections = self.db.collection_names()
		results = {}
		queries_done = set()
		queries_pending = [(name, 0)]

		# 1st pass: separate the name into words and OR them in a regex;
		# this is more forgiving than matching the entire string,
		# though each word must still match exactly
		while queries_pending:
			query, depth = queries_pending.pop()
			# keep ASCII alphanumerics plus the Japanese ranges (CJK symbols,
			# hiragana, katakana, fullwidth forms, kanji); everything else
			# becomes a space
			query_plain = re.sub(u'[^A-Za-z0-9\u3000-\u303f\u3040-\u309f\u30a0-\u30ff\uff00-\uffef\u4e00-\u9faf]+', ' ', query).strip()

			if slugify(query) in queries_done or depth > 2:
				continue

			queries_done.add(slugify(query))

			if len(query) < 3:
				for collection in collections:
					if not collection.startswith("system"):
						for match in self.db[collection].find({"titles": query}).limit(20):
							slug = slugify("{0}-{1}".format(collection, match['titles'][0].encode('utf-8').strip()))
							results[slug] = match

							# add the other titles
							for title in match['titles']:
								if slugify(title) not in queries_done:
									queries_pending.append((title, depth + 1))

			else:
				search_terms = [re.escape(query), re.escape(query_plain)]

				# remove Japanese grammar particles and blacklisted terms
				# from the search terms
				query_terms = [term for term in query.split() if
					term.lower() not in japanese_grammar and
					term.lower() not in blacklist]

				# create bigram search
				for grams in ngrams(query_terms, 2):
					search_terms.append(re.escape(" ".join(grams)))

				# same filtering for the plain (punctuation-stripped) query
				query_terms = [term for term in query_plain.split() if
					term.lower() not in japanese_grammar and
					term.lower() not in blacklist]

				# create bigram search
				for grams in ngrams(query_terms, 2):
					search_terms.append(re.escape(" ".join(grams)))

				search_terms_str = "|".join(search_terms)

				# one case-insensitive regex that ORs all the search terms
				regex = re.compile(ur'{0}'.format(search_terms_str), re.IGNORECASE)

				for collection in collections:
					if not collection.startswith("system"):
						for match in self.db[collection].find({"titles": regex}).limit(20):
							slug = slugify("{0}-{1}".format(collection, match['titles'][0].encode('utf-8').strip()))
							results[slug] = match

							# add the other titles
							for title in match['titles']:
								if slugify(title) not in queries_done:
									queries_pending.append((title, depth + 1))

		# calculate relevance of all results
		for key, result in results.iteritems():
			result.update(self.calculate_relevance(result, name, start_datetime))

		# drop results whose trigram distance is 1 (no trigrams in common)
		results = [result for result in results.values()
			if result['relevance_trigram'] < 1]

		# sort results (lower relevance_total = more relevant)
		results = sorted(results, key=lambda result: result['relevance_total'])
		
		return results
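
	# note: the ngrams() helper used in search() is assumed to behave like
	# nltk.util.ngrams, e.g. ngrams(["steins", "gate", "zero"], 2) yields
	# ("steins", "gate") and then ("gate", "zero")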

	def substring_index(self, title, name):
		substring = self.longest_common_substring(title.lower(), name.lower())
		substring_index = title.lower().find(substring)
		return substring_index if substring_index != -1 else len(title)

	# standard dynamic-programming longest common substring;
	# e.g. longest_common_substring("abcdef", "zcdemf") -> "cde"
	def longest_common_substring(self, s1, s2):
		m = [[0] * (1 + len(s2)) for i in xrange(1 + len(s1))]
		longest, x_longest = 0, 0
		for x in xrange(1, 1 + len(s1)):
			for y in xrange(1, 1 + len(s2)):
				if s1[x - 1] == s2[y - 1]:
					m[x][y] = m[x - 1][y - 1] + 1
					if m[x][y] > longest:
						longest = m[x][y]
						x_longest = x
				else:
					m[x][y] = 0
		return s1[x_longest - longest: x_longest]

	def calculate_relevance(self, result, name, start_datetime):
		relevance = {}
		
		# title similarity - trigram matching (case sensitive);
		# 0 if the trigrams are identical, 1.0 if none are in common
		relevance['relevance_trigram'] = min([distance_ngrams_same_letters(title, name) for title in result['titles']])

		# substring index
		# 0 means the match starts at the first character of the title;
		# if no common substring is found, substring_index() falls back to len(title)
		relevance['relevance_substring_index'] = min([self.substring_index(title, name) for title in result['titles']])
		
		# start_datetime similarity - abs
		# small difference is better
		if result['start_date'] and start_datetime:
			relevance['relevance_start_datetime'] = abs(start_datetime - result['start_date'])
			relevance['relevance_start_datetime'] = relevance['relevance_start_datetime'].days
		else:
			# not defined -> assign an arbitrarily large number
			relevance['relevance_start_datetime'] = 2**12-1

		# total relevance
		# harmonic and geometric means are too biased toward the trigram
		# similarity (needs further tweaking), so use a weighted
		# polynomial sum instead
		relevance['relevance_total'] = \
				relevance['relevance_trigram'] * (relevance['relevance_start_datetime'] + 100) + \
				(relevance['relevance_substring_index'] + 1) * (relevance['relevance_start_datetime'] + 100)

		return relevance
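
	# worked example with hypothetical values: trigram distance 0.2,
	# substring index 0, start_datetime difference of 3 days gives
	#   relevance_total = 0.2 * (3 + 100) + (0 + 1) * (3 + 100)
	#                   = 20.6 + 103 = 123.6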

	def percent_difference(self, n1, n2):
		# note: undefined when n1 + n2 == 0
		return abs((n1 - n2) / ((n1 + n2) / 2.0)) * 100

	def abs_difference(self, n1, n2):
		return abs(n1-n2)

	def reduce(self, results):
		# TODO: use a clustering algorithm for the reduce step
		# instead of a greedy algorithm over relevance_total,
		# using relevance_start_datetime and trigram similarity
		# to determine the groupings
		reduced_results = []

		if not results:
			return results

		# seed a group with the first result; the group_avg_* values are
		# taken from that result (not maintained as running averages)
		group = [results[0]]
		group_avg_relevance_start_datetime = results[0]['relevance_start_datetime']
		group_avg_relevance_trigram = results[0]['relevance_trigram']

		# reduce the list of results
		for i in xrange(1, len(results)):
			if self.abs_difference(group_avg_relevance_start_datetime, results[i]['relevance_start_datetime']) < 5 and \
				self.abs_difference(group_avg_relevance_trigram, results[i]['relevance_trigram']) < 0.3:
				group.append(results[i])
			else:
				# need to combine the group to one result
				reduced_results.append(self.combiner(group))

				group = [results[i]] 
				group_avg_relevance_start_datetime = results[i]['relevance_start_datetime']
				group_avg_relevance_trigram = results[i]['relevance_trigram']

		reduced_results.append(self.combiner(group))

		return reduced_results
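
	# e.g. two adjacent results with relevance_start_datetime of 0 and 2
	# and trigram distances of 0.10 and 0.25 fall within the thresholds
	# (5 and 0.3) above and are merged into one result by combiner()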

	def combiner(self, dicts):
		super_dict = defaultdict(set)  # uses set to avoid duplicates
		for d in dicts:
			for k, v in d.iteritems():
				if v is None:
					continue

				# list fields are added element-by-element so the set
				# can deduplicate them
				if k in ('promo_urls', 'studios', 'titles',
						'studio_urls', 'genres'):
					for list_v in v:
						super_dict[k].add(list_v)
				elif k == '_id':
					continue
				elif k in ('relevance_total', 'relevance_trigram',
						'relevance_start_datetime',
						'relevance_substring_index', 'start_date'):
					# keep the best (smallest) value across the group
					super_dict[k] = min(super_dict[k], v) if super_dict[k] else v
				elif k == 'request_datetime':
					super_dict[k] = [datetime.utcnow()]
				elif k == 'total_episodes':
					if v and int(v) != 0:
						super_dict[k].add(int(v))
				else:
					super_dict[k].add(v)

		# convert sets to lists; singleton sets are unwrapped to scalars
		for k, v in super_dict.iteritems():
			if k in ('relevance_total', 'relevance_trigram',
					'relevance_start_datetime',
					'relevance_substring_index', 'start_date'):
				continue
			elif k in ('studios', 'titles', 'promo_urls',
					'studio_urls', 'genres'):
				super_dict[k] = list(v)
			else:
				super_dict[k] = list(v) if len(v) > 1 else list(v)[0]

		return super_dict
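
A minimal usage sketch, assuming the Connection singleton has already been
configured to reach a MongoDB instance whose collections hold documents with
a "titles" field (the query string and date below are made-up inputs):

from datetime import datetime

searcher = Searcher(database_name="media")
results = searcher.search("Steins;Gate", datetime(2011, 4, 6))
results = searcher.reduce(results)

for result in results[:5]:
	print result['titles'], result['relevance_total']

searcher.cleanup()

reduce() merges near-duplicate hits (the same show found in several
collections) into one combined document, so the loop above prints one line
per distinct title group.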