# -*- coding: utf-8 -*-
import re
from collections import defaultdict
from datetime import datetime

from slugify import slugify   # assumed: the python-slugify package
from nltk.util import ngrams  # assumed: NLTK's n-gram helper

# Assumed project-local imports: the module paths below are hypothetical
# placeholders for the connection/database singletons, the trigram distance
# helper, and the japanese_grammar/blacklist word lists used further down.
from db import Connection, Database
from similarity import distance_ngrams_same_letters
from wordlists import japanese_grammar, blacklist
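# Expected document shape, as used by the methods below (inferred from the
# field accesses, not a formal schema): each MongoDB document carries
# 'titles' (list of unicode strings) and 'start_date' (datetime), plus
# optionally 'total_episodes', 'promo_urls', 'studios', 'studio_urls',
# 'genres', and 'request_datetime'.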
class Searcher:

    def __init__(self, database_name="media"):
        self.conn = Connection.Instance().conn
        self.db = Database(self.conn, database_name).db
        print "[INFO] Initialized"

    def cleanup(self):
        Connection.Instance().disconnect()
        print "[INFO] Cleaned up"

    def search(self, name, start_datetime):
        # look through MongoDB and generate a list sorted by relevance
        collections = self.db.collection_names()
        results = {}
        queries_done = set()
        queries_pending = [(name, 0)]
        # 1st pass: separate the name into words and regex-OR them together;
        # this is more forgiving than matching the entire string, though each
        # word must still match 100%
        while queries_pending:
            query, depth = queries_pending.pop()
            # strip everything but alphanumerics and the Japanese codepoint ranges
            query_plain = re.sub(u'[^A-Za-z0-9\u3000-\u303f\u3040-\u309f\u30a0-\u30ff\uff00-\uffef\u4e00-\u9faf]+',
                                 ' ', query).strip()
            if slugify(query) in queries_done or depth > 2:
                continue
            queries_done.add(slugify(query))
            if len(query) < 3:
                # too short for fuzzy matching: require an exact title match
                for collection in collections:
                    if collection.startswith("system"):
                        continue
                    for match in self.db[collection].find({"titles": query}).limit(20):
                        slug = slugify("{0}-{1}".format(collection,
                                                        match['titles'][0].encode('utf-8').strip()))
                        results[slug] = match
                        # queue the other titles as follow-up queries
                        for title in match['titles']:
                            if slugify(title) not in queries_done:
                                queries_pending.append((title, depth + 1))
            else:
                search_terms = [re.escape(query), re.escape(query_plain)]
                # remove japanese grammar words and blacklisted terms
                # from the search terms
                query_terms = [term for term in query.split()
                               if term.lower() not in japanese_grammar and
                                  term.lower() not in blacklist]
                # create bigram search terms
                for grams in ngrams(query_terms, 2):
                    search_terms.append(re.escape(" ".join(grams)))
                # same filtering and bigrams for the stripped-down query
                query_terms = [term for term in query_plain.split()
                               if term.lower() not in japanese_grammar and
                                  term.lower() not in blacklist]
                for grams in ngrams(query_terms, 2):
                    search_terms.append(re.escape(" ".join(grams)))
                search_terms_str = "|".join(search_terms)
                # the compiled case-insensitive pattern is passed directly to find()
                regex = re.compile(ur'{0}'.format(search_terms_str), re.IGNORECASE)
                for collection in collections:
                    if collection.startswith("system"):
                        continue
                    for match in self.db[collection].find({"titles": regex}).limit(20):
                        slug = slugify("{0}-{1}".format(collection,
                                                        match['titles'][0].encode('utf-8').strip()))
                        results[slug] = match
                        # queue the other titles as follow-up queries
                        for title in match['titles']:
                            if slugify(title) not in queries_done:
                                queries_pending.append((title, depth + 1))
        # calculate relevance of all results
        for key, result in results.iteritems():
            result.update(self.calculate_relevance(result, name, start_datetime))
        # drop results whose trigram distance is 1 (no trigrams in common)
        results = [result for result in results.values()
                   if result['relevance_trigram'] < 1]
        # sort results, lowest (best) relevance_total first
        results = sorted(results, key=lambda result: result['relevance_total'])
        return results

    def substring_index(self, title, name):
        # position of the longest common substring within the title;
        # 0 means it starts at the first character, and a miss falls
        # back to len(title)
        substring = self.longest_common_substring(title.lower(), name.lower())
        substring_index = title.lower().find(substring)
        return substring_index if substring_index != -1 else len(title)
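    # For instance (hypothetical values): with title "Fullmetal Alchemist"
    # and name "alchemist", the longest common substring is "alchemist",
    # which starts at index 10, so substring_index returns 10.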
    def longest_common_substring(self, s1, s2):
        # classic dynamic-programming longest-common-substring table
        m = [[0] * (1 + len(s2)) for i in xrange(1 + len(s1))]
        longest, x_longest = 0, 0
        for x in xrange(1, 1 + len(s1)):
            for y in xrange(1, 1 + len(s2)):
                if s1[x - 1] == s2[y - 1]:
                    m[x][y] = m[x - 1][y - 1] + 1
                    if m[x][y] > longest:
                        longest = m[x][y]
                        x_longest = x
                else:
                    m[x][y] = 0
        return s1[x_longest - longest: x_longest]

    def calculate_relevance(self, result, name, start_datetime):
        relevance = {}
        # title similarity - trigram matching, case sensitive:
        # 0 if the trigrams are identical, 1.0 if none are in common
        relevance['relevance_trigram'] = min([distance_ngrams_same_letters(title, name)
                                              for title in result['titles']])
        # substring index: 0 means the name appears at the first character of
        # the title; if the substring is not found, substring_index returns
        # len(title), so we take the minimum over all titles
        relevance['relevance_substring_index'] = min([self.substring_index(title, name)
                                                      for title in result['titles']])
        # start_datetime similarity - absolute difference in days
        # (a small difference is better)
        if result['start_date'] and start_datetime:
            relevance['relevance_start_datetime'] = abs(start_datetime - result['start_date']).days
        else:
            # not defined -> assign an arbitrary large number
            relevance['relevance_start_datetime'] = 2 ** 12 - 1
        # total relevance: harmonic and geometric means are too biased toward
        # the trigram similarity (needs more tweaking), so use a weighted sum
        # polynomial instead
        relevance['relevance_total'] = \
            relevance['relevance_trigram'] * (relevance['relevance_start_datetime'] + 100) + \
            (relevance['relevance_substring_index'] + 1) * (relevance['relevance_start_datetime'] + 100)
        return relevance

    def percent_difference(self, n1, n2):
        # float() avoids Python 2 integer division
        return abs(float(n1 - n2) / ((n1 + n2) / 2.0)) * 100

    def abs_difference(self, n1, n2):
        return abs(n1 - n2)

    def reduce(self, results):
        # TODO: use a clustering algorithm for the reduce instead of this
        # greedy pass over relevance_total; relevance_start_datetime and
        # relevance_trigram determine the groupings
        reduced_results = []
        if not results:
            return results
        group = [results[0]]
        group_avg_relevance_start_datetime = results[0]['relevance_start_datetime']
        group_avg_relevance_trigram = results[0]['relevance_trigram']
        # walk the (sorted) results, growing the current group while the
        # scores stay close
        for i in xrange(1, len(results)):
            if self.abs_difference(group_avg_relevance_start_datetime,
                                   results[i]['relevance_start_datetime']) < 5 and \
               self.abs_difference(group_avg_relevance_trigram,
                                   results[i]['relevance_trigram']) < 0.3:
                group.append(results[i])
            else:
                # combine the finished group into one result
                reduced_results.append(self.combiner(group))
                group = [results[i]]
                group_avg_relevance_start_datetime = results[i]['relevance_start_datetime']
                group_avg_relevance_trigram = results[i]['relevance_trigram']
        reduced_results.append(self.combiner(group))
        return reduced_results
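    # combiner (below) merges a group of near-duplicate documents into one
    # record: list fields are unioned, the relevance scores and start_date
    # keep their minimum, and request_datetime is reset to now. A sketch of
    # the effect (hypothetical documents, other fields elided):
    #   combiner([{'titles': [u'A'], 'relevance_total': 120},
    #             {'titles': [u'A', u'B'], 'relevance_total': 90}])
    #   -> {'titles': [u'A', u'B'], 'relevance_total': 90}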
    def combiner(self, dicts):
        super_dict = defaultdict(set)  # uses a set to avoid duplicates
        for d in dicts:
            for k, v in d.iteritems():
                if v is None:
                    continue
                if k in ('promo_urls', 'studios', 'titles', 'studio_urls', 'genres'):
                    # deserialize arrays so their items can go into the set
                    for list_v in v:
                        super_dict[k].add(list_v)
                elif k == '_id':
                    continue
                elif k in ('relevance_total', 'relevance_trigram',
                           'relevance_start_datetime', 'relevance_substring_index'):
                    # 'k in super_dict' rather than truthiness, so a stored
                    # score of 0 is not overwritten by a later, larger value
                    super_dict[k] = min(super_dict[k], v) if k in super_dict else v
                elif k == 'request_datetime':
                    super_dict[k] = [datetime.utcnow()]
                elif k == 'start_date':
                    super_dict[k] = min(super_dict[k], v) if k in super_dict else v
                elif k == 'total_episodes':
                    if v and int(v) != 0:
                        super_dict[k].add(int(v))
                else:
                    super_dict[k].add(v)
        # convert the sets back to lists (single leftover values are unwrapped)
        for k, v in super_dict.iteritems():
            if k in ('relevance_total', 'relevance_trigram',
                     'relevance_start_datetime', 'relevance_substring_index',
                     'start_date'):
                continue
            elif k in ('studios', 'titles', 'promo_urls', 'studio_urls', 'genres'):
                super_dict[k] = list(v)
            else:
                super_dict[k] = list(v) if len(v) > 1 else list(v)[0]
        return super_dict
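# A minimal usage sketch, assuming a reachable MongoDB behind the project's
# Connection/Database singletons; the title and air date are hypothetical
# inputs, not values from this codebase:
if __name__ == "__main__":
    searcher = Searcher(database_name="media")
    try:
        hits = searcher.search(u"Cowboy Bebop", datetime(1998, 4, 3))
        # group near-duplicate hits before displaying them
        for hit in searcher.reduce(hits):
            print u"{0} (score: {1})".format(hit['titles'][0], hit['relevance_total'])
    finally:
        searcher.cleanup()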