# NOTE(review): this top-level __init__ appears to be an orphaned duplicate of
# EntitySearch.__init__ defined below -- confirm and delete if so. Reformatted
# only; behavior is unchanged.
def __init__(self):
    all_categories = Constants.categories
    self.__all_sources = []
    self.__entity_collection = MongoEntityCollection()
    self.__stats_collection = MongoEntityStatsCollection()
    # Within each category, we have a number of sources and each is assigned a
    # priority. The priority is used to determine how long to wait for results
    # from that source.
    self.__categories_to_sources_and_priorities = {}
    for category in all_categories:
        self.__categories_to_sources_and_priorities[category] = []
    self.__registerSource(StampedSource(), music=3, film=3, book=3, app=3, place=3)
    self.__registerSource(iTunesSource(), music=10, film=10, book=3, app=10)
    # TODO: Enable film for Amazon. Amazon film results blend TV and movies and
    # have better retrieval than iTunes. On the other hand, they're pretty
    # dreadful -- no clear distinction between TV and movies, no clear
    # distinction between individual movies and box sets, etc.
    self.__registerSource(AmazonSource(), music=5, book=10)
    self.__registerSource(FactualSource(), place=8)
    self.__registerSource(GooglePlacesSource(), place=8)
    self.__registerSource(RdioSource(), music=8)
    self.__registerSource(SpotifySource(), music=8)
    self.__registerSource(TMDBSource(), film=8)
    self.__registerSource(TheTVDBSource(), film=8)
class EntitySearch(object):
    """Fans a search query out to third-party sources, then dedupes, scores,
    and converts the clustered results into entities.

    Each category maps to a list of (source, priority) pairs; a source's
    priority determines how long we are willing to wait for it before bailing
    (see __terminateWaiting).
    """

    def __registerSource(self, source, **categoriesToPriorities):
        """Register `source` under each category keyword with its priority.

        Raises Exception for a category name not in Constants.categories.
        """
        self.__all_sources.append(source)
        for (category, priority) in categoriesToPriorities.items():
            if category not in Constants.categories:
                raise Exception("unrecognized category: %s" % category)
            self.__categories_to_sources_and_priorities[category].append((source, priority))

    def __init__(self):
        allCategories = Constants.categories
        self.__all_sources = []
        self.__entity_collection = MongoEntityCollection()
        self.__stats_collection = MongoEntityStatsCollection()
        # Within each category, we have a number of sources and each is assigned a
        # priority. The priority is used to determine how long to wait for results
        # from that source.
        self.__categories_to_sources_and_priorities = {}
        for category in allCategories:
            self.__categories_to_sources_and_priorities[category] = []
        self.__registerSource(StampedSource(), music=3, film=3, book=3, app=3, place=3)
        self.__registerSource(iTunesSource(), music=10, film=10, book=3, app=10)
        # TODO: Enable film for Amazon. Amazon film results blend TV and movies and
        # have better retrieval than iTunes. On the other hand, they're pretty
        # dreadful -- no clear distinction between TV and movies, no clear
        # distinction between individual movies and box sets, etc.
        self.__registerSource(AmazonSource(), music=5, book=10)
        self.__registerSource(FactualSource(), place=8)
        self.__registerSource(GooglePlacesSource(), place=8)
        self.__registerSource(RdioSource(), music=8)
        self.__registerSource(SpotifySource(), music=8)
        self.__registerSource(TMDBSource(), film=8)
        self.__registerSource(TheTVDBSource(), film=8)

    def __terminateWaiting(self, pool, start_time, category, resultsDict):
        """Watchdog greenlet: kill `pool` once waiting longer stops being worth it.

        Polls the shared resultsDict every 10ms. Hard-kills everything at 7s;
        before that, bails early once the priority-weighted value still
        outstanding is small relative to the value already received.
        """
        logTimingData('IN TERMINATE WAITING')
        sources_to_priorities = dict(self.__categories_to_sources_and_priorities[category])
        total_value_received = 0
        total_potential_value_outstanding = sum(sources_to_priorities.values())
        sources_seen = set()
        while True:
            try:
                elapsed_seconds = total_seconds(datetime.datetime.now() - start_time)
                if elapsed_seconds >= 7:
                    logs.warning('Search completely timed out at 7s!')
                    pool.kill()
                    return
                for (source, results) in resultsDict.items():
                    if source in sources_seen:
                        continue
                    logTimingData('JUST NOW SEEING SOURCE: ' + source.sourceName)
                    sources_seen.add(source)
                    # If a source returns at least 5 results, we assume we got a good
                    # result set from it. If it returns less, we're more inclined to
                    # wait for straggling sources.
                    total_value_received += sources_to_priorities[source] * min(5, len(results)) / 5.0
                    logTimingData('DECREMENTING OUTSTANDING BY ' + str(sources_to_priorities[source]) +
                                  ' FOR SOURCE ' + source.sourceName)
                    total_potential_value_outstanding -= sources_to_priorities[source]
                logTimingData('AT %f seconds elapsed, TOTAL VALUE RECEIVED IS %f, TOTAL OUTSTANDING IS %f' % (
                    elapsed_seconds, total_value_received, total_potential_value_outstanding
                ))
            except Exception:
                logs.warning('TERMINATE_WARNING SHIT IS F****D')
                logs.report()
                raise
            if total_potential_value_outstanding <= 0:
                logTimingData('ALL SOURCES DONE')
                return
            if total_value_received:
                marginal_value_of_outstanding_sources = total_potential_value_outstanding / total_value_received
                # Comes out to:
                #   0.08 for 1s
                #   0.25 for 1.5s
                #   0.79 for 2s
                #   2.51 for 2.5s
                #   7.94 for 3s
                # So we'll ditch that 4th remaining source for music around 1.5s;
                # we'll ditch the second source for something like Places around 2s;
                # we'll ditch any lingering source around 3s if we've received
                # anything.
                min_marginal_value = 10 ** (elapsed_seconds - 2.1)
                if min_marginal_value > marginal_value_of_outstanding_sources:
                    sources_not_seen = [
                        source.sourceName for source in sources_to_priorities.keys()
                        if source not in sources_seen
                    ]
                    if sources_not_seen:
                        # This is interesting information whether we want the full
                        # timing data logged or not.
                        log_template = 'QUITTING EARLY: At %f second elapsed, bailing on sources [%s] because with ' + \
                                       'value received %f, value outstanding %f, marginal value %f, min marginal value %f'
                        logs.debug(log_template % (
                            elapsed_seconds, ', '.join(sources_not_seen), total_value_received,
                            total_potential_value_outstanding, marginal_value_of_outstanding_sources,
                            min_marginal_value
                        ))
                    pool.kill()
                    return
            gevent.sleep(0.01)

    def __searchSource(self, source, queryCategory, queryText, resultsDict, timesDict, **queryParams):
        """Query one source and store its quality-filtered results in resultsDict.

        Runs inside a pool greenlet; communicates via the shared resultsDict and
        timesDict rather than a return value.
        """
        try:
            # Note that the timing here is not 100% legit because gevent won't
            # interrupt code except on I/O, but it's good enough to give a solid
            # idea.
            before = datetime.datetime.now()
            if shouldLogRawSourceResults:
                queryParams['logRawResults'] = True
            results = source.searchLite(queryCategory, queryText, **queryParams)
            after = datetime.datetime.now()
            # First level of filtering on data quality score -- results that are
            # really horrendous get dropped entirely pre-clustering.
            filteredResults = [result for result in results
                               if result.dataQuality >= MIN_RESULT_DATA_QUALITY_TO_CLUSTER]
            timesDict[source] = after - before
            logs.debug("GOT RESULTS FROM SOURCE %s IN ELAPSED TIME %s -- COUNT: %d, AFTER FILTERING: %d" % (
                source.sourceName, str(after - before), len(results), len(filteredResults)
            ))
            resultsDict[source] = filteredResults
        except GreenletExit:
            # The watchdog killed us; silently drop the partial work.
            pass
        except Exception:
            # Was a bare `except:`; narrowed so SystemExit/KeyboardInterrupt-style
            # exceptions are no longer swallowed. Failures still degrade to an
            # empty result set for this source after reporting.
            logs.report()
            resultsDict[source] = []

    def search(self, category, text, timeout=SEARCH_TIMEOUT, limit=10, coords=None):
        """Search all sources registered for `category` and return up to `limit`
        deduped result clusters.

        Raises Exception for an unrecognized category.
        """
        if not isinstance(text, unicode):
            text = text.decode('utf-8')
        if category not in Constants.categories:
            raise Exception("unrecognized category: (%s)" % category)
        start = datetime.datetime.now()
        results = {}
        times = {}
        # NOTE(review): pool is sized by the number of categories, not by the number
        # of greenlets actually spawned (sources for this category, +1 watchdog) --
        # confirm this is intended.
        pool = utils.LoggingThreadPool(len(self.__categories_to_sources_and_priorities))

        def termWaiting():
            logs.debug('in termWaiting')
            try:
                return self.__terminateWaiting(pool, datetime.datetime.now(), category, results)
            except Exception:
                logs.report()
            logs.debug('done with termWaiting')

        logs.debug("SHOULD_DISABLE_TIMEOUT IS " + str(shouldDisableTimeout))
        if not shouldDisableTimeout:
            logTimingData('SPAWNING TERMINATE WAITING')
            pool.spawn(termWaiting)
        for (source, priority) in self.__categories_to_sources_and_priorities[category]:
            # TODO: Handing the exact same timeout down to the inner call is
            # probably wrong because we end up in this situation where outer pools
            # and inner pools are using the same timeout and possibly the outer
            # pool will nix the whole thing before the inner pool cancels out,
            # which is what we'd prefer so that it's handled more gracefully.
            pool.spawn(self.__searchSource, source, category, text, results, times,
                       timeout=timeout, coords=coords)
        logTimingData("TIME CHECK ISSUED ALL QUERIES AT " + str(datetime.datetime.now()))
        pool.join()
        logTimingData("TIME CHECK GOT ALL RESPONSES AT " + str(datetime.datetime.now()))
        logTimingData('TIMES: ' + (', '.join(['%s took %s' % (source.sourceName, str(times[source]))
                                              for source in times])))
        for source in self.__all_sources:
            if source in results and results[source]:
                logSourceResultsData("\nRESULTS FROM SOURCE " + source.sourceName +
                                     " TIME ELAPSED: " + str(times[source]) + "\n\n")
                for result in results[source]:
                    logSourceResultsData(utils.normalize(repr(result)))
        beforeDeduping = datetime.datetime.now()
        dedupedResults = SearchResultDeduper().dedupeResults(category, results.values())
        afterDeduping = datetime.datetime.now()
        logTimingData("DEDUPING TOOK " + str(afterDeduping - beforeDeduping))
        logTimingData("TIME CHECK DONE AT:" + str(datetime.datetime.now()))
        logTimingData("ELAPSED:" + str(afterDeduping - start))
        logClusterData("\n\nDEDUPED RESULTS\n\n")
        for dedupedResult in dedupedResults[:limit]:
            logClusterData("\n\n%s\n\n" % str(dedupedResult))
        return dedupedResults[:limit]

    def __getEntityIdForCluster(self, cluster):
        """Return the existing entity ID this cluster resolves to, or None.

        'stamped'-sourced results carry entity IDs directly; everything else is
        looked up via StampedSource.resolve_fast_batch. The first ID found wins.
        """
        idsFromClusteredEntities = []
        fastResolveQueries = []
        for result in cluster.results:
            if result.dataQuality < MIN_RESULT_DATA_QUALITY_TO_INCLUDE:
                continue
            if result.resolverObject.source == 'stamped':
                idsFromClusteredEntities.append(result.resolverObject.key)
            else:
                fastResolveQueries.append((result.resolverObject.source, result.resolverObject.key))
        fastResolvedIds = (list(filter(None, self.__stampedSource.resolve_fast_batch(fastResolveQueries)))
                           if fastResolveQueries else [])
        allIds = idsFromClusteredEntities + fastResolvedIds
        # NOTE(review): these checks only warn when MORE than two IDs collide; if
        # "multiple" is meant to be "two or more", they should read `> 1` --
        # confirm intent before changing.
        if len(idsFromClusteredEntities) > 2:
            logs.warning('Search results directly clustered multiple StampedSource results: [%s]' %
                         ', '.join(str(entityId) for entityId in idsFromClusteredEntities))
        elif len(allIds) > 2:
            logs.warning('Search results indirectly clustered multiple entity IDs together: [%s]' %
                         ', '.join(str(entityId) for entityId in allIds))
        if not allIds:
            return None
        return allIds[0]

    def __proxyToEntity(self, cluster):
        """Build a temporary (unsaved) entity from a cluster's proxy results."""
        # Additional level of filtering -- some things get clustered (for the
        # purpose of boosting certain cluster scores) but never included in the
        # final result because we're not 100% that the data is good enough to show
        # users.
        filteredResults = [r for r in cluster.results
                           if r.dataQuality >= MIN_RESULT_DATA_QUALITY_TO_INCLUDE]
        # So this is ugly, but it's pretty common for two listings to have the same
        # or virtually the same data quality and using relevance as a tie-breaker
        # is really helpful.
        filteredResults.sort(key=lambda r: (r.dataQuality + (r.relevance / 10.0),
                                            r.resolverObject.source,
                                            r.resolverObject.key),
                             reverse=True)
        # TODO PRELAUNCH: Only use the best result from each source.
        entity = EntityProxyContainer().addAllProxies(
            result.resolverObject for result in filteredResults).buildEntity()
        for result in filteredResults:
            entity.addThirdPartyId(result.resolverObject.source, result.resolverObject.key)
        return entity

    @utils.lazyProperty
    def __stampedSource(self):
        # Lazily constructed so we don't pay for a second StampedSource unless a
        # fast-resolve is actually needed.
        return StampedSource()

    def __buildEntity(self, entityId):
        """Load a stored entity by ID, refreshing its third-party IDs if stale."""
        entity = self.__entity_collection.getEntity(entityId)
        entity._maybeRegenerateThirdPartyIds()
        return entity

    def rescoreFinalResults(self, entityAndClusterList):
        """Sort (entity, cluster) pairs in place, best first.

        Score = dataScore * cluster.relevance, where dataScore is the cluster's
        data quality for temp entities, or a stamp-count-based boost for real
        (stored) entities.
        """
        def isTempEntity(entity):
            # Entities built from proxies (not yet stored) have no entity_id.
            return entity.entity_id is None

        realEntityIds = [
            entity.entity_id for (entity, cluster) in entityAndClusterList
            if not isTempEntity(entity)
        ]
        entityStats = self.__stats_collection.getStatsForEntities(realEntityIds)
        statsByEntityId = dict([(stats.entity_id, stats) for stats in entityStats])

        def scoreEntityAndCluster(entityAndCluster):
            # Explicit unpacking instead of the Py2-only tuple parameter; behavior
            # is unchanged and this form is forward-compatible.
            (entity, cluster) = entityAndCluster
            if isTempEntity(entity):
                dataScore = cluster.dataQuality
            else:
                numStamps = 0
                if entity.entity_id in statsByEntityId:
                    numStamps = statsByEntityId[entity.entity_id].num_stamps
                dataScore = 1.1 + math.log(numStamps + 1, 50)
            # TODO: Possibly distinguish even more about which of these have rich
            # data. There are some types of data that don't affect dataQuality
            # because they don't make us less certain about the state of a cluster,
            # but they make user interactions with it more positive -- pictures,
            # preview URLs, etc. We should factor these in here.
            return dataScore * cluster.relevance

        entityAndClusterList.sort(key=scoreEntityAndCluster, reverse=True)

    def searchEntitiesAndClusters(self, category, text, timeout=SEARCH_TIMEOUT, limit=10, coords=None):
        """Search, then convert result clusters into (entity, cluster) pairs.

        Clusters that resolve to stored entities are loaded and merged by entity
        ID; unresolved clusters above the quality bar become temporary entities.
        """
        clusters = self.search(category, text, timeout=timeout, limit=limit, coords=coords)
        searchDoneTime = datetime.datetime.now()
        entityIdsToNewClusterIdxs = {}
        entitiesAndClusters = []
        for cluster in clusters:
            # TODO: make use of nemesis ids here.
            entityId = self.__getEntityIdForCluster(cluster)
            if not entityId:
                # One more layer of filtering here -- clusters that don't overall
                # hit our quality minimum get dropped. We never drop clusters that
                # resolve to entities for this reason.
                if cluster.dataQuality >= MIN_CLUSTER_DATA_QUALITY:
                    entitiesAndClusters.append((self.__proxyToEntity(cluster), cluster))
                else:
                    logClusterData('DROPPING CLUSTER for shitty data quality:\n%s' % cluster)
            # TODO PRELAUNCH: Make sure that the type we get from fast_resolve ==
            # the type we get from StampedSourceObject.key, or else using these as
            # keys in a map together won't work.
            elif entityId not in entityIdsToNewClusterIdxs:
                entityIdsToNewClusterIdxs[entityId] = len(entitiesAndClusters)
                entitiesAndClusters.append((self.__buildEntity(entityId), cluster))
            else:
                originalIndex = entityIdsToNewClusterIdxs[entityId]
                (_, originalCluster) = entitiesAndClusters[originalIndex]
                # We're not actually augmenting the result at all here; the result
                # is the unadultered entity. We won't show an entity augmented with
                # other third-party IDs we've attached in search results because it
                # will create inconsistency for the entity show page and we don't
                # know if they will definitely be attached. The point of the grok
                # is entirely to boost the rank of the cluster (and thus of the
                # entity.)
                # TODO PRELAUNCH: Consider overriding this for sparse or
                # user-created entities.
                # TODO: Debug check to see if the two are definitely not a match
                # according to our clustering logic.
                originalCluster.grok(cluster)
        convertedToEntitiesTime = datetime.datetime.now()
        logTimingData('CONVERTING TO ENTITIES TOOK: %s' % (convertedToEntitiesTime - searchDoneTime))
        self.rescoreFinalResults(entitiesAndClusters)
        rescoredTime = datetime.datetime.now()
        logTimingData('RESCORING TOOK: %s' % (rescoredTime - convertedToEntitiesTime))
        return entitiesAndClusters

    def searchEntities(self, *args, **kwargs):
        """Convenience wrapper: searchEntitiesAndClusters, entities only."""
        return [entity for entity, _ in self.searchEntitiesAndClusters(*args, **kwargs)]