def main(): usage = "Usage: %prog [options]" version = "%prog " + __version__ parser = OptionParser(usage=usage, version=version) parser.add_option('--dry_run', action='store_true', dest='dry_run', default=False) parser.add_option('--nuclear', action='store_true', dest='nuclear', default=False) parser.add_option('--max_to_remove', action='store', dest='max_to_remove', default=None) (options, args) = parser.parse_args() if options.nuclear or options.dry_run: print 'F**K THE WORLD' MongoEntityColleciton()._collection.drop() # "Welcome to the human race." --Snake Plissken return entity_collection = MongoEntityCollection()._collection entity_ids = [result['_id'] for result in entity_collection.find(fields={'_id':True})] todos = MongoTodoCollection() stamps = MongoStampCollection() removed = 0 for entity_id in entity_ids: if options.max_to_remove is not None and removed >= options.max_to_remove: return has_attached_user_interactions = ( list(todos._collection.find({'entity.entity_id' : str(entity_id)}, fields={'_id':1})) or list(stamps._collection.find({'entity.entity_id' : str(entity_id)}, fields={'_id':1})) ) if has_attached_user_interactions: print 'SKIPPING', entity_id continue entity_collection.remove({'_id':entity_id}) removed += 1
def main(): usage = "Usage: %prog --entity_id=<id> OR %prod --search_id=<id> OR %prod <query> <subcategory?> <index?>" version = "%prog " + __version__ parser = OptionParser(usage=usage, version=version) parser.add_option('--entity_id', action='store', dest='entity_id', default=None) parser.add_option('--search_id', action='store', dest='search_id', default=None) (options, args) = parser.parse_args() if options.entity_id and options.search_id: print '--entity_id and --search_id are mutually exclusive!' id_provided = options.entity_id or options.search_id if id_provided and len(args) > 1: print '--entity_id and --search_id cannot be used with query arguments!' if options.entity_id: from api.db.mongodb.MongoEntityCollection import MongoEntityCollection entity = MongoEntityCollection().getEntity(options.entity_id) elif options.search_id: entity = getEntityFromSearchId(options.search_id) else: query = buildQueryFromArgs(args) from api.MongoStampedAPI import MongoStampedAPI cursor = MongoStampedAPI()._entityDB._collection.find(query) if cursor.count() == 0: print("Could not find a matching entity for query: %s" % query) return entity = MongoStampedAPI()._entityDB._convertFromMongo(cursor[0]) print( "Before:\n%s" % pformat( entity.dataExport() ) ) container = FullResolveContainer() decorations = {} container.enrichEntity( entity, decorations ) print( "After:\n%s" % pformat( entity.dataExport() ) ) if len(decorations) > 0: print( "With decorations:") for k,v in decorations.items(): print( "%s decoration:" % k ) try: print( "%s" % pformat(v.dataExport()) ) except Exception: print( "%s" % pformat(v) ) from libs.CountedFunction import printFunctionCounts printFunctionCounts()
def main(): usage = "Usage: %prog [options]" version = "%prog " + __version__ parser = OptionParser(usage=usage, version=version) parser.add_option('--dry_run', action='store_true', dest='dry_run', default=None) # TODO: Ability to limit by vertical parser.add_option('--max_checks', type='int', action='store', dest='max_checks', default=-1) parser.add_option('--max_errors', type='int', action='store', dest='max_errors', default=-1) parser.add_option('--stamped_only', action='store_true', dest='stamped_only', default=False) parser.add_option('--report_out', action='store', dest='report_out', default=None) (options, args) = parser.parse_args() if not options.report_out: raise Exception('--report_out is required!') all_entity_ids = getAllEntityIds(options.stamped_only) random.shuffle(all_entity_ids) error_entity_ids = [] entities_checked = 0 entity_collection = MongoEntityCollection() report_file = open(options.report_out, 'w') if options.max_checks > 0: all_entity_ids = all_entity_ids[:options.max_checks] for entity_id in all_entity_ids: if options.max_errors > 0 and len(error_entity_ids) >= options.max_errors: break try: entities_checked += 1 entity = entity_collection.getEntity(entity_id) well_resolved = entityIsWellResolved(entity, report_file) if not well_resolved: error_entity_ids.append(entity_id) except ValueError: pass report_file.close() print 'Of %d entities examined, %d were found to have errors!' % (entities_checked, len(error_entity_ids)) for id in error_entity_ids: print id for (source, num_attempts) in sourceAttemptCounts.items(): print('source %s was seen in %d entities, and %d of those references were broken' % ( source, num_attempts, sourceFailureCounts[source] ))
def test_db_fixture_string(self):
    # For this test, there is just hard-coded fixture text with no regenerate function, so we will always just get
    # this string.
    entityCollection = MongoEntityCollection()
    entity = entityCollection.getEntity("4e4c67f226f05a2ba9000002")
    print "The entity I got is:\n\n", entity, "\n\n"
class EntitySearch(object):
    def __registerSource(self, source, **categoriesToPriorities):
        self.__all_sources.append(source)
        for (category, priority) in categoriesToPriorities.items():
            if category not in Constants.categories:
                raise Exception("unrecognized category: %s" % category)
            self.__categories_to_sources_and_priorities[category].append((source, priority))

    def __init__(self):
        allCategories = Constants.categories
        self.__all_sources = []
        self.__entity_collection = MongoEntityCollection()
        self.__stats_collection = MongoEntityStatsCollection()
        # Within each category, we have a number of sources and each is assigned a priority. The priority is used to
        # determine how long to wait for results from that source.
        self.__categories_to_sources_and_priorities = {}
        for category in allCategories:
            self.__categories_to_sources_and_priorities[category] = []

        self.__registerSource(StampedSource(), music=3, film=3, book=3, app=3, place=3)
        self.__registerSource(iTunesSource(), music=10, film=10, book=3, app=10)
        # TODO: Enable film for Amazon. Amazon film results blend TV and movies and have better retrieval than
        # iTunes. On the other hand, they're pretty dreadful -- no clear distinction between TV and movies, no
        # clear distinction between individual movies and box sets, etc.
        self.__registerSource(AmazonSource(), music=5, book=10)
        self.__registerSource(FactualSource(), place=8)
        self.__registerSource(GooglePlacesSource(), place=8)
        self.__registerSource(RdioSource(), music=8)
        self.__registerSource(SpotifySource(), music=8)
        self.__registerSource(TMDBSource(), film=8)
        self.__registerSource(TheTVDBSource(), film=8)

    def __terminateWaiting(self, pool, start_time, category, resultsDict):
        logTimingData('IN TERMINATE WAITING')
        sources_to_priorities = dict(self.__categories_to_sources_and_priorities[category])
        total_value_received = 0
        total_potential_value_outstanding = sum(sources_to_priorities.values())
        sources_seen = set()
        while True:
            try:
                elapsed_seconds = total_seconds(datetime.datetime.now() - start_time)
                if elapsed_seconds >= 7:
                    logs.warning('Search completely timed out at 7s!')
                    pool.kill()
                    return
                for (source, results) in resultsDict.items():
                    if source in sources_seen:
                        continue
                    logTimingData('JUST NOW SEEING SOURCE: ' + source.sourceName)
                    sources_seen.add(source)
                    # If a source returns at least 5 results, we assume we got a good result set from it. If it
                    # returns fewer, we're more inclined to wait for straggling sources.
                    total_value_received += sources_to_priorities[source] * min(5, len(results)) / 5.0
                    logTimingData('DECREMENTING OUTSTANDING BY ' + str(sources_to_priorities[source]) +
                                  ' FOR SOURCE ' + source.sourceName)
                    total_potential_value_outstanding -= sources_to_priorities[source]
                logTimingData('AT %f seconds elapsed, TOTAL VALUE RECEIVED IS %f, TOTAL OUTSTANDING IS %f' % (
                    elapsed_seconds, total_value_received, total_potential_value_outstanding))
            except Exception:
                logs.warning('TERMINATE_WAITING failed unexpectedly')
                logs.report()
                raise

            if total_potential_value_outstanding <= 0:
                logTimingData('ALL SOURCES DONE')
                return

            if total_value_received:
                marginal_value_of_outstanding_sources = total_potential_value_outstanding / total_value_received
                # The threshold below comes out to:
                #   0.08 for 1s
                #   0.25 for 1.5s
                #   0.79 for 2s
                #   2.51 for 2.5s
                #   7.94 for 3s
                # So we'll ditch that 4th remaining source for music around 1.5s; we'll ditch the second source for
                # something like Places around 2s; we'll ditch any lingering source around 3s if we've received
                # anything.
                min_marginal_value = 10 ** (elapsed_seconds - 2.1)
                if min_marginal_value > marginal_value_of_outstanding_sources:
                    sources_not_seen = [source.sourceName for source in sources_to_priorities.keys()
                                        if source not in sources_seen]
                    if sources_not_seen:
                        # This is interesting information whether we want the full timing data logged or not.
                        log_template = 'QUITTING EARLY: At %f seconds elapsed, bailing on sources [%s] because with ' + \
                                       'value received %f, value outstanding %f, marginal value %f, min marginal value %f'
                        logs.debug(log_template % (elapsed_seconds, ', '.join(sources_not_seen), total_value_received,
                                                   total_potential_value_outstanding,
                                                   marginal_value_of_outstanding_sources, min_marginal_value))
                    pool.kill()
                    return

            gevent.sleep(0.01)

    def __searchSource(self, source, queryCategory, queryText, resultsDict, timesDict, **queryParams):
        try:
            # Note that the timing here is not 100% legit because gevent won't interrupt code except on I/O, but it's
            # good enough to give a solid idea.
            before = datetime.datetime.now()
            if shouldLogRawSourceResults:
                queryParams['logRawResults'] = True
            results = source.searchLite(queryCategory, queryText, **queryParams)
            after = datetime.datetime.now()
            # First level of filtering on data quality score -- results that are really horrendous get dropped
            # entirely, pre-clustering.
            filteredResults = [result for result in results if result.dataQuality >= MIN_RESULT_DATA_QUALITY_TO_CLUSTER]
            timesDict[source] = after - before
            logs.debug("GOT RESULTS FROM SOURCE %s IN ELAPSED TIME %s -- COUNT: %d, AFTER FILTERING: %d" % (
                source.sourceName, str(after - before), len(results), len(filteredResults)))
            resultsDict[source] = filteredResults
        except GreenletExit:
            pass
        except Exception:
            logs.report()
            resultsDict[source] = []

    def search(self, category, text, timeout=SEARCH_TIMEOUT, limit=10, coords=None):
        if not isinstance(text, unicode):
            text = text.decode('utf-8')

        if category not in Constants.categories:
            raise Exception("unrecognized category: (%s)" % category)

        start = datetime.datetime.now()
        results = {}
        times = {}
        pool = utils.LoggingThreadPool(len(self.__categories_to_sources_and_priorities))

        def termWaiting():
            logs.debug('in termWaiting')
            try:
                return self.__terminateWaiting(pool, datetime.datetime.now(), category, results)
            except Exception:
                logs.report()
            logs.debug('done with termWaiting')

        logs.debug("SHOULD_DISABLE_TIMEOUT IS " + str(shouldDisableTimeout))
        if not shouldDisableTimeout:
            logTimingData('SPAWNING TERMINATE WAITING')
            pool.spawn(termWaiting)

        for (source, priority) in self.__categories_to_sources_and_priorities[category]:
            # TODO: Handing the exact same timeout down to the inner call is probably wrong because we end up in this
            # situation where outer pools and inner pools are using the same timeout, and possibly the outer pool will
            # nix the whole thing before the inner pool cancels out, which is what we'd prefer so that it's handled
            # more gracefully.
            pool.spawn(self.__searchSource, source, category, text, results, times, timeout=timeout, coords=coords)

        logTimingData("TIME CHECK ISSUED ALL QUERIES AT " + str(datetime.datetime.now()))
        pool.join()
        logTimingData("TIME CHECK GOT ALL RESPONSES AT " + str(datetime.datetime.now()))
        logTimingData('TIMES: ' + ', '.join(['%s took %s' % (source.sourceName, str(times[source])) for source in times]))

        for source in self.__all_sources:
            if source in results and results[source]:
                logSourceResultsData("\nRESULTS FROM SOURCE " + source.sourceName +
                                     " TIME ELAPSED: " + str(times[source]) + "\n\n")
                for result in results[source]:
                    logSourceResultsData(utils.normalize(repr(result)))

        beforeDeduping = datetime.datetime.now()
        dedupedResults = SearchResultDeduper().dedupeResults(category, results.values())
        afterDeduping = datetime.datetime.now()
        logTimingData("DEDUPING TOOK " + str(afterDeduping - beforeDeduping))
        logTimingData("TIME CHECK DONE AT: " + str(datetime.datetime.now()))
        logTimingData("ELAPSED: " + str(afterDeduping - start))

        logClusterData("\n\nDEDUPED RESULTS\n\n")
        for dedupedResult in dedupedResults[:limit]:
            logClusterData("\n\n%s\n\n" % str(dedupedResult))

        return dedupedResults[:limit]

    def __getEntityIdForCluster(self, cluster):
        idsFromClusteredEntities = []
        fastResolveQueries = []
        for result in cluster.results:
            if result.dataQuality < MIN_RESULT_DATA_QUALITY_TO_INCLUDE:
                continue
            if result.resolverObject.source == 'stamped':
                idsFromClusteredEntities.append(result.resolverObject.key)
            else:
                fastResolveQueries.append((result.resolverObject.source, result.resolverObject.key))

        fastResolvedIds = (filter(None, self.__stampedSource.resolve_fast_batch(fastResolveQueries))
                           if fastResolveQueries else [])
        allIds = idsFromClusteredEntities + fastResolvedIds
        if len(idsFromClusteredEntities) > 2:
            logs.warning('Search results directly clustered multiple StampedSource results: [%s]' %
                         ', '.join(str(entityId) for entityId in idsFromClusteredEntities))
        elif len(allIds) > 2:
            logs.warning('Search results indirectly clustered multiple entity IDs together: [%s]' %
                         ', '.join(str(entityId) for entityId in allIds))
        if not allIds:
            return None
        return allIds[0]

    def __proxyToEntity(self, cluster):
        # Additional level of filtering -- some things get clustered (for the purpose of boosting certain cluster
        # scores) but never included in the final result because we're not 100% sure that the data is good enough to
        # show users.
        filteredResults = [r for r in cluster.results if r.dataQuality >= MIN_RESULT_DATA_QUALITY_TO_INCLUDE]
        # It's pretty common for two listings to have the same or virtually the same data quality, so using relevance
        # as a tie-breaker is really helpful.
        filteredResults.sort(key=lambda r: (r.dataQuality + (r.relevance / 10.0),
                                            r.resolverObject.source, r.resolverObject.key),
                             reverse=True)
        # TODO PRELAUNCH: Only use the best result from each source.
        entity = EntityProxyContainer().addAllProxies(result.resolverObject for result in filteredResults).buildEntity()
        for result in filteredResults:
            entity.addThirdPartyId(result.resolverObject.source, result.resolverObject.key)
        return entity

    @utils.lazyProperty
    def __stampedSource(self):
        return StampedSource()

    def __buildEntity(self, entityId):
        entity = self.__entity_collection.getEntity(entityId)
        entity._maybeRegenerateThirdPartyIds()
        return entity

    def rescoreFinalResults(self, entityAndClusterList):
        def isTempEntity(entity):
            return entity.entity_id is None

        realEntityIds = [entity.entity_id for (entity, cluster) in entityAndClusterList if not isTempEntity(entity)]
        entityStats = self.__stats_collection.getStatsForEntities(realEntityIds)
        statsByEntityId = dict([(stats.entity_id, stats) for stats in entityStats])

        def scoreEntityAndCluster((entity, cluster)):
            if isTempEntity(entity):
                dataScore = cluster.dataQuality
            else:
                numStamps = 0
                if entity.entity_id in statsByEntityId:
                    numStamps = statsByEntityId[entity.entity_id].num_stamps
                dataScore = 1.1 + math.log(numStamps + 1, 50)
            # TODO: Possibly distinguish even more about which of these have rich data. There are some types of data
            # that don't affect dataQuality because they don't make us less certain about the state of a cluster, but
            # they make user interactions with it more positive -- pictures, preview URLs, etc. We should factor
            # these in here.
            return dataScore * cluster.relevance

        entityAndClusterList.sort(key=scoreEntityAndCluster, reverse=True)

    def searchEntitiesAndClusters(self, category, text, timeout=SEARCH_TIMEOUT, limit=10, coords=None):
        clusters = self.search(category, text, timeout=timeout, limit=limit, coords=coords)
        searchDoneTime = datetime.datetime.now()

        entityIdsToNewClusterIdxs = {}
        entitiesAndClusters = []
        for cluster in clusters:
            # TODO: make use of nemesis ids here.
            entityId = self.__getEntityIdForCluster(cluster)
            if not entityId:
                # One more layer of filtering here -- clusters that don't overall hit our quality minimum get
                # dropped. We never drop clusters that resolve to entities for this reason.
                if cluster.dataQuality >= MIN_CLUSTER_DATA_QUALITY:
                    entitiesAndClusters.append((self.__proxyToEntity(cluster), cluster))
                else:
                    logClusterData('DROPPING CLUSTER for poor data quality:\n%s' % cluster)
            # TODO PRELAUNCH: Make sure that the type we get from fast_resolve == the type we get from
            # StampedSourceObject.key, or else using these as keys in a map together won't work.
            elif entityId not in entityIdsToNewClusterIdxs:
                entityIdsToNewClusterIdxs[entityId] = len(entitiesAndClusters)
                entitiesAndClusters.append((self.__buildEntity(entityId), cluster))
            else:
                originalIndex = entityIdsToNewClusterIdxs[entityId]
                (_, originalCluster) = entitiesAndClusters[originalIndex]
                # We're not actually augmenting the result at all here; the result is the unadulterated entity. We
                # won't show an entity augmented with other third-party IDs we've attached in search results because
                # it would create inconsistency with the entity show page and we don't know if they will definitely
                # be attached. The point of the grok is entirely to boost the rank of the cluster (and thus of the
                # entity).
                # TODO PRELAUNCH: Consider overriding this for sparse or user-created entities.
                # TODO: Debug check to see if the two are definitely not a match according to our clustering logic.
                originalCluster.grok(cluster)

        # TODO: Reorder according to final scores that incorporate dataQuality and a richness score (presence of
        # stamps, presence of enriched entity, etc.)
        convertedToEntitiesTime = datetime.datetime.now()
        logTimingData('CONVERTING TO ENTITIES TOOK: %s' % (convertedToEntitiesTime - searchDoneTime))

        self.rescoreFinalResults(entitiesAndClusters)
        rescoredTime = datetime.datetime.now()
        logTimingData('RESCORING TOOK: %s' % (rescoredTime - convertedToEntitiesTime))

        return entitiesAndClusters

    def searchEntities(self, *args, **kwargs):
        return [entity for entity, _ in self.searchEntitiesAndClusters(*args, **kwargs)]
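# Minimal usage sketch (illustrative only, not part of the original module): drives EntitySearch end to end,
# assuming this module's imports are available and that 'music' appears in Constants.categories; the query
# text, limit, and function name are arbitrary.
def _exampleSearch():
    search = EntitySearch()
    # searchEntities() forwards its arguments to searchEntitiesAndClusters() and discards the clusters.
    for entity in search.searchEntities('music', u'abbey road', limit=5):
        print entity.dataExport()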