def main():
    usage = "Usage: %prog [options]"
    version = "%prog " + __version__
    parser = OptionParser(usage=usage, version=version)
    parser.add_option('--dry_run', action='store_true', dest='dry_run', default=False)
    parser.add_option('--nuclear', action='store_true', dest='nuclear', default=False)
    parser.add_option('--max_to_remove', type='int', action='store', dest='max_to_remove', default=None)
    (options, args) = parser.parse_args()
    if options.nuclear:
        print 'F**K THE WORLD'
        if not options.dry_run:
            MongoEntityCollection()._collection.drop()
        # "Welcome to the human race." --Snake Plissken
        return

    entity_collection = MongoEntityCollection()._collection
    entity_ids = [result['_id'] for result in entity_collection.find(fields={'_id':True})]
    todos = MongoTodoCollection()
    stamps = MongoStampCollection()

    removed = 0
    for entity_id in entity_ids:
        if options.max_to_remove is not None and removed >= options.max_to_remove:
            return
        has_attached_user_interactions = (
            list(todos._collection.find({'entity.entity_id' : str(entity_id)}, fields={'_id':1})) or
            list(stamps._collection.find({'entity.entity_id' : str(entity_id)}, fields={'_id':1}))
        )
        if has_attached_user_interactions:
            print 'SKIPPING', entity_id
            continue
        if not options.dry_run:
            entity_collection.remove({'_id':entity_id})
        removed += 1
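

# The find(..., fields={...}) calls above use the legacy pymongo 2.x keyword. As a
# rough illustration (not the project's code), the same orphan check written against
# the pymongo 3+ API would use the `projection` argument instead:
def entity_is_orphaned(entity_id, todo_collection, stamp_collection):
    """Return True if no todo or stamp document references this entity."""
    query = {'entity.entity_id': str(entity_id)}
    has_todo = todo_collection.find_one(query, projection={'_id': 1}) is not None
    has_stamp = stamp_collection.find_one(query, projection={'_id': 1}) is not None
    return not (has_todo or has_stamp)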
Example #2
def main():
    usage = "Usage: %prog --entity_id=<id>  OR  %prod --search_id=<id> OR %prod <query> <subcategory?> <index?>"
    version = "%prog " + __version__
    parser = OptionParser(usage=usage, version=version)
    parser.add_option('--entity_id', action='store', dest='entity_id', default=None)
    parser.add_option('--search_id', action='store', dest='search_id', default=None)
    (options, args) = parser.parse_args()

    if options.entity_id and options.search_id:
        print '--entity_id and --search_id are mutually exclusive!'
        return
    id_provided = options.entity_id or options.search_id
    if id_provided and len(args) > 1:
        print '--entity_id and --search_id cannot be used with query arguments!'
        return

    if options.entity_id:
        from api.db.mongodb.MongoEntityCollection import MongoEntityCollection
        entity = MongoEntityCollection().getEntity(options.entity_id)
    elif options.search_id:
        entity = getEntityFromSearchId(options.search_id)
    else:
        query = buildQueryFromArgs(args)
        from api.MongoStampedAPI import MongoStampedAPI
        stampedAPI = MongoStampedAPI()
        cursor = stampedAPI._entityDB._collection.find(query)
        if cursor.count() == 0:
            print("Could not find a matching entity for query: %s" % query)
            return

        entity = stampedAPI._entityDB._convertFromMongo(cursor[0])


    print( "Before:\n%s" % pformat( entity.dataExport() ) )

    container = FullResolveContainer()

    decorations = {}
    container.enrichEntity( entity, decorations )

    print( "After:\n%s" % pformat( entity.dataExport() ) )
    if len(decorations) > 0:
        print( "With decorations:")

        for k,v in decorations.items():
            print( "%s decoration:" % k )

            try:
                print( "%s" % pformat(v.dataExport()) )
            except Exception:
                print( "%s" % pformat(v) )

    from libs.CountedFunction import printFunctionCounts
    printFunctionCounts()
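
# Illustrative invocations (the script name is hypothetical; the three accepted
# forms are those listed in the usage string above):
#   python resolve_entity.py --entity_id=4e4c67f226f05a2ba9000002
#   python resolve_entity.py --search_id=<search_id>
#   python resolve_entity.py <query> [subcategory] [index]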
Example #3
def main():
    usage = "Usage: %prog [options]"
    version = "%prog " + __version__
    parser = OptionParser(usage=usage, version=version)
    parser.add_option('--dry_run', action='store_true', dest='dry_run', default=False)
    # TODO: Ability to limit by vertical
    parser.add_option('--max_checks', type='int', action='store', dest='max_checks', default=-1)
    parser.add_option('--max_errors', type='int', action='store', dest='max_errors', default=-1)
    parser.add_option('--stamped_only', action='store_true', dest='stamped_only', default=False)
    parser.add_option('--report_out', action='store', dest='report_out', default=None)
    (options, args) = parser.parse_args()

    if not options.report_out:
        raise Exception('--report_out is required!')

    all_entity_ids = getAllEntityIds(options.stamped_only)
    random.shuffle(all_entity_ids)
    error_entity_ids = []
    entities_checked = 0
    entity_collection = MongoEntityCollection()
    report_file = open(options.report_out, 'w')
    if options.max_checks > 0:
        all_entity_ids = all_entity_ids[:options.max_checks]
    for entity_id in all_entity_ids:
        if options.max_errors > 0 and len(error_entity_ids) >= options.max_errors:
            break
        try:
            entities_checked += 1
            entity = entity_collection.getEntity(entity_id)
            well_resolved = entityIsWellResolved(entity, report_file)
            if not well_resolved:
                error_entity_ids.append(entity_id)
        except ValueError:
            pass

    report_file.close()

    print 'Of %d entities examined, %d were found to have errors!' % (entities_checked, len(error_entity_ids))
    for entity_id in error_entity_ids:
        print entity_id

    for (source, num_attempts) in sourceAttemptCounts.items():
        print('source %s was seen in %d entities, and %d of those references were broken' % (
            source, num_attempts, sourceFailureCounts[source]
        ))
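

# sourceAttemptCounts and sourceFailureCounts are module-level globals that are not
# shown in this snippet. A minimal sketch of how they could be maintained (an
# assumption, not the project's actual implementation) is a pair of defaultdicts
# updated wherever a third-party reference is checked:
from collections import defaultdict

sourceAttemptCounts = defaultdict(int)
sourceFailureCounts = defaultdict(int)

def recordSourceCheck(source, resolved_ok):
    # Count how often each source appears on an entity and how often its
    # reference fails to resolve.
    sourceAttemptCounts[source] += 1
    if not resolved_ok:
        sourceFailureCounts[source] += 1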
Example #4
    def __init__(self):
        allCategories = Constants.categories
        self.__all_sources = []
        self.__entity_collection = MongoEntityCollection()
        self.__stats_collection = MongoEntityStatsCollection()
        # Within each category, we have a number of sources and each is assigned a priority. The priority is used to
        # determine how long to wait for results from that source.
        self.__categories_to_sources_and_priorities = {}
        for category in allCategories:
            self.__categories_to_sources_and_priorities[category] = []

        self.__registerSource(StampedSource(), music=3, film=3, book=3, app=3, place=3)
        self.__registerSource(iTunesSource(), music=10, film=10, book=3, app=10)
        # TODO: Enable film for Amazon. Amazon film results blend TV and movies and have better retrieval than
        # iTunes. On the other hand, they're pretty dreadful -- no clear distinction between TV and movies, no
        # clear distinction between individual movies and box sets, etc.
        self.__registerSource(AmazonSource(), music=5, book=10)
        self.__registerSource(FactualSource(), place=8)
        self.__registerSource(GooglePlacesSource(), place=8)
        self.__registerSource(RdioSource(), music=8)
        self.__registerSource(SpotifySource(), music=8)
        self.__registerSource(TMDBSource(), film=8)
        self.__registerSource(TheTVDBSource(), film=8)
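
        # For illustration (derived from the registrations above), the resulting
        # mapping for the 'music' category holds the source instances paired with
        # their priorities, roughly:
        #   [(<StampedSource>, 3), (<iTunesSource>, 10), (<AmazonSource>, 5),
        #    (<RdioSource>, 8), (<SpotifySource>, 8)]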
Example #5
    def test_db_fixture_string(self):
        # For this test, there is just hard-coded fixture text with no regenerate function, so we will always just
        # get this string.
        entityCollection = MongoEntityCollection()
        entity = entityCollection.getEntity("4e4c67f226f05a2ba9000002")
        print "The entity I got is:\n\n", entity, "\n\n"
Example #6
class EntitySearch(object):
    def __registerSource(self, source, **categoriesToPriorities):
        self.__all_sources.append(source)
        for (category, priority) in categoriesToPriorities.items():
            if category not in Constants.categories:
                raise Exception("unrecognized category: %s" % category)
            self.__categories_to_sources_and_priorities[category].append((source, priority))

    def __init__(self):
        allCategories = Constants.categories
        self.__all_sources = []
        self.__entity_collection = MongoEntityCollection()
        self.__stats_collection = MongoEntityStatsCollection()
        # Within each category, we have a number of sources and each is assigned a priority. The priority is used to
        # determine how long to wait for results from that source.
        self.__categories_to_sources_and_priorities = {}
        for category in allCategories:
            self.__categories_to_sources_and_priorities[category] = []

        self.__registerSource(StampedSource(), music=3, film=3, book=3, app=3, place=3)
        self.__registerSource(iTunesSource(), music=10, film=10, book=3, app=10)
        # TODO: Enable film for Amazon. Amazon film results blend TV and movies and have better retrieval than
        # iTunes. On the other hand, they're pretty dreadful -- no clear distinction between TV and movies, no
        # clear distinction between individual movies and box sets, etc.
        self.__registerSource(AmazonSource(), music=5, book=10)
        self.__registerSource(FactualSource(), place=8)
        self.__registerSource(GooglePlacesSource(), place=8)
        self.__registerSource(RdioSource(), music=8)
        self.__registerSource(SpotifySource(), music=8)
        self.__registerSource(TMDBSource(), film=8)
        self.__registerSource(TheTVDBSource(), film=8)

    def __terminateWaiting(self, pool, start_time, category, resultsDict):
        logTimingData('IN TERMINATE WAITING')
        sources_to_priorities = dict(self.__categories_to_sources_and_priorities[category])
        total_value_received = 0
        total_potential_value_outstanding = sum(sources_to_priorities.values())
        sources_seen = set()
        while True:
            try:
                elapsed_seconds = total_seconds(datetime.datetime.now() - start_time)

                if elapsed_seconds >= 7:
                    logs.warning('Search completely timed out at 7s!')
                    pool.kill()
                    return

                for (source, results) in resultsDict.items():
                    if source in sources_seen:
                        continue
                    logTimingData('JUST NOW SEEING SOURCE: ' + source.sourceName)
                    sources_seen.add(source)
                    # If a source returns at least 5 results, we assume we got a good result set from it. If it
                    # returns less, we're more inclined to wait for straggling sources.
                    total_value_received += sources_to_priorities[source] * min(5, len(results)) / 5.0
                    logTimingData('DECREMENTING OUTSTANDING BY ' + str(sources_to_priorities[source]) + ' FOR SOURCE ' + source.sourceName)
                    total_potential_value_outstanding -= sources_to_priorities[source]
                logTimingData('AT %f seconds elapsed, TOTAL VALUE RECEIVED IS %f, TOTAL OUTSTANDING IS %f' % (
                        elapsed_seconds, total_value_received, total_potential_value_outstanding
                    ))
            except Exception:
                logs.warning('TERMINATE_WARNING SHIT IS F****D')
                logs.report()
                raise

            if total_potential_value_outstanding <= 0:
                logTimingData('ALL SOURCES DONE')
                return

            if total_value_received:
                marginal_value_of_outstanding_sources = total_potential_value_outstanding / total_value_received
                # Comes out to:
                #   0.08 for 1s
                #   0.25 for 1.5s
                #   0.79 for 2s
                #   2.51 for 2.5s
                #   7.94 for 3s
                # So we'll ditch that 4th remaining source for music around 1.5s; we'll ditch the second source for
                # something like Places around 2s; we'll ditch any lingering source around 3s if we've received
                # anything.
                min_marginal_value = 10 ** (elapsed_seconds - 2.1)
                if min_marginal_value > marginal_value_of_outstanding_sources:
                    sources_not_seen = [
                        source.sourceName for source in sources_to_priorities.keys() if source not in sources_seen
                    ]
                    if sources_not_seen:
                        # This is interesting information whether we want the full timing data logged or not.
                        log_template = 'QUITTING EARLY: At %f seconds elapsed, bailing on sources [%s] because with ' + \
                            'value received %f, value outstanding %f, marginal value %f, min marginal value %f'
                        logs.debug(log_template % (
                            elapsed_seconds, ', '.join(sources_not_seen), total_value_received,
                            total_potential_value_outstanding, marginal_value_of_outstanding_sources, min_marginal_value
                        ))
                    pool.kill()
                    return

            gevent.sleep(0.01)

    def __searchSource(self, source, queryCategory, queryText, resultsDict, timesDict, **queryParams):
        try:
            # Note that the timing here is not 100% legit because gevent won't interrupt code except on I/O, but it's good
            # enough to give a solid idea.
            before = datetime.datetime.now()
            if shouldLogRawSourceResults:
                queryParams['logRawResults'] = True
            results = source.searchLite(queryCategory, queryText, **queryParams)

            after = datetime.datetime.now()
            # First level of filtering on data quality score -- results that are really horrendous get dropped entirely
            # pre-clustering.
            filteredResults = [result for result in results if result.dataQuality >= MIN_RESULT_DATA_QUALITY_TO_CLUSTER]
            timesDict[source] = after - before
            logs.debug("GOT RESULTS FROM SOURCE %s IN ELAPSED TIME %s -- COUNT: %d, AFTER FILTERING: %d" % (
                source.sourceName, str(after - before), len(results), len(filteredResults)
            ))
            resultsDict[source] = filteredResults
        except GreenletExit:
            pass
        except Exception:
            logs.report()
            resultsDict[source] = []

    def search(self, category, text, timeout=SEARCH_TIMEOUT, limit=10, coords=None):
        if not isinstance(text, unicode):
            text = text.decode('utf-8')
        if category not in Constants.categories:
            raise Exception("unrecognized category: (%s)" % category)

        start = datetime.datetime.now()
        results = {}
        times = {}
        pool = utils.LoggingThreadPool(len(self.__categories_to_sources_and_priorities))

        def termWaiting():
            logs.debug('in termWaiting')
            try:
                return self.__terminateWaiting(pool, datetime.datetime.now(), category, results)
            except Exception:
                logs.report()
            logs.debug('done with termWaiting')

        logs.debug("SHOULD_DISABLE_TIMEOUT IS " + str(shouldDisableTimeout))
        if not shouldDisableTimeout:
            logTimingData('SPAWNING TERMINATE WAITING')
            #pool.spawn(self.__terminateWaiting, pool, datetime.datetime.now(), category, results)
            pool.spawn(termWaiting)

        for (source, priority) in self.__categories_to_sources_and_priorities[category]:
            # TODO: Handing the exact same timeout down to the inner call is probably wrong because we end up in this
            # situation where outer pools and inner pools are using the same timeout and possibly the outer pool will
            # nix the whole thing before the inner pool cancels out, which is what we'd prefer so that it's handled
            # more gracefully.
            pool.spawn(self.__searchSource, source, category, text, results, times, timeout=timeout, coords=coords)


        logTimingData("TIME CHECK ISSUED ALL QUERIES AT " + str(datetime.datetime.now()))
        pool.join()
        logTimingData("TIME CHECK GOT ALL RESPONSES AT " + str(datetime.datetime.now()))

        logTimingData('TIMES: ' + (', '.join(['%s took %s' % (source.sourceName, str(times[source])) for source in times])))
        for source in self.__all_sources:
            if source in results and results[source]:
                logSourceResultsData("\nRESULTS FROM SOURCE " + source.sourceName + " TIME ELAPSED: " + str(times[source]) + "\n\n")
                for result in results[source]:
                    logSourceResultsData(utils.normalize(repr(result)))

        beforeDeduping = datetime.datetime.now()
        dedupedResults = SearchResultDeduper().dedupeResults(category, results.values())
        afterDeduping = datetime.datetime.now()
        logTimingData("DEDUPING TOOK " + str(afterDeduping - beforeDeduping))
        logTimingData("TIME CHECK DONE AT:" + str(datetime.datetime.now()))
        logTimingData("ELAPSED:" + str(afterDeduping - start))

        logClusterData("\n\nDEDUPED RESULTS\n\n")
        for dedupedResult in dedupedResults[:limit]:
            logClusterData("\n\n%s\n\n" % str(dedupedResult))

        return dedupedResults[:limit]


    def __getEntityIdForCluster(self, cluster):
        idsFromClusteredEntities = []
        fastResolveQueries = []
        for result in cluster.results:
            if result.dataQuality < MIN_RESULT_DATA_QUALITY_TO_INCLUDE:
                continue
            if result.resolverObject.source == 'stamped':
                idsFromClusteredEntities.append(result.resolverObject.key)
            else:
                fastResolveQueries.append((result.resolverObject.source, result.resolverObject.key))

        fastResolvedIds = filter(None, self.__stampedSource.resolve_fast_batch(fastResolveQueries)) if fastResolveQueries else []

        allIds = idsFromClusteredEntities + fastResolvedIds
        if len(idsFromClusteredEntities) > 2:
            logs.warning('Search results directly clustered multiple StampedSource results: [%s]' %
                         ', '.join(str(entityId) for entityId in idsFromClusteredEntities))
        elif len(allIds) > 2:
            logs.warning('Search results indirectly clustered multiple entity IDs together: [%s]' %
                         ', '.join(str(entityId) for entityId in allIds))
        if not allIds:
            return None
        return allIds[0]


    def __proxyToEntity(self, cluster):
        # Additional level of filtering -- some things get clustered (for the purpose of boosting certain cluster
        # scores) but never included in the final result because we're not 100% sure that the data is good enough to show
        # users.
        filteredResults = [r for r in cluster.results if r.dataQuality >= MIN_RESULT_DATA_QUALITY_TO_INCLUDE]
        # So this is ugly, but it's pretty common for two listings to have the same or virtually the same data quality
        # and using relevance as a tie-breaker is really helpful.
        filteredResults.sort(key=lambda r: (r.dataQuality + (r.relevance / 10.0), r.resolverObject.source, r.resolverObject.key), reverse=True)
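        # For example, with dataQuality 0.90 for both results and relevance scores of
        # 0.8 vs. 0.3, the keys come out 0.98 vs. 0.93, so the more relevant listing wins the tie.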
        # TODO PRELAUNCH: Only use the best result from each source.
        entity = EntityProxyContainer().addAllProxies(result.resolverObject for result in filteredResults).buildEntity()
        for result in filteredResults:
            entity.addThirdPartyId(result.resolverObject.source, result.resolverObject.key)
        return entity

    @utils.lazyProperty
    def __stampedSource(self):
        return StampedSource()


    def __buildEntity(self, entityId):
        entity = self.__entity_collection.getEntity(entityId)
        entity._maybeRegenerateThirdPartyIds()
        return entity


    def rescoreFinalResults(self, entityAndClusterList):
        def isTempEntity(entity):
            return entity.entity_id is None
        realEntityIds = [ entity.entity_id for (entity, cluster) in entityAndClusterList if not isTempEntity(entity) ]
        entityStats = self.__stats_collection.getStatsForEntities(realEntityIds)
        statsByEntityId = dict([(stats.entity_id, stats) for stats in entityStats])

        def scoreEntityAndCluster((entity, cluster)):
            if isTempEntity(entity):
                dataScore = cluster.dataQuality
            else:
                numStamps = 0
                if entity.entity_id in statsByEntityId:
                    numStamps = statsByEntityId[entity.entity_id].num_stamps
                dataScore = 1.1 + math.log(numStamps+1, 50)
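                # e.g. 0 stamps -> 1.1, 49 stamps -> 2.1, 2499 stamps -> 3.1;
                # the log base of 50 makes the stamp-count boost grow slowly.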

            # TODO: Possibly distinguish even more about which of these have rich data. There are some types of data
            # that don't affect dataQuality because they don't make us less certain about the state of a cluster, but
            # they make user interactions with it more positive -- pictures, preview URLs, etc. We should factor
            # these in here.
            return dataScore * cluster.relevance

        entityAndClusterList.sort(key=scoreEntityAndCluster, reverse=True)


    def searchEntitiesAndClusters(self, category, text, timeout=SEARCH_TIMEOUT, limit=10, coords=None):
        clusters = self.search(category, text, timeout=timeout, limit=limit, coords=coords)
        searchDoneTime = datetime.datetime.now()
        entityResults = []

        entityIdsToNewClusterIdxs = {}
        entitiesAndClusters = []
        for cluster in clusters:
            # TODO: make use of nemesis ids here.
            entityId = self.__getEntityIdForCluster(cluster)
            if not entityId:
                # One more layer of filtering here -- clusters that don't overall hit our quality minimum get
                # dropped. We never drop clusters that resolve to entities for this reason.
                if cluster.dataQuality >= MIN_CLUSTER_DATA_QUALITY:
                    entitiesAndClusters.append((self.__proxyToEntity(cluster), cluster))
                else:
                    logClusterData('DROPPING CLUSTER for shitty data quality:\n%s' % cluster)

            # TODO PRELAUNCH: Make sure that the type we get from fast_resolve == the type we get from
            # StampedSourceObject.key, or else using these as keys in a map together won't work.
            elif entityId not in entityIdsToNewClusterIdxs:
                entityIdsToNewClusterIdxs[entityId] = len(entitiesAndClusters)
                entitiesAndClusters.append((self.__buildEntity(entityId), cluster))
            else:
                originalIndex = entityIdsToNewClusterIdxs[entityId]
                (_, originalCluster) = entitiesAndClusters[originalIndex]
                # We're not actually augmenting the result at all here; the result is the unadulterated entity. We won't
                # show an entity augmented with other third-party IDs we've attached in search results because it will
                # create inconsistency for the entity show page and we don't know if they will definitely be attached.
                # The point of the grok is entirely to boost the rank of the cluster (and thus of the entity).
                # TODO PRELAUNCH: Consider overriding this for sparse or user-created entities.
                # TODO: Debug check to see if the two are definitely not a match according to our clustering logic.
                originalCluster.grok(cluster)

        # TODO: Reorder according to final scores that incorporate dataQuality and a richness score (presence of stamps,
        # presence of enriched entity, etc.)

        convertedToEntitiesTime = datetime.datetime.now()
        logTimingData('CONVERTING TO ENTITIES TOOK: %s' % (convertedToEntitiesTime - searchDoneTime))

        self.rescoreFinalResults(entitiesAndClusters)
        rescoredTime = datetime.datetime.now()
        logTimingData('RESCORING TOOK: %s' % (rescoredTime - convertedToEntitiesTime))

        return entitiesAndClusters


    def searchEntities(self, *args, **kwargs):
        return [entity for entity, _ in self.searchEntitiesAndClusters(*args, **kwargs)]
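

# The early-exit heuristic in EntitySearch.__terminateWaiting can be summarized as a
# small standalone sketch (for illustration only, not part of the class): lingering
# sources are abandoned once the value still outstanding, relative to the value
# already received, drops below an exponential threshold that rises with elapsed time.
def should_bail_on_stragglers(elapsed_seconds, total_value_received,
                              total_potential_value_outstanding):
    if not total_value_received:
        # Nothing received yet; keep waiting (the 7-second hard cap still applies).
        return False
    marginal_value = float(total_potential_value_outstanding) / total_value_received
    # 10 ** (1.5 - 2.1) ~= 0.25 and 10 ** (3.0 - 2.1) ~= 7.94, matching the worked
    # values in the comment inside __terminateWaiting.
    min_marginal_value = 10 ** (elapsed_seconds - 2.1)
    return min_marginal_value > marginal_value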