def enrichEntity(self, entity, decorations, max_iterations=None, timestamp=None):
        """
            (might be named enrichedEntityWithSources)
        enrichEntity takes a entity schema object (defined in api/Schemas.py), an output dict of decorations that is
            opaque to this class - only group objects and sources have an understanding of the decorations format
            the group method syncDecorations() handles all propagation of source local decorations to the output decoration dict
          returns a bool value indicating whether the entity was enriched
        """
        self.setNow(timestamp)
        max_iterations = max_iterations or self.__default_max_iterations
        modified_total = False
        logs.debug("Begin enrichment: %s (%s)" % (entity.title, entity.entity_id))

        # We will loop through all sources multiple times, because as data is enriched, previous unresolvable sources
        # may become resolvable and can enrich in turn.  If no fields are modified by any source in a given iteration,
        # then there's no reason to loop again
        for i in range(max_iterations):
            modified = False
            for source in self.__sources:
                if entity.kind not in source.kinds:
                    continue

                if entity.types and source.types and not set(entity.types).intersection(source.types):
                    continue

                groups = source.getGroups(entity)
                targetGroups = set()
                for group in groups:
                    if self.shouldEnrich(group, source.sourceName, entity):
                        targetGroups.add(group)
                if not targetGroups:
                    continue

                #  We have groups that are eligible for enrichment.  We'll modify a deep-copy of the entity
                copy = buildEntity(entity.dataExport())
                # timestamps is passed down to the source. If the source enriches a group, a mapping is added from the
                # group name to the time it was enriched (now, essentially). When the data we get from the external
                # source is identical to what we already have, the presence of the group in this map is the only way
                # we can tell that we received fresh data.
                # TODO: This is a dictionary for legacy reasons, it should really be a set.
                timestamps = {}
                localDecorations = {}  # opaque decorations, for group object based extensions (i.e. Menus)
                logs.debug("Enriching with '%s' for groups %s" % (source.sourceName, sorted(targetGroups)))
                groupObjs = [self.getGroup(group) for group in targetGroups]
                try:
                    enriched = source.enrichEntity(copy, groupObjs, self, localDecorations, timestamps)
                    if enriched:
                        for groupObj in groupObjs:
                            fieldsChanged = groupObj.syncFields(copy, entity)
                            decorationsChanged = groupObj.syncDecorations(localDecorations, decorations)
                            if fieldsChanged or groupObj.groupName in timestamps or decorationsChanged:
                                groupObj.setTimestamp(entity, self.now)
                                groupObj.setSource(entity, source.sourceName)
                                modified = True
                except Exception as e:
                    report()
            if not modified:
                break
            modified_total |= modified
        return modified_total
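
A minimal usage sketch of the contract the docstring describes (the controller construction and the entity fields below are assumptions for illustration, not taken from the source):

# Hypothetical driver code; only the call shape matches the docstring above:
# pass an entity plus an output decorations dict, read back a bool.
controller = EnrichmentController()                              # assumed constructor name
entity = buildEntity({'title': 'Some Place', 'kind': 'place'})   # field names are a guess
decorations = {}
if controller.enrichEntity(entity, decorations, max_iterations=3):
    print 'enriched %s; decoration keys: %s' % (entity.title, decorations.keys())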
Example #2
 def termWaiting():
     logs.debug('in termWaiting')
     try:
         return self.__terminateWaiting(pool, datetime.datetime.now(), category, results)
     except Exception:
         logs.report()
     logs.debug('done with termWaiting')
Example #3
 def tracks(self):
     # We might be missing related items data entirely, in which case we start by issuing a lookup there.
     # TODO: This probably could be done as part of one lookup with the one about to be made.
     try:
         tracks = list(xp(self.data, 'RelatedItems')['c']['RelatedItem'])
     except KeyError:
         try:
             self._issueLookup()
         except LookupRequiredError:
             return []
     try:
         tracks = list(xp(self.data, 'RelatedItems')['c']['RelatedItem'])
         page_count = int(xp(self.data, 'RelatedItems', 'RelatedItemPageCount')['v'])
         for i in range(1,page_count):
             page = i+1
             self.countLookupCall('tracks')
             data = globalAmazon().item_lookup(ItemId=self.key,
                                               ResponseGroup='Large,RelatedItems',
                                               RelationshipType='Tracks',
                                               RelatedItemPage=str(page),
                                               timeout=MERGE_TIMEOUT)
             tracks.extend( xp(data, 'ItemLookupResponse', 'Items', 'Item', 'RelatedItems')['c']['RelatedItem'] )
         track_d = {}
         for track in tracks:
             track_d[ int(xp(track, 'Item', 'ItemAttributes', 'TrackSequence')['v']) ] = {
                 'name' : xp(track, 'Item', 'ItemAttributes', 'Title')['v'],
                 'key' : xp(track, 'Item', 'ASIN')['v'],
             }
         return [ track_d[k] for k in sorted(track_d) ]
     except LookupRequiredError:
         return []
     except Exception:
         # TODO: It seems possible that only one of the requests failed; shouldn't we keep the results of the others?
         report()
         return []
Example #4
    def source(start, count):
        total = start + count
        while total > len(results):
            try:
                value = None
                if tolerant:
                    try:
                        value = constructor(generator.next())
                    except StopIteration:
                        raise
                    except Exception:
                        logs.report()
                else:
                    value = constructor(generator.next())
                if value is not None:
                    if unique:
                        if value not in value_set:
                            results.append(value)
                            value_set.add(value)
                    else:
                        results.append(value)
            except StopIteration:
                break

        result = results[start:]
        return result
Example #5
 def __searchEntityTypeLite(self, entityType, queryText, resultsDict, timeout):
     try:
         if isinstance(queryText, unicode):
             queryText = queryText.encode('utf-8')
         resultsDict[entityType] = self.__itunes.method('search', entity=entityType, term=queryText, priority='high',
             timeout=timeout)['results']
     except Exception:
         logs.report()
Example #6
 def wrapperFn():
     try:
         return logs.runInOtherLoggingContext(userFn, currLoggingContext)
     except GreenletExit:
         # If we deliberately killed the thread, don't log that. Makes the search logs really noisy.
         pass
     except:
         logs.report()
Example #7
def writeComparisons(oldResults, newResults, outputDir):
    oldKeys = oldResults.viewkeys()
    newKeys = newResults.viewkeys()
    if oldKeys ^ newKeys:
        print 'WARNING: old and new results have mismatched keys:'
        print '%d OLD KEYS:' % len(oldKeys - newKeys), oldKeys - newKeys
        print '%d NEW KEYS:' % len(newKeys - oldKeys), newKeys - oldKeys

    changedRows = []
    clusteringChanges = []
    allRows = []
    commonKeys = oldKeys & newKeys
    for key in commonKeys:
        oldResolved, oldOriginal, oldProxyList = oldResults[key]
        newResolved, newOriginal, newProxyList = newResults[key]

        filename = key[:40] + '.html'
        oldData = __stripEntity(oldResolved.dataExport())
        newData = __stripEntity(newResolved.dataExport())
        try:
            with open(path.join(outputDir, filename), 'w') as fout:
                print >> fout, DIFF_FILE_HEADER
                print >> fout, '<h1>%s</h1>' % 'Enrich Input'
                print >> fout, __createDiffTable(pprint.pformat(oldOriginal), pprint.pformat(newOriginal))
                print >> fout, '<h1>%s</h1>' % 'Resolve output'
                print >> fout, __createDiffTable(pprint.pformat(oldResolved.dataExport()), pprint.pformat(newResolved.dataExport()))
                print >> fout, '<h1>%s</h1>' % 'List of resolver objects:'
                print >> fout, __createDiffTable(__formatProxyList(oldProxyList), __formatProxyList(newProxyList))
                print >> fout, '</body></html>'
        except Exception:
            logs.warning('Error writing diff file!')
            logs.report()
        diffLink = '<td><a href="%s">show diffs</a></td>' % filename

        tableRow = '<tr><td>%s</td>%s</tr>' % (oldOriginal['title'][:100], diffLink)
        if oldData != newData:
            changedRows.append(tableRow)
        if __hasClusteringChange(oldProxyList, newProxyList):
            clusteringChanges.append(tableRow)
        allRows.append(tableRow)
    allRowsFilename = 'index_all.html'
    writeTableOfContent(allRows, 'All results', path.join(outputDir, allRowsFilename))

    summary = """
        %d out of %d (%f%%) of the rows had clustering change. Here's a shuffled list of them.
        <a href="%s">show all</a>
        """ % (len(clusteringChanges), len(allRows), float(len(clusteringChanges)) * 100 / len(allRows), allRowsFilename)
    random.shuffle(clusteringChanges)
    writeTableOfContent(clusteringChanges, summary, path.join(outputDir, 'index_cluster.html'))

    summary = """
        %d out of %d (%f%%) of the rows changed. Here's a shuffled list of them.
        <a href="%s">show all</a>
        """ % (len(changedRows), len(allRows), float(len(changedRows)) * 100 / len(allRows), allRowsFilename)
    random.shuffle(changedRows)
    writeTableOfContent(changedRows, summary, path.join(outputDir, 'index.html'))
Example #8
 def wrapper(worker, job):
     try:
         task_id, key, data, extra = pickleDecoder(job.data)
         handler(task_id, key, data, **extra)
     except Exception as e:
         basic_message = "Invalid job: %s %s" % (job.task, job.unique)
         logs.error(basic_message)
         logs.report()
         _warningEmail(basic_message)
     return ''
Example #9
    def getProxiesForEntity(self, entity):
        source_id = self.getId(entity)
        if source_id is None:
            try:
                query = self.stamped.proxyFromEntity(entity)
                results = self.resolve(query)
                return [result[1] for result in results if result[0]['resolved']]
            except ValueError:
                logs.report()
                return []

        return [self.entityProxyFromKey(source_id, entity=entity)]
Example #10
    def entityProxyFromKey(self, key, **kwargs):
        try:
            lookupData = globalAmazon().item_lookup(ResponseGroup='Large', ItemId=key, timeout=MERGE_TIMEOUT)
            result = _getLookupResult(lookupData)
            kind = xp(result, 'ItemAttributes', 'ProductGroup')['v'].lower()
            logs.debug(kind)

            if kind == 'book' or kind == 'ebooks':
                return AmazonBook(key, result, 0)
            if kind == 'video games':
                return AmazonVideoGame(key, result, 0)
            return self.__constructMusicObjectFromResult(result, 0)
        except KeyError:
            logs.report()
        return None
Example #11
 def enrichEntity(self, entity, groups, controller, decorations, timestamps):
     singleplatform_id = getattr(entity.sources, 'singleplatform_id')
     try:
         if singleplatform_id is not None:
             if controller.shouldEnrich('menu', self.sourceName, entity):
                 menu = self.__singleplatform.get_menu_schema(singleplatform_id, timeout=MERGE_TIMEOUT)
                 entity.menu = menu != None
                 if menu is not None:
                     menu.entity_id = entity.entity_id
                     decorations['menu'] = menu
                     logs.debug('Regenerated menu for %s' % singleplatform_id)
     except HTTPError as e:
         logs.warning("HttpError %s from SinglePlatform for %s" % (e.code,singleplatform_id))
     except Exception as e:
         report("unexpected SinglePlatformSource error: %s" % e)
     return True
Example #12
 def wrapper(task_id, key, data, **kwargs):
     try:
         logs.begin(saveLog=api._logsDB.saveLog,
                    saveStat=api._statsDB.addStat,
                    nodeName=api.node_name)
         logs.async_request(key)
         logs.info("Request %s: %s: %s: %s" % (task_id, key, data, kwargs))
         handler(task_id, key, data, **kwargs)
         logs.info("Finished with request %s" % (task_id,))
     except Exception as e:
         logs.error("Failed request %s" % (task_id,))
         logs.report()
         _warningEmail('%s - %s failed (%s)' % (api.node_name, key, datetime.utcnow().isoformat()))
     finally:
         logs.info('Saving request log for request %s' % (task_id,))
         try:
             logs.save()
         except Exception:
             print 'Unable to save logs'
             import traceback
             traceback.print_exc()
             logs.warning(traceback.format_exc())
Example #13
    def _convertFromMongo(self, document, mini=False):
        if document is None:
            return None

        if '_id' in document and self._primary_key is not None:
            document[self._primary_key] = self._getStringFromObjectId(document['_id'])
            del(document['_id'])

        ### HACK: Verify that 'created' timestamp exists for entity
        if 'timestamp' not in document or 'created' not in document['timestamp']:
            try:
                created = ObjectId(document[self._primary_key]).generation_time.replace(tzinfo=None)
            except:
                report()
                raise
            document['timestamp'] = { 'created' : created }

        document.pop('titlel', None)
        document.pop('search_tokens', None)

        entity = buildEntity(document, mini=mini)

        return entity
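
The 'created' fallback above leans on the fact that a bson ObjectId embeds its creation time; a small sketch of just that piece (the id string is made up for illustration):

from bson.objectid import ObjectId

doc_id = '4f8e1d2c3b9a7e0012345678'         # hypothetical 24-character hex id
created = ObjectId(doc_id).generation_time  # timezone-aware UTC datetime
created = created.replace(tzinfo=None)      # match the naive datetimes stored above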
Example #14
    def __searchSource(self, source, queryCategory, queryText, resultsDict, timesDict, **queryParams):
        try:
            # Note that the timing here is not 100% legit because gevent won't interrupt code except on I/O, but it's good
            # enough to give a solid idea.
            before = datetime.datetime.now()
            if shouldLogRawSourceResults:
                queryParams['logRawResults'] = True
            results = source.searchLite(queryCategory, queryText, **queryParams)

            after = datetime.datetime.now()
            # First level of filtering on data quality score -- results that are really horrendous get dropped entirely
            # pre-clustering.
            filteredResults = [result for result in results if result.dataQuality >= MIN_RESULT_DATA_QUALITY_TO_CLUSTER]
            timesDict[source] = after - before
            logs.debug("GOT RESULTS FROM SOURCE %s IN ELAPSED TIME %s -- COUNT: %d, AFTER FILTERING: %d" % (
                source.sourceName, str(after - before), len(results), len(filteredResults)
            ))
            resultsDict[source] = filteredResults
        except GreenletExit:
            pass
        except:
            logs.report()
            resultsDict[source] = []
Example #15
import Globals
from logs import report

try:
    import logs
    from resolve.Resolver           import *
    from resolve.ResolverObject     import *
    from resolve.TitleUtils         import *
    from libs.Rdio                  import Rdio, globalRdio
    from resolve.GenericSource      import GenericSource, MERGE_TIMEOUT, SEARCH_TIMEOUT
    from utils                      import lazyProperty
    from pprint                     import pformat
    from search.ScoringUtils        import *
except:
    report()
    raise

class _RdioObject(object):
    """
    Abstract superclass (mixin) for Rdio objects.

    _RdioObjects can be instantiated with either the rdio_id or the rdio data for an entity.
    If both are provided, they must match. extras may be used to retrieve additional data
    when instantiating an object using only its id.

    Attributes:

    data - the type-specific rdio data for the entity
    rdio - an instance of Rdio (API proxy)
    """
Example #16
    def searchLite(self, queryCategory, queryText, timeout=None, coords=None, logRawResults=False):
        if queryCategory not in ('music', 'film', 'app', 'book'):
            raise NotImplementedError()

        supportedProxyTypes = {
            'music': (iTunesArtist, iTunesAlbum, iTunesTrack),
            'film': (iTunesMovie, iTunesTVShow),
            'app': (iTunesApp,),
            'book': (iTunesBook,),
        }[queryCategory]

        types = mapCategoryToTypes(queryCategory)
        iTunesTypes = []
        typesMap = dict(self.__types_to_itunes_strings)
        for entityType in types:
            iTunesTypes.append(typesMap[entityType])

        pool = Pool(len(iTunesTypes))
        rawResults = {}
        for iTunesType in iTunesTypes:
            pool.spawn(self.__searchEntityTypeLite, iTunesType, queryText, rawResults, timeout)
        pool.join(timeout=timeout)

        if logRawResults:
            logComponents = ["\n\n\nITUNES RAW RESULTS\nITUNES RAW RESULTS\nITUNES RAW RESULTS\n\n\n"]

        searchResultsByType = {}
        # Convert from JSON objects to entity proxies. Pass through actual parsing errors, but report & drop the result
        # if we just see a type we aren't expecting. (Music search will sometimes return podcasts, for instance.)
        for (iTunesType, rawTypeResults) in rawResults.items():
            processedResults = []
            for rawResult in rawTypeResults:
                try:
                    if logRawResults:
                        logComponents.extend(['\n\n', pformat(rawResult), '\n\n'])
                    proxy = self.__createEntityProxy(rawResult, maxLookupCalls=0)
                    if not any(isinstance(proxy, proxyType) for proxyType in supportedProxyTypes):
                        logs.warning('Dropping iTunes proxy of unsupported type %s for queryCategory %s:\n\n%s\n\n' %
                                     (proxy.__class__.__name__, queryCategory, str(proxy)))
                        continue
                    processedResults.append(proxy)
                except UnknownITunesTypeError:
                    logs.report()

            if len(processedResults) > 0:
                searchResultsByType[iTunesType] = self.__scoreResults(iTunesType, processedResults, queryText)

        if logRawResults:
            logComponents.append("\n\n\nEND RAW ITUNES RESULTS\n\n\n")
            logs.debug(''.join(logComponents))

        if len(searchResultsByType) == 0:
            # TODO: Throw exception to avoid cache?
            return []
        if len(searchResultsByType) == 1:
            return searchResultsByType.values()[0]
        if queryCategory == 'music':
            # We have to separately request songs, albums, and artists because iTunes does a terrible job blending
            # results between the three. So we need to blend, but it's hard to know how to. We do a little work on the
            # string matching side, but
            self.__augmentAlbumAndArtistResultsWithSongs(searchResultsByType.get('album', []),
                searchResultsByType.get('musicArtist', []),
                searchResultsByType.get('song', []))
        return interleaveResultsByRelevance(searchResultsByType.values())
Example #17
__copyright__ = "Copyright (c) 2011-2012 Stamped.com"
__license__   = "TODO"

import Globals, logs, re
import unicodedata, utils

try:
    from api.Constants  import *
    from api.Schemas    import *
    from difflib        import SequenceMatcher
    from libs.LibUtils  import parseDateString
    from datetime       import datetime
    from bson.objectid  import ObjectId 
    from collections    import defaultdict
except:
    logs.report()
    raise

def mapSubcategoryToCategory(subcategory):
    try:
        return subcategoryData[subcategory][0]
    except KeyError:
        logs.warning("Subcategory not defined: %s" % subcategory)
        raise

def mapSubcategoryToKinds(subcategory):
    try:
        return set(subcategoryData[subcategory][1])
    except KeyError:
        logs.warning("Subcategory not defined: %s" % subcategory)
        raise
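
A hedged usage sketch; 'bar' is only a guess at a valid subcategory key, and the actual return values depend entirely on subcategoryData:

try:
    category = mapSubcategoryToCategory('bar')   # hypothetical subcategory
    kinds = mapSubcategoryToKinds('bar')
except KeyError:
    category, kinds = None, set()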
Example #18
    def __terminateWaiting(self, pool, start_time, category, resultsDict):
        logTimingData('IN TERMINATE WAITING')
        sources_to_priorities = dict(self.__categories_to_sources_and_priorities[category])
        total_value_received = 0
        total_potential_value_outstanding = sum(sources_to_priorities.values())
        sources_seen = set()
        while True:
            try:
                elapsed_seconds = total_seconds(datetime.datetime.now() - start_time)

                if elapsed_seconds >= 7:
                    logs.warning('Search completely timed out at 7s!')
                    pool.kill()
                    return

                for (source, results) in resultsDict.items():
                    if source in sources_seen:
                        continue
                    logTimingData('JUST NOW SEEING SOURCE: ' + source.sourceName)
                    sources_seen.add(source)
                    # If a source returns at least 5 results, we assume we got a good result set from it. If it
                    # returns less, we're more inclined to wait for straggling sources.
                    total_value_received += sources_to_priorities[source] * min(5, len(results)) / 5.0
                    logTimingData('DECREMENTING OUTSTANDING BY ' + str(sources_to_priorities[source]) + ' FOR SOURCE ' + source.sourceName)
                    total_potential_value_outstanding -= sources_to_priorities[source]
                logTimingData('AT %f seconds elapsed, TOTAL VALUE RECEIVED IS %f, TOTAL OUTSTANDING IS %f' % (
                        elapsed_seconds, total_value_received, total_potential_value_outstanding
                    ))
            except Exception:
                logs.warning('TERMINATE_WARNING SHIT IS F****D')
                logs.report()
                raise

            if total_potential_value_outstanding <= 0:
                logTimingData('ALL SOURCES DONE')
                return

            if total_value_received:
                marginal_value_of_outstanding_sources = total_potential_value_outstanding / total_value_received
                # Comes out to:
                #   0.08 for 1s
                #   0.25 for 1.5s
                #   0.79 for 2s
                #   2.51 for 2.5s
                #   7.94 for 3s
                # So we'll ditch that 4th remaining source for music around 1.5s; we'll ditch the second source for
                # something like Places around 2s; we'll ditch any lingering source around 3s if we've received
                # anything.
                min_marginal_value = 10 ** (elapsed_seconds - 2.1)
                if min_marginal_value > marginal_value_of_outstanding_sources:
                    sources_not_seen = [
                        source.sourceName for source in sources_to_priorities.keys() if source not in sources_seen
                    ]
                    if sources_not_seen:
                        # This is interesting information whether we want the full timing data logged or not.
                        log_template = 'QUITTING EARLY: At %f second elapsed, bailing on sources [%s] because with ' + \
                            'value received %f, value outstanding %f, marginal value %f, min marginal value %f'
                        logs.debug(log_template % (
                            elapsed_seconds, ', '.join(sources_not_seen), total_value_received,
                            total_potential_value_outstanding, marginal_value_of_outstanding_sources, min_marginal_value
                        ))
                    pool.kill()
                    return

            gevent.sleep(0.01)
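
The threshold table in the comment above falls directly out of the 10 ** (elapsed_seconds - 2.1) formula; a quick standalone check that reproduces those numbers:

# Pure arithmetic, no dependence on the surrounding class.
for elapsed_seconds in (1.0, 1.5, 2.0, 2.5, 3.0):
    min_marginal_value = 10 ** (elapsed_seconds - 2.1)
    print '%.1fs -> min marginal value %.2f' % (elapsed_seconds, min_marginal_value)
# Prints roughly 0.08, 0.25, 0.79, 2.51, 7.94: the longer we wait, the more a
# straggling source has to be worth relative to what has already arrived.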