def enrichEntity(self, entity, decorations, max_iterations=None, timestamp=None):
    """Run every applicable source against the entity, merging newly found data in place.

    (might be named enrichedEntityWithSources)

    enrichEntity takes an entity schema object (defined in api/Schemas.py) and an output dict of
    decorations that is opaque to this class - only group objects and sources have an
    understanding of the decorations format. The group method syncDecorations() handles all
    propagation of source-local decorations to the output decoration dict.

    Returns a bool value indicating whether the entity was enriched.
    """
    self.setNow(timestamp)
    max_iterations = max_iterations or self.__default_max_iterations
    modified_total = False
    logs.debug("Begin enrichment: %s (%s)" % (entity.title, entity.entity_id))
    # We will loop through all sources multiple times, because as data is enriched, previous
    # unresolvable sources may become resolvable and can enrich in turn. If no fields are
    # modified by any source in a given iteration, then there's no reason to loop again.
    for i in range(max_iterations):
        modified = False
        for source in self.__sources:
            # Skip sources that cannot handle this entity's kind, or whose declared types
            # share nothing with the entity's types.
            if entity.kind not in source.kinds:
                continue
            if entity.types and source.types and not set(entity.types).intersection(source.types):
                continue
            groups = source.getGroups(entity)
            targetGroups = set()
            for group in groups:
                if self.shouldEnrich(group, source.sourceName, entity):
                    targetGroups.add(group)
            if not targetGroups:
                continue
            # We have groups that are eligible for enrichment. We'll modify a deep-copy of the entity.
            copy = buildEntity(entity.dataExport())
            # timestamps is passed down to the source. If the source enriches a group, a mapping is
            # added from the group name to the time it was enriched (now, essentially). When the data
            # we get from external source is identical to what we already have, presence of the group
            # in this map is the only way we can tell that we received fresh data.
            # TODO: This is a dictionary for legacy reasons, it should really be a set.
            timestamps = {}
            localDecorations = {}  # opaque decorations, for group object based extensions (i.e. Menus)
            logs.debug("Enriching with '%s' for groups %s" % (source.sourceName, sorted(targetGroups)))
            groupObjs = [self.getGroup(group) for group in targetGroups]
            try:
                enriched = source.enrichEntity(copy, groupObjs, self, localDecorations, timestamps)
                if enriched:
                    # Propagate source-local changes from the working copy back onto the real
                    # entity, group by group, stamping each modified group with time and source.
                    for groupObj in groupObjs:
                        fieldsChanged = groupObj.syncFields(copy, entity)
                        decorationsChanged = groupObj.syncDecorations(localDecorations, decorations)
                        if fieldsChanged or groupObj.groupName in timestamps or decorationsChanged:
                            groupObj.setTimestamp(entity, self.now)
                            groupObj.setSource(entity, source.sourceName)
                            modified = True
            except Exception as e:
                report()
        if not modified:
            break
        modified_total |= modified
    return modified_total
def termWaiting():
    """Drive __terminateWaiting for this search, logging entry and exit.

    Exceptions are reported rather than raised, so a failure in the
    early-termination logic never takes down the surrounding search.
    Returns whatever __terminateWaiting returns.
    """
    logs.debug('in termWaiting')
    try:
        return self.__terminateWaiting(pool, datetime.datetime.now(), category, results)
    except Exception:
        logs.report()
    finally:
        # BUGFIX: this trace used to sit after the try/except, making it unreachable on the
        # success path (the return exited first). finally guarantees it is always emitted.
        logs.debug('done with termWaiting')
def tracks(self):
    """Return this album's track list as [{'name': ..., 'key': ...}] in track-sequence order.

    Returns [] when the needed lookup is not permitted or when parsing the
    Amazon response fails.
    """
    # We might be missing related items data entirely, in which case we start by issuing a lookup there.
    # TODO: This probably could be done as part of one lookup with the one about to be made.
    try:
        tracks = list(xp(self.data, 'RelatedItems')['c']['RelatedItem'])
    except KeyError:
        try:
            self._issueLookup()
        except LookupRequiredError:
            return []
    try:
        tracks = list(xp(self.data, 'RelatedItems')['c']['RelatedItem'])
        page_count = int(xp(self.data, 'RelatedItems', 'RelatedItemPageCount')['v'])
        # Related items come back paginated; page 1 arrived with the initial data,
        # so fetch pages 2..page_count explicitly.
        for i in range(1, page_count):
            page = i + 1
            self.countLookupCall('tracks')
            data = globalAmazon().item_lookup(ItemId=self.key, ResponseGroup='Large,RelatedItems',
                                              RelationshipType='Tracks', RelatedItemPage=str(page),
                                              timeout=MERGE_TIMEOUT)
            tracks.extend(xp(data, 'ItemLookupResponse', 'Items', 'Item', 'RelatedItems')['c']['RelatedItem'])
        # Key tracks by sequence number so the returned list comes out in album order.
        track_d = {}
        for track in tracks:
            track_d[int(xp(track, 'Item', 'ItemAttributes', 'TrackSequence')['v'])] = {
                'name': xp(track, 'Item', 'ItemAttributes', 'Title')['v'],
                'key': xp(track, 'Item', 'ASIN')['v'],
            }
        return [track_d[k] for k in sorted(track_d)]
    # NOTE(review): this handler names LookupRequiredException while the one above names
    # LookupRequiredError -- confirm both classes really exist; one may be a typo.
    except LookupRequiredException:
        return []
    except Exception:
        # TODO: It seems possible that only one of the requests failed; shouldn't we keep the
        # results of the others?
        report()
        return []
def source(start, count):
    """Pull items from the shared generator into `results` until indices
    [start, start+count) are populated (or the generator is exhausted), then
    return everything from `start` onward.

    In tolerant mode, constructor failures are reported and skipped; in strict
    mode they propagate. Duplicate suppression uses the shared value_set when
    `unique` is set.
    """
    needed = start + count
    while len(results) < needed:
        try:
            item = None
            if not tolerant:
                item = constructor(generator.next())
            else:
                # Tolerant: a failing constructor is logged and skipped, but generator
                # exhaustion must still end the outer loop, so re-raise StopIteration.
                try:
                    item = constructor(generator.next())
                except StopIteration:
                    raise
                except Exception:
                    logs.report()
            if item is None:
                continue
            if not unique:
                results.append(item)
            elif item not in value_set:
                results.append(item)
                value_set.add(item)
        except StopIteration:
            break
    return results[start:]
def __searchEntityTypeLite(self, entityType, queryText, resultsDict, timeout):
    """Run one iTunes 'search' call for a single entity type, storing the raw
    result list into resultsDict[entityType].

    Errors are reported and swallowed; on failure no entry is written, which
    callers treat as an empty result.
    """
    try:
        term = queryText.encode('utf-8') if isinstance(queryText, unicode) else queryText
        response = self.__itunes.method('search', entity=entityType, term=term,
                                        priority='high', timeout=timeout)
        resultsDict[entityType] = response['results']
    except Exception:
        logs.report()
def wrapperFn():
    # Run userFn under the logging context captured at spawn time, so log lines emitted
    # from this greenlet are attributed to the originating request.
    try:
        return logs.runInOtherLoggingContext(userFn, currLoggingContext)
    except GreenletExit:
        # If we deliberately killed the thread, don't log that. Makes the search logs really noisy.
        pass
    except:
        # Bare except: report anything else (including non-Exception raises) without
        # letting it escape the greenlet.
        logs.report()
def writeComparisons(oldResults, newResults, outputDir): oldKeys = oldResults.viewkeys() newKeys = newResults.viewkeys() if oldKeys ^ newKeys: print 'WARNING: old and new results have mismatched keys:' print '%d OLD KEYS:' % len(oldKeys - newKeys), oldKeys - newKeys print '%d NEW KEYS:' % len(newKeys - oldKeys), newKeys - oldKeys changedRows = [] clusteringChanges = [] allRows = [] commonKeys = oldKeys & newKeys for key in commonKeys: oldResolved, oldOriginal, oldProxyList = oldResults[key] newResolved, newOriginal, newProxyList = newResults[key] filename = key[:40] + '.html' oldData = __stripEntity(oldResolved.dataExport()) newData = __stripEntity(newResolved.dataExport()) try: with open(path.join(outputDir, filename), 'w') as fout: print >> fout, DIFF_FILE_HEADER print >> fout, '<h1>%s</h1>' % 'Enrich Input' print >> fout, __createDiffTable(pprint.pformat(oldOriginal), pprint.pformat(newOriginal)) print >> fout, '<h1>%s</h1>' % 'Resolve output' print >> fout, __createDiffTable(pprint.pformat(oldResolved.dataExport()), pprint.pformat(newResolved.dataExport())) print >> fout, '<h1>%s</h1>' % 'List of resolver objects:' print >> fout, __createDiffTable(__formatProxyList(oldProxyList), __formatProxyList(newProxyList)) print >> fout, '</body></html>' except Exception: logs.warning('Error writing diff file!') logs.report() diffLink = '<td><a href="%s">show diffs</a></td>' % filename tableRow = '<tr><td>%s</td>%s</tr>' % (oldOriginal['title'][:100], diffLink) if oldData != newData: changedRows.append(tableRow) if __hasClusteringChange(oldProxyList, newProxyList): clusteringChanges.append(tableRow) allRows.append(tableRow) allRowsFilename = 'index_all.html' writeTableOfContent(allRows, 'All results', path.join(outputDir, allRowsFilename)) summary = """ %d out of %d (%f%%) of the rows had clustering change. Here's a shuffled list of them. 
<a href="%s">show all</a> """ % (len(clusteringChanges), len(allRows), float(len(clusteringChanges)) * 100 / len(allRows), allRowsFilename) random.shuffle(clusteringChanges) writeTableOfContent(clusteringChanges, summary, path.join(outputDir, 'index_cluster.html')) summary = """ %d out of %d (%f%%) of the rows changed. Here's a shuffled list of them. <a href="%s">show all</a> """ % (len(changedRows), len(allRows), float(len(changedRows)) * 100 / len(allRows), allRowsFilename) random.shuffle(changedRows) writeTableOfContent(changedRows, summary, path.join(outputDir, 'index.html'))
def wrapper(worker, job):
    """Gearman job adapter: decode the pickled payload and dispatch to handler.

    Failures are logged, reported, and emailed as warnings; the job itself
    always completes with an empty result string.
    """
    try:
        task_id, key, data, extra = pickleDecoder(job.data)
        handler(task_id, key, data, **extra)
    except Exception:
        failure = "Invalid job: %s %s" % (job.task, job.unique)
        logs.error(failure)
        logs.report()
        _warningEmail(failure)
    return ''
def getProxiesForEntity(self, entity):
    """Return entity proxies for this source.

    When the entity already carries this source's id, a single proxy built from
    that key is returned. Otherwise we resolve the entity against the source and
    return the proxies of all resolved matches ([] on a ValueError, which is
    reported).
    """
    source_id = self.getId(entity)
    if source_id is not None:
        return [self.entityProxyFromKey(source_id, entity=entity)]
    try:
        matches = self.resolve(self.stamped.proxyFromEntity(entity))
        return [proxy for (scores, proxy) in matches if scores['resolved']]
    except ValueError:
        logs.report()
        return []
def entityProxyFromKey(self, key, **kwargs):
    """Build an entity proxy for an Amazon ASIN by looking the item up and
    dispatching on its ProductGroup.

    Books and video games get dedicated proxy types; anything else is treated
    as music. Returns None (after reporting) when the expected fields are
    missing from the response.
    """
    try:
        lookupData = globalAmazon().item_lookup(ResponseGroup='Large', ItemId=key, timeout=MERGE_TIMEOUT)
        result = _getLookupResult(lookupData)
        kind = xp(result, 'ItemAttributes', 'ProductGroup')['v'].lower()
        logs.debug(kind)
        if kind in ('book', 'ebooks'):
            return AmazonBook(key, result, 0)
        elif kind == 'video games':
            return AmazonVideoGame(key, result, 0)
        return self.__constructMusicObjectFromResult(result, 0)
    except KeyError:
        logs.report()
        return None
def enrichEntity(self, entity, groups, controller, decorations, timestamps):
    """Attach a SinglePlatform menu decoration to the entity when one is available.

    Always returns True; HTTP errors from SinglePlatform are logged as warnings
    and anything else is reported, never raised.
    """
    singleplatform_id = getattr(entity.sources, 'singleplatform_id')
    try:
        if singleplatform_id is not None and controller.shouldEnrich('menu', self.sourceName, entity):
            menu = self.__singleplatform.get_menu_schema(singleplatform_id, timeout=MERGE_TIMEOUT)
            # entity.menu is a boolean flag: does this place have a menu at all?
            entity.menu = menu != None
            if menu is not None:
                menu.entity_id = entity.entity_id
                decorations['menu'] = menu
                logs.debug('Regenerated menu for %s' % singleplatform_id)
    except HTTPError as e:
        logs.warning("HttpError %s from SinglePlatform for %s" % (e.code, singleplatform_id))
    except Exception as e:
        report("unexpected SinglePlatformSource error: %s" % e)
    return True
def wrapper(task_id, key, data, **kwargs):
    """Execute *handler* for one async request under a fresh logging context.

    Begins a logging context wired to the API's log/stat stores, runs the
    handler, and on failure reports the error and emails a warning. The request
    log is always saved, even when the handler raises.
    """
    try:
        logs.begin(saveLog=api._logsDB.saveLog, saveStat=api._statsDB.addStat, nodeName=api.node_name)
        logs.async_request(key)
        logs.info("Request %s: %s: %s: %s" % (task_id, key, data, kwargs))
        handler(task_id, key, data, **kwargs)
        logs.info("Finished with request %s" % (task_id,))
    except Exception as e:
        logs.error("Failed request %s" % (task_id,))
        logs.report()
        _warningEmail('%s - %s failed (%s)' % (api.node_name, key, datetime.utcnow().isoformat()))
    finally:
        logs.info('Saving request log for request %s' % (task_id,))
        try:
            logs.save()
        except Exception:
            # Log persistence must never take the worker down; fall back to stdout/stderr.
            print 'Unable to save logs'
            import traceback
            traceback.print_exc()
            logs.warning(traceback.format_exc())
def _convertFromMongo(self, document, mini=False):
    """Convert a raw MongoDB document into an entity object.

    Maps Mongo's '_id' onto the collection's primary key, back-fills a missing
    'created' timestamp from the ObjectId's generation time, strips internal
    search fields, and builds the entity. Returns None for a None document.
    """
    if document is None:
        return None
    if '_id' in document and self._primary_key is not None:
        document[self._primary_key] = self._getStringFromObjectId(document['_id'])
        del document['_id']
    ### HACK: Verify that 'created' timestamp exists for entity
    if 'timestamp' not in document or 'created' not in document['timestamp']:
        try:
            created = ObjectId(document[self._primary_key]).generation_time.replace(tzinfo=None)
        except:
            report()
            raise
        document['timestamp'] = {'created': created}
    # Internal search/sort fields must not leak into the entity schema.
    document.pop('titlel', None)
    document.pop('search_tokens', None)
    return buildEntity(document, mini=mini)
def __searchSource(self, source, queryCategory, queryText, resultsDict, timesDict, **queryParams):
    """Query one search source, recording its filtered results and elapsed time.

    Results scoring below MIN_RESULT_DATA_QUALITY_TO_CLUSTER are discarded
    before clustering. A GreenletExit (deliberate kill) is ignored entirely;
    any other error is reported and the source contributes an empty list.
    """
    try:
        # Note that the timing here is not 100% legit because gevent won't interrupt code except
        # on I/O, but it's good enough to give a solid idea.
        before = datetime.datetime.now()
        if shouldLogRawSourceResults:
            queryParams['logRawResults'] = True
        results = source.searchLite(queryCategory, queryText, **queryParams)
        after = datetime.datetime.now()
        # First level of filtering on data quality score -- results that are really horrendous
        # get dropped entirely pre-clustering.
        filteredResults = [r for r in results if r.dataQuality >= MIN_RESULT_DATA_QUALITY_TO_CLUSTER]
        timesDict[source] = after - before
        logs.debug("GOT RESULTS FROM SOURCE %s IN ELAPSED TIME %s -- COUNT: %d, AFTER FILTERING: %d" % (
            source.sourceName, str(after - before), len(results), len(filteredResults)))
        resultsDict[source] = filteredResults
    except GreenletExit:
        pass
    except:
        logs.report()
        resultsDict[source] = []
import Globals from logs import report try: import logs from resolve.Resolver import * from resolve.ResolverObject import * from resolve.TitleUtils import * from libs.Rdio import Rdio, globalRdio from resolve.GenericSource import GenericSource, MERGE_TIMEOUT, SEARCH_TIMEOUT from utils import lazyProperty from pprint import pformat from search.ScoringUtils import * except: report() raise class _RdioObject(object): """ Abstract superclass (mixin) for Rdio objects. _RdioObjects can be instatiated with either the rdio_id or the rdio data for an entity. If both are provided, they must match. extras may be used to retrieve additional data when instantiating an object using only its id. Attributes: data - the type-specific rdio data for the entity rdio - an instance of Rdio (API proxy) """
def searchLite(self, queryCategory, queryText, timeout=None, coords=None, logRawResults=False):
    """Search iTunes for *queryText* within *queryCategory*, returning scored entity proxies.

    queryCategory must be one of 'music', 'film', 'app', 'book' (NotImplementedError
    otherwise). Fans out one gevent-pooled request per iTunes entity type for the
    category, converts the raw JSON to entity proxies (dropping unexpected proxy
    types such as podcasts), scores each type's results, and blends the per-type
    lists by relevance. `coords` is accepted for interface parity but unused.
    """
    if queryCategory not in ('music', 'film', 'app', 'book'):
        raise NotImplementedError()
    supportedProxyTypes = {
        'music': (iTunesArtist, iTunesAlbum, iTunesTrack),
        'film': (iTunesMovie, iTunesTVShow),
        'app': (iTunesApp,),
        'book': (iTunesBook,),
    }[queryCategory]
    types = mapCategoryToTypes(queryCategory)
    iTunesTypes = []
    typesMap = dict(self.__types_to_itunes_strings)
    for entityType in types:
        iTunesTypes.append(typesMap[entityType])
    # One concurrent request per iTunes entity type for this category.
    pool = Pool(len(iTunesTypes))
    rawResults = {}
    for iTunesType in iTunesTypes:
        pool.spawn(self.__searchEntityTypeLite, iTunesType, queryText, rawResults, timeout)
    pool.join(timeout=timeout)

    if logRawResults:
        logComponents = ["\n\n\nITUNES RAW RESULTS\nITUNES RAW RESULTS\nITUNES RAW RESULTS\n\n\n"]
    searchResultsByType = {}
    # Convert from JSON objects to entity proxies. Pass through actual parsing errors, but report &
    # drop the result if we just see a type we aren't expecting. (Music search will sometimes return
    # podcasts, for instance.)
    for (iTunesType, rawTypeResults) in rawResults.items():
        processedResults = []
        for rawResult in rawTypeResults:
            try:
                if logRawResults:
                    logComponents.extend(['\n\n', pformat(rawResult), '\n\n'])
                proxy = self.__createEntityProxy(rawResult, maxLookupCalls=0)
                if not any(isinstance(proxy, proxyType) for proxyType in supportedProxyTypes):
                    logs.warning('Dropping iTunes proxy of unsupported type %s for queryCategory %s:\n\n%s\n\n' %
                                 (proxy.__class__.__name__, queryCategory, str(proxy)))
                    continue
                # BUGFIX: reuse the proxy we just built; previously __createEntityProxy was
                # called a second time here, constructing every accepted proxy twice.
                processedResults.append(proxy)
            except UnknownITunesTypeError:
                logs.report()
        if len(processedResults) > 0:
            searchResultsByType[iTunesType] = self.__scoreResults(iTunesType, processedResults, queryText)
    if logRawResults:
        logComponents.append("\n\n\nEND RAW ITUNES RESULTS\n\n\n")
        logs.debug(''.join(logComponents))
    if len(searchResultsByType) == 0:
        # TODO: Throw exception to avoid cache?
        return []
    if len(searchResultsByType) == 1:
        return searchResultsByType.values()[0]
    if queryCategory == 'music':
        # We have to separately request songs, albums, and artists because iTunes does a terrible
        # job blending results between the three. So we need to blend, but it's hard to know how
        # to. We do a little work on the string matching side, but
        self.__augmentAlbumAndArtistResultsWithSongs(searchResultsByType.get('album', []),
                                                     searchResultsByType.get('musicArtist', []),
                                                     searchResultsByType.get('song', []))
    return interleaveResultsByRelevance(searchResultsByType.values())
__copyright__ = "Copyright (c) 2011-2012 Stamped.com"
__license__ = "TODO"

import Globals, logs, re
import unicodedata, utils

try:
    from api.Constants import *
    from api.Schemas import *
    from difflib import SequenceMatcher
    from libs.LibUtils import parseDateString
    from datetime import datetime
    from bson.objectid import ObjectId
    from collections import defaultdict
except:
    logs.report()
    raise


def mapSubcategoryToCategory(subcategory):
    """Return the category for a subcategory (first element of its metadata tuple).

    Raises KeyError (after logging a warning) for an unknown subcategory.
    """
    try:
        return subcategoryData[subcategory][0]
    except KeyError:
        logs.warning("Subcategory not defined: %s" % subcategory)
        raise


def mapSubcategoryToKinds(subcategory):
    """Return the set of kinds for a subcategory (second element of its metadata tuple).

    Raises KeyError (after logging a warning) for an unknown subcategory.
    """
    try:
        return set(subcategoryData[subcategory][1])
    except KeyError:
        logs.warning("Subcategory not defined: %s" % subcategory)
        raise
def __terminateWaiting(self, pool, start_time, category, resultsDict):
    """Poll search-source results and kill the pool once waiting longer stops being worth it.

    Each source has a priority (its expected value); as sources report in, we tally
    value received vs. value still outstanding. The pool is killed when:
      - 7 seconds have elapsed (hard timeout), or
      - every source has reported, or
      - the marginal value of the outstanding sources has fallen below an
        exponentially rising threshold (so stragglers get dropped sooner as time passes).
    Runs until one of those conditions triggers; polls every 10ms.
    """
    logTimingData('IN TERMINATE WAITING')
    sources_to_priorities = dict(self.__categories_to_sources_and_priorities[category])
    total_value_received = 0
    total_potential_value_outstanding = sum(sources_to_priorities.values())
    sources_seen = set()
    while True:
        try:
            elapsed_seconds = total_seconds(datetime.datetime.now() - start_time)
            if elapsed_seconds >= 7:
                logs.warning('Search completely timed out at 7s!')
                pool.kill()
                return
            for (source, results) in resultsDict.items():
                if source in sources_seen:
                    continue
                logTimingData('JUST NOW SEEING SOURCE: ' + source.sourceName)
                sources_seen.add(source)
                # If a source returns at least 5 results, we assume we got a good result set from
                # it. If it returns less, we're more inclined to wait for straggling sources.
                total_value_received += sources_to_priorities[source] * min(5, len(results)) / 5.0
                logTimingData('DECREMENTING OUTSTANDING BY ' + str(sources_to_priorities[source]) +
                              ' FOR SOURCE ' + source.sourceName)
                total_potential_value_outstanding -= sources_to_priorities[source]
                logTimingData('AT %f seconds elapsed, TOTAL VALUE RECEIVED IS %f, TOTAL OUTSTANDING IS %f' % (
                    elapsed_seconds, total_value_received, total_potential_value_outstanding))
        except Exception:
            # FIX: replaced an unprofessional log message with a descriptive one.
            logs.warning('Unexpected error in __terminateWaiting; aborting early termination')
            logs.report()
            raise
        if total_potential_value_outstanding <= 0:
            logTimingData('ALL SOURCES DONE')
            return
        if total_value_received:
            marginal_value_of_outstanding_sources = total_potential_value_outstanding / total_value_received
            # Comes out to:
            #   0.08 for 1s
            #   0.25 for 1.5s
            #   0.79 for 2s
            #   2.51 for 2.5s
            #   7.94 for 3s
            # So we'll ditch that 4th remaining source for music around 1.5s; we'll ditch the
            # second source for something like Places around 2s; we'll ditch any lingering source
            # around 3s if we've received anything.
            min_marginal_value = 10 ** (elapsed_seconds - 2.1)
            if min_marginal_value > marginal_value_of_outstanding_sources:
                sources_not_seen = [
                    source.sourceName for source in sources_to_priorities.keys()
                    if source not in sources_seen
                ]
                if sources_not_seen:
                    # This is interesting information whether we want the full timing data logged or not.
                    log_template = 'QUITTING EARLY: At %f second elapsed, bailing on sources [%s] because with ' + \
                                   'value received %f, value outstanding %f, marginal value %f, min marginal value %f'
                    logs.debug(log_template % (
                        elapsed_seconds, ', '.join(sources_not_seen), total_value_received,
                        total_potential_value_outstanding, marginal_value_of_outstanding_sources,
                        min_marginal_value))
                pool.kill()
                return
        gevent.sleep(0.01)