def __init__(self, key=_API_V3_Key, secret=_API_V3_Secret, log=None):
    """
    Store the Factual v3 OAuth credentials and set up helper objects.

    key    - OAuth consumer key
    secret - OAuth consumer secret
    log    - optional object with a write() method used for
             diagnostic output; None disables logging
    """
    self.__v3_key, self.__v3_secret = key, secret
    self.__singleplatform = StampedSinglePlatform()
    self.__log_file = log
    # presumably cpm/cpd are calls-per-minute / calls-per-day caps;
    # confirm against RateLimiter
    self.__limiter = RateLimiter(cpd=180000, cpm=400)

    # Cached resolve / crosswalk results older than this are refreshed.
    thirty_days = timedelta(days=30)
    self.__max_crosswalk_age = thirty_days
    self.__max_resolve_age = thirty_days
class Factual(object):
    """
    Factual API Wrapper

    Thin client around the Factual v3 HTTP API (search, resolve,
    crosswalk, crossref and restaurant lookups) plus helpers that copy
    Factual data onto entity objects.
    """

    def __init__(self, key=_API_V3_Key, secret=_API_V3_Secret, log=None):
        """
        key    - OAuth consumer key
        secret - OAuth consumer secret
        log    - optional object with a write() method used for
                 diagnostic output; None disables logging
        """
        self.__v3_key = key
        self.__v3_secret = secret
        self.__singleplatform = StampedSinglePlatform()
        self.__log_file = log
        # presumably calls-per-minute / calls-per-day caps; confirm
        # against RateLimiter
        self.__limiter = RateLimiter(cpm=400, cpd=180000)
        # Cached crosswalk / resolve results older than these are refreshed.
        self.__max_crosswalk_age = timedelta(30)
        self.__max_resolve_age = timedelta(30)

    def search(self, query, limit=_limit, coordinates=None, radius=5000, priority="low"):
        """
        Full-text search of the 'global' table, restricted to the
        "Food & Beverage" and "Arts, Entertainment & Nightlife"
        categories.

        query       - search string (unicode is encoded as UTF-8)
        limit       - accepted for interface compatibility; currently
                      not sent to the API
        coordinates - optional pair; when given, results are limited to
                      a circle of `radius` meters centered on it
        priority    - forwarded to the request layer

        Returns the list under response.data, or None if the response
        lacks it.
        """
        if isinstance(query, unicode):
            query = query.encode('utf-8')

        category_filter = {
            '$or': [
                {"category": {"$bw": "Food & Beverage"}},
                {"category": {"$bw": "Arts, Entertainment & Nightlife"}},
            ]
        }
        params = {}
        params['prefix'] = 't'
        params['q'] = urllib.quote(query)
        params['filters'] = urllib.quote(json.dumps(category_filter))

        if coordinates is not None:
            circle = {
                '$circle': {
                    '$center': [coordinates[0], coordinates[1]],
                    '$meters': radius,
                }
            }
            params['geo'] = urllib.quote(json.dumps(circle))

        return self.__factual('global', priority=priority, **params)

    def resolve(self, data, limit=_limit, priority="low"):
        """
        Use Resolve service to match entities to limited attributes,
        including partial names.

        Accepts a JSON compatible object such as:
            {'name':'ino','locality':'New York'}

        Returns a JSON like object which includes the usual attributes
        and 'similarity', which indicates the quality of the match. At
        most `limit` results are returned. Resolve does not require
        much information and will operate on partial names.
        """
        encoded = urllib.quote(json.dumps(data))
        matches = self.__factual('resolve', values=encoded, priority=priority)
        if matches is not None and len(matches) > limit:
            matches = matches[:limit]
        return matches

    def places(self, data, limit=_limit, priority="low"):
        """
        A stricter search than resolve. Seems to only produce entities
        which exactly match the given fields (at least for name).

        `limit` is accepted for interface compatibility but is not
        currently applied.
        """
        encoded = urllib.quote(json.dumps(data))
        return self.__factual('global', prefix='t', filters=encoded, priority=priority)

    def place(self, factual_id, priority="low"):
        """
        Return the first 'global' table row for factual_id, or None.
        """
        rows = self.places({'factual_id': factual_id}, 1, priority)
        if rows:
            return rows[0]
        return None

    def crosswalk_id(self, factual_id, namespace=None, limit=_limit, namespaces=None, priority="low"):
        """
        Use Crosswalk service to find urls and ids that match the given
        entity.

        If namespace is provided, it limits the scope of the search to
        that service; otherwise, if namespaces (an iterable of names)
        is provided, the search is limited to those services. `limit`
        is accepted but not currently applied.

        It appears that there are not necessarily crosswalk results for
        every factual_id.

        Regardless of the options, every entry in the result will
        contain the following fields:

        factual_id   - the given id
        namespace    - the namespace of entry (i.e. 'singleplatform')
        namespace_id - the string id within the namespace (i.e. 'ino')
                       or '' if unknown/non-existent
        url          - the url associated with the entity or ''
                       (i.e. 'http://www.menuism.com/restaurants/ino-new-york-253388')
        """
        args = {'factual_id': factual_id}
        if namespace is not None:
            args['only'] = namespace
        elif namespaces is not None:
            args['only'] = ','.join(namespaces)
        return self.__factual('crosswalk', priority=priority, **args)

    def crosswalk_external(self, space, space_id, namespace=None, limit=_limit, priority="low"):
        """
        Use Crosswalk service to find urls and ids that match the given
        external entity (identified by its namespace `space` and its id
        within that namespace, `space_id`).

        If namespace is provided, it limits the scope of the search to
        that service. `limit` is accepted but not currently applied.

        Regardless of the options, every entry in the result will
        contain the following fields:

        factual_id   - the matching factual id
        namespace    - the namespace of entry (i.e. 'singleplatform')
        namespace_id - the string id within the namespace (i.e. 'ino')
                       or '' if unknown/non-existent
        url          - the url associated with the entity or ''
                       (i.e. 'http://www.menuism.com/restaurants/ino-new-york-253388')
        """
        args = {'namespace': space, 'namespace_id': space_id}
        if namespace is not None:
            args['only'] = namespace
        return self.__factual('crosswalk', priority=priority, **args)

    def crossref_id(self, factual_id, limit=_limit, priority="low"):
        """
        Use Crossref service to find urls that pertain to the given
        entity. `limit` is accepted but not currently applied.
        """
        return self.__factual('crossref', factual_id=factual_id, priority=priority)

    def crossref_url(self, url, limit=_limit, priority="low"):
        """
        Use Crossref service to find the entities related/mentioned at
        the given url. `limit` is accepted but not currently applied.
        """
        return self.__factual('crossref', url=urllib.quote(url), priority=priority)

    def restaurant(self, factual_id, priority="low"):
        """
        Get Factual restaurant data for a given factual_id.

        Returns the first row of the 'restaurants-us' table matching
        the id, or None if there is none.
        """
        encoded = urllib.quote(json.dumps({'factual_id': factual_id}))
        rows = self.__factual('restaurants-us', 't', filters=encoded, priority=priority)
        if rows:
            return rows[0]
        return None

    def entity(self, factual_id):
        """
        STUB

        Create a Stamped entity from a factual_id.
        """
        entity = BasicEntity()
        self.enrich(entity, factual_id)
        return entity

    def resolveEntity(self, entity):
        """
        Refresh the Factual id and crosswalk ids stored on entity.

        If the entity has no 'factual_id', and its 'factual_timestamp'
        is missing or older than the maximum resolve age, the entity is
        resolved again and factual_id / factual_timestamp are written
        (factual_id may be None when nothing matches).

        If a factual id is known, and 'factual_crosswalk' is missing or
        older than the maximum crosswalk age, the singleplatform and
        foursquare ids are looked up and stored with timestamps.

        Returns True if the entity was modified, False otherwise.

        NOTE(review): the source's nesting was reconstructed; the
        crosswalk timestamp is assumed to be written once per attempt
        (mirroring factual_timestamp), even when the lookup returns no
        data - confirm.
        """
        modified = False

        if 'factual_id' in entity:
            factual_id = entity['factual_id']
        else:
            factual_id = None
            if self.__is_stale(entity, 'factual_timestamp', self.__max_resolve_age):
                factual_id = self.factual_from_entity(entity)
                entity.factual_timestamp = datetime.utcnow()
                entity.factual_id = factual_id
                modified = True

        if factual_id is None:
            return modified

        if self.__is_stale(entity, 'factual_crosswalk', self.__max_crosswalk_age):
            data = self.crosswalk_id(factual_id, namespaces=['singleplatform', 'foursquare'])
            if data is not None:
                for datum in data:
                    namespace = datum['namespace']
                    namespace_id = datum['namespace_id']
                    if namespace == 'singleplatform':
                        entity.singleplatform_id = namespace_id
                        entity.singleplatform_timestamp = datetime.utcnow()
                    elif namespace == 'foursquare':
                        entity.foursquare_id = namespace_id
                        entity.foursquare_timestamp = datetime.utcnow()
            entity.factual_crosswalk = datetime.utcnow()
            modified = True

        return modified

    def enrichEntity(self, entity):
        """
        Alias for enrich(entity).
        """
        return self.enrich(entity)

    def enrich(self, entity, factual_id=None, data=None):
        """
        Copy Factual data onto entity.

        factual_id - id to use; when None it is read from the entity
                     or, failing that, resolved from the entity's
                     fields (and then stored on the entity)
        data       - pre-fetched Factual row; fetched when None

        Returns False when no factual id can be determined, the
        "entity was modified" flag when no data is available, and True
        once data has been applied.

        NOTE(review): the source's nesting was reconstructed; the
        factual_id / factual_timestamp write is assumed to happen only
        for a freshly resolved id - confirm.
        """
        modified = False

        if factual_id is None:
            if 'factual_id' in entity:
                factual_id = entity.factual_id
            else:
                factual_id = self.factual_from_entity(entity)
                if factual_id is not None:
                    entity.factual_id = factual_id
                    entity.factual_timestamp = datetime.utcnow()
                    modified = True

        if factual_id is None:
            return False

        if data is None:
            data = self.data(factual_id, entity=entity)
        if data is None:
            return modified

        # The return value of _enrich was always overridden by the
        # unconditional address update below, so it is not kept.
        _enrich(entity, data)

        # The address fields were once guarded by
        # self.__sourceController.writeTo('address','factual',entity);
        # they are currently always written.
        _populate(entity, data, _address_fields)
        entity.address_source = 'factual'
        entity.address_timestamp = datetime.utcnow()
        return True

    def factual_from_entity(self, entity):
        """
        Get the factual_id (if any) associated with the given entity.

        This method iterates through all available filters for the
        given entity until one of them resolves acceptably. If the
        entity fails to resolve, None is returned.
        """
        # `first` is passed in the `verbose` position of __acceptable,
        # so rejections are only logged for the first filter combo.
        first = True
        for combo in _combos(entity):
            candidates = self.resolve(combo, 10)
            if candidates:
                for candidate in candidates:
                    if self.__acceptable(candidate, entity, combo, first):
                        return candidate['factual_id']
            first = False
        return None

    def factual_from_singleplatform(self, singleplatform_id):
        """
        Get the factual_id (if any) associated with the given
        singleplatform ID.

        Convenience method for crosswalk lookup from a singleplatform ID.
        """
        return self.__factual_id_from_external('singleplatform', singleplatform_id)

    def singleplatform(self, factual_id):
        """
        Get singleplatform id from factual_id, or None if unknown.

        Convenience method for crosswalk lookup for singleplatform.
        """
        return self.__external_id('singleplatform', factual_id)

    def factual_from_foursquare(self, foursquare_id):
        """
        Get the factual_id (if any) associated with the given
        foursquare ID.

        Convenience method for crosswalk lookup from a foursquare ID.
        """
        return self.__factual_id_from_external('foursquare', foursquare_id)

    def foursquare(self, factual_id):
        """
        Get foursquare id from factual_id, or None if unknown.

        Convenience method for crosswalk lookup for foursquare.
        """
        return self.__external_id('foursquare', factual_id)

    def data(self, factual_id, entity=None, priority="low"):
        """
        Generate Factual data for given factual_id.

        Tries the restaurant table first and falls back to the global
        places table. The entity argument is optional and currently
        unused; it may allow the method to run more efficiently in the
        future.
        """
        row = self.restaurant(factual_id, priority)
        if row is None:
            row = self.place(factual_id, priority=priority)
        return row

    def menu(self, factual_id):
        """
        Get menu for a factual_id.

        Currently only supports singleplatform and returns the
        singleplatform menu verbatim; None if the entity has no
        singleplatform id.
        """
        sp_id = self.singleplatform(factual_id)
        if sp_id:
            return self.__singleplatform.get_menu_schema(sp_id)
        return None

    def __factual_id_from_external(self, namespace, namespace_id):
        """
        Return the factual_id of the first crosswalk entry for an
        external id, or None when there is no entry.
        """
        entries = self.crosswalk_external(namespace, namespace_id, namespace)
        if entries:
            # Crosswalk data is a list of entries; read the first one.
            return entries[0].get('factual_id')
        return None

    def __external_id(self, namespace, factual_id):
        """
        Return the non-empty namespace_id of the first crosswalk entry
        for factual_id within namespace, or None.
        """
        entries = self.crosswalk_id(factual_id, namespace=namespace)
        if entries and 'namespace_id' in entries[0]:
            # '' means unknown; normalize it to None
            return entries[0]['namespace_id'] or None
        return None

    def __is_stale(self, entity, field, max_age):
        """
        True if entity lacks the timestamp `field` or it is older than
        max_age (a timedelta).
        """
        if field not in entity:
            return True
        return datetime.utcnow() - entity[field] > max_age

    # note: these decorators add tiered caching to this function, such that
    # results will be cached locally with a very small LRU cache of 64 items
    # and also cached in Mongo or Memcached with the standard TTL of 7 days.
    @countedFn(name='Factual (before caching)')
    @lru_cache(maxsize=64)
    @cachedFn()
    @countedFn(name='Factual (after caching)')
    def __rawFactual(self, service, prefix='places', priority='low', **args):
        """
        Helper method for making OAuth Factual API calls.

        This code is based on the recommended Python sample code
        available at:

        http://developer.factual.com/display/docs/Core+API+-+Oauth

        The custom beginning constructs the url based on input
        parameters; values in args must already be URL-quoted. Returns
        the raw (unparsed) response body.
        """
        pairs = ['%s=%s' % (k, v) for k, v in args.items()]
        url = "http://api.v3.factual.com/%s/%s?%s" % (prefix, service, '&'.join(pairs))
        params = parse_qsl(urlparse(url).query)
        consumer = oauth.OAuthConsumer(key=self.__v3_key, secret=self.__v3_secret)
        request = oauth.OAuthRequest.from_consumer_and_token(
            consumer, http_method='GET', http_url=url, parameters=params)
        request.sign_request(oauth.OAuthSignatureMethod_HMAC_SHA1(), consumer, None)
        response, content = service_request(
            'factual', 'GET', url, header=request.to_header(), priority=priority)
        return content

    def __factual(self, service, prefix='places', priority="low", **args):
        """
        Call the API and return the parsed response.data, or None if
        the response does not contain it.

        Factual results are difficult to turn into mongo objects for
        the mongo cache because sometimes they contain dicts that use
        "$distance" as a key, which is a problem for mongo. So we cache
        the result before it's parsed.
        """
        payload = json.loads(self.__rawFactual(service, prefix, priority, **args))
        try:
            return payload['response']['data']
        except (KeyError, TypeError):
            # Error responses lack response.data (or are not a dict).
            return None

    def __distance(self, a, b):
        """
        Euclidean distance between the latitude/longitude pairs of a
        and b, in raw coordinate units. Returns 0 when either side
        lacks coordinates.
        """
        for key in ('latitude', 'longitude'):
            if key not in a or key not in b:
                # Don't disqualify if omitted
                return 0
        d_lat = a['latitude'] - b['latitude']
        d_lon = a['longitude'] - b['longitude']
        return (d_lat ** 2 + d_lon ** 2) ** .5

    def __phone_test(self, result, entity, filters, verbose=False):
        """
        Pass unless both sides have 'tel', the numbers differ, and the
        similarity is not above .98.
        """
        if 'tel' not in filters or 'tel' not in result:
            return True
        good = filters['tel'] == result['tel'] or result['similarity'] > .98
        if not good and verbose:
            self.__log("Rejected for different tel values\n")
        return good

    def __category_test(self, result, entity, filters, verbose=False):
        """
        Pass unless both sides have 'category' and the result's
        category does not start with the filter's.
        """
        if 'category' not in filters or 'category' not in result:
            # Don't reject things for no category
            return True
        if result['category'].startswith(filters['category']):
            return True
        if verbose:
            self.__log("Rejected for bad category\n")
        return False

    def __custom_test(self, result, entity, filters, verbose=False):
        """
        Fallback acceptance test for results Factual did not mark as
        resolved: category, distance (<= 1), phone, similarity (>= .70)
        and presence of 'country' must all pass.
        """
        if not self.__category_test(result, entity, filters, verbose):
            return False
        if self.__distance(result, filters) > 1:
            if verbose:
                self.__log("Rejected for distance\n")
            return False
        if not self.__phone_test(result, entity, filters, verbose):
            return False
        if result['similarity'] < .70:
            if verbose:
                self.__log("Rejected for similarity\n")
            return False
        if 'country' not in result:
            if verbose:
                self.__log("Missing country\n")
            return False
        return True

    def __acceptable(self, result, entity, filters, verbose=False):
        """
        Determines whether a Resolve result is a positive match.

        Currently trusts the builtin 'resolved' field and only falls
        back to the custom tests when it is false.
        """
        good = result['resolved']
        if not good:
            good = self.__custom_test(result, entity, filters, verbose)
        if not good and verbose:
            self.__log('FAILED:\n%s\n%s\n%s\n' % (result, entity, filters))
        return good

    def __log(self, message):
        """
        Write message to the log object, if one was supplied.
        """
        if self.__log_file:
            self.__log_file.write(message)