Ejemplo n.º 1
0
 def __init__(self, key=_API_V3_Key, secret=_API_V3_Secret, log=None):
     """
     Store the API credentials and build the helper objects used by this wrapper.

     key, secret - credentials; default to the module-level _API_V3_Key / _API_V3_Secret
     log - optional object kept as the log sink (None by default)
     """
     self.__v3_key, self.__v3_secret = key, secret
     self.__singleplatform = StampedSinglePlatform()
     self.__log_file = log
     # NOTE(review): cpm/cpd presumably mean calls per minute / per day -- confirm against RateLimiter.
     self.__limiter = RateLimiter(cpm=400, cpd=180000)
     # Both maximum ages are 30 days.
     self.__max_crosswalk_age = timedelta(days=30)
     self.__max_resolve_age = timedelta(days=30)
Ejemplo n.º 2
0
class Factual(object):
    """
    Factual API Wrapper
    """
    def __init__(self, key=_API_V3_Key, secret=_API_V3_Secret, log=None):
        """
        Store the API credentials and build the helper objects used by this wrapper.

        key, secret - credentials; default to the module-level _API_V3_Key / _API_V3_Secret
        log - optional object kept as the log sink (None by default)
        """
        self.__v3_key, self.__v3_secret = key, secret
        self.__singleplatform = StampedSinglePlatform()
        self.__log_file = log
        # NOTE(review): cpm/cpd presumably mean calls per minute / per day -- confirm against RateLimiter.
        self.__limiter = RateLimiter(cpm=400, cpd=180000)
        # Both maximum ages are 30 days.
        self.__max_crosswalk_age = timedelta(days=30)
        self.__max_resolve_age = timedelta(days=30)

    def search(self, query, limit=_limit, coordinates=None, radius=5000, priority="low"):
        """
        Run a free-text query against the 'global' service (prefix 't').

        The request always carries a filter restricting results to categories
        matched by "$bw" against "Food & Beverage" or
        "Arts, Entertainment & Nightlife" ($bw presumably means "begins with" --
        confirm against the Factual filter docs).

        query - search text; unicode input is encoded as UTF-8 before quoting
        limit - accepted for interface compatibility but not sent with the request
        coordinates - optional indexable pair; items 0 and 1 become the '$center'
        radius - value sent as '$meters' when coordinates are given
        priority - forwarded unchanged to the request layer

        Returns whatever the internal request helper returns for the call.
        """
        if isinstance(query, unicode):
            query = query.encode('utf-8')

        category_filter = {
            '$or':[
                    {"category":{"$bw":"Food & Beverage"}},
                    {"category":{"$bw":"Arts, Entertainment & Nightlife"}},
                ]
            }
        params = {
            'prefix': 't',
            'q': urllib.quote(query),
            'filters': urllib.quote(json.dumps(category_filter)),
        }

        if coordinates is not None:
            circle = {
                '$circle':{'$center':[coordinates[0], coordinates[1]], '$meters':radius}
            }
            params['geo'] = urllib.quote(json.dumps(circle))

        return self.__factual('global', priority=priority, **params)

    def resolve(self, data, limit=_limit, priority="low"):
        """
        Match an entity against the Resolve service using a few (possibly partial) attributes.

        data - a JSON-compatible object such as {'name':'ino','locality':'New York'}
        limit - maximum number of results to return; longer results are truncated
        priority - forwarded unchanged to the request layer

        Returns a JSON-like result containing the usual attributes plus a
        'similarity' value indicating match quality, or None when the request
        helper yields None. Resolve works with little information, including
        partial names.
        """
        encoded = urllib.quote(json.dumps(data))
        matches = self.__factual('resolve', values=encoded, priority=priority)
        if matches is None or len(matches) <= limit:
            return matches
        return matches[:limit]

    def places(self, data, limit=_limit, priority="low"):
        """
        Query the 'global' service with data as the filter; stricter than resolve.

        It seems to return only entities that exactly match the given fields
        (at least for name). limit is accepted but not used by this method.
        """
        encoded_filters = urllib.quote(json.dumps(data))
        return self.__factual(
            'global',
            prefix='t',
            filters=encoded_filters,
            priority=priority,
        )

    def place(self, factual_id, priority="low"):
        """
        Return the first places() match for the given factual_id, or None if there is none.
        """
        matches = self.places({'factual_id':factual_id}, 1, priority)
        return matches[0] if matches else None

    def crosswalk_id(self, factual_id, namespace=None, limit=_limit, namespaces=None, priority="low"):
        """
        Use the Crosswalk service to find urls and ids that match the given entity.

        If namespace is provided, the search is limited to that one service;
        otherwise, if namespaces is provided, it is limited to those services
        (sent comma-joined). namespace wins when both are given. limit is
        accepted but not used by this method.

        There are not necessarily crosswalk results for every factual_id.

        Regardless of the options, every entry in the result will contain the following fields:

        factual_id - the given id
        namespace - the namespace of the entry (e.g. 'singleplatform')
        namespace_id - the string id within the namespace (e.g. 'ino') or '' if unknown/non-existent
        url - the url associated with the entity or '' (e.g. 'http://www.menuism.com/restaurants/ino-new-york-253388')
        """
        if namespace is not None:
            only = namespace
        elif namespaces is not None:
            only = ','.join(namespaces)
        else:
            only = None

        args = {'factual_id':factual_id}
        if only is not None:
            args['only'] = only
        return self.__factual('crosswalk', priority=priority, **args)

    def crosswalk_external(self, space, space_id, namespace=None, limit=_limit, priority="low"):
        """
        Use the Crosswalk service to find urls and ids that match an external entity.

        space / space_id identify the entity in the external service and are
        sent as 'namespace' / 'namespace_id'. If namespace is provided, the
        search is limited to that service (sent as 'only'). limit is accepted
        but not used by this method.

        Regardless of the options, every entry in the result will contain the following fields:

        factual_id - the matching factual id
        namespace - the namespace of the entry (e.g. 'singleplatform')
        namespace_id - the string id within the namespace (e.g. 'ino') or '' if unknown/non-existent
        url - the url associated with the entity or '' (e.g. 'http://www.menuism.com/restaurants/ino-new-york-253388')
        """
        args = {
            'namespace':space,
            'namespace_id':space_id,
        }
        if namespace is not None:
            args['only'] = namespace
        return self.__factual('crosswalk', priority=priority, **args)

    def crossref_id(self, factual_id, limit=_limit, priority="low"):
        """
        Use the Crossref service to find urls that pertain to the given entity.

        limit is accepted but not used by this method.
        """
        lookup = {'factual_id':factual_id}
        return self.__factual('crossref', priority=priority, **lookup)

    def crossref_url(self, url, limit=_limit, priority="low"):
        """
        Use the Crossref service to find the entities related to/mentioned at the given url.

        The url is percent-quoted before being sent. limit is accepted but not
        used by this method.
        """
        quoted_url = urllib.quote(url)
        return self.__factual('crossref', url=quoted_url, priority=priority)

    def restaurant(self, factual_id, priority="low"):
        """
        Get Factual restaurant data for a given factual_id.

        Queries the 'restaurants-us' service (prefix 't') filtered on the id and
        returns the first row, or None when nothing comes back.
        """
        encoded_filter = urllib.quote(json.dumps({'factual_id':factual_id}))
        rows = self.__factual('restaurants-us', prefix='t', filters=encoded_filter, priority=priority)
        return rows[0] if rows else None

    def entity(self, factual_id):
        """
        STUB Create a Stamped entity from a factual_id.

        Builds a fresh BasicEntity, runs enrich() on it with the given id and
        returns it whether or not enrichment changed anything.
        """
        fresh = BasicEntity()
        self.enrich(fresh, factual_id=factual_id)
        return fresh

    def resolveEntity(self, entity):
        """
        Make sure the entity carries a factual_id and fresh crosswalk ids.

        Step 1: if the entity has no 'factual_id', try to resolve one, unless a
        previous attempt ('factual_timestamp') is no older than the maximum
        resolve age. The attempt's timestamp and outcome (possibly None) are
        written back to the entity.

        Step 2: if a factual_id is known and the crosswalk data is missing or
        older than the maximum crosswalk age, look up the singleplatform and
        foursquare ids and store them with timestamps.

        Returns True when the entity was modified, False otherwise.
        """
        changed = False

        if 'factual_id' in entity:
            factual_id = entity['factual_id']
        else:
            factual_id = None
            if 'factual_timestamp' in entity:
                resolve_age = datetime.utcnow() - entity['factual_timestamp']
                resolve_due = resolve_age > self.__max_resolve_age
            else:
                # Never attempted before.
                resolve_due = True
            if resolve_due:
                factual_id = self.factual_from_entity(entity)
                entity.factual_timestamp = datetime.utcnow()
                entity.factual_id = factual_id
                changed = True

        if factual_id is None:
            return changed

        if 'factual_crosswalk' in entity:
            crosswalk_age = datetime.utcnow() - entity['factual_crosswalk']
            crosswalk_due = crosswalk_age > self.__max_crosswalk_age
        else:
            crosswalk_due = True
        if not crosswalk_due:
            return changed

        rows = self.crosswalk_id(factual_id,namespaces=['singleplatform', 'foursquare'])
        if rows is not None:
            for row in rows:
                row_namespace = row['namespace']
                row_id = row['namespace_id']
                if row_namespace == 'singleplatform':
                    entity.singleplatform_id = row_id
                    entity.singleplatform_timestamp = datetime.utcnow()
                elif row_namespace == 'foursquare':
                    entity.foursquare_id = row_id
                    entity.foursquare_timestamp = datetime.utcnow()
        # The crosswalk attempt is recorded even when no rows came back.
        entity.factual_crosswalk = datetime.utcnow()
        return True

    def enrichEntity(self, entity):
        """
        Enrich the entity via enrich(), leaving the factual_id and data to be looked up.
        """
        outcome = self.enrich(entity)
        return outcome

    def enrich(self, entity, factual_id=None, data=None):
        """
        Populate the entity with Factual data.

        entity - the entity to modify in place
        factual_id - optional id; when omitted it is taken from the entity or,
                     failing that, resolved from the entity (and stored on it)
        data - optional Factual record; when omitted it is fetched for the id

        Returns False when no factual_id could be determined. When no data is
        available, returns True only if a newly resolved id was stored on the
        entity. Otherwise the entity is enriched and True is returned.
        """
        stored_new_id = False
        if factual_id is None:
            if 'factual_id' in entity:
                factual_id = entity.factual_id
            else:
                factual_id = self.factual_from_entity(entity)
                if factual_id is not None:
                    entity.factual_id = factual_id
                    entity.factual_timestamp = datetime.utcnow()
                    stored_new_id = True
            if factual_id is None:
                return False

        if data is None:
            data = self.data(factual_id,entity=entity)
            if data is None:
                return stored_new_id

        _enrich(entity,data)
        # Address fields are written unconditionally; a source-controller check
        # (writeTo('address','factual',entity)) was sketched here but is disabled.
        _populate(entity,data,_address_fields)
        entity.address_source = 'factual'
        entity.address_timestamp = datetime.utcnow()
        return True

    def factual_from_entity(self, entity):
        """
        Get the factual_id (if any) associated with the given entity.

        Each filter produced by _combos(entity) is sent to resolve() (up to 10
        results each) until one result is judged acceptable; that result's
        factual_id is returned. Only the very first result examined is checked
        in verbose mode.

        If the entity fails to resolve, None is returned.
        """
        verbose = True
        for candidate_filter in _combos(entity):
            matches = self.resolve(candidate_filter,10)
            if not matches:
                continue
            for match in matches:
                if self.__acceptable(match,entity,candidate_filter,verbose):
                    return match['factual_id']
                verbose = False
        return None

    def factual_from_singleplatform(self, singleplatform_id):
        """
        Get the factual_id (if any) associated with the given singleplatform ID.

        Convenience method for crosswalk lookup from a singleplatform ID.

        Returns the 'factual_id' of the first crosswalk entry, or None when
        there are no entries or the entry carries no 'factual_id'.
        """
        crosswalk_result = self.crosswalk_external('singleplatform',singleplatform_id,'singleplatform')
        if not crosswalk_result:
            return None
        # Crosswalk data is a list of entries (the other crosswalk helpers in
        # this class index it with [0]), so indexing the list itself with
        # 'factual_id' would raise TypeError. A bare mapping is still accepted.
        if isinstance(crosswalk_result, dict):
            entry = crosswalk_result
        else:
            entry = crosswalk_result[0]
        return entry.get('factual_id')

    def singleplatform(self, factual_id):
        """
        Get singleplatform id from factual_id

        Convenience method for crosswalk lookup for singleplatform. Returns the
        'namespace_id' of the first crosswalk entry, or None when there is no
        entry, no such field, or the id is empty.
        """
        entries = self.crosswalk_id(factual_id,namespace='singleplatform')
        if entries and 'namespace_id' in entries[0]:
            # An empty id ('') is reported as None.
            return entries[0]['namespace_id'] or None
        return None

    def factual_from_foursquare(self, foursquare_id):
        """
        Get the factual_id (if any) associated with the given foursquare ID.

        Convenience method for crosswalk lookup from a foursquare ID.

        Returns the 'factual_id' of the first crosswalk entry, or None when
        there are no entries or the entry carries no 'factual_id'.
        """
        crosswalk_result = self.crosswalk_external('foursquare',foursquare_id,'foursquare')
        if not crosswalk_result:
            return None
        # Crosswalk data is a list of entries (the other crosswalk helpers in
        # this class index it with [0]), so indexing the list itself with
        # 'factual_id' would raise TypeError. A bare mapping is still accepted.
        if isinstance(crosswalk_result, dict):
            entry = crosswalk_result
        else:
            entry = crosswalk_result[0]
        return entry.get('factual_id')

    def foursquare(self, factual_id):
        """
        Get foursquare id from factual_id

        Convenience method for crosswalk lookup for foursquare. Returns the
        'namespace_id' of the first crosswalk entry, or None when there is no
        entry, no such field, or the id is empty.
        """
        entries = self.crosswalk_id(factual_id,namespace='foursquare')
        if entries and 'namespace_id' in entries[0]:
            # An empty id ('') is reported as None.
            return entries[0]['namespace_id'] or None
        return None


    def data(self, factual_id, entity=None, priority="low"):
        """
        Generate Factual data for given factual_id.

        The restaurant record is preferred; the generic place record is used
        only when no restaurant record exists. The entity argument is optional
        and currently unused by this method.
        """
        record = self.restaurant(factual_id, priority)
        if record is not None:
            return record
        return self.place(factual_id, priority=priority)

    def menu(self, factual_id):
        """
        Get menu for a factual_id

        Currently only supports singleplatform and returns the singleplatform
        menu verbatim. Returns None when no singleplatform id is found.
        """
        sp_id = self.singleplatform(factual_id)
        if not sp_id:
            return None
        return self.__singleplatform.get_menu_schema(sp_id)

    # note: these decorators add tiered caching to this function, such that
    # results will be cached locally with a very small LRU cache of 64 items
    # and also cached in Mongo or Memcached with the standard TTL of 7 days.
    @countedFn(name='Factual (before caching)')
    @lru_cache(maxsize=64)
    @cachedFn()
    @countedFn(name='Factual (after caching)')
    def __rawFactual(self, service, prefix='places', priority='low', **args):
        """
        Make a signed (OAuth, HMAC-SHA1) GET request to the Factual v3 API.

        This code is based on the recommended Python sample code available at:

        http://developer.factual.com/display/docs/Core+API+-+Oauth

        service, prefix - path components of the url (<prefix>/<service>)
        priority - forwarded to service_request
        args - query parameters, written into the url as 'key=value' exactly as
               given (callers quote the values themselves)

        Returns the response content as delivered by service_request; it is
        not parsed here.
        """
        query_string = '&'.join('%s=%s' % pair for pair in args.items())
        url = "http://api.v3.factual.com/%s/%s?%s" % (prefix,service,query_string)

        # The query parameters are handed to the OAuth request along with the url.
        query_params = parse_qsl(urlparse(url).query)
        consumer = oauth.OAuthConsumer(key=self.__v3_key, secret=self.__v3_secret)
        signed = oauth.OAuthRequest.from_consumer_and_token(
            consumer,
            http_method='GET',
            http_url=url,
            parameters=query_params,
        )
        signed.sign_request(oauth.OAuthSignatureMethod_HMAC_SHA1(), consumer, None)

        _response, body = service_request('factual', 'GET', url, header=signed.to_header(), priority=priority)
        return body

    def __factual(self, service, prefix='places', priority="low", **args):
        """
        Fetch a Factual response and return its 'data' portion.

        Factual results are difficult to turn into mongo objects for the mongo
        cache because sometimes they contain dicts that use "$distance" as a
        key, which is a problem for mongo. So the raw response is what gets
        cached, and it is parsed here on every call.

        service, prefix, priority, args - forwarded unchanged to the raw request

        Returns response['response']['data'], or None when the parsed response
        does not have that shape. Errors from the request itself or from
        parsing the JSON are not caught here.
        """
        raw = self.__rawFactual(service, prefix, priority, **args)
        parsed = json.loads(raw)
        try:
            return parsed['response']['data']
        except (LookupError, TypeError):
            # Missing key, or the parsed JSON is not a mapping at that level.
            # Kept narrow so KeyboardInterrupt/SystemExit are not swallowed.
            return None
    
    def __distance(self, a, b):
        """
        Return the straight-line distance between a and b in raw latitude/longitude units.

        a, b - mappings that may contain 'latitude' and 'longitude'

        Returns 0 when either mapping lacks a coordinate, so that an omitted
        position never disqualifies a match.
        """
        for point in (a, b):
            if 'latitude' not in point or 'longitude' not in point:
                # Don't disqualify if omitted
                return 0
        d_lat = a['latitude'] - b['latitude']
        d_lon = a['longitude'] - b['longitude']
        return (d_lat**2+d_lon**2)**.5

    def __phone_test(self, result, entity, filters, verbose=False):
        """
        Check the result's phone number against the one in the filters.

        Passes when either side has no 'tel'. Otherwise passes when the numbers
        are equal or the result's 'similarity' exceeds .98. A rejection is
        logged when verbose is set. entity is not used.
        """
        if 'tel' not in filters or 'tel' not in result:
            return True
        matched = filters['tel'] == result['tel'] or result['similarity'] > .98
        if verbose and not matched:
            self.__log("Rejected for different tel values\n")
        return matched
    
    def __category_test(self, result, entity, filters, verbose=False):
        """
        Check that the result's category starts with the category in the filters.

        Passes when either side has no 'category'. A rejection is logged when
        verbose is set. entity is not used.
        """
        if 'category' in filters and 'category' in result:
            if result['category'].startswith(filters['category']):
                return True
            if verbose:
                self.__log("Rejected for bad category\n")
            return False
        # Don't reject things for no category
        return True

    def __custom_test(self, result, entity, filters, verbose=False):
        """
        Apply the hand-written acceptance checks to a Resolve result.

        The checks run in this order and the first failure rejects the result:
        category, distance (must not exceed 1), phone, similarity (must be at
        least .70), presence of 'country'. Rejections are logged when verbose
        is set.
        """
        def rejected(reason):
            # Log the reason in verbose mode and report the rejection.
            if verbose:
                self.__log(reason)
            return False

        if not self.__category_test(result,entity,filters,verbose):
            return False
        if self.__distance(result,filters) > 1:
            return rejected("Rejected for distance\n")
        if not self.__phone_test(result,entity,filters,verbose):
            return False
        if result['similarity'] < .70:
            return rejected("Rejected for similarity\n")
        if 'country' not in result:
            return rejected("Missing country\n")
        return True

    
    def __acceptable(self, result, entity, filters, verbose=False):
        """
        Determines whether a Resolve result is a positive match.

        The builtin 'resolved' field is trusted when it is truthy; otherwise
        the custom checks decide. A failed result is logged together with the
        entity and filters when verbose is set.
        """
        verdict = result['resolved'] or self.__custom_test(result,entity,filters,verbose)
        if verbose and not verdict:
            self.__log('FAILED:\n%s\n%s\n%s\n' % (result,entity,filters))
        return verdict
    
    def __log(self,message):
        """
        Write message to the configured log sink; do nothing when none is set.
        """
        sink = self.__log_file
        if sink:
            sink.write(message)