def _guessYelpId(placeName, lat, lon):
    """Resolve a free-form place name near (lat, lon) to a Yelp business id.

    Results are memoized in the events table under "cache/<md5(placeName)>".
    Returns the Yelp business id, or None if no matching business is found.
    """
    # md5 requires bytes: encode explicitly so this works on Python 3
    # (and on unicode names under Python 2).
    safePlaceId = hashlib.md5(placeName.encode("utf-8")).hexdigest()
    cachedId = db.child(eventsTable).child("cache/" + safePlaceId).get().val()
    if cachedId:
        return cachedId

    opts = {
        # 'term': placeName,  # Yelp does a bad job with term searching
        'limit': 20,
        # 'radius_filter': 1000,
        # 'sort_by': 'distance',
        'sort': 1,
    }
    r = yelpClient.search_by_coordinates(lat, lon, **opts)
    if len(r.businesses) > 0:
        location = (lat, lon)
        # Only businesses with usable coordinates can be distance-ranked.
        businessesWithCoords = [
            b for b in r.businesses
            if b.location is not None and b.location.coordinate is not None
        ]
        # Guard: min() on an empty sequence raises ValueError, so bail out
        # if every candidate lacked coordinates.
        if not businessesWithCoords:
            log.info("Can't find %s" % placeName)
            return None
        # Pick the candidate closest to the requested point.
        biz = min(
            businessesWithCoords,
            key=lambda b: geo.distance(
                location,
                (b.location.coordinate.latitude,
                 b.location.coordinate.longitude)))
        log.debug("%s --> %s" % (placeName, biz.name))
        researchVenue(biz)
        # Add bizId to cache
        record = {"cache/" + safePlaceId: str(biz.id)}
        db.child(eventsTable).update(record)
        return biz.id
    else:
        log.info("Can't find %s" % placeName)
        return None
def searchLocation(lat, lng, radius):
    """Crawl the venue index around (lat, lng) and persist the results."""
    venues = search.getVenuesFromIndex(lat, lng, radius)
    log.debug("Writing venues...")
    writeYelpRecords(venues)
    log.info("Wrote %d venues" % len(venues))
def _getVenuesFromIndex(lat, lon, radius, sortOrder):
    """Fetch all venues within `radius` meters of (lat, lon).

    Recursively subdivides the search area into four quadrants whenever the
    result set exceeds what the API will let us page through, then pages
    through each (sufficiently small) area.

    Returns a flat list of business objects, or None if any recursive
    sub-search raised an exception.
    """
    locality = _singleSearchAPIQuery(lat, lon, radius * YELP_RADIUS_FACTOR, sortOrder, 0)
    log.debug("Crawling point: (%s, %s), radius: %s meters" % (lat, lon, radius))
    # If the current result set is greater than the max we can iterate through,
    # divide the search area and try again. Do this recursively so that we get
    # the full set of places.
    if locality.total > YELP_MAX_PER_SEARCH and radius > YELP_MIN_SEARCH_RADIUS:
        # We're collecting all places inside the square inscribed in the circle
        # defined by the location/radius. Each sub-circle is the smallest
        # circle that contains each quadrant of this square.
        dst = radius / sqrt(8)
        deltaLat = dst / geofire.g_METERS_PER_DEGREE_LATITUDE
        deltaLong = geofire.metersToLongitudeDegrees(dst, lat)

        def processQuadrant(quad):
            # quad is a (+/-1, +/-1) sign pair selecting one quadrant center.
            # Returns None (rather than raising) so the parent can detect
            # failure across the thread pool.
            try:
                return _getVenuesFromIndex(lat + quad[0] * deltaLat,
                                           lon + quad[1] * deltaLong,
                                           radius / 2.,
                                           sortOrder)
            except Exception as e:
                traceback.print_exc()
                return None

        # Crawl the four quadrants concurrently.
        pool = ThreadPool(4)
        yelpVenues = pool.map(processQuadrant, [(1, 1), (-1, 1), (-1, -1), (1, -1)])
        pool.close()
        pool.join()

        # Abort if any child threw an exception.
        if None in yelpVenues:
            return None

        # Recursively calling _getVenuesFromIndex leaves us with a list of
        # lists, so flatten the result.
        yelpVenues = list(itertools.chain.from_iterable(yelpVenues))
        return yelpVenues

    # Otherwise, iterate through all pages of the result set.
    yelpVenues = locality.businesses
    offset = YELP_MAX_PER_PAGE
    while offset < locality.total:
        locality = _singleSearchAPIQuery(lat, lon, radius * YELP_RADIUS_FACTOR, sortOrder, offset)
        yelpVenues += locality.businesses
        offset += YELP_MAX_PER_PAGE
    return yelpVenues
def getVenueIdentifiers(yelpID):
    """Map a Yelp business id to identifiers in other provider namespaces.

    Uses the Factual Crosswalk API: first Yelp URL -> Factual id, then
    Factual id -> all known namespaces.

    Returns (mapping, ok) where `ok` is False only when the Crosswalk
    lookup itself failed; an empty Crosswalk result is still ok=True.
    """
    yelpURL = "https://yelp.com/biz/%s" % yelpID
    mapping = {
        "id": yelpID,
        "version": CROSSWALK_CACHE_VERSION,
        "yelp": {"url": yelpURL},
    }
    try:
        # Production uses the US-specific crosswalk table.
        if deployment == "production/":
            crosswalk = factualClient.table("crosswalk-us")
        else:
            crosswalk = factualClient.crosswalk()

        matches = crosswalk.filters({"url": yelpURL}).data()
        if not matches:
            log.debug("Crosswalk empty for Yelp -> Factual " + yelpID)
            return mapping, True

        factualID = matches[0]["factual_id"]
        mapping["factualID"] = factualID
        mapping["factual"] = {"id": factualID}

        idList = crosswalk.filters({"factual_id": factualID}).data()
        if not idList:
            log.warn("Crosswalk empty for Factual -> * " + yelpID + " " + factualID)
        for idObj in idList:
            # Key the remaining provider-specific fields by their namespace.
            namespace = idObj["namespace"]
            del idObj["factual_id"]
            del idObj["namespace"]
            mapping[namespace] = idObj
        return mapping, True
    except APIException:
        log.error("Factual API failed again")
    except Exception:
        log.exception("Factual problem " + yelpID)
    return mapping, False
def searchLocation(lat, lng, radius, maxNum):
    """Crawl venues and events around (lat, lng), skipping recently-searched areas."""
    # Fetch locations
    searchRecord = findSearchRecord((lat, lng), searchCacheRadius)
    if searchRecord is not None:
        # Area already crawled recently; nothing to do.
        log.debug("searchRecord: %s" % searchRecord)
        return
    writeSearchRecord(lat, lng)

    yelpVenues = search.getVenuesFromIndex(lat, lng, radius, maxNum)
    pool = ThreadPool(5)
    res = pool.map(researchVenue, yelpVenues)

    # Fetch events from Eventful
    eventListings = events.fetchEventsFromLocation(lat, lng)
    eRes = pool.map(researchEvent, eventListings)
    pool.close()
    pool.join()

    import json
    log.info("Found %d: %s" % (len(res), json.dumps(res)))
def searchLocation(lat, lng, radius=None):
    """Crawl venues and events around (lat, lng), skipping recently-searched areas.

    radius defaults to venueSearchRadius when not supplied. Pages through the
    venue index until all reported results are collected.
    """
    # Fetch locations
    searchRecord = findSearchRecord((lat, lng), searchCacheRadius)
    if searchRecord is not None:
        log.debug("searchRecord: %s" % searchRecord)
        return
    else:
        writeSearchRecord(lat, lng)

    if radius is None:
        radius = venueSearchRadius

    total = 1
    offset = 0
    yelpVenues = []
    while offset < total:
        locality = search._getVenuesFromIndex(lat, lng, offset=offset, radius=radius)
        total = locality.total
        # Guard against an endless loop: if the API reports more results than
        # it actually returns, an empty page would leave `offset` unchanged
        # forever. Stop as soon as a page yields nothing new.
        if not locality.businesses:
            break
        yelpVenues += locality.businesses
        offset = len(yelpVenues)

    pool = ThreadPool(5)
    res = pool.map(researchVenue, yelpVenues)

    # Fetch events from Eventful
    eventListings = events.fetchEventsFromLocation(lat, lng)
    eRes = pool.map(researchEvent, eventListings)
    pool.close()
    pool.join()

    import json
    log.info("Finished: " + json.dumps(res))
def updateRecord(yelpID, **details):
    """Build the per-provider update payload for an existing venue record.

    Each keyword argument in `details` is a provider-specific info dict
    (yelp3, wikipedia, tripadvisor, foursquare, google, factual); only the
    providers present are included in the output.

    Returns {"providers": {...}}.
    """
    providers = {}

    # Yelp v3.
    if "yelp3" in details:
        info = details["yelp3"]
        categories = None
        if "categories" in info:
            categories = [c["alias"] for c in info["categories"] if "alias" in c]
        providers["yelp3"] = {
            "images": _imageRecords(info.get("photos", []), info["url"]),
            "hours": _yelpHoursRecord(info.get("hours", None)),
            "categories": categories,
        }

    # Wikipedia.
    if "wikipedia" in details:
        info = details["wikipedia"]
        providers["wikipedia"] = {
            "url": info["url"],
            "description": info["summary"],
            "images": _imageRecords(info["images"], info["url"]),
        }

    # TripAdvisor
    if "tripadvisor" in details:
        info = details["tripadvisor"]
        reviews = info.get("reviews", [])
        firstReview = ""
        if len(reviews) > 0:
            firstReview = reviews[0]["text"]
        try:
            providers["tripAdvisor"] = {
                # This is the aggregate rating, and can be empty
                "rating": float(info["rating"]) if info["rating"] else None,
                "totalReviewCount": int(info["num_reviews"]),
                # The rating of this review is not included
                "description": firstReview,
                "url": info["web_url"],
            }
        except KeyError:
            # Bug fix: `biz` is not defined in this function; use the id we
            # were given so the missing-key path doesn't raise NameError.
            log.debug("TripAdvisor weird for " + yelpID)

    # Foursquare
    if "foursquare" in details:
        info = details["foursquare"]
        providers["foursquare"] = {
            "images": _imageRecords(info["images"], info["url"])
        }

    if "google" in details:
        info = details["google"]
        # No error, just no relevant website info
        providers["google"] = {"website": info.get("website", None)}

    # Factual Places
    if "factual" in details:
        info = details["factual"]
        providers["factual"] = {"url": info.get("website", None)}

    return {"providers": providers}
def venueRecord(biz, **details):
    """Build the full venue record for a business.

    `biz` is the response object from the Yelp Search API; each keyword
    argument in `details` is a provider-specific info dict (yelp, wikipedia,
    tripadvisor, foursquare, factual). Header fields (description, images,
    categories, hours, url) are aggregated from the providers; images are
    returned in randomized order.
    """
    from collections import OrderedDict

    # h is derived from the providers, but for the main body of the record.
    # h is for header.
    h = {
        "url": None,
        "description": [],
        "categories": OrderedDict(),
        "images": [],
        "hours": [],
    }
    providers = {}

    # Yelp.
    if "yelp" in details:
        info = details["yelp"]
        providers["yelp"] = {
            "rating": biz.rating,
            "totalReviewCount": biz.review_count,
            "ratingMax": 5,
            "description": biz.snippet_text,
            "url": biz.url,
        }
        h["description"].append(_descriptionRecord("yelp", biz.snippet_text))
        if "categories" in info:
            h["categories"].update(
                [(c["title"], _categoryRecord(c["alias"], c["title"]))
                 for c in info["categories"] if "title" in c])
        h["images"] += _imageRecords("yelp", info.get("photos", []), biz.url)
        h["hours"] = _yelpHoursRecord(info.get("hours", None))

    # Wikipedia.
    if "wikipedia" in details:
        info = details["wikipedia"]
        providers["wikipedia"] = {
            "url": info["url"],
            "description": info["summary"],
        }
        h["description"].append(_descriptionRecord("wikipedia", info["summary"]))
        h["images"] += _imageRecords("wikipedia", info["images"], info["url"])

    # TripAdvisor
    if "tripadvisor" in details:
        info = details["tripadvisor"]
        reviews = info.get("reviews", [])
        firstReview = ""
        if len(reviews) > 0:
            firstReview = reviews[0]["text"]
        h["description"].append(_descriptionRecord("tripadvisor", firstReview))
        try:
            providers["tripAdvisor"] = {
                # Guard the aggregate rating: it can be an empty string, and
                # float("") raises ValueError, which the KeyError handler
                # below would not catch. Mirrors the guard in updateRecord.
                "rating": float(info["rating"]) if info["rating"] else None,
                "totalReviewCount": int(info["num_reviews"]),
                # The rating of this review is not included
                "description": firstReview,
                "url": info["web_url"],
            }
        except KeyError:
            log.debug("TripAdvisor weird for " + biz.id)

    # Foursquare
    if "foursquare" in details:
        info = details["foursquare"]
        h["images"] += _imageRecords("foursquare", info["images"], info["url"])

    # Factual Places
    if "factual" in details:
        info = details["factual"]
        h["url"] = info.get("website", None)

    # Shuffle the aggregated images (sample without replacement of the full
    # list is a permutation).
    images = h["images"]
    h["images"] = random.sample(images, len(images))

    coord = None
    if biz.location is not None and biz.location.coordinate is not None:
        coord = {
            "lat": biz.location.coordinate.latitude,
            "lng": biz.location.coordinate.longitude,
        }

    return {
        "version": 1,
        "id": biz.id,
        "name": biz.name,
        "description": h["description"],
        "url": h["url"],
        "phone": biz.display_phone,
        "address": biz.location.display_address,
        "coordinates": coord,
        "categories": list(h["categories"].values()),
        "providers": providers,
        "images": h["images"],
        "hours": h["hours"],
    }