def fetchDetails(placeID): placeStatus = statusTable[placeID] # Get a list of (src, version) pairs that could be updated, skip searched places # TODO: Gracefully handle if TripAdvisor-mapper runs out of API calls (25k) newProviders = [ src for src in config if src not in placeStatus or ( config[src] > placeStatus[src] and placeStatus[src] != Status.NOT_FOUND.value) ] if not newProviders: # log.info("No new sources for {}".format(placeID)) return try: placeProviderIDs = proxwalk.getAndCacheProviderIDs( placeID, newProviders, placeStatus["identifiers"]) except Exception as e: log.error("Error fetching or caching provider id: {}".format(e)) return updatedProviders = request_handler.researchPlace( placeID, placeProviderIDs) # Write updated sources to /status newStatus = makeNewStatusTable(config, updatedProviders, placeProviderIDs, newProviders) try: placeStatus.update(newStatus) db().child(venuesTable, "status", placeID).update(placeStatus) except Exception as e: log.error("Error accessing status table for {}: {}".format( placeID, e)) log.info("{} done: {}".format(placeID, str(updatedProviders)))
def _guessYelpId(placeName, lat, lon):
    """Best-effort mapping from a free-form place name to a Yelp business id.

    Checks a Firebase cache first (keyed by the md5 of the name); on a miss,
    searches Yelp by coordinates, picks the business closest to (lat, lon),
    kicks off venue research, and caches the resulting id.

    Returns the Yelp business id (str), or None if nothing suitable is found.
    """
    # hashlib.md5 requires bytes in Python 3; the original passed a str,
    # which raises TypeError on every call.
    safePlaceId = hashlib.md5(placeName.encode("utf-8")).hexdigest()
    cachedId = db.child(eventsTable).child("cache/" + safePlaceId).get().val()
    if cachedId:
        return cachedId
    opts = {
        # 'term': placeName,  # Yelp does a bad job with term searching
        'limit': 20,
        # 'radius_filter': 1000,
        # 'sort_by': 'distance',
        'sort': 1,
    }
    r = yelpClient.search_by_coordinates(lat, lon, **opts)
    if len(r.businesses) > 0:
        location = (lat, lon)
        # Materialize the filter so an empty result can be detected; min()
        # over an empty iterable would raise ValueError.
        businessesWithCoords = [
            b for b in r.businesses
            if b.location is not None and b.location.coordinate is not None
        ]
        if not businessesWithCoords:
            log.info("Can't find %s" % placeName)
            return None
        # Choose the business geographically closest to the requested point.
        biz = min(
            businessesWithCoords,
            key=lambda b: geo.distance(
                location,
                (b.location.coordinate.latitude,
                 b.location.coordinate.longitude)))
        log.debug("%s --> %s" % (placeName, biz.name))
        researchVenue(biz)
        # Add bizId to cache
        record = {"cache/" + safePlaceId: str(biz.id)}
        db.child(eventsTable).update(record)
        return biz.id
    else:
        log.info("Can't find %s" % placeName)
        return None
def _print(self, result) -> bool:
    """Log availability for each entry in `result`; return True iff non-empty."""
    if not result:
        info(f"[{self.store_name}] {self.product_name} not available.")
        return False
    for entry in result:
        success(f"[{self.store_name}] {self.product_name} Available {entry}!")
    return True
def searchLocation(lat, lng, radius):
    """Query the venue index around (lat, lng) and persist the results."""
    venues = search.getVenuesFromIndex(lat, lng, radius)
    log.debug("Writing venues...")
    writeYelpRecords(venues)
    log.info("Wrote %d venues" % len(venues))
def searchLocationWithErrorRecovery(lat, lng, radius=None):
    """Wrapper around searchLocation: exit cleanly on Ctrl-C, log anything else."""
    try:
        searchLocation(lat, lng, radius=radius)
    except KeyboardInterrupt:
        # Operator requested shutdown; exit with a non-zero status.
        log.info("GOODBYE")
        sys.exit(1)
    except Exception:
        # Best-effort crawl: record the failure and let the caller continue.
        from app.util import log
        log.exception("Unknown exception")
def getVenuesFromIndex(lat, lon, radius):
    """Return venues near (lat, lon), de-duplicated by id and restricted to
    entries that carry a coordinate.

    Preserves the original semantics: an id is marked "seen" the first time it
    appears, even when that first occurrence has no coordinate.
    """
    # Renamed from `all`, which shadowed the builtin; the one-liner relying on
    # `not seen.add(...)` short-circuit side effects is expanded for clarity.
    candidates = _getVenuesFromIndex(lat, lon, radius, YELP_SORT_ORDER)
    seen = set()
    unique = []
    for biz in candidates:
        if biz.id in seen:
            continue
        seen.add(biz.id)
        if biz.location.coordinate is not None:  # was `!= None`
            unique.append(biz)
    log.info("Found %d unique venues with locations" % len(unique))
    return unique
def expandPlaces(config, center, radius_km):
    """ Expands cached venue details by fetching additional sources

    Config is of the form:
    { <provider>: <version> }
    where version is the newest version status

    Loads the full /status table once, selects the places inside the radius,
    and updates each on a small thread pool.
    """
    statusTable = db().child(venuesTable).child("status").get().val()
    # Fetch placeIDs to expand
    location_table = db().child(locationsTable).get().val()
    placeIDs = geo.get_place_ids_in_radius(center, radius_km, location_table)
    log.info("{} places found".format(len(placeIDs)))

    def fetchDetails(placeID):
        # Runs on a worker thread; closes over statusTable/config above.
        placeStatus = statusTable[placeID]
        # Get a list of (src, version) pairs that could be updated, skip searched places
        # TODO: Gracefully handle if TripAdvisor-mapper runs out of API calls (25k)
        # A provider qualifies when never searched, or its cached version is
        # stale — unless it previously reported NOT_FOUND.
        newProviders = [
            src for src in config
            if src not in placeStatus or (
                config[src] > placeStatus[src]
                and placeStatus[src] != Status.NOT_FOUND.value)
        ]
        if not newProviders:
            # log.info("No new sources for {}".format(placeID))
            return
        try:
            placeProviderIDs = proxwalk.getAndCacheProviderIDs(
                placeID, newProviders, placeStatus["identifiers"])
        except Exception as e:
            log.error("Error fetching or caching provider id: {}".format(e))
            return
        updatedProviders = request_handler.researchPlace(
            placeID, placeProviderIDs)
        # Write updated sources to /status
        newStatus = makeNewStatusTable(config, updatedProviders,
                                       placeProviderIDs, newProviders)
        try:
            # Merge fresh per-provider versions into the existing record,
            # then write the whole record back.
            placeStatus.update(newStatus)
            db().child(venuesTable, "status", placeID).update(placeStatus)
        except Exception as e:
            log.error("Error accessing status table for {}: {}".format(
                placeID, e))
        log.info("{} done: {}".format(placeID, str(updatedProviders)))

    pool = ThreadPool(8)
    pool.map(fetchDetails, placeIDs)
    log.info("Finished crawling other sources")
def check_availability(self) -> bool:
    """Check Best Buy stock both in-store (by zip) and online.

    Both checks always run so each one gets logged; returns True if either
    reports availability. Skips entirely when no API key is configured.
    """
    if not self.cfg.best_buy_api_key:
        info(f"[{self.store_name}] api key missing. Skipping...")
        return False
    product_id = int(self.product_id)
    # Pause between API calls to stay polite to the endpoint.
    sleep(1)
    in_store = self._print(self.available_within_zip(product_id))
    sleep(1)
    online = self._print(self.available_online(product_id))
    return in_store or online
def check_availability(self) -> bool:
    """Poll every configured zip code; return True if any location has stock."""
    found_any = False
    for zip_code in self.cfg.target_zip_codes:
        try:
            amount, location = self.check_specific(zip_code)
            if amount > 0:
                found_any = True
                success(
                    f"[{self.store_name}] {self.product_name} {amount} units found {location}"
                )
            else:
                info(
                    f"[{self.store_name}] {self.product_name} not available in {zip_code}"
                )
        except KeyError:
            # Response payload missing the expected keys for this zip code.
            fail(f"[{self.store_name}] {self.product_name} could not find "
                 f"something in the list for {self.store_name}")
    return found_any
def findSearchRecord(center, radius=1000):
    """Return a cached search record within `radius` meters of `center`, or None.

    Side effect: expired cache entries encountered during the scan are deleted.
    """
    import app.geofire as geo
    import time
    # Geohash ranges covering the circle; each becomes one ordered key query.
    queries = geo.geohashQueries(center, radius)
    now = time.time()
    for query in queries:
        results = db.child(searchesTable).order_by_key().start_at(query[0]).end_at(query[1]).get()
        for result in results.each():
            record = result.val()
            # Evict expired entries in place and keep scanning.
            if record.get("time", 0) + searchCacheExpiry < now:
                db.child(searchesTable).child(result.key()).remove()
                continue
            # double check that we're within distance
            circleDistance = geo.distance(center, record["l"]) * 1000  # 1000 m in 1 km (geo.distance is in km, searchCacheRadius is in m)
            if circleDistance < searchCacheRadius:
                return record
            # NOTE(review): logged for records that matched the geohash range
            # but fell outside the cache radius — placement reconstructed from
            # mangled source; confirm it belongs inside the inner loop.
            log.info("Circle distance is " + str(circleDistance))
def getGcalEventObj(event):
    """Convert a Google Calendar event dict into an event record, or None.

    Requires concrete start/end dateTimes and a location string; resolves the
    location to a Yelp id before building the record.
    """
    if ("dateTime" not in event["start"]) or ("dateTime" not in event["end"]) or ("location" not in event):
        return None
    eventLoc = event["location"]
    name, address = events.getNameAndAddress(eventLoc)
    mapping = search._getAddressIdentifiers(eventLoc)
    if mapping:
        try:
            location = mapping['location']
            placeName = '%s, %s' % (mapping['name'], mapping['zipcode'])
            yelpId = _guessYelpId(eventLoc, location['lat'], location['lng'])
            if yelpId:
                # Description doubles as an optional URL field when present.
                optUrl = event["description"] if "description" in event else None
                eventObj = representation.eventRecord(yelpId, location['lat'],
                                                      location['lng'],
                                                      event['summary'],
                                                      event['start']['dateTime'],
                                                      event['end']['dateTime'],
                                                      optUrl)
                return eventObj
        except Exception as err:
            log.exception("getGcalEventObj")
    # Reached when there is no mapping, no Yelp id, or the build failed.
    log.info("Unable to find corresponding location for %s" % eventLoc)
def check_for_inventory(self, content):
    """Parse a product page and report availability.

    First looks for explicit sold-out markers; regardless of parse errors it
    then checks for an add-to-cart button. Returns True when the product
    appears purchasable, False otherwise.
    """
    doc = html.fromstring(content)
    try:
        raw_availability = doc.xpath(
            '//div[@id ="ProductBuy"]//span[contains(@class, "btn-message")]//text()'
        )
        result = "".join(
            raw_availability).strip() if raw_availability else None
        # NOTE(review): containment direction looks reversed — this tests that
        # `result` is a substring of "Sold Out" (so result == "" also matches);
        # confirm intent before changing.
        if str(result) in str("Sold Out"):
            info(f"[{self.store_name}] {self.product_name} not available")
            return False
        raw_availability = doc.xpath(
            '//div[contains(@class, "flags-body")]//text()')
        result = "".join(
            raw_availability).strip() if raw_availability else None
        if str(result) in str("CURRENTLY SOLD OUT"):
            info(f"[{self.store_name}] {self.product_name} not available")
            return False
    except Exception:
        # Was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; pause briefly before the fallback check below.
        time.sleep(1)
    raw_availability = doc.xpath(
        '//div[@id ="ProductBuy"]//button[contains(@class, "btn-primary")]//text()'
    )
    result = "".join(raw_availability).strip() if raw_availability else None
    if str(result).lower() in str("add to cart"):
        success(f"[{self.store_name}] {self.product_name} Available!")
        return True
    info(f"[{self.store_name}] {self.product_name} not available")
    return False
def crawlPoints(search_center, search_radius):
    """Search one point; in dry-run mode only report what would be found."""
    lat, lng = search_center
    if dryRun:
        log.info("Dry run - center: (%.8f, %.8f) radius: %d meters" % (lat, lng, search_radius))
        venues = search.getVenuesFromIndex(lat, lng, search_radius)
        for venue in venues:
            log.info(venue.id)
        log.info("%d unique results found." % len(venues))
        return
    log.info("starting: %.8f, %.8f -------------------" % search_center)
    searchLocation(lat, lng, search_radius)
def searchLocation(lat, lng, radius, maxNum):
    """Research venues and events around a point on a shared thread pool.

    Skips points that already have a fresh cached search record; otherwise
    records the search before crawling.
    """
    # Fetch locations
    cached = findSearchRecord((lat, lng), searchCacheRadius)
    if cached is not None:
        log.debug("searchRecord: %s" % cached)
        return
    writeSearchRecord(lat, lng)
    venues = search.getVenuesFromIndex(lat, lng, radius, maxNum)
    pool = ThreadPool(5)
    venueResults = pool.map(researchVenue, venues)
    # Fetch events from Eventful
    listings = events.fetchEventsFromLocation(lat, lng)
    eventResults = pool.map(researchEvent, listings)
    pool.close()
    pool.join()
    import json
    log.info("Found %d: %s" % (len(venueResults), json.dumps(venueResults)))
def searchLocation(lat, lng, radius=None):
    """Search venues (paging through the whole index) and events near a point.

    Uses the venue search radius default when none is given; skips points with
    a fresh cached search record.
    """
    # Fetch locations
    searchRecord = findSearchRecord((lat, lng), searchCacheRadius)
    if searchRecord is not None:
        log.debug("searchRecord: %s" % searchRecord)
        return
    else:
        writeSearchRecord(lat, lng)
    if radius is None:
        radius = venueSearchRadius
    # Page through the index: `total` is corrected by the first response, and
    # `offset` advances by however many businesses each page returned.
    total = 1
    offset = 0
    yelpVenues = []
    while offset < total:
        locality = search._getVenuesFromIndex(lat, lng, offset=offset,
                                              radius=radius)
        total = locality.total
        yelpVenues += locality.businesses
        offset = len(yelpVenues)
    pool = ThreadPool(5)
    res = pool.map(researchVenue, yelpVenues)
    # Fetch events from Eventful
    eventListings = events.fetchEventsFromLocation(lat, lng)
    eRes = pool.map(researchEvent, eventListings)
    pool.close()
    pool.join()
    import json
    log.info("Finished: " + json.dumps(res))
def check_availability(self) -> bool:
    """Fetch the Walmart product page and report availability."""
    if self.product_link is None:
        return False
    r = requests.get(
        url=self.product_link,
        headers={
            "Accept": "application/json",
            "Referer": "https://www.walmart.com/",
            # Desktop Chrome UA to avoid bot-specific responses.
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36",
        },
    )
    ans = self.check_inventory(r.content)
    if not ans:
        return False
    # NOTE(review): containment direction — this tests that `ans` is a
    # substring of "add to cart", not the reverse; confirm against what
    # check_inventory returns before changing.
    if ans in "add to cart":
        success(f"[{self.store_name}] {self.product_name} Available!")
        return True
    info(f"[{self.store_name}] {self.product_name} not available")
    return False
def check_availability(self) -> bool:
    """Fetch the GameStop product page and report availability."""
    if self.product_link is None:
        return False
    page = requests.get(
        url=self.product_link,
        headers={
            "Referer": "https://www.gamestop.com/",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "authority": "9300303.fls.doubleclick.net",
            "scheme": "https",
            "sec-fetch-dest": "iframe",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "cross-site",
            # Desktop Chrome UA to avoid bot-specific responses.
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36",
        },
    )
    # Treat redirects and errors alike as "cannot determine availability".
    if page.status_code > 300:
        return False
    doc = html.fromstring(page.content)
    raw_availability = doc.xpath(
        '//div[contains(@class, "primary-details-row")]//'
        'button[contains(@class, "add-to-cart")]//text()'
    )
    result = "".join(raw_availability).strip() if raw_availability else None
    # NOTE(review): both comparisons test that `result` is a substring of the
    # expected label (so result == "" matches the first check); confirm intent.
    if str(result).lower() in str("Not Available").lower():
        info(f"[{self.store_name}] {self.product_name} not available.")
        return False
    if str(result).lower() in str("Add to cart").lower():
        success(f"[{self.store_name}] {self.product_name} Available!")
        return True
    info(f"[{self.store_name}] {self.product_name} not available.")
    return False
def crawlPoints(grid, search_radius, max_venue_per_search):
    """Run a search at every grid point, then log crawl summary statistics."""
    for center in grid:
        lat, lng = center
        # Now actually do the search.
        if dryRun:
            print("%.8f, %.8f" % center)
        else:
            #from app.queue.enqueue import searchLocation
            from app.request_handler import searchLocationWithErrorRecovery as searchLocation
            log.info("starting: %.8f, %.8f -------------------" % center)
            searchLocation(lat, lng, search_radius, max_venue_per_search)
    count = len(grid)
    log.info("Number of points: %d" % (count))
    log.info("Number of Yelp searches: %d" % (count * maxVenuesPerSearch / 20))
    log.info("Distance between points is %.2f km" % (grid_size_m / 1000))
    log.info("Search radius is %.2f km" % (search_radius / 1000))
    log.info("Maximum number of venues: %d" % (count * maxVenuesPerSearch))